diff options
Diffstat (limited to 'fs')
532 files changed, 29400 insertions, 16672 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 2a5de610dd8f..bdabb2765d1b 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -483,6 +483,9 @@ static int v9fs_test_inode(struct inode *inode, void *data) if (v9inode->qid.type != st->qid.type) return 0; + + if (v9inode->qid.path != st->qid.path) + return 0; return 1; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 70f9887c59a9..7f6ae21a27b3 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -87,6 +87,9 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data) if (v9inode->qid.type != st->qid.type) return 0; + + if (v9inode->qid.path != st->qid.path) + return 0; return 1; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 8b75463cb211..af03c2a901eb 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -94,13 +94,13 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (v9ses->cache) sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; - sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; + sb->s_flags |= SB_ACTIVE | SB_DIRSYNC | SB_NOATIME; if (!v9ses->cache) - sb->s_flags |= MS_SYNCHRONOUS; + sb->s_flags |= SB_SYNCHRONOUS; #ifdef CONFIG_9P_FS_POSIX_ACL if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif return 0; diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index b2f82cf6bf86..58c2bbd385ad 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -34,8 +34,8 @@ config ARCH_BINFMT_ELF_STATE config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" - default y - depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X) + default y if !BINFMT_ELF + depends on (ARM || FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load diff --git a/fs/adfs/super.c b/fs/adfs/super.c index c9fdfb112933..cfda2c7caedc 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -213,7 +213,7 @@ static int parse_options(struct super_block *sb, char *options) static int adfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; return parse_options(sb, data); } @@ -372,7 +372,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) struct inode *root; int ret = -EINVAL; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; asb = kzalloc(sizeof(*asb), GFP_KERNEL); if (!asb) diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 185d5ab7e986..0f0e6925e97d 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -453,7 +453,7 @@ affs_error(struct super_block *sb, const char *function, const char *fmt, ...) pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf); if (!sb_rdonly(sb)) pr_warn("Remounting filesystem read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; va_end(args); } diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index 2b1399611d9e..5ba9ef2742f6 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -250,12 +250,12 @@ int affs_init_bitmap(struct super_block *sb, int *flags) int i, res = 0; struct affs_sb_info *sbi = AFFS_SB(sb); - if (*flags & MS_RDONLY) + if (*flags & SB_RDONLY) return 0; if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) { pr_notice("Bitmap invalid - mounting %s read only\n", sb->s_id); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -288,7 +288,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags) if (affs_checksum_block(sb, bh)) { pr_warn("Bitmap %u invalid - mounting %s read only.\n", bm->bm_key, sb->s_id); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; goto out; } pr_debug("read bitmap block %d: %d\n", blk, bm->bm_key); diff --git a/fs/affs/super.c b/fs/affs/super.c index 884bedab7266..1117e36134cc 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -356,7 +356,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = AFFS_SUPER_MAGIC; sb->s_op = &affs_sops; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL); if (!sbi) @@ -466,7 +466,7 @@ got_root: if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS || chksum == MUFS_DCOFS) && !sb_rdonly(sb)) { pr_notice("Dircache FS - mounting %s read only\n", sb->s_id); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } switch (chksum) { case MUFS_FS: @@ -488,7 +488,7 @@ got_root: /* fall thru */ case FS_OFS: affs_set_opt(sbi->s_flags, SF_OFS); - sb->s_flags |= MS_NOEXEC; + sb->s_flags |= SB_NOEXEC; break; case MUFS_DCOFS: case MUFS_INTLOFS: @@ -497,7 +497,7 @@ got_root: case FS_INTLOFS: affs_set_opt(sbi->s_flags, SF_INTL); affs_set_opt(sbi->s_flags, SF_OFS); - sb->s_flags |= MS_NOEXEC; + sb->s_flags |= SB_NOEXEC; break; default: pr_err("Unknown filesystem on device %s: %08X\n", @@ -513,7 +513,7 @@ got_root: sig, sig[3] + '0', blocksize); } - sb->s_flags |= MS_NODEV | MS_NOSUID; + sb->s_flags |= SB_NODEV | SB_NOSUID; sbi->s_data_blksize = sb->s_blocksize; if (affs_test_opt(sbi->s_flags, SF_OFS)) @@ -570,7 +570,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data); sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; memcpy(volume, sbi->s_volume, 32); if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, @@ -596,10 +596,10 @@ affs_remount(struct super_block *sb, int *flags, char *data) memcpy(sbi->s_volume, volume, 32); spin_unlock(&sbi->symlink_lock); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (*flags & MS_RDONLY) + if (*flags & SB_RDONLY) affs_free_bitmap(sb); else res = affs_init_bitmap(sb, flags); diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 641148208e90..45b7fc405fa6 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -7,6 +7,7 @@ afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o kafs-objs := \ $(afs-cache-y) \ + addr_list.o \ callback.o \ cell.o \ cmservice.o \ @@ -19,14 +20,14 @@ kafs-objs := \ misc.o \ mntpt.o \ proc.o \ + rotate.o \ rxrpc.o \ security.o \ server.o \ + server_list.o \ super.o \ netdevices.o \ vlclient.o \ - vlocation.o \ - vnode.o \ volume.o \ write.o \ xattr.o diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c new file mode 100644 index 000000000000..a537368ba0db --- /dev/null +++ b/fs/afs/addr_list.c @@ -0,0 +1,381 @@ +/* Server address list management + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/dns_resolver.h> +#include <linux/inet.h> +#include <keys/rxrpc-type.h> +#include "internal.h" +#include "afs_fs.h" + +//#define AFS_MAX_ADDRESSES +// ((unsigned int)((PAGE_SIZE - sizeof(struct afs_addr_list)) / +// sizeof(struct sockaddr_rxrpc))) +#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) + +/* + * Release an address list. + */ +void afs_put_addrlist(struct afs_addr_list *alist) +{ + if (alist && refcount_dec_and_test(&alist->usage)) + call_rcu(&alist->rcu, (rcu_callback_t)kfree); +} + +/* + * Allocate an address list. + */ +struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, + unsigned short service, + unsigned short port) +{ + struct afs_addr_list *alist; + unsigned int i; + + _enter("%u,%u,%u", nr, service, port); + + alist = kzalloc(sizeof(*alist) + sizeof(alist->addrs[0]) * nr, + GFP_KERNEL); + if (!alist) + return NULL; + + refcount_set(&alist->usage, 1); + + for (i = 0; i < nr; i++) { + struct sockaddr_rxrpc *srx = &alist->addrs[i]; + srx->srx_family = AF_RXRPC; + srx->srx_service = service; + srx->transport_type = SOCK_DGRAM; + srx->transport_len = sizeof(srx->transport.sin6); + srx->transport.sin6.sin6_family = AF_INET6; + srx->transport.sin6.sin6_port = htons(port); + } + + return alist; +} + +/* + * Parse a text string consisting of delimited addresses. + */ +struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, + char delim, + unsigned short service, + unsigned short port) +{ + struct afs_addr_list *alist; + const char *p, *end = text + len; + unsigned int nr = 0; + + _enter("%*.*s,%c", (int)len, (int)len, text, delim); + + if (!len) + return ERR_PTR(-EDESTADDRREQ); + + if (delim == ':' && (memchr(text, ',', len) || !memchr(text, '.', len))) + delim = ','; + + /* Count the addresses */ + p = text; + do { + if (!*p) + return ERR_PTR(-EINVAL); + if (*p == delim) + continue; + nr++; + if (*p == '[') { + p++; + if (p == end) + return ERR_PTR(-EINVAL); + p = memchr(p, ']', end - p); + if (!p) + return ERR_PTR(-EINVAL); + p++; + if (p >= end) + break; + } + + p = memchr(p, delim, end - p); + if (!p) + break; + p++; + } while (p < end); + + _debug("%u/%u addresses", nr, AFS_MAX_ADDRESSES); + if (nr > AFS_MAX_ADDRESSES) + nr = AFS_MAX_ADDRESSES; + + alist = afs_alloc_addrlist(nr, service, port); + if (!alist) + return ERR_PTR(-ENOMEM); + + /* Extract the addresses */ + p = text; + do { + struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs]; + char tdelim = delim; + + if (*p == delim) { + p++; + continue; + } + + if (*p == '[') { + p++; + tdelim = ']'; + } + + if (in4_pton(p, end - p, + (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3], + tdelim, &p)) { + srx->transport.sin6.sin6_addr.s6_addr32[0] = 0; + srx->transport.sin6.sin6_addr.s6_addr32[1] = 0; + srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); + } else if (in6_pton(p, end - p, + srx->transport.sin6.sin6_addr.s6_addr, + tdelim, &p)) { + /* Nothing to do */ + } else { + goto bad_address; + } + + if (tdelim == ']') { + if (p == end || *p != ']') + goto bad_address; + p++; + } + + if (p < end) { + if (*p == '+') { + /* Port number specification "+1234" */ + unsigned int xport = 0; + p++; + if (p >= end || !isdigit(*p)) + goto bad_address; + do { + xport *= 10; + xport += *p - '0'; + if (xport > 65535) + goto bad_address; + p++; + } while (p < end && isdigit(*p)); + srx->transport.sin6.sin6_port = htons(xport); + } else if (*p == delim) { + p++; + } else { + goto bad_address; + } + } + + alist->nr_addrs++; + } while (p < end && alist->nr_addrs < AFS_MAX_ADDRESSES); + + _leave(" = [nr %u]", alist->nr_addrs); + return alist; + +bad_address: + kfree(alist); + return ERR_PTR(-EINVAL); +} + +/* + * Compare old and new address lists to see if there's been any change. + * - How to do this in better than O(Nlog(N)) time? + * - We don't really want to sort the address list, but would rather take the + * list as we got it so as not to undo record rotation by the DNS server. + */ +#if 0 +static int afs_cmp_addr_list(const struct afs_addr_list *a1, + const struct afs_addr_list *a2) +{ +} +#endif + +/* + * Perform a DNS query for VL servers and build a up an address list. + */ +struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry) +{ + struct afs_addr_list *alist; + char *vllist = NULL; + int ret; + + _enter("%s", cell->name); + + ret = dns_query("afsdb", cell->name, cell->name_len, + "ipv4", &vllist, _expiry); + if (ret < 0) + return ERR_PTR(ret); + + alist = afs_parse_text_addrs(vllist, strlen(vllist), ',', + VL_SERVICE, AFS_VL_PORT); + if (IS_ERR(alist)) { + kfree(vllist); + if (alist != ERR_PTR(-ENOMEM)) + pr_err("Failed to parse DNS data\n"); + return alist; + } + + kfree(vllist); + return alist; +} + +/* + * Merge an IPv4 entry into a fileserver address list. + */ +void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) +{ + struct sockaddr_in6 *a; + __be16 xport = htons(port); + int i; + + for (i = 0; i < alist->nr_ipv4; i++) { + a = &alist->addrs[i].transport.sin6; + if (xdr == a->sin6_addr.s6_addr32[3] && + xport == a->sin6_port) + return; + if (xdr == a->sin6_addr.s6_addr32[3] && + xport < a->sin6_port) + break; + if (xdr < a->sin6_addr.s6_addr32[3]) + break; + } + + if (i < alist->nr_addrs) + memmove(alist->addrs + i + 1, + alist->addrs + i, + sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); + + a = &alist->addrs[i].transport.sin6; + a->sin6_port = xport; + a->sin6_addr.s6_addr32[0] = 0; + a->sin6_addr.s6_addr32[1] = 0; + a->sin6_addr.s6_addr32[2] = htonl(0xffff); + a->sin6_addr.s6_addr32[3] = xdr; + alist->nr_ipv4++; + alist->nr_addrs++; +} + +/* + * Merge an IPv6 entry into a fileserver address list. + */ +void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) +{ + struct sockaddr_in6 *a; + __be16 xport = htons(port); + int i, diff; + + for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { + a = &alist->addrs[i].transport.sin6; + diff = memcmp(xdr, &a->sin6_addr, 16); + if (diff == 0 && + xport == a->sin6_port) + return; + if (diff == 0 && + xport < a->sin6_port) + break; + if (diff < 0) + break; + } + + if (i < alist->nr_addrs) + memmove(alist->addrs + i + 1, + alist->addrs + i, + sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); + + a = &alist->addrs[i].transport.sin6; + a->sin6_port = xport; + a->sin6_addr.s6_addr32[0] = xdr[0]; + a->sin6_addr.s6_addr32[1] = xdr[1]; + a->sin6_addr.s6_addr32[2] = xdr[2]; + a->sin6_addr.s6_addr32[3] = xdr[3]; + alist->nr_addrs++; +} + +/* + * Get an address to try. + */ +bool afs_iterate_addresses(struct afs_addr_cursor *ac) +{ + _enter("%hu+%hd", ac->start, (short)ac->index); + + if (!ac->alist) + return false; + + if (ac->begun) { + ac->index++; + if (ac->index == ac->alist->nr_addrs) + ac->index = 0; + + if (ac->index == ac->start) { + ac->error = -EDESTADDRREQ; + return false; + } + } + + ac->begun = true; + ac->responded = false; + ac->addr = &ac->alist->addrs[ac->index]; + return true; +} + +/* + * Release an address list cursor. + */ +int afs_end_cursor(struct afs_addr_cursor *ac) +{ + if (ac->responded && ac->index != ac->start) + WRITE_ONCE(ac->alist->index, ac->index); + + afs_put_addrlist(ac->alist); + ac->alist = NULL; + return ac->error; +} + +/* + * Set the address cursor for iterating over VL servers. + */ +int afs_set_vl_cursor(struct afs_addr_cursor *ac, struct afs_cell *cell) +{ + struct afs_addr_list *alist; + int ret; + + if (!rcu_access_pointer(cell->vl_addrs)) { + ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET, + TASK_INTERRUPTIBLE); + if (ret < 0) + return ret; + + if (!rcu_access_pointer(cell->vl_addrs) && + ktime_get_real_seconds() < cell->dns_expiry) + return cell->error; + } + + read_lock(&cell->vl_addrs_lock); + alist = rcu_dereference_protected(cell->vl_addrs, + lockdep_is_held(&cell->vl_addrs_lock)); + if (alist->nr_addrs > 0) + afs_get_addrlist(alist); + else + alist = NULL; + read_unlock(&cell->vl_addrs_lock); + + if (!alist) + return -EDESTADDRREQ; + + ac->alist = alist; + ac->addr = NULL; + ac->start = READ_ONCE(alist->index); + ac->index = ac->start; + ac->error = 0; + ac->begun = false; + return 0; +} diff --git a/fs/afs/afs.h b/fs/afs/afs.h index 3c462ff6db63..b94d0edc2b78 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -14,11 +14,14 @@ #include <linux/in.h> -#define AFS_MAXCELLNAME 64 /* maximum length of a cell name */ -#define AFS_MAXVOLNAME 64 /* maximum length of a volume name */ -#define AFSNAMEMAX 256 /* maximum length of a filename plus NUL */ -#define AFSPATHMAX 1024 /* maximum length of a pathname plus NUL */ -#define AFSOPAQUEMAX 1024 /* maximum length of an opaque field */ +#define AFS_MAXCELLNAME 64 /* Maximum length of a cell name */ +#define AFS_MAXVOLNAME 64 /* Maximum length of a volume name */ +#define AFS_MAXNSERVERS 8 /* Maximum servers in a basic volume record */ +#define AFS_NMAXNSERVERS 13 /* Maximum servers in a N/U-class volume record */ +#define AFS_MAXTYPES 3 /* Maximum number of volume types */ +#define AFSNAMEMAX 256 /* Maximum length of a filename plus NUL */ +#define AFSPATHMAX 1024 /* Maximum length of a pathname plus NUL */ +#define AFSOPAQUEMAX 1024 /* Maximum length of an opaque field */ typedef unsigned afs_volid_t; typedef unsigned afs_vnodeid_t; @@ -72,6 +75,15 @@ struct afs_callback { #define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */ +struct afs_uuid { + __be32 time_low; /* low part of timestamp */ + __be16 time_mid; /* mid part of timestamp */ + __be16 time_hi_and_version; /* high part of timestamp and version */ + __s8 clock_seq_hi_and_reserved; /* clock seq hi and variant */ + __s8 clock_seq_low; /* clock seq low */ + __s8 node[6]; /* spatially unique node ID (MAC addr) */ +}; + /* * AFS volume information */ @@ -124,7 +136,6 @@ struct afs_file_status { afs_access_t caller_access; /* access rights for authenticated caller */ afs_access_t anon_access; /* access rights for unauthenticated caller */ umode_t mode; /* UNIX mode */ - struct afs_fid parent; /* parent dir ID for non-dirs only */ time_t mtime_client; /* last time client changed data */ time_t mtime_server; /* last time server changed data */ s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */ @@ -167,4 +178,16 @@ struct afs_volume_status { #define AFS_BLOCK_SIZE 1024 +/* + * XDR encoding of UUID in AFS. + */ +struct afs_uuid__xdr { + __be32 time_low; + __be32 time_mid; + __be32 time_hi_and_version; + __be32 clock_seq_hi_and_reserved; + __be32 clock_seq_low; + __be32 node[6]; +}; + #endif /* AFS_H */ diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h index eb647323d8f0..d47b6d01e4c0 100644 --- a/fs/afs/afs_fs.h +++ b/fs/afs/afs_fs.h @@ -37,9 +37,12 @@ enum AFS_FS_Operations { FSLOOKUP = 161, /* AFS lookup file in directory */ FSFETCHDATA64 = 65537, /* AFS Fetch file data */ FSSTOREDATA64 = 65538, /* AFS Store file data */ + FSGIVEUPALLCALLBACKS = 65539, /* AFS Give up all outstanding callbacks on a server */ + FSGETCAPABILITIES = 65540, /* Probe and get the capabilities of a fileserver */ }; enum AFS_FS_Errors { + VRESTARTING = -100, /* Server is restarting */ VSALVAGE = 101, /* volume needs salvaging */ VNOVNODE = 102, /* no such file/dir (vnode) */ VNOVOL = 103, /* no such volume or volume unavailable */ @@ -51,6 +54,9 @@ enum AFS_FS_Errors { VOVERQUOTA = 109, /* volume's maximum quota exceeded */ VBUSY = 110, /* volume is temporarily unavailable */ VMOVED = 111, /* volume moved to new server - ask this FS where */ + VIO = 112, /* I/O error in volume */ + VSALVAGING = 113, /* Volume is being salvaged */ + VRESTRICTED = 120, /* Volume is restricted from using */ }; #endif /* AFS_FS_H */ diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h index 800f607ffaf5..e3c4688f573b 100644 --- a/fs/afs/afs_vl.h +++ b/fs/afs/afs_vl.h @@ -16,11 +16,17 @@ #define AFS_VL_PORT 7003 /* volume location service port */ #define VL_SERVICE 52 /* RxRPC service ID for the Volume Location service */ +#define YFS_VL_SERVICE 2503 /* Service ID for AuriStor upgraded VL service */ enum AFSVL_Operations { - VLGETENTRYBYID = 503, /* AFS Get Cache Entry By ID operation ID */ - VLGETENTRYBYNAME = 504, /* AFS Get Cache Entry By Name operation ID */ - VLPROBE = 514, /* AFS Probe Volume Location Service operation ID */ + VLGETENTRYBYID = 503, /* AFS Get VLDB entry by ID */ + VLGETENTRYBYNAME = 504, /* AFS Get VLDB entry by name */ + VLPROBE = 514, /* AFS probe VL service */ + VLGETENTRYBYIDU = 526, /* AFS Get VLDB entry by ID (UUID-variant) */ + VLGETENTRYBYNAMEU = 527, /* AFS Get VLDB entry by name (UUID-variant) */ + VLGETADDRSU = 533, /* AFS Get addrs for fileserver */ + YVLGETENDPOINTS = 64002, /* YFS Get endpoints for file/volume server */ + VLGETCAPABILITIES = 65537, /* AFS Get server capabilities */ }; enum AFSVL_Errors { @@ -54,6 +60,19 @@ enum AFSVL_Errors { AFSVL_NOMEM = 363547, /* malloc/realloc failed to alloc enough memory */ }; +enum { + YFS_SERVER_INDEX = 0, + YFS_SERVER_UUID = 1, + YFS_SERVER_ENDPOINT = 2, +}; + +enum { + YFS_ENDPOINT_IPV4 = 0, + YFS_ENDPOINT_IPV6 = 1, +}; + +#define YFS_MAXENDPOINTS 16 + /* * maps to "struct vldbentry" in vvl-spec.pdf */ @@ -74,11 +93,57 @@ struct afs_vldbentry { struct in_addr addr; /* server address */ unsigned partition; /* partition ID on this server */ unsigned flags; /* server specific flags */ -#define AFS_VLSF_NEWREPSITE 0x0001 /* unused */ +#define AFS_VLSF_NEWREPSITE 0x0001 /* Ignore all 'non-new' servers */ #define AFS_VLSF_ROVOL 0x0002 /* this server holds a R/O instance of the volume */ #define AFS_VLSF_RWVOL 0x0004 /* this server holds a R/W instance of the volume */ #define AFS_VLSF_BACKVOL 0x0008 /* this server holds a backup instance of the volume */ +#define AFS_VLSF_UUID 0x0010 /* This server is referred to by its UUID */ +#define AFS_VLSF_DONTUSE 0x0020 /* This server ref should be ignored */ } servers[8]; }; +#define AFS_VLDB_MAXNAMELEN 65 + + +struct afs_ListAddrByAttributes__xdr { + __be32 Mask; +#define AFS_VLADDR_IPADDR 0x1 /* Match by ->ipaddr */ +#define AFS_VLADDR_INDEX 0x2 /* Match by ->index */ +#define AFS_VLADDR_UUID 0x4 /* Match by ->uuid */ + __be32 ipaddr; + __be32 index; + __be32 spare; + struct afs_uuid__xdr uuid; +}; + +struct afs_uvldbentry__xdr { + __be32 name[AFS_VLDB_MAXNAMELEN]; + __be32 nServers; + struct afs_uuid__xdr serverNumber[AFS_NMAXNSERVERS]; + __be32 serverUnique[AFS_NMAXNSERVERS]; + __be32 serverPartition[AFS_NMAXNSERVERS]; + __be32 serverFlags[AFS_NMAXNSERVERS]; + __be32 volumeId[AFS_MAXTYPES]; + __be32 cloneId; + __be32 flags; + __be32 spares1; + __be32 spares2; + __be32 spares3; + __be32 spares4; + __be32 spares5; + __be32 spares6; + __be32 spares7; + __be32 spares8; + __be32 spares9; +}; + +struct afs_address_list { + refcount_t usage; + unsigned int version; + unsigned int nr_addrs; + struct sockaddr_rxrpc addrs[]; +}; + +extern void afs_put_address_list(struct afs_address_list *alist); + #endif /* AFS_VL_H */ diff --git a/fs/afs/cache.c b/fs/afs/cache.c index 1fe855191261..f62ff71d28c9 100644 --- a/fs/afs/cache.c +++ b/fs/afs/cache.c @@ -14,19 +14,6 @@ static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, void *buffer, uint16_t buflen); -static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen); - -static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static enum fscache_checkaux afs_vlocation_cache_check_aux( - void *cookie_netfs_data, const void *buffer, uint16_t buflen); - static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, void *buffer, uint16_t buflen); @@ -42,23 +29,13 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, struct fscache_netfs afs_cache_netfs = { .name = "afs", - .version = 0, + .version = 1, }; struct fscache_cookie_def afs_cell_cache_index_def = { .name = "AFS.cell", .type = FSCACHE_COOKIE_TYPE_INDEX, .get_key = afs_cell_cache_get_key, - .get_aux = afs_cell_cache_get_aux, - .check_aux = afs_cell_cache_check_aux, -}; - -struct fscache_cookie_def afs_vlocation_cache_index_def = { - .name = "AFS.vldb", - .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = afs_vlocation_cache_get_key, - .get_aux = afs_vlocation_cache_get_aux, - .check_aux = afs_vlocation_cache_check_aux, }; struct fscache_cookie_def afs_volume_cache_index_def = { @@ -95,150 +72,26 @@ static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, return klen; } -/* - * provide new auxiliary cache data - */ -static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_cell *cell = cookie_netfs_data; - uint16_t dlen; - - _enter("%p,%p,%u", cell, buffer, bufmax); - - dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]); - dlen = min(dlen, bufmax); - dlen &= ~(sizeof(cell->vl_addrs[0]) - 1); - - memcpy(buffer, cell->vl_addrs, dlen); - return dlen; -} - -/* - * check that the auxiliary data indicates that the entry is still valid - */ -static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen) -{ - _leave(" = OKAY"); - return FSCACHE_CHECKAUX_OKAY; -} - -/*****************************************************************************/ -/* - * set the key for the index entry - */ -static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_vlocation *vlocation = cookie_netfs_data; - uint16_t klen; - - _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); - - klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name)); - if (klen > bufmax) - return 0; - - memcpy(buffer, vlocation->vldb.name, klen); - - _leave(" = %u", klen); - return klen; -} - -/* - * provide new auxiliary cache data - */ -static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_vlocation *vlocation = cookie_netfs_data; - uint16_t dlen; - - _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); - - dlen = sizeof(struct afs_cache_vlocation); - dlen -= offsetof(struct afs_cache_vlocation, nservers); - if (dlen > bufmax) - return 0; - - memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen); - - _leave(" = %u", dlen); - return dlen; -} - -/* - * check that the auxiliary data indicates that the entry is still valid - */ -static -enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen) -{ - const struct afs_cache_vlocation *cvldb; - struct afs_vlocation *vlocation = cookie_netfs_data; - uint16_t dlen; - - _enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen); - - /* check the size of the data is what we're expecting */ - dlen = sizeof(struct afs_cache_vlocation); - dlen -= offsetof(struct afs_cache_vlocation, nservers); - if (dlen != buflen) - return FSCACHE_CHECKAUX_OBSOLETE; - - cvldb = container_of(buffer, struct afs_cache_vlocation, nservers); - - /* if what's on disk is more valid than what's in memory, then use the - * VL record from the cache */ - if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) { - memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen); - vlocation->valid = 1; - _leave(" = SUCCESS [c->m]"); - return FSCACHE_CHECKAUX_OKAY; - } - - /* need to update the cache if the cached info differs */ - if (memcmp(&vlocation->vldb, buffer, dlen) != 0) { - /* delete if the volume IDs for this name differ */ - if (memcmp(&vlocation->vldb.vid, &cvldb->vid, - sizeof(cvldb->vid)) != 0 - ) { - _leave(" = OBSOLETE"); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - _leave(" = UPDATE"); - return FSCACHE_CHECKAUX_NEEDS_UPDATE; - } - - _leave(" = OKAY"); - return FSCACHE_CHECKAUX_OKAY; -} - /*****************************************************************************/ /* * set the key for the volume index entry */ static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) + void *buffer, uint16_t bufmax) { const struct afs_volume *volume = cookie_netfs_data; - uint16_t klen; + struct { + u64 volid; + } __packed key; _enter("{%u},%p,%u", volume->type, buffer, bufmax); - klen = sizeof(volume->type); - if (klen > bufmax) + if (bufmax < sizeof(key)) return 0; - memcpy(buffer, &volume->type, sizeof(volume->type)); - - _leave(" = %u", klen); - return klen; - + key.volid = volume->vid; + memcpy(buffer, &key, sizeof(key)); + return sizeof(key); } /*****************************************************************************/ @@ -249,20 +102,25 @@ static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, void *buffer, uint16_t bufmax) { const struct afs_vnode *vnode = cookie_netfs_data; - uint16_t klen; + struct { + u32 vnode_id[3]; + } __packed key; _enter("{%x,%x,%llx},%p,%u", vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, buffer, bufmax); - klen = sizeof(vnode->fid.vnode); - if (klen > bufmax) - return 0; + /* Allow for a 96-bit key */ + memset(&key, 0, sizeof(key)); + key.vnode_id[0] = vnode->fid.vnode; + key.vnode_id[1] = 0; + key.vnode_id[2] = 0; - memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode)); + if (sizeof(key) > bufmax) + return 0; - _leave(" = %u", klen); - return klen; + memcpy(buffer, &key, sizeof(key)); + return sizeof(key); } /* @@ -280,6 +138,11 @@ static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, *size = vnode->status.size; } +struct afs_vnode_cache_aux { + u64 data_version; + u32 fid_unique; +} __packed; + /* * provide new auxiliary cache data */ @@ -287,23 +150,21 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, void *buffer, uint16_t bufmax) { const struct afs_vnode *vnode = cookie_netfs_data; - uint16_t dlen; + struct afs_vnode_cache_aux aux; _enter("{%x,%x,%Lx},%p,%u", vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, buffer, bufmax); - dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); - if (dlen > bufmax) - return 0; + memset(&aux, 0, sizeof(aux)); + aux.data_version = vnode->status.data_version; + aux.fid_unique = vnode->fid.unique; - memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique)); - buffer += sizeof(vnode->fid.unique); - memcpy(buffer, &vnode->status.data_version, - sizeof(vnode->status.data_version)); + if (bufmax < sizeof(aux)) + return 0; - _leave(" = %u", dlen); - return dlen; + memcpy(buffer, &aux, sizeof(aux)); + return sizeof(aux); } /* @@ -314,43 +175,29 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, uint16_t buflen) { struct afs_vnode *vnode = cookie_netfs_data; - uint16_t dlen; + struct afs_vnode_cache_aux aux; _enter("{%x,%x,%llx},%p,%u", vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, buffer, buflen); + memcpy(&aux, buffer, sizeof(aux)); + /* check the size of the data is what we're expecting */ - dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); - if (dlen != buflen) { - _leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen); + if (buflen != sizeof(aux)) { + _leave(" = OBSOLETE [len %hx != %zx]", buflen, sizeof(aux)); return FSCACHE_CHECKAUX_OBSOLETE; } - if (memcmp(buffer, - &vnode->fid.unique, - sizeof(vnode->fid.unique) - ) != 0) { - unsigned unique; - - memcpy(&unique, buffer, sizeof(unique)); - + if (vnode->fid.unique != aux.fid_unique) { _leave(" = OBSOLETE [uniq %x != %x]", - unique, vnode->fid.unique); + aux.fid_unique, vnode->fid.unique); return FSCACHE_CHECKAUX_OBSOLETE; } - if (memcmp(buffer + sizeof(vnode->fid.unique), - &vnode->status.data_version, - sizeof(vnode->status.data_version) - ) != 0) { - afs_dataversion_t version; - - memcpy(&version, buffer + sizeof(vnode->fid.unique), - sizeof(version)); - + if (vnode->status.data_version != aux.data_version) { _leave(" = OBSOLETE [vers %llx != %llx]", - version, vnode->status.data_version); + aux.data_version, vnode->status.data_version); return FSCACHE_CHECKAUX_OBSOLETE; } diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 25d404d22cae..f4291b576054 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -20,118 +20,151 @@ #include <linux/sched.h> #include "internal.h" -#if 0 -unsigned afs_vnode_update_timeout = 10; -#endif /* 0 */ - -#define afs_breakring_space(server) \ - CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail, \ - ARRAY_SIZE((server)->cb_break)) - -//static void afs_callback_updater(struct work_struct *); - -static struct workqueue_struct *afs_callback_update_worker; - /* - * allow the fileserver to request callback state (re-)initialisation + * Set up an interest-in-callbacks record for a volume on a server and + * register it with the server. + * - Called with volume->server_sem held. */ -void afs_init_callback_state(struct afs_server *server) +int afs_register_server_cb_interest(struct afs_vnode *vnode, + struct afs_server_entry *entry) { - struct afs_vnode *vnode; - - _enter("{%p}", server); + struct afs_cb_interest *cbi = entry->cb_interest, *vcbi, *new, *x; + struct afs_server *server = entry->server; + +again: + vcbi = vnode->cb_interest; + if (vcbi) { + if (vcbi == cbi) + return 0; + + if (cbi && vcbi->server == cbi->server) { + write_seqlock(&vnode->cb_lock); + vnode->cb_interest = afs_get_cb_interest(cbi); + write_sequnlock(&vnode->cb_lock); + afs_put_cb_interest(afs_v2net(vnode), cbi); + return 0; + } - spin_lock(&server->cb_lock); + if (!cbi && vcbi->server == server) { + afs_get_cb_interest(vcbi); + x = cmpxchg(&entry->cb_interest, cbi, vcbi); + if (x != cbi) { + cbi = x; + afs_put_cb_interest(afs_v2net(vnode), vcbi); + goto again; + } + return 0; + } + } - /* kill all the promises on record from this server */ - while (!RB_EMPTY_ROOT(&server->cb_promises)) { - vnode = rb_entry(server->cb_promises.rb_node, - struct afs_vnode, cb_promise); - _debug("UNPROMISE { vid=%x:%u uq=%u}", - vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; + if (!cbi) { + new = kzalloc(sizeof(struct afs_cb_interest), GFP_KERNEL); + if (!new) + return -ENOMEM; + + refcount_set(&new->usage, 1); + new->sb = vnode->vfs_inode.i_sb; + new->vid = vnode->volume->vid; + new->server = afs_get_server(server); + INIT_LIST_HEAD(&new->cb_link); + + write_lock(&server->cb_break_lock); + list_add_tail(&new->cb_link, &server->cb_interests); + write_unlock(&server->cb_break_lock); + + x = cmpxchg(&entry->cb_interest, cbi, new); + if (x == cbi) { + cbi = new; + } else { + cbi = x; + afs_put_cb_interest(afs_v2net(vnode), new); + } } - spin_unlock(&server->cb_lock); - _leave(""); + ASSERT(cbi); + + /* Change the server the vnode is using. This entails scrubbing any + * interest the vnode had in the previous server it was using. + */ + write_seqlock(&vnode->cb_lock); + + vnode->cb_interest = afs_get_cb_interest(cbi); + vnode->cb_s_break = cbi->server->cb_s_break; + clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + + write_sequnlock(&vnode->cb_lock); + return 0; } /* - * handle the data invalidation side of a callback being broken + * Set a vnode's interest on a server. */ -void afs_broken_callback_work(struct work_struct *work) +void afs_set_cb_interest(struct afs_vnode *vnode, struct afs_cb_interest *cbi) { - struct afs_vnode *vnode = - container_of(work, struct afs_vnode, cb_broken_work); + struct afs_cb_interest *old_cbi = NULL; - _enter(""); - - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + if (vnode->cb_interest == cbi) return; - /* we're only interested in dealing with a broken callback on *this* - * vnode and only if no-one else has dealt with it yet */ - if (!mutex_trylock(&vnode->validate_lock)) - return; /* someone else is dealing with it */ - - if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { - if (S_ISDIR(vnode->vfs_inode.i_mode)) - afs_clear_permits(vnode); - - if (afs_vnode_fetch_status(vnode, NULL, NULL) < 0) - goto out; - - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - goto out; - - /* if the vnode's data version number changed then its contents - * are different */ - if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) - afs_zap_data(vnode); + write_seqlock(&vnode->cb_lock); + if (vnode->cb_interest != cbi) { + afs_get_cb_interest(cbi); + old_cbi = vnode->cb_interest; + vnode->cb_interest = cbi; } + write_sequnlock(&vnode->cb_lock); + afs_put_cb_interest(afs_v2net(vnode), cbi); +} -out: - mutex_unlock(&vnode->validate_lock); - - /* avoid the potential race whereby the mutex_trylock() in this - * function happens again between the clear_bit() and the - * mutex_unlock() */ - if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { - _debug("requeue"); - queue_work(afs_callback_update_worker, &vnode->cb_broken_work); +/* + * Remove an interest on a server. + */ +void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi) +{ + if (cbi && refcount_dec_and_test(&cbi->usage)) { + if (!list_empty(&cbi->cb_link)) { + write_lock(&cbi->server->cb_break_lock); + list_del_init(&cbi->cb_link); + write_unlock(&cbi->server->cb_break_lock); + afs_put_server(net, cbi->server); + } + kfree(cbi); } - _leave(""); +} + +/* + * allow the fileserver to request callback state (re-)initialisation + */ +void afs_init_callback_state(struct afs_server *server) +{ + if (!test_and_clear_bit(AFS_SERVER_FL_NEW, &server->flags)) + server->cb_s_break++; } /* * actually break a callback */ -static void afs_break_callback(struct afs_server *server, - struct afs_vnode *vnode) +void afs_break_callback(struct afs_vnode *vnode) { _enter(""); - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + write_seqlock(&vnode->cb_lock); + + if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + vnode->cb_break++; + afs_clear_permits(vnode); - if (vnode->cb_promised) { spin_lock(&vnode->lock); _debug("break callback"); - spin_lock(&server->cb_lock); - if (vnode->cb_promised) { - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; - } - spin_unlock(&server->cb_lock); - - queue_work(afs_callback_update_worker, &vnode->cb_broken_work); if (list_empty(&vnode->granted_locks) && !list_empty(&vnode->pending_locks)) afs_lock_may_be_available(vnode); spin_unlock(&vnode->lock); } + + write_sequnlock(&vnode->cb_lock); } /* @@ -143,49 +176,31 @@ static void afs_break_callback(struct afs_server *server, static void afs_break_one_callback(struct afs_server *server, struct afs_fid *fid) { + struct afs_cb_interest *cbi; + struct afs_iget_data data; struct afs_vnode *vnode; - struct rb_node *p; - - _debug("find"); - spin_lock(&server->fs_lock); - p = server->fs_vnodes.rb_node; - while (p) { - vnode = rb_entry(p, struct afs_vnode, server_rb); - if (fid->vid < vnode->fid.vid) - p = p->rb_left; - else if (fid->vid > vnode->fid.vid) - p = p->rb_right; - else if (fid->vnode < vnode->fid.vnode) - p = p->rb_left; - else if (fid->vnode > vnode->fid.vnode) - p = p->rb_right; - else if (fid->unique < vnode->fid.unique) - p = p->rb_left; - else if (fid->unique > vnode->fid.unique) - p = p->rb_right; - else - goto found; - } - - /* not found so we just ignore it (it may have moved to another - * server) */ -not_available: - _debug("not avail"); - spin_unlock(&server->fs_lock); - _leave(""); - return; + struct inode *inode; -found: - _debug("found"); - ASSERTCMP(server, ==, vnode->server); + read_lock(&server->cb_break_lock); - if (!igrab(AFS_VNODE_TO_I(vnode))) - goto not_available; - spin_unlock(&server->fs_lock); + /* Step through all interested superblocks. There may be more than one + * because of cell aliasing. + */ + list_for_each_entry(cbi, &server->cb_interests, cb_link) { + if (cbi->vid != fid->vid) + continue; + + data.volume = NULL; + data.fid = *fid; + inode = ilookup5_nowait(cbi->sb, fid->vnode, afs_iget5_test, &data); + if (inode) { + vnode = AFS_FS_I(inode); + afs_break_callback(vnode); + iput(inode); + } + } - afs_break_callback(server, vnode); - iput(&vnode->vfs_inode); - _leave(""); + read_unlock(&server->cb_break_lock); } /* @@ -216,261 +231,14 @@ void afs_break_callbacks(struct afs_server *server, size_t count, } /* - * record the callback for breaking - * - the caller must hold server->cb_lock + * Clear the callback interests in a server list. */ -static void afs_do_give_up_callback(struct afs_server *server, - struct afs_vnode *vnode) +void afs_clear_callback_interests(struct afs_net *net, struct afs_server_list *slist) { - struct afs_callback *cb; - - _enter("%p,%p", server, vnode); - - cb = &server->cb_break[server->cb_break_head]; - cb->fid = vnode->fid; - cb->version = vnode->cb_version; - cb->expiry = vnode->cb_expiry; - cb->type = vnode->cb_type; - smp_wmb(); - server->cb_break_head = - (server->cb_break_head + 1) & - (ARRAY_SIZE(server->cb_break) - 1); - - /* defer the breaking of callbacks to try and collect as many as - * possible to ship in one operation */ - switch (atomic_inc_return(&server->cb_break_n)) { - case 1 ... AFSCBMAX - 1: - queue_delayed_work(afs_callback_update_worker, - &server->cb_break_work, HZ * 2); - break; - case AFSCBMAX: - afs_flush_callback_breaks(server); - break; - default: - break; - } - - ASSERT(server->cb_promises.rb_node != NULL); - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; - _leave(""); -} - -/* - * discard the callback on a deleted item - */ -void afs_discard_callback_on_delete(struct afs_vnode *vnode) -{ - struct afs_server *server = vnode->server; + int i; - _enter("%d", vnode->cb_promised); - - if (!vnode->cb_promised) { - _leave(" [not promised]"); - return; - } - - ASSERT(server != NULL); - - spin_lock(&server->cb_lock); - if (vnode->cb_promised) { - ASSERT(server->cb_promises.rb_node != NULL); - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; + for (i = 0; i < slist->nr_servers; i++) { + afs_put_cb_interest(net, slist->servers[i].cb_interest); + slist->servers[i].cb_interest = NULL; } - spin_unlock(&server->cb_lock); - _leave(""); -} - -/* - * give up the callback registered for a vnode on the file server when the - * inode is being cleared - */ -void afs_give_up_callback(struct afs_vnode *vnode) -{ - struct afs_server *server = vnode->server; - - DECLARE_WAITQUEUE(myself, current); - - _enter("%d", vnode->cb_promised); - - _debug("GIVE UP INODE %p", &vnode->vfs_inode); - - if (!vnode->cb_promised) { - _leave(" [not promised]"); - return; - } - - ASSERT(server != NULL); - - spin_lock(&server->cb_lock); - if (vnode->cb_promised && afs_breakring_space(server) == 0) { - add_wait_queue(&server->cb_break_waitq, &myself); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!vnode->cb_promised || - afs_breakring_space(server) != 0) - break; - spin_unlock(&server->cb_lock); - schedule(); - spin_lock(&server->cb_lock); - } - remove_wait_queue(&server->cb_break_waitq, &myself); - __set_current_state(TASK_RUNNING); - } - - /* of course, it's always possible for the server to break this vnode's - * callback first... */ - if (vnode->cb_promised) - afs_do_give_up_callback(server, vnode); - - spin_unlock(&server->cb_lock); - _leave(""); -} - -/* - * dispatch a deferred give up callbacks operation - */ -void afs_dispatch_give_up_callbacks(struct work_struct *work) -{ - struct afs_server *server = - container_of(work, struct afs_server, cb_break_work.work); - - _enter(""); - - /* tell the fileserver to discard the callback promises it has - * - in the event of ENOMEM or some other error, we just forget that we - * had callbacks entirely, and the server will call us later to break - * them - */ - afs_fs_give_up_callbacks(server, true); -} - -/* - * flush the outstanding callback breaks on a server - */ -void afs_flush_callback_breaks(struct afs_server *server) -{ - mod_delayed_work(afs_callback_update_worker, &server->cb_break_work, 0); -} - -#if 0 -/* - * update a bunch of callbacks - */ -static void afs_callback_updater(struct work_struct *work) -{ - struct afs_server *server; - struct afs_vnode *vnode, *xvnode; - time64_t now; - long timeout; - int ret; - - server = container_of(work, struct afs_server, updater); - - _enter(""); - - now = ktime_get_real_seconds(); - - /* find the first vnode to update */ - spin_lock(&server->cb_lock); - for (;;) { - if (RB_EMPTY_ROOT(&server->cb_promises)) { - spin_unlock(&server->cb_lock); - _leave(" [nothing]"); - return; - } - - vnode = rb_entry(rb_first(&server->cb_promises), - struct afs_vnode, cb_promise); - if (atomic_read(&vnode->usage) > 0) - break; - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; - } - - timeout = vnode->update_at - now; - if (timeout > 0) { - queue_delayed_work(afs_vnode_update_worker, - &afs_vnode_update, timeout * HZ); - spin_unlock(&server->cb_lock); - _leave(" [nothing]"); - return; - } - - list_del_init(&vnode->update); - atomic_inc(&vnode->usage); - spin_unlock(&server->cb_lock); - - /* we can now perform the update */ - _debug("update %s", vnode->vldb.name); - vnode->state = AFS_VL_UPDATING; - vnode->upd_rej_cnt = 0; - vnode->upd_busy_cnt = 0; - - ret = afs_vnode_update_record(vl, &vldb); - switch (ret) { - case 0: - afs_vnode_apply_update(vl, &vldb); - vnode->state = AFS_VL_UPDATING; - break; - case -ENOMEDIUM: - vnode->state = AFS_VL_VOLUME_DELETED; - break; - default: - vnode->state = AFS_VL_UNCERTAIN; - break; - } - - /* and then reschedule */ - _debug("reschedule"); - vnode->update_at = ktime_get_real_seconds() + - afs_vnode_update_timeout; - - spin_lock(&server->cb_lock); - - if (!list_empty(&server->cb_promises)) { - /* next update in 10 minutes, but wait at least 1 second more - * than the newest record already queued so that we don't spam - * the VL server suddenly with lots of requests - */ - xvnode = list_entry(server->cb_promises.prev, - struct afs_vnode, update); - if (vnode->update_at <= xvnode->update_at) - vnode->update_at = xvnode->update_at + 1; - xvnode = list_entry(server->cb_promises.next, - struct afs_vnode, update); - timeout = xvnode->update_at - now; - if (timeout < 0) - timeout = 0; - } else { - timeout = afs_vnode_update_timeout; - } - - list_add_tail(&vnode->update, &server->cb_promises); - - _debug("timeout %ld", timeout); - queue_delayed_work(afs_vnode_update_worker, - &afs_vnode_update, timeout * HZ); - spin_unlock(&server->cb_lock); - afs_put_vnode(vl); -} -#endif - -/* - * initialise the callback update process - */ -int __init afs_callback_update_init(void) -{ - afs_callback_update_worker = alloc_ordered_workqueue("kafs_callbackd", - WQ_MEM_RECLAIM); - return afs_callback_update_worker ? 0 : -ENOMEM; -} - -/* - * shut down the callback update process - */ -void afs_callback_update_kill(void) -{ - destroy_workqueue(afs_callback_update_worker); } diff --git a/fs/afs/cell.c b/fs/afs/cell.c index ca0a3cf93791..9bb921d120d0 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -1,6 +1,6 @@ /* AFS cell and server record management * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2002, 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -9,213 +9,291 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/module.h> #include <linux/slab.h> #include <linux/key.h> #include <linux/ctype.h> #include <linux/dns_resolver.h> #include <linux/sched.h> +#include <linux/inet.h> #include <keys/rxrpc-type.h> #include "internal.h" -DECLARE_RWSEM(afs_proc_cells_sem); -LIST_HEAD(afs_proc_cells); +unsigned __read_mostly afs_cell_gc_delay = 10; -static LIST_HEAD(afs_cells); -static DEFINE_RWLOCK(afs_cells_lock); -static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */ -static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq); -static struct afs_cell *afs_cell_root; +static void afs_manage_cell(struct work_struct *); + +static void afs_dec_cells_outstanding(struct afs_net *net) +{ + if (atomic_dec_and_test(&net->cells_outstanding)) + wake_up_atomic_t(&net->cells_outstanding); +} /* - * allocate a cell record and fill in its name, VL server address list and - * allocate an anonymous key + * Set the cell timer to fire after a given delay, assuming it's not already + * set for an earlier time. */ -static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen, - char *vllist) +static void afs_set_cell_timer(struct afs_net *net, time64_t delay) { - struct afs_cell *cell; - struct key *key; - char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next; - char *dvllist = NULL, *_vllist = NULL; - char delimiter = ':'; - int ret; + if (net->live) { + atomic_inc(&net->cells_outstanding); + if (timer_reduce(&net->cells_timer, jiffies + delay * HZ)) + afs_dec_cells_outstanding(net); + } +} - _enter("%*.*s,%s", namelen, namelen, name ?: "", vllist); +/* + * Look up and get an activation reference on a cell record under RCU + * conditions. The caller must hold the RCU read lock. + */ +struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, + const char *name, unsigned int namesz) +{ + struct afs_cell *cell = NULL; + struct rb_node *p; + int n, seq = 0, ret = 0; - BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */ + _enter("%*.*s", namesz, namesz, name); - if (namelen > AFS_MAXCELLNAME) { - _leave(" = -ENAMETOOLONG"); + if (name && namesz == 0) + return ERR_PTR(-EINVAL); + if (namesz > AFS_MAXCELLNAME) return ERR_PTR(-ENAMETOOLONG); - } - /* allocate and initialise a cell record */ - cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL); - if (!cell) { - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } + do { + /* Unfortunately, rbtree walking doesn't give reliable results + * under just the RCU read lock, so we have to check for + * changes. + */ + if (cell) + afs_put_cell(net, cell); + cell = NULL; + ret = -ENOENT; - memcpy(cell->name, name, namelen); - cell->name[namelen] = 0; - - atomic_set(&cell->usage, 1); - INIT_LIST_HEAD(&cell->link); - rwlock_init(&cell->servers_lock); - INIT_LIST_HEAD(&cell->servers); - init_rwsem(&cell->vl_sem); - INIT_LIST_HEAD(&cell->vl_list); - spin_lock_init(&cell->vl_lock); - - /* if the ip address is invalid, try dns query */ - if (!vllist || strlen(vllist) < 7) { - ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL); - if (ret < 0) { - if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY) - /* translate these errors into something - * userspace might understand */ - ret = -EDESTADDRREQ; - _leave(" = %d", ret); - return ERR_PTR(ret); + read_seqbegin_or_lock(&net->cells_lock, &seq); + + if (!name) { + cell = rcu_dereference_raw(net->ws_cell); + if (cell) { + afs_get_cell(cell); + continue; + } + ret = -EDESTADDRREQ; + continue; } - _vllist = dvllist; - /* change the delimiter for user-space reply */ - delimiter = ','; + p = rcu_dereference_raw(net->cells.rb_node); + while (p) { + cell = rb_entry(p, struct afs_cell, net_node); + + n = strncasecmp(cell->name, name, + min_t(size_t, cell->name_len, namesz)); + if (n == 0) + n = cell->name_len - namesz; + if (n < 0) { + p = rcu_dereference_raw(p->rb_left); + } else if (n > 0) { + p = rcu_dereference_raw(p->rb_right); + } else { + if (atomic_inc_not_zero(&cell->usage)) { + ret = 0; + break; + } + /* We want to repeat the search, this time with + * the lock properly locked. + */ + } + cell = NULL; + } - } else { - _vllist = vllist; - } + } while (need_seqretry(&net->cells_lock, seq)); - /* fill in the VL server list from the rest of the string */ - do { - unsigned a, b, c, d; + done_seqretry(&net->cells_lock, seq); - next = strchr(_vllist, delimiter); - if (next) - *next++ = 0; + return ret == 0 ? cell : ERR_PTR(ret); +} - if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) - goto bad_address; +/* + * Set up a cell record and fill in its name, VL server address list and + * allocate an anonymous key + */ +static struct afs_cell *afs_alloc_cell(struct afs_net *net, + const char *name, unsigned int namelen, + const char *vllist) +{ + struct afs_cell *cell; + int i, ret; - if (a > 255 || b > 255 || c > 255 || d > 255) - goto bad_address; + ASSERT(name); + if (namelen == 0) + return ERR_PTR(-EINVAL); + if (namelen > AFS_MAXCELLNAME) { + _leave(" = -ENAMETOOLONG"); + return ERR_PTR(-ENAMETOOLONG); + } - cell->vl_addrs[cell->vl_naddrs++].s_addr = - htonl((a << 24) | (b << 16) | (c << 8) | d); + _enter("%*.*s,%s", namelen, namelen, name, vllist); - } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next)); + cell = kzalloc(sizeof(struct afs_cell), GFP_KERNEL); + if (!cell) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } - /* create a key to represent an anonymous user */ - memcpy(keyname, "afs@", 4); - dp = keyname + 4; - cp = cell->name; - do { - *dp++ = toupper(*cp); - } while (*cp++); + cell->net = net; + cell->name_len = namelen; + for (i = 0; i < namelen; i++) + cell->name[i] = tolower(name[i]); + + atomic_set(&cell->usage, 2); + INIT_WORK(&cell->manager, afs_manage_cell); + cell->flags = ((1 << AFS_CELL_FL_NOT_READY) | + (1 << AFS_CELL_FL_NO_LOOKUP_YET)); + INIT_LIST_HEAD(&cell->proc_volumes); + rwlock_init(&cell->proc_lock); + rwlock_init(&cell->vl_addrs_lock); + + /* Fill in the VL server list if we were given a list of addresses to + * use. + */ + if (vllist) { + struct afs_addr_list *alist; + + alist = afs_parse_text_addrs(vllist, strlen(vllist), ':', + VL_SERVICE, AFS_VL_PORT); + if (IS_ERR(alist)) { + ret = PTR_ERR(alist); + goto parse_failed; + } - key = rxrpc_get_null_key(keyname); - if (IS_ERR(key)) { - _debug("no key"); - ret = PTR_ERR(key); - goto error; + rcu_assign_pointer(cell->vl_addrs, alist); + cell->dns_expiry = TIME64_MAX; } - cell->anonymous_key = key; - - _debug("anon key %p{%x}", - cell->anonymous_key, key_serial(cell->anonymous_key)); _leave(" = %p", cell); return cell; -bad_address: - printk(KERN_ERR "kAFS: bad VL server IP address\n"); - ret = -EINVAL; -error: - key_put(cell->anonymous_key); - kfree(dvllist); +parse_failed: + if (ret == -EINVAL) + printk(KERN_ERR "kAFS: bad VL server IP address\n"); kfree(cell); _leave(" = %d", ret); return ERR_PTR(ret); } /* - * afs_cell_crate() - create a cell record - * @name: is the name of the cell. - * @namsesz: is the strlen of the cell name. - * @vllist: is a colon separated list of IP addresses in "a.b.c.d" format. - * @retref: is T to return the cell reference when the cell exists. + * afs_lookup_cell - Look up or create a cell record. + * @net: The network namespace + * @name: The name of the cell. + * @namesz: The strlen of the cell name. + * @vllist: A colon/comma separated list of numeric IP addresses or NULL. + * @excl: T if an error should be given if the cell name already exists. + * + * Look up a cell record by name and query the DNS for VL server addresses if + * needed. Note that that actual DNS query is punted off to the manager thread + * so that this function can return immediately if interrupted whilst allowing + * cell records to be shared even if not yet fully constructed. */ -struct afs_cell *afs_cell_create(const char *name, unsigned namesz, - char *vllist, bool retref) +struct afs_cell *afs_lookup_cell(struct afs_net *net, + const char *name, unsigned int namesz, + const char *vllist, bool excl) { - struct afs_cell *cell; - int ret; - - _enter("%*.*s,%s", namesz, namesz, name ?: "", vllist); + struct afs_cell *cell, *candidate, *cursor; + struct rb_node *parent, **pp; + int ret, n; + + _enter("%s,%s", name, vllist); + + if (!excl) { + rcu_read_lock(); + cell = afs_lookup_cell_rcu(net, name, namesz); + rcu_read_unlock(); + if (!IS_ERR(cell)) + goto wait_for_cell; + } - down_write(&afs_cells_sem); - read_lock(&afs_cells_lock); - list_for_each_entry(cell, &afs_cells, link) { - if (strncasecmp(cell->name, name, namesz) == 0) - goto duplicate_name; + /* Assume we're probably going to create a cell and preallocate and + * mostly set up a candidate record. We can then use this to stash the + * name, the net namespace and VL server addresses. + * + * We also want to do this before we hold any locks as it may involve + * upcalling to userspace to make DNS queries. + */ + candidate = afs_alloc_cell(net, name, namesz, vllist); + if (IS_ERR(candidate)) { + _leave(" = %ld", PTR_ERR(candidate)); + return candidate; } - read_unlock(&afs_cells_lock); - cell = afs_cell_alloc(name, namesz, vllist); - if (IS_ERR(cell)) { - _leave(" = %ld", PTR_ERR(cell)); - up_write(&afs_cells_sem); - return cell; + /* Find the insertion point and check to see if someone else added a + * cell whilst we were allocating. + */ + write_seqlock(&net->cells_lock); + + pp = &net->cells.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + cursor = rb_entry(parent, struct afs_cell, net_node); + + n = strncasecmp(cursor->name, name, + min_t(size_t, cursor->name_len, namesz)); + if (n == 0) + n = cursor->name_len - namesz; + if (n < 0) + pp = &(*pp)->rb_left; + else if (n > 0) + pp = &(*pp)->rb_right; + else + goto cell_already_exists; } - /* add a proc directory for this cell */ - ret = afs_proc_cell_setup(cell); - if (ret < 0) - goto error; + cell = candidate; + candidate = NULL; + rb_link_node_rcu(&cell->net_node, parent, pp); + rb_insert_color(&cell->net_node, &net->cells); + atomic_inc(&net->cells_outstanding); + write_sequnlock(&net->cells_lock); -#ifdef CONFIG_AFS_FSCACHE - /* put it up for caching (this never returns an error) */ - cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, - &afs_cell_cache_index_def, - cell, true); -#endif + queue_work(afs_wq, &cell->manager); - /* add to the cell lists */ - write_lock(&afs_cells_lock); - list_add_tail(&cell->link, &afs_cells); - write_unlock(&afs_cells_lock); +wait_for_cell: + _debug("wait_for_cell"); + ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NOT_READY, TASK_INTERRUPTIBLE); + smp_rmb(); - down_write(&afs_proc_cells_sem); - list_add_tail(&cell->proc_link, &afs_proc_cells); - up_write(&afs_proc_cells_sem); - up_write(&afs_cells_sem); + switch (READ_ONCE(cell->state)) { + case AFS_CELL_FAILED: + ret = cell->error; + goto error; + default: + _debug("weird %u %d", cell->state, cell->error); + goto error; + case AFS_CELL_ACTIVE: + break; + } - _leave(" = %p", cell); + _leave(" = %p [cell]", cell); return cell; +cell_already_exists: + _debug("cell exists"); + cell = cursor; + if (excl) { + ret = -EEXIST; + } else { + afs_get_cell(cursor); + ret = 0; + } + write_sequnlock(&net->cells_lock); + kfree(candidate); + if (ret == 0) + goto wait_for_cell; + goto error_noput; error: - up_write(&afs_cells_sem); - key_put(cell->anonymous_key); - kfree(cell); - _leave(" = %d", ret); + afs_put_cell(net, cell); +error_noput: + _leave(" = %d [error]", ret); return ERR_PTR(ret); - -duplicate_name: - if (retref && !IS_ERR(cell)) - afs_get_cell(cell); - - read_unlock(&afs_cells_lock); - up_write(&afs_cells_sem); - - if (retref) { - _leave(" = %p", cell); - return cell; - } - - _leave(" = -EEXIST"); - return ERR_PTR(-EEXIST); } /* @@ -223,10 +301,11 @@ duplicate_name: * - can be called with a module parameter string * - can be called from a write to /proc/fs/afs/rootcell */ -int afs_cell_init(char *rootcell) +int afs_cell_init(struct afs_net *net, const char *rootcell) { struct afs_cell *old_root, *new_root; - char *cp; + const char *cp, *vllist; + size_t len; _enter(""); @@ -239,222 +318,453 @@ int afs_cell_init(char *rootcell) } cp = strchr(rootcell, ':'); - if (!cp) + if (!cp) { _debug("kAFS: no VL server IP addresses specified"); - else - *cp++ = 0; + vllist = NULL; + len = strlen(rootcell); + } else { + vllist = cp + 1; + len = cp - rootcell; + } /* allocate a cell record for the root cell */ - new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false); + new_root = afs_lookup_cell(net, rootcell, len, vllist, false); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); return PTR_ERR(new_root); } + set_bit(AFS_CELL_FL_NO_GC, &new_root->flags); + afs_get_cell(new_root); + /* install the new cell */ - write_lock(&afs_cells_lock); - old_root = afs_cell_root; - afs_cell_root = new_root; - write_unlock(&afs_cells_lock); - afs_put_cell(old_root); + write_seqlock(&net->cells_lock); + old_root = net->ws_cell; + net->ws_cell = new_root; + write_sequnlock(&net->cells_lock); + afs_put_cell(net, old_root); _leave(" = 0"); return 0; } /* - * lookup a cell record + * Update a cell's VL server address list from the DNS. */ -struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz, - bool dns_cell) +static void afs_update_cell(struct afs_cell *cell) { - struct afs_cell *cell; - - _enter("\"%*.*s\",", namesz, namesz, name ?: ""); - - down_read(&afs_cells_sem); - read_lock(&afs_cells_lock); - - if (name) { - /* if the cell was named, look for it in the cell record list */ - list_for_each_entry(cell, &afs_cells, link) { - if (strncmp(cell->name, name, namesz) == 0) { - afs_get_cell(cell); - goto found; - } + struct afs_addr_list *alist, *old; + time64_t now, expiry; + + _enter("%s", cell->name); + + alist = afs_dns_query(cell, &expiry); + if (IS_ERR(alist)) { + switch (PTR_ERR(alist)) { + case -ENODATA: + /* The DNS said that the cell does not exist */ + set_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags); + clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags); + cell->dns_expiry = ktime_get_real_seconds() + 61; + break; + + case -EAGAIN: + case -ECONNREFUSED: + default: + set_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags); + cell->dns_expiry = ktime_get_real_seconds() + 10; + break; } - cell = ERR_PTR(-ENOENT); - if (dns_cell) - goto create_cell; - found: - ; + + cell->error = -EDESTADDRREQ; } else { - cell = afs_cell_root; - if (!cell) { - /* this should not happen unless user tries to mount - * when root cell is not set. Return an impossibly - * bizarre errno to alert the user. Things like - * ENOENT might be "more appropriate" but they happen - * for other reasons. - */ - cell = ERR_PTR(-EDESTADDRREQ); - } else { - afs_get_cell(cell); - } + clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags); + clear_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags); + + /* Exclusion on changing vl_addrs is achieved by a + * non-reentrant work item. + */ + old = rcu_dereference_protected(cell->vl_addrs, true); + rcu_assign_pointer(cell->vl_addrs, alist); + cell->dns_expiry = expiry; + if (old) + afs_put_addrlist(old); } - read_unlock(&afs_cells_lock); - up_read(&afs_cells_sem); - _leave(" = %p", cell); - return cell; + if (test_and_clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags)) + wake_up_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET); -create_cell: - read_unlock(&afs_cells_lock); - up_read(&afs_cells_sem); + now = ktime_get_real_seconds(); + afs_set_cell_timer(cell->net, cell->dns_expiry - now); + _leave(""); +} - cell = afs_cell_create(name, namesz, NULL, true); +/* + * Destroy a cell record + */ +static void afs_cell_destroy(struct rcu_head *rcu) +{ + struct afs_cell *cell = container_of(rcu, struct afs_cell, rcu); - _leave(" = %p", cell); - return cell; + _enter("%p{%s}", cell, cell->name); + + ASSERTCMP(atomic_read(&cell->usage), ==, 0); + + afs_put_addrlist(cell->vl_addrs); + key_put(cell->anonymous_key); + kfree(cell); + + _leave(" [destroyed]"); } -#if 0 /* - * try and get a cell record + * Queue the cell manager. */ -struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell) +static void afs_queue_cell_manager(struct afs_net *net) { - write_lock(&afs_cells_lock); + int outstanding = atomic_inc_return(&net->cells_outstanding); - if (cell && !list_empty(&cell->link)) - afs_get_cell(cell); - else - cell = NULL; + _enter("%d", outstanding); - write_unlock(&afs_cells_lock); - return cell; + if (!queue_work(afs_wq, &net->cells_manager)) + afs_dec_cells_outstanding(net); } -#endif /* 0 */ /* - * destroy a cell record + * Cell management timer. We have an increment on cells_outstanding that we + * need to pass along to the work item. */ -void afs_put_cell(struct afs_cell *cell) +void afs_cells_timer(struct timer_list *timer) { - if (!cell) - return; + struct afs_net *net = container_of(timer, struct afs_net, cells_timer); - _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); + _enter(""); + if (!queue_work(afs_wq, &net->cells_manager)) + afs_dec_cells_outstanding(net); +} - ASSERTCMP(atomic_read(&cell->usage), >, 0); +/* + * Get a reference on a cell record. + */ +struct afs_cell *afs_get_cell(struct afs_cell *cell) +{ + atomic_inc(&cell->usage); + return cell; +} - /* to prevent a race, the decrement and the dequeue must be effectively - * atomic */ - write_lock(&afs_cells_lock); +/* + * Drop a reference on a cell record. + */ +void afs_put_cell(struct afs_net *net, struct afs_cell *cell) +{ + time64_t now, expire_delay; - if (likely(!atomic_dec_and_test(&cell->usage))) { - write_unlock(&afs_cells_lock); - _leave(""); + if (!cell) return; - } - ASSERT(list_empty(&cell->servers)); - ASSERT(list_empty(&cell->vl_list)); + _enter("%s", cell->name); - write_unlock(&afs_cells_lock); + now = ktime_get_real_seconds(); + cell->last_inactive = now; + expire_delay = 0; + if (!test_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags) && + !test_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags)) + expire_delay = afs_cell_gc_delay; - wake_up(&afs_cells_freeable_wq); + if (atomic_dec_return(&cell->usage) > 1) + return; - _leave(" [unused]"); + /* 'cell' may now be garbage collected. */ + afs_set_cell_timer(net, expire_delay); } /* - * destroy a cell record - * - must be called with the afs_cells_sem write-locked - * - cell->link should have been broken by the caller + * Allocate a key to use as a placeholder for anonymous user security. */ -static void afs_cell_destroy(struct afs_cell *cell) +static int afs_alloc_anon_key(struct afs_cell *cell) { - _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); + struct key *key; + char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp; + + /* Create a key to represent an anonymous user. */ + memcpy(keyname, "afs@", 4); + dp = keyname + 4; + cp = cell->name; + do { + *dp++ = tolower(*cp); + } while (*cp++); - ASSERTCMP(atomic_read(&cell->usage), >=, 0); - ASSERT(list_empty(&cell->link)); + key = rxrpc_get_null_key(keyname); + if (IS_ERR(key)) + return PTR_ERR(key); - /* wait for everyone to stop using the cell */ - if (atomic_read(&cell->usage) > 0) { - DECLARE_WAITQUEUE(myself, current); + cell->anonymous_key = key; - _debug("wait for cell %s", cell->name); - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&afs_cells_freeable_wq, &myself); + _debug("anon key %p{%x}", + cell->anonymous_key, key_serial(cell->anonymous_key)); + return 0; +} - while (atomic_read(&cell->usage) > 0) { - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } +/* + * Activate a cell. + */ +static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) +{ + int ret; - remove_wait_queue(&afs_cells_freeable_wq, &myself); - set_current_state(TASK_RUNNING); + if (!cell->anonymous_key) { + ret = afs_alloc_anon_key(cell); + if (ret < 0) + return ret; } - _debug("cell dead"); - ASSERTCMP(atomic_read(&cell->usage), ==, 0); - ASSERT(list_empty(&cell->servers)); - ASSERT(list_empty(&cell->vl_list)); +#ifdef CONFIG_AFS_FSCACHE + cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, + &afs_cell_cache_index_def, + cell, true); +#endif + ret = afs_proc_cell_setup(net, cell); + if (ret < 0) + return ret; + spin_lock(&net->proc_cells_lock); + list_add_tail(&cell->proc_link, &net->proc_cells); + spin_unlock(&net->proc_cells_lock); + return 0; +} - afs_proc_cell_remove(cell); +/* + * Deactivate a cell. + */ +static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) +{ + _enter("%s", cell->name); + + afs_proc_cell_remove(net, cell); - down_write(&afs_proc_cells_sem); + spin_lock(&net->proc_cells_lock); list_del_init(&cell->proc_link); - up_write(&afs_proc_cells_sem); + spin_unlock(&net->proc_cells_lock); #ifdef CONFIG_AFS_FSCACHE fscache_relinquish_cookie(cell->cache, 0); + cell->cache = NULL; #endif - key_put(cell->anonymous_key); - kfree(cell); - _leave(" [destroyed]"); + _leave(""); } /* - * purge in-memory cell database on module unload or afs_init() failure - * - the timeout daemon is stopped before calling this + * Manage a cell record, initialising and destroying it, maintaining its DNS + * records. */ -void afs_cell_purge(void) +static void afs_manage_cell(struct work_struct *work) { - struct afs_cell *cell; + struct afs_cell *cell = container_of(work, struct afs_cell, manager); + struct afs_net *net = cell->net; + bool deleted; + int ret, usage; + + _enter("%s", cell->name); + +again: + _debug("state %u", cell->state); + switch (cell->state) { + case AFS_CELL_INACTIVE: + case AFS_CELL_FAILED: + write_seqlock(&net->cells_lock); + usage = 1; + deleted = atomic_try_cmpxchg_relaxed(&cell->usage, &usage, 0); + if (deleted) + rb_erase(&cell->net_node, &net->cells); + write_sequnlock(&net->cells_lock); + if (deleted) + goto final_destruction; + if (cell->state == AFS_CELL_FAILED) + goto done; + cell->state = AFS_CELL_UNSET; + goto again; + + case AFS_CELL_UNSET: + cell->state = AFS_CELL_ACTIVATING; + goto again; + + case AFS_CELL_ACTIVATING: + ret = afs_activate_cell(net, cell); + if (ret < 0) + goto activation_failed; + + cell->state = AFS_CELL_ACTIVE; + smp_wmb(); + clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags); + wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY); + goto again; + + case AFS_CELL_ACTIVE: + if (atomic_read(&cell->usage) > 1) { + time64_t now = ktime_get_real_seconds(); + if (cell->dns_expiry <= now && net->live) + afs_update_cell(cell); + goto done; + } + cell->state = AFS_CELL_DEACTIVATING; + goto again; + + case AFS_CELL_DEACTIVATING: + set_bit(AFS_CELL_FL_NOT_READY, &cell->flags); + if (atomic_read(&cell->usage) > 1) + goto reverse_deactivation; + afs_deactivate_cell(net, cell); + cell->state = AFS_CELL_INACTIVE; + goto again; + + default: + break; + } + _debug("bad state %u", cell->state); + BUG(); /* Unhandled state */ + +activation_failed: + cell->error = ret; + afs_deactivate_cell(net, cell); + + cell->state = AFS_CELL_FAILED; + smp_wmb(); + if (test_and_clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags)) + wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY); + goto again; + +reverse_deactivation: + cell->state = AFS_CELL_ACTIVE; + smp_wmb(); + clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags); + wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY); + _leave(" [deact->act]"); + return; + +done: + _leave(" [done %u]", cell->state); + return; + +final_destruction: + call_rcu(&cell->rcu, afs_cell_destroy); + afs_dec_cells_outstanding(net); + _leave(" [destruct %d]", atomic_read(&net->cells_outstanding)); +} + +/* + * Manage the records of cells known to a network namespace. This includes + * updating the DNS records and garbage collecting unused cells that were + * automatically added. + * + * Note that constructed cell records may only be removed from net->cells by + * this work item, so it is safe for this work item to stash a cursor pointing + * into the tree and then return to caller (provided it skips cells that are + * still under construction). + * + * Note also that we were given an increment on net->cells_outstanding by + * whoever queued us that we need to deal with before returning. + */ +void afs_manage_cells(struct work_struct *work) +{ + struct afs_net *net = container_of(work, struct afs_net, cells_manager); + struct rb_node *cursor; + time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX; + bool purging = !net->live; _enter(""); - afs_put_cell(afs_cell_root); + /* Trawl the cell database looking for cells that have expired from + * lack of use and cells whose DNS results have expired and dispatch + * their managers. + */ + read_seqlock_excl(&net->cells_lock); - down_write(&afs_cells_sem); + for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) { + struct afs_cell *cell = + rb_entry(cursor, struct afs_cell, net_node); + unsigned usage; + bool sched_cell = false; - while (!list_empty(&afs_cells)) { - cell = NULL; + usage = atomic_read(&cell->usage); + _debug("manage %s %u", cell->name, usage); + + ASSERTCMP(usage, >=, 1); - /* remove the next cell from the front of the list */ - write_lock(&afs_cells_lock); + if (purging) { + if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) + usage = atomic_dec_return(&cell->usage); + ASSERTCMP(usage, ==, 1); + } + + if (usage == 1) { + time64_t expire_at = cell->last_inactive; + + if (!test_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags) && + !test_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags)) + expire_at += afs_cell_gc_delay; + if (purging || expire_at <= now) + sched_cell = true; + else if (expire_at < next_manage) + next_manage = expire_at; + } - if (!list_empty(&afs_cells)) { - cell = list_entry(afs_cells.next, - struct afs_cell, link); - list_del_init(&cell->link); + if (!purging) { + if (cell->dns_expiry <= now) + sched_cell = true; + else if (cell->dns_expiry <= next_manage) + next_manage = cell->dns_expiry; } - write_unlock(&afs_cells_lock); + if (sched_cell) + queue_work(afs_wq, &cell->manager); + } + + read_sequnlock_excl(&net->cells_lock); - if (cell) { - _debug("PURGING CELL %s (%d)", - cell->name, atomic_read(&cell->usage)); + /* Update the timer on the way out. We have to pass an increment on + * cells_outstanding in the namespace that we are in to the timer or + * the work scheduler. + */ + if (!purging && next_manage < TIME64_MAX) { + now = ktime_get_real_seconds(); - /* now the cell should be left with no references */ - afs_cell_destroy(cell); + if (next_manage - now <= 0) { + if (queue_work(afs_wq, &net->cells_manager)) + atomic_inc(&net->cells_outstanding); + } else { + afs_set_cell_timer(net, next_manage - now); } } - up_write(&afs_cells_sem); + afs_dec_cells_outstanding(net); + _leave(" [%d]", atomic_read(&net->cells_outstanding)); +} + +/* + * Purge in-memory cell database. + */ +void afs_cell_purge(struct afs_net *net) +{ + struct afs_cell *ws; + + _enter(""); + + write_seqlock(&net->cells_lock); + ws = net->ws_cell; + net->ws_cell = NULL; + write_sequnlock(&net->cells_lock); + afs_put_cell(net, ws); + + _debug("del timer"); + if (del_timer_sync(&net->cells_timer)) + atomic_dec(&net->cells_outstanding); + + _debug("kick mgr"); + afs_queue_cell_manager(net); + + _debug("wait"); + wait_on_atomic_t(&net->cells_outstanding, atomic_t_wait, + TASK_UNINTERRUPTIBLE); _leave(""); } diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 782d4d05a53b..41e277f57b20 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -41,7 +41,6 @@ static CM_NAME(CallBack); static const struct afs_call_type afs_SRXCBCallBack = { .name = afs_SRXCBCallBack_name, .deliver = afs_deliver_cb_callback, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, .work = SRXAFSCB_CallBack, }; @@ -53,7 +52,6 @@ static CM_NAME(InitCallBackState); static const struct afs_call_type afs_SRXCBInitCallBackState = { .name = afs_SRXCBInitCallBackState_name, .deliver = afs_deliver_cb_init_call_back_state, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, .work = SRXAFSCB_InitCallBackState, }; @@ -65,7 +63,6 @@ static CM_NAME(InitCallBackState3); static const struct afs_call_type afs_SRXCBInitCallBackState3 = { .name = afs_SRXCBInitCallBackState3_name, .deliver = afs_deliver_cb_init_call_back_state3, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, .work = SRXAFSCB_InitCallBackState, }; @@ -77,7 +74,6 @@ static CM_NAME(Probe); static const struct afs_call_type afs_SRXCBProbe = { .name = afs_SRXCBProbe_name, .deliver = afs_deliver_cb_probe, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, .work = SRXAFSCB_Probe, }; @@ -89,7 +85,6 @@ static CM_NAME(ProbeUuid); static const struct afs_call_type afs_SRXCBProbeUuid = { .name = afs_SRXCBProbeUuid_name, .deliver = afs_deliver_cb_probe_uuid, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, .work = SRXAFSCB_ProbeUuid, }; @@ -101,7 +96,6 @@ static CM_NAME(TellMeAboutYourself); static const struct afs_call_type afs_SRXCBTellMeAboutYourself = { .name = afs_SRXCBTellMeAboutYourself_name, .deliver = afs_deliver_cb_tell_me_about_yourself, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, .work = SRXAFSCB_TellMeAboutYourself, }; @@ -127,6 +121,9 @@ bool afs_cm_incoming_call(struct afs_call *call) case CBProbe: call->type = &afs_SRXCBProbe; return true; + case CBProbeUuid: + call->type = &afs_SRXCBProbeUuid; + return true; case CBTellMeAboutYourself: call->type = &afs_SRXCBTellMeAboutYourself; return true; @@ -147,18 +144,16 @@ static void afs_cm_destructor(struct afs_call *call) * afs_deliver_cb_callback(). */ if (call->unmarshall == 5) { - ASSERT(call->server && call->count && call->request); - afs_break_callbacks(call->server, call->count, call->request); + ASSERT(call->cm_server && call->count && call->request); + afs_break_callbacks(call->cm_server, call->count, call->request); } - afs_put_server(call->server); - call->server = NULL; kfree(call->buffer); call->buffer = NULL; } /* - * allow the fileserver to see if the cache manager is still alive + * The server supplied a list of callbacks that it wanted to break. */ static void SRXAFSCB_CallBack(struct work_struct *work) { @@ -173,7 +168,7 @@ static void SRXAFSCB_CallBack(struct work_struct *work) * yet */ afs_send_empty_reply(call); - afs_break_callbacks(call->server, call->count, call->request); + afs_break_callbacks(call->cm_server, call->count, call->request); afs_put_call(call); _leave(""); } @@ -193,7 +188,6 @@ static int afs_deliver_cb_callback(struct afs_call *call) switch (call->unmarshall) { case 0: - rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); call->offset = 0; call->unmarshall++; @@ -286,14 +280,16 @@ static int afs_deliver_cb_callback(struct afs_call *call) break; } - call->state = AFS_CALL_REPLYING; + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return -EIO; /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - server = afs_find_server(&srx); + rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); + server = afs_find_server(call->net, &srx); if (!server) return -ENOTCONN; - call->server = server; + call->cm_server = server; return afs_queue_call_work(call); } @@ -305,9 +301,9 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) { struct afs_call *call = container_of(work, struct afs_call, work); - _enter("{%p}", call->server); + _enter("{%p}", call->cm_server); - afs_init_callback_state(call->server); + afs_init_callback_state(call->cm_server); afs_send_empty_reply(call); afs_put_call(call); _leave(""); @@ -324,21 +320,18 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call) _enter(""); - rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); + rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); ret = afs_extract_data(call, NULL, 0, false); if (ret < 0) return ret; - /* no unmarshalling required */ - call->state = AFS_CALL_REPLYING; - /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - server = afs_find_server(&srx); + server = afs_find_server(call->net, &srx); if (!server) return -ENOTCONN; - call->server = server; + call->cm_server = server; return afs_queue_call_work(call); } @@ -357,8 +350,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) _enter(""); - rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); - _enter("{%u}", call->unmarshall); switch (call->unmarshall) { @@ -402,15 +393,16 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) break; } - /* no unmarshalling required */ - call->state = AFS_CALL_REPLYING; + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return -EIO; /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - server = afs_find_server(&srx); + rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); + server = afs_find_server(call->net, &srx); if (!server) return -ENOTCONN; - call->server = server; + call->cm_server = server; return afs_queue_call_work(call); } @@ -441,8 +433,8 @@ static int afs_deliver_cb_probe(struct afs_call *call) if (ret < 0) return ret; - /* no unmarshalling required */ - call->state = AFS_CALL_REPLYING; + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return -EIO; return afs_queue_call_work(call); } @@ -461,7 +453,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) _enter(""); - if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0) + if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0) reply.match = htonl(0); else reply.match = htonl(1); @@ -524,7 +516,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) break; } - call->state = AFS_CALL_REPLYING; + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return -EIO; return afs_queue_call_work(call); } @@ -568,13 +561,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) memset(&reply, 0, sizeof(reply)); reply.ia.nifs = htonl(nifs); - reply.ia.uuid[0] = afs_uuid.time_low; - reply.ia.uuid[1] = htonl(ntohs(afs_uuid.time_mid)); - reply.ia.uuid[2] = htonl(ntohs(afs_uuid.time_hi_and_version)); - reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved); - reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low); + reply.ia.uuid[0] = call->net->uuid.time_low; + reply.ia.uuid[1] = htonl(ntohs(call->net->uuid.time_mid)); + reply.ia.uuid[2] = htonl(ntohs(call->net->uuid.time_hi_and_version)); + reply.ia.uuid[3] = htonl((s8) call->net->uuid.clock_seq_hi_and_reserved); + reply.ia.uuid[4] = htonl((s8) call->net->uuid.clock_seq_low); for (loop = 0; loop < 6; loop++) - reply.ia.uuid[loop + 5] = htonl((s8) afs_uuid.node[loop]); + reply.ia.uuid[loop + 5] = htonl((s8) call->net->uuid.node[loop]); if (ifs) { for (loop = 0; loop < nifs; loop++) { @@ -605,8 +598,8 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call) if (ret < 0) return ret; - /* no unmarshalling required */ - call->state = AFS_CALL_REPLYING; + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return -EIO; return afs_queue_call_work(call); } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 613a77058263..23c7f395d718 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -130,10 +130,11 @@ struct afs_lookup_cookie { /* * check that a directory page is valid */ -static inline bool afs_dir_check_page(struct inode *dir, struct page *page) +bool afs_dir_check_page(struct inode *dir, struct page *page) { struct afs_dir_page *dbuf; - loff_t latter; + struct afs_vnode *vnode = AFS_FS_I(dir); + loff_t latter, i_size, off; int tmp, qty; #if 0 @@ -150,8 +151,15 @@ static inline bool afs_dir_check_page(struct inode *dir, struct page *page) } #endif - /* determine how many magic numbers there should be in this page */ - latter = dir->i_size - page_offset(page); + /* Determine how many magic numbers there should be in this page, but + * we must take care because the directory may change size under us. + */ + off = page_offset(page); + i_size = i_size_read(dir); + if (i_size <= off) + goto checked; + + latter = i_size - off; if (latter >= PAGE_SIZE) qty = PAGE_SIZE; else @@ -162,13 +170,15 @@ static inline bool afs_dir_check_page(struct inode *dir, struct page *page) dbuf = page_address(page); for (tmp = 0; tmp < qty; tmp++) { if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) { - printk("kAFS: %s(%lu): bad magic %d/%d is %04hx\n", + printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n", __func__, dir->i_ino, tmp, qty, ntohs(dbuf->blocks[tmp].pagehdr.magic)); + trace_afs_dir_check_failed(vnode, off, i_size); goto error; } } +checked: SetPageChecked(page); return true; @@ -183,6 +193,7 @@ error: static inline void afs_dir_put_page(struct page *page) { kunmap(page); + unlock_page(page); put_page(page); } @@ -197,9 +208,10 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index, page = read_cache_page(dir->i_mapping, index, afs_page_filler, key); if (!IS_ERR(page)) { + lock_page(page); kmap(page); if (unlikely(!PageChecked(page))) { - if (PageError(page) || !afs_dir_check_page(dir, page)) + if (PageError(page)) goto fail; } } @@ -384,8 +396,7 @@ out: */ static int afs_readdir(struct file *file, struct dir_context *ctx) { - return afs_dir_iterate(file_inode(file), - ctx, file->private_data); + return afs_dir_iterate(file_inode(file), ctx, afs_file_key(file)); } /* @@ -553,7 +564,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version; /* instantiate the dentry */ - inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL); + inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL, NULL); key_put(key); if (IS_ERR(inode)) { _leave(" = %ld", PTR_ERR(inode)); @@ -581,6 +592,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) struct afs_vnode *vnode, *dir; struct afs_fid uninitialized_var(fid); struct dentry *parent; + struct inode *inode; struct key *key; void *dir_version; int ret; @@ -588,30 +600,39 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - vnode = AFS_FS_I(d_inode(dentry)); - - if (d_really_is_positive(dentry)) + if (d_really_is_positive(dentry)) { + vnode = AFS_FS_I(d_inode(dentry)); _enter("{v={%x:%u} n=%pd fl=%lx},", vnode->fid.vid, vnode->fid.vnode, dentry, vnode->flags); - else + } else { _enter("{neg n=%pd}", dentry); + } key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell); if (IS_ERR(key)) key = NULL; + if (d_really_is_positive(dentry)) { + inode = d_inode(dentry); + if (inode) { + vnode = AFS_FS_I(inode); + afs_validate(vnode, key); + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + goto out_bad; + } + } + /* lock down the parent dentry so we can peer at it */ parent = dget_parent(dentry); dir = AFS_FS_I(d_inode(parent)); /* validate the parent directory */ - if (test_bit(AFS_VNODE_MODIFIED, &dir->flags)) - afs_validate(dir, key); + afs_validate(dir, key); if (test_bit(AFS_VNODE_DELETED, &dir->flags)) { _debug("%pd: parent dir deleted", dentry); - goto out_bad; + goto out_bad_parent; } dir_version = (void *) (unsigned long) dir->status.data_version; @@ -626,13 +647,16 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) case 0: /* the filename maps to something */ if (d_really_is_negative(dentry)) - goto out_bad; - if (is_bad_inode(d_inode(dentry))) { + goto out_bad_parent; + inode = d_inode(dentry); + if (is_bad_inode(inode)) { printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n", dentry); - goto out_bad; + goto out_bad_parent; } + vnode = AFS_FS_I(inode); + /* if the vnode ID has changed, then the dirent points to a * different file */ if (fid.vnode != vnode->fid.vnode) { @@ -649,10 +673,10 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) _debug("%pd: file deleted (uq %u -> %u I:%u)", dentry, fid.unique, vnode->fid.unique, - d_inode(dentry)->i_generation); - spin_lock(&vnode->lock); + vnode->vfs_inode.i_generation); + write_seqlock(&vnode->cb_lock); set_bit(AFS_VNODE_DELETED, &vnode->flags); - spin_unlock(&vnode->lock); + write_sequnlock(&vnode->cb_lock); goto not_found; } goto out_valid; @@ -667,7 +691,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) default: _debug("failed to iterate dir %pd: %d", parent, ret); - goto out_bad; + goto out_bad_parent; } out_valid: @@ -683,9 +707,10 @@ not_found: dentry->d_flags |= DCACHE_NFSFS_RENAMED; spin_unlock(&dentry->d_lock); -out_bad: +out_bad_parent: _debug("dropping dentry %pd2", dentry); dput(parent); +out_bad: key_put(key); _leave(" = 0 [bad]"); @@ -727,20 +752,48 @@ static void afs_d_release(struct dentry *dentry) } /* + * Create a new inode for create/mkdir/symlink + */ +static void afs_vnode_new_inode(struct afs_fs_cursor *fc, + struct dentry *new_dentry, + struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_callback *newcb) +{ + struct inode *inode; + + if (fc->ac.error < 0) + return; + + d_drop(new_dentry); + + inode = afs_iget(fc->vnode->vfs_inode.i_sb, fc->key, + newfid, newstatus, newcb, fc->cbi); + if (IS_ERR(inode)) { + /* ENOMEM or EINTR at a really inconvenient time - just abandon + * the new directory on the server. + */ + fc->ac.error = PTR_ERR(inode); + return; + } + + d_add(new_dentry, inode); +} + +/* * create a directory on an AFS filesystem */ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - struct afs_file_status status; - struct afs_callback cb; - struct afs_server *server; - struct afs_vnode *dvnode, *vnode; - struct afs_fid fid; - struct inode *inode; + struct afs_file_status newstatus; + struct afs_fs_cursor fc; + struct afs_callback newcb; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_fid newfid; struct key *key; int ret; - dvnode = AFS_FS_I(dir); + mode |= S_IFDIR; _enter("{%x:%u},{%pd},%ho", dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); @@ -751,40 +804,29 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto error; } - mode |= S_IFDIR; - ret = afs_vnode_create(dvnode, key, dentry->d_name.name, - mode, &fid, &status, &cb, &server); - if (ret < 0) - goto mkdir_error; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + afs_fs_create(&fc, dentry->d_name.name, mode, + &newfid, &newstatus, &newcb); + } - inode = afs_iget(dir->i_sb, key, &fid, &status, &cb); - if (IS_ERR(inode)) { - /* ENOMEM at a really inconvenient time - just abandon the new - * directory on the server */ - ret = PTR_ERR(inode); - goto iget_error; + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, dvnode, fc.cb_break); + afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, &newcb); + ret = afs_end_vnode_operation(&fc); + if (ret < 0) + goto error_key; + } else { + goto error_key; } - /* apply the status report we've got for the new vnode */ - vnode = AFS_FS_I(inode); - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - - d_instantiate(dentry, inode); - if (d_unhashed(dentry)) { - _debug("not hashed"); - d_rehash(dentry); - } key_put(key); _leave(" = 0"); return 0; -iget_error: - afs_put_server(server); -mkdir_error: +error_key: key_put(key); error: d_drop(dentry); @@ -793,16 +835,29 @@ error: } /* + * Remove a subdir from a directory. + */ +static void afs_dir_remove_subdir(struct dentry *dentry) +{ + if (d_really_is_positive(dentry)) { + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + + clear_nlink(&vnode->vfs_inode); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + } +} + +/* * remove a directory from an AFS filesystem */ static int afs_rmdir(struct inode *dir, struct dentry *dentry) { - struct afs_vnode *dvnode, *vnode; + struct afs_fs_cursor fc; + struct afs_vnode *dvnode = AFS_FS_I(dir); struct key *key; int ret; - dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%pd}", dvnode->fid.vid, dvnode->fid.vnode, dentry); @@ -812,45 +867,88 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) goto error; } - ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, true); - if (ret < 0) - goto rmdir_error; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + afs_fs_remove(&fc, dentry->d_name.name, true); + } - if (d_really_is_positive(dentry)) { - vnode = AFS_FS_I(d_inode(dentry)); - clear_nlink(&vnode->vfs_inode); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - afs_discard_callback_on_delete(vnode); + afs_vnode_commit_status(&fc, dvnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + if (ret == 0) + afs_dir_remove_subdir(dentry); } key_put(key); - _leave(" = 0"); - return 0; - -rmdir_error: - key_put(key); error: - _leave(" = %d", ret); return ret; } /* - * remove a file from an AFS filesystem + * Remove a link to a file or symlink from a directory. + * + * If the file was not deleted due to excess hard links, the fileserver will + * break the callback promise on the file - if it had one - before it returns + * to us, and if it was deleted, it won't + * + * However, if we didn't have a callback promise outstanding, or it was + * outstanding on a different server, then it won't break it either... + */ +static int afs_dir_remove_link(struct dentry *dentry, struct key *key, + unsigned long d_version_before, + unsigned long d_version_after) +{ + bool dir_valid; + int ret = 0; + + /* There were no intervening changes on the server if the version + * number we got back was incremented by exactly 1. + */ + dir_valid = (d_version_after == d_version_before + 1); + + if (d_really_is_positive(dentry)) { + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + + if (dir_valid) { + drop_nlink(&vnode->vfs_inode); + if (vnode->vfs_inode.i_nlink == 0) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + } + ret = 0; + } else { + clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + kdebug("AFS_VNODE_DELETED"); + + ret = afs_validate(vnode, key); + if (ret == -ESTALE) + ret = 0; + } + _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret); + } + + return ret; +} + +/* + * Remove a file or symlink from an AFS filesystem. */ static int afs_unlink(struct inode *dir, struct dentry *dentry) { - struct afs_vnode *dvnode, *vnode; + struct afs_fs_cursor fc; + struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct key *key; + unsigned long d_version = (unsigned long)dentry->d_fsdata; int ret; - dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%pd}", dvnode->fid.vid, dvnode->fid.vnode, dentry); - ret = -ENAMETOOLONG; if (dentry->d_name.len >= AFSNAMEMAX) - goto error; + return -ENAMETOOLONG; key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { @@ -858,44 +956,30 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) goto error; } + /* Try to make sure we have a callback promise on the victim. */ if (d_really_is_positive(dentry)) { vnode = AFS_FS_I(d_inode(dentry)); - - /* make sure we have a callback promise on the victim */ ret = afs_validate(vnode, key); if (ret < 0) - goto error; + goto error_key; } - ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, false); - if (ret < 0) - goto remove_error; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + afs_fs_remove(&fc, dentry->d_name.name, false); + } - if (d_really_is_positive(dentry)) { - /* if the file wasn't deleted due to excess hard links, the - * fileserver will break the callback promise on the file - if - * it had one - before it returns to us, and if it was deleted, - * it won't - * - * however, if we didn't have a callback promise outstanding, - * or it was outstanding on a different server, then it won't - * break it either... - */ - vnode = AFS_FS_I(d_inode(dentry)); - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - _debug("AFS_VNODE_DELETED"); - if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) - _debug("AFS_VNODE_CB_BROKEN"); - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - ret = afs_validate(vnode, key); - _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret); + afs_vnode_commit_status(&fc, dvnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + if (ret == 0) + ret = afs_dir_remove_link( + dentry, key, d_version, + (unsigned long)dvnode->status.data_version); } - key_put(key); - _leave(" = 0"); - return 0; - -remove_error: +error_key: key_put(key); error: _leave(" = %d", ret); @@ -908,60 +992,52 @@ error: static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct afs_file_status status; - struct afs_callback cb; - struct afs_server *server; - struct afs_vnode *dvnode, *vnode; - struct afs_fid fid; - struct inode *inode; + struct afs_fs_cursor fc; + struct afs_file_status newstatus; + struct afs_callback newcb; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_fid newfid; struct key *key; int ret; - dvnode = AFS_FS_I(dir); + mode |= S_IFREG; _enter("{%x:%u},{%pd},%ho,", dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); goto error; } - mode |= S_IFREG; - ret = afs_vnode_create(dvnode, key, dentry->d_name.name, - mode, &fid, &status, &cb, &server); - if (ret < 0) - goto create_error; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + afs_fs_create(&fc, dentry->d_name.name, mode, + &newfid, &newstatus, &newcb); + } - inode = afs_iget(dir->i_sb, key, &fid, &status, &cb); - if (IS_ERR(inode)) { - /* ENOMEM at a really inconvenient time - just abandon the new - * directory on the server */ - ret = PTR_ERR(inode); - goto iget_error; + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, dvnode, fc.cb_break); + afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, &newcb); + ret = afs_end_vnode_operation(&fc); + if (ret < 0) + goto error_key; + } else { + goto error_key; } - /* apply the status report we've got for the new vnode */ - vnode = AFS_FS_I(inode); - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - - d_instantiate(dentry, inode); - if (d_unhashed(dentry)) { - _debug("not hashed"); - d_rehash(dentry); - } key_put(key); _leave(" = 0"); return 0; -iget_error: - afs_put_server(server); -create_error: +error_key: key_put(key); error: d_drop(dentry); @@ -975,6 +1051,7 @@ error: static int afs_link(struct dentry *from, struct inode *dir, struct dentry *dentry) { + struct afs_fs_cursor fc; struct afs_vnode *dvnode, *vnode; struct key *key; int ret; @@ -987,23 +1064,47 @@ static int afs_link(struct dentry *from, struct inode *dir, dvnode->fid.vid, dvnode->fid.vnode, dentry); + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); goto error; } - ret = afs_vnode_link(dvnode, vnode, key, dentry->d_name.name); - if (ret < 0) - goto link_error; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + if (mutex_lock_interruptible_nested(&vnode->io_lock, 1) < 0) { + afs_end_vnode_operation(&fc); + goto error_key; + } + + while (afs_select_fileserver(&fc)) { + fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + fc.cb_break_2 = vnode->cb_break + vnode->cb_s_break; + afs_fs_link(&fc, vnode, dentry->d_name.name); + } + + afs_vnode_commit_status(&fc, dvnode, fc.cb_break); + afs_vnode_commit_status(&fc, vnode, fc.cb_break_2); + ihold(&vnode->vfs_inode); + d_instantiate(dentry, &vnode->vfs_inode); + + mutex_unlock(&vnode->io_lock); + ret = afs_end_vnode_operation(&fc); + if (ret < 0) + goto error_key; + } else { + goto error_key; + } - ihold(&vnode->vfs_inode); - d_instantiate(dentry, &vnode->vfs_inode); key_put(key); _leave(" = 0"); return 0; -link_error: +error_key: key_put(key); error: d_drop(dentry); @@ -1017,20 +1118,21 @@ error: static int afs_symlink(struct inode *dir, struct dentry *dentry, const char *content) { - struct afs_file_status status; - struct afs_server *server; - struct afs_vnode *dvnode, *vnode; - struct afs_fid fid; - struct inode *inode; + struct afs_fs_cursor fc; + struct afs_file_status newstatus; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_fid newfid; struct key *key; int ret; - dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%pd},%s", dvnode->fid.vid, dvnode->fid.vnode, dentry, content); + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + ret = -EINVAL; if (strlen(content) >= AFSPATHMAX) goto error; @@ -1041,39 +1143,29 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, goto error; } - ret = afs_vnode_symlink(dvnode, key, dentry->d_name.name, content, - &fid, &status, &server); - if (ret < 0) - goto create_error; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + afs_fs_symlink(&fc, dentry->d_name.name, content, + &newfid, &newstatus); + } - inode = afs_iget(dir->i_sb, key, &fid, &status, NULL); - if (IS_ERR(inode)) { - /* ENOMEM at a really inconvenient time - just abandon the new - * directory on the server */ - ret = PTR_ERR(inode); - goto iget_error; + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, dvnode, fc.cb_break); + afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, NULL); + ret = afs_end_vnode_operation(&fc); + if (ret < 0) + goto error_key; + } else { + goto error_key; } - /* apply the status report we've got for the new vnode */ - vnode = AFS_FS_I(inode); - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - - d_instantiate(dentry, inode); - if (d_unhashed(dentry)) { - _debug("not hashed"); - d_rehash(dentry); - } key_put(key); _leave(" = 0"); return 0; -iget_error: - afs_put_server(server); -create_error: +error_key: key_put(key); error: d_drop(dentry); @@ -1088,6 +1180,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + struct afs_fs_cursor fc; struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; struct key *key; int ret; @@ -1111,19 +1204,33 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, goto error; } - ret = afs_vnode_rename(orig_dvnode, new_dvnode, key, - old_dentry->d_name.name, - new_dentry->d_name.name); - if (ret < 0) - goto rename_error; - key_put(key); - _leave(" = 0"); - return 0; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, orig_dvnode, key)) { + if (orig_dvnode != new_dvnode) { + if (mutex_lock_interruptible_nested(&new_dvnode->io_lock, 1) < 0) { + afs_end_vnode_operation(&fc); + goto error_key; + } + } + while (afs_select_fileserver(&fc)) { + fc.cb_break = orig_dvnode->cb_break + orig_dvnode->cb_s_break; + fc.cb_break_2 = new_dvnode->cb_break + new_dvnode->cb_s_break; + afs_fs_rename(&fc, old_dentry->d_name.name, + new_dvnode, new_dentry->d_name.name); + } -rename_error: + afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break); + afs_vnode_commit_status(&fc, new_dvnode, fc.cb_break_2); + if (orig_dvnode != new_dvnode) + mutex_unlock(&new_dvnode->io_lock); + ret = afs_end_vnode_operation(&fc); + if (ret < 0) + goto error_key; + } + +error_key: key_put(key); error: - d_drop(new_dentry); _leave(" = %d", ret); return ret; } diff --git a/fs/afs/file.c b/fs/afs/file.c index 510cba15fa56..a39192ced99e 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -19,11 +19,11 @@ #include <linux/task_io_accounting_ops.h> #include "internal.h" +static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); static int afs_readpage(struct file *file, struct page *page); static void afs_invalidatepage(struct page *page, unsigned int offset, unsigned int length); static int afs_releasepage(struct page *page, gfp_t gfp_flags); -static int afs_launder_page(struct page *page); static int afs_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); @@ -35,7 +35,7 @@ const struct file_operations afs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = afs_file_write, - .mmap = generic_file_readonly_mmap, + .mmap = afs_file_mmap, .splice_read = generic_file_splice_read, .fsync = afs_fsync, .lock = afs_lock, @@ -62,12 +62,63 @@ const struct address_space_operations afs_fs_aops = { .writepages = afs_writepages, }; +static const struct vm_operations_struct afs_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = afs_page_mkwrite, +}; + +/* + * Discard a pin on a writeback key. + */ +void afs_put_wb_key(struct afs_wb_key *wbk) +{ + if (refcount_dec_and_test(&wbk->usage)) { + key_put(wbk->key); + kfree(wbk); + } +} + +/* + * Cache key for writeback. + */ +int afs_cache_wb_key(struct afs_vnode *vnode, struct afs_file *af) +{ + struct afs_wb_key *wbk, *p; + + wbk = kzalloc(sizeof(struct afs_wb_key), GFP_KERNEL); + if (!wbk) + return -ENOMEM; + refcount_set(&wbk->usage, 2); + wbk->key = af->key; + + spin_lock(&vnode->wb_lock); + list_for_each_entry(p, &vnode->wb_keys, vnode_link) { + if (p->key == wbk->key) + goto found; + } + + key_get(wbk->key); + list_add_tail(&wbk->vnode_link, &vnode->wb_keys); + spin_unlock(&vnode->wb_lock); + af->wb = wbk; + return 0; + +found: + refcount_inc(&p->usage); + spin_unlock(&vnode->wb_lock); + af->wb = p; + kfree(wbk); + return 0; +} + /* * open an AFS file or directory and attach a key to it */ int afs_open(struct inode *inode, struct file *file) { struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_file *af; struct key *key; int ret; @@ -75,19 +126,38 @@ int afs_open(struct inode *inode, struct file *file) key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) { - _leave(" = %ld [key]", PTR_ERR(key)); - return PTR_ERR(key); + ret = PTR_ERR(key); + goto error; } - ret = afs_validate(vnode, key); - if (ret < 0) { - _leave(" = %d [val]", ret); - return ret; + af = kzalloc(sizeof(*af), GFP_KERNEL); + if (!af) { + ret = -ENOMEM; + goto error_key; } + af->key = key; + + ret = afs_validate(vnode, key); + if (ret < 0) + goto error_af; - file->private_data = key; + if (file->f_mode & FMODE_WRITE) { + ret = afs_cache_wb_key(vnode, af); + if (ret < 0) + goto error_af; + } + + file->private_data = af; _leave(" = 0"); return 0; + +error_af: + kfree(af); +error_key: + key_put(key); +error: + _leave(" = %d", ret); + return ret; } /* @@ -96,10 +166,16 @@ int afs_open(struct inode *inode, struct file *file) int afs_release(struct inode *inode, struct file *file) { struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_file *af = file->private_data; _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); - key_put(file->private_data); + file->private_data = NULL; + if (af->wb) + afs_put_wb_key(af->wb); + key_put(af->key); + kfree(af); + afs_prune_wb_keys(vnode); _leave(" = 0"); return 0; } @@ -138,6 +214,37 @@ static void afs_file_readpage_read_complete(struct page *page, #endif /* + * Fetch file data from the volume. + */ +int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *desc) +{ + struct afs_fs_cursor fc; + int ret; + + _enter("%s{%x:%u.%u},%x,,,", + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_fetch_data(&fc, desc); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + } + + _leave(" = %d", ret); + return ret; +} + +/* * read page from file, directory or symlink, given a key to use */ int afs_page_filler(void *data, struct page *page) @@ -199,8 +306,13 @@ int afs_page_filler(void *data, struct page *page) /* read the contents of the file from the server into the * page */ - ret = afs_vnode_fetch_data(vnode, key, req); + ret = afs_fetch_data(vnode, key, req); afs_put_read(req); + + if (ret >= 0 && S_ISDIR(inode->i_mode) && + !afs_dir_check_page(inode, page)) + ret = -EIO; + if (ret < 0) { if (ret == -ENOENT) { _debug("got NOENT from server" @@ -259,12 +371,12 @@ static int afs_readpage(struct file *file, struct page *page) int ret; if (file) { - key = file->private_data; + key = afs_file_key(file); ASSERT(key != NULL); ret = afs_page_filler(key, page); } else { struct inode *inode = page->mapping->host; - key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); + key = afs_request_key(AFS_FS_S(inode->i_sb)->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); } else { @@ -281,7 +393,7 @@ static int afs_readpage(struct file *file, struct page *page) static void afs_readpages_page_done(struct afs_call *call, struct afs_read *req) { #ifdef CONFIG_AFS_FSCACHE - struct afs_vnode *vnode = call->reply; + struct afs_vnode *vnode = call->reply[0]; #endif struct page *page = req->pages[req->index]; @@ -310,7 +422,7 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping, struct afs_read *req; struct list_head *p; struct page *first, *page; - struct key *key = file->private_data; + struct key *key = afs_file_key(file); pgoff_t index; int ret, n, i; @@ -369,7 +481,7 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping, return 0; } - ret = afs_vnode_fetch_data(vnode, key, req); + ret = afs_fetch_data(vnode, key, req); if (ret < 0) goto error; @@ -406,7 +518,7 @@ error: static int afs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct key *key = file->private_data; + struct key *key = afs_file_key(file); struct afs_vnode *vnode; int ret = 0; @@ -464,16 +576,6 @@ static int afs_readpages(struct file *file, struct address_space *mapping, } /* - * write back a dirty page - */ -static int afs_launder_page(struct page *page) -{ - _enter("{%lu}", page->index); - - return 0; -} - -/* * invalidate part or all of a page * - release a page and clean up its private data if offset is 0 (indicating * the entire page) @@ -481,7 +583,8 @@ static int afs_launder_page(struct page *page) static void afs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - struct afs_writeback *wb = (struct afs_writeback *) page_private(page); + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + unsigned long priv; _enter("{%lu},%u,%u", page->index, offset, length); @@ -498,13 +601,11 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, #endif if (PagePrivate(page)) { - if (wb && !PageWriteback(page)) { - set_page_private(page, 0); - afs_put_writeback(wb); - } - - if (!page_private(page)) - ClearPagePrivate(page); + priv = page_private(page); + trace_afs_page_dirty(vnode, tracepoint_string("inval"), + page->index, priv); + set_page_private(page, 0); + ClearPagePrivate(page); } } @@ -517,8 +618,8 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, */ static int afs_releasepage(struct page *page, gfp_t gfp_flags) { - struct afs_writeback *wb = (struct afs_writeback *) page_private(page); struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + unsigned long priv; _enter("{{%x:%u}[%lu],%lx},%x", vnode->fid.vid, vnode->fid.vnode, page->index, page->flags, @@ -534,10 +635,10 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) #endif if (PagePrivate(page)) { - if (wb) { - set_page_private(page, 0); - afs_put_writeback(wb); - } + priv = page_private(page); + trace_afs_page_dirty(vnode, tracepoint_string("rel"), + page->index, priv); + set_page_private(page, 0); ClearPagePrivate(page); } @@ -545,3 +646,16 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) _leave(" = T"); return 1; } + +/* + * Handle setting up a memory mapping on an AFS file. + */ +static int afs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret; + + ret = generic_file_mmap(file, vma); + if (ret == 0) + vma->vm_ops = &afs_vm_ops; + return ret; +} diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 3191dff2c156..c40ba2fe3cbe 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -14,48 +14,17 @@ #define AFS_LOCK_GRANTED 0 #define AFS_LOCK_PENDING 1 +struct workqueue_struct *afs_lock_manager; + static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl); static void afs_fl_release_private(struct file_lock *fl); -static struct workqueue_struct *afs_lock_manager; -static DEFINE_MUTEX(afs_lock_manager_mutex); - static const struct file_lock_operations afs_lock_ops = { .fl_copy_lock = afs_fl_copy_lock, .fl_release_private = afs_fl_release_private, }; /* - * initialise the lock manager thread if it isn't already running - */ -static int afs_init_lock_manager(void) -{ - int ret; - - ret = 0; - if (!afs_lock_manager) { - mutex_lock(&afs_lock_manager_mutex); - if (!afs_lock_manager) { - afs_lock_manager = alloc_workqueue("kafs_lockd", - WQ_MEM_RECLAIM, 0); - if (!afs_lock_manager) - ret = -ENOMEM; - } - mutex_unlock(&afs_lock_manager_mutex); - } - return ret; -} - -/* - * destroy the lock manager thread if it's running - */ -void __exit afs_kill_lock_manager(void) -{ - if (afs_lock_manager) - destroy_workqueue(afs_lock_manager); -} - -/* * if the callback is broken on this vnode, then the lock may now be available */ void afs_lock_may_be_available(struct afs_vnode *vnode) @@ -99,6 +68,100 @@ static void afs_grant_locks(struct afs_vnode *vnode, struct file_lock *fl) } /* + * Get a lock on a file + */ +static int afs_set_lock(struct afs_vnode *vnode, struct key *key, + afs_lock_type_t type) +{ + struct afs_fs_cursor fc; + int ret; + + _enter("%s{%x:%u.%u},%x,%u", + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), type); + + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_set_lock(&fc, type); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + } + + _leave(" = %d", ret); + return ret; +} + +/* + * Extend a lock on a file + */ +static int afs_extend_lock(struct afs_vnode *vnode, struct key *key) +{ + struct afs_fs_cursor fc; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, key)) { + while (afs_select_current_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_extend_lock(&fc); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + } + + _leave(" = %d", ret); + return ret; +} + +/* + * Release a lock on a file + */ +static int afs_release_lock(struct afs_vnode *vnode, struct key *key) +{ + struct afs_fs_cursor fc; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, key)) { + while (afs_select_current_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_release_lock(&fc); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + } + + _leave(" = %d", ret); + return ret; +} + +/* * do work for a lock, including: * - probing for a lock we're waiting on but didn't get immediately * - extending a lock that's close to timing out @@ -107,7 +170,7 @@ void afs_lock_work(struct work_struct *work) { struct afs_vnode *vnode = container_of(work, struct afs_vnode, lock_work.work); - struct file_lock *fl; + struct file_lock *fl, *next; afs_lock_type_t type; struct key *key; int ret; @@ -116,117 +179,136 @@ void afs_lock_work(struct work_struct *work) spin_lock(&vnode->lock); - if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) { +again: + _debug("wstate %u for %p", vnode->lock_state, vnode); + switch (vnode->lock_state) { + case AFS_VNODE_LOCK_NEED_UNLOCK: _debug("unlock"); + vnode->lock_state = AFS_VNODE_LOCK_UNLOCKING; spin_unlock(&vnode->lock); /* attempt to release the server lock; if it fails, we just - * wait 5 minutes and it'll time out anyway */ - ret = afs_vnode_release_lock(vnode, vnode->unlock_key); + * wait 5 minutes and it'll expire anyway */ + ret = afs_release_lock(vnode, vnode->lock_key); if (ret < 0) printk(KERN_WARNING "AFS:" " Failed to release lock on {%x:%x} error %d\n", vnode->fid.vid, vnode->fid.vnode, ret); spin_lock(&vnode->lock); - key_put(vnode->unlock_key); - vnode->unlock_key = NULL; - clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags); - } + key_put(vnode->lock_key); + vnode->lock_key = NULL; + vnode->lock_state = AFS_VNODE_LOCK_NONE; + + if (list_empty(&vnode->pending_locks)) { + spin_unlock(&vnode->lock); + return; + } - /* if we've got a lock, then it must be time to extend that lock as AFS - * locks time out after 5 minutes */ - if (!list_empty(&vnode->granted_locks)) { + /* The new front of the queue now owns the state variables. */ + next = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; + goto again; + + /* If we've already got a lock, then it must be time to extend that + * lock as AFS locks time out after 5 minutes. + */ + case AFS_VNODE_LOCK_GRANTED: _debug("extend"); - if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) - BUG(); - fl = list_entry(vnode->granted_locks.next, - struct file_lock, fl_u.afs.link); - key = key_get(fl->fl_file->private_data); + ASSERT(!list_empty(&vnode->granted_locks)); + + key = key_get(vnode->lock_key); + vnode->lock_state = AFS_VNODE_LOCK_EXTENDING; spin_unlock(&vnode->lock); - ret = afs_vnode_extend_lock(vnode, key); - clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + ret = afs_extend_lock(vnode, key); /* RPC */ key_put(key); - switch (ret) { - case 0: + + if (ret < 0) + pr_warning("AFS: Failed to extend lock on {%x:%x} error %d\n", + vnode->fid.vid, vnode->fid.vnode, ret); + + spin_lock(&vnode->lock); + + if (vnode->lock_state != AFS_VNODE_LOCK_EXTENDING) + goto again; + vnode->lock_state = AFS_VNODE_LOCK_GRANTED; + + if (ret == 0) afs_schedule_lock_extension(vnode); - break; - default: - /* ummm... we failed to extend the lock - retry - * extension shortly */ - printk(KERN_WARNING "AFS:" - " Failed to extend lock on {%x:%x} error %d\n", - vnode->fid.vid, vnode->fid.vnode, ret); + else queue_delayed_work(afs_lock_manager, &vnode->lock_work, HZ * 10); - break; - } - _leave(" [extend]"); + spin_unlock(&vnode->lock); + _leave(" [ext]"); return; - } - /* if we don't have a granted lock, then we must've been called back by - * the server, and so if might be possible to get a lock we're - * currently waiting for */ - if (!list_empty(&vnode->pending_locks)) { + /* If we don't have a granted lock, then we must've been called + * back by the server, and so if might be possible to get a + * lock we're currently waiting for. + */ + case AFS_VNODE_LOCK_WAITING_FOR_CB: _debug("get"); - if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) - BUG(); - fl = list_entry(vnode->pending_locks.next, - struct file_lock, fl_u.afs.link); - key = key_get(fl->fl_file->private_data); - type = (fl->fl_type == F_RDLCK) ? - AFS_LOCK_READ : AFS_LOCK_WRITE; + key = key_get(vnode->lock_key); + type = vnode->lock_type; + vnode->lock_state = AFS_VNODE_LOCK_SETTING; spin_unlock(&vnode->lock); - ret = afs_vnode_set_lock(vnode, key, type); - clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + ret = afs_set_lock(vnode, key, type); /* RPC */ + key_put(key); + + spin_lock(&vnode->lock); switch (ret) { case -EWOULDBLOCK: _debug("blocked"); break; case 0: _debug("acquired"); - if (type == AFS_LOCK_READ) - set_bit(AFS_VNODE_READLOCKED, &vnode->flags); - else - set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); - ret = AFS_LOCK_GRANTED; + vnode->lock_state = AFS_VNODE_LOCK_GRANTED; + /* Fall through */ default: - spin_lock(&vnode->lock); - /* the pending lock may have been withdrawn due to a - * signal */ - if (list_entry(vnode->pending_locks.next, - struct file_lock, fl_u.afs.link) == fl) { - fl->fl_u.afs.state = ret; - if (ret == AFS_LOCK_GRANTED) - afs_grant_locks(vnode, fl); - else - list_del_init(&fl->fl_u.afs.link); - wake_up(&fl->fl_wait); - spin_unlock(&vnode->lock); - } else { + /* Pass the lock or the error onto the first locker in + * the list - if they're looking for this type of lock. + * If they're not, we assume that whoever asked for it + * took a signal. + */ + if (list_empty(&vnode->pending_locks)) { _debug("withdrawn"); - clear_bit(AFS_VNODE_READLOCKED, &vnode->flags); - clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); - spin_unlock(&vnode->lock); - afs_vnode_release_lock(vnode, key); - if (!list_empty(&vnode->pending_locks)) - afs_lock_may_be_available(vnode); + vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK; + goto again; } - break; + + fl = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + if (vnode->lock_type != type) { + _debug("changed"); + vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK; + goto again; + } + + fl->fl_u.afs.state = ret; + if (ret == 0) + afs_grant_locks(vnode, fl); + else + list_del_init(&fl->fl_u.afs.link); + wake_up(&fl->fl_wait); + spin_unlock(&vnode->lock); + _leave(" [granted]"); + return; } - key_put(key); - _leave(" [pend]"); + + default: + /* Looks like a lock request was withdrawn. */ + spin_unlock(&vnode->lock); + _leave(" [no]"); return; } - - /* looks like the lock request was withdrawn on a signal */ - spin_unlock(&vnode->lock); - _leave(" [no locks]"); } /* @@ -235,15 +317,105 @@ void afs_lock_work(struct work_struct *work) * AF_RXRPC * - the caller must hold the vnode lock */ -static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key) +static void afs_defer_unlock(struct afs_vnode *vnode) +{ + _enter(""); + + if (vnode->lock_state == AFS_VNODE_LOCK_GRANTED || + vnode->lock_state == AFS_VNODE_LOCK_EXTENDING) { + cancel_delayed_work(&vnode->lock_work); + + vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK; + afs_lock_may_be_available(vnode); + } +} + +/* + * Check that our view of the file metadata is up to date and check to see + * whether we think that we have a locking permit. + */ +static int afs_do_setlk_check(struct afs_vnode *vnode, struct key *key, + afs_lock_type_t type, bool can_sleep) +{ + afs_access_t access; + int ret; + + /* Make sure we've got a callback on this file and that our view of the + * data version is up to date. + */ + ret = afs_validate(vnode, key); + if (ret < 0) + return ret; + + /* Check the permission set to see if we're actually going to be + * allowed to get a lock on this file. + */ + ret = afs_check_permit(vnode, key, &access); + if (ret < 0) + return ret; + + /* At a rough estimation, you need LOCK, WRITE or INSERT perm to + * read-lock a file and WRITE or INSERT perm to write-lock a file. + * + * We can't rely on the server to do this for us since if we want to + * share a read lock that we already have, we won't go the server. + */ + if (type == AFS_LOCK_READ) { + if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE | AFS_ACE_LOCK))) + return -EACCES; + if (vnode->status.lock_count == -1 && !can_sleep) + return -EAGAIN; /* Write locked */ + } else { + if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE))) + return -EACCES; + if (vnode->status.lock_count != 0 && !can_sleep) + return -EAGAIN; /* Locked */ + } + + return 0; +} + +/* + * Remove the front runner from the pending queue. + * - The caller must hold vnode->lock. + */ +static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl) { - cancel_delayed_work(&vnode->lock_work); - if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) && - !test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags)) - BUG(); - if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) - BUG(); - vnode->unlock_key = key_get(key); + struct file_lock *next; + + _enter(""); + + /* ->lock_type, ->lock_key and ->lock_state only belong to this + * file_lock if we're at the front of the pending queue or if we have + * the lock granted or if the lock_state is NEED_UNLOCK or UNLOCKING. + */ + if (vnode->granted_locks.next == &fl->fl_u.afs.link && + vnode->granted_locks.prev == &fl->fl_u.afs.link) { + list_del_init(&fl->fl_u.afs.link); + afs_defer_unlock(vnode); + return; + } + + if (!list_empty(&vnode->granted_locks) || + vnode->pending_locks.next != &fl->fl_u.afs.link) { + list_del_init(&fl->fl_u.afs.link); + return; + } + + list_del_init(&fl->fl_u.afs.link); + key_put(vnode->lock_key); + vnode->lock_key = NULL; + vnode->lock_state = AFS_VNODE_LOCK_NONE; + + if (list_empty(&vnode->pending_locks)) + return; + + /* The new front of the queue now owns the state variables. */ + next = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; afs_lock_may_be_available(vnode); } @@ -252,10 +424,10 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key) */ static int afs_do_setlk(struct file *file, struct file_lock *fl) { - struct inode *inode = file_inode(file); + struct inode *inode = locks_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); afs_lock_type_t type; - struct key *key = file->private_data; + struct key *key = afs_file_key(file); int ret; _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); @@ -264,175 +436,142 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl) if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) return -EINVAL; - ret = afs_init_lock_manager(); - if (ret < 0) - return ret; - fl->fl_ops = &afs_lock_ops; INIT_LIST_HEAD(&fl->fl_u.afs.link); fl->fl_u.afs.state = AFS_LOCK_PENDING; type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; - spin_lock(&inode->i_lock); - - /* make sure we've got a callback on this file and that our view of the - * data version is up to date */ - ret = afs_vnode_fetch_status(vnode, NULL, key); + ret = afs_do_setlk_check(vnode, key, type, fl->fl_flags & FL_SLEEP); if (ret < 0) - goto error; - - if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) { - ret = -EAGAIN; - goto error; - } + return ret; spin_lock(&vnode->lock); - /* if we've already got a readlock on the server then we can instantly + /* If we've already got a readlock on the server then we instantly * grant another readlock, irrespective of whether there are any - * pending writelocks */ + * pending writelocks. + */ if (type == AFS_LOCK_READ && - vnode->flags & (1 << AFS_VNODE_READLOCKED)) { + vnode->lock_state == AFS_VNODE_LOCK_GRANTED && + vnode->lock_type == AFS_LOCK_READ) { _debug("instant readlock"); - ASSERTCMP(vnode->flags & - ((1 << AFS_VNODE_LOCKING) | - (1 << AFS_VNODE_WRITELOCKED)), ==, 0); ASSERT(!list_empty(&vnode->granted_locks)); - goto sharing_existing_lock; + goto share_existing_lock; } - /* if there's no-one else with a lock on this vnode, then we need to - * ask the server for a lock */ - if (list_empty(&vnode->pending_locks) && - list_empty(&vnode->granted_locks)) { - _debug("not locked"); - ASSERTCMP(vnode->flags & - ((1 << AFS_VNODE_LOCKING) | - (1 << AFS_VNODE_READLOCKED) | - (1 << AFS_VNODE_WRITELOCKED)), ==, 0); - list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); - set_bit(AFS_VNODE_LOCKING, &vnode->flags); - spin_unlock(&vnode->lock); + list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); - ret = afs_vnode_set_lock(vnode, key, type); - clear_bit(AFS_VNODE_LOCKING, &vnode->flags); - switch (ret) { - case 0: - _debug("acquired"); - goto acquired_server_lock; - case -EWOULDBLOCK: - _debug("would block"); - spin_lock(&vnode->lock); - ASSERT(list_empty(&vnode->granted_locks)); - ASSERTCMP(vnode->pending_locks.next, ==, - &fl->fl_u.afs.link); - goto wait; - default: - spin_lock(&vnode->lock); - list_del_init(&fl->fl_u.afs.link); - spin_unlock(&vnode->lock); - goto error; - } - } + if (vnode->lock_state != AFS_VNODE_LOCK_NONE) + goto need_to_wait; - /* otherwise, we need to wait for a local lock to become available */ - _debug("wait local"); - list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); -wait: - if (!(fl->fl_flags & FL_SLEEP)) { - _debug("noblock"); - ret = -EAGAIN; - goto abort_attempt; - } + /* We don't have a lock on this vnode and we aren't currently waiting + * for one either, so ask the server for a lock. + * + * Note that we need to be careful if we get interrupted by a signal + * after dispatching the request as we may still get the lock, even + * though we don't wait for the reply (it's not too bad a problem - the + * lock will expire in 10 mins anyway). + */ + _debug("not locked"); + vnode->lock_key = key_get(key); + vnode->lock_type = type; + vnode->lock_state = AFS_VNODE_LOCK_SETTING; spin_unlock(&vnode->lock); - /* now we need to sleep and wait for the lock manager thread to get the - * lock from the server */ - _debug("sleep"); - ret = wait_event_interruptible(fl->fl_wait, - fl->fl_u.afs.state <= AFS_LOCK_GRANTED); - if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { - ret = fl->fl_u.afs.state; - if (ret < 0) - goto error; - spin_lock(&vnode->lock); - goto given_lock; - } - - /* we were interrupted, but someone may still be in the throes of - * giving us the lock */ - _debug("intr"); - ASSERTCMP(ret, ==, -ERESTARTSYS); + ret = afs_set_lock(vnode, key, type); /* RPC */ spin_lock(&vnode->lock); - if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { - ret = fl->fl_u.afs.state; - if (ret < 0) { - spin_unlock(&vnode->lock); - goto error; - } - goto given_lock; - } + switch (ret) { + default: + goto abort_attempt; -abort_attempt: - /* we aren't going to get the lock, either because we're unwilling to - * wait, or because some signal happened */ - _debug("abort"); - if (list_empty(&vnode->granted_locks) && - vnode->pending_locks.next == &fl->fl_u.afs.link) { - if (vnode->pending_locks.prev != &fl->fl_u.afs.link) { - /* kick the next pending lock into having a go */ - list_del_init(&fl->fl_u.afs.link); - afs_lock_may_be_available(vnode); - } - } else { - list_del_init(&fl->fl_u.afs.link); + case -EWOULDBLOCK: + /* The server doesn't have a lock-waiting queue, so the client + * will have to retry. The server will break the outstanding + * callbacks on a file when a lock is released. + */ + _debug("would block"); + ASSERT(list_empty(&vnode->granted_locks)); + ASSERTCMP(vnode->pending_locks.next, ==, &fl->fl_u.afs.link); + vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; + goto need_to_wait; + + case 0: + _debug("acquired"); + break; } - spin_unlock(&vnode->lock); - goto error; -acquired_server_lock: /* we've acquired a server lock, but it needs to be renewed after 5 * mins */ - spin_lock(&vnode->lock); + vnode->lock_state = AFS_VNODE_LOCK_GRANTED; afs_schedule_lock_extension(vnode); - if (type == AFS_LOCK_READ) - set_bit(AFS_VNODE_READLOCKED, &vnode->flags); - else - set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); -sharing_existing_lock: + +share_existing_lock: /* the lock has been granted as far as we're concerned... */ fl->fl_u.afs.state = AFS_LOCK_GRANTED; list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks); + given_lock: /* ... but we do still need to get the VFS's blessing */ - ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING))); - ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) | - (1 << AFS_VNODE_WRITELOCKED))) != 0); + spin_unlock(&vnode->lock); + ret = posix_lock_file(file, fl, NULL); if (ret < 0) goto vfs_rejected_lock; - spin_unlock(&vnode->lock); - /* again, make sure we've got a callback on this file and, again, make + /* Again, make sure we've got a callback on this file and, again, make * sure that our view of the data version is up to date (we ignore - * errors incurred here and deal with the consequences elsewhere) */ - afs_vnode_fetch_status(vnode, NULL, key); + * errors incurred here and deal with the consequences elsewhere). + */ + afs_validate(vnode, key); + _leave(" = 0"); + return 0; -error: - spin_unlock(&inode->i_lock); +need_to_wait: + /* We're going to have to wait. Either this client doesn't have a lock + * on the server yet and we need to wait for a callback to occur, or + * the client does have a lock on the server, but it belongs to some + * other process(es) and is incompatible with the lock we want. + */ + ret = -EAGAIN; + if (fl->fl_flags & FL_SLEEP) { + spin_unlock(&vnode->lock); + + _debug("sleep"); + ret = wait_event_interruptible(fl->fl_wait, + fl->fl_u.afs.state != AFS_LOCK_PENDING); + + spin_lock(&vnode->lock); + } + + if (fl->fl_u.afs.state == AFS_LOCK_GRANTED) + goto given_lock; + if (fl->fl_u.afs.state < 0) + ret = fl->fl_u.afs.state; + +abort_attempt: + /* we aren't going to get the lock, either because we're unwilling to + * wait, or because some signal happened */ + _debug("abort"); + afs_dequeue_lock(vnode, fl); + +error_unlock: + spin_unlock(&vnode->lock); _leave(" = %d", ret); return ret; vfs_rejected_lock: - /* the VFS rejected the lock we just obtained, so we have to discard - * what we just got */ + /* The VFS rejected the lock we just obtained, so we have to discard + * what we just got. We defer this to the lock manager work item to + * deal with. + */ _debug("vfs refused %d", ret); + spin_lock(&vnode->lock); list_del_init(&fl->fl_u.afs.link); if (list_empty(&vnode->granted_locks)) - afs_defer_unlock(vnode, key); - goto abort_attempt; + afs_defer_unlock(vnode); + goto error_unlock; } /* @@ -440,34 +579,21 @@ vfs_rejected_lock: */ static int afs_do_unlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); - struct key *key = file->private_data; + struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); int ret; _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + /* Flush all pending writes before doing anything with locks. */ + vfs_fsync(file, 0); + /* only whole-file unlocks are supported */ if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) return -EINVAL; - fl->fl_ops = &afs_lock_ops; - INIT_LIST_HEAD(&fl->fl_u.afs.link); - fl->fl_u.afs.state = AFS_LOCK_PENDING; - - spin_lock(&vnode->lock); ret = posix_lock_file(file, fl, NULL); - if (ret < 0) { - spin_unlock(&vnode->lock); - _leave(" = %d [vfs]", ret); - return ret; - } - - /* discard the server lock only if all granted locks are gone */ - if (list_empty(&vnode->granted_locks)) - afs_defer_unlock(vnode, key); - spin_unlock(&vnode->lock); - _leave(" = 0"); - return 0; + _leave(" = %d [%u]", ret, vnode->lock_state); + return ret; } /* @@ -475,37 +601,33 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl) */ static int afs_do_getlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); - struct key *key = file->private_data; + struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct key *key = afs_file_key(file); int ret, lock_count; _enter(""); fl->fl_type = F_UNLCK; - inode_lock(&vnode->vfs_inode); - /* check local lock records first */ - ret = 0; posix_test_lock(file, fl); if (fl->fl_type == F_UNLCK) { /* no local locks; consult the server */ - ret = afs_vnode_fetch_status(vnode, NULL, key); + ret = afs_fetch_status(vnode, key); if (ret < 0) goto error; - lock_count = vnode->status.lock_count; - if (lock_count) { - if (lock_count > 0) - fl->fl_type = F_RDLCK; - else - fl->fl_type = F_WRLCK; - fl->fl_start = 0; - fl->fl_end = OFFSET_MAX; - } + + lock_count = READ_ONCE(vnode->status.lock_count); + if (lock_count > 0) + fl->fl_type = F_RDLCK; + else + fl->fl_type = F_WRLCK; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; } + ret = 0; error: - inode_unlock(&vnode->vfs_inode); _leave(" = %d [%hd]", ret, fl->fl_type); return ret; } @@ -515,7 +637,7 @@ error: */ int afs_lock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); _enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}", vnode->fid.vid, vnode->fid.vnode, cmd, @@ -538,7 +660,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) */ int afs_flock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); _enter("{%x:%u},%d,{t=%x,fl=%x}", vnode->fid.vid, vnode->fid.vnode, cmd, @@ -568,9 +690,13 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) */ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) { + struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + _enter(""); + spin_lock(&vnode->lock); list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link); + spin_unlock(&vnode->lock); } /* @@ -579,7 +705,12 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) */ static void afs_fl_release_private(struct file_lock *fl) { + struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + _enter(""); - list_del_init(&fl->fl_u.afs.link); + spin_lock(&vnode->lock); + afs_dequeue_lock(vnode, fl); + _debug("state %u for %p", vnode->lock_state, vnode); + spin_unlock(&vnode->lock); } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 19f76ae36982..b90ef39ae914 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -16,12 +16,19 @@ #include "internal.h" #include "afs_fs.h" +static const struct afs_fid afs_zero_fid; + /* * We need somewhere to discard into in case the server helpfully returns more * than we asked for in FS.FetchData{,64}. */ static u8 afs_discard_buffer[64]; +static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi) +{ + call->cbi = afs_get_cb_interest(cbi); +} + /* * decode an AFSFid block */ @@ -47,14 +54,18 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, const __be32 *bp = *_bp; umode_t mode; u64 data_version, size; - u32 changed = 0; /* becomes non-zero if ctime-type changes seen */ + bool changed = false; kuid_t owner; kgid_t group; + if (vnode) + write_seqlock(&vnode->cb_lock); + #define EXTRACT(DST) \ do { \ u32 x = ntohl(*bp++); \ - changed |= DST - x; \ + if (DST != x) \ + changed |= true; \ DST = x; \ } while (0) @@ -70,8 +81,8 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, EXTRACT(status->caller_access); /* call ticket dependent */ EXTRACT(status->anon_access); EXTRACT(status->mode); - EXTRACT(status->parent.vnode); - EXTRACT(status->parent.unique); + bp++; /* parent.vnode */ + bp++; /* parent.unique */ bp++; /* seg size */ status->mtime_client = ntohl(*bp++); status->mtime_server = ntohl(*bp++); @@ -95,7 +106,6 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, status->mtime_client, status->mtime_server); if (vnode) { - status->parent.vid = vnode->fid.vid; if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { _debug("vnode changed"); i_size_write(&vnode->vfs_inode, size); @@ -127,25 +137,47 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, _debug("vnode modified %llx on {%x:%u}", (unsigned long long) data_version, vnode->fid.vid, vnode->fid.vnode); - set_bit(AFS_VNODE_MODIFIED, &vnode->flags); + set_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags); set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); } } else if (store_version) { status->data_version = data_version; } + + if (vnode) + write_sequnlock(&vnode->cb_lock); } /* * decode an AFSCallBack block */ -static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode) +static void xdr_decode_AFSCallBack(struct afs_call *call, + struct afs_vnode *vnode, + const __be32 **_bp) { + struct afs_cb_interest *old, *cbi = call->cbi; const __be32 *bp = *_bp; + u32 cb_expiry; + + write_seqlock(&vnode->cb_lock); + + if (call->cb_break == (vnode->cb_break + cbi->server->cb_s_break)) { + vnode->cb_version = ntohl(*bp++); + cb_expiry = ntohl(*bp++); + vnode->cb_type = ntohl(*bp++); + vnode->cb_expires_at = cb_expiry + ktime_get_real_seconds(); + old = vnode->cb_interest; + if (old != call->cbi) { + vnode->cb_interest = cbi; + cbi = old; + } + set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + } else { + bp += 3; + } - vnode->cb_version = ntohl(*bp++); - vnode->cb_expiry = ntohl(*bp++); - vnode->cb_type = ntohl(*bp++); - vnode->cb_expires = vnode->cb_expiry + ktime_get_real_seconds(); + write_sequnlock(&vnode->cb_lock); + call->cbi = cbi; *_bp = bp; } @@ -243,22 +275,22 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, */ static int afs_deliver_fs_fetch_status(struct afs_call *call) { - struct afs_vnode *vnode = call->reply; + struct afs_vnode *vnode = call->reply[0]; const __be32 *bp; int ret; - _enter(""); - ret = afs_transfer_reply(call); if (ret < 0) return ret; + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); - xdr_decode_AFSCallBack(&bp, vnode); - if (call->reply2) - xdr_decode_AFSVolSync(&bp, call->reply2); + xdr_decode_AFSCallBack(call, vnode, &bp); + if (call->reply[1]) + xdr_decode_AFSVolSync(&bp, call->reply[1]); _leave(" = 0 [done]"); return 0; @@ -269,35 +301,33 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call) */ static const struct afs_call_type afs_RXFSFetchStatus = { .name = "FS.FetchStatus", + .op = afs_FS_FetchStatus, .deliver = afs_deliver_fs_fetch_status, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* * fetch the status information for a file */ -int afs_fs_fetch_file_status(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - struct afs_volsync *volsync, - bool async) +int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync) { + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(",%x,{%x:%u},,", - key_serial(key), vnode->fid.vid, vnode->fid.vnode); + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); - call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); - if (!call) + call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); + if (!call) { + fc->ac.error = -ENOMEM; return -ENOMEM; + } - call->key = key; - call->reply = vnode; - call->reply2 = volsync; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = volsync; /* marshall the parameters */ bp = call->request; @@ -306,7 +336,10 @@ int afs_fs_fetch_file_status(struct afs_server *server, bp[2] = htonl(vnode->fid.vnode); bp[3] = htonl(vnode->fid.unique); - return afs_make_call(&server->addr, call, GFP_NOFS, async); + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* @@ -314,8 +347,8 @@ int afs_fs_fetch_file_status(struct afs_server *server, */ static int afs_deliver_fs_fetch_data(struct afs_call *call) { - struct afs_vnode *vnode = call->reply; - struct afs_read *req = call->reply3; + struct afs_vnode *vnode = call->reply[0]; + struct afs_read *req = call->reply[2]; const __be32 *bp; unsigned int size; void *buffer; @@ -431,9 +464,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) bp = call->buffer; xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); - xdr_decode_AFSCallBack(&bp, vnode); - if (call->reply2) - xdr_decode_AFSVolSync(&bp, call->reply2); + xdr_decode_AFSCallBack(call, vnode, &bp); + if (call->reply[1]) + xdr_decode_AFSVolSync(&bp, call->reply[1]); call->offset = 0; call->unmarshall++; @@ -457,7 +490,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) static void afs_fetch_data_destructor(struct afs_call *call) { - struct afs_read *req = call->reply3; + struct afs_read *req = call->reply[2]; afs_put_read(req); afs_flat_call_destructor(call); @@ -468,43 +501,38 @@ static void afs_fetch_data_destructor(struct afs_call *call) */ static const struct afs_call_type afs_RXFSFetchData = { .name = "FS.FetchData", + .op = afs_FS_FetchData, .deliver = afs_deliver_fs_fetch_data, - .abort_to_error = afs_abort_to_error, .destructor = afs_fetch_data_destructor, }; static const struct afs_call_type afs_RXFSFetchData64 = { .name = "FS.FetchData64", + .op = afs_FS_FetchData64, .deliver = afs_deliver_fs_fetch_data, - .abort_to_error = afs_abort_to_error, .destructor = afs_fetch_data_destructor, }; /* * fetch data from a very large file */ -static int afs_fs_fetch_data64(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - struct afs_read *req, - bool async) +static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req) { + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(""); - call = afs_alloc_flat_call(&afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4); + call = afs_alloc_flat_call(net, &afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4); if (!call) return -ENOMEM; - call->key = key; - call->reply = vnode; - call->reply2 = NULL; /* volsync */ - call->reply3 = req; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); - call->operation_ID = FSFETCHDATA64; + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = NULL; /* volsync */ + call->reply[2] = req; /* marshall the parameters */ bp = call->request; @@ -518,39 +546,37 @@ static int afs_fs_fetch_data64(struct afs_server *server, bp[7] = htonl(lower_32_bits(req->len)); atomic_inc(&req->usage); - return afs_make_call(&server->addr, call, GFP_NOFS, async); + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* * fetch data from a file */ -int afs_fs_fetch_data(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - struct afs_read *req, - bool async) +int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req) { + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); __be32 *bp; if (upper_32_bits(req->pos) || upper_32_bits(req->len) || upper_32_bits(req->pos + req->len)) - return afs_fs_fetch_data64(server, key, vnode, req, async); + return afs_fs_fetch_data64(fc, req); _enter(""); - call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4); + call = afs_alloc_flat_call(net, &afs_RXFSFetchData, 24, (21 + 3 + 6) * 4); if (!call) return -ENOMEM; - call->key = key; - call->reply = vnode; - call->reply2 = NULL; /* volsync */ - call->reply3 = req; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); - call->operation_ID = FSFETCHDATA; + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = NULL; /* volsync */ + call->reply[2] = req; /* marshall the parameters */ bp = call->request; @@ -562,90 +588,10 @@ int afs_fs_fetch_data(struct afs_server *server, bp[5] = htonl(lower_32_bits(req->len)); atomic_inc(&req->usage); - return afs_make_call(&server->addr, call, GFP_NOFS, async); -} - -/* - * deliver reply data to an FS.GiveUpCallBacks - */ -static int afs_deliver_fs_give_up_callbacks(struct afs_call *call) -{ - _enter(""); - - /* shouldn't be any reply data */ - return afs_extract_data(call, NULL, 0, false); -} - -/* - * FS.GiveUpCallBacks operation type - */ -static const struct afs_call_type afs_RXFSGiveUpCallBacks = { - .name = "FS.GiveUpCallBacks", - .deliver = afs_deliver_fs_give_up_callbacks, - .abort_to_error = afs_abort_to_error, - .destructor = afs_flat_call_destructor, -}; - -/* - * give up a set of callbacks - * - the callbacks are held in the server->cb_break ring - */ -int afs_fs_give_up_callbacks(struct afs_server *server, - bool async) -{ - struct afs_call *call; - size_t ncallbacks; - __be32 *bp, *tp; - int loop; - - ncallbacks = CIRC_CNT(server->cb_break_head, server->cb_break_tail, - ARRAY_SIZE(server->cb_break)); - - _enter("{%zu},", ncallbacks); - - if (ncallbacks == 0) - return 0; - if (ncallbacks > AFSCBMAX) - ncallbacks = AFSCBMAX; - - _debug("break %zu callbacks", ncallbacks); - - call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks, - 12 + ncallbacks * 6 * 4, 0); - if (!call) - return -ENOMEM; - - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); - - /* marshall the parameters */ - bp = call->request; - tp = bp + 2 + ncallbacks * 3; - *bp++ = htonl(FSGIVEUPCALLBACKS); - *bp++ = htonl(ncallbacks); - *tp++ = htonl(ncallbacks); - - atomic_sub(ncallbacks, &server->cb_break_n); - for (loop = ncallbacks; loop > 0; loop--) { - struct afs_callback *cb = - &server->cb_break[server->cb_break_tail]; - - *bp++ = htonl(cb->fid.vid); - *bp++ = htonl(cb->fid.vnode); - *bp++ = htonl(cb->fid.unique); - *tp++ = htonl(cb->version); - *tp++ = htonl(cb->expiry); - *tp++ = htonl(cb->type); - smp_mb(); - server->cb_break_tail = - (server->cb_break_tail + 1) & - (ARRAY_SIZE(server->cb_break) - 1); - } - - ASSERT(ncallbacks > 0); - wake_up_nr(&server->cb_break_waitq, ncallbacks); - - return afs_make_call(&server->addr, call, GFP_NOFS, async); + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* @@ -653,7 +599,7 @@ int afs_fs_give_up_callbacks(struct afs_server *server, */ static int afs_deliver_fs_create_vnode(struct afs_call *call) { - struct afs_vnode *vnode = call->reply; + struct afs_vnode *vnode = call->reply[0]; const __be32 *bp; int ret; @@ -665,11 +611,11 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFid(&bp, call->reply2); - xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL); + xdr_decode_AFSFid(&bp, call->reply[1]); + xdr_decode_AFSFetchStatus(&bp, call->reply[2], NULL, NULL); xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); - xdr_decode_AFSCallBack_raw(&bp, call->reply4); - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + xdr_decode_AFSCallBack_raw(&bp, call->reply[3]); + /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); return 0; @@ -678,27 +624,33 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call) /* * FS.CreateFile and FS.MakeDir operation type */ -static const struct afs_call_type afs_RXFSCreateXXXX = { - .name = "FS.CreateXXXX", +static const struct afs_call_type afs_RXFSCreateFile = { + .name = "FS.CreateFile", + .op = afs_FS_CreateFile, + .deliver = afs_deliver_fs_create_vnode, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type afs_RXFSMakeDir = { + .name = "FS.MakeDir", + .op = afs_FS_MakeDir, .deliver = afs_deliver_fs_create_vnode, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* * create a file or make a directory */ -int afs_fs_create(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, +int afs_fs_create(struct afs_fs_cursor *fc, const char *name, umode_t mode, struct afs_fid *newfid, struct afs_file_status *newstatus, - struct afs_callback *newcb, - bool async) + struct afs_callback *newcb) { + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); size_t namesz, reqsz, padsz; __be32 *bp; @@ -708,18 +660,17 @@ int afs_fs_create(struct afs_server *server, padsz = (4 - (namesz & 3)) & 3; reqsz = (5 * 4) + namesz + padsz + (6 * 4); - call = afs_alloc_flat_call(&afs_RXFSCreateXXXX, reqsz, - (3 + 21 + 21 + 3 + 6) * 4); + call = afs_alloc_flat_call( + net, S_ISDIR(mode) ? &afs_RXFSMakeDir : &afs_RXFSCreateFile, + reqsz, (3 + 21 + 21 + 3 + 6) * 4); if (!call) return -ENOMEM; - call->key = key; - call->reply = vnode; - call->reply2 = newfid; - call->reply3 = newstatus; - call->reply4 = newcb; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = newfid; + call->reply[2] = newstatus; + call->reply[3] = newcb; /* marshall the parameters */ bp = call->request; @@ -741,7 +692,9 @@ int afs_fs_create(struct afs_server *server, *bp++ = htonl(mode & S_IALLUGO); /* unix mode */ *bp++ = 0; /* segment size */ - return afs_make_call(&server->addr, call, GFP_NOFS, async); + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* @@ -749,7 +702,7 @@ int afs_fs_create(struct afs_server *server, */ static int afs_deliver_fs_remove(struct afs_call *call) { - struct afs_vnode *vnode = call->reply; + struct afs_vnode *vnode = call->reply[0]; const __be32 *bp; int ret; @@ -762,7 +715,7 @@ static int afs_deliver_fs_remove(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); return 0; @@ -771,24 +724,28 @@ static int afs_deliver_fs_remove(struct afs_call *call) /* * FS.RemoveDir/FS.RemoveFile operation type */ -static const struct afs_call_type afs_RXFSRemoveXXXX = { - .name = "FS.RemoveXXXX", +static const struct afs_call_type afs_RXFSRemoveFile = { + .name = "FS.RemoveFile", + .op = afs_FS_RemoveFile, + .deliver = afs_deliver_fs_remove, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type afs_RXFSRemoveDir = { + .name = "FS.RemoveDir", + .op = afs_FS_RemoveDir, .deliver = afs_deliver_fs_remove, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* * remove a file or directory */ -int afs_fs_remove(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - const char *name, - bool isdir, - bool async) +int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir) { + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); size_t namesz, reqsz, padsz; __be32 *bp; @@ -798,14 +755,14 @@ int afs_fs_remove(struct afs_server *server, padsz = (4 - (namesz & 3)) & 3; reqsz = (5 * 4) + namesz + padsz; - call = afs_alloc_flat_call(&afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4); + call = afs_alloc_flat_call( + net, isdir ? &afs_RXFSRemoveDir : &afs_RXFSRemoveFile, + reqsz, (21 + 6) * 4); if (!call) return -ENOMEM; - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call->key = fc->key; + call->reply[0] = vnode; /* marshall the parameters */ bp = call->request; @@ -821,7 +778,9 @@ int afs_fs_remove(struct afs_server *server, bp = (void *) bp + padsz; } - return afs_make_call(&server->addr, call, GFP_NOFS, async); + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* @@ -829,7 +788,7 @@ int afs_fs_remove(struct afs_server *server, */ static int afs_deliver_fs_link(struct afs_call *call) { - struct afs_vnode *dvnode = call->reply, *vnode = call->reply2; + struct afs_vnode *dvnode = call->reply[0], *vnode = call->reply[1]; const __be32 *bp; int ret; @@ -843,7 +802,7 @@ static int afs_deliver_fs_link(struct afs_call *call) bp = call->buffer; xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL); - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); return 0; @@ -854,22 +813,20 @@ static int afs_deliver_fs_link(struct afs_call *call) */ static const struct afs_call_type afs_RXFSLink = { .name = "FS.Link", + .op = afs_FS_Link, .deliver = afs_deliver_fs_link, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* * make a hard link */ -int afs_fs_link(struct afs_server *server, - struct key *key, - struct afs_vnode *dvnode, - struct afs_vnode *vnode, - const char *name, - bool async) +int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode, + const char *name) { + struct afs_vnode *dvnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); size_t namesz, reqsz, padsz; __be32 *bp; @@ -879,15 +836,13 @@ int afs_fs_link(struct afs_server *server, padsz = (4 - (namesz & 3)) & 3; reqsz = (5 * 4) + namesz + padsz + (3 * 4); - call = afs_alloc_flat_call(&afs_RXFSLink, reqsz, (21 + 21 + 6) * 4); + call = afs_alloc_flat_call(net, &afs_RXFSLink, reqsz, (21 + 21 + 6) * 4); if (!call) return -ENOMEM; - call->key = key; - call->reply = dvnode; - call->reply2 = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call->key = fc->key; + call->reply[0] = dvnode; + call->reply[1] = vnode; /* marshall the parameters */ bp = call->request; @@ -906,7 +861,9 @@ int afs_fs_link(struct afs_server *server, *bp++ = htonl(vnode->fid.vnode); *bp++ = htonl(vnode->fid.unique); - return afs_make_call(&server->addr, call, GFP_NOFS, async); + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* @@ -914,7 +871,7 @@ int afs_fs_link(struct afs_server *server, */ static int afs_deliver_fs_symlink(struct afs_call *call) { - struct afs_vnode *vnode = call->reply; + struct afs_vnode *vnode = call->reply[0]; const __be32 *bp; int ret; @@ -926,10 +883,10 @@ static int afs_deliver_fs_symlink(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFid(&bp, call->reply2); - xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL); + xdr_decode_AFSFid(&bp, call->reply[1]); + xdr_decode_AFSFetchStatus(&bp, call->reply[2], NULL, NULL); xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); return 0; @@ -940,24 +897,23 @@ static int afs_deliver_fs_symlink(struct afs_call *call) */ static const struct afs_call_type afs_RXFSSymlink = { .name = "FS.Symlink", + .op = afs_FS_Symlink, .deliver = afs_deliver_fs_symlink, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* * create a symbolic link */ -int afs_fs_symlink(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, +int afs_fs_symlink(struct afs_fs_cursor *fc, const char *name, const char *contents, struct afs_fid *newfid, - struct afs_file_status *newstatus, - bool async) + struct afs_file_status *newstatus) { + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); size_t namesz, reqsz, padsz, c_namesz, c_padsz; __be32 *bp; @@ -971,17 +927,15 @@ int afs_fs_symlink(struct afs_server *server, reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4); - call = afs_alloc_flat_call(&afs_RXFSSymlink, reqsz, + call = afs_alloc_flat_call(net, &afs_RXFSSymlink, reqsz, (3 + 21 + 21 + 6) * 4); if (!call) return -ENOMEM; - call->key = key; - call->reply = vnode; - call->reply2 = newfid; - call->reply3 = newstatus; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = newfid; + call->reply[2] = newstatus; /* marshall the parameters */ bp = call->request; @@ -1010,7 +964,9 @@ int afs_fs_symlink(struct afs_server *server, *bp++ = htonl(S_IRWXUGO); /* unix mode */ *bp++ = 0; /* segment size */ - return afs_make_call(&server->addr, call, GFP_NOFS, async); + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* @@ -1018,7 +974,7 @@ int afs_fs_symlink(struct afs_server *server, */ static int afs_deliver_fs_rename(struct afs_call *call) { - struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2; + struct afs_vnode *orig_dvnode = call->reply[0], *new_dvnode = call->reply[1]; const __be32 *bp; int ret; @@ -1034,7 +990,7 @@ static int afs_deliver_fs_rename(struct afs_call *call) if (new_dvnode != orig_dvnode) xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode, NULL); - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); return 0; @@ -1045,23 +1001,22 @@ static int afs_deliver_fs_rename(struct afs_call *call) */ static const struct afs_call_type afs_RXFSRename = { .name = "FS.Rename", + .op = afs_FS_Rename, .deliver = afs_deliver_fs_rename, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* * create a symbolic link */ -int afs_fs_rename(struct afs_server *server, - struct key *key, - struct afs_vnode *orig_dvnode, +int afs_fs_rename(struct afs_fs_cursor *fc, const char *orig_name, struct afs_vnode *new_dvnode, - const char *new_name, - bool async) + const char *new_name) { + struct afs_vnode *orig_dvnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(orig_dvnode); size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz; __be32 *bp; @@ -1078,15 +1033,13 @@ int afs_fs_rename(struct afs_server *server, (3 * 4) + 4 + n_namesz + n_padsz; - call = afs_alloc_flat_call(&afs_RXFSRename, reqsz, (21 + 21 + 6) * 4); + call = afs_alloc_flat_call(net, &afs_RXFSRename, reqsz, (21 + 21 + 6) * 4); if (!call) return -ENOMEM; - call->key = key; - call->reply = orig_dvnode; - call->reply2 = new_dvnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call->key = fc->key; + call->reply[0] = orig_dvnode; + call->reply[1] = new_dvnode; /* marshall the parameters */ bp = call->request; @@ -1113,7 +1066,9 @@ int afs_fs_rename(struct afs_server *server, bp = (void *) bp + n_padsz; } - return afs_make_call(&server->addr, call, GFP_NOFS, async); + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &orig_dvnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* @@ -1121,7 +1076,7 @@ int afs_fs_rename(struct afs_server *server, */ static int afs_deliver_fs_store_data(struct afs_call *call) { - struct afs_vnode *vnode = call->reply; + struct afs_vnode *vnode = call->reply[0]; const __be32 *bp; int ret; @@ -1135,7 +1090,7 @@ static int afs_deliver_fs_store_data(struct afs_call *call) bp = call->buffer; xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, &call->store_version); - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ afs_pages_written_back(vnode, call); @@ -1148,47 +1103,44 @@ static int afs_deliver_fs_store_data(struct afs_call *call) */ static const struct afs_call_type afs_RXFSStoreData = { .name = "FS.StoreData", + .op = afs_FS_StoreData, .deliver = afs_deliver_fs_store_data, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; static const struct afs_call_type afs_RXFSStoreData64 = { .name = "FS.StoreData64", + .op = afs_FS_StoreData64, .deliver = afs_deliver_fs_store_data, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* * store a set of pages to a very large file */ -static int afs_fs_store_data64(struct afs_server *server, - struct afs_writeback *wb, +static int afs_fs_store_data64(struct afs_fs_cursor *fc, + struct address_space *mapping, pgoff_t first, pgoff_t last, unsigned offset, unsigned to, - loff_t size, loff_t pos, loff_t i_size, - bool async) + loff_t size, loff_t pos, loff_t i_size) { - struct afs_vnode *vnode = wb->vnode; + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(",%x,{%x:%u},,", - key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); - call = afs_alloc_flat_call(&afs_RXFSStoreData64, + call = afs_alloc_flat_call(net, &afs_RXFSStoreData64, (4 + 6 + 3 * 2) * 4, (21 + 6) * 4); if (!call) return -ENOMEM; - call->wb = wb; - call->key = wb->key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); - call->mapping = vnode->vfs_inode.i_mapping; + call->key = fc->key; + call->mapping = mapping; + call->reply[0] = vnode; call->first = first; call->last = last; call->first_offset = offset; @@ -1217,24 +1169,25 @@ static int afs_fs_store_data64(struct afs_server *server, *bp++ = htonl(i_size >> 32); *bp++ = htonl((u32) i_size); - return afs_make_call(&server->addr, call, GFP_NOFS, async); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* * store a set of pages */ -int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, +int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping, pgoff_t first, pgoff_t last, - unsigned offset, unsigned to, - bool async) + unsigned offset, unsigned to) { - struct afs_vnode *vnode = wb->vnode; + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); loff_t size, pos, i_size; __be32 *bp; _enter(",%x,{%x:%u},,", - key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode); + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); size = (loff_t)to - (loff_t)offset; if (first != last) @@ -1251,21 +1204,18 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, (unsigned long long) i_size); if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32) - return afs_fs_store_data64(server, wb, first, last, offset, to, - size, pos, i_size, async); + return afs_fs_store_data64(fc, mapping, first, last, offset, to, + size, pos, i_size); - call = afs_alloc_flat_call(&afs_RXFSStoreData, + call = afs_alloc_flat_call(net, &afs_RXFSStoreData, (4 + 6 + 3) * 4, (21 + 6) * 4); if (!call) return -ENOMEM; - call->wb = wb; - call->key = wb->key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); - call->mapping = vnode->vfs_inode.i_mapping; + call->key = fc->key; + call->mapping = mapping; + call->reply[0] = vnode; call->first = first; call->last = last; call->first_offset = offset; @@ -1291,7 +1241,9 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, *bp++ = htonl(size); *bp++ = htonl(i_size); - return afs_make_call(&server->addr, call, GFP_NOFS, async); + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* @@ -1300,7 +1252,7 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, static int afs_deliver_fs_store_status(struct afs_call *call) { afs_dataversion_t *store_version; - struct afs_vnode *vnode = call->reply; + struct afs_vnode *vnode = call->reply[0]; const __be32 *bp; int ret; @@ -1317,7 +1269,7 @@ static int afs_deliver_fs_store_status(struct afs_call *call) bp = call->buffer; xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version); - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); return 0; @@ -1328,22 +1280,22 @@ static int afs_deliver_fs_store_status(struct afs_call *call) */ static const struct afs_call_type afs_RXFSStoreStatus = { .name = "FS.StoreStatus", + .op = afs_FS_StoreStatus, .deliver = afs_deliver_fs_store_status, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; static const struct afs_call_type afs_RXFSStoreData_as_Status = { .name = "FS.StoreData", + .op = afs_FS_StoreData, .deliver = afs_deliver_fs_store_status, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; static const struct afs_call_type afs_RXFSStoreData64_as_Status = { .name = "FS.StoreData64", + .op = afs_FS_StoreData64, .deliver = afs_deliver_fs_store_status, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; @@ -1351,30 +1303,27 @@ static const struct afs_call_type afs_RXFSStoreData64_as_Status = { * set the attributes on a very large file, using FS.StoreData rather than * FS.StoreStatus so as to alter the file size also */ -static int afs_fs_setattr_size64(struct afs_server *server, struct key *key, - struct afs_vnode *vnode, struct iattr *attr, - bool async) +static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr) { + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(",%x,{%x:%u},,", - key_serial(key), vnode->fid.vid, vnode->fid.vnode); + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); ASSERT(attr->ia_valid & ATTR_SIZE); - call = afs_alloc_flat_call(&afs_RXFSStoreData64_as_Status, + call = afs_alloc_flat_call(net, &afs_RXFSStoreData64_as_Status, (4 + 6 + 3 * 2) * 4, (21 + 6) * 4); if (!call) return -ENOMEM; - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call->key = fc->key; + call->reply[0] = vnode; call->store_version = vnode->status.data_version + 1; - call->operation_ID = FSSTOREDATA; /* marshall the parameters */ bp = call->request; @@ -1392,40 +1341,38 @@ static int afs_fs_setattr_size64(struct afs_server *server, struct key *key, *bp++ = htonl(attr->ia_size >> 32); /* new file length */ *bp++ = htonl((u32) attr->ia_size); - return afs_make_call(&server->addr, call, GFP_NOFS, async); + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus * so as to alter the file size also */ -static int afs_fs_setattr_size(struct afs_server *server, struct key *key, - struct afs_vnode *vnode, struct iattr *attr, - bool async) +static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr) { + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(",%x,{%x:%u},,", - key_serial(key), vnode->fid.vid, vnode->fid.vnode); + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); ASSERT(attr->ia_valid & ATTR_SIZE); if (attr->ia_size >> 32) - return afs_fs_setattr_size64(server, key, vnode, attr, - async); + return afs_fs_setattr_size64(fc, attr); - call = afs_alloc_flat_call(&afs_RXFSStoreData_as_Status, + call = afs_alloc_flat_call(net, &afs_RXFSStoreData_as_Status, (4 + 6 + 3) * 4, (21 + 6) * 4); if (!call) return -ENOMEM; - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call->key = fc->key; + call->reply[0] = vnode; call->store_version = vnode->status.data_version + 1; - call->operation_ID = FSSTOREDATA; /* marshall the parameters */ bp = call->request; @@ -1440,38 +1387,36 @@ static int afs_fs_setattr_size(struct afs_server *server, struct key *key, *bp++ = 0; /* size of write */ *bp++ = htonl(attr->ia_size); /* new file length */ - return afs_make_call(&server->addr, call, GFP_NOFS, async); + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* * set the attributes on a file, using FS.StoreData if there's a change in file * size, and FS.StoreStatus otherwise */ -int afs_fs_setattr(struct afs_server *server, struct key *key, - struct afs_vnode *vnode, struct iattr *attr, - bool async) +int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr) { + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); __be32 *bp; if (attr->ia_valid & ATTR_SIZE) - return afs_fs_setattr_size(server, key, vnode, attr, - async); + return afs_fs_setattr_size(fc, attr); _enter(",%x,{%x:%u},,", - key_serial(key), vnode->fid.vid, vnode->fid.vnode); + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); - call = afs_alloc_flat_call(&afs_RXFSStoreStatus, + call = afs_alloc_flat_call(net, &afs_RXFSStoreStatus, (4 + 6) * 4, (21 + 6) * 4); if (!call) return -ENOMEM; - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); - call->operation_ID = FSSTORESTATUS; + call->key = fc->key; + call->reply[0] = vnode; /* marshall the parameters */ bp = call->request; @@ -1482,7 +1427,9 @@ int afs_fs_setattr(struct afs_server *server, struct key *key, xdr_encode_AFS_StoreStatus(&bp, attr); - return afs_make_call(&server->addr, call, GFP_NOFS, async); + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* @@ -1510,7 +1457,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2); + xdr_decode_AFSFetchVolumeStatus(&bp, call->reply[1]); call->offset = 0; call->unmarshall++; @@ -1531,13 +1478,13 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) case 3: _debug("extract volname"); if (call->count > 0) { - ret = afs_extract_data(call, call->reply3, + ret = afs_extract_data(call, call->reply[2], call->count, true); if (ret < 0) return ret; } - p = call->reply3; + p = call->reply[2]; p[call->count] = 0; _debug("volname '%s'", p); @@ -1578,13 +1525,13 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) case 6: _debug("extract offline"); if (call->count > 0) { - ret = afs_extract_data(call, call->reply3, + ret = afs_extract_data(call, call->reply[2], call->count, true); if (ret < 0) return ret; } - p = call->reply3; + p = call->reply[2]; p[call->count] = 0; _debug("offline '%s'", p); @@ -1625,13 +1572,13 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) case 9: _debug("extract motd"); if (call->count > 0) { - ret = afs_extract_data(call, call->reply3, + ret = afs_extract_data(call, call->reply[2], call->count, true); if (ret < 0) return ret; } - p = call->reply3; + p = call->reply[2]; p[call->count] = 0; _debug("motd '%s'", p); @@ -1662,8 +1609,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) */ static void afs_get_volume_status_call_destructor(struct afs_call *call) { - kfree(call->reply3); - call->reply3 = NULL; + kfree(call->reply[2]); + call->reply[2] = NULL; afs_flat_call_destructor(call); } @@ -1672,21 +1619,20 @@ static void afs_get_volume_status_call_destructor(struct afs_call *call) */ static const struct afs_call_type afs_RXFSGetVolumeStatus = { .name = "FS.GetVolumeStatus", + .op = afs_FS_GetVolumeStatus, .deliver = afs_deliver_fs_get_volume_status, - .abort_to_error = afs_abort_to_error, .destructor = afs_get_volume_status_call_destructor, }; /* * fetch the status of a volume */ -int afs_fs_get_volume_status(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - struct afs_volume_status *vs, - bool async) +int afs_fs_get_volume_status(struct afs_fs_cursor *fc, + struct afs_volume_status *vs) { + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); __be32 *bp; void *tmpbuf; @@ -1696,25 +1642,25 @@ int afs_fs_get_volume_status(struct afs_server *server, if (!tmpbuf) return -ENOMEM; - call = afs_alloc_flat_call(&afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4); + call = afs_alloc_flat_call(net, &afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4); if (!call) { kfree(tmpbuf); return -ENOMEM; } - call->key = key; - call->reply = vnode; - call->reply2 = vs; - call->reply3 = tmpbuf; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = vs; + call->reply[2] = tmpbuf; /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSGETVOLUMESTATUS); bp[1] = htonl(vnode->fid.vid); - return afs_make_call(&server->addr, call, GFP_NOFS, async); + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* @@ -1733,7 +1679,7 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); return 0; @@ -1744,8 +1690,8 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call) */ static const struct afs_call_type afs_RXFSSetLock = { .name = "FS.SetLock", + .op = afs_FS_SetLock, .deliver = afs_deliver_fs_xxxx_lock, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; @@ -1754,8 +1700,8 @@ static const struct afs_call_type afs_RXFSSetLock = { */ static const struct afs_call_type afs_RXFSExtendLock = { .name = "FS.ExtendLock", + .op = afs_FS_ExtendLock, .deliver = afs_deliver_fs_xxxx_lock, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; @@ -1764,33 +1710,29 @@ static const struct afs_call_type afs_RXFSExtendLock = { */ static const struct afs_call_type afs_RXFSReleaseLock = { .name = "FS.ReleaseLock", + .op = afs_FS_ReleaseLock, .deliver = afs_deliver_fs_xxxx_lock, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* - * get a lock on a file + * Set a lock on a file */ -int afs_fs_set_lock(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - afs_lock_type_t type, - bool async) +int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type) { + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(""); - call = afs_alloc_flat_call(&afs_RXFSSetLock, 5 * 4, 6 * 4); + call = afs_alloc_flat_call(net, &afs_RXFSSetLock, 5 * 4, 6 * 4); if (!call) return -ENOMEM; - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call->key = fc->key; + call->reply[0] = vnode; /* marshall the parameters */ bp = call->request; @@ -1800,30 +1742,29 @@ int afs_fs_set_lock(struct afs_server *server, *bp++ = htonl(vnode->fid.unique); *bp++ = htonl(type); - return afs_make_call(&server->addr, call, GFP_NOFS, async); + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* * extend a lock on a file */ -int afs_fs_extend_lock(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - bool async) +int afs_fs_extend_lock(struct afs_fs_cursor *fc) { + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(""); - call = afs_alloc_flat_call(&afs_RXFSExtendLock, 4 * 4, 6 * 4); + call = afs_alloc_flat_call(net, &afs_RXFSExtendLock, 4 * 4, 6 * 4); if (!call) return -ENOMEM; - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call->key = fc->key; + call->reply[0] = vnode; /* marshall the parameters */ bp = call->request; @@ -1832,30 +1773,29 @@ int afs_fs_extend_lock(struct afs_server *server, *bp++ = htonl(vnode->fid.vnode); *bp++ = htonl(vnode->fid.unique); - return afs_make_call(&server->addr, call, GFP_NOFS, async); + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); } /* * release a lock on a file */ -int afs_fs_release_lock(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - bool async) +int afs_fs_release_lock(struct afs_fs_cursor *fc) { + struct afs_vnode *vnode = fc->vnode; struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); __be32 *bp; _enter(""); - call = afs_alloc_flat_call(&afs_RXFSReleaseLock, 4 * 4, 6 * 4); + call = afs_alloc_flat_call(net, &afs_RXFSReleaseLock, 4 * 4, 6 * 4); if (!call) return -ENOMEM; - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call->key = fc->key; + call->reply[0] = vnode; /* marshall the parameters */ bp = call->request; @@ -1864,5 +1804,145 @@ int afs_fs_release_lock(struct afs_server *server, *bp++ = htonl(vnode->fid.vnode); *bp++ = htonl(vnode->fid.unique); - return afs_make_call(&server->addr, call, GFP_NOFS, async); + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an FS.GiveUpAllCallBacks operation. + */ +static int afs_deliver_fs_give_up_all_callbacks(struct afs_call *call) +{ + return afs_transfer_reply(call); +} + +/* + * FS.GiveUpAllCallBacks operation type + */ +static const struct afs_call_type afs_RXFSGiveUpAllCallBacks = { + .name = "FS.GiveUpAllCallBacks", + .op = afs_FS_GiveUpAllCallBacks, + .deliver = afs_deliver_fs_give_up_all_callbacks, + .destructor = afs_flat_call_destructor, +}; + +/* + * Flush all the callbacks we have on a server. + */ +int afs_fs_give_up_all_callbacks(struct afs_net *net, + struct afs_server *server, + struct afs_addr_cursor *ac, + struct key *key) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &afs_RXFSGiveUpAllCallBacks, 1 * 4, 0); + if (!call) + return -ENOMEM; + + call->key = key; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSGIVEUPALLCALLBACKS); + + /* Can't take a ref on server */ + return afs_make_call(ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an FS.GetCapabilities operation. + */ +static int afs_deliver_fs_get_capabilities(struct afs_call *call) +{ + u32 count; + int ret; + + _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count); + +again: + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* Extract the capabilities word count */ + case 1: + ret = afs_extract_data(call, &call->tmp, + 1 * sizeof(__be32), + true); + if (ret < 0) + return ret; + + count = ntohl(call->tmp); + + call->count = count; + call->count2 = count; + call->offset = 0; + call->unmarshall++; + + /* Extract capabilities words */ + case 2: + count = min(call->count, 16U); + ret = afs_extract_data(call, call->buffer, + count * sizeof(__be32), + call->count > 16); + if (ret < 0) + return ret; + + /* TODO: Examine capabilities */ + + call->count -= count; + if (call->count > 0) + goto again; + call->offset = 0; + call->unmarshall++; + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.GetCapabilities operation type + */ +static const struct afs_call_type afs_RXFSGetCapabilities = { + .name = "FS.GetCapabilities", + .op = afs_FS_GetCapabilities, + .deliver = afs_deliver_fs_get_capabilities, + .destructor = afs_flat_call_destructor, +}; + +/* + * Probe a fileserver for the capabilities that it supports. This can + * return up to 196 words. + */ +int afs_fs_get_capabilities(struct afs_net *net, + struct afs_server *server, + struct afs_addr_cursor *ac, + struct key *key) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &afs_RXFSGetCapabilities, 1 * 4, 16 * 4); + if (!call) + return -ENOMEM; + + call->key = key; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSGETCAPABILITIES); + + /* Can't take a ref on server */ + trace_afs_make_fs_call(call, NULL); + return afs_make_call(ac, call, GFP_NOFS, false); } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 342316a9e3e0..1e81864ef0b2 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -23,11 +23,6 @@ #include <linux/namei.h> #include "internal.h" -struct afs_iget_data { - struct afs_fid fid; - struct afs_volume *volume; /* volume on which resides */ -}; - static const struct inode_operations afs_symlink_inode_operations = { .get_link = page_get_link, .listxattr = afs_listxattr, @@ -39,6 +34,7 @@ static const struct inode_operations afs_symlink_inode_operations = { static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) { struct inode *inode = AFS_VNODE_TO_I(vnode); + bool changed; _debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu", vnode->status.type, @@ -47,6 +43,8 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) vnode->status.data_version, vnode->status.mode); + read_seqlock_excl(&vnode->cb_lock); + switch (vnode->status.type) { case AFS_FTYPE_FILE: inode->i_mode = S_IFREG | vnode->status.mode; @@ -63,9 +61,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) if ((vnode->status.mode & 0777) == 0644) { inode->i_flags |= S_AUTOMOUNT; - spin_lock(&vnode->lock); set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); - spin_unlock(&vnode->lock); inode->i_mode = S_IFDIR | 0555; inode->i_op = &afs_mntpt_inode_operations; @@ -78,13 +74,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) break; default: printk("kAFS: AFS vnode with undefined type\n"); + read_sequnlock_excl(&vnode->cb_lock); return -EBADMSG; } -#ifdef CONFIG_AFS_FSCACHE - if (vnode->status.size != inode->i_size) - fscache_attr_changed(vnode->cache); -#endif + changed = (vnode->status.size != inode->i_size); set_nlink(inode, vnode->status.nlink); inode->i_uid = vnode->status.owner; @@ -97,13 +91,49 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_generation = vnode->fid.unique; inode->i_version = vnode->status.data_version; inode->i_mapping->a_ops = &afs_fs_aops; + + read_sequnlock_excl(&vnode->cb_lock); + +#ifdef CONFIG_AFS_FSCACHE + if (changed) + fscache_attr_changed(vnode->cache); +#endif return 0; } /* + * Fetch file status from the volume. + */ +int afs_fetch_status(struct afs_vnode *vnode, struct key *key) +{ + struct afs_fs_cursor fc; + int ret; + + _enter("%s,{%x:%u.%u,S=%lx}", + vnode->volume->name, + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, + vnode->flags); + + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_fetch_file_status(&fc, NULL); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + } + + _leave(" = %d", ret); + return ret; +} + +/* * iget5() comparator */ -static int afs_iget5_test(struct inode *inode, void *opaque) +int afs_iget5_test(struct inode *inode, void *opaque) { struct afs_iget_data *data = opaque; @@ -204,7 +234,7 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, */ struct inode *afs_iget(struct super_block *sb, struct key *key, struct afs_fid *fid, struct afs_file_status *status, - struct afs_callback *cb) + struct afs_callback *cb, struct afs_cb_interest *cbi) { struct afs_iget_data data = { .fid = *fid }; struct afs_super_info *as; @@ -237,8 +267,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, if (!status) { /* it's a remotely extant inode */ - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - ret = afs_vnode_fetch_status(vnode, NULL, key); + ret = afs_fetch_status(vnode, key); if (ret < 0) goto bad_inode; } else { @@ -249,16 +278,17 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, /* it's a symlink we just created (the fileserver * didn't give us a callback) */ vnode->cb_version = 0; - vnode->cb_expiry = 0; vnode->cb_type = 0; - vnode->cb_expires = ktime_get_real_seconds(); + vnode->cb_expires_at = 0; } else { vnode->cb_version = cb->version; - vnode->cb_expiry = cb->expiry; vnode->cb_type = cb->type; - vnode->cb_expires = vnode->cb_expiry + - ktime_get_real_seconds(); + vnode->cb_expires_at = cb->expiry; + vnode->cb_interest = afs_get_cb_interest(cbi); + set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); } + + vnode->cb_expires_at += ktime_get_real_seconds(); } /* set up caching before mapping the status, as map-status reads the @@ -320,25 +350,38 @@ void afs_zap_data(struct afs_vnode *vnode) */ int afs_validate(struct afs_vnode *vnode, struct key *key) { + time64_t now = ktime_get_real_seconds(); + bool valid = false; int ret; _enter("{v={%x:%u} fl=%lx},%x", vnode->fid.vid, vnode->fid.vnode, vnode->flags, key_serial(key)); - if (vnode->cb_promised && - !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && - !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) && - !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { - if (vnode->cb_expires < ktime_get_real_seconds() + 10) { - _debug("callback expired"); - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - } else { - goto valid; + /* Quickly check the callback state. Ideally, we'd use read_seqbegin + * here, but we have no way to pass the net namespace to the RCU + * cleanup for the server record. + */ + read_seqlock_excl(&vnode->cb_lock); + + if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break) { + vnode->cb_s_break = vnode->cb_interest->server->cb_s_break; + } else if (!test_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags) && + !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && + vnode->cb_expires_at - 10 > now) { + valid = true; } + } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + valid = true; } + read_sequnlock_excl(&vnode->cb_lock); + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + clear_nlink(&vnode->vfs_inode); + + if (valid) goto valid; mutex_lock(&vnode->validate_lock); @@ -347,12 +390,16 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) * a new promise - note that if the (parent) directory's metadata was * changed then the security may be different and we may no longer have * access */ - if (!vnode->cb_promised || - test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { + if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { _debug("not promised"); - ret = afs_vnode_fetch_status(vnode, NULL, key); - if (ret < 0) + ret = afs_fetch_status(vnode, key); + if (ret < 0) { + if (ret == -ENOENT) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + ret = -ESTALE; + } goto error_unlock; + } _debug("new promise [fl=%lx]", vnode->flags); } @@ -367,7 +414,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) afs_zap_data(vnode); - clear_bit(AFS_VNODE_MODIFIED, &vnode->flags); + clear_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags); mutex_unlock(&vnode->validate_lock); valid: _leave(" = 0"); @@ -386,10 +433,17 @@ int afs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); + struct afs_vnode *vnode = AFS_FS_I(inode); + int seq = 0; _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); - generic_fillattr(inode, stat); + do { + read_seqbegin_or_lock(&vnode->cb_lock, &seq); + generic_fillattr(inode, stat); + } while (need_seqretry(&vnode->cb_lock, seq)); + + done_seqretry(&vnode->cb_lock, seq); return 0; } @@ -411,18 +465,14 @@ int afs_drop_inode(struct inode *inode) */ void afs_evict_inode(struct inode *inode) { - struct afs_permits *permits; struct afs_vnode *vnode; vnode = AFS_FS_I(inode); - _enter("{%x:%u.%d} v=%u x=%u t=%u }", + _enter("{%x:%u.%d}", vnode->fid.vid, vnode->fid.vnode, - vnode->fid.unique, - vnode->cb_version, - vnode->cb_expiry, - vnode->cb_type); + vnode->fid.unique); _debug("CLEAR INODE %p", inode); @@ -431,31 +481,24 @@ void afs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - afs_give_up_callback(vnode); - - if (vnode->server) { - spin_lock(&vnode->server->fs_lock); - rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes); - spin_unlock(&vnode->server->fs_lock); - afs_put_server(vnode->server); - vnode->server = NULL; + if (vnode->cb_interest) { + afs_put_cb_interest(afs_i2net(inode), vnode->cb_interest); + vnode->cb_interest = NULL; } - ASSERT(list_empty(&vnode->writebacks)); - ASSERT(!vnode->cb_promised); + while (!list_empty(&vnode->wb_keys)) { + struct afs_wb_key *wbk = list_entry(vnode->wb_keys.next, + struct afs_wb_key, vnode_link); + list_del(&wbk->vnode_link); + afs_put_wb_key(wbk); + } #ifdef CONFIG_AFS_FSCACHE fscache_relinquish_cookie(vnode->cache, 0); vnode->cache = NULL; #endif - mutex_lock(&vnode->permits_lock); - permits = vnode->permits; - RCU_INIT_POINTER(vnode->permits, NULL); - mutex_unlock(&vnode->permits_lock); - if (permits) - call_rcu(&permits->rcu, afs_zap_permits); - + afs_put_permits(vnode->permit_cache); _leave(""); } @@ -464,6 +507,7 @@ void afs_evict_inode(struct inode *inode) */ int afs_setattr(struct dentry *dentry, struct iattr *attr) { + struct afs_fs_cursor fc; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); struct key *key; int ret; @@ -479,13 +523,11 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) } /* flush any dirty data outstanding on a regular file */ - if (S_ISREG(vnode->vfs_inode.i_mode)) { + if (S_ISREG(vnode->vfs_inode.i_mode)) filemap_write_and_wait(vnode->vfs_inode.i_mapping); - afs_writeback_all(vnode); - } if (attr->ia_valid & ATTR_FILE) { - key = attr->ia_file->private_data; + key = afs_file_key(attr->ia_file); } else { key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) { @@ -494,7 +536,18 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) } } - ret = afs_vnode_setattr(vnode, key, attr); + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_setattr(&fc, attr); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + } + if (!(attr->ia_valid & ATTR_FILE)) key_put(key); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 82e16556afea..804d1f905622 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -21,6 +21,7 @@ #include <linux/fscache.h> #include <linux/backing-dev.h> #include <linux/uuid.h> +#include <net/net_namespace.h> #include <net/af_rxrpc.h> #include "afs.h" @@ -31,16 +32,6 @@ struct pagevec; struct afs_call; -typedef enum { - AFS_VL_NEW, /* new, uninitialised record */ - AFS_VL_CREATING, /* creating record */ - AFS_VL_VALID, /* record is pending */ - AFS_VL_NO_VOLUME, /* no such volume available */ - AFS_VL_UPDATING, /* update in progress */ - AFS_VL_VOLUME_DELETED, /* volume was deleted */ - AFS_VL_UNCERTAIN, /* uncertain state (update failed) */ -} __attribute__((packed)) afs_vlocation_state_t; - struct afs_mount_params { bool rwpath; /* T if the parent should be considered R/W */ bool force; /* T to force cell type */ @@ -48,20 +39,43 @@ struct afs_mount_params { afs_voltype_t type; /* type of volume requested */ int volnamesz; /* size of volume name */ const char *volname; /* name of volume to mount */ + struct afs_net *net; /* Network namespace in effect */ struct afs_cell *cell; /* cell in which to find volume */ struct afs_volume *volume; /* volume record */ struct key *key; /* key to use for secure mounting */ }; +struct afs_iget_data { + struct afs_fid fid; + struct afs_volume *volume; /* volume on which resides */ +}; + enum afs_call_state { - AFS_CALL_REQUESTING, /* request is being sent for outgoing call */ - AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */ - AFS_CALL_AWAIT_OP_ID, /* awaiting op ID on incoming call */ - AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */ - AFS_CALL_REPLYING, /* replying to incoming call */ - AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */ - AFS_CALL_COMPLETE, /* Completed or failed */ + AFS_CALL_CL_REQUESTING, /* Client: Request is being sent */ + AFS_CALL_CL_AWAIT_REPLY, /* Client: Awaiting reply */ + AFS_CALL_CL_PROC_REPLY, /* Client: rxrpc call complete; processing reply */ + AFS_CALL_SV_AWAIT_OP_ID, /* Server: Awaiting op ID */ + AFS_CALL_SV_AWAIT_REQUEST, /* Server: Awaiting request data */ + AFS_CALL_SV_REPLYING, /* Server: Replying */ + AFS_CALL_SV_AWAIT_ACK, /* Server: Awaiting final ACK */ + AFS_CALL_COMPLETE, /* Completed or failed */ }; + +/* + * List of server addresses. + */ +struct afs_addr_list { + struct rcu_head rcu; /* Must be first */ + refcount_t usage; + u32 version; /* Version */ + unsigned short nr_addrs; + unsigned short index; /* Address currently in use */ + unsigned short nr_ipv4; /* Number of IPv4 addresses */ + unsigned long probed; /* Mask of servers that have been probed */ + unsigned long yfs; /* Mask of servers that are YFS */ + struct sockaddr_rxrpc addrs[]; +}; + /* * a record of an in-progress RxRPC call */ @@ -72,25 +86,25 @@ struct afs_call { struct work_struct work; /* actual work processor */ struct rxrpc_call *rxcall; /* RxRPC call handle */ struct key *key; /* security for this call */ - struct afs_server *server; /* server affected by incoming CM call */ + struct afs_net *net; /* The network namespace */ + struct afs_server *cm_server; /* Server affected by incoming CM call */ + struct afs_cb_interest *cbi; /* Callback interest for server used */ void *request; /* request data (first part) */ - struct address_space *mapping; /* page set */ - struct afs_writeback *wb; /* writeback being performed */ + struct address_space *mapping; /* Pages being written from */ void *buffer; /* reply receive buffer */ - void *reply; /* reply buffer (first part) */ - void *reply2; /* reply buffer (second part) */ - void *reply3; /* reply buffer (third part) */ - void *reply4; /* reply buffer (fourth part) */ + void *reply[4]; /* Where to put the reply */ pgoff_t first; /* first page in mapping to deal with */ pgoff_t last; /* last page in mapping to deal with */ size_t offset; /* offset into received data store */ atomic_t usage; enum afs_call_state state; + spinlock_t state_lock; int error; /* error code */ u32 abort_code; /* Remote abort ID or 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ unsigned first_offset; /* offset into mapping[first] */ + unsigned int cb_break; /* cb_break + cb_s_break before the call */ union { unsigned last_to; /* amount of mapping[last] */ unsigned count2; /* count used in unmarshalling */ @@ -100,8 +114,9 @@ struct afs_call { bool send_pages; /* T if data from mapping should be sent */ bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ - u16 service_id; /* RxRPC service ID to call */ - __be16 port; /* target UDP port */ + bool ret_reply0; /* T if should return reply[0] on success */ + bool upgrade; /* T to request service upgrade */ + u16 service_id; /* Actual service ID (after upgrade) */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ __be32 tmp; /* place to extract temporary data */ @@ -110,15 +125,13 @@ struct afs_call { struct afs_call_type { const char *name; + unsigned int op; /* Really enum afs_fs_operation */ /* deliver request or reply data to an call * - returning an error will cause the call to be aborted */ int (*deliver)(struct afs_call *call); - /* map an abort code to an error number */ - int (*abort_to_error)(u32 abort_code); - /* clean up a call */ void (*destructor)(struct afs_call *call); @@ -127,6 +140,30 @@ struct afs_call_type { }; /* + * Key available for writeback on a file. + */ +struct afs_wb_key { + refcount_t usage; + struct key *key; + struct list_head vnode_link; /* Link in vnode->wb_keys */ +}; + +/* + * AFS open file information record. Pointed to by file->private_data. + */ +struct afs_file { + struct key *key; /* The key this file was opened with */ + struct afs_wb_key *wb; /* Writeback key record for this file */ +}; + +static inline struct key *afs_file_key(struct file *file) +{ + struct afs_file *af = file->private_data; + + return af->key; +} + +/* * Record of an outstanding read operation on a vnode. */ struct afs_read { @@ -142,38 +179,13 @@ struct afs_read { }; /* - * record of an outstanding writeback on a vnode - */ -struct afs_writeback { - struct list_head link; /* link in vnode->writebacks */ - struct work_struct writer; /* work item to perform the writeback */ - struct afs_vnode *vnode; /* vnode to which this write applies */ - struct key *key; /* owner of this write */ - wait_queue_head_t waitq; /* completion and ready wait queue */ - pgoff_t first; /* first page in batch */ - pgoff_t point; /* last page in current store op */ - pgoff_t last; /* last page in batch (inclusive) */ - unsigned offset_first; /* offset into first page of start of write */ - unsigned to_last; /* offset into last page of end of write */ - int num_conflicts; /* count of conflicting writes in list */ - int usage; - bool conflicts; /* T if has dependent conflicts */ - enum { - AFS_WBACK_SYNCING, /* synchronisation being performed */ - AFS_WBACK_PENDING, /* write pending */ - AFS_WBACK_CONFLICTING, /* conflicting writes posted */ - AFS_WBACK_WRITING, /* writing back */ - AFS_WBACK_COMPLETE /* the writeback record has been unlinked */ - } state __attribute__((packed)); -}; - -/* * AFS superblock private data * - there's one superblock per volume */ struct afs_super_info { + struct afs_net *net; /* Network namespace */ + struct afs_cell *cell; /* The cell in which the volume resides */ struct afs_volume *volume; /* volume record */ - char rwparent; /* T if parent is R/W AFS volume */ }; static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) @@ -184,204 +196,297 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) extern struct file_system_type afs_fs_type; /* - * entry in the cached cell catalogue + * AFS network namespace record. */ -struct afs_cache_cell { - char name[AFS_MAXCELLNAME]; /* cell name (padded with NULs) */ - struct in_addr vl_servers[15]; /* cached cell VL servers */ +struct afs_net { + struct afs_uuid uuid; + bool live; /* F if this namespace is being removed */ + + /* AF_RXRPC I/O stuff */ + struct socket *socket; + struct afs_call *spare_incoming_call; + struct work_struct charge_preallocation_work; + struct mutex socket_mutex; + atomic_t nr_outstanding_calls; + atomic_t nr_superblocks; + + /* Cell database */ + struct rb_root cells; + struct afs_cell *ws_cell; + struct work_struct cells_manager; + struct timer_list cells_timer; + atomic_t cells_outstanding; + seqlock_t cells_lock; + + spinlock_t proc_cells_lock; + struct list_head proc_cells; + + /* Known servers. Theoretically each fileserver can only be in one + * cell, but in practice, people create aliases and subsets and there's + * no easy way to distinguish them. + */ + seqlock_t fs_lock; /* For fs_servers */ + struct rb_root fs_servers; /* afs_server (by server UUID or address) */ + struct list_head fs_updates; /* afs_server (by update_at) */ + struct hlist_head fs_proc; /* procfs servers list */ + + struct hlist_head fs_addresses4; /* afs_server (by lowest IPv4 addr) */ + struct hlist_head fs_addresses6; /* afs_server (by lowest IPv6 addr) */ + seqlock_t fs_addr_lock; /* For fs_addresses[46] */ + + struct work_struct fs_manager; + struct timer_list fs_timer; + atomic_t servers_outstanding; + + /* File locking renewal management */ + struct mutex lock_manager_mutex; + + /* Misc */ + struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ +}; + +extern struct afs_net __afs_net;// Dummy AFS network namespace; TODO: replace with real netns + +enum afs_cell_state { + AFS_CELL_UNSET, + AFS_CELL_ACTIVATING, + AFS_CELL_ACTIVE, + AFS_CELL_DEACTIVATING, + AFS_CELL_INACTIVE, + AFS_CELL_FAILED, }; /* - * AFS cell record + * AFS cell record. + * + * This is a tricky concept to get right as it is possible to create aliases + * simply by pointing AFSDB/SRV records for two names at the same set of VL + * servers; it is also possible to do things like setting up two sets of VL + * servers, one of which provides a superset of the volumes provided by the + * other (for internal/external division, for example). + * + * Cells only exist in the sense that (a) a cell's name maps to a set of VL + * servers and (b) a cell's name is used by the client to select the key to use + * for authentication and encryption. The cell name is not typically used in + * the protocol. + * + * There is no easy way to determine if two cells are aliases or one is a + * subset of another. */ struct afs_cell { - atomic_t usage; - struct list_head link; /* main cell list link */ + union { + struct rcu_head rcu; + struct rb_node net_node; /* Node in net->cells */ + }; + struct afs_net *net; struct key *anonymous_key; /* anonymous user key for this cell */ + struct work_struct manager; /* Manager for init/deinit/dns */ struct list_head proc_link; /* /proc cell list link */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif - - /* server record management */ - rwlock_t servers_lock; /* active server list lock */ - struct list_head servers; /* active server list */ - - /* volume location record management */ - struct rw_semaphore vl_sem; /* volume management serialisation semaphore */ - struct list_head vl_list; /* cell's active VL record list */ - spinlock_t vl_lock; /* vl_list lock */ - unsigned short vl_naddrs; /* number of VL servers in addr list */ - unsigned short vl_curr_svix; /* current server index */ - struct in_addr vl_addrs[AFS_CELL_MAX_ADDRS]; /* cell VL server addresses */ - - char name[0]; /* cell name - must go last */ + time64_t dns_expiry; /* Time AFSDB/SRV record expires */ + time64_t last_inactive; /* Time of last drop of usage count */ + atomic_t usage; + unsigned long flags; +#define AFS_CELL_FL_NOT_READY 0 /* The cell record is not ready for use */ +#define AFS_CELL_FL_NO_GC 1 /* The cell was added manually, don't auto-gc */ +#define AFS_CELL_FL_NOT_FOUND 2 /* Permanent DNS error */ +#define AFS_CELL_FL_DNS_FAIL 3 /* Failed to access DNS */ +#define AFS_CELL_FL_NO_LOOKUP_YET 4 /* Not completed first DNS lookup yet */ + enum afs_cell_state state; + short error; + + /* Active fileserver interaction state. */ + struct list_head proc_volumes; /* procfs volume list */ + rwlock_t proc_lock; + + /* VL server list. */ + rwlock_t vl_addrs_lock; /* Lock on vl_addrs */ + struct afs_addr_list __rcu *vl_addrs; /* List of VL servers */ + u8 name_len; /* Length of name */ + char name[64 + 1]; /* Cell name, case-flattened and NUL-padded */ }; /* - * entry in the cached volume location catalogue + * Cached VLDB entry. + * + * This is pointed to by cell->vldb_entries, indexed by name. */ -struct afs_cache_vlocation { - /* volume name (lowercase, padded with NULs) */ - uint8_t name[AFS_MAXVOLNAME + 1]; +struct afs_vldb_entry { + afs_volid_t vid[3]; /* Volume IDs for R/W, R/O and Bak volumes */ - uint8_t nservers; /* number of entries used in servers[] */ - uint8_t vidmask; /* voltype mask for vid[] */ - uint8_t srvtmask[8]; /* voltype masks for servers[] */ + unsigned long flags; +#define AFS_VLDB_HAS_RW 0 /* - R/W volume exists */ +#define AFS_VLDB_HAS_RO 1 /* - R/O volume exists */ +#define AFS_VLDB_HAS_BAK 2 /* - Backup volume exists */ +#define AFS_VLDB_QUERY_VALID 3 /* - Record is valid */ +#define AFS_VLDB_QUERY_ERROR 4 /* - VL server returned error */ + + uuid_t fs_server[AFS_NMAXNSERVERS]; + u8 fs_mask[AFS_NMAXNSERVERS]; #define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */ #define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */ #define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */ - - afs_volid_t vid[3]; /* volume IDs for R/W, R/O and Bak volumes */ - struct in_addr servers[8]; /* fileserver addresses */ - time_t rtime; /* last retrieval time */ + short error; + u8 nr_servers; /* Number of server records */ + u8 name_len; + u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; /* - * volume -> vnode hash table entry + * Record of fileserver with which we're actively communicating. */ -struct afs_cache_vhash { - afs_voltype_t vtype; /* which volume variation */ - uint8_t hash_bucket; /* which hash bucket this represents */ -} __attribute__((packed)); +struct afs_server { + struct rcu_head rcu; + union { + uuid_t uuid; /* Server ID */ + struct afs_uuid _uuid; + }; -/* - * AFS volume location record - */ -struct afs_vlocation { + struct afs_addr_list __rcu *addresses; + struct rb_node uuid_rb; /* Link in net->servers */ + struct hlist_node addr4_link; /* Link in net->fs_addresses4 */ + struct hlist_node addr6_link; /* Link in net->fs_addresses6 */ + struct hlist_node proc_link; /* Link in net->fs_proc */ + struct afs_server *gc_next; /* Next server in manager's list */ + time64_t put_time; /* Time at which last put */ + time64_t update_at; /* Time at which to next update the record */ + unsigned long flags; +#define AFS_SERVER_FL_NEW 0 /* New server, don't inc cb_s_break */ +#define AFS_SERVER_FL_NOT_READY 1 /* The record is not ready for use */ +#define AFS_SERVER_FL_NOT_FOUND 2 /* VL server says no such server */ +#define AFS_SERVER_FL_VL_FAIL 3 /* Failed to access VL server */ +#define AFS_SERVER_FL_UPDATING 4 +#define AFS_SERVER_FL_PROBED 5 /* The fileserver has been probed */ +#define AFS_SERVER_FL_PROBING 6 /* Fileserver is being probed */ atomic_t usage; - time64_t time_of_death; /* time at which put reduced usage to 0 */ - struct list_head link; /* link in cell volume location list */ - struct list_head grave; /* link in master graveyard list */ - struct list_head update; /* link in master update list */ - struct afs_cell *cell; /* cell to which volume belongs */ -#ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ -#endif - struct afs_cache_vlocation vldb; /* volume information DB record */ - struct afs_volume *vols[3]; /* volume access record pointer (index by type) */ - wait_queue_head_t waitq; /* status change waitqueue */ - time64_t update_at; /* time at which record should be updated */ - spinlock_t lock; /* access lock */ - afs_vlocation_state_t state; /* volume location state */ - unsigned short upd_rej_cnt; /* ENOMEDIUM count during update */ - unsigned short upd_busy_cnt; /* EBUSY count during update */ - bool valid; /* T if valid */ + u32 addr_version; /* Address list version */ + + /* file service access */ + rwlock_t fs_lock; /* access lock */ + + /* callback promise management */ + struct list_head cb_interests; /* List of superblocks using this server */ + unsigned cb_s_break; /* Break-everything counter. */ + rwlock_t cb_break_lock; /* Volume finding lock */ }; /* - * AFS fileserver record + * Interest by a superblock on a server. */ -struct afs_server { - atomic_t usage; - time64_t time_of_death; /* time at which put reduced usage to 0 */ - struct in_addr addr; /* server address */ - struct afs_cell *cell; /* cell in which server resides */ - struct list_head link; /* link in cell's server list */ - struct list_head grave; /* link in master graveyard list */ - struct rb_node master_rb; /* link in master by-addr tree */ - struct rw_semaphore sem; /* access lock */ +struct afs_cb_interest { + struct list_head cb_link; /* Link in server->cb_interests */ + struct afs_server *server; /* Server on which this interest resides */ + struct super_block *sb; /* Superblock on which inodes reside */ + afs_volid_t vid; /* Volume ID to match */ + refcount_t usage; +}; - /* file service access */ - struct rb_root fs_vnodes; /* vnodes backed by this server (ordered by FID) */ - unsigned long fs_act_jif; /* time at which last activity occurred */ - unsigned long fs_dead_jif; /* time at which no longer to be considered dead */ - spinlock_t fs_lock; /* access lock */ - int fs_state; /* 0 or reason FS currently marked dead (-errno) */ +/* + * Replaceable server list. + */ +struct afs_server_entry { + struct afs_server *server; + struct afs_cb_interest *cb_interest; +}; - /* callback promise management */ - struct rb_root cb_promises; /* vnode expiration list (ordered earliest first) */ - struct delayed_work cb_updater; /* callback updater */ - struct delayed_work cb_break_work; /* collected break dispatcher */ - wait_queue_head_t cb_break_waitq; /* space available in cb_break waitqueue */ - spinlock_t cb_lock; /* access lock */ - struct afs_callback cb_break[64]; /* ring of callbacks awaiting breaking */ - atomic_t cb_break_n; /* number of pending breaks */ - u8 cb_break_head; /* head of callback breaking ring */ - u8 cb_break_tail; /* tail of callback breaking ring */ +struct afs_server_list { + refcount_t usage; + unsigned short nr_servers; + unsigned short index; /* Server currently in use */ + unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ + unsigned int seq; /* Set to ->servers_seq when installed */ + struct afs_server_entry servers[]; }; /* - * AFS volume access record + * Live AFS volume management. */ struct afs_volume { + afs_volid_t vid; /* volume ID */ atomic_t usage; - struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */ - struct afs_vlocation *vlocation; /* volume location */ + time64_t update_at; /* Time at which to next update */ + struct afs_cell *cell; /* Cell to which belongs (pins ref) */ + struct list_head proc_link; /* Link in cell->vl_proc */ + unsigned long flags; +#define AFS_VOLUME_NEEDS_UPDATE 0 /* - T if an update needs performing */ +#define AFS_VOLUME_UPDATING 1 /* - T if an update is in progress */ +#define AFS_VOLUME_WAIT 2 /* - T if users must wait for update */ +#define AFS_VOLUME_DELETED 3 /* - T if volume appears deleted */ +#define AFS_VOLUME_OFFLINE 4 /* - T if volume offline notice given */ +#define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif - afs_volid_t vid; /* volume ID */ + struct afs_server_list *servers; /* List of servers on which volume resides */ + rwlock_t servers_lock; /* Lock for ->servers */ + unsigned int servers_seq; /* Incremented each time ->servers changes */ + afs_voltype_t type; /* type of volume */ + short error; char type_force; /* force volume type (suppress R/O -> R/W) */ - unsigned short nservers; /* number of server slots filled */ - unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ - struct afs_server *servers[8]; /* servers on which volume resides (ordered) */ - struct rw_semaphore server_sem; /* lock for accessing current server */ + u8 name_len; + u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; -/* - * vnode catalogue entry - */ -struct afs_cache_vnode { - afs_vnodeid_t vnode_id; /* vnode ID */ - unsigned vnode_unique; /* vnode ID uniquifier */ - afs_dataversion_t data_version; /* data version */ +enum afs_lock_state { + AFS_VNODE_LOCK_NONE, /* The vnode has no lock on the server */ + AFS_VNODE_LOCK_WAITING_FOR_CB, /* We're waiting for the server to break the callback */ + AFS_VNODE_LOCK_SETTING, /* We're asking the server for a lock */ + AFS_VNODE_LOCK_GRANTED, /* We have a lock on the server */ + AFS_VNODE_LOCK_EXTENDING, /* We're extending a lock on the server */ + AFS_VNODE_LOCK_NEED_UNLOCK, /* We need to unlock on the server */ + AFS_VNODE_LOCK_UNLOCKING, /* We're telling the server to unlock */ }; /* - * AFS inode private data + * AFS inode private data. + * + * Note that afs_alloc_inode() *must* reset anything that could incorrectly + * leak from one inode to another. */ struct afs_vnode { struct inode vfs_inode; /* the VFS's inode record */ struct afs_volume *volume; /* volume on which vnode resides */ - struct afs_server *server; /* server currently supplying this file */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif - struct afs_permits *permits; /* cache of permits so far obtained */ - struct mutex permits_lock; /* lock for altering permits list */ + struct afs_permits *permit_cache; /* cache of permits so far obtained */ + struct mutex io_lock; /* Lock for serialising I/O on this mutex */ struct mutex validate_lock; /* lock for validating this vnode */ - wait_queue_head_t update_waitq; /* status fetch waitqueue */ - int update_cnt; /* number of outstanding ops that will update the - * status */ - spinlock_t writeback_lock; /* lock for writebacks */ + spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t lock; /* waitqueue/flags lock */ unsigned long flags; -#define AFS_VNODE_CB_BROKEN 0 /* set if vnode's callback was broken */ +#define AFS_VNODE_CB_PROMISED 0 /* Set if vnode has a callback promise */ #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ -#define AFS_VNODE_MODIFIED 2 /* set if vnode's data modified */ +#define AFS_VNODE_DIR_MODIFIED 2 /* set if dir vnode's data modified */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ -#define AFS_VNODE_LOCKING 6 /* set if waiting for lock on vnode */ -#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */ -#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */ -#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */ -#define AFS_VNODE_AUTOCELL 10 /* set if Vnode is an auto mount point */ -#define AFS_VNODE_PSEUDODIR 11 /* set if Vnode is a pseudo directory */ +#define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ +#define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ - long acl_order; /* ACL check count (callback break count) */ - - struct list_head writebacks; /* alterations in pagecache that need writing */ + struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ struct list_head granted_locks; /* locks granted on this file */ struct delayed_work lock_work; /* work to be done in locking */ - struct key *unlock_key; /* key to be used in unlocking */ + struct key *lock_key; /* Key to be used in lock ops */ + enum afs_lock_state lock_state : 8; + afs_lock_type_t lock_type : 8; /* outstanding callback notification on this file */ - struct rb_node server_rb; /* link in server->fs_vnodes */ - struct rb_node cb_promise; /* link in server->cb_promises */ - struct work_struct cb_broken_work; /* work to be done on callback break */ - time64_t cb_expires; /* time at which callback expires */ - time64_t cb_expires_at; /* time used to order cb_promise */ + struct afs_cb_interest *cb_interest; /* Server on which this resides */ + unsigned int cb_s_break; /* Mass break counter on ->server */ + unsigned int cb_break; /* Break counter on vnode */ + seqlock_t cb_lock; /* Lock for ->cb_interest, ->status, ->cb_*break */ + + time64_t cb_expires_at; /* time at which callback expires */ unsigned cb_version; /* callback version */ - unsigned cb_expiry; /* callback expiry time */ afs_callback_type_t cb_type; /* type of callback */ - bool cb_promised; /* true if promise still holds */ }; /* @@ -389,16 +494,21 @@ struct afs_vnode { */ struct afs_permit { struct key *key; /* RxRPC ticket holding a security context */ - afs_access_t access_mask; /* access mask for this key */ + afs_access_t access; /* CallerAccess value for this key */ }; /* - * cache of security records from attempts to access a vnode + * Immutable cache of CallerAccess records from attempts to access vnodes. + * These may be shared between multiple vnodes. */ struct afs_permits { - struct rcu_head rcu; /* disposal procedure */ - int count; /* number of records */ - struct afs_permit permits[0]; /* the permits so far examined */ + struct rcu_head rcu; + struct hlist_node hash_node; /* Link in hash */ + unsigned long h; /* Hash value for this permit list */ + refcount_t usage; + unsigned short nr_permits; /* Number of records */ + bool invalidated; /* Invalidated due to key change */ + struct afs_permit permits[]; /* List of permits sorted by key pointer */ }; /* @@ -410,28 +520,78 @@ struct afs_interface { unsigned mtu; /* MTU of interface */ }; -struct afs_uuid { - __be32 time_low; /* low part of timestamp */ - __be16 time_mid; /* mid part of timestamp */ - __be16 time_hi_and_version; /* high part of timestamp and version */ - __u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */ - __u8 clock_seq_low; /* clock seq low */ - __u8 node[6]; /* spatially unique node ID (MAC addr) */ +/* + * Cursor for iterating over a server's address list. + */ +struct afs_addr_cursor { + struct afs_addr_list *alist; /* Current address list (pins ref) */ + struct sockaddr_rxrpc *addr; + u32 abort_code; + unsigned short start; /* Starting point in alist->addrs[] */ + unsigned short index; /* Wrapping offset from start to current addr */ + short error; + bool begun; /* T if we've begun iteration */ + bool responded; /* T if the current address responded */ +}; + +/* + * Cursor for iterating over a set of fileservers. + */ +struct afs_fs_cursor { + struct afs_addr_cursor ac; + struct afs_vnode *vnode; + struct afs_server_list *server_list; /* Current server list (pins ref) */ + struct afs_cb_interest *cbi; /* Server on which this resides (pins ref) */ + struct key *key; /* Key for the server */ + unsigned int cb_break; /* cb_break + cb_s_break before the call */ + unsigned int cb_break_2; /* cb_break + cb_s_break (2nd vnode) */ + unsigned char start; /* Initial index in server list */ + unsigned char index; /* Number of servers tried beyond start */ + unsigned short flags; +#define AFS_FS_CURSOR_STOP 0x0001 /* Set to cease iteration */ +#define AFS_FS_CURSOR_VBUSY 0x0002 /* Set if seen VBUSY */ +#define AFS_FS_CURSOR_VMOVED 0x0004 /* Set if seen VMOVED */ +#define AFS_FS_CURSOR_VNOVOL 0x0008 /* Set if seen VNOVOL */ +#define AFS_FS_CURSOR_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */ +#define AFS_FS_CURSOR_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */ }; +#include <trace/events/afs.h> + /*****************************************************************************/ /* + * addr_list.c + */ +static inline struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist) +{ + if (alist) + refcount_inc(&alist->usage); + return alist; +} +extern struct afs_addr_list *afs_alloc_addrlist(unsigned int, + unsigned short, + unsigned short); +extern void afs_put_addrlist(struct afs_addr_list *); +extern struct afs_addr_list *afs_parse_text_addrs(const char *, size_t, char, + unsigned short, unsigned short); +extern struct afs_addr_list *afs_dns_query(struct afs_cell *, time64_t *); +extern bool afs_iterate_addresses(struct afs_addr_cursor *); +extern int afs_end_cursor(struct afs_addr_cursor *); +extern int afs_set_vl_cursor(struct afs_addr_cursor *, struct afs_cell *); + +extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16); +extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16); + +/* * cache.c */ #ifdef CONFIG_AFS_FSCACHE extern struct fscache_netfs afs_cache_netfs; extern struct fscache_cookie_def afs_cell_cache_index_def; -extern struct fscache_cookie_def afs_vlocation_cache_index_def; extern struct fscache_cookie_def afs_volume_cache_index_def; extern struct fscache_cookie_def afs_vnode_cache_index_def; #else #define afs_cell_cache_index_def (*(struct fscache_cookie_def *) NULL) -#define afs_vlocation_cache_index_def (*(struct fscache_cookie_def *) NULL) #define afs_volume_cache_index_def (*(struct fscache_cookie_def *) NULL) #define afs_vnode_cache_index_def (*(struct fscache_cookie_def *) NULL) #endif @@ -440,29 +600,31 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def; * callback.c */ extern void afs_init_callback_state(struct afs_server *); -extern void afs_broken_callback_work(struct work_struct *); -extern void afs_break_callbacks(struct afs_server *, size_t, - struct afs_callback[]); -extern void afs_discard_callback_on_delete(struct afs_vnode *); -extern void afs_give_up_callback(struct afs_vnode *); -extern void afs_dispatch_give_up_callbacks(struct work_struct *); -extern void afs_flush_callback_breaks(struct afs_server *); -extern int __init afs_callback_update_init(void); -extern void afs_callback_update_kill(void); +extern void afs_break_callback(struct afs_vnode *); +extern void afs_break_callbacks(struct afs_server *, size_t,struct afs_callback[]); + +extern int afs_register_server_cb_interest(struct afs_vnode *, struct afs_server_entry *); +extern void afs_put_cb_interest(struct afs_net *, struct afs_cb_interest *); +extern void afs_clear_callback_interests(struct afs_net *, struct afs_server_list *); + +static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest *cbi) +{ + refcount_inc(&cbi->usage); + return cbi; +} /* * cell.c */ -extern struct rw_semaphore afs_proc_cells_sem; -extern struct list_head afs_proc_cells; - -#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0) -extern int afs_cell_init(char *); -extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool); -extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool); -extern struct afs_cell *afs_grab_cell(struct afs_cell *); -extern void afs_put_cell(struct afs_cell *); -extern void afs_cell_purge(void); +extern int afs_cell_init(struct afs_net *, const char *); +extern struct afs_cell *afs_lookup_cell_rcu(struct afs_net *, const char *, unsigned); +extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned, + const char *, bool); +extern struct afs_cell *afs_get_cell(struct afs_cell *); +extern void afs_put_cell(struct afs_net *, struct afs_cell *); +extern void afs_manage_cells(struct work_struct *); +extern void afs_cells_timer(struct timer_list *); +extern void __net_exit afs_cell_purge(struct afs_net *); /* * cmservice.c @@ -472,6 +634,7 @@ extern bool afs_cm_incoming_call(struct afs_call *); /* * dir.c */ +extern bool afs_dir_check_page(struct inode *, struct page *); extern const struct inode_operations afs_dir_inode_operations; extern const struct dentry_operations afs_fs_dentry_operations; extern const struct file_operations afs_dir_file_operations; @@ -483,15 +646,19 @@ extern const struct address_space_operations afs_fs_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; +extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); +extern void afs_put_wb_key(struct afs_wb_key *); extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); +extern int afs_fetch_data(struct afs_vnode *, struct key *, struct afs_read *); extern int afs_page_filler(void *, struct page *); extern void afs_put_read(struct afs_read *); /* * flock.c */ -extern void __exit afs_kill_lock_manager(void); +extern struct workqueue_struct *afs_lock_manager; + extern void afs_lock_work(struct work_struct *); extern void afs_lock_may_be_available(struct afs_vnode *); extern int afs_lock(struct file *, int, struct file_lock *); @@ -500,48 +667,40 @@ extern int afs_flock(struct file *, int, struct file_lock *); /* * fsclient.c */ -extern int afs_fs_fetch_file_status(struct afs_server *, struct key *, - struct afs_vnode *, struct afs_volsync *, - bool); -extern int afs_fs_give_up_callbacks(struct afs_server *, bool); -extern int afs_fs_fetch_data(struct afs_server *, struct key *, - struct afs_vnode *, struct afs_read *, bool); -extern int afs_fs_create(struct afs_server *, struct key *, - struct afs_vnode *, const char *, umode_t, - struct afs_fid *, struct afs_file_status *, - struct afs_callback *, bool); -extern int afs_fs_remove(struct afs_server *, struct key *, - struct afs_vnode *, const char *, bool, bool); -extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *, - struct afs_vnode *, const char *, bool); -extern int afs_fs_symlink(struct afs_server *, struct key *, - struct afs_vnode *, const char *, const char *, - struct afs_fid *, struct afs_file_status *, bool); -extern int afs_fs_rename(struct afs_server *, struct key *, - struct afs_vnode *, const char *, - struct afs_vnode *, const char *, bool); -extern int afs_fs_store_data(struct afs_server *, struct afs_writeback *, - pgoff_t, pgoff_t, unsigned, unsigned, bool); -extern int afs_fs_setattr(struct afs_server *, struct key *, - struct afs_vnode *, struct iattr *, bool); -extern int afs_fs_get_volume_status(struct afs_server *, struct key *, - struct afs_vnode *, - struct afs_volume_status *, bool); -extern int afs_fs_set_lock(struct afs_server *, struct key *, - struct afs_vnode *, afs_lock_type_t, bool); -extern int afs_fs_extend_lock(struct afs_server *, struct key *, - struct afs_vnode *, bool); -extern int afs_fs_release_lock(struct afs_server *, struct key *, - struct afs_vnode *, bool); +extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *); +extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *); +extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *); +extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, + struct afs_fid *, struct afs_file_status *, struct afs_callback *); +extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool); +extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *); +extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, + struct afs_fid *, struct afs_file_status *); +extern int afs_fs_rename(struct afs_fs_cursor *, const char *, + struct afs_vnode *, const char *); +extern int afs_fs_store_data(struct afs_fs_cursor *, struct address_space *, + pgoff_t, pgoff_t, unsigned, unsigned); +extern int afs_fs_setattr(struct afs_fs_cursor *, struct iattr *); +extern int afs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *); +extern int afs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t); +extern int afs_fs_extend_lock(struct afs_fs_cursor *); +extern int afs_fs_release_lock(struct afs_fs_cursor *); +extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *, + struct afs_addr_cursor *, struct key *); +extern int afs_fs_get_capabilities(struct afs_net *, struct afs_server *, + struct afs_addr_cursor *, struct key *); /* * inode.c */ +extern int afs_fetch_status(struct afs_vnode *, struct key *); +extern int afs_iget5_test(struct inode *, void *); extern struct inode *afs_iget_autocell(struct inode *, const char *, int, struct key *); extern struct inode *afs_iget(struct super_block *, struct key *, struct afs_fid *, struct afs_file_status *, - struct afs_callback *); + struct afs_callback *, + struct afs_cb_interest *); extern void afs_zap_data(struct afs_vnode *); extern int afs_validate(struct afs_vnode *, struct key *); extern int afs_getattr(const struct path *, struct kstat *, u32, unsigned int); @@ -553,7 +712,35 @@ extern int afs_drop_inode(struct inode *); * main.c */ extern struct workqueue_struct *afs_wq; -extern struct afs_uuid afs_uuid; + +static inline struct afs_net *afs_d2net(struct dentry *dentry) +{ + return &__afs_net; +} + +static inline struct afs_net *afs_i2net(struct inode *inode) +{ + return &__afs_net; +} + +static inline struct afs_net *afs_v2net(struct afs_vnode *vnode) +{ + return &__afs_net; +} + +static inline struct afs_net *afs_sock2net(struct sock *sk) +{ + return &__afs_net; +} + +static inline struct afs_net *afs_get_net(struct afs_net *net) +{ + return net; +} + +static inline void afs_put_net(struct afs_net *net) +{ +} /* * misc.c @@ -578,23 +765,33 @@ extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool); /* * proc.c */ -extern int afs_proc_init(void); -extern void afs_proc_cleanup(void); -extern int afs_proc_cell_setup(struct afs_cell *); -extern void afs_proc_cell_remove(struct afs_cell *); +extern int __net_init afs_proc_init(struct afs_net *); +extern void __net_exit afs_proc_cleanup(struct afs_net *); +extern int afs_proc_cell_setup(struct afs_net *, struct afs_cell *); +extern void afs_proc_cell_remove(struct afs_net *, struct afs_cell *); + +/* + * rotate.c + */ +extern bool afs_begin_vnode_operation(struct afs_fs_cursor *, struct afs_vnode *, + struct key *); +extern bool afs_select_fileserver(struct afs_fs_cursor *); +extern bool afs_select_current_fileserver(struct afs_fs_cursor *); +extern int afs_end_vnode_operation(struct afs_fs_cursor *); /* * rxrpc.c */ -extern struct socket *afs_socket; -extern atomic_t afs_outstanding_calls; +extern struct workqueue_struct *afs_async_calls; -extern int afs_open_socket(void); -extern void afs_close_socket(void); +extern int __net_init afs_open_socket(struct afs_net *); +extern void __net_exit afs_close_socket(struct afs_net *); +extern void afs_charge_preallocation(struct work_struct *); extern void afs_put_call(struct afs_call *); extern int afs_queue_call_work(struct afs_call *); -extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, bool); -extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, +extern long afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t, bool); +extern struct afs_call *afs_alloc_flat_call(struct afs_net *, + const struct afs_call_type *, size_t, size_t); extern void afs_flat_call_destructor(struct afs_call *); extern void afs_send_empty_reply(struct afs_call *); @@ -606,117 +803,136 @@ static inline int afs_transfer_reply(struct afs_call *call) return afs_extract_data(call, call->buffer, call->reply_max, false); } +static inline bool afs_check_call_state(struct afs_call *call, + enum afs_call_state state) +{ + return READ_ONCE(call->state) == state; +} + +static inline bool afs_set_call_state(struct afs_call *call, + enum afs_call_state from, + enum afs_call_state to) +{ + bool ok = false; + + spin_lock_bh(&call->state_lock); + if (call->state == from) { + call->state = to; + trace_afs_call_state(call, from, to, 0, 0); + ok = true; + } + spin_unlock_bh(&call->state_lock); + return ok; +} + +static inline void afs_set_call_complete(struct afs_call *call, + int error, u32 remote_abort) +{ + enum afs_call_state state; + bool ok = false; + + spin_lock_bh(&call->state_lock); + state = call->state; + if (state != AFS_CALL_COMPLETE) { + call->abort_code = remote_abort; + call->error = error; + call->state = AFS_CALL_COMPLETE; + trace_afs_call_state(call, state, AFS_CALL_COMPLETE, + error, remote_abort); + ok = true; + } + spin_unlock_bh(&call->state_lock); + if (ok) + trace_afs_call_done(call); +} + /* * security.c */ +extern void afs_put_permits(struct afs_permits *); extern void afs_clear_permits(struct afs_vnode *); -extern void afs_cache_permit(struct afs_vnode *, struct key *, long); +extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int); extern void afs_zap_permits(struct rcu_head *); extern struct key *afs_request_key(struct afs_cell *); +extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); extern int afs_permission(struct inode *, int); +extern void __exit afs_clean_up_permit_cache(void); /* * server.c */ extern spinlock_t afs_server_peer_lock; -#define afs_get_server(S) \ -do { \ - _debug("GET SERVER %d", atomic_read(&(S)->usage)); \ - atomic_inc(&(S)->usage); \ -} while(0) +static inline struct afs_server *afs_get_server(struct afs_server *server) +{ + atomic_inc(&server->usage); + return server; +} -extern struct afs_server *afs_lookup_server(struct afs_cell *, - const struct in_addr *); -extern struct afs_server *afs_find_server(const struct sockaddr_rxrpc *); -extern void afs_put_server(struct afs_server *); -extern void __exit afs_purge_servers(void); +extern struct afs_server *afs_find_server(struct afs_net *, + const struct sockaddr_rxrpc *); +extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *); +extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *); +extern void afs_put_server(struct afs_net *, struct afs_server *); +extern void afs_manage_servers(struct work_struct *); +extern void afs_servers_timer(struct timer_list *); +extern void __net_exit afs_purge_servers(struct afs_net *); +extern bool afs_probe_fileserver(struct afs_fs_cursor *); +extern bool afs_check_server_record(struct afs_fs_cursor *, struct afs_server *); /* - * super.c + * server_list.c */ -extern int afs_fs_init(void); -extern void afs_fs_exit(void); +static inline struct afs_server_list *afs_get_serverlist(struct afs_server_list *slist) +{ + refcount_inc(&slist->usage); + return slist; +} -/* - * vlclient.c - */ -extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *, - const char *, struct afs_cache_vlocation *, - bool); -extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *, - afs_volid_t, afs_voltype_t, - struct afs_cache_vlocation *, bool); +extern void afs_put_serverlist(struct afs_net *, struct afs_server_list *); +extern struct afs_server_list *afs_alloc_server_list(struct afs_cell *, struct key *, + struct afs_vldb_entry *, + u8); +extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server_list *); /* - * vlocation.c + * super.c */ -#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0) - -extern int __init afs_vlocation_update_init(void); -extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *, - struct key *, - const char *, size_t); -extern void afs_put_vlocation(struct afs_vlocation *); -extern void afs_vlocation_purge(void); +extern int __init afs_fs_init(void); +extern void __exit afs_fs_exit(void); /* - * vnode.c + * vlclient.c */ -static inline struct afs_vnode *AFS_FS_I(struct inode *inode) -{ - return container_of(inode, struct afs_vnode, vfs_inode); -} - -static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) -{ - return &vnode->vfs_inode; -} - -extern void afs_vnode_finalise_status_update(struct afs_vnode *, - struct afs_server *); -extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *, - struct key *); -extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *, - struct afs_read *); -extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *, - umode_t, struct afs_fid *, struct afs_file_status *, - struct afs_callback *, struct afs_server **); -extern int afs_vnode_remove(struct afs_vnode *, struct key *, const char *, - bool); -extern int afs_vnode_link(struct afs_vnode *, struct afs_vnode *, struct key *, - const char *); -extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *, - const char *, struct afs_fid *, - struct afs_file_status *, struct afs_server **); -extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *, - struct key *, const char *, const char *); -extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t, - unsigned, unsigned); -extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *); -extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *, - struct afs_volume_status *); -extern int afs_vnode_set_lock(struct afs_vnode *, struct key *, - afs_lock_type_t); -extern int afs_vnode_extend_lock(struct afs_vnode *, struct key *); -extern int afs_vnode_release_lock(struct afs_vnode *, struct key *); +extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *, + struct afs_addr_cursor *, + struct key *, const char *, int); +extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *, struct afs_addr_cursor *, + struct key *, const uuid_t *); +extern int afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, struct key *); +extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *, struct afs_addr_cursor *, + struct key *, const uuid_t *); /* * volume.c */ -#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0) +static inline struct afs_volume *__afs_get_volume(struct afs_volume *volume) +{ + if (volume) + atomic_inc(&volume->usage); + return volume; +} -extern void afs_put_volume(struct afs_volume *); -extern struct afs_volume *afs_volume_lookup(struct afs_mount_params *); -extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *); -extern int afs_volume_release_fileserver(struct afs_vnode *, - struct afs_server *, int); +extern struct afs_volume *afs_create_volume(struct afs_mount_params *); +extern void afs_activate_volume(struct afs_volume *); +extern void afs_deactivate_volume(struct afs_volume *); +extern void afs_put_volume(struct afs_cell *, struct afs_volume *); +extern int afs_check_volume_status(struct afs_volume *, struct key *); /* * write.c */ extern int afs_set_page_dirty(struct page *); -extern void afs_put_writeback(struct afs_writeback *); extern int afs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); @@ -727,9 +943,11 @@ extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); -extern int afs_writeback_all(struct afs_vnode *); extern int afs_flush(struct file *, fl_owner_t); extern int afs_fsync(struct file *, loff_t, loff_t, int); +extern int afs_page_mkwrite(struct vm_fault *); +extern void afs_prune_wb_keys(struct afs_vnode *); +extern int afs_launder_page(struct page *); /* * xattr.c @@ -737,12 +955,42 @@ extern int afs_fsync(struct file *, loff_t, loff_t, int); extern const struct xattr_handler *afs_xattr_handlers[]; extern ssize_t afs_listxattr(struct dentry *, char *, size_t); + +/* + * Miscellaneous inline functions. + */ +static inline struct afs_vnode *AFS_FS_I(struct inode *inode) +{ + return container_of(inode, struct afs_vnode, vfs_inode); +} + +static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) +{ + return &vnode->vfs_inode; +} + +static inline void afs_vnode_commit_status(struct afs_fs_cursor *fc, + struct afs_vnode *vnode, + unsigned int cb_break) +{ + if (fc->ac.error == 0) + afs_cache_permit(vnode, fc->key, cb_break); +} + +static inline void afs_check_for_remote_deletion(struct afs_fs_cursor *fc, + struct afs_vnode *vnode) +{ + if (fc->ac.error == -ENOENT) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + afs_break_callback(vnode); + } +} + + /*****************************************************************************/ /* * debug tracing */ -#include <trace/events/afs.h> - extern unsigned afs_debug; #define dbgprintk(FMT,...) \ diff --git a/fs/afs/main.c b/fs/afs/main.c index 9944770849da..15a02a05ff40 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -31,57 +31,112 @@ static char *rootcell; module_param(rootcell, charp, 0); MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); -struct afs_uuid afs_uuid; struct workqueue_struct *afs_wq; +struct afs_net __afs_net; /* - * initialise the AFS client FS module + * Initialise an AFS network namespace record. */ -static int __init afs_init(void) +static int __net_init afs_net_init(struct afs_net *net) { int ret; - printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); + net->live = true; + generate_random_uuid((unsigned char *)&net->uuid); - generate_random_uuid((unsigned char *)&afs_uuid); + INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation); + mutex_init(&net->socket_mutex); - /* create workqueue */ - ret = -ENOMEM; - afs_wq = alloc_workqueue("afs", 0, 0); - if (!afs_wq) - return ret; + net->cells = RB_ROOT; + seqlock_init(&net->cells_lock); + INIT_WORK(&net->cells_manager, afs_manage_cells); + timer_setup(&net->cells_timer, afs_cells_timer, 0); - /* register the /proc stuff */ - ret = afs_proc_init(); - if (ret < 0) - goto error_proc; + spin_lock_init(&net->proc_cells_lock); + INIT_LIST_HEAD(&net->proc_cells); -#ifdef CONFIG_AFS_FSCACHE - /* we want to be able to cache */ - ret = fscache_register_netfs(&afs_cache_netfs); + seqlock_init(&net->fs_lock); + net->fs_servers = RB_ROOT; + INIT_LIST_HEAD(&net->fs_updates); + INIT_HLIST_HEAD(&net->fs_proc); + + INIT_HLIST_HEAD(&net->fs_addresses4); + INIT_HLIST_HEAD(&net->fs_addresses6); + seqlock_init(&net->fs_addr_lock); + + INIT_WORK(&net->fs_manager, afs_manage_servers); + timer_setup(&net->fs_timer, afs_servers_timer, 0); + + /* Register the /proc stuff */ + ret = afs_proc_init(net); if (ret < 0) - goto error_cache; -#endif + goto error_proc; - /* initialise the cell DB */ - ret = afs_cell_init(rootcell); + /* Initialise the cell DB */ + ret = afs_cell_init(net, rootcell); if (ret < 0) goto error_cell_init; - /* initialise the VL update process */ - ret = afs_vlocation_update_init(); + /* Create the RxRPC transport */ + ret = afs_open_socket(net); if (ret < 0) - goto error_vl_update_init; + goto error_open_socket; - /* initialise the callback update process */ - ret = afs_callback_update_init(); + return 0; + +error_open_socket: + net->live = false; + afs_cell_purge(net); + afs_purge_servers(net); +error_cell_init: + net->live = false; + afs_proc_cleanup(net); +error_proc: + net->live = false; + return ret; +} + +/* + * Clean up and destroy an AFS network namespace record. + */ +static void __net_exit afs_net_exit(struct afs_net *net) +{ + net->live = false; + afs_cell_purge(net); + afs_purge_servers(net); + afs_close_socket(net); + afs_proc_cleanup(net); +} + +/* + * initialise the AFS client FS module + */ +static int __init afs_init(void) +{ + int ret = -ENOMEM; + + printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); + + afs_wq = alloc_workqueue("afs", 0, 0); + if (!afs_wq) + goto error_afs_wq; + afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0); + if (!afs_async_calls) + goto error_async; + afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0); + if (!afs_lock_manager) + goto error_lockmgr; + +#ifdef CONFIG_AFS_FSCACHE + /* we want to be able to cache */ + ret = fscache_register_netfs(&afs_cache_netfs); if (ret < 0) - goto error_callback_update_init; + goto error_cache; +#endif - /* create the RxRPC transport */ - ret = afs_open_socket(); + ret = afs_net_init(&__afs_net); if (ret < 0) - goto error_open_socket; + goto error_net; /* register the filesystems */ ret = afs_fs_init(); @@ -91,21 +146,18 @@ static int __init afs_init(void) return ret; error_fs: - afs_close_socket(); -error_open_socket: - afs_callback_update_kill(); -error_callback_update_init: - afs_vlocation_purge(); -error_vl_update_init: - afs_cell_purge(); -error_cell_init: + afs_net_exit(&__afs_net); +error_net: #ifdef CONFIG_AFS_FSCACHE fscache_unregister_netfs(&afs_cache_netfs); error_cache: #endif - afs_proc_cleanup(); -error_proc: + destroy_workqueue(afs_lock_manager); +error_lockmgr: + destroy_workqueue(afs_async_calls); +error_async: destroy_workqueue(afs_wq); +error_afs_wq: rcu_barrier(); printk(KERN_ERR "kAFS: failed to register: %d\n", ret); return ret; @@ -124,17 +176,14 @@ static void __exit afs_exit(void) printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n"); afs_fs_exit(); - afs_kill_lock_manager(); - afs_close_socket(); - afs_purge_servers(); - afs_callback_update_kill(); - afs_vlocation_purge(); - destroy_workqueue(afs_wq); - afs_cell_purge(); + afs_net_exit(&__afs_net); #ifdef CONFIG_AFS_FSCACHE fscache_unregister_netfs(&afs_cache_netfs); #endif - afs_proc_cleanup(); + destroy_workqueue(afs_lock_manager); + destroy_workqueue(afs_async_calls); + destroy_workqueue(afs_wq); + afs_clean_up_permit_cache(); rcu_barrier(); } diff --git a/fs/afs/misc.c b/fs/afs/misc.c index c05f1f1c0d41..700a5fa7f4ec 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -21,12 +21,12 @@ int afs_abort_to_error(u32 abort_code) { switch (abort_code) { - /* low errno codes inserted into abort namespace */ + /* Low errno codes inserted into abort namespace */ case 13: return -EACCES; case 27: return -EFBIG; case 30: return -EROFS; - /* VICE "special error" codes; 101 - 111 */ + /* VICE "special error" codes; 101 - 111 */ case VSALVAGE: return -EIO; case VNOVNODE: return -ENOENT; case VNOVOL: return -ENOMEDIUM; @@ -39,7 +39,37 @@ int afs_abort_to_error(u32 abort_code) case VBUSY: return -EBUSY; case VMOVED: return -ENXIO; - /* Unified AFS error table; ET "uae" == 0x2f6df00 */ + /* Volume Location server errors */ + case AFSVL_IDEXIST: return -EEXIST; + case AFSVL_IO: return -EREMOTEIO; + case AFSVL_NAMEEXIST: return -EEXIST; + case AFSVL_CREATEFAIL: return -EREMOTEIO; + case AFSVL_NOENT: return -ENOMEDIUM; + case AFSVL_EMPTY: return -ENOMEDIUM; + case AFSVL_ENTDELETED: return -ENOMEDIUM; + case AFSVL_BADNAME: return -EINVAL; + case AFSVL_BADINDEX: return -EINVAL; + case AFSVL_BADVOLTYPE: return -EINVAL; + case AFSVL_BADSERVER: return -EINVAL; + case AFSVL_BADPARTITION: return -EINVAL; + case AFSVL_REPSFULL: return -EFBIG; + case AFSVL_NOREPSERVER: return -ENOENT; + case AFSVL_DUPREPSERVER: return -EEXIST; + case AFSVL_RWNOTFOUND: return -ENOENT; + case AFSVL_BADREFCOUNT: return -EINVAL; + case AFSVL_SIZEEXCEEDED: return -EINVAL; + case AFSVL_BADENTRY: return -EINVAL; + case AFSVL_BADVOLIDBUMP: return -EINVAL; + case AFSVL_IDALREADYHASHED: return -EINVAL; + case AFSVL_ENTRYLOCKED: return -EBUSY; + case AFSVL_BADVOLOPER: return -EBADRQC; + case AFSVL_BADRELLOCKTYPE: return -EINVAL; + case AFSVL_RERELEASE: return -EREMOTEIO; + case AFSVL_BADSERVERFLAG: return -EINVAL; + case AFSVL_PERM: return -EACCES; + case AFSVL_NOMEM: return -EREMOTEIO; + + /* Unified AFS error table; ET "uae" == 0x2f6df00 */ case 0x2f6df00: return -EPERM; case 0x2f6df01: return -ENOENT; case 0x2f6df04: return -EIO; @@ -68,7 +98,7 @@ int afs_abort_to_error(u32 abort_code) case 0x2f6df6c: return -ETIMEDOUT; case 0x2f6df78: return -EDQUOT; - /* RXKAD abort codes; from include/rxrpc/packet.h. ET "RXK" == 0x1260B00 */ + /* RXKAD abort codes; from include/rxrpc/packet.h. ET "RXK" == 0x1260B00 */ case RXKADINCONSISTENCY: return -EPROTO; case RXKADPACKETSHORT: return -EPROTO; case RXKADLEVELFAIL: return -EKEYREJECTED; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 35efb9a31dd7..4508dd54f789 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -17,8 +17,15 @@ #include <linux/uaccess.h> #include "internal.h" -static struct proc_dir_entry *proc_afs; +static inline struct afs_net *afs_proc2net(struct file *f) +{ + return &__afs_net; +} +static inline struct afs_net *afs_seq2net(struct seq_file *m) +{ + return &__afs_net; // TODO: use seq_file_net(m) +} static int afs_proc_cells_open(struct inode *inode, struct file *file); static void *afs_proc_cells_start(struct seq_file *p, loff_t *pos); @@ -98,22 +105,22 @@ static const struct file_operations afs_proc_cell_vlservers_fops = { .release = seq_release, }; -static int afs_proc_cell_servers_open(struct inode *inode, struct file *file); -static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos); -static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, +static int afs_proc_servers_open(struct inode *inode, struct file *file); +static void *afs_proc_servers_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_servers_next(struct seq_file *p, void *v, loff_t *pos); -static void afs_proc_cell_servers_stop(struct seq_file *p, void *v); -static int afs_proc_cell_servers_show(struct seq_file *m, void *v); - -static const struct seq_operations afs_proc_cell_servers_ops = { - .start = afs_proc_cell_servers_start, - .next = afs_proc_cell_servers_next, - .stop = afs_proc_cell_servers_stop, - .show = afs_proc_cell_servers_show, +static void afs_proc_servers_stop(struct seq_file *p, void *v); +static int afs_proc_servers_show(struct seq_file *m, void *v); + +static const struct seq_operations afs_proc_servers_ops = { + .start = afs_proc_servers_start, + .next = afs_proc_servers_next, + .stop = afs_proc_servers_stop, + .show = afs_proc_servers_show, }; -static const struct file_operations afs_proc_cell_servers_fops = { - .open = afs_proc_cell_servers_open, +static const struct file_operations afs_proc_servers_fops = { + .open = afs_proc_servers_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, @@ -122,23 +129,24 @@ static const struct file_operations afs_proc_cell_servers_fops = { /* * initialise the /proc/fs/afs/ directory */ -int afs_proc_init(void) +int afs_proc_init(struct afs_net *net) { _enter(""); - proc_afs = proc_mkdir("fs/afs", NULL); - if (!proc_afs) + net->proc_afs = proc_mkdir("fs/afs", NULL); + if (!net->proc_afs) goto error_dir; - if (!proc_create("cells", 0644, proc_afs, &afs_proc_cells_fops) || - !proc_create("rootcell", 0644, proc_afs, &afs_proc_rootcell_fops)) + if (!proc_create("cells", 0644, net->proc_afs, &afs_proc_cells_fops) || + !proc_create("rootcell", 0644, net->proc_afs, &afs_proc_rootcell_fops) || + !proc_create("servers", 0644, net->proc_afs, &afs_proc_servers_fops)) goto error_tree; _leave(" = 0"); return 0; error_tree: - remove_proc_subtree("fs/afs", NULL); + proc_remove(net->proc_afs); error_dir: _leave(" = -ENOMEM"); return -ENOMEM; @@ -147,9 +155,10 @@ error_dir: /* * clean up the /proc/fs/afs/ directory */ -void afs_proc_cleanup(void) +void afs_proc_cleanup(struct afs_net *net) { - remove_proc_subtree("fs/afs", NULL); + proc_remove(net->proc_afs); + net->proc_afs = NULL; } /* @@ -166,7 +175,6 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file) m = file->private_data; m->private = PDE_DATA(inode); - return 0; } @@ -176,25 +184,28 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file) */ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) { - /* lock the list against modification */ - down_read(&afs_proc_cells_sem); - return seq_list_start_head(&afs_proc_cells, *_pos); + struct afs_net *net = afs_seq2net(m); + + rcu_read_lock(); + return seq_list_start_head(&net->proc_cells, *_pos); } /* * move to next cell in cells list */ -static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos) +static void *afs_proc_cells_next(struct seq_file *m, void *v, loff_t *pos) { - return seq_list_next(v, &afs_proc_cells, pos); + struct afs_net *net = afs_seq2net(m); + + return seq_list_next(v, &net->proc_cells, pos); } /* * clean up after reading from the cells list */ -static void afs_proc_cells_stop(struct seq_file *p, void *v) +static void afs_proc_cells_stop(struct seq_file *m, void *v) { - up_read(&afs_proc_cells_sem); + rcu_read_unlock(); } /* @@ -203,16 +214,16 @@ static void afs_proc_cells_stop(struct seq_file *p, void *v) static int afs_proc_cells_show(struct seq_file *m, void *v) { struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link); + struct afs_net *net = afs_seq2net(m); - if (v == &afs_proc_cells) { + if (v == &net->proc_cells) { /* display header on line 1 */ seq_puts(m, "USE NAME\n"); return 0; } /* display one cell per line on subsequent lines */ - seq_printf(m, "%3d %s\n", - atomic_read(&cell->usage), cell->name); + seq_printf(m, "%3u %s\n", atomic_read(&cell->usage), cell->name); return 0; } @@ -223,6 +234,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v) static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, size_t size, loff_t *_pos) { + struct afs_net *net = afs_proc2net(file); char *kbuf, *name, *args; int ret; @@ -264,13 +276,13 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, if (strcmp(kbuf, "add") == 0) { struct afs_cell *cell; - cell = afs_cell_create(name, strlen(name), args, false); + cell = afs_lookup_cell(net, name, strlen(name), args, true); if (IS_ERR(cell)) { ret = PTR_ERR(cell); goto done; } - afs_put_cell(cell); + set_bit(AFS_CELL_FL_NO_GC, &cell->flags); printk("kAFS: Added new cell '%s'\n", name); } else { goto inval; @@ -303,6 +315,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file, const char __user *buf, size_t size, loff_t *_pos) { + struct afs_net *net = afs_proc2net(file); char *kbuf, *s; int ret; @@ -322,7 +335,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file, /* determine command to perform */ _debug("rootcell=%s", kbuf); - ret = afs_cell_init(kbuf); + ret = afs_cell_init(net, kbuf); if (ret >= 0) ret = size; /* consume everything, always */ @@ -334,29 +347,27 @@ static ssize_t afs_proc_rootcell_write(struct file *file, /* * initialise /proc/fs/afs/<cell>/ */ -int afs_proc_cell_setup(struct afs_cell *cell) +int afs_proc_cell_setup(struct afs_net *net, struct afs_cell *cell) { struct proc_dir_entry *dir; - _enter("%p{%s}", cell, cell->name); + _enter("%p{%s},%p", cell, cell->name, net->proc_afs); - dir = proc_mkdir(cell->name, proc_afs); + dir = proc_mkdir(cell->name, net->proc_afs); if (!dir) goto error_dir; - if (!proc_create_data("servers", 0, dir, - &afs_proc_cell_servers_fops, cell) || - !proc_create_data("vlservers", 0, dir, - &afs_proc_cell_vlservers_fops, cell) || + if (!proc_create_data("vlservers", 0, dir, + &afs_proc_cell_vlservers_fops, cell) || !proc_create_data("volumes", 0, dir, - &afs_proc_cell_volumes_fops, cell)) + &afs_proc_cell_volumes_fops, cell)) goto error_tree; _leave(" = 0"); return 0; error_tree: - remove_proc_subtree(cell->name, proc_afs); + remove_proc_subtree(cell->name, net->proc_afs); error_dir: _leave(" = -ENOMEM"); return -ENOMEM; @@ -365,11 +376,11 @@ error_dir: /* * remove /proc/fs/afs/<cell>/ */ -void afs_proc_cell_remove(struct afs_cell *cell) +void afs_proc_cell_remove(struct afs_net *net, struct afs_cell *cell) { _enter(""); - remove_proc_subtree(cell->name, proc_afs); + remove_proc_subtree(cell->name, net->proc_afs); _leave(""); } @@ -407,9 +418,8 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) _enter("cell=%p pos=%Ld", cell, *_pos); - /* lock the list against modification */ - down_read(&cell->vl_sem); - return seq_list_start_head(&cell->vl_list, *_pos); + read_lock(&cell->proc_lock); + return seq_list_start_head(&cell->proc_volumes, *_pos); } /* @@ -421,7 +431,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, struct afs_cell *cell = p->private; _enter("cell=%p pos=%Ld", cell, *_pos); - return seq_list_next(v, &cell->vl_list, _pos); + return seq_list_next(v, &cell->proc_volumes, _pos); } /* @@ -431,17 +441,13 @@ static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v) { struct afs_cell *cell = p->private; - up_read(&cell->vl_sem); + read_unlock(&cell->proc_lock); } -static const char afs_vlocation_states[][4] = { - [AFS_VL_NEW] = "New", - [AFS_VL_CREATING] = "Crt", - [AFS_VL_VALID] = "Val", - [AFS_VL_NO_VOLUME] = "NoV", - [AFS_VL_UPDATING] = "Upd", - [AFS_VL_VOLUME_DELETED] = "Del", - [AFS_VL_UNCERTAIN] = "Unc", +static const char afs_vol_types[3][3] = { + [AFSVL_RWVOL] = "RW", + [AFSVL_ROVOL] = "RO", + [AFSVL_BACKVOL] = "BK", }; /* @@ -450,23 +456,17 @@ static const char afs_vlocation_states[][4] = { static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) { struct afs_cell *cell = m->private; - struct afs_vlocation *vlocation = - list_entry(v, struct afs_vlocation, link); + struct afs_volume *vol = list_entry(v, struct afs_volume, proc_link); - /* display header on line 1 */ - if (v == &cell->vl_list) { - seq_puts(m, "USE STT VLID[0] VLID[1] VLID[2] NAME\n"); + /* Display header on line 1 */ + if (v == &cell->proc_volumes) { + seq_puts(m, "USE VID TY\n"); return 0; } - /* display one cell per line on subsequent lines */ - seq_printf(m, "%3d %s %08x %08x %08x %s\n", - atomic_read(&vlocation->usage), - afs_vlocation_states[vlocation->state], - vlocation->vldb.vid[0], - vlocation->vldb.vid[1], - vlocation->vldb.vid[2], - vlocation->vldb.name); + seq_printf(m, "%3d %08x %s\n", + atomic_read(&vol->usage), vol->vid, + afs_vol_types[vol->type]); return 0; } @@ -501,23 +501,23 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) */ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) { + struct afs_addr_list *alist; struct afs_cell *cell = m->private; loff_t pos = *_pos; - _enter("cell=%p pos=%Ld", cell, *_pos); + rcu_read_lock(); - /* lock the list against modification */ - down_read(&cell->vl_sem); + alist = rcu_dereference(cell->vl_addrs); /* allow for the header line */ if (!pos) return (void *) 1; pos--; - if (pos >= cell->vl_naddrs) + if (!alist || pos >= alist->nr_addrs) return NULL; - return &cell->vl_addrs[pos]; + return alist->addrs + pos; } /* @@ -526,17 +526,18 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, loff_t *_pos) { + struct afs_addr_list *alist; struct afs_cell *cell = p->private; loff_t pos; - _enter("cell=%p{nad=%u} pos=%Ld", cell, cell->vl_naddrs, *_pos); + alist = rcu_dereference(cell->vl_addrs); pos = *_pos; (*_pos)++; - if (pos >= cell->vl_naddrs) + if (!alist || pos >= alist->nr_addrs) return NULL; - return &cell->vl_addrs[pos]; + return alist->addrs + pos; } /* @@ -544,9 +545,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, */ static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v) { - struct afs_cell *cell = p->private; - - up_read(&cell->vl_sem); + rcu_read_unlock(); } /* @@ -554,100 +553,76 @@ static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v) */ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) { - struct in_addr *addr = v; + struct sockaddr_rxrpc *addr = v; /* display header on line 1 */ - if (v == (struct in_addr *) 1) { + if (v == (void *)1) { seq_puts(m, "ADDRESS\n"); return 0; } /* display one cell per line on subsequent lines */ - seq_printf(m, "%pI4\n", &addr->s_addr); + seq_printf(m, "%pISp\n", &addr->transport); return 0; } /* - * open "/proc/fs/afs/<cell>/servers" which provides a summary of active + * open "/proc/fs/afs/servers" which provides a summary of active * servers */ -static int afs_proc_cell_servers_open(struct inode *inode, struct file *file) +static int afs_proc_servers_open(struct inode *inode, struct file *file) { - struct afs_cell *cell; - struct seq_file *m; - int ret; - - cell = PDE_DATA(inode); - if (!cell) - return -ENOENT; - - ret = seq_open(file, &afs_proc_cell_servers_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = cell; - return 0; + return seq_open(file, &afs_proc_servers_ops); } /* - * set up the iterator to start reading from the cells list and return the - * first item + * Set up the iterator to start reading from the server list and return the + * first item. */ -static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos) - __acquires(m->private->servers_lock) +static void *afs_proc_servers_start(struct seq_file *m, loff_t *_pos) { - struct afs_cell *cell = m->private; - - _enter("cell=%p pos=%Ld", cell, *_pos); + struct afs_net *net = afs_seq2net(m); - /* lock the list against modification */ - read_lock(&cell->servers_lock); - return seq_list_start_head(&cell->servers, *_pos); + rcu_read_lock(); + return seq_hlist_start_head_rcu(&net->fs_proc, *_pos); } /* * move to next cell in cells list */ -static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, - loff_t *_pos) +static void *afs_proc_servers_next(struct seq_file *m, void *v, loff_t *_pos) { - struct afs_cell *cell = p->private; + struct afs_net *net = afs_seq2net(m); - _enter("cell=%p pos=%Ld", cell, *_pos); - return seq_list_next(v, &cell->servers, _pos); + return seq_hlist_next_rcu(v, &net->fs_proc, _pos); } /* * clean up after reading from the cells list */ -static void afs_proc_cell_servers_stop(struct seq_file *p, void *v) - __releases(p->private->servers_lock) +static void afs_proc_servers_stop(struct seq_file *p, void *v) { - struct afs_cell *cell = p->private; - - read_unlock(&cell->servers_lock); + rcu_read_unlock(); } /* * display a header line followed by a load of volume lines */ -static int afs_proc_cell_servers_show(struct seq_file *m, void *v) +static int afs_proc_servers_show(struct seq_file *m, void *v) { - struct afs_cell *cell = m->private; - struct afs_server *server = list_entry(v, struct afs_server, link); - char ipaddr[20]; + struct afs_server *server; + struct afs_addr_list *alist; - /* display header on line 1 */ - if (v == &cell->servers) { - seq_puts(m, "USE ADDR STATE\n"); + if (v == SEQ_START_TOKEN) { + seq_puts(m, "UUID USE ADDR\n"); return 0; } - /* display one cell per line on subsequent lines */ - sprintf(ipaddr, "%pI4", &server->addr); - seq_printf(m, "%3d %-15.15s %5d\n", - atomic_read(&server->usage), ipaddr, server->fs_state); - + server = list_entry(v, struct afs_server, proc_link); + alist = rcu_dereference(server->addresses); + seq_printf(m, "%pU %3d %pISp\n", + &server->uuid, + atomic_read(&server->usage), + &alist->addrs[alist->index].transport); return 0; } diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c new file mode 100644 index 000000000000..d04511fb3879 --- /dev/null +++ b/fs/afs/rotate.c @@ -0,0 +1,757 @@ +/* Handle fileserver selection and rotation. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/sched/signal.h> +#include "internal.h" +#include "afs_fs.h" + +/* + * Initialise a filesystem server cursor for iterating over FS servers. + */ +void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) +{ + memset(fc, 0, sizeof(*fc)); +} + +/* + * Begin an operation on the fileserver. + * + * Fileserver operations are serialised on the server by vnode, so we serialise + * them here also using the io_lock. + */ +bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode, + struct key *key) +{ + afs_init_fs_cursor(fc, vnode); + fc->vnode = vnode; + fc->key = key; + fc->ac.error = SHRT_MAX; + + if (mutex_lock_interruptible(&vnode->io_lock) < 0) { + fc->ac.error = -EINTR; + fc->flags |= AFS_FS_CURSOR_STOP; + return false; + } + + if (vnode->lock_state != AFS_VNODE_LOCK_NONE) + fc->flags |= AFS_FS_CURSOR_CUR_ONLY; + return true; +} + +/* + * Begin iteration through a server list, starting with the vnode's last used + * server if possible, or the last recorded good server if not. + */ +static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, + struct afs_vnode *vnode) +{ + struct afs_cb_interest *cbi; + int i; + + read_lock(&vnode->volume->servers_lock); + fc->server_list = afs_get_serverlist(vnode->volume->servers); + read_unlock(&vnode->volume->servers_lock); + + cbi = vnode->cb_interest; + if (cbi) { + /* See if the vnode's preferred record is still available */ + for (i = 0; i < fc->server_list->nr_servers; i++) { + if (fc->server_list->servers[i].cb_interest == cbi) { + fc->start = i; + goto found_interest; + } + } + + /* If we have a lock outstanding on a server that's no longer + * serving this vnode, then we can't switch to another server + * and have to return an error. + */ + if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { + fc->ac.error = -ESTALE; + return false; + } + + /* Note that the callback promise is effectively broken */ + write_seqlock(&vnode->cb_lock); + ASSERTCMP(cbi, ==, vnode->cb_interest); + vnode->cb_interest = NULL; + if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) + vnode->cb_break++; + write_sequnlock(&vnode->cb_lock); + + afs_put_cb_interest(afs_v2net(vnode), cbi); + cbi = NULL; + } else { + fc->start = READ_ONCE(fc->server_list->index); + } + +found_interest: + fc->index = fc->start; + return true; +} + +/* + * Post volume busy note. + */ +static void afs_busy(struct afs_volume *volume, u32 abort_code) +{ + const char *m; + + switch (abort_code) { + case VOFFLINE: m = "offline"; break; + case VRESTARTING: m = "restarting"; break; + case VSALVAGING: m = "being salvaged"; break; + default: m = "busy"; break; + } + + pr_notice("kAFS: Volume %u '%s' is %s\n", volume->vid, volume->name, m); +} + +/* + * Sleep and retry the operation to the same fileserver. + */ +static bool afs_sleep_and_retry(struct afs_fs_cursor *fc) +{ + msleep_interruptible(1000); + if (signal_pending(current)) { + fc->ac.error = -ERESTARTSYS; + return false; + } + + return true; +} + +/* + * Select the fileserver to use. May be called multiple times to rotate + * through the fileservers. + */ +bool afs_select_fileserver(struct afs_fs_cursor *fc) +{ + struct afs_addr_list *alist; + struct afs_server *server; + struct afs_vnode *vnode = fc->vnode; + + _enter("%u/%u,%u/%u,%d,%d", + fc->index, fc->start, + fc->ac.index, fc->ac.start, + fc->ac.error, fc->ac.abort_code); + + if (fc->flags & AFS_FS_CURSOR_STOP) { + _leave(" = f [stopped]"); + return false; + } + + /* Evaluate the result of the previous operation, if there was one. */ + switch (fc->ac.error) { + case SHRT_MAX: + goto start; + + case 0: + default: + /* Success or local failure. Stop. */ + fc->flags |= AFS_FS_CURSOR_STOP; + _leave(" = f [okay/local %d]", fc->ac.error); + return false; + + case -ECONNABORTED: + /* The far side rejected the operation on some grounds. This + * might involve the server being busy or the volume having been moved. + */ + switch (fc->ac.abort_code) { + case VNOVOL: + /* This fileserver doesn't know about the volume. + * - May indicate that the VL is wrong - retry once and compare + * the results. + * - May indicate that the fileserver couldn't attach to the vol. + */ + if (fc->flags & AFS_FS_CURSOR_VNOVOL) { + fc->ac.error = -EREMOTEIO; + goto failed; + } + + write_lock(&vnode->volume->servers_lock); + fc->server_list->vnovol_mask |= 1 << fc->index; + write_unlock(&vnode->volume->servers_lock); + + set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); + fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); + if (fc->ac.error < 0) + goto failed; + + if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) { + fc->ac.error = -ENOMEDIUM; + goto failed; + } + + /* If the server list didn't change, then assume that + * it's the fileserver having trouble. + */ + if (vnode->volume->servers == fc->server_list) { + fc->ac.error = -EREMOTEIO; + goto failed; + } + + /* Try again */ + fc->flags |= AFS_FS_CURSOR_VNOVOL; + _leave(" = t [vnovol]"); + return true; + + case VSALVAGE: /* TODO: Should this return an error or iterate? */ + case VVOLEXISTS: + case VNOSERVICE: + case VONLINE: + case VDISKFULL: + case VOVERQUOTA: + fc->ac.error = afs_abort_to_error(fc->ac.abort_code); + goto next_server; + + case VOFFLINE: + if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) { + afs_busy(vnode->volume, fc->ac.abort_code); + clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); + } + if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { + fc->ac.error = -EADV; + goto failed; + } + if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { + fc->ac.error = -ESTALE; + goto failed; + } + goto busy; + + case VSALVAGING: + case VRESTARTING: + case VBUSY: + /* Retry after going round all the servers unless we + * have a file lock we need to maintain. + */ + if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { + fc->ac.error = -EBUSY; + goto failed; + } + if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) { + afs_busy(vnode->volume, fc->ac.abort_code); + clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); + } + busy: + if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { + if (!afs_sleep_and_retry(fc)) + goto failed; + + /* Retry with same server & address */ + _leave(" = t [vbusy]"); + return true; + } + + fc->flags |= AFS_FS_CURSOR_VBUSY; + goto next_server; + + case VMOVED: + /* The volume migrated to another server. We consider + * consider all locks and callbacks broken and request + * an update from the VLDB. + * + * We also limit the number of VMOVED hops we will + * honour, just in case someone sets up a loop. + */ + if (fc->flags & AFS_FS_CURSOR_VMOVED) { + fc->ac.error = -EREMOTEIO; + goto failed; + } + fc->flags |= AFS_FS_CURSOR_VMOVED; + + set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags); + set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); + fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); + if (fc->ac.error < 0) + goto failed; + + /* If the server list didn't change, then the VLDB is + * out of sync with the fileservers. This is hopefully + * a temporary condition, however, so we don't want to + * permanently block access to the file. + * + * TODO: Try other fileservers if we can. + * + * TODO: Retry a few times with sleeps. + */ + if (vnode->volume->servers == fc->server_list) { + fc->ac.error = -ENOMEDIUM; + goto failed; + } + + goto restart_from_beginning; + + default: + clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); + clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); + fc->ac.error = afs_abort_to_error(fc->ac.abort_code); + goto failed; + } + + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + _debug("no conn"); + goto iterate_address; + } + +restart_from_beginning: + _debug("restart"); + afs_end_cursor(&fc->ac); + afs_put_cb_interest(afs_v2net(vnode), fc->cbi); + fc->cbi = NULL; + afs_put_serverlist(afs_v2net(vnode), fc->server_list); + fc->server_list = NULL; +start: + _debug("start"); + /* See if we need to do an update of the volume record. Note that the + * volume may have moved or even have been deleted. + */ + fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); + if (fc->ac.error < 0) + goto failed; + + if (!afs_start_fs_iteration(fc, vnode)) + goto failed; + goto use_server; + +next_server: + _debug("next"); + afs_put_cb_interest(afs_v2net(vnode), fc->cbi); + fc->cbi = NULL; + fc->index++; + if (fc->index >= fc->server_list->nr_servers) + fc->index = 0; + if (fc->index != fc->start) + goto use_server; + + /* That's all the servers poked to no good effect. Try again if some + * of them were busy. + */ + if (fc->flags & AFS_FS_CURSOR_VBUSY) + goto restart_from_beginning; + + fc->ac.error = -EDESTADDRREQ; + goto failed; + +use_server: + _debug("use"); + /* We're starting on a different fileserver from the list. We need to + * check it, create a callback intercept, find its address list and + * probe its capabilities before we use it. + */ + ASSERTCMP(fc->ac.alist, ==, NULL); + server = fc->server_list->servers[fc->index].server; + + if (!afs_check_server_record(fc, server)) + goto failed; + + _debug("USING SERVER: %pU", &server->uuid); + + /* Make sure we've got a callback interest record for this server. We + * have to link it in before we send the request as we can be sent a + * break request before we've finished decoding the reply and + * installing the vnode. + */ + fc->ac.error = afs_register_server_cb_interest( + vnode, &fc->server_list->servers[fc->index]); + if (fc->ac.error < 0) + goto failed; + + fc->cbi = afs_get_cb_interest(vnode->cb_interest); + + read_lock(&server->fs_lock); + alist = rcu_dereference_protected(server->addresses, + lockdep_is_held(&server->fs_lock)); + afs_get_addrlist(alist); + read_unlock(&server->fs_lock); + + + /* Probe the current fileserver if we haven't done so yet. */ + if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) { + fc->ac.alist = afs_get_addrlist(alist); + + if (!afs_probe_fileserver(fc)) + goto failed; + } + + if (!fc->ac.alist) + fc->ac.alist = alist; + else + afs_put_addrlist(alist); + + fc->ac.addr = NULL; + fc->ac.start = READ_ONCE(alist->index); + fc->ac.index = fc->ac.start; + fc->ac.error = 0; + fc->ac.begun = false; + goto iterate_address; + +iterate_address: + ASSERT(fc->ac.alist); + _debug("iterate %d/%d", fc->ac.index, fc->ac.alist->nr_addrs); + /* Iterate over the current server's address list to try and find an + * address on which it will respond to us. + */ + if (afs_iterate_addresses(&fc->ac)) { + _leave(" = t"); + return true; + } + + afs_end_cursor(&fc->ac); + goto next_server; + +failed: + fc->flags |= AFS_FS_CURSOR_STOP; + _leave(" = f [failed %d]", fc->ac.error); + return false; +} + +/* + * Select the same fileserver we used for a vnode before and only that + * fileserver. We use this when we have a lock on that file, which is backed + * only by the fileserver we obtained it from. + */ +bool afs_select_current_fileserver(struct afs_fs_cursor *fc) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_cb_interest *cbi = vnode->cb_interest; + struct afs_addr_list *alist; + + _enter(""); + + switch (fc->ac.error) { + case SHRT_MAX: + if (!cbi) { + fc->ac.error = -ESTALE; + fc->flags |= AFS_FS_CURSOR_STOP; + return false; + } + + fc->cbi = afs_get_cb_interest(vnode->cb_interest); + + read_lock(&cbi->server->fs_lock); + alist = rcu_dereference_protected(cbi->server->addresses, + lockdep_is_held(&cbi->server->fs_lock)); + afs_get_addrlist(alist); + read_unlock(&cbi->server->fs_lock); + if (!alist) { + fc->ac.error = -ESTALE; + fc->flags |= AFS_FS_CURSOR_STOP; + return false; + } + + fc->ac.alist = alist; + fc->ac.addr = NULL; + fc->ac.start = READ_ONCE(alist->index); + fc->ac.index = fc->ac.start; + fc->ac.error = 0; + fc->ac.begun = false; + goto iterate_address; + + case 0: + default: + /* Success or local failure. Stop. */ + fc->flags |= AFS_FS_CURSOR_STOP; + _leave(" = f [okay/local %d]", fc->ac.error); + return false; + + case -ECONNABORTED: + fc->flags |= AFS_FS_CURSOR_STOP; + _leave(" = f [abort]"); + return false; + + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + _debug("no conn"); + goto iterate_address; + } + +iterate_address: + /* Iterate over the current server's address list to try and find an + * address on which it will respond to us. + */ + if (afs_iterate_addresses(&fc->ac)) { + _leave(" = t"); + return true; + } + + afs_end_cursor(&fc->ac); + return false; +} + +/* + * Tidy up a filesystem cursor and unlock the vnode. + */ +int afs_end_vnode_operation(struct afs_fs_cursor *fc) +{ + struct afs_net *net = afs_v2net(fc->vnode); + int ret; + + mutex_unlock(&fc->vnode->io_lock); + + afs_end_cursor(&fc->ac); + afs_put_cb_interest(net, fc->cbi); + afs_put_serverlist(net, fc->server_list); + + ret = fc->ac.error; + if (ret == -ECONNABORTED) + afs_abort_to_error(fc->ac.abort_code); + + return fc->ac.error; +} + +#if 0 +/* + * Set a filesystem server cursor for using a specific FS server. + */ +int afs_set_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) +{ + afs_init_fs_cursor(fc, vnode); + + read_seqlock_excl(&vnode->cb_lock); + if (vnode->cb_interest) { + if (vnode->cb_interest->server->fs_state == 0) + fc->server = afs_get_server(vnode->cb_interest->server); + else + fc->ac.error = vnode->cb_interest->server->fs_state; + } else { + fc->ac.error = -ESTALE; + } + read_sequnlock_excl(&vnode->cb_lock); + + return fc->ac.error; +} + +/* + * pick a server to use to try accessing this volume + * - returns with an elevated usage count on the server chosen + */ +bool afs_volume_pick_fileserver(struct afs_fs_cursor *fc, struct afs_vnode *vnode) +{ + struct afs_volume *volume = vnode->volume; + struct afs_server *server; + int ret, state, loop; + + _enter("%s", volume->vlocation->vldb.name); + + /* stick with the server we're already using if we can */ + if (vnode->cb_interest && vnode->cb_interest->server->fs_state == 0) { + fc->server = afs_get_server(vnode->cb_interest->server); + goto set_server; + } + + down_read(&volume->server_sem); + + /* handle the no-server case */ + if (volume->nservers == 0) { + fc->ac.error = volume->rjservers ? -ENOMEDIUM : -ESTALE; + up_read(&volume->server_sem); + _leave(" = f [no servers %d]", fc->ac.error); + return false; + } + + /* basically, just search the list for the first live server and use + * that */ + ret = 0; + for (loop = 0; loop < volume->nservers; loop++) { + server = volume->servers[loop]; + state = server->fs_state; + + _debug("consider %d [%d]", loop, state); + + switch (state) { + case 0: + goto picked_server; + + case -ENETUNREACH: + if (ret == 0) + ret = state; + break; + + case -EHOSTUNREACH: + if (ret == 0 || + ret == -ENETUNREACH) + ret = state; + break; + + case -ECONNREFUSED: + if (ret == 0 || + ret == -ENETUNREACH || + ret == -EHOSTUNREACH) + ret = state; + break; + + default: + case -EREMOTEIO: + if (ret == 0 || + ret == -ENETUNREACH || + ret == -EHOSTUNREACH || + ret == -ECONNREFUSED) + ret = state; + break; + } + } + +error: + fc->ac.error = ret; + + /* no available servers + * - TODO: handle the no active servers case better + */ + up_read(&volume->server_sem); + _leave(" = f [%d]", fc->ac.error); + return false; + +picked_server: + /* Found an apparently healthy server. We need to register an interest + * in receiving callbacks before we talk to it. + */ + ret = afs_register_server_cb_interest(vnode, + &volume->cb_interests[loop], server); + if (ret < 0) + goto error; + + fc->server = afs_get_server(server); + up_read(&volume->server_sem); +set_server: + fc->ac.alist = afs_get_addrlist(fc->server->addrs); + fc->ac.addr = &fc->ac.alist->addrs[0]; + _debug("USING SERVER: %pIS\n", &fc->ac.addr->transport); + _leave(" = t (picked %pIS)", &fc->ac.addr->transport); + return true; +} + +/* + * release a server after use + * - releases the ref on the server struct that was acquired by picking + * - records result of using a particular server to access a volume + * - return true to try again, false if okay or to issue error + * - the caller must release the server struct if result was false + */ +bool afs_iterate_fs_cursor(struct afs_fs_cursor *fc, + struct afs_vnode *vnode) +{ + struct afs_volume *volume = vnode->volume; + struct afs_server *server = fc->server; + unsigned loop; + + _enter("%s,%pIS,%d", + volume->vlocation->vldb.name, &fc->ac.addr->transport, + fc->ac.error); + + switch (fc->ac.error) { + /* success */ + case 0: + server->fs_state = 0; + _leave(" = f"); + return false; + + /* the fileserver denied all knowledge of the volume */ + case -ENOMEDIUM: + down_write(&volume->server_sem); + + /* firstly, find where the server is in the active list (if it + * is) */ + for (loop = 0; loop < volume->nservers; loop++) + if (volume->servers[loop] == server) + goto present; + + /* no longer there - may have been discarded by another op */ + goto try_next_server_upw; + + present: + volume->nservers--; + memmove(&volume->servers[loop], + &volume->servers[loop + 1], + sizeof(volume->servers[loop]) * + (volume->nservers - loop)); + volume->servers[volume->nservers] = NULL; + afs_put_server(afs_v2net(vnode), server); + volume->rjservers++; + + if (volume->nservers > 0) + /* another server might acknowledge its existence */ + goto try_next_server_upw; + + /* handle the case where all the fileservers have rejected the + * volume + * - TODO: try asking the fileservers for volume information + * - TODO: contact the VL server again to see if the volume is + * no longer registered + */ + up_write(&volume->server_sem); + afs_put_server(afs_v2net(vnode), server); + fc->server = NULL; + _leave(" = f [completely rejected]"); + return false; + + /* problem reaching the server */ + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIME: + case -ETIMEDOUT: + case -EREMOTEIO: + /* mark the server as dead + * TODO: vary dead timeout depending on error + */ + spin_lock(&server->fs_lock); + if (!server->fs_state) { + server->fs_state = fc->ac.error; + printk("kAFS: SERVER DEAD state=%d\n", fc->ac.error); + } + spin_unlock(&server->fs_lock); + goto try_next_server; + + /* miscellaneous error */ + default: + case -ENOMEM: + case -ENONET: + /* tell the caller to accept the result */ + afs_put_server(afs_v2net(vnode), server); + fc->server = NULL; + _leave(" = f [local failure]"); + return false; + } + + /* tell the caller to loop around and try the next server */ +try_next_server_upw: + up_write(&volume->server_sem); +try_next_server: + afs_put_server(afs_v2net(vnode), server); + _leave(" = t [try next server]"); + return true; +} + +/* + * Clean up a fileserver cursor. + */ +int afs_end_fs_cursor(struct afs_fs_cursor *fc, struct afs_net *net) +{ + afs_end_cursor(&fc->ac); + afs_put_server(net, fc->server); + return fc->ac.error; +} + +#endif diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 0bf191f0dbaf..e1126659f043 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -17,13 +17,10 @@ #include "internal.h" #include "afs_cm.h" -struct socket *afs_socket; /* my RxRPC socket */ -static struct workqueue_struct *afs_async_calls; -static struct afs_call *afs_spare_incoming_call; -atomic_t afs_outstanding_calls; +struct workqueue_struct *afs_async_calls; static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); -static int afs_wait_for_call_to_complete(struct afs_call *); +static long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *); static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); @@ -34,24 +31,13 @@ static int afs_deliver_cm_op_id(struct afs_call *); static const struct afs_call_type afs_RXCMxxxx = { .name = "CB.xxxx", .deliver = afs_deliver_cm_op_id, - .abort_to_error = afs_abort_to_error, }; -static void afs_charge_preallocation(struct work_struct *); - -static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation); - -static int afs_wait_atomic_t(atomic_t *p) -{ - schedule(); - return 0; -} - /* * open an RxRPC socket and bind it to be a server for callback notifications * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT */ -int afs_open_socket(void) +int afs_open_socket(struct afs_net *net) { struct sockaddr_rxrpc srx; struct socket *socket; @@ -59,28 +45,26 @@ int afs_open_socket(void) _enter(""); - ret = -ENOMEM; - afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0); - if (!afs_async_calls) - goto error_0; - - ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); + ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET6, &socket); if (ret < 0) goto error_1; socket->sk->sk_allocation = GFP_NOFS; /* bind the callback manager's address to make this a server socket */ + memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; srx.srx_service = CM_SERVICE; srx.transport_type = SOCK_DGRAM; - srx.transport_len = sizeof(srx.transport.sin); - srx.transport.sin.sin_family = AF_INET; - srx.transport.sin.sin_port = htons(AFS_CM_PORT); - memset(&srx.transport.sin.sin_addr, 0, - sizeof(srx.transport.sin.sin_addr)); + srx.transport_len = sizeof(srx.transport.sin6); + srx.transport.sin6.sin6_family = AF_INET6; + srx.transport.sin6.sin6_port = htons(AFS_CM_PORT); ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + if (ret == -EADDRINUSE) { + srx.transport.sin6.sin6_port = 0; + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + } if (ret < 0) goto error_2; @@ -91,16 +75,14 @@ int afs_open_socket(void) if (ret < 0) goto error_2; - afs_socket = socket; - afs_charge_preallocation(NULL); + net->socket = socket; + afs_charge_preallocation(&net->charge_preallocation_work); _leave(" = 0"); return 0; error_2: sock_release(socket); error_1: - destroy_workqueue(afs_async_calls); -error_0: _leave(" = %d", ret); return ret; } @@ -108,36 +90,36 @@ error_0: /* * close the RxRPC socket AFS was using */ -void afs_close_socket(void) +void afs_close_socket(struct afs_net *net) { _enter(""); - kernel_listen(afs_socket, 0); + kernel_listen(net->socket, 0); flush_workqueue(afs_async_calls); - if (afs_spare_incoming_call) { - afs_put_call(afs_spare_incoming_call); - afs_spare_incoming_call = NULL; + if (net->spare_incoming_call) { + afs_put_call(net->spare_incoming_call); + net->spare_incoming_call = NULL; } - _debug("outstanding %u", atomic_read(&afs_outstanding_calls)); - wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t, + _debug("outstanding %u", atomic_read(&net->nr_outstanding_calls)); + wait_on_atomic_t(&net->nr_outstanding_calls, atomic_t_wait, TASK_UNINTERRUPTIBLE); _debug("no outstanding calls"); - kernel_sock_shutdown(afs_socket, SHUT_RDWR); + kernel_sock_shutdown(net->socket, SHUT_RDWR); flush_workqueue(afs_async_calls); - sock_release(afs_socket); + sock_release(net->socket); _debug("dework"); - destroy_workqueue(afs_async_calls); _leave(""); } /* * Allocate a call. */ -static struct afs_call *afs_alloc_call(const struct afs_call_type *type, +static struct afs_call *afs_alloc_call(struct afs_net *net, + const struct afs_call_type *type, gfp_t gfp) { struct afs_call *call; @@ -148,11 +130,13 @@ static struct afs_call *afs_alloc_call(const struct afs_call_type *type, return NULL; call->type = type; + call->net = net; atomic_set(&call->usage, 1); INIT_WORK(&call->async_work, afs_process_async_call); init_waitqueue_head(&call->waitq); + spin_lock_init(&call->state_lock); - o = atomic_inc_return(&afs_outstanding_calls); + o = atomic_inc_return(&net->nr_outstanding_calls); trace_afs_call(call, afs_call_trace_alloc, 1, o, __builtin_return_address(0)); return call; @@ -163,8 +147,9 @@ static struct afs_call *afs_alloc_call(const struct afs_call_type *type, */ void afs_put_call(struct afs_call *call) { + struct afs_net *net = call->net; int n = atomic_dec_return(&call->usage); - int o = atomic_read(&afs_outstanding_calls); + int o = atomic_read(&net->nr_outstanding_calls); trace_afs_call(call, afs_call_trace_put, n + 1, o, __builtin_return_address(0)); @@ -175,20 +160,22 @@ void afs_put_call(struct afs_call *call) ASSERT(call->type->name != NULL); if (call->rxcall) { - rxrpc_kernel_end_call(afs_socket, call->rxcall); + rxrpc_kernel_end_call(net->socket, call->rxcall); call->rxcall = NULL; } if (call->type->destructor) call->type->destructor(call); + afs_put_server(call->net, call->cm_server); + afs_put_cb_interest(call->net, call->cbi); kfree(call->request); kfree(call); - o = atomic_dec_return(&afs_outstanding_calls); + o = atomic_dec_return(&net->nr_outstanding_calls); trace_afs_call(call, afs_call_trace_free, 0, o, __builtin_return_address(0)); if (o == 0) - wake_up_atomic_t(&afs_outstanding_calls); + wake_up_atomic_t(&net->nr_outstanding_calls); } } @@ -200,7 +187,7 @@ int afs_queue_call_work(struct afs_call *call) int u = atomic_inc_return(&call->usage); trace_afs_call(call, afs_call_trace_work, u, - atomic_read(&afs_outstanding_calls), + atomic_read(&call->net->nr_outstanding_calls), __builtin_return_address(0)); INIT_WORK(&call->work, call->type->work); @@ -213,12 +200,13 @@ int afs_queue_call_work(struct afs_call *call) /* * allocate a call with flat request and reply buffers */ -struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, +struct afs_call *afs_alloc_flat_call(struct afs_net *net, + const struct afs_call_type *type, size_t request_size, size_t reply_max) { struct afs_call *call; - call = afs_alloc_call(type, GFP_NOFS); + call = afs_alloc_call(net, type, GFP_NOFS); if (!call) goto nomem_call; @@ -236,6 +224,7 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, goto nomem_free; } + call->operation_ID = type->op; init_waitqueue_head(&call->waitq); return call; @@ -300,8 +289,7 @@ static void afs_notify_end_request_tx(struct sock *sock, { struct afs_call *call = (struct afs_call *)call_user_ID; - if (call->state == AFS_CALL_REQUESTING) - call->state = AFS_CALL_AWAIT_REPLY; + afs_set_call_state(call, AFS_CALL_CL_REQUESTING, AFS_CALL_CL_AWAIT_REPLY); } /* @@ -319,11 +307,13 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) do { afs_load_bvec(call, msg, bv, first, last, offset); + trace_afs_send_pages(call, msg, first, last, offset); + offset = 0; bytes = msg->msg_iter.count; nr = msg->msg_iter.nr_segs; - ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, msg, + ret = rxrpc_kernel_send_data(call->net->socket, call->rxcall, msg, bytes, afs_notify_end_request_tx); for (loop = 0; loop < nr; loop++) put_page(bv[loop].bv_page); @@ -333,62 +323,62 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) first += nr; } while (first <= last); + trace_afs_sent_pages(call, call->first, last, first, ret); return ret; } /* * initiate a call */ -int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, - bool async) +long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, + gfp_t gfp, bool async) { - struct sockaddr_rxrpc srx; + struct sockaddr_rxrpc *srx = ac->addr; struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; size_t offset; s64 tx_total_len; - u32 abort_code; int ret; - _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); + _enter(",{%pISp},", &srx->transport); ASSERT(call->type != NULL); ASSERT(call->type->name != NULL); _debug("____MAKE %p{%s,%x} [%d]____", call, call->type->name, key_serial(call->key), - atomic_read(&afs_outstanding_calls)); + atomic_read(&call->net->nr_outstanding_calls)); call->async = async; - memset(&srx, 0, sizeof(srx)); - srx.srx_family = AF_RXRPC; - srx.srx_service = call->service_id; - srx.transport_type = SOCK_DGRAM; - srx.transport_len = sizeof(srx.transport.sin); - srx.transport.sin.sin_family = AF_INET; - srx.transport.sin.sin_port = call->port; - memcpy(&srx.transport.sin.sin_addr, addr, 4); - /* Work out the length we're going to transmit. This is awkward for * calls such as FS.StoreData where there's an extra injection of data * after the initial fixed part. */ tx_total_len = call->request_size; if (call->send_pages) { - tx_total_len += call->last_to - call->first_offset; - tx_total_len += (call->last - call->first) * PAGE_SIZE; + if (call->last == call->first) { + tx_total_len += call->last_to - call->first_offset; + } else { + /* It looks mathematically like you should be able to + * combine the following lines with the ones above, but + * unsigned arithmetic is fun when it wraps... + */ + tx_total_len += PAGE_SIZE - call->first_offset; + tx_total_len += call->last_to; + tx_total_len += (call->last - call->first - 1) * PAGE_SIZE; + } } /* create a call */ - rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, + rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key, (unsigned long)call, tx_total_len, gfp, (async ? afs_wake_up_async_call : - afs_wake_up_call_waiter)); - call->key = NULL; + afs_wake_up_call_waiter), + call->upgrade); if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); goto error_kill_call; @@ -406,16 +396,9 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; - msg.msg_flags = (call->send_pages ? MSG_MORE : 0); + msg.msg_flags = MSG_WAITALL | (call->send_pages ? MSG_MORE : 0); - /* We have to change the state *before* sending the last packet as - * rxrpc might give us the reply before it returns from sending the - * request. Further, if the send fails, we may already have been given - * a notification and may have collected it. - */ - if (!call->send_pages) - call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(afs_socket, rxcall, + ret = rxrpc_kernel_send_data(call->net->socket, rxcall, &msg, call->request_size, afs_notify_end_request_tx); if (ret < 0) @@ -432,22 +415,26 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, if (call->async) return -EINPROGRESS; - return afs_wait_for_call_to_complete(call); + return afs_wait_for_call_to_complete(call, ac); error_do_abort: call->state = AFS_CALL_COMPLETE; if (ret != -ECONNABORTED) { - rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, - ret, "KSD"); + rxrpc_kernel_abort_call(call->net->socket, rxcall, + RX_USER_ABORT, ret, "KSD"); } else { - abort_code = 0; offset = 0; - rxrpc_kernel_recv_data(afs_socket, rxcall, NULL, 0, &offset, - false, &abort_code); - ret = call->type->abort_to_error(abort_code); + rxrpc_kernel_recv_data(call->net->socket, rxcall, NULL, + 0, &offset, false, &call->abort_code, + &call->service_id); + ac->abort_code = call->abort_code; + ac->responded = true; } + call->error = ret; + trace_afs_call_done(call); error_kill_call: afs_put_call(call); + ac->error = ret; _leave(" = %d", ret); return ret; } @@ -457,123 +444,174 @@ error_kill_call: */ static void afs_deliver_to_call(struct afs_call *call) { - u32 abort_code; + enum afs_call_state state; + u32 abort_code, remote_abort = 0; int ret; _enter("%s", call->type->name); - while (call->state == AFS_CALL_AWAIT_REPLY || - call->state == AFS_CALL_AWAIT_OP_ID || - call->state == AFS_CALL_AWAIT_REQUEST || - call->state == AFS_CALL_AWAIT_ACK + while (state = READ_ONCE(call->state), + state == AFS_CALL_CL_AWAIT_REPLY || + state == AFS_CALL_SV_AWAIT_OP_ID || + state == AFS_CALL_SV_AWAIT_REQUEST || + state == AFS_CALL_SV_AWAIT_ACK ) { - if (call->state == AFS_CALL_AWAIT_ACK) { + if (state == AFS_CALL_SV_AWAIT_ACK) { size_t offset = 0; - ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, + ret = rxrpc_kernel_recv_data(call->net->socket, + call->rxcall, NULL, 0, &offset, false, - &call->abort_code); + &remote_abort, + &call->service_id); trace_afs_recv_data(call, 0, offset, false, ret); if (ret == -EINPROGRESS || ret == -EAGAIN) return; - if (ret == 1 || ret < 0) { - call->state = AFS_CALL_COMPLETE; - goto done; + if (ret < 0 || ret == 1) { + if (ret == 1) + ret = 0; + goto call_complete; } return; } ret = call->type->deliver(call); + state = READ_ONCE(call->state); switch (ret) { case 0: - if (call->state == AFS_CALL_AWAIT_REPLY) - call->state = AFS_CALL_COMPLETE; + if (state == AFS_CALL_CL_PROC_REPLY) + goto call_complete; + ASSERTCMP(state, >, AFS_CALL_CL_PROC_REPLY); goto done; case -EINPROGRESS: case -EAGAIN: goto out; + case -EIO: case -ECONNABORTED: - goto call_complete; + ASSERTCMP(state, ==, AFS_CALL_COMPLETE); + goto done; case -ENOTCONN: abort_code = RX_CALL_DEAD; - rxrpc_kernel_abort_call(afs_socket, call->rxcall, + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, "KNC"); - goto save_error; + goto local_abort; case -ENOTSUPP: abort_code = RXGEN_OPCODE; - rxrpc_kernel_abort_call(afs_socket, call->rxcall, + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, "KIV"); - goto save_error; + goto local_abort; case -ENODATA: case -EBADMSG: case -EMSGSIZE: default: abort_code = RXGEN_CC_UNMARSHAL; - if (call->state != AFS_CALL_AWAIT_REPLY) + if (state != AFS_CALL_CL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; - rxrpc_kernel_abort_call(afs_socket, call->rxcall, + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, -EBADMSG, "KUM"); - goto save_error; + goto local_abort; } } done: - if (call->state == AFS_CALL_COMPLETE && call->incoming) + if (state == AFS_CALL_COMPLETE && call->incoming) afs_put_call(call); out: _leave(""); return; -save_error: - call->error = ret; +local_abort: + abort_code = 0; call_complete: - call->state = AFS_CALL_COMPLETE; + afs_set_call_complete(call, ret, remote_abort); + state = AFS_CALL_COMPLETE; goto done; } /* * wait synchronously for a call to complete */ -static int afs_wait_for_call_to_complete(struct afs_call *call) +static long afs_wait_for_call_to_complete(struct afs_call *call, + struct afs_addr_cursor *ac) { - int ret; + signed long rtt2, timeout; + long ret; + u64 rtt; + u32 life, last_life; DECLARE_WAITQUEUE(myself, current); _enter(""); + rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall); + rtt2 = nsecs_to_jiffies64(rtt) * 2; + if (rtt2 < 2) + rtt2 = 2; + + timeout = rtt2; + last_life = rxrpc_kernel_check_life(call->net->socket, call->rxcall); + add_wait_queue(&call->waitq, &myself); for (;;) { - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); /* deliver any messages that are in the queue */ - if (call->state < AFS_CALL_COMPLETE && call->need_attention) { + if (!afs_check_call_state(call, AFS_CALL_COMPLETE) && + call->need_attention) { call->need_attention = false; __set_current_state(TASK_RUNNING); afs_deliver_to_call(call); continue; } - if (call->state == AFS_CALL_COMPLETE || - signal_pending(current)) + if (afs_check_call_state(call, AFS_CALL_COMPLETE)) break; - schedule(); + + life = rxrpc_kernel_check_life(call->net->socket, call->rxcall); + if (timeout == 0 && + life == last_life && signal_pending(current)) + break; + + if (life != last_life) { + timeout = rtt2; + last_life = life; + } + + timeout = schedule_timeout(timeout); } remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); /* Kill off the call if it's still live. */ - if (call->state < AFS_CALL_COMPLETE) { + if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { _debug("call interrupted"); - rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_USER_ABORT, -EINTR, "KWI"); + if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + RX_USER_ABORT, -EINTR, "KWI")) + afs_set_call_complete(call, -EINTR, 0); + } + + spin_lock_bh(&call->state_lock); + ac->abort_code = call->abort_code; + ac->error = call->error; + spin_unlock_bh(&call->state_lock); + + ret = ac->error; + switch (ret) { + case 0: + if (call->ret_reply0) { + ret = (long)call->reply[0]; + call->reply[0] = NULL; + } + /* Fall through */ + case -ECONNABORTED: + ac->responded = true; + break; } - ret = call->error; _debug("call complete"); afs_put_call(call); - _leave(" = %d", ret); + _leave(" = %p", (void *)ret); return ret; } @@ -604,7 +642,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, u = __atomic_add_unless(&call->usage, 1, 0); if (u != 0) { trace_afs_call(call, afs_call_trace_wake, u, - atomic_read(&afs_outstanding_calls), + atomic_read(&call->net->nr_outstanding_calls), __builtin_return_address(0)); if (!queue_work(afs_async_calls, &call->async_work)) @@ -643,7 +681,7 @@ static void afs_process_async_call(struct work_struct *work) } if (call->state == AFS_CALL_COMPLETE) { - call->reply = NULL; + call->reply[0] = NULL; /* We have two refs to release - one from the alloc and one * queued with the work item - and we can't just deallocate the @@ -668,22 +706,24 @@ static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) /* * Charge the incoming call preallocation. */ -static void afs_charge_preallocation(struct work_struct *work) +void afs_charge_preallocation(struct work_struct *work) { - struct afs_call *call = afs_spare_incoming_call; + struct afs_net *net = + container_of(work, struct afs_net, charge_preallocation_work); + struct afs_call *call = net->spare_incoming_call; for (;;) { if (!call) { - call = afs_alloc_call(&afs_RXCMxxxx, GFP_KERNEL); + call = afs_alloc_call(net, &afs_RXCMxxxx, GFP_KERNEL); if (!call) break; call->async = true; - call->state = AFS_CALL_AWAIT_OP_ID; + call->state = AFS_CALL_SV_AWAIT_OP_ID; init_waitqueue_head(&call->waitq); } - if (rxrpc_kernel_charge_accept(afs_socket, + if (rxrpc_kernel_charge_accept(net->socket, afs_wake_up_async_call, afs_rx_attach, (unsigned long)call, @@ -691,7 +731,7 @@ static void afs_charge_preallocation(struct work_struct *work) break; call = NULL; } - afs_spare_incoming_call = call; + net->spare_incoming_call = call; } /* @@ -712,7 +752,9 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long user_call_ID) { - queue_work(afs_wq, &afs_charge_preallocation_work); + struct afs_net *net = afs_sock2net(sk); + + queue_work(afs_wq, &net->charge_preallocation_work); } /* @@ -733,7 +775,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call) return ret; call->operation_ID = ntohl(call->tmp); - call->state = AFS_CALL_AWAIT_REQUEST; + afs_set_call_state(call, AFS_CALL_SV_AWAIT_OP_ID, AFS_CALL_SV_AWAIT_REQUEST); call->offset = 0; /* ask the cache manager to route the call (it'll change the call type @@ -758,8 +800,7 @@ static void afs_notify_end_reply_tx(struct sock *sock, { struct afs_call *call = (struct afs_call *)call_user_ID; - if (call->state == AFS_CALL_REPLYING) - call->state = AFS_CALL_AWAIT_ACK; + afs_set_call_state(call, AFS_CALL_SV_REPLYING, AFS_CALL_SV_AWAIT_ACK); } /* @@ -767,11 +808,12 @@ static void afs_notify_end_reply_tx(struct sock *sock, */ void afs_send_empty_reply(struct afs_call *call) { + struct afs_net *net = call->net; struct msghdr msg; _enter(""); - rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, 0); + rxrpc_kernel_set_tx_length(net->socket, call->rxcall, 0); msg.msg_name = NULL; msg.msg_namelen = 0; @@ -780,8 +822,7 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_controllen = 0; msg.msg_flags = 0; - call->state = AFS_CALL_AWAIT_ACK; - switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0, + switch (rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, 0, afs_notify_end_reply_tx)) { case 0: _leave(" [replied]"); @@ -789,7 +830,7 @@ void afs_send_empty_reply(struct afs_call *call) case -ENOMEM: _debug("oom"); - rxrpc_kernel_abort_call(afs_socket, call->rxcall, + rxrpc_kernel_abort_call(net->socket, call->rxcall, RX_USER_ABORT, -ENOMEM, "KOO"); default: _leave(" [error]"); @@ -802,13 +843,14 @@ void afs_send_empty_reply(struct afs_call *call) */ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) { + struct afs_net *net = call->net; struct msghdr msg; struct kvec iov[1]; int n; _enter(""); - rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, len); + rxrpc_kernel_set_tx_length(net->socket, call->rxcall, len); iov[0].iov_base = (void *) buf; iov[0].iov_len = len; @@ -819,8 +861,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) msg.msg_controllen = 0; msg.msg_flags = 0; - call->state = AFS_CALL_AWAIT_ACK; - n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len, + n = rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, len, afs_notify_end_reply_tx); if (n >= 0) { /* Success */ @@ -830,7 +871,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); - rxrpc_kernel_abort_call(afs_socket, call->rxcall, + rxrpc_kernel_abort_call(net->socket, call->rxcall, RX_USER_ABORT, -ENOMEM, "KOO"); } _leave(" [error]"); @@ -842,6 +883,9 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) int afs_extract_data(struct afs_call *call, void *buf, size_t count, bool want_more) { + struct afs_net *net = call->net; + enum afs_call_state state; + u32 remote_abort = 0; int ret; _enter("{%s,%zu},,%zu,%d", @@ -849,31 +893,32 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, ASSERTCMP(call->offset, <=, count); - ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, + ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, buf, count, &call->offset, - want_more, &call->abort_code); + want_more, &remote_abort, + &call->service_id); trace_afs_recv_data(call, count, call->offset, want_more, ret); if (ret == 0 || ret == -EAGAIN) return ret; + state = READ_ONCE(call->state); if (ret == 1) { - switch (call->state) { - case AFS_CALL_AWAIT_REPLY: - call->state = AFS_CALL_COMPLETE; + switch (state) { + case AFS_CALL_CL_AWAIT_REPLY: + afs_set_call_state(call, state, AFS_CALL_CL_PROC_REPLY); break; - case AFS_CALL_AWAIT_REQUEST: - call->state = AFS_CALL_REPLYING; + case AFS_CALL_SV_AWAIT_REQUEST: + afs_set_call_state(call, state, AFS_CALL_SV_REPLYING); break; + case AFS_CALL_COMPLETE: + kdebug("prem complete %d", call->error); + return -EIO; default: break; } return 0; } - if (ret == -ECONNABORTED) - call->error = call->type->abort_to_error(call->abort_code); - else - call->error = ret; - call->state = AFS_CALL_COMPLETE; + afs_set_call_complete(call, ret, remote_abort); return ret; } diff --git a/fs/afs/security.c b/fs/afs/security.c index faca66227ecf..b88b7d45fdaa 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -1,6 +1,6 @@ /* AFS security handling * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -14,9 +14,13 @@ #include <linux/fs.h> #include <linux/ctype.h> #include <linux/sched.h> +#include <linux/hashtable.h> #include <keys/rxrpc-type.h> #include "internal.h" +static DEFINE_HASHTABLE(afs_permits_cache, 10); +static DEFINE_SPINLOCK(afs_permits_lock); + /* * get a key */ @@ -46,167 +50,235 @@ struct key *afs_request_key(struct afs_cell *cell) } /* - * dispose of a permits list + * Dispose of a list of permits. */ -void afs_zap_permits(struct rcu_head *rcu) +static void afs_permits_rcu(struct rcu_head *rcu) { struct afs_permits *permits = container_of(rcu, struct afs_permits, rcu); - int loop; - - _enter("{%d}", permits->count); + int i; - for (loop = permits->count - 1; loop >= 0; loop--) - key_put(permits->permits[loop].key); + for (i = 0; i < permits->nr_permits; i++) + key_put(permits->permits[i].key); kfree(permits); } /* - * dispose of a permits list in which all the key pointers have been copied + * Discard a permission cache. */ -static void afs_dispose_of_permits(struct rcu_head *rcu) +void afs_put_permits(struct afs_permits *permits) { - struct afs_permits *permits = - container_of(rcu, struct afs_permits, rcu); - - _enter("{%d}", permits->count); - - kfree(permits); + if (permits && refcount_dec_and_test(&permits->usage)) { + spin_lock(&afs_permits_lock); + hash_del_rcu(&permits->hash_node); + spin_unlock(&afs_permits_lock); + call_rcu(&permits->rcu, afs_permits_rcu); + } } /* - * get the authorising vnode - this is the specified inode itself if it's a - * directory or it's the parent directory if the specified inode is a file or - * symlink - * - the caller must release the ref on the inode + * Clear a permit cache on callback break. */ -static struct afs_vnode *afs_get_auth_inode(struct afs_vnode *vnode, - struct key *key) +void afs_clear_permits(struct afs_vnode *vnode) { - struct afs_vnode *auth_vnode; - struct inode *auth_inode; + struct afs_permits *permits; - _enter(""); + spin_lock(&vnode->lock); + permits = rcu_dereference_protected(vnode->permit_cache, + lockdep_is_held(&vnode->lock)); + RCU_INIT_POINTER(vnode->permit_cache, NULL); + vnode->cb_break++; + spin_unlock(&vnode->lock); - if (S_ISDIR(vnode->vfs_inode.i_mode)) { - auth_inode = igrab(&vnode->vfs_inode); - ASSERT(auth_inode != NULL); - } else { - auth_inode = afs_iget(vnode->vfs_inode.i_sb, key, - &vnode->status.parent, NULL, NULL); - if (IS_ERR(auth_inode)) - return ERR_CAST(auth_inode); - } - - auth_vnode = AFS_FS_I(auth_inode); - _leave(" = {%x}", auth_vnode->fid.vnode); - return auth_vnode; + if (permits) + afs_put_permits(permits); } /* - * clear the permit cache on a directory vnode + * Hash a list of permits. Use simple addition to make it easy to add an extra + * one at an as-yet indeterminate position in the list. */ -void afs_clear_permits(struct afs_vnode *vnode) +static void afs_hash_permits(struct afs_permits *permits) { - struct afs_permits *permits; - - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + unsigned long h = permits->nr_permits; + int i; - mutex_lock(&vnode->permits_lock); - permits = vnode->permits; - RCU_INIT_POINTER(vnode->permits, NULL); - mutex_unlock(&vnode->permits_lock); + for (i = 0; i < permits->nr_permits; i++) { + h += (unsigned long)permits->permits[i].key / sizeof(void *); + h += permits->permits[i].access; + } - if (permits) - call_rcu(&permits->rcu, afs_zap_permits); - _leave(""); + permits->h = h; } /* - * add the result obtained for a vnode to its or its parent directory's cache - * for the key used to access it + * Cache the CallerAccess result obtained from doing a fileserver operation + * that returned a vnode status for a particular key. If a callback break + * occurs whilst the operation was in progress then we have to ditch the cache + * as the ACL *may* have changed. */ -void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order) +void afs_cache_permit(struct afs_vnode *vnode, struct key *key, + unsigned int cb_break) { - struct afs_permits *permits, *xpermits; - struct afs_permit *permit; - struct afs_vnode *auth_vnode; - int count, loop; - - _enter("{%x:%u},%x,%lx", - vnode->fid.vid, vnode->fid.vnode, key_serial(key), acl_order); - - auth_vnode = afs_get_auth_inode(vnode, key); - if (IS_ERR(auth_vnode)) { - _leave(" [get error %ld]", PTR_ERR(auth_vnode)); - return; - } + struct afs_permits *permits, *xpermits, *replacement, *zap, *new = NULL; + afs_access_t caller_access = READ_ONCE(vnode->status.caller_access); + size_t size = 0; + bool changed = false; + int i, j; + + _enter("{%x:%u},%x,%x", + vnode->fid.vid, vnode->fid.vnode, key_serial(key), caller_access); + + rcu_read_lock(); + + /* Check for the common case first: We got back the same access as last + * time we tried and already have it recorded. + */ + permits = rcu_dereference(vnode->permit_cache); + if (permits) { + if (!permits->invalidated) { + for (i = 0; i < permits->nr_permits; i++) { + if (permits->permits[i].key < key) + continue; + if (permits->permits[i].key > key) + break; + if (permits->permits[i].access != caller_access) { + changed = true; + break; + } + + if (cb_break != (vnode->cb_break + + vnode->cb_interest->server->cb_s_break)) { + changed = true; + break; + } + + /* The cache is still good. */ + rcu_read_unlock(); + return; + } + } - mutex_lock(&auth_vnode->permits_lock); + changed |= permits->invalidated; + size = permits->nr_permits; - /* guard against a rename being detected whilst we waited for the - * lock */ - if (memcmp(&auth_vnode->fid, &vnode->status.parent, - sizeof(struct afs_fid)) != 0) { - _debug("renamed"); - goto out_unlock; + /* If this set of permits is now wrong, clear the permits + * pointer so that no one tries to use the stale information. + */ + if (changed) { + spin_lock(&vnode->lock); + if (permits != rcu_access_pointer(vnode->permit_cache)) + goto someone_else_changed_it_unlock; + RCU_INIT_POINTER(vnode->permit_cache, NULL); + spin_unlock(&vnode->lock); + + afs_put_permits(permits); + permits = NULL; + size = 0; + } } - /* have to be careful as the directory's callback may be broken between - * us receiving the status we're trying to cache and us getting the - * lock to update the cache for the status */ - if (auth_vnode->acl_order - acl_order > 0) { - _debug("ACL changed?"); - goto out_unlock; + if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break)) { + rcu_read_unlock(); + goto someone_else_changed_it; } - /* always update the anonymous mask */ - _debug("anon access %x", vnode->status.anon_access); - auth_vnode->status.anon_access = vnode->status.anon_access; - if (key == vnode->volume->cell->anonymous_key) - goto out_unlock; - - xpermits = auth_vnode->permits; - count = 0; - if (xpermits) { - /* see if the permit is already in the list - * - if it is then we just amend the list - */ - count = xpermits->count; - permit = xpermits->permits; - for (loop = count; loop > 0; loop--) { - if (permit->key == key) { - permit->access_mask = - vnode->status.caller_access; - goto out_unlock; + /* We need a ref on any permits list we want to copy as we'll have to + * drop the lock to do memory allocation. + */ + if (permits && !refcount_inc_not_zero(&permits->usage)) { + rcu_read_unlock(); + goto someone_else_changed_it; + } + + rcu_read_unlock(); + + /* Speculatively create a new list with the revised permission set. We + * discard this if we find an extant match already in the hash, but + * it's easier to compare with memcmp this way. + * + * We fill in the key pointers at this time, but we don't get the refs + * yet. + */ + size++; + new = kzalloc(sizeof(struct afs_permits) + + sizeof(struct afs_permit) * size, GFP_NOFS); + if (!new) + goto out_put; + + refcount_set(&new->usage, 1); + new->nr_permits = size; + i = j = 0; + if (permits) { + for (i = 0; i < permits->nr_permits; i++) { + if (j == i && permits->permits[i].key > key) { + new->permits[j].key = key; + new->permits[j].access = caller_access; + j++; } - permit++; + new->permits[j].key = permits->permits[i].key; + new->permits[j].access = permits->permits[i].access; + j++; } } - permits = kmalloc(sizeof(*permits) + sizeof(*permit) * (count + 1), - GFP_NOFS); - if (!permits) - goto out_unlock; - - if (xpermits) - memcpy(permits->permits, xpermits->permits, - count * sizeof(struct afs_permit)); - - _debug("key %x access %x", - key_serial(key), vnode->status.caller_access); - permits->permits[count].access_mask = vnode->status.caller_access; - permits->permits[count].key = key_get(key); - permits->count = count + 1; - - rcu_assign_pointer(auth_vnode->permits, permits); - if (xpermits) - call_rcu(&xpermits->rcu, afs_dispose_of_permits); - -out_unlock: - mutex_unlock(&auth_vnode->permits_lock); - iput(&auth_vnode->vfs_inode); - _leave(""); + if (j == i) { + new->permits[j].key = key; + new->permits[j].access = caller_access; + } + + afs_hash_permits(new); + + /* Now see if the permit list we want is actually already available */ + spin_lock(&afs_permits_lock); + + hash_for_each_possible(afs_permits_cache, xpermits, hash_node, new->h) { + if (xpermits->h != new->h || + xpermits->invalidated || + xpermits->nr_permits != new->nr_permits || + memcmp(xpermits->permits, new->permits, + new->nr_permits * sizeof(struct afs_permit)) != 0) + continue; + + if (refcount_inc_not_zero(&xpermits->usage)) { + replacement = xpermits; + goto found; + } + + break; + } + + for (i = 0; i < new->nr_permits; i++) + key_get(new->permits[i].key); + hash_add_rcu(afs_permits_cache, &new->hash_node, new->h); + replacement = new; + new = NULL; + +found: + spin_unlock(&afs_permits_lock); + + kfree(new); + + spin_lock(&vnode->lock); + zap = rcu_access_pointer(vnode->permit_cache); + if (cb_break == (vnode->cb_break + vnode->cb_interest->server->cb_s_break) && + zap == permits) + rcu_assign_pointer(vnode->permit_cache, replacement); + else + zap = replacement; + spin_unlock(&vnode->lock); + afs_put_permits(zap); +out_put: + afs_put_permits(permits); + return; + +someone_else_changed_it_unlock: + spin_unlock(&vnode->lock); +someone_else_changed_it: + /* Someone else changed the cache under us - don't recheck at this + * time. + */ + return; } /* @@ -214,60 +286,49 @@ out_unlock: * permitted to be accessed with this authorisation, and if so, what access it * is granted */ -static int afs_check_permit(struct afs_vnode *vnode, struct key *key, - afs_access_t *_access) +int afs_check_permit(struct afs_vnode *vnode, struct key *key, + afs_access_t *_access) { struct afs_permits *permits; - struct afs_permit *permit; - struct afs_vnode *auth_vnode; - bool valid; - int loop, ret; + bool valid = false; + int i, ret; _enter("{%x:%u},%x", vnode->fid.vid, vnode->fid.vnode, key_serial(key)); - auth_vnode = afs_get_auth_inode(vnode, key); - if (IS_ERR(auth_vnode)) { - *_access = 0; - _leave(" = %ld", PTR_ERR(auth_vnode)); - return PTR_ERR(auth_vnode); - } - - ASSERT(S_ISDIR(auth_vnode->vfs_inode.i_mode)); + permits = vnode->permit_cache; /* check the permits to see if we've got one yet */ - if (key == auth_vnode->volume->cell->anonymous_key) { + if (key == vnode->volume->cell->anonymous_key) { _debug("anon"); - *_access = auth_vnode->status.anon_access; + *_access = vnode->status.anon_access; valid = true; } else { - valid = false; rcu_read_lock(); - permits = rcu_dereference(auth_vnode->permits); + permits = rcu_dereference(vnode->permit_cache); if (permits) { - permit = permits->permits; - for (loop = permits->count; loop > 0; loop--) { - if (permit->key == key) { - _debug("found in cache"); - *_access = permit->access_mask; - valid = true; + for (i = 0; i < permits->nr_permits; i++) { + if (permits->permits[i].key < key) + continue; + if (permits->permits[i].key > key) break; - } - permit++; + + *_access = permits->permits[i].access; + valid = !permits->invalidated; + break; } } rcu_read_unlock(); } if (!valid) { - /* check the status on the file we're actually interested in - * (the post-processing will cache the result on auth_vnode) */ + /* Check the status on the file we're actually interested in + * (the post-processing will cache the result). + */ _debug("no valid permit"); - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - ret = afs_vnode_fetch_status(vnode, auth_vnode, key); + ret = afs_fetch_status(vnode, key); if (ret < 0) { - iput(&auth_vnode->vfs_inode); *_access = 0; _leave(" = %d", ret); return ret; @@ -275,7 +336,6 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key, *_access = vnode->status.caller_access; } - iput(&auth_vnode->vfs_inode); _leave(" = 0 [access %x]", *_access); return 0; } @@ -304,14 +364,9 @@ int afs_permission(struct inode *inode, int mask) return PTR_ERR(key); } - /* if the promise has expired, we need to check the server again */ - if (!vnode->cb_promised) { - _debug("not promised"); - ret = afs_vnode_fetch_status(vnode, NULL, key); - if (ret < 0) - goto error; - _debug("new promise [fl=%lx]", vnode->flags); - } + ret = afs_validate(vnode, key); + if (ret < 0) + goto error; /* check the permits to see if we've got one yet */ ret = afs_check_permit(vnode, key, &access); @@ -365,3 +420,12 @@ error: _leave(" = %d", ret); return ret; } + +void __exit afs_clean_up_permit_cache(void) +{ + int i; + + for (i = 0; i < HASH_SIZE(afs_permits_cache); i++) + WARN_ON_ONCE(!hlist_empty(&afs_permits_cache[i])); + +} diff --git a/fs/afs/server.c b/fs/afs/server.c index c001b1f2455f..1880f1b6a9f1 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -11,317 +11,689 @@ #include <linux/sched.h> #include <linux/slab.h> +#include "afs_fs.h" #include "internal.h" -static unsigned afs_server_timeout = 10; /* server timeout in seconds */ +static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */ +static unsigned afs_server_update_delay = 30; /* Time till VLDB recheck in secs */ -static void afs_reap_server(struct work_struct *); +static void afs_inc_servers_outstanding(struct afs_net *net) +{ + atomic_inc(&net->servers_outstanding); +} + +static void afs_dec_servers_outstanding(struct afs_net *net) +{ + if (atomic_dec_and_test(&net->servers_outstanding)) + wake_up_atomic_t(&net->servers_outstanding); +} + +/* + * Find a server by one of its addresses. + */ +struct afs_server *afs_find_server(struct afs_net *net, + const struct sockaddr_rxrpc *srx) +{ + const struct sockaddr_in6 *a = &srx->transport.sin6, *b; + const struct afs_addr_list *alist; + struct afs_server *server = NULL; + unsigned int i; + bool ipv6 = true; + int seq = 0, diff; + + if (srx->transport.sin6.sin6_addr.s6_addr32[0] == 0 || + srx->transport.sin6.sin6_addr.s6_addr32[1] == 0 || + srx->transport.sin6.sin6_addr.s6_addr32[2] == htonl(0xffff)) + ipv6 = false; + + rcu_read_lock(); + + do { + if (server) + afs_put_server(net, server); + server = NULL; + read_seqbegin_or_lock(&net->fs_addr_lock, &seq); + + if (ipv6) { + hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { + alist = rcu_dereference(server->addresses); + for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { + b = &alist->addrs[i].transport.sin6; + diff = (u16)a->sin6_port - (u16)b->sin6_port; + if (diff == 0) + diff = memcmp(&a->sin6_addr, + &b->sin6_addr, + sizeof(struct in6_addr)); + if (diff == 0) + goto found; + if (diff < 0) { + // TODO: Sort the list + //if (i == alist->nr_ipv4) + // goto not_found; + break; + } + } + } + } else { + hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) { + alist = rcu_dereference(server->addresses); + for (i = 0; i < alist->nr_ipv4; i++) { + b = &alist->addrs[i].transport.sin6; + diff = (u16)a->sin6_port - (u16)b->sin6_port; + if (diff == 0) + diff = ((u32)a->sin6_addr.s6_addr32[3] - + (u32)b->sin6_addr.s6_addr32[3]); + if (diff == 0) + goto found; + if (diff < 0) { + // TODO: Sort the list + //if (i == 0) + // goto not_found; + break; + } + } + } + } + + //not_found: + server = NULL; + found: + if (server && !atomic_inc_not_zero(&server->usage)) + server = NULL; + + } while (need_seqretry(&net->fs_addr_lock, seq)); -/* tree of all the servers, indexed by IP address */ -static struct rb_root afs_servers = RB_ROOT; -static DEFINE_RWLOCK(afs_servers_lock); + done_seqretry(&net->fs_addr_lock, seq); -/* LRU list of all the servers not currently in use */ -static LIST_HEAD(afs_server_graveyard); -static DEFINE_SPINLOCK(afs_server_graveyard_lock); -static DECLARE_DELAYED_WORK(afs_server_reaper, afs_reap_server); + rcu_read_unlock(); + return server; +} /* - * install a server record in the master tree + * Look up a server by its UUID */ -static int afs_install_server(struct afs_server *server) +struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uuid) { - struct afs_server *xserver; + struct afs_server *server = NULL; + struct rb_node *p; + int diff, seq = 0; + + _enter("%pU", uuid); + + do { + /* Unfortunately, rbtree walking doesn't give reliable results + * under just the RCU read lock, so we have to check for + * changes. + */ + if (server) + afs_put_server(net, server); + server = NULL; + + read_seqbegin_or_lock(&net->fs_lock, &seq); + + p = net->fs_servers.rb_node; + while (p) { + server = rb_entry(p, struct afs_server, uuid_rb); + + diff = memcmp(uuid, &server->uuid, sizeof(*uuid)); + if (diff < 0) { + p = p->rb_left; + } else if (diff > 0) { + p = p->rb_right; + } else { + afs_get_server(server); + break; + } + + server = NULL; + } + } while (need_seqretry(&net->fs_lock, seq)); + + done_seqretry(&net->fs_lock, seq); + + _leave(" = %p", server); + return server; +} + +/* + * Install a server record in the namespace tree + */ +static struct afs_server *afs_install_server(struct afs_net *net, + struct afs_server *candidate) +{ + const struct afs_addr_list *alist; + struct afs_server *server; struct rb_node **pp, *p; - int ret; + int ret = -EEXIST, diff; - _enter("%p", server); + _enter("%p", candidate); - write_lock(&afs_servers_lock); + write_seqlock(&net->fs_lock); - ret = -EEXIST; - pp = &afs_servers.rb_node; + /* Firstly install the server in the UUID lookup tree */ + pp = &net->fs_servers.rb_node; p = NULL; while (*pp) { p = *pp; _debug("- consider %p", p); - xserver = rb_entry(p, struct afs_server, master_rb); - if (server->addr.s_addr < xserver->addr.s_addr) + server = rb_entry(p, struct afs_server, uuid_rb); + diff = memcmp(&candidate->uuid, &server->uuid, sizeof(uuid_t)); + if (diff < 0) pp = &(*pp)->rb_left; - else if (server->addr.s_addr > xserver->addr.s_addr) + else if (diff > 0) pp = &(*pp)->rb_right; else - goto error; + goto exists; } - rb_link_node(&server->master_rb, p, pp); - rb_insert_color(&server->master_rb, &afs_servers); + server = candidate; + rb_link_node(&server->uuid_rb, p, pp); + rb_insert_color(&server->uuid_rb, &net->fs_servers); + hlist_add_head_rcu(&server->proc_link, &net->fs_proc); + + write_seqlock(&net->fs_addr_lock); + alist = rcu_dereference_protected(server->addresses, + lockdep_is_held(&net->fs_addr_lock.lock)); + + /* Secondly, if the server has any IPv4 and/or IPv6 addresses, install + * it in the IPv4 and/or IPv6 reverse-map lists. + * + * TODO: For speed we want to use something other than a flat list + * here; even sorting the list in terms of lowest address would help a + * bit, but anything we might want to do gets messy and memory + * intensive. + */ + if (alist->nr_ipv4 > 0) + hlist_add_head_rcu(&server->addr4_link, &net->fs_addresses4); + if (alist->nr_addrs > alist->nr_ipv4) + hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6); + + write_sequnlock(&net->fs_addr_lock); ret = 0; -error: - write_unlock(&afs_servers_lock); - return ret; +exists: + afs_get_server(server); + write_sequnlock(&net->fs_lock); + return server; } /* * allocate a new server record */ -static struct afs_server *afs_alloc_server(struct afs_cell *cell, - const struct in_addr *addr) +static struct afs_server *afs_alloc_server(struct afs_net *net, + const uuid_t *uuid, + struct afs_addr_list *alist) { struct afs_server *server; _enter(""); server = kzalloc(sizeof(struct afs_server), GFP_KERNEL); - if (server) { - atomic_set(&server->usage, 1); - server->cell = cell; - - INIT_LIST_HEAD(&server->link); - INIT_LIST_HEAD(&server->grave); - init_rwsem(&server->sem); - spin_lock_init(&server->fs_lock); - server->fs_vnodes = RB_ROOT; - server->cb_promises = RB_ROOT; - spin_lock_init(&server->cb_lock); - init_waitqueue_head(&server->cb_break_waitq); - INIT_DELAYED_WORK(&server->cb_break_work, - afs_dispatch_give_up_callbacks); - - memcpy(&server->addr, addr, sizeof(struct in_addr)); - server->addr.s_addr = addr->s_addr; - _leave(" = %p{%d}", server, atomic_read(&server->usage)); - } else { - _leave(" = NULL [nomem]"); - } + if (!server) + goto enomem; + + atomic_set(&server->usage, 1); + RCU_INIT_POINTER(server->addresses, alist); + server->addr_version = alist->version; + server->uuid = *uuid; + server->flags = (1UL << AFS_SERVER_FL_NEW); + server->update_at = ktime_get_real_seconds() + afs_server_update_delay; + rwlock_init(&server->fs_lock); + INIT_LIST_HEAD(&server->cb_interests); + rwlock_init(&server->cb_break_lock); + + afs_inc_servers_outstanding(net); + _leave(" = %p", server); return server; + +enomem: + _leave(" = NULL [nomem]"); + return NULL; +} + +/* + * Look up an address record for a server + */ +static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell, + struct key *key, const uuid_t *uuid) +{ + struct afs_addr_cursor ac; + struct afs_addr_list *alist; + int ret; + + ret = afs_set_vl_cursor(&ac, cell); + if (ret < 0) + return ERR_PTR(ret); + + while (afs_iterate_addresses(&ac)) { + if (test_bit(ac.index, &ac.alist->yfs)) + alist = afs_yfsvl_get_endpoints(cell->net, &ac, key, uuid); + else + alist = afs_vl_get_addrs_u(cell->net, &ac, key, uuid); + switch (ac.error) { + case 0: + afs_end_cursor(&ac); + return alist; + case -ECONNABORTED: + ac.error = afs_abort_to_error(ac.abort_code); + goto error; + case -ENOMEM: + case -ENONET: + goto error; + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + break; + default: + ac.error = -EIO; + goto error; + } + } + +error: + return ERR_PTR(afs_end_cursor(&ac)); } /* - * get an FS-server record for a cell + * Get or create a fileserver record. */ -struct afs_server *afs_lookup_server(struct afs_cell *cell, - const struct in_addr *addr) +struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key, + const uuid_t *uuid) { + struct afs_addr_list *alist; struct afs_server *server, *candidate; - _enter("%p,%pI4", cell, &addr->s_addr); + _enter("%p,%pU", cell->net, uuid); - /* quick scan of the list to see if we already have the server */ - read_lock(&cell->servers_lock); + server = afs_find_server_by_uuid(cell->net, uuid); + if (server) + return server; - list_for_each_entry(server, &cell->servers, link) { - if (server->addr.s_addr == addr->s_addr) - goto found_server_quickly; - } - read_unlock(&cell->servers_lock); + alist = afs_vl_lookup_addrs(cell, key, uuid); + if (IS_ERR(alist)) + return ERR_CAST(alist); - candidate = afs_alloc_server(cell, addr); + candidate = afs_alloc_server(cell->net, uuid, alist); if (!candidate) { - _leave(" = -ENOMEM"); + afs_put_addrlist(alist); return ERR_PTR(-ENOMEM); } - write_lock(&cell->servers_lock); - - /* check the cell's server list again */ - list_for_each_entry(server, &cell->servers, link) { - if (server->addr.s_addr == addr->s_addr) - goto found_server; + server = afs_install_server(cell->net, candidate); + if (server != candidate) { + afs_put_addrlist(alist); + kfree(candidate); } - _debug("new"); - server = candidate; - if (afs_install_server(server) < 0) - goto server_in_two_cells; - - afs_get_cell(cell); - list_add_tail(&server->link, &cell->servers); - - write_unlock(&cell->servers_lock); _leave(" = %p{%d}", server, atomic_read(&server->usage)); return server; +} - /* found a matching server quickly */ -found_server_quickly: - _debug("found quickly"); - afs_get_server(server); - read_unlock(&cell->servers_lock); -no_longer_unused: - if (!list_empty(&server->grave)) { - spin_lock(&afs_server_graveyard_lock); - list_del_init(&server->grave); - spin_unlock(&afs_server_graveyard_lock); +/* + * Set the server timer to fire after a given delay, assuming it's not already + * set for an earlier time. + */ +static void afs_set_server_timer(struct afs_net *net, time64_t delay) +{ + if (net->live) { + afs_inc_servers_outstanding(net); + if (timer_reduce(&net->fs_timer, jiffies + delay * HZ)) + afs_dec_servers_outstanding(net); } - _leave(" = %p{%d}", server, atomic_read(&server->usage)); - return server; +} - /* found a matching server on the second pass */ -found_server: - _debug("found"); - afs_get_server(server); - write_unlock(&cell->servers_lock); - kfree(candidate); - goto no_longer_unused; - - /* found a server that seems to be in two cells */ -server_in_two_cells: - write_unlock(&cell->servers_lock); - kfree(candidate); - printk(KERN_NOTICE "kAFS: Server %pI4 appears to be in two cells\n", - addr); - _leave(" = -EEXIST"); - return ERR_PTR(-EEXIST); +/* + * Server management timer. We have an increment on fs_outstanding that we + * need to pass along to the work item. + */ +void afs_servers_timer(struct timer_list *timer) +{ + struct afs_net *net = container_of(timer, struct afs_net, fs_timer); + + _enter(""); + if (!queue_work(afs_wq, &net->fs_manager)) + afs_dec_servers_outstanding(net); } /* - * look up a server by its IP address + * Release a reference on a server record. */ -struct afs_server *afs_find_server(const struct sockaddr_rxrpc *srx) +void afs_put_server(struct afs_net *net, struct afs_server *server) { - struct afs_server *server = NULL; - struct rb_node *p; - struct in_addr addr = srx->transport.sin.sin_addr; + unsigned int usage; - _enter("{%d,%pI4}", srx->transport.family, &addr.s_addr); + if (!server) + return; - if (srx->transport.family != AF_INET) { - WARN(true, "AFS does not yes support non-IPv4 addresses\n"); - return NULL; - } + server->put_time = ktime_get_real_seconds(); - read_lock(&afs_servers_lock); + usage = atomic_dec_return(&server->usage); - p = afs_servers.rb_node; - while (p) { - server = rb_entry(p, struct afs_server, master_rb); + _enter("{%u}", usage); - _debug("- consider %p", p); + if (likely(usage > 0)) + return; - if (addr.s_addr < server->addr.s_addr) { - p = p->rb_left; - } else if (addr.s_addr > server->addr.s_addr) { - p = p->rb_right; - } else { - afs_get_server(server); - goto found; - } - } + afs_set_server_timer(net, afs_server_gc_delay); +} - server = NULL; -found: - read_unlock(&afs_servers_lock); - ASSERTIFCMP(server, server->addr.s_addr, ==, addr.s_addr); - _leave(" = %p", server); - return server; +static void afs_server_rcu(struct rcu_head *rcu) +{ + struct afs_server *server = container_of(rcu, struct afs_server, rcu); + + afs_put_addrlist(server->addresses); + kfree(server); } /* - * destroy a server record - * - removes from the cell list + * destroy a dead server */ -void afs_put_server(struct afs_server *server) +static void afs_destroy_server(struct afs_net *net, struct afs_server *server) { - if (!server) - return; + struct afs_addr_list *alist = server->addresses; + struct afs_addr_cursor ac = { + .alist = alist, + .addr = &alist->addrs[0], + .start = alist->index, + .index = alist->index, + .error = 0, + }; + _enter("%p", server); - _enter("%p{%d}", server, atomic_read(&server->usage)); + afs_fs_give_up_all_callbacks(net, server, &ac, NULL); + call_rcu(&server->rcu, afs_server_rcu); + afs_dec_servers_outstanding(net); +} - _debug("PUT SERVER %d", atomic_read(&server->usage)); +/* + * Garbage collect any expired servers. + */ +static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) +{ + struct afs_server *server; + bool deleted; + int usage; + + while ((server = gc_list)) { + gc_list = server->gc_next; + + write_seqlock(&net->fs_lock); + usage = 1; + deleted = atomic_try_cmpxchg(&server->usage, &usage, 0); + if (deleted) { + rb_erase(&server->uuid_rb, &net->fs_servers); + hlist_del_rcu(&server->proc_link); + } + write_sequnlock(&net->fs_lock); - ASSERTCMP(atomic_read(&server->usage), >, 0); + if (deleted) + afs_destroy_server(net, server); + } +} - if (likely(!atomic_dec_and_test(&server->usage))) { - _leave(""); - return; +/* + * Manage the records of servers known to be within a network namespace. This + * includes garbage collecting unused servers. + * + * Note also that we were given an increment on net->servers_outstanding by + * whoever queued us that we need to deal with before returning. + */ +void afs_manage_servers(struct work_struct *work) +{ + struct afs_net *net = container_of(work, struct afs_net, fs_manager); + struct afs_server *gc_list = NULL; + struct rb_node *cursor; + time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX; + bool purging = !net->live; + + _enter(""); + + /* Trawl the server list looking for servers that have expired from + * lack of use. + */ + read_seqlock_excl(&net->fs_lock); + + for (cursor = rb_first(&net->fs_servers); cursor; cursor = rb_next(cursor)) { + struct afs_server *server = + rb_entry(cursor, struct afs_server, uuid_rb); + int usage = atomic_read(&server->usage); + + _debug("manage %pU %u", &server->uuid, usage); + + ASSERTCMP(usage, >=, 1); + ASSERTIFCMP(purging, usage, ==, 1); + + if (usage == 1) { + time64_t expire_at = server->put_time; + + if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) && + !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags)) + expire_at += afs_server_gc_delay; + if (purging || expire_at <= now) { + server->gc_next = gc_list; + gc_list = server; + } else if (expire_at < next_manage) { + next_manage = expire_at; + } + } } - afs_flush_callback_breaks(server); + read_sequnlock_excl(&net->fs_lock); + + /* Update the timer on the way out. We have to pass an increment on + * servers_outstanding in the namespace that we are in to the timer or + * the work scheduler. + */ + if (!purging && next_manage < TIME64_MAX) { + now = ktime_get_real_seconds(); - spin_lock(&afs_server_graveyard_lock); - if (atomic_read(&server->usage) == 0) { - list_move_tail(&server->grave, &afs_server_graveyard); - server->time_of_death = ktime_get_real_seconds(); - queue_delayed_work(afs_wq, &afs_server_reaper, - afs_server_timeout * HZ); + if (next_manage - now <= 0) { + if (queue_work(afs_wq, &net->fs_manager)) + afs_inc_servers_outstanding(net); + } else { + afs_set_server_timer(net, next_manage - now); + } } - spin_unlock(&afs_server_graveyard_lock); - _leave(" [dead]"); + + afs_gc_servers(net, gc_list); + + afs_dec_servers_outstanding(net); + _leave(" [%d]", atomic_read(&net->servers_outstanding)); +} + +static void afs_queue_server_manager(struct afs_net *net) +{ + afs_inc_servers_outstanding(net); + if (!queue_work(afs_wq, &net->fs_manager)) + afs_dec_servers_outstanding(net); } /* - * destroy a dead server + * Purge list of servers. */ -static void afs_destroy_server(struct afs_server *server) +void afs_purge_servers(struct afs_net *net) { - _enter("%p", server); + _enter(""); - ASSERTIF(server->cb_break_head != server->cb_break_tail, - delayed_work_pending(&server->cb_break_work)); + if (del_timer_sync(&net->fs_timer)) + atomic_dec(&net->servers_outstanding); - ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL); - ASSERTCMP(server->cb_promises.rb_node, ==, NULL); - ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail); - ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0); + afs_queue_server_manager(net); - afs_put_cell(server->cell); - kfree(server); + _debug("wait"); + wait_on_atomic_t(&net->servers_outstanding, atomic_t_wait, + TASK_UNINTERRUPTIBLE); + _leave(""); } /* - * reap dead server records + * Probe a fileserver to find its capabilities. + * + * TODO: Try service upgrade. */ -static void afs_reap_server(struct work_struct *work) +static bool afs_do_probe_fileserver(struct afs_fs_cursor *fc) { - LIST_HEAD(corpses); - struct afs_server *server; - unsigned long delay, expiry; - time64_t now; - - now = ktime_get_real_seconds(); - spin_lock(&afs_server_graveyard_lock); - - while (!list_empty(&afs_server_graveyard)) { - server = list_entry(afs_server_graveyard.next, - struct afs_server, grave); + _enter(""); - /* the queue is ordered most dead first */ - expiry = server->time_of_death + afs_server_timeout; - if (expiry > now) { - delay = (expiry - now) * HZ; - mod_delayed_work(afs_wq, &afs_server_reaper, delay); + fc->ac.addr = NULL; + fc->ac.start = READ_ONCE(fc->ac.alist->index); + fc->ac.index = fc->ac.start; + fc->ac.error = 0; + fc->ac.begun = false; + + while (afs_iterate_addresses(&fc->ac)) { + afs_fs_get_capabilities(afs_v2net(fc->vnode), fc->cbi->server, + &fc->ac, fc->key); + switch (fc->ac.error) { + case 0: + afs_end_cursor(&fc->ac); + set_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags); + return true; + case -ECONNABORTED: + fc->ac.error = afs_abort_to_error(fc->ac.abort_code); + goto error; + case -ENOMEM: + case -ENONET: + goto error; + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: break; + default: + fc->ac.error = -EIO; + goto error; } + } - write_lock(&server->cell->servers_lock); - write_lock(&afs_servers_lock); - if (atomic_read(&server->usage) > 0) { - list_del_init(&server->grave); - } else { - list_move_tail(&server->grave, &corpses); - list_del_init(&server->link); - rb_erase(&server->master_rb, &afs_servers); - } - write_unlock(&afs_servers_lock); - write_unlock(&server->cell->servers_lock); +error: + afs_end_cursor(&fc->ac); + return false; +} + +/* + * If we haven't already, try probing the fileserver to get its capabilities. + * We try not to instigate parallel probes, but it's possible that the parallel + * probes will fail due to authentication failure when ours would succeed. + * + * TODO: Try sending an anonymous probe if an authenticated probe fails. + */ +bool afs_probe_fileserver(struct afs_fs_cursor *fc) +{ + bool success; + int ret, retries = 0; + + _enter(""); + +retry: + if (test_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags)) { + _leave(" = t"); + return true; } - spin_unlock(&afs_server_graveyard_lock); + if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags)) { + success = afs_do_probe_fileserver(fc); + clear_bit_unlock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags); + wake_up_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING); + _leave(" = t"); + return success; + } + + _debug("wait"); + ret = wait_on_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING, + TASK_INTERRUPTIBLE); + if (ret == -ERESTARTSYS) { + fc->ac.error = ret; + _leave(" = f [%d]", ret); + return false; + } - /* now reap the corpses we've extracted */ - while (!list_empty(&corpses)) { - server = list_entry(corpses.next, struct afs_server, grave); - list_del(&server->grave); - afs_destroy_server(server); + retries++; + if (retries == 4) { + fc->ac.error = -ESTALE; + _leave(" = f [stale]"); + return false; } + _debug("retry"); + goto retry; } /* - * discard all the server records for rmmod + * Get an update for a server's address list. */ -void __exit afs_purge_servers(void) +static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct afs_server *server) { - afs_server_timeout = 0; - mod_delayed_work(afs_wq, &afs_server_reaper, 0); + struct afs_addr_list *alist, *discard; + + _enter(""); + + alist = afs_vl_lookup_addrs(fc->vnode->volume->cell, fc->key, + &server->uuid); + if (IS_ERR(alist)) { + fc->ac.error = PTR_ERR(alist); + _leave(" = f [%d]", fc->ac.error); + return false; + } + + discard = alist; + if (server->addr_version != alist->version) { + write_lock(&server->fs_lock); + discard = rcu_dereference_protected(server->addresses, + lockdep_is_held(&server->fs_lock)); + rcu_assign_pointer(server->addresses, alist); + server->addr_version = alist->version; + write_unlock(&server->fs_lock); + } + + server->update_at = ktime_get_real_seconds() + afs_server_update_delay; + afs_put_addrlist(discard); + _leave(" = t"); + return true; +} + +/* + * See if a server's address list needs updating. + */ +bool afs_check_server_record(struct afs_fs_cursor *fc, struct afs_server *server) +{ + time64_t now = ktime_get_real_seconds(); + long diff; + bool success; + int ret, retries = 0; + + _enter(""); + + ASSERT(server); + +retry: + diff = READ_ONCE(server->update_at) - now; + if (diff > 0) { + _leave(" = t [not now %ld]", diff); + return true; + } + + if (!test_and_set_bit_lock(AFS_SERVER_FL_UPDATING, &server->flags)) { + success = afs_update_server_record(fc, server); + clear_bit_unlock(AFS_SERVER_FL_UPDATING, &server->flags); + wake_up_bit(&server->flags, AFS_SERVER_FL_UPDATING); + _leave(" = %d", success); + return success; + } + + ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING, + TASK_INTERRUPTIBLE); + if (ret == -ERESTARTSYS) { + fc->ac.error = ret; + _leave(" = f [intr]"); + return false; + } + + retries++; + if (retries == 4) { + _leave(" = f [stale]"); + ret = -ESTALE; + return false; + } + goto retry; } diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c new file mode 100644 index 000000000000..0ab3f8457839 --- /dev/null +++ b/fs/afs/server_list.c @@ -0,0 +1,153 @@ +/* AFS fileserver list management. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include "internal.h" + +void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) +{ + int i; + + if (slist && refcount_dec_and_test(&slist->usage)) { + for (i = 0; i < slist->nr_servers; i++) { + afs_put_cb_interest(net, slist->servers[i].cb_interest); + afs_put_server(net, slist->servers[i].server); + } + kfree(slist); + } +} + +/* + * Build a server list from a VLDB record. + */ +struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, + struct key *key, + struct afs_vldb_entry *vldb, + u8 type_mask) +{ + struct afs_server_list *slist; + struct afs_server *server; + int ret = -ENOMEM, nr_servers = 0, i, j; + + for (i = 0; i < vldb->nr_servers; i++) + if (vldb->fs_mask[i] & type_mask) + nr_servers++; + + slist = kzalloc(sizeof(struct afs_server_list) + + sizeof(struct afs_server_entry) * nr_servers, + GFP_KERNEL); + if (!slist) + goto error; + + refcount_set(&slist->usage, 1); + + /* Make sure a records exists for each server in the list. */ + for (i = 0; i < vldb->nr_servers; i++) { + if (!(vldb->fs_mask[i] & type_mask)) + continue; + + server = afs_lookup_server(cell, key, &vldb->fs_server[i]); + if (IS_ERR(server)) { + ret = PTR_ERR(server); + if (ret == -ENOENT) + continue; + goto error_2; + } + + /* Insertion-sort by server pointer */ + for (j = 0; j < slist->nr_servers; j++) + if (slist->servers[j].server >= server) + break; + if (j < slist->nr_servers) { + if (slist->servers[j].server == server) { + afs_put_server(cell->net, server); + continue; + } + + memmove(slist->servers + j + 1, + slist->servers + j, + (slist->nr_servers - j) * sizeof(struct afs_server_entry)); + } + + slist->servers[j].server = server; + slist->nr_servers++; + } + + if (slist->nr_servers == 0) { + ret = -EDESTADDRREQ; + goto error_2; + } + + return slist; + +error_2: + afs_put_serverlist(cell->net, slist); +error: + return ERR_PTR(ret); +} + +/* + * Copy the annotations from an old server list to its potential replacement. + */ +bool afs_annotate_server_list(struct afs_server_list *new, + struct afs_server_list *old) +{ + struct afs_server *cur; + int i, j; + + if (old->nr_servers != new->nr_servers) + goto changed; + + for (i = 0; i < old->nr_servers; i++) + if (old->servers[i].server != new->servers[i].server) + goto changed; + + return false; + +changed: + /* Maintain the same current server as before if possible. */ + cur = old->servers[old->index].server; + for (j = 0; j < new->nr_servers; j++) { + if (new->servers[j].server == cur) { + new->index = j; + break; + } + } + + /* Keep the old callback interest records where possible so that we + * maintain callback interception. + */ + i = 0; + j = 0; + while (i < old->nr_servers && j < new->nr_servers) { + if (new->servers[j].server == old->servers[i].server) { + struct afs_cb_interest *cbi = old->servers[i].cb_interest; + if (cbi) { + new->servers[j].cb_interest = cbi; + refcount_inc(&cbi->usage); + } + i++; + j++; + continue; + } + + if (new->servers[j].server < old->servers[i].server) { + j++; + continue; + } + + i++; + continue; + } + + return true; +} diff --git a/fs/afs/super.c b/fs/afs/super.c index 689173c0a682..1037dd41a622 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -25,11 +25,10 @@ #include <linux/statfs.h> #include <linux/sched.h> #include <linux/nsproxy.h> +#include <linux/magic.h> #include <net/net_namespace.h> #include "internal.h" -#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ - static void afs_i_init_once(void *foo); static struct dentry *afs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); @@ -143,9 +142,9 @@ void __exit afs_fs_exit(void) */ static int afs_show_devname(struct seq_file *m, struct dentry *root) { - struct afs_super_info *as = root->d_sb->s_fs_info; + struct afs_super_info *as = AFS_FS_S(root->d_sb); struct afs_volume *volume = as->volume; - struct afs_cell *cell = volume->cell; + struct afs_cell *cell = as->cell; const char *suf = ""; char pref = '%'; @@ -163,7 +162,7 @@ static int afs_show_devname(struct seq_file *m, struct dentry *root) break; } - seq_printf(m, "%c%s:%s%s", pref, cell->name, volume->vlocation->vldb.name, suf); + seq_printf(m, "%c%s:%s%s", pref, cell->name, volume->name, suf); return 0; } @@ -201,12 +200,14 @@ static int afs_parse_options(struct afs_mount_params *params, token = match_token(p, afs_options_list, args); switch (token) { case afs_opt_cell: - cell = afs_cell_lookup(args[0].from, - args[0].to - args[0].from, - false); + rcu_read_lock(); + cell = afs_lookup_cell_rcu(params->net, + args[0].from, + args[0].to - args[0].from); + rcu_read_unlock(); if (IS_ERR(cell)) return PTR_ERR(cell); - afs_put_cell(params->cell); + afs_put_cell(params->net, params->cell); params->cell = cell; break; @@ -308,13 +309,14 @@ static int afs_parse_device_name(struct afs_mount_params *params, /* lookup the cell record */ if (cellname || !params->cell) { - cell = afs_cell_lookup(cellname, cellnamesz, true); + cell = afs_lookup_cell(params->net, cellname, cellnamesz, + NULL, false); if (IS_ERR(cell)) { printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n", cellnamesz, cellnamesz, cellname ?: ""); return PTR_ERR(cell); } - afs_put_cell(params->cell); + afs_put_cell(params->net, params->cell); params->cell = cell; } @@ -332,14 +334,16 @@ static int afs_parse_device_name(struct afs_mount_params *params, static int afs_test_super(struct super_block *sb, void *data) { struct afs_super_info *as1 = data; - struct afs_super_info *as = sb->s_fs_info; + struct afs_super_info *as = AFS_FS_S(sb); - return as->volume == as1->volume; + return as->net == as1->net && as->volume->vid == as1->volume->vid; } static int afs_set_super(struct super_block *sb, void *data) { - sb->s_fs_info = data; + struct afs_super_info *as = data; + + sb->s_fs_info = as; return set_anon_super(sb, NULL); } @@ -349,7 +353,7 @@ static int afs_set_super(struct super_block *sb, void *data) static int afs_fill_super(struct super_block *sb, struct afs_mount_params *params) { - struct afs_super_info *as = sb->s_fs_info; + struct afs_super_info *as = AFS_FS_S(sb); struct afs_fid fid; struct inode *inode = NULL; int ret; @@ -366,13 +370,15 @@ static int afs_fill_super(struct super_block *sb, if (ret) return ret; sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; - strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id)); + sprintf(sb->s_id, "%u", as->volume->vid); + + afs_activate_volume(as->volume); /* allocate the root inode and dentry */ fid.vid = as->volume->vid; fid.vnode = 1; fid.unique = 1; - inode = afs_iget(sb, params->key, &fid, NULL, NULL); + inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -394,23 +400,45 @@ error: return ret; } +static struct afs_super_info *afs_alloc_sbi(struct afs_mount_params *params) +{ + struct afs_super_info *as; + + as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); + if (as) { + as->net = afs_get_net(params->net); + as->cell = afs_get_cell(params->cell); + } + return as; +} + +static void afs_destroy_sbi(struct afs_super_info *as) +{ + if (as) { + afs_put_volume(as->cell, as->volume); + afs_put_cell(as->net, as->cell); + afs_put_net(as->net); + kfree(as); + } +} + /* * get an AFS superblock */ static struct dentry *afs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *options) + int flags, const char *dev_name, void *options) { struct afs_mount_params params; struct super_block *sb; - struct afs_volume *vol; + struct afs_volume *candidate; struct key *key; - char *new_opts = kstrdup(options, GFP_KERNEL); struct afs_super_info *as; int ret; _enter(",,%s,%p", dev_name, options); memset(¶ms, 0, sizeof(params)); + params.net = &__afs_net; ret = -EINVAL; if (current->nsproxy->net_ns != &init_net) @@ -436,70 +464,81 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, } params.key = key; - /* parse the device name */ - vol = afs_volume_lookup(¶ms); - if (IS_ERR(vol)) { - ret = PTR_ERR(vol); - goto error; - } - /* allocate a superblock info record */ - as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); - if (!as) { - ret = -ENOMEM; - afs_put_volume(vol); - goto error; + ret = -ENOMEM; + as = afs_alloc_sbi(¶ms); + if (!as) + goto error_key; + + /* Assume we're going to need a volume record; at the very least we can + * use it to update the volume record if we have one already. This + * checks that the volume exists within the cell. + */ + candidate = afs_create_volume(¶ms); + if (IS_ERR(candidate)) { + ret = PTR_ERR(candidate); + goto error_as; } - as->volume = vol; + + as->volume = candidate; /* allocate a deviceless superblock */ sb = sget(fs_type, afs_test_super, afs_set_super, flags, as); if (IS_ERR(sb)) { ret = PTR_ERR(sb); - afs_put_volume(vol); - kfree(as); - goto error; + goto error_as; } if (!sb->s_root) { /* initial superblock/root creation */ _debug("create"); ret = afs_fill_super(sb, ¶ms); - if (ret < 0) { - deactivate_locked_super(sb); - goto error; - } - sb->s_flags |= MS_ACTIVE; + if (ret < 0) + goto error_sb; + as = NULL; + sb->s_flags |= SB_ACTIVE; } else { _debug("reuse"); - ASSERTCMP(sb->s_flags, &, MS_ACTIVE); - afs_put_volume(vol); - kfree(as); + ASSERTCMP(sb->s_flags, &, SB_ACTIVE); + afs_destroy_sbi(as); + as = NULL; } - afs_put_cell(params.cell); - kfree(new_opts); + afs_put_cell(params.net, params.cell); + key_put(params.key); _leave(" = 0 [%p]", sb); return dget(sb->s_root); -error: - afs_put_cell(params.cell); +error_sb: + deactivate_locked_super(sb); + goto error_key; +error_as: + afs_destroy_sbi(as); +error_key: key_put(params.key); - kfree(new_opts); +error: + afs_put_cell(params.net, params.cell); _leave(" = %d", ret); return ERR_PTR(ret); } static void afs_kill_super(struct super_block *sb) { - struct afs_super_info *as = sb->s_fs_info; + struct afs_super_info *as = AFS_FS_S(sb); + + /* Clear the callback interests (which will do ilookup5) before + * deactivating the superblock. + */ + afs_clear_callback_interests(as->net, as->volume->servers); kill_anon_super(sb); - afs_put_volume(as->volume); - kfree(as); + afs_deactivate_volume(as->volume); + afs_destroy_sbi(as); } /* - * initialise an inode cache slab element prior to any use + * Initialise an inode cache slab element prior to any use. Note that + * afs_alloc_inode() *must* reset anything that could incorrectly leak from one + * inode to another. */ static void afs_i_init_once(void *_vnode) { @@ -507,16 +546,15 @@ static void afs_i_init_once(void *_vnode) memset(vnode, 0, sizeof(*vnode)); inode_init_once(&vnode->vfs_inode); - init_waitqueue_head(&vnode->update_waitq); - mutex_init(&vnode->permits_lock); + mutex_init(&vnode->io_lock); mutex_init(&vnode->validate_lock); - spin_lock_init(&vnode->writeback_lock); + spin_lock_init(&vnode->wb_lock); spin_lock_init(&vnode->lock); - INIT_LIST_HEAD(&vnode->writebacks); + INIT_LIST_HEAD(&vnode->wb_keys); INIT_LIST_HEAD(&vnode->pending_locks); INIT_LIST_HEAD(&vnode->granted_locks); INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work); - INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work); + seqlock_init(&vnode->cb_lock); } /* @@ -532,13 +570,21 @@ static struct inode *afs_alloc_inode(struct super_block *sb) atomic_inc(&afs_count_active_inodes); + /* Reset anything that shouldn't leak from one inode to the next. */ memset(&vnode->fid, 0, sizeof(vnode->fid)); memset(&vnode->status, 0, sizeof(vnode->status)); vnode->volume = NULL; - vnode->update_cnt = 0; + vnode->lock_key = NULL; + vnode->permit_cache = NULL; + vnode->cb_interest = NULL; +#ifdef CONFIG_AFS_FSCACHE + vnode->cache = NULL; +#endif + vnode->flags = 1 << AFS_VNODE_UNSET; - vnode->cb_promised = false; + vnode->cb_type = 0; + vnode->lock_state = AFS_VNODE_LOCK_NONE; _leave(" = %p", &vnode->vfs_inode); return &vnode->vfs_inode; @@ -562,7 +608,7 @@ static void afs_destroy_inode(struct inode *inode) _debug("DESTROY INODE %p", inode); - ASSERTCMP(vnode->server, ==, NULL); + ASSERTCMP(vnode->cb_interest, ==, NULL); call_rcu(&inode->i_rcu, afs_i_callback); atomic_dec(&afs_count_active_inodes); @@ -573,6 +619,7 @@ static void afs_destroy_inode(struct inode *inode) */ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct afs_fs_cursor fc; struct afs_volume_status vs; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); struct key *key; @@ -582,21 +629,32 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) if (IS_ERR(key)) return PTR_ERR(key); - ret = afs_vnode_get_volume_status(vnode, key, &vs); - key_put(key); - if (ret < 0) { - _leave(" = %d", ret); - return ret; + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, key)) { + fc.flags |= AFS_FS_CURSOR_NO_VSLEEP; + while (afs_select_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_get_volume_status(&fc, &vs); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); } - buf->f_type = dentry->d_sb->s_magic; - buf->f_bsize = AFS_BLOCK_SIZE; - buf->f_namelen = AFSNAMEMAX - 1; + key_put(key); - if (vs.max_quota == 0) - buf->f_blocks = vs.part_max_blocks; - else - buf->f_blocks = vs.max_quota; - buf->f_bavail = buf->f_bfree = buf->f_blocks - vs.blocks_in_use; - return 0; + if (ret == 0) { + buf->f_type = dentry->d_sb->s_magic; + buf->f_bsize = AFS_BLOCK_SIZE; + buf->f_namelen = AFSNAMEMAX - 1; + + if (vs.max_quota == 0) + buf->f_blocks = vs.part_max_blocks; + else + buf->f_blocks = vs.max_quota; + buf->f_bavail = buf->f_bfree = buf->f_blocks - vs.blocks_in_use; + } + + return ret; } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index a5e4cc561b6c..e372f89fd36a 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -12,58 +12,19 @@ #include <linux/gfp.h> #include <linux/init.h> #include <linux/sched.h> +#include "afs_fs.h" #include "internal.h" /* - * map volume locator abort codes to error codes + * Deliver reply data to a VL.GetEntryByNameU call. */ -static int afs_vl_abort_to_error(u32 abort_code) +static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) { - _enter("%u", abort_code); - - switch (abort_code) { - case AFSVL_IDEXIST: return -EEXIST; - case AFSVL_IO: return -EREMOTEIO; - case AFSVL_NAMEEXIST: return -EEXIST; - case AFSVL_CREATEFAIL: return -EREMOTEIO; - case AFSVL_NOENT: return -ENOMEDIUM; - case AFSVL_EMPTY: return -ENOMEDIUM; - case AFSVL_ENTDELETED: return -ENOMEDIUM; - case AFSVL_BADNAME: return -EINVAL; - case AFSVL_BADINDEX: return -EINVAL; - case AFSVL_BADVOLTYPE: return -EINVAL; - case AFSVL_BADSERVER: return -EINVAL; - case AFSVL_BADPARTITION: return -EINVAL; - case AFSVL_REPSFULL: return -EFBIG; - case AFSVL_NOREPSERVER: return -ENOENT; - case AFSVL_DUPREPSERVER: return -EEXIST; - case AFSVL_RWNOTFOUND: return -ENOENT; - case AFSVL_BADREFCOUNT: return -EINVAL; - case AFSVL_SIZEEXCEEDED: return -EINVAL; - case AFSVL_BADENTRY: return -EINVAL; - case AFSVL_BADVOLIDBUMP: return -EINVAL; - case AFSVL_IDALREADYHASHED: return -EINVAL; - case AFSVL_ENTRYLOCKED: return -EBUSY; - case AFSVL_BADVOLOPER: return -EBADRQC; - case AFSVL_BADRELLOCKTYPE: return -EINVAL; - case AFSVL_RERELEASE: return -EREMOTEIO; - case AFSVL_BADSERVERFLAG: return -EINVAL; - case AFSVL_PERM: return -EACCES; - case AFSVL_NOMEM: return -EREMOTEIO; - default: - return afs_abort_to_error(abort_code); - } -} - -/* - * deliver reply data to a VL.GetEntryByXXX call - */ -static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call) -{ - struct afs_cache_vlocation *entry; - __be32 *bp; + struct afs_uvldbentry__xdr *uvldb; + struct afs_vldb_entry *entry; + bool new_only = false; u32 tmp; - int loop, ret; + int i, ret; _enter(""); @@ -72,144 +33,613 @@ static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call) return ret; /* unmarshall the reply once we've received all of it */ - entry = call->reply; - bp = call->buffer; - - for (loop = 0; loop < 64; loop++) - entry->name[loop] = ntohl(*bp++); - entry->name[loop] = 0; - bp++; /* final NUL */ + uvldb = call->buffer; + entry = call->reply[0]; - bp++; /* type */ - entry->nservers = ntohl(*bp++); + for (i = 0; i < ARRAY_SIZE(uvldb->name) - 1; i++) + entry->name[i] = (u8)ntohl(uvldb->name[i]); + entry->name[i] = 0; + entry->name_len = strlen(entry->name); - for (loop = 0; loop < 8; loop++) - entry->servers[loop].s_addr = *bp++; + /* If there is a new replication site that we can use, ignore all the + * sites that aren't marked as new. + */ + for (i = 0; i < AFS_NMAXNSERVERS; i++) { + tmp = ntohl(uvldb->serverFlags[i]); + if (!(tmp & AFS_VLSF_DONTUSE) && + (tmp & AFS_VLSF_NEWREPSITE)) + new_only = true; + } - bp += 8; /* partition IDs */ + for (i = 0; i < AFS_NMAXNSERVERS; i++) { + struct afs_uuid__xdr *xdr; + struct afs_uuid *uuid; + int j; - for (loop = 0; loop < 8; loop++) { - tmp = ntohl(*bp++); - entry->srvtmask[loop] = 0; + tmp = ntohl(uvldb->serverFlags[i]); + if (tmp & AFS_VLSF_DONTUSE || + (new_only && !(tmp & AFS_VLSF_NEWREPSITE))) + continue; if (tmp & AFS_VLSF_RWVOL) - entry->srvtmask[loop] |= AFS_VOL_VTM_RW; + entry->fs_mask[i] |= AFS_VOL_VTM_RW; if (tmp & AFS_VLSF_ROVOL) - entry->srvtmask[loop] |= AFS_VOL_VTM_RO; + entry->fs_mask[i] |= AFS_VOL_VTM_RO; if (tmp & AFS_VLSF_BACKVOL) - entry->srvtmask[loop] |= AFS_VOL_VTM_BAK; - } + entry->fs_mask[i] |= AFS_VOL_VTM_BAK; + if (!entry->fs_mask[i]) + continue; - entry->vid[0] = ntohl(*bp++); - entry->vid[1] = ntohl(*bp++); - entry->vid[2] = ntohl(*bp++); + xdr = &uvldb->serverNumber[i]; + uuid = (struct afs_uuid *)&entry->fs_server[i]; + uuid->time_low = xdr->time_low; + uuid->time_mid = htons(ntohl(xdr->time_mid)); + uuid->time_hi_and_version = htons(ntohl(xdr->time_hi_and_version)); + uuid->clock_seq_hi_and_reserved = (u8)ntohl(xdr->clock_seq_hi_and_reserved); + uuid->clock_seq_low = (u8)ntohl(xdr->clock_seq_low); + for (j = 0; j < 6; j++) + uuid->node[j] = (u8)ntohl(xdr->node[j]); - bp++; /* clone ID */ + entry->nr_servers++; + } + + for (i = 0; i < AFS_MAXTYPES; i++) + entry->vid[i] = ntohl(uvldb->volumeId[i]); - tmp = ntohl(*bp++); /* flags */ - entry->vidmask = 0; + tmp = ntohl(uvldb->flags); if (tmp & AFS_VLF_RWEXISTS) - entry->vidmask |= AFS_VOL_VTM_RW; + __set_bit(AFS_VLDB_HAS_RW, &entry->flags); if (tmp & AFS_VLF_ROEXISTS) - entry->vidmask |= AFS_VOL_VTM_RO; + __set_bit(AFS_VLDB_HAS_RO, &entry->flags); if (tmp & AFS_VLF_BACKEXISTS) - entry->vidmask |= AFS_VOL_VTM_BAK; - if (!entry->vidmask) - return -EBADMSG; + __set_bit(AFS_VLDB_HAS_BAK, &entry->flags); + if (!(tmp & (AFS_VLF_RWEXISTS | AFS_VLF_ROEXISTS | AFS_VLF_BACKEXISTS))) { + entry->error = -ENOMEDIUM; + __set_bit(AFS_VLDB_QUERY_ERROR, &entry->flags); + } + + __set_bit(AFS_VLDB_QUERY_VALID, &entry->flags); _leave(" = 0 [done]"); return 0; } -/* - * VL.GetEntryByName operation type - */ -static const struct afs_call_type afs_RXVLGetEntryByName = { - .name = "VL.GetEntryByName", - .deliver = afs_deliver_vl_get_entry_by_xxx, - .abort_to_error = afs_vl_abort_to_error, - .destructor = afs_flat_call_destructor, -}; +static void afs_destroy_vl_get_entry_by_name_u(struct afs_call *call) +{ + kfree(call->reply[0]); + afs_flat_call_destructor(call); +} /* - * VL.GetEntryById operation type + * VL.GetEntryByNameU operation type. */ -static const struct afs_call_type afs_RXVLGetEntryById = { - .name = "VL.GetEntryById", - .deliver = afs_deliver_vl_get_entry_by_xxx, - .abort_to_error = afs_vl_abort_to_error, - .destructor = afs_flat_call_destructor, +static const struct afs_call_type afs_RXVLGetEntryByNameU = { + .name = "VL.GetEntryByNameU", + .op = afs_VL_GetEntryByNameU, + .deliver = afs_deliver_vl_get_entry_by_name_u, + .destructor = afs_destroy_vl_get_entry_by_name_u, }; /* - * dispatch a get volume entry by name operation + * Dispatch a get volume entry by name or ID operation (uuid variant). If the + * volname is a decimal number then it's a volume ID not a volume name. */ -int afs_vl_get_entry_by_name(struct in_addr *addr, - struct key *key, - const char *volname, - struct afs_cache_vlocation *entry, - bool async) +struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net, + struct afs_addr_cursor *ac, + struct key *key, + const char *volname, + int volnamesz) { + struct afs_vldb_entry *entry; struct afs_call *call; - size_t volnamesz, reqsz, padsz; + size_t reqsz, padsz; __be32 *bp; _enter(""); - volnamesz = strlen(volname); padsz = (4 - (volnamesz & 3)) & 3; reqsz = 8 + volnamesz + padsz; - call = afs_alloc_flat_call(&afs_RXVLGetEntryByName, reqsz, 384); - if (!call) - return -ENOMEM; + entry = kzalloc(sizeof(struct afs_vldb_entry), GFP_KERNEL); + if (!entry) + return ERR_PTR(-ENOMEM); + + call = afs_alloc_flat_call(net, &afs_RXVLGetEntryByNameU, reqsz, + sizeof(struct afs_uvldbentry__xdr)); + if (!call) { + kfree(entry); + return ERR_PTR(-ENOMEM); + } call->key = key; - call->reply = entry; - call->service_id = VL_SERVICE; - call->port = htons(AFS_VL_PORT); + call->reply[0] = entry; + call->ret_reply0 = true; - /* marshall the parameters */ + /* Marshall the parameters */ bp = call->request; - *bp++ = htonl(VLGETENTRYBYNAME); + *bp++ = htonl(VLGETENTRYBYNAMEU); *bp++ = htonl(volnamesz); memcpy(bp, volname, volnamesz); if (padsz > 0) - memset((void *) bp + volnamesz, 0, padsz); + memset((void *)bp + volnamesz, 0, padsz); - /* initiate the call */ - return afs_make_call(addr, call, GFP_KERNEL, async); + trace_afs_make_vl_call(call); + return (struct afs_vldb_entry *)afs_make_call(ac, call, GFP_KERNEL, false); } /* - * dispatch a get volume entry by ID operation + * Deliver reply data to a VL.GetAddrsU call. + * + * GetAddrsU(IN ListAddrByAttributes *inaddr, + * OUT afsUUID *uuidp1, + * OUT uint32_t *uniquifier, + * OUT uint32_t *nentries, + * OUT bulkaddrs *blkaddrs); */ -int afs_vl_get_entry_by_id(struct in_addr *addr, - struct key *key, - afs_volid_t volid, - afs_voltype_t voltype, - struct afs_cache_vlocation *entry, - bool async) +static int afs_deliver_vl_get_addrs_u(struct afs_call *call) { + struct afs_addr_list *alist; + __be32 *bp; + u32 uniquifier, nentries, count; + int i, ret; + + _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count); + +again: + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* Extract the returned uuid, uniquifier, nentries and blkaddrs size */ + case 1: + ret = afs_extract_data(call, call->buffer, + sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32), + true); + if (ret < 0) + return ret; + + bp = call->buffer + sizeof(struct afs_uuid__xdr); + uniquifier = ntohl(*bp++); + nentries = ntohl(*bp++); + count = ntohl(*bp); + + nentries = min(nentries, count); + alist = afs_alloc_addrlist(nentries, FS_SERVICE, AFS_FS_PORT); + if (!alist) + return -ENOMEM; + alist->version = uniquifier; + call->reply[0] = alist; + call->count = count; + call->count2 = nentries; + call->offset = 0; + call->unmarshall++; + + /* Extract entries */ + case 2: + count = min(call->count, 4U); + ret = afs_extract_data(call, call->buffer, + count * sizeof(__be32), + call->count > 4); + if (ret < 0) + return ret; + + alist = call->reply[0]; + bp = call->buffer; + for (i = 0; i < count; i++) + if (alist->nr_addrs < call->count2) + afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT); + + call->count -= count; + if (call->count > 0) + goto again; + call->offset = 0; + call->unmarshall++; + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +static void afs_vl_get_addrs_u_destructor(struct afs_call *call) +{ + afs_put_server(call->net, (struct afs_server *)call->reply[0]); + kfree(call->reply[1]); + return afs_flat_call_destructor(call); +} + +/* + * VL.GetAddrsU operation type. + */ +static const struct afs_call_type afs_RXVLGetAddrsU = { + .name = "VL.GetAddrsU", + .op = afs_VL_GetAddrsU, + .deliver = afs_deliver_vl_get_addrs_u, + .destructor = afs_vl_get_addrs_u_destructor, +}; + +/* + * Dispatch an operation to get the addresses for a server, where the server is + * nominated by UUID. + */ +struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net, + struct afs_addr_cursor *ac, + struct key *key, + const uuid_t *uuid) +{ + struct afs_ListAddrByAttributes__xdr *r; + const struct afs_uuid *u = (const struct afs_uuid *)uuid; struct afs_call *call; __be32 *bp; + int i; _enter(""); - call = afs_alloc_flat_call(&afs_RXVLGetEntryById, 12, 384); + call = afs_alloc_flat_call(net, &afs_RXVLGetAddrsU, + sizeof(__be32) + sizeof(struct afs_ListAddrByAttributes__xdr), + sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32)); + if (!call) + return ERR_PTR(-ENOMEM); + + call->key = key; + call->reply[0] = NULL; + call->ret_reply0 = true; + + /* Marshall the parameters */ + bp = call->request; + *bp++ = htonl(VLGETADDRSU); + r = (struct afs_ListAddrByAttributes__xdr *)bp; + r->Mask = htonl(AFS_VLADDR_UUID); + r->ipaddr = 0; + r->index = 0; + r->spare = 0; + r->uuid.time_low = u->time_low; + r->uuid.time_mid = htonl(ntohs(u->time_mid)); + r->uuid.time_hi_and_version = htonl(ntohs(u->time_hi_and_version)); + r->uuid.clock_seq_hi_and_reserved = htonl(u->clock_seq_hi_and_reserved); + r->uuid.clock_seq_low = htonl(u->clock_seq_low); + for (i = 0; i < 6; i++) + r->uuid.node[i] = ntohl(u->node[i]); + + trace_afs_make_vl_call(call); + return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false); +} + +/* + * Deliver reply data to an VL.GetCapabilities operation. + */ +static int afs_deliver_vl_get_capabilities(struct afs_call *call) +{ + u32 count; + int ret; + + _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count); + +again: + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* Extract the capabilities word count */ + case 1: + ret = afs_extract_data(call, &call->tmp, + 1 * sizeof(__be32), + true); + if (ret < 0) + return ret; + + count = ntohl(call->tmp); + + call->count = count; + call->count2 = count; + call->offset = 0; + call->unmarshall++; + + /* Extract capabilities words */ + case 2: + count = min(call->count, 16U); + ret = afs_extract_data(call, call->buffer, + count * sizeof(__be32), + call->count > 16); + if (ret < 0) + return ret; + + /* TODO: Examine capabilities */ + + call->count -= count; + if (call->count > 0) + goto again; + call->offset = 0; + call->unmarshall++; + break; + } + + call->reply[0] = (void *)(unsigned long)call->service_id; + + _leave(" = 0 [done]"); + return 0; +} + +/* + * VL.GetCapabilities operation type + */ +static const struct afs_call_type afs_RXVLGetCapabilities = { + .name = "VL.GetCapabilities", + .op = afs_VL_GetCapabilities, + .deliver = afs_deliver_vl_get_capabilities, + .destructor = afs_flat_call_destructor, +}; + +/* + * Probe a fileserver for the capabilities that it supports. This can + * return up to 196 words. + * + * We use this to probe for service upgrade to determine what the server at the + * other end supports. + */ +int afs_vl_get_capabilities(struct afs_net *net, + struct afs_addr_cursor *ac, + struct key *key) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &afs_RXVLGetCapabilities, 1 * 4, 16 * 4); if (!call) return -ENOMEM; call->key = key; - call->reply = entry; - call->service_id = VL_SERVICE; - call->port = htons(AFS_VL_PORT); + call->upgrade = true; /* Let's see if this is a YFS server */ + call->reply[0] = (void *)VLGETCAPABILITIES; + call->ret_reply0 = true; /* marshall the parameters */ bp = call->request; - *bp++ = htonl(VLGETENTRYBYID); - *bp++ = htonl(volid); - *bp = htonl(voltype); + *bp++ = htonl(VLGETCAPABILITIES); + + /* Can't take a ref on server */ + trace_afs_make_vl_call(call); + return afs_make_call(ac, call, GFP_KERNEL, false); +} + +/* + * Deliver reply data to a YFSVL.GetEndpoints call. + * + * GetEndpoints(IN yfsServerAttributes *attr, + * OUT opr_uuid *uuid, + * OUT afs_int32 *uniquifier, + * OUT endpoints *fsEndpoints, + * OUT endpoints *volEndpoints) + */ +static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) +{ + struct afs_addr_list *alist; + __be32 *bp; + u32 uniquifier, size; + int ret; + + _enter("{%u,%zu/%u,%u}", call->unmarshall, call->offset, call->count, call->count2); + +again: + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall = 1; + + /* Extract the returned uuid, uniquifier, fsEndpoints count and + * either the first fsEndpoint type or the volEndpoints + * count if there are no fsEndpoints. */ + case 1: + ret = afs_extract_data(call, call->buffer, + sizeof(uuid_t) + + 3 * sizeof(__be32), + true); + if (ret < 0) + return ret; + + bp = call->buffer + sizeof(uuid_t); + uniquifier = ntohl(*bp++); + call->count = ntohl(*bp++); + call->count2 = ntohl(*bp); /* Type or next count */ + + if (call->count > YFS_MAXENDPOINTS) + return -EBADMSG; + + alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT); + if (!alist) + return -ENOMEM; + alist->version = uniquifier; + call->reply[0] = alist; + call->offset = 0; + + if (call->count == 0) + goto extract_volendpoints; + + call->unmarshall = 2; + + /* Extract fsEndpoints[] entries */ + case 2: + switch (call->count2) { + case YFS_ENDPOINT_IPV4: + size = sizeof(__be32) * (1 + 1 + 1); + break; + case YFS_ENDPOINT_IPV6: + size = sizeof(__be32) * (1 + 4 + 1); + break; + default: + return -EBADMSG; + } + + size += sizeof(__be32); + ret = afs_extract_data(call, call->buffer, size, true); + if (ret < 0) + return ret; + + alist = call->reply[0]; + bp = call->buffer; + switch (call->count2) { + case YFS_ENDPOINT_IPV4: + if (ntohl(bp[0]) != sizeof(__be32) * 2) + return -EBADMSG; + afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2])); + bp += 3; + break; + case YFS_ENDPOINT_IPV6: + if (ntohl(bp[0]) != sizeof(__be32) * 5) + return -EBADMSG; + afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5])); + bp += 6; + break; + default: + return -EBADMSG; + } + + /* Got either the type of the next entry or the count of + * volEndpoints if no more fsEndpoints. + */ + call->count2 = htonl(*bp++); + + call->offset = 0; + call->count--; + if (call->count > 0) + goto again; + + extract_volendpoints: + /* Extract the list of volEndpoints. */ + call->count = call->count2; + if (!call->count) + goto end; + if (call->count > YFS_MAXENDPOINTS) + return -EBADMSG; + + call->unmarshall = 3; + + /* Extract the type of volEndpoints[0]. Normally we would + * extract the type of the next endpoint when we extract the + * data of the current one, but this is the first... + */ + case 3: + ret = afs_extract_data(call, call->buffer, sizeof(__be32), true); + if (ret < 0) + return ret; + + bp = call->buffer; + call->count2 = htonl(*bp++); + call->offset = 0; + call->unmarshall = 4; + + /* Extract volEndpoints[] entries */ + case 4: + switch (call->count2) { + case YFS_ENDPOINT_IPV4: + size = sizeof(__be32) * (1 + 1 + 1); + break; + case YFS_ENDPOINT_IPV6: + size = sizeof(__be32) * (1 + 4 + 1); + break; + default: + return -EBADMSG; + } + + if (call->count > 1) + size += sizeof(__be32); + ret = afs_extract_data(call, call->buffer, size, true); + if (ret < 0) + return ret; + + bp = call->buffer; + switch (call->count2) { + case YFS_ENDPOINT_IPV4: + if (ntohl(bp[0]) != sizeof(__be32) * 2) + return -EBADMSG; + bp += 3; + break; + case YFS_ENDPOINT_IPV6: + if (ntohl(bp[0]) != sizeof(__be32) * 5) + return -EBADMSG; + bp += 6; + break; + default: + return -EBADMSG; + } + + /* Got either the type of the next entry or the count of + * volEndpoints if no more fsEndpoints. + */ + call->offset = 0; + call->count--; + if (call->count > 0) { + call->count2 = htonl(*bp++); + goto again; + } + + end: + call->unmarshall = 5; + + /* Done */ + case 5: + ret = afs_extract_data(call, call->buffer, 0, false); + if (ret < 0) + return ret; + call->unmarshall = 6; + + case 6: + break; + } + + alist = call->reply[0]; + + /* Start with IPv6 if available. */ + if (alist->nr_ipv4 < alist->nr_addrs) + alist->index = alist->nr_ipv4; + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFSVL.GetEndpoints operation type. + */ +static const struct afs_call_type afs_YFSVLGetEndpoints = { + .name = "YFSVL.GetEndpoints", + .op = afs_YFSVL_GetEndpoints, + .deliver = afs_deliver_yfsvl_get_endpoints, + .destructor = afs_vl_get_addrs_u_destructor, +}; + +/* + * Dispatch an operation to get the addresses for a server, where the server is + * nominated by UUID. + */ +struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net, + struct afs_addr_cursor *ac, + struct key *key, + const uuid_t *uuid) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &afs_YFSVLGetEndpoints, + sizeof(__be32) * 2 + sizeof(*uuid), + sizeof(struct in6_addr) + sizeof(__be32) * 3); + if (!call) + return ERR_PTR(-ENOMEM); + + call->key = key; + call->reply[0] = NULL; + call->ret_reply0 = true; + + /* Marshall the parameters */ + bp = call->request; + *bp++ = htonl(YVLGETENDPOINTS); + *bp++ = htonl(YFS_SERVER_UUID); + memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */ - /* initiate the call */ - return afs_make_call(addr, call, GFP_KERNEL, async); + trace_afs_make_vl_call(call); + return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false); } diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c deleted file mode 100644 index 37b7c3b342a6..000000000000 --- a/fs/afs/vlocation.c +++ /dev/null @@ -1,720 +0,0 @@ -/* AFS volume location management - * - * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/sched.h> -#include "internal.h" - -static unsigned afs_vlocation_timeout = 10; /* volume location timeout in seconds */ -static unsigned afs_vlocation_update_timeout = 10 * 60; - -static void afs_vlocation_reaper(struct work_struct *); -static void afs_vlocation_updater(struct work_struct *); - -static LIST_HEAD(afs_vlocation_updates); -static LIST_HEAD(afs_vlocation_graveyard); -static DEFINE_SPINLOCK(afs_vlocation_updates_lock); -static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock); -static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper); -static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater); -static struct workqueue_struct *afs_vlocation_update_worker; - -/* - * iterate through the VL servers in a cell until one of them admits knowing - * about the volume in question - */ -static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl, - struct key *key, - struct afs_cache_vlocation *vldb) -{ - struct afs_cell *cell = vl->cell; - struct in_addr addr; - int count, ret; - - _enter("%s,%s", cell->name, vl->vldb.name); - - down_write(&vl->cell->vl_sem); - ret = -ENOMEDIUM; - for (count = cell->vl_naddrs; count > 0; count--) { - addr = cell->vl_addrs[cell->vl_curr_svix]; - - _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr); - - /* attempt to access the VL server */ - ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb, - false); - switch (ret) { - case 0: - goto out; - case -ENOMEM: - case -ENONET: - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - if (ret == -ENOMEM || ret == -ENONET) - goto out; - goto rotate; - case -ENOMEDIUM: - case -EKEYREJECTED: - case -EKEYEXPIRED: - goto out; - default: - ret = -EIO; - goto rotate; - } - - /* rotate the server records upon lookup failure */ - rotate: - cell->vl_curr_svix++; - cell->vl_curr_svix %= cell->vl_naddrs; - } - -out: - up_write(&vl->cell->vl_sem); - _leave(" = %d", ret); - return ret; -} - -/* - * iterate through the VL servers in a cell until one of them admits knowing - * about the volume in question - */ -static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl, - struct key *key, - afs_volid_t volid, - afs_voltype_t voltype, - struct afs_cache_vlocation *vldb) -{ - struct afs_cell *cell = vl->cell; - struct in_addr addr; - int count, ret; - - _enter("%s,%x,%d,", cell->name, volid, voltype); - - down_write(&vl->cell->vl_sem); - ret = -ENOMEDIUM; - for (count = cell->vl_naddrs; count > 0; count--) { - addr = cell->vl_addrs[cell->vl_curr_svix]; - - _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr); - - /* attempt to access the VL server */ - ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb, - false); - switch (ret) { - case 0: - goto out; - case -ENOMEM: - case -ENONET: - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - if (ret == -ENOMEM || ret == -ENONET) - goto out; - goto rotate; - case -EBUSY: - vl->upd_busy_cnt++; - if (vl->upd_busy_cnt <= 3) { - if (vl->upd_busy_cnt > 1) { - /* second+ BUSY - sleep a little bit */ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(1); - } - continue; - } - break; - case -ENOMEDIUM: - vl->upd_rej_cnt++; - goto rotate; - default: - ret = -EIO; - goto rotate; - } - - /* rotate the server records upon lookup failure */ - rotate: - cell->vl_curr_svix++; - cell->vl_curr_svix %= cell->vl_naddrs; - vl->upd_busy_cnt = 0; - } - -out: - if (ret < 0 && vl->upd_rej_cnt > 0) { - printk(KERN_NOTICE "kAFS:" - " Active volume no longer valid '%s'\n", - vl->vldb.name); - vl->valid = 0; - ret = -ENOMEDIUM; - } - - up_write(&vl->cell->vl_sem); - _leave(" = %d", ret); - return ret; -} - -/* - * allocate a volume location record - */ -static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell, - const char *name, - size_t namesz) -{ - struct afs_vlocation *vl; - - vl = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL); - if (vl) { - vl->cell = cell; - vl->state = AFS_VL_NEW; - atomic_set(&vl->usage, 1); - INIT_LIST_HEAD(&vl->link); - INIT_LIST_HEAD(&vl->grave); - INIT_LIST_HEAD(&vl->update); - init_waitqueue_head(&vl->waitq); - spin_lock_init(&vl->lock); - memcpy(vl->vldb.name, name, namesz); - } - - _leave(" = %p", vl); - return vl; -} - -/* - * update record if we found it in the cache - */ -static int afs_vlocation_update_record(struct afs_vlocation *vl, - struct key *key, - struct afs_cache_vlocation *vldb) -{ - afs_voltype_t voltype; - afs_volid_t vid; - int ret; - - /* try to look up a cached volume in the cell VL databases by ID */ - _debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }", - vl->vldb.name, - vl->vldb.vidmask, - ntohl(vl->vldb.servers[0].s_addr), - vl->vldb.srvtmask[0], - ntohl(vl->vldb.servers[1].s_addr), - vl->vldb.srvtmask[1], - ntohl(vl->vldb.servers[2].s_addr), - vl->vldb.srvtmask[2]); - - _debug("Vids: %08x %08x %08x", - vl->vldb.vid[0], - vl->vldb.vid[1], - vl->vldb.vid[2]); - - if (vl->vldb.vidmask & AFS_VOL_VTM_RW) { - vid = vl->vldb.vid[0]; - voltype = AFSVL_RWVOL; - } else if (vl->vldb.vidmask & AFS_VOL_VTM_RO) { - vid = vl->vldb.vid[1]; - voltype = AFSVL_ROVOL; - } else if (vl->vldb.vidmask & AFS_VOL_VTM_BAK) { - vid = vl->vldb.vid[2]; - voltype = AFSVL_BACKVOL; - } else { - BUG(); - vid = 0; - voltype = 0; - } - - /* contact the server to make sure the volume is still available - * - TODO: need to handle disconnected operation here - */ - ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb); - switch (ret) { - /* net error */ - default: - printk(KERN_WARNING "kAFS:" - " failed to update volume '%s' (%x) up in '%s': %d\n", - vl->vldb.name, vid, vl->cell->name, ret); - _leave(" = %d", ret); - return ret; - - /* pulled from local cache into memory */ - case 0: - _leave(" = 0"); - return 0; - - /* uh oh... looks like the volume got deleted */ - case -ENOMEDIUM: - printk(KERN_ERR "kAFS:" - " volume '%s' (%x) does not exist '%s'\n", - vl->vldb.name, vid, vl->cell->name); - - /* TODO: make existing record unavailable */ - _leave(" = %d", ret); - return ret; - } -} - -/* - * apply the update to a VL record - */ -static void afs_vlocation_apply_update(struct afs_vlocation *vl, - struct afs_cache_vlocation *vldb) -{ - _debug("Done VL Lookup: %s %02x { %08x(%x) %08x(%x) %08x(%x) }", - vldb->name, vldb->vidmask, - ntohl(vldb->servers[0].s_addr), vldb->srvtmask[0], - ntohl(vldb->servers[1].s_addr), vldb->srvtmask[1], - ntohl(vldb->servers[2].s_addr), vldb->srvtmask[2]); - - _debug("Vids: %08x %08x %08x", - vldb->vid[0], vldb->vid[1], vldb->vid[2]); - - if (strcmp(vldb->name, vl->vldb.name) != 0) - printk(KERN_NOTICE "kAFS:" - " name of volume '%s' changed to '%s' on server\n", - vl->vldb.name, vldb->name); - - vl->vldb = *vldb; - -#ifdef CONFIG_AFS_FSCACHE - fscache_update_cookie(vl->cache); -#endif -} - -/* - * fill in a volume location record, consulting the cache and the VL server - * both - */ -static int afs_vlocation_fill_in_record(struct afs_vlocation *vl, - struct key *key) -{ - struct afs_cache_vlocation vldb; - int ret; - - _enter(""); - - ASSERTCMP(vl->valid, ==, 0); - - memset(&vldb, 0, sizeof(vldb)); - - /* see if we have an in-cache copy (will set vl->valid if there is) */ -#ifdef CONFIG_AFS_FSCACHE - vl->cache = fscache_acquire_cookie(vl->cell->cache, - &afs_vlocation_cache_index_def, vl, - true); -#endif - - if (vl->valid) { - /* try to update a known volume in the cell VL databases by - * ID as the name may have changed */ - _debug("found in cache"); - ret = afs_vlocation_update_record(vl, key, &vldb); - } else { - /* try to look up an unknown volume in the cell VL databases by - * name */ - ret = afs_vlocation_access_vl_by_name(vl, key, &vldb); - if (ret < 0) { - printk("kAFS: failed to locate '%s' in cell '%s'\n", - vl->vldb.name, vl->cell->name); - return ret; - } - } - - afs_vlocation_apply_update(vl, &vldb); - _leave(" = 0"); - return 0; -} - -/* - * queue a vlocation record for updates - */ -static void afs_vlocation_queue_for_updates(struct afs_vlocation *vl) -{ - struct afs_vlocation *xvl; - - /* wait at least 10 minutes before updating... */ - vl->update_at = ktime_get_real_seconds() + - afs_vlocation_update_timeout; - - spin_lock(&afs_vlocation_updates_lock); - - if (!list_empty(&afs_vlocation_updates)) { - /* ... but wait at least 1 second more than the newest record - * already queued so that we don't spam the VL server suddenly - * with lots of requests - */ - xvl = list_entry(afs_vlocation_updates.prev, - struct afs_vlocation, update); - if (vl->update_at <= xvl->update_at) - vl->update_at = xvl->update_at + 1; - } else { - queue_delayed_work(afs_vlocation_update_worker, - &afs_vlocation_update, - afs_vlocation_update_timeout * HZ); - } - - list_add_tail(&vl->update, &afs_vlocation_updates); - spin_unlock(&afs_vlocation_updates_lock); -} - -/* - * lookup volume location - * - iterate through the VL servers in a cell until one of them admits knowing - * about the volume in question - * - lookup in the local cache if not able to find on the VL server - * - insert/update in the local cache if did get a VL response - */ -struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell, - struct key *key, - const char *name, - size_t namesz) -{ - struct afs_vlocation *vl; - int ret; - - _enter("{%s},{%x},%*.*s,%zu", - cell->name, key_serial(key), - (int) namesz, (int) namesz, name, namesz); - - if (namesz >= sizeof(vl->vldb.name)) { - _leave(" = -ENAMETOOLONG"); - return ERR_PTR(-ENAMETOOLONG); - } - - /* see if we have an in-memory copy first */ - down_write(&cell->vl_sem); - spin_lock(&cell->vl_lock); - list_for_each_entry(vl, &cell->vl_list, link) { - if (vl->vldb.name[namesz] != '\0') - continue; - if (memcmp(vl->vldb.name, name, namesz) == 0) - goto found_in_memory; - } - spin_unlock(&cell->vl_lock); - - /* not in the cell's in-memory lists - create a new record */ - vl = afs_vlocation_alloc(cell, name, namesz); - if (!vl) { - up_write(&cell->vl_sem); - return ERR_PTR(-ENOMEM); - } - - afs_get_cell(cell); - - list_add_tail(&vl->link, &cell->vl_list); - vl->state = AFS_VL_CREATING; - up_write(&cell->vl_sem); - -fill_in_record: - ret = afs_vlocation_fill_in_record(vl, key); - if (ret < 0) - goto error_abandon; - spin_lock(&vl->lock); - vl->state = AFS_VL_VALID; - spin_unlock(&vl->lock); - wake_up(&vl->waitq); - - /* update volume entry in local cache */ -#ifdef CONFIG_AFS_FSCACHE - fscache_update_cookie(vl->cache); -#endif - - /* schedule for regular updates */ - afs_vlocation_queue_for_updates(vl); - goto success; - -found_in_memory: - /* found in memory */ - _debug("found in memory"); - atomic_inc(&vl->usage); - spin_unlock(&cell->vl_lock); - if (!list_empty(&vl->grave)) { - spin_lock(&afs_vlocation_graveyard_lock); - list_del_init(&vl->grave); - spin_unlock(&afs_vlocation_graveyard_lock); - } - up_write(&cell->vl_sem); - - /* see if it was an abandoned record that we might try filling in */ - spin_lock(&vl->lock); - while (vl->state != AFS_VL_VALID) { - afs_vlocation_state_t state = vl->state; - - _debug("invalid [state %d]", state); - - if (state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME) { - vl->state = AFS_VL_CREATING; - spin_unlock(&vl->lock); - goto fill_in_record; - } - - /* must now wait for creation or update by someone else to - * complete */ - _debug("wait"); - - spin_unlock(&vl->lock); - ret = wait_event_interruptible(vl->waitq, - vl->state == AFS_VL_NEW || - vl->state == AFS_VL_VALID || - vl->state == AFS_VL_NO_VOLUME); - if (ret < 0) - goto error; - spin_lock(&vl->lock); - } - spin_unlock(&vl->lock); - -success: - _leave(" = %p", vl); - return vl; - -error_abandon: - spin_lock(&vl->lock); - vl->state = AFS_VL_NEW; - spin_unlock(&vl->lock); - wake_up(&vl->waitq); -error: - ASSERT(vl != NULL); - afs_put_vlocation(vl); - _leave(" = %d", ret); - return ERR_PTR(ret); -} - -/* - * finish using a volume location record - */ -void afs_put_vlocation(struct afs_vlocation *vl) -{ - if (!vl) - return; - - _enter("%s", vl->vldb.name); - - ASSERTCMP(atomic_read(&vl->usage), >, 0); - - if (likely(!atomic_dec_and_test(&vl->usage))) { - _leave(""); - return; - } - - spin_lock(&afs_vlocation_graveyard_lock); - if (atomic_read(&vl->usage) == 0) { - _debug("buried"); - list_move_tail(&vl->grave, &afs_vlocation_graveyard); - vl->time_of_death = ktime_get_real_seconds(); - queue_delayed_work(afs_wq, &afs_vlocation_reap, - afs_vlocation_timeout * HZ); - - /* suspend updates on this record */ - if (!list_empty(&vl->update)) { - spin_lock(&afs_vlocation_updates_lock); - list_del_init(&vl->update); - spin_unlock(&afs_vlocation_updates_lock); - } - } - spin_unlock(&afs_vlocation_graveyard_lock); - _leave(" [killed?]"); -} - -/* - * destroy a dead volume location record - */ -static void afs_vlocation_destroy(struct afs_vlocation *vl) -{ - _enter("%p", vl); - -#ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(vl->cache, 0); -#endif - afs_put_cell(vl->cell); - kfree(vl); -} - -/* - * reap dead volume location records - */ -static void afs_vlocation_reaper(struct work_struct *work) -{ - LIST_HEAD(corpses); - struct afs_vlocation *vl; - unsigned long delay, expiry; - time64_t now; - - _enter(""); - - now = ktime_get_real_seconds(); - spin_lock(&afs_vlocation_graveyard_lock); - - while (!list_empty(&afs_vlocation_graveyard)) { - vl = list_entry(afs_vlocation_graveyard.next, - struct afs_vlocation, grave); - - _debug("check %p", vl); - - /* the queue is ordered most dead first */ - expiry = vl->time_of_death + afs_vlocation_timeout; - if (expiry > now) { - delay = (expiry - now) * HZ; - _debug("delay %lu", delay); - mod_delayed_work(afs_wq, &afs_vlocation_reap, delay); - break; - } - - spin_lock(&vl->cell->vl_lock); - if (atomic_read(&vl->usage) > 0) { - _debug("no reap"); - list_del_init(&vl->grave); - } else { - _debug("reap"); - list_move_tail(&vl->grave, &corpses); - list_del_init(&vl->link); - } - spin_unlock(&vl->cell->vl_lock); - } - - spin_unlock(&afs_vlocation_graveyard_lock); - - /* now reap the corpses we've extracted */ - while (!list_empty(&corpses)) { - vl = list_entry(corpses.next, struct afs_vlocation, grave); - list_del(&vl->grave); - afs_vlocation_destroy(vl); - } - - _leave(""); -} - -/* - * initialise the VL update process - */ -int __init afs_vlocation_update_init(void) -{ - afs_vlocation_update_worker = alloc_workqueue("kafs_vlupdated", - WQ_MEM_RECLAIM, 0); - return afs_vlocation_update_worker ? 0 : -ENOMEM; -} - -/* - * discard all the volume location records for rmmod - */ -void afs_vlocation_purge(void) -{ - afs_vlocation_timeout = 0; - - spin_lock(&afs_vlocation_updates_lock); - list_del_init(&afs_vlocation_updates); - spin_unlock(&afs_vlocation_updates_lock); - mod_delayed_work(afs_vlocation_update_worker, &afs_vlocation_update, 0); - destroy_workqueue(afs_vlocation_update_worker); - - mod_delayed_work(afs_wq, &afs_vlocation_reap, 0); -} - -/* - * update a volume location - */ -static void afs_vlocation_updater(struct work_struct *work) -{ - struct afs_cache_vlocation vldb; - struct afs_vlocation *vl, *xvl; - time64_t now; - long timeout; - int ret; - - _enter(""); - - now = ktime_get_real_seconds(); - - /* find a record to update */ - spin_lock(&afs_vlocation_updates_lock); - for (;;) { - if (list_empty(&afs_vlocation_updates)) { - spin_unlock(&afs_vlocation_updates_lock); - _leave(" [nothing]"); - return; - } - - vl = list_entry(afs_vlocation_updates.next, - struct afs_vlocation, update); - if (atomic_read(&vl->usage) > 0) - break; - list_del_init(&vl->update); - } - - timeout = vl->update_at - now; - if (timeout > 0) { - queue_delayed_work(afs_vlocation_update_worker, - &afs_vlocation_update, timeout * HZ); - spin_unlock(&afs_vlocation_updates_lock); - _leave(" [nothing]"); - return; - } - - list_del_init(&vl->update); - atomic_inc(&vl->usage); - spin_unlock(&afs_vlocation_updates_lock); - - /* we can now perform the update */ - _debug("update %s", vl->vldb.name); - vl->state = AFS_VL_UPDATING; - vl->upd_rej_cnt = 0; - vl->upd_busy_cnt = 0; - - ret = afs_vlocation_update_record(vl, NULL, &vldb); - spin_lock(&vl->lock); - switch (ret) { - case 0: - afs_vlocation_apply_update(vl, &vldb); - vl->state = AFS_VL_VALID; - break; - case -ENOMEDIUM: - vl->state = AFS_VL_VOLUME_DELETED; - break; - default: - vl->state = AFS_VL_UNCERTAIN; - break; - } - spin_unlock(&vl->lock); - wake_up(&vl->waitq); - - /* and then reschedule */ - _debug("reschedule"); - vl->update_at = ktime_get_real_seconds() + - afs_vlocation_update_timeout; - - spin_lock(&afs_vlocation_updates_lock); - - if (!list_empty(&afs_vlocation_updates)) { - /* next update in 10 minutes, but wait at least 1 second more - * than the newest record already queued so that we don't spam - * the VL server suddenly with lots of requests - */ - xvl = list_entry(afs_vlocation_updates.prev, - struct afs_vlocation, update); - if (vl->update_at <= xvl->update_at) - vl->update_at = xvl->update_at + 1; - xvl = list_entry(afs_vlocation_updates.next, - struct afs_vlocation, update); - timeout = xvl->update_at - now; - if (timeout < 0) - timeout = 0; - } else { - timeout = afs_vlocation_update_timeout; - } - - ASSERT(list_empty(&vl->update)); - - list_add_tail(&vl->update, &afs_vlocation_updates); - - _debug("timeout %ld", timeout); - queue_delayed_work(afs_vlocation_update_worker, - &afs_vlocation_update, timeout * HZ); - spin_unlock(&afs_vlocation_updates_lock); - afs_put_vlocation(vl); -} diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c deleted file mode 100644 index dcb956143c86..000000000000 --- a/fs/afs/vnode.c +++ /dev/null @@ -1,1025 +0,0 @@ -/* AFS vnode management - * - * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/fs.h> -#include <linux/sched.h> -#include "internal.h" - -#if 0 -static noinline bool dump_tree_aux(struct rb_node *node, struct rb_node *parent, - int depth, char lr) -{ - struct afs_vnode *vnode; - bool bad = false; - - if (!node) - return false; - - if (node->rb_left) - bad = dump_tree_aux(node->rb_left, node, depth + 2, '/'); - - vnode = rb_entry(node, struct afs_vnode, cb_promise); - _debug("%c %*.*s%c%p {%d}", - rb_is_red(node) ? 'R' : 'B', - depth, depth, "", lr, - vnode, vnode->cb_expires_at); - if (rb_parent(node) != parent) { - printk("BAD: %p != %p\n", rb_parent(node), parent); - bad = true; - } - - if (node->rb_right) - bad |= dump_tree_aux(node->rb_right, node, depth + 2, '\\'); - - return bad; -} - -static noinline void dump_tree(const char *name, struct afs_server *server) -{ - _enter("%s", name); - if (dump_tree_aux(server->cb_promises.rb_node, NULL, 0, '-')) - BUG(); -} -#endif - -/* - * insert a vnode into the backing server's vnode tree - */ -static void afs_install_vnode(struct afs_vnode *vnode, - struct afs_server *server) -{ - struct afs_server *old_server = vnode->server; - struct afs_vnode *xvnode; - struct rb_node *parent, **p; - - _enter("%p,%p", vnode, server); - - if (old_server) { - spin_lock(&old_server->fs_lock); - rb_erase(&vnode->server_rb, &old_server->fs_vnodes); - spin_unlock(&old_server->fs_lock); - } - - afs_get_server(server); - vnode->server = server; - afs_put_server(old_server); - - /* insert into the server's vnode tree in FID order */ - spin_lock(&server->fs_lock); - - parent = NULL; - p = &server->fs_vnodes.rb_node; - while (*p) { - parent = *p; - xvnode = rb_entry(parent, struct afs_vnode, server_rb); - if (vnode->fid.vid < xvnode->fid.vid) - p = &(*p)->rb_left; - else if (vnode->fid.vid > xvnode->fid.vid) - p = &(*p)->rb_right; - else if (vnode->fid.vnode < xvnode->fid.vnode) - p = &(*p)->rb_left; - else if (vnode->fid.vnode > xvnode->fid.vnode) - p = &(*p)->rb_right; - else if (vnode->fid.unique < xvnode->fid.unique) - p = &(*p)->rb_left; - else if (vnode->fid.unique > xvnode->fid.unique) - p = &(*p)->rb_right; - else - BUG(); /* can't happen unless afs_iget() malfunctions */ - } - - rb_link_node(&vnode->server_rb, parent, p); - rb_insert_color(&vnode->server_rb, &server->fs_vnodes); - - spin_unlock(&server->fs_lock); - _leave(""); -} - -/* - * insert a vnode into the promising server's update/expiration tree - * - caller must hold vnode->lock - */ -static void afs_vnode_note_promise(struct afs_vnode *vnode, - struct afs_server *server) -{ - struct afs_server *old_server; - struct afs_vnode *xvnode; - struct rb_node *parent, **p; - - _enter("%p,%p", vnode, server); - - ASSERT(server != NULL); - - old_server = vnode->server; - if (vnode->cb_promised) { - if (server == old_server && - vnode->cb_expires == vnode->cb_expires_at) { - _leave(" [no change]"); - return; - } - - spin_lock(&old_server->cb_lock); - if (vnode->cb_promised) { - _debug("delete"); - rb_erase(&vnode->cb_promise, &old_server->cb_promises); - vnode->cb_promised = false; - } - spin_unlock(&old_server->cb_lock); - } - - if (vnode->server != server) - afs_install_vnode(vnode, server); - - vnode->cb_expires_at = vnode->cb_expires; - _debug("PROMISE on %p {%lu}", - vnode, (unsigned long) vnode->cb_expires_at); - - /* abuse an RB-tree to hold the expiration order (we may have multiple - * items with the same expiration time) */ - spin_lock(&server->cb_lock); - - parent = NULL; - p = &server->cb_promises.rb_node; - while (*p) { - parent = *p; - xvnode = rb_entry(parent, struct afs_vnode, cb_promise); - if (vnode->cb_expires_at < xvnode->cb_expires_at) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - - rb_link_node(&vnode->cb_promise, parent, p); - rb_insert_color(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = true; - - spin_unlock(&server->cb_lock); - _leave(""); -} - -/* - * handle remote file deletion by discarding the callback promise - */ -static void afs_vnode_deleted_remotely(struct afs_vnode *vnode) -{ - struct afs_server *server; - - _enter("{%p}", vnode->server); - - set_bit(AFS_VNODE_DELETED, &vnode->flags); - - server = vnode->server; - if (server) { - if (vnode->cb_promised) { - spin_lock(&server->cb_lock); - if (vnode->cb_promised) { - rb_erase(&vnode->cb_promise, - &server->cb_promises); - vnode->cb_promised = false; - } - spin_unlock(&server->cb_lock); - } - - spin_lock(&server->fs_lock); - rb_erase(&vnode->server_rb, &server->fs_vnodes); - spin_unlock(&server->fs_lock); - - vnode->server = NULL; - afs_put_server(server); - } else { - ASSERT(!vnode->cb_promised); - } - - _leave(""); -} - -/* - * finish off updating the recorded status of a file after a successful - * operation completion - * - starts callback expiry timer - * - adds to server's callback list - */ -void afs_vnode_finalise_status_update(struct afs_vnode *vnode, - struct afs_server *server) -{ - struct afs_server *oldserver = NULL; - - _enter("%p,%p", vnode, server); - - spin_lock(&vnode->lock); - clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - afs_vnode_note_promise(vnode, server); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - - wake_up_all(&vnode->update_waitq); - afs_put_server(oldserver); - _leave(""); -} - -/* - * finish off updating the recorded status of a file after an operation failed - */ -static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret) -{ - _enter("{%x:%u},%d", vnode->fid.vid, vnode->fid.vnode, ret); - - spin_lock(&vnode->lock); - - clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - - if (ret == -ENOENT) { - /* the file was deleted on the server */ - _debug("got NOENT from server - marking file deleted"); - afs_vnode_deleted_remotely(vnode); - } - - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - - wake_up_all(&vnode->update_waitq); - _leave(""); -} - -/* - * fetch file status from the volume - * - don't issue a fetch if: - * - the changed bit is not set and there's a valid callback - * - there are any outstanding ops that will fetch the status - * - TODO implement local caching - */ -int afs_vnode_fetch_status(struct afs_vnode *vnode, - struct afs_vnode *auth_vnode, struct key *key) -{ - struct afs_server *server; - unsigned long acl_order; - int ret; - - DECLARE_WAITQUEUE(myself, current); - - _enter("%s,{%x:%u.%u}", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); - - if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && - vnode->cb_promised) { - _leave(" [unchanged]"); - return 0; - } - - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - _leave(" [deleted]"); - return -ENOENT; - } - - acl_order = 0; - if (auth_vnode) - acl_order = auth_vnode->acl_order; - - spin_lock(&vnode->lock); - - if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && - vnode->cb_promised) { - spin_unlock(&vnode->lock); - _leave(" [unchanged]"); - return 0; - } - - ASSERTCMP(vnode->update_cnt, >=, 0); - - if (vnode->update_cnt > 0) { - /* someone else started a fetch */ - _debug("wait on fetch %d", vnode->update_cnt); - - set_current_state(TASK_UNINTERRUPTIBLE); - ASSERT(myself.func != NULL); - add_wait_queue(&vnode->update_waitq, &myself); - - /* wait for the status to be updated */ - for (;;) { - if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) - break; - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - break; - - /* check to see if it got updated and invalidated all - * before we saw it */ - if (vnode->update_cnt == 0) { - remove_wait_queue(&vnode->update_waitq, - &myself); - set_current_state(TASK_RUNNING); - goto get_anyway; - } - - spin_unlock(&vnode->lock); - - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - - spin_lock(&vnode->lock); - } - - remove_wait_queue(&vnode->update_waitq, &myself); - spin_unlock(&vnode->lock); - set_current_state(TASK_RUNNING); - - return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? - -ENOENT : 0; - } - -get_anyway: - /* okay... we're going to have to initiate the op */ - vnode->update_cnt++; - - spin_unlock(&vnode->lock); - - /* merge AFS status fetches and clear outstanding callback on this - * vnode */ - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %p{%08x}", - server, ntohl(server->addr.s_addr)); - - ret = afs_fs_fetch_file_status(server, key, vnode, NULL, - false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - _debug("adjust"); - if (auth_vnode) - afs_cache_permit(vnode, key, acl_order); - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - _debug("failed [%d]", ret); - afs_vnode_status_update_failed(vnode, ret); - } - - ASSERTCMP(vnode->update_cnt, >=, 0); - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * fetch file data from the volume - * - TODO implement caching - */ -int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key, - struct afs_read *desc) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,,,", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - /* this op will fetch the status */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - /* merge in AFS status fetches and clear outstanding callback on this - * vnode */ - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_fetch_data(server, key, vnode, desc, - false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - } - - _leave(" = %d", ret); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - return PTR_ERR(server); -} - -/* - * make a file or a directory - */ -int afs_vnode_create(struct afs_vnode *vnode, struct key *key, - const char *name, umode_t mode, struct afs_fid *newfid, - struct afs_file_status *newstatus, - struct afs_callback *newcb, struct afs_server **_server) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,%s,,", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), - name); - - /* this op will fetch the status on the directory we're creating in */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_create(server, key, vnode, name, mode, newfid, - newstatus, newcb, false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - *_server = server; - } else { - afs_vnode_status_update_failed(vnode, ret); - *_server = NULL; - } - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * remove a file or directory - */ -int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name, - bool isdir) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,%s", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), - name); - - /* this op will fetch the status on the directory we're removing from */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_remove(server, key, vnode, name, isdir, - false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - } - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * create a hard link - */ -int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode, - struct key *key, const char *name) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%s{%x:%u.%u},%x,%s", - dvnode->volume->vlocation->vldb.name, - dvnode->fid.vid, - dvnode->fid.vnode, - dvnode->fid.unique, - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), - name); - - /* this op will fetch the status on the directory we're removing from */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - spin_lock(&dvnode->lock); - dvnode->update_cnt++; - spin_unlock(&dvnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(dvnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_link(server, key, dvnode, vnode, name, - false); - - } while (!afs_volume_release_fileserver(dvnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_vnode_finalise_status_update(dvnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - afs_vnode_status_update_failed(dvnode, ret); - } - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - spin_lock(&dvnode->lock); - dvnode->update_cnt--; - ASSERTCMP(dvnode->update_cnt, >=, 0); - spin_unlock(&dvnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * create a symbolic link - */ -int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key, - const char *name, const char *content, - struct afs_fid *newfid, - struct afs_file_status *newstatus, - struct afs_server **_server) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,%s,%s,,,", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), - name, content); - - /* this op will fetch the status on the directory we're creating in */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_symlink(server, key, vnode, name, content, - newfid, newstatus, false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - *_server = server; - } else { - afs_vnode_status_update_failed(vnode, ret); - *_server = NULL; - } - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * rename a file - */ -int afs_vnode_rename(struct afs_vnode *orig_dvnode, - struct afs_vnode *new_dvnode, - struct key *key, - const char *orig_name, - const char *new_name) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%s{%u,%u,%u},%x,%s,%s", - orig_dvnode->volume->vlocation->vldb.name, - orig_dvnode->fid.vid, - orig_dvnode->fid.vnode, - orig_dvnode->fid.unique, - new_dvnode->volume->vlocation->vldb.name, - new_dvnode->fid.vid, - new_dvnode->fid.vnode, - new_dvnode->fid.unique, - key_serial(key), - orig_name, - new_name); - - /* this op will fetch the status on both the directories we're dealing - * with */ - spin_lock(&orig_dvnode->lock); - orig_dvnode->update_cnt++; - spin_unlock(&orig_dvnode->lock); - if (new_dvnode != orig_dvnode) { - spin_lock(&new_dvnode->lock); - new_dvnode->update_cnt++; - spin_unlock(&new_dvnode->lock); - } - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(orig_dvnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_rename(server, key, orig_dvnode, orig_name, - new_dvnode, new_name, false); - - } while (!afs_volume_release_fileserver(orig_dvnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(orig_dvnode, server); - if (new_dvnode != orig_dvnode) - afs_vnode_finalise_status_update(new_dvnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(orig_dvnode, ret); - if (new_dvnode != orig_dvnode) - afs_vnode_status_update_failed(new_dvnode, ret); - } - - _leave(" = %d [cnt %d]", ret, orig_dvnode->update_cnt); - return ret; - -no_server: - spin_lock(&orig_dvnode->lock); - orig_dvnode->update_cnt--; - ASSERTCMP(orig_dvnode->update_cnt, >=, 0); - spin_unlock(&orig_dvnode->lock); - if (new_dvnode != orig_dvnode) { - spin_lock(&new_dvnode->lock); - new_dvnode->update_cnt--; - ASSERTCMP(new_dvnode->update_cnt, >=, 0); - spin_unlock(&new_dvnode->lock); - } - _leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt); - return PTR_ERR(server); -} - -/* - * write to a file - */ -int afs_vnode_store_data(struct afs_writeback *wb, pgoff_t first, pgoff_t last, - unsigned offset, unsigned to) -{ - struct afs_server *server; - struct afs_vnode *vnode = wb->vnode; - int ret; - - _enter("%s{%x:%u.%u},%x,%lx,%lx,%x,%x", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(wb->key), - first, last, offset, to); - - /* this op will fetch the status */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_store_data(server, wb, first, last, offset, to, - false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - } - - _leave(" = %d", ret); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - return PTR_ERR(server); -} - -/* - * set the attributes on a file - */ -int afs_vnode_setattr(struct afs_vnode *vnode, struct key *key, - struct iattr *attr) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - /* this op will fetch the status */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_setattr(server, key, vnode, attr, false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - } - - _leave(" = %d", ret); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - return PTR_ERR(server); -} - -/* - * get the status of a volume - */ -int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key, - struct afs_volume_status *vs) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_get_volume_status(server, key, vnode, vs, false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) - afs_put_server(server); - - _leave(" = %d", ret); - return ret; - -no_server: - return PTR_ERR(server); -} - -/* - * get a lock on a file - */ -int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key, - afs_lock_type_t type) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,%u", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), type); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_set_lock(server, key, vnode, type, false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) - afs_put_server(server); - - _leave(" = %d", ret); - return ret; - -no_server: - return PTR_ERR(server); -} - -/* - * extend a lock on a file - */ -int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_extend_lock(server, key, vnode, false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) - afs_put_server(server); - - _leave(" = %d", ret); - return ret; - -no_server: - return PTR_ERR(server); -} - -/* - * release a lock on a file - */ -int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_release_lock(server, key, vnode, false); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) - afs_put_server(server); - - _leave(" = %d", ret); - return ret; - -no_server: - return PTR_ERR(server); -} diff --git a/fs/afs/volume.c b/fs/afs/volume.c index db73d6dad02b..684c48293353 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -10,19 +10,167 @@ */ #include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> #include <linux/slab.h> -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/sched.h> #include "internal.h" -static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" }; +unsigned __read_mostly afs_volume_gc_delay = 10; +unsigned __read_mostly afs_volume_record_life = 60 * 60; + +static const char *const afs_voltypes[] = { "R/W", "R/O", "BAK" }; + +/* + * Allocate a volume record and load it up from a vldb record. + */ +static struct afs_volume *afs_alloc_volume(struct afs_mount_params *params, + struct afs_vldb_entry *vldb, + unsigned long type_mask) +{ + struct afs_server_list *slist; + struct afs_server *server; + struct afs_volume *volume; + int ret = -ENOMEM, nr_servers = 0, i, j; + + for (i = 0; i < vldb->nr_servers; i++) + if (vldb->fs_mask[i] & type_mask) + nr_servers++; + + volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL); + if (!volume) + goto error_0; + + volume->vid = vldb->vid[params->type]; + volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; + volume->cell = afs_get_cell(params->cell); + volume->type = params->type; + volume->type_force = params->force; + volume->name_len = vldb->name_len; + + atomic_set(&volume->usage, 1); + INIT_LIST_HEAD(&volume->proc_link); + rwlock_init(&volume->servers_lock); + memcpy(volume->name, vldb->name, vldb->name_len + 1); + + slist = afs_alloc_server_list(params->cell, params->key, vldb, type_mask); + if (IS_ERR(slist)) { + ret = PTR_ERR(slist); + goto error_1; + } + + refcount_set(&slist->usage, 1); + volume->servers = slist; + + /* Make sure a records exists for each server this volume occupies. */ + for (i = 0; i < nr_servers; i++) { + if (!(vldb->fs_mask[i] & type_mask)) + continue; + + server = afs_lookup_server(params->cell, params->key, + &vldb->fs_server[i]); + if (IS_ERR(server)) { + ret = PTR_ERR(server); + if (ret == -ENOENT) + continue; + goto error_2; + } + + /* Insertion-sort by server pointer */ + for (j = 0; j < slist->nr_servers; j++) + if (slist->servers[j].server >= server) + break; + if (j < slist->nr_servers) { + if (slist->servers[j].server == server) { + afs_put_server(params->net, server); + continue; + } + + memmove(slist->servers + j + 1, + slist->servers + j, + (slist->nr_servers - j) * sizeof(struct afs_server_entry)); + } + + slist->servers[j].server = server; + slist->nr_servers++; + } + + if (slist->nr_servers == 0) { + ret = -EDESTADDRREQ; + goto error_2; + } + + return volume; + +error_2: + afs_put_serverlist(params->net, slist); +error_1: + kfree(volume); +error_0: + return ERR_PTR(ret); +} /* - * lookup a volume by name - * - this can be one of the following: + * Look up a VLDB record for a volume. + */ +static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell, + struct key *key, + const char *volname, + size_t volnamesz) +{ + struct afs_addr_cursor ac; + struct afs_vldb_entry *vldb; + int ret; + + ret = afs_set_vl_cursor(&ac, cell); + if (ret < 0) + return ERR_PTR(ret); + + while (afs_iterate_addresses(&ac)) { + if (!test_bit(ac.index, &ac.alist->probed)) { + ret = afs_vl_get_capabilities(cell->net, &ac, key); + switch (ret) { + case VL_SERVICE: + clear_bit(ac.index, &ac.alist->yfs); + set_bit(ac.index, &ac.alist->probed); + ac.addr->srx_service = ret; + break; + case YFS_VL_SERVICE: + set_bit(ac.index, &ac.alist->yfs); + set_bit(ac.index, &ac.alist->probed); + ac.addr->srx_service = ret; + break; + } + } + + vldb = afs_vl_get_entry_by_name_u(cell->net, &ac, key, + volname, volnamesz); + switch (ac.error) { + case 0: + afs_end_cursor(&ac); + return vldb; + case -ECONNABORTED: + ac.error = afs_abort_to_error(ac.abort_code); + goto error; + case -ENOMEM: + case -ENONET: + goto error; + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + break; + default: + ac.error = -EIO; + goto error; + } + } + +error: + return ERR_PTR(afs_end_cursor(&ac)); +} + +/* + * Look up a volume in the VL server and create a candidate volume record for + * it. + * + * The volume name can be one of the following: * "%[cell:]volume[.]" R/W volume * "#[cell:]volume[.]" R/O or R/W volume (rwparent=0), * or R/W (rwparent=1) volume @@ -42,353 +190,218 @@ static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" }; * - Rule 3: If parent volume is R/W, then only mount R/W volume unless * explicitly told otherwise */ -struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) +struct afs_volume *afs_create_volume(struct afs_mount_params *params) { - struct afs_vlocation *vlocation = NULL; - struct afs_volume *volume = NULL; - struct afs_server *server = NULL; - char srvtmask; - int ret, loop; - - _enter("{%*.*s,%d}", - params->volnamesz, params->volnamesz, params->volname, params->rwpath); - - /* lookup the volume location record */ - vlocation = afs_vlocation_lookup(params->cell, params->key, - params->volname, params->volnamesz); - if (IS_ERR(vlocation)) { - ret = PTR_ERR(vlocation); - vlocation = NULL; - goto error; - } + struct afs_vldb_entry *vldb; + struct afs_volume *volume; + unsigned long type_mask = 1UL << params->type; - /* make the final decision on the type we want */ - ret = -ENOMEDIUM; - if (params->force && !(vlocation->vldb.vidmask & (1 << params->type))) - goto error; + vldb = afs_vl_lookup_vldb(params->cell, params->key, + params->volname, params->volnamesz); + if (IS_ERR(vldb)) + return ERR_CAST(vldb); - srvtmask = 0; - for (loop = 0; loop < vlocation->vldb.nservers; loop++) - srvtmask |= vlocation->vldb.srvtmask[loop]; + if (test_bit(AFS_VLDB_QUERY_ERROR, &vldb->flags)) { + volume = ERR_PTR(vldb->error); + goto error; + } + /* Make the final decision on the type we want */ + volume = ERR_PTR(-ENOMEDIUM); if (params->force) { - if (!(srvtmask & (1 << params->type))) + if (!(vldb->flags & type_mask)) goto error; - } else if (srvtmask & AFS_VOL_VTM_RO) { + } else if (test_bit(AFS_VLDB_HAS_RO, &vldb->flags)) { params->type = AFSVL_ROVOL; - } else if (srvtmask & AFS_VOL_VTM_RW) { + } else if (test_bit(AFS_VLDB_HAS_RW, &vldb->flags)) { params->type = AFSVL_RWVOL; } else { goto error; } - down_write(¶ms->cell->vl_sem); + type_mask = 1UL << params->type; + volume = afs_alloc_volume(params, vldb, type_mask); - /* is the volume already active? */ - if (vlocation->vols[params->type]) { - /* yes - re-use it */ - volume = vlocation->vols[params->type]; - afs_get_volume(volume); - goto success; - } +error: + kfree(vldb); + return volume; +} - /* create a new volume record */ - _debug("creating new volume record"); +/* + * Destroy a volume record + */ +static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume) +{ + _enter("%p", volume); - ret = -ENOMEM; - volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL); - if (!volume) - goto error_up; +#ifdef CONFIG_AFS_FSCACHE + ASSERTCMP(volume->cache, ==, NULL); +#endif - atomic_set(&volume->usage, 1); - volume->type = params->type; - volume->type_force = params->force; - volume->cell = params->cell; - volume->vid = vlocation->vldb.vid[params->type]; - - init_rwsem(&volume->server_sem); - - /* look up all the applicable server records */ - for (loop = 0; loop < 8; loop++) { - if (vlocation->vldb.srvtmask[loop] & (1 << volume->type)) { - server = afs_lookup_server( - volume->cell, &vlocation->vldb.servers[loop]); - if (IS_ERR(server)) { - ret = PTR_ERR(server); - goto error_discard; - } + afs_put_serverlist(net, volume->servers); + afs_put_cell(net, volume->cell); + kfree(volume); - volume->servers[volume->nservers] = server; - volume->nservers++; - } + _leave(" [destroyed]"); +} + +/* + * Drop a reference on a volume record. + */ +void afs_put_volume(struct afs_cell *cell, struct afs_volume *volume) +{ + if (volume) { + _enter("%s", volume->name); + + if (atomic_dec_and_test(&volume->usage)) + afs_destroy_volume(cell->net, volume); } +} - /* attach the cache and volume location */ +/* + * Activate a volume. + */ +void afs_activate_volume(struct afs_volume *volume) +{ #ifdef CONFIG_AFS_FSCACHE - volume->cache = fscache_acquire_cookie(vlocation->cache, + volume->cache = fscache_acquire_cookie(volume->cell->cache, &afs_volume_cache_index_def, volume, true); #endif - afs_get_vlocation(vlocation); - volume->vlocation = vlocation; - - vlocation->vols[volume->type] = volume; - -success: - _debug("kAFS selected %s volume %08x", - afs_voltypes[volume->type], volume->vid); - up_write(¶ms->cell->vl_sem); - afs_put_vlocation(vlocation); - _leave(" = %p", volume); - return volume; - - /* clean up */ -error_up: - up_write(¶ms->cell->vl_sem); -error: - afs_put_vlocation(vlocation); - _leave(" = %d", ret); - return ERR_PTR(ret); - -error_discard: - up_write(¶ms->cell->vl_sem); - - for (loop = volume->nservers - 1; loop >= 0; loop--) - afs_put_server(volume->servers[loop]); - kfree(volume); - goto error; + write_lock(&volume->cell->proc_lock); + list_add_tail(&volume->proc_link, &volume->cell->proc_volumes); + write_unlock(&volume->cell->proc_lock); } /* - * destroy a volume record + * Deactivate a volume. */ -void afs_put_volume(struct afs_volume *volume) +void afs_deactivate_volume(struct afs_volume *volume) { - struct afs_vlocation *vlocation; - int loop; - - if (!volume) - return; - - _enter("%p", volume); + _enter("%s", volume->name); - ASSERTCMP(atomic_read(&volume->usage), >, 0); + write_lock(&volume->cell->proc_lock); + list_del_init(&volume->proc_link); + write_unlock(&volume->cell->proc_lock); - vlocation = volume->vlocation; - - /* to prevent a race, the decrement and the dequeue must be effectively - * atomic */ - down_write(&vlocation->cell->vl_sem); - - if (likely(!atomic_dec_and_test(&volume->usage))) { - up_write(&vlocation->cell->vl_sem); - _leave(""); - return; - } - - vlocation->vols[volume->type] = NULL; - - up_write(&vlocation->cell->vl_sem); - - /* finish cleaning up the volume */ #ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(volume->cache, 0); + fscache_relinquish_cookie(volume->cache, + test_bit(AFS_VOLUME_DELETED, &volume->flags)); + volume->cache = NULL; #endif - afs_put_vlocation(vlocation); - - for (loop = volume->nservers - 1; loop >= 0; loop--) - afs_put_server(volume->servers[loop]); - - kfree(volume); - _leave(" [destroyed]"); + _leave(""); } /* - * pick a server to use to try accessing this volume - * - returns with an elevated usage count on the server chosen + * Query the VL service to update the volume status. */ -struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode) +static int afs_update_volume_status(struct afs_volume *volume, struct key *key) { - struct afs_volume *volume = vnode->volume; - struct afs_server *server; - int ret, state, loop; + struct afs_server_list *new, *old, *discard; + struct afs_vldb_entry *vldb; + char idbuf[16]; + int ret, idsz; + + _enter(""); + + /* We look up an ID by passing it as a decimal string in the + * operation's name parameter. + */ + idsz = sprintf(idbuf, "%u", volume->vid); - _enter("%s", volume->vlocation->vldb.name); + vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz); + if (IS_ERR(vldb)) { + ret = PTR_ERR(vldb); + goto error; + } - /* stick with the server we're already using if we can */ - if (vnode->server && vnode->server->fs_state == 0) { - afs_get_server(vnode->server); - _leave(" = %p [current]", vnode->server); - return vnode->server; + /* See if the volume got renamed. */ + if (vldb->name_len != volume->name_len || + memcmp(vldb->name, volume->name, vldb->name_len) != 0) { + /* TODO: Use RCU'd string. */ + memcpy(volume->name, vldb->name, AFS_MAXVOLNAME); + volume->name_len = vldb->name_len; } - down_read(&volume->server_sem); + /* See if the volume's server list got updated. */ + new = afs_alloc_server_list(volume->cell, key, + vldb, (1 << volume->type)); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto error_vldb; + } - /* handle the no-server case */ - if (volume->nservers == 0) { - ret = volume->rjservers ? -ENOMEDIUM : -ESTALE; - up_read(&volume->server_sem); - _leave(" = %d [no servers]", ret); - return ERR_PTR(ret); + write_lock(&volume->servers_lock); + + discard = new; + old = volume->servers; + if (afs_annotate_server_list(new, old)) { + new->seq = volume->servers_seq + 1; + volume->servers = new; + smp_wmb(); + volume->servers_seq++; + discard = old; } - /* basically, just search the list for the first live server and use - * that */ + volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; + clear_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); + write_unlock(&volume->servers_lock); ret = 0; - for (loop = 0; loop < volume->nservers; loop++) { - server = volume->servers[loop]; - state = server->fs_state; - _debug("consider %d [%d]", loop, state); + afs_put_serverlist(volume->cell->net, discard); +error_vldb: + kfree(vldb); +error: + _leave(" = %d", ret); + return ret; +} - switch (state) { - /* found an apparently healthy server */ - case 0: - afs_get_server(server); - up_read(&volume->server_sem); - _leave(" = %p (picked %08x)", - server, ntohl(server->addr.s_addr)); - return server; +/* + * Make sure the volume record is up to date. + */ +int afs_check_volume_status(struct afs_volume *volume, struct key *key) +{ + time64_t now = ktime_get_real_seconds(); + int ret, retries = 0; - case -ENETUNREACH: - if (ret == 0) - ret = state; - break; + _enter(""); - case -EHOSTUNREACH: - if (ret == 0 || - ret == -ENETUNREACH) - ret = state; - break; + if (volume->update_at <= now) + set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); - case -ECONNREFUSED: - if (ret == 0 || - ret == -ENETUNREACH || - ret == -EHOSTUNREACH) - ret = state; - break; +retry: + if (!test_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags) && + !test_bit(AFS_VOLUME_WAIT, &volume->flags)) { + _leave(" = 0"); + return 0; + } - default: - case -EREMOTEIO: - if (ret == 0 || - ret == -ENETUNREACH || - ret == -EHOSTUNREACH || - ret == -ECONNREFUSED) - ret = state; - break; - } + if (!test_and_set_bit_lock(AFS_VOLUME_UPDATING, &volume->flags)) { + ret = afs_update_volume_status(volume, key); + clear_bit_unlock(AFS_VOLUME_WAIT, &volume->flags); + clear_bit_unlock(AFS_VOLUME_UPDATING, &volume->flags); + wake_up_bit(&volume->flags, AFS_VOLUME_WAIT); + _leave(" = %d", ret); + return ret; } - /* no available servers - * - TODO: handle the no active servers case better - */ - up_read(&volume->server_sem); - _leave(" = %d", ret); - return ERR_PTR(ret); -} + if (!test_bit(AFS_VOLUME_WAIT, &volume->flags)) { + _leave(" = 0 [no wait]"); + return 0; + } -/* - * release a server after use - * - releases the ref on the server struct that was acquired by picking - * - records result of using a particular server to access a volume - * - return 0 to try again, 1 if okay or to issue error - * - the caller must release the server struct if result was 0 - */ -int afs_volume_release_fileserver(struct afs_vnode *vnode, - struct afs_server *server, - int result) -{ - struct afs_volume *volume = vnode->volume; - unsigned loop; - - _enter("%s,%08x,%d", - volume->vlocation->vldb.name, ntohl(server->addr.s_addr), - result); - - switch (result) { - /* success */ - case 0: - server->fs_act_jif = jiffies; - server->fs_state = 0; - _leave(""); - return 1; - - /* the fileserver denied all knowledge of the volume */ - case -ENOMEDIUM: - server->fs_act_jif = jiffies; - down_write(&volume->server_sem); - - /* firstly, find where the server is in the active list (if it - * is) */ - for (loop = 0; loop < volume->nservers; loop++) - if (volume->servers[loop] == server) - goto present; - - /* no longer there - may have been discarded by another op */ - goto try_next_server_upw; - - present: - volume->nservers--; - memmove(&volume->servers[loop], - &volume->servers[loop + 1], - sizeof(volume->servers[loop]) * - (volume->nservers - loop)); - volume->servers[volume->nservers] = NULL; - afs_put_server(server); - volume->rjservers++; - - if (volume->nservers > 0) - /* another server might acknowledge its existence */ - goto try_next_server_upw; - - /* handle the case where all the fileservers have rejected the - * volume - * - TODO: try asking the fileservers for volume information - * - TODO: contact the VL server again to see if the volume is - * no longer registered - */ - up_write(&volume->server_sem); - afs_put_server(server); - _leave(" [completely rejected]"); - return 1; - - /* problem reaching the server */ - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - case -ETIME: - case -ETIMEDOUT: - case -EREMOTEIO: - /* mark the server as dead - * TODO: vary dead timeout depending on error - */ - spin_lock(&server->fs_lock); - if (!server->fs_state) { - server->fs_dead_jif = jiffies + HZ * 10; - server->fs_state = result; - printk("kAFS: SERVER DEAD state=%d\n", result); - } - spin_unlock(&server->fs_lock); - goto try_next_server; - - /* miscellaneous error */ - default: - server->fs_act_jif = jiffies; - case -ENOMEM: - case -ENONET: - /* tell the caller to accept the result */ - afs_put_server(server); - _leave(" [local failure]"); - return 1; + ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT, TASK_INTERRUPTIBLE); + if (ret == -ERESTARTSYS) { + _leave(" = %d", ret); + return ret; } - /* tell the caller to loop around and try the next server */ -try_next_server_upw: - up_write(&volume->server_sem); -try_next_server: - afs_put_server(server); - _leave(" [try next server]"); - return 0; + retries++; + if (retries == 4) { + _leave(" = -ESTALE"); + return -ESTALE; + } + goto retry; } diff --git a/fs/afs/write.c b/fs/afs/write.c index 106e43db1115..9370e2feb999 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -8,6 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ + #include <linux/backing-dev.h> #include <linux/slab.h> #include <linux/fs.h> @@ -16,9 +17,6 @@ #include <linux/pagevec.h> #include "internal.h" -static int afs_write_back_from_locked_page(struct afs_writeback *wb, - struct page *page); - /* * mark a page as having been made dirty and thus needing writeback */ @@ -29,58 +27,6 @@ int afs_set_page_dirty(struct page *page) } /* - * unlink a writeback record because its usage has reached zero - * - must be called with the wb->vnode->writeback_lock held - */ -static void afs_unlink_writeback(struct afs_writeback *wb) -{ - struct afs_writeback *front; - struct afs_vnode *vnode = wb->vnode; - - list_del_init(&wb->link); - if (!list_empty(&vnode->writebacks)) { - /* if an fsync rises to the front of the queue then wake it - * up */ - front = list_entry(vnode->writebacks.next, - struct afs_writeback, link); - if (front->state == AFS_WBACK_SYNCING) { - _debug("wake up sync"); - front->state = AFS_WBACK_COMPLETE; - wake_up(&front->waitq); - } - } -} - -/* - * free a writeback record - */ -static void afs_free_writeback(struct afs_writeback *wb) -{ - _enter(""); - key_put(wb->key); - kfree(wb); -} - -/* - * dispose of a reference to a writeback record - */ -void afs_put_writeback(struct afs_writeback *wb) -{ - struct afs_vnode *vnode = wb->vnode; - - _enter("{%d}", wb->usage); - - spin_lock(&vnode->writeback_lock); - if (--wb->usage == 0) - afs_unlink_writeback(wb); - else - wb = NULL; - spin_unlock(&vnode->writeback_lock); - if (wb) - afs_free_writeback(wb); -} - -/* * partly or wholly fill a page that's under preparation for writing */ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, @@ -103,7 +49,7 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, req->pages[0] = page; get_page(page); - ret = afs_vnode_fetch_data(vnode, key, req); + ret = afs_fetch_data(vnode, key, req); afs_put_read(req); if (ret < 0) { if (ret == -ENOENT) { @@ -125,42 +71,32 @@ int afs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - struct afs_writeback *candidate, *wb; struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct page *page; - struct key *key = file->private_data; - unsigned from = pos & (PAGE_SIZE - 1); - unsigned to = from + len; + struct key *key = afs_file_key(file); + unsigned long priv; + unsigned f, from = pos & (PAGE_SIZE - 1); + unsigned t, to = from + len; pgoff_t index = pos >> PAGE_SHIFT; int ret; _enter("{%x:%u},{%lx},%u,%u", vnode->fid.vid, vnode->fid.vnode, index, from, to); - candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); - if (!candidate) - return -ENOMEM; - candidate->vnode = vnode; - candidate->first = candidate->last = index; - candidate->offset_first = from; - candidate->to_last = to; - INIT_LIST_HEAD(&candidate->link); - candidate->usage = 1; - candidate->state = AFS_WBACK_PENDING; - init_waitqueue_head(&candidate->waitq); + /* We want to store information about how much of a page is altered in + * page->private. + */ + BUILD_BUG_ON(PAGE_SIZE > 32768 && sizeof(page->private) < 8); page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { - kfree(candidate); + if (!page) return -ENOMEM; - } if (!PageUptodate(page) && len != PAGE_SIZE) { ret = afs_fill_page(vnode, key, pos & PAGE_MASK, PAGE_SIZE, page); if (ret < 0) { unlock_page(page); put_page(page); - kfree(candidate); _leave(" = %d [prep]", ret); return ret; } @@ -171,79 +107,59 @@ int afs_write_begin(struct file *file, struct address_space *mapping, *pagep = page; try_again: - spin_lock(&vnode->writeback_lock); - - /* see if this page is already pending a writeback under a suitable key - * - if so we can just join onto that one */ - wb = (struct afs_writeback *) page_private(page); - if (wb) { - if (wb->key == key && wb->state == AFS_WBACK_PENDING) - goto subsume_in_current_wb; - goto flush_conflicting_wb; + /* See if this page is already partially written in a way that we can + * merge the new write with. + */ + t = f = 0; + if (PagePrivate(page)) { + priv = page_private(page); + f = priv & AFS_PRIV_MAX; + t = priv >> AFS_PRIV_SHIFT; + ASSERTCMP(f, <=, t); } - if (index > 0) { - /* see if we can find an already pending writeback that we can - * append this page to */ - list_for_each_entry(wb, &vnode->writebacks, link) { - if (wb->last == index - 1 && wb->key == key && - wb->state == AFS_WBACK_PENDING) - goto append_to_previous_wb; + if (f != t) { + if (PageWriteback(page)) { + trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), + page->index, priv); + goto flush_conflicting_write; } + if (to < f || from > t) + goto flush_conflicting_write; + if (from < f) + f = from; + if (to > t) + t = to; + } else { + f = from; + t = to; } - list_add_tail(&candidate->link, &vnode->writebacks); - candidate->key = key_get(key); - spin_unlock(&vnode->writeback_lock); + priv = (unsigned long)t << AFS_PRIV_SHIFT; + priv |= f; + trace_afs_page_dirty(vnode, tracepoint_string("begin"), + page->index, priv); SetPagePrivate(page); - set_page_private(page, (unsigned long) candidate); - _leave(" = 0 [new]"); - return 0; - -subsume_in_current_wb: - _debug("subsume"); - ASSERTRANGE(wb->first, <=, index, <=, wb->last); - if (index == wb->first && from < wb->offset_first) - wb->offset_first = from; - if (index == wb->last && to > wb->to_last) - wb->to_last = to; - spin_unlock(&vnode->writeback_lock); - kfree(candidate); - _leave(" = 0 [sub]"); - return 0; - -append_to_previous_wb: - _debug("append into %lx-%lx", wb->first, wb->last); - wb->usage++; - wb->last++; - wb->to_last = to; - spin_unlock(&vnode->writeback_lock); - SetPagePrivate(page); - set_page_private(page, (unsigned long) wb); - kfree(candidate); - _leave(" = 0 [app]"); + set_page_private(page, priv); + _leave(" = 0"); return 0; - /* the page is currently bound to another context, so if it's dirty we - * need to flush it before we can use the new context */ -flush_conflicting_wb: + /* The previous write and this write aren't adjacent or overlapping, so + * flush the page out. + */ +flush_conflicting_write: _debug("flush conflict"); - if (wb->state == AFS_WBACK_PENDING) - wb->state = AFS_WBACK_CONFLICTING; - spin_unlock(&vnode->writeback_lock); - if (clear_page_dirty_for_io(page)) { - ret = afs_write_back_from_locked_page(wb, page); - if (ret < 0) { - afs_put_writeback(candidate); - _leave(" = %d", ret); - return ret; - } + ret = write_one_page(page); + if (ret < 0) { + _leave(" = %d", ret); + return ret; } - /* the page holds a ref on the writeback record */ - afs_put_writeback(wb); - set_page_private(page, 0); - ClearPagePrivate(page); + ret = lock_page_killable(page); + if (ret < 0) { + _leave(" = %d", ret); + return ret; + } goto try_again; } @@ -255,7 +171,7 @@ int afs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - struct key *key = file->private_data; + struct key *key = afs_file_key(file); loff_t i_size, maybe_i_size; int ret; @@ -266,11 +182,11 @@ int afs_write_end(struct file *file, struct address_space *mapping, i_size = i_size_read(&vnode->vfs_inode); if (maybe_i_size > i_size) { - spin_lock(&vnode->writeback_lock); + spin_lock(&vnode->wb_lock); i_size = i_size_read(&vnode->vfs_inode); if (maybe_i_size > i_size) i_size_write(&vnode->vfs_inode, maybe_i_size); - spin_unlock(&vnode->writeback_lock); + spin_unlock(&vnode->wb_lock); } if (!PageUptodate(page)) { @@ -282,7 +198,7 @@ int afs_write_end(struct file *file, struct address_space *mapping, ret = afs_fill_page(vnode, key, pos + copied, len - copied, page); if (ret < 0) - return ret; + goto out; } SetPageUptodate(page); } @@ -290,25 +206,28 @@ int afs_write_end(struct file *file, struct address_space *mapping, set_page_dirty(page); if (PageDirty(page)) _debug("dirtied"); + ret = copied; + +out: unlock_page(page); put_page(page); - - return copied; + return ret; } /* * kill all the pages in the given range */ -static void afs_kill_pages(struct afs_vnode *vnode, bool error, +static void afs_kill_pages(struct address_space *mapping, pgoff_t first, pgoff_t last) { + struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct pagevec pv; unsigned count, loop; _enter("{%x:%u},%lx-%lx", vnode->fid.vid, vnode->fid.vnode, first, last); - pagevec_init(&pv, 0); + pagevec_init(&pv); do { _debug("kill %lx-%lx", first, last); @@ -316,37 +235,157 @@ static void afs_kill_pages(struct afs_vnode *vnode, bool error, count = last - first + 1; if (count > PAGEVEC_SIZE) count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, - first, count, pv.pages); + pv.nr = find_get_pages_contig(mapping, first, count, pv.pages); ASSERTCMP(pv.nr, ==, count); for (loop = 0; loop < count; loop++) { struct page *page = pv.pages[loop]; ClearPageUptodate(page); - if (error) - SetPageError(page); - if (PageWriteback(page)) - end_page_writeback(page); + SetPageError(page); + end_page_writeback(page); if (page->index >= first) first = page->index + 1; + lock_page(page); + generic_error_remove_page(mapping, page); } __pagevec_release(&pv); - } while (first < last); + } while (first <= last); _leave(""); } /* - * synchronously write back the locked page and any subsequent non-locked dirty - * pages also covered by the same writeback record + * Redirty all the pages in a given range. + */ +static void afs_redirty_pages(struct writeback_control *wbc, + struct address_space *mapping, + pgoff_t first, pgoff_t last) +{ + struct afs_vnode *vnode = AFS_FS_I(mapping->host); + struct pagevec pv; + unsigned count, loop; + + _enter("{%x:%u},%lx-%lx", + vnode->fid.vid, vnode->fid.vnode, first, last); + + pagevec_init(&pv); + + do { + _debug("redirty %lx-%lx", first, last); + + count = last - first + 1; + if (count > PAGEVEC_SIZE) + count = PAGEVEC_SIZE; + pv.nr = find_get_pages_contig(mapping, first, count, pv.pages); + ASSERTCMP(pv.nr, ==, count); + + for (loop = 0; loop < count; loop++) { + struct page *page = pv.pages[loop]; + + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + if (page->index >= first) + first = page->index + 1; + } + + __pagevec_release(&pv); + } while (first <= last); + + _leave(""); +} + +/* + * write to a file + */ +static int afs_store_data(struct address_space *mapping, + pgoff_t first, pgoff_t last, + unsigned offset, unsigned to) +{ + struct afs_vnode *vnode = AFS_FS_I(mapping->host); + struct afs_fs_cursor fc; + struct afs_wb_key *wbk = NULL; + struct list_head *p; + int ret = -ENOKEY, ret2; + + _enter("%s{%x:%u.%u},%lx,%lx,%x,%x", + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + first, last, offset, to); + + spin_lock(&vnode->wb_lock); + p = vnode->wb_keys.next; + + /* Iterate through the list looking for a valid key to use. */ +try_next_key: + while (p != &vnode->wb_keys) { + wbk = list_entry(p, struct afs_wb_key, vnode_link); + _debug("wbk %u", key_serial(wbk->key)); + ret2 = key_validate(wbk->key); + if (ret2 == 0) + goto found_key; + if (ret == -ENOKEY) + ret = ret2; + p = p->next; + } + + spin_unlock(&vnode->wb_lock); + afs_put_wb_key(wbk); + _leave(" = %d [no keys]", ret); + return ret; + +found_key: + refcount_inc(&wbk->usage); + spin_unlock(&vnode->wb_lock); + + _debug("USE WB KEY %u", key_serial(wbk->key)); + + ret = -ERESTARTSYS; + if (afs_begin_vnode_operation(&fc, vnode, wbk->key)) { + while (afs_select_fileserver(&fc)) { + fc.cb_break = vnode->cb_break + vnode->cb_s_break; + afs_fs_store_data(&fc, mapping, first, last, offset, to); + } + + afs_check_for_remote_deletion(&fc, fc.vnode); + afs_vnode_commit_status(&fc, vnode, fc.cb_break); + ret = afs_end_vnode_operation(&fc); + } + + switch (ret) { + case -EACCES: + case -EPERM: + case -ENOKEY: + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EKEYREVOKED: + _debug("next"); + spin_lock(&vnode->wb_lock); + p = wbk->vnode_link.next; + afs_put_wb_key(wbk); + goto try_next_key; + } + + afs_put_wb_key(wbk); + _leave(" = %d", ret); + return ret; +} + +/* + * Synchronously write back the locked page and any subsequent non-locked dirty + * pages. */ -static int afs_write_back_from_locked_page(struct afs_writeback *wb, - struct page *primary_page) +static int afs_write_back_from_locked_page(struct address_space *mapping, + struct writeback_control *wbc, + struct page *primary_page, + pgoff_t final_page) { + struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct page *pages[8], *page; - unsigned long count; - unsigned n, offset, to; + unsigned long count, priv; + unsigned n, offset, to, f, t; pgoff_t start, first, last; int loop, ret; @@ -356,20 +395,33 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb, if (test_set_page_writeback(primary_page)) BUG(); - /* find all consecutive lockable dirty pages, stopping when we find a - * page that is not immediately lockable, is not dirty or is missing, - * or we reach the end of the range */ + /* Find all consecutive lockable dirty pages that have contiguous + * written regions, stopping when we find a page that is not + * immediately lockable, is not dirty or is missing, or we reach the + * end of the range. + */ start = primary_page->index; - if (start >= wb->last) + priv = page_private(primary_page); + offset = priv & AFS_PRIV_MAX; + to = priv >> AFS_PRIV_SHIFT; + trace_afs_page_dirty(vnode, tracepoint_string("store"), + primary_page->index, priv); + + WARN_ON(offset == to); + if (offset == to) + trace_afs_page_dirty(vnode, tracepoint_string("WARN"), + primary_page->index, priv); + + if (start >= final_page || to < PAGE_SIZE) goto no_more; + start++; do { _debug("more %lx [%lx]", start, count); - n = wb->last - start + 1; + n = final_page - start + 1; if (n > ARRAY_SIZE(pages)) n = ARRAY_SIZE(pages); - n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping, - start, n, pages); + n = find_get_pages_contig(mapping, start, ARRAY_SIZE(pages), pages); _debug("fgpc %u", n); if (n == 0) goto no_more; @@ -381,16 +433,30 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb, } for (loop = 0; loop < n; loop++) { + if (to != PAGE_SIZE) + break; page = pages[loop]; - if (page->index > wb->last) + if (page->index > final_page) break; if (!trylock_page(page)) break; - if (!PageDirty(page) || - page_private(page) != (unsigned long) wb) { + if (!PageDirty(page) || PageWriteback(page)) { unlock_page(page); break; } + + priv = page_private(page); + f = priv & AFS_PRIV_MAX; + t = priv >> AFS_PRIV_SHIFT; + if (f != 0) { + unlock_page(page); + break; + } + to = t; + + trace_afs_page_dirty(vnode, tracepoint_string("store+"), + page->index, priv); + if (!clear_page_dirty_for_io(page)) BUG(); if (test_set_page_writeback(page)) @@ -406,50 +472,55 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb, } start += loop; - } while (start <= wb->last && count < 65536); + } while (start <= final_page && count < 65536); no_more: - /* we now have a contiguous set of dirty pages, each with writeback set - * and the dirty mark cleared; the first page is locked and must remain - * so, all the rest are unlocked */ + /* We now have a contiguous set of dirty pages, each with writeback + * set; the first page is still locked at this point, but all the rest + * have been unlocked. + */ + unlock_page(primary_page); + first = primary_page->index; last = first + count - 1; - offset = (first == wb->first) ? wb->offset_first : 0; - to = (last == wb->last) ? wb->to_last : PAGE_SIZE; - _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to); - ret = afs_vnode_store_data(wb, first, last, offset, to); - if (ret < 0) { - switch (ret) { - case -EDQUOT: - case -ENOSPC: - mapping_set_error(wb->vnode->vfs_inode.i_mapping, -ENOSPC); - break; - case -EROFS: - case -EIO: - case -EREMOTEIO: - case -EFBIG: - case -ENOENT: - case -ENOMEDIUM: - case -ENXIO: - afs_kill_pages(wb->vnode, true, first, last); - mapping_set_error(wb->vnode->vfs_inode.i_mapping, -EIO); - break; - case -EACCES: - case -EPERM: - case -ENOKEY: - case -EKEYEXPIRED: - case -EKEYREJECTED: - case -EKEYREVOKED: - afs_kill_pages(wb->vnode, false, first, last); - break; - default: - break; - } - } else { + ret = afs_store_data(mapping, first, last, offset, to); + switch (ret) { + case 0: ret = count; + break; + + default: + pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret); + /* Fall through */ + case -EACCES: + case -EPERM: + case -ENOKEY: + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EKEYREVOKED: + afs_redirty_pages(wbc, mapping, first, last); + mapping_set_error(mapping, ret); + break; + + case -EDQUOT: + case -ENOSPC: + afs_redirty_pages(wbc, mapping, first, last); + mapping_set_error(mapping, -ENOSPC); + break; + + case -EROFS: + case -EIO: + case -EREMOTEIO: + case -EFBIG: + case -ENOENT: + case -ENOMEDIUM: + case -ENXIO: + afs_kill_pages(mapping, first, last); + mapping_set_error(mapping, ret); + break; } _leave(" = %d", ret); @@ -462,16 +533,12 @@ no_more: */ int afs_writepage(struct page *page, struct writeback_control *wbc) { - struct afs_writeback *wb; int ret; _enter("{%lx},", page->index); - wb = (struct afs_writeback *) page_private(page); - ASSERT(wb != NULL); - - ret = afs_write_back_from_locked_page(wb, page); - unlock_page(page); + ret = afs_write_back_from_locked_page(page->mapping, wbc, page, + wbc->range_end >> PAGE_SHIFT); if (ret < 0) { _leave(" = %d", ret); return 0; @@ -490,33 +557,30 @@ static int afs_writepages_region(struct address_space *mapping, struct writeback_control *wbc, pgoff_t index, pgoff_t end, pgoff_t *_next) { - struct afs_writeback *wb; struct page *page; int ret, n; _enter(",,%lx,%lx,", index, end); do { - n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, - 1, &page); + n = find_get_pages_range_tag(mapping, &index, end, + PAGECACHE_TAG_DIRTY, 1, &page); if (!n) break; _debug("wback %lx", page->index); - if (page->index > end) { - *_next = index; - put_page(page); - _leave(" = 0 [%lx]", *_next); - return 0; - } - /* at this point we hold neither mapping->tree_lock nor lock on * the page itself: the page may be truncated or invalidated * (changing page->mapping to NULL), or even swizzled back from * swapper_space to tmpfs file mapping */ - lock_page(page); + ret = lock_page_killable(page); + if (ret < 0) { + put_page(page); + _leave(" = %d", ret); + return ret; + } if (page->mapping != mapping || !PageDirty(page)) { unlock_page(page); @@ -532,17 +596,9 @@ static int afs_writepages_region(struct address_space *mapping, continue; } - wb = (struct afs_writeback *) page_private(page); - ASSERT(wb != NULL); - - spin_lock(&wb->vnode->writeback_lock); - wb->state = AFS_WBACK_WRITING; - spin_unlock(&wb->vnode->writeback_lock); - if (!clear_page_dirty_for_io(page)) BUG(); - ret = afs_write_back_from_locked_page(wb, page); - unlock_page(page); + ret = afs_write_back_from_locked_page(mapping, wbc, page, end); put_page(page); if (ret < 0) { _leave(" = %d", ret); @@ -598,18 +654,15 @@ int afs_writepages(struct address_space *mapping, */ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) { - struct afs_writeback *wb = call->wb; struct pagevec pv; + unsigned long priv; unsigned count, loop; pgoff_t first = call->first, last = call->last; - bool free_wb; _enter("{%x:%u},{%lx-%lx}", vnode->fid.vid, vnode->fid.vnode, first, last); - ASSERT(wb != NULL); - - pagevec_init(&pv, 0); + pagevec_init(&pv); do { _debug("done %lx-%lx", first, last); @@ -617,35 +670,22 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) count = last - first + 1; if (count > PAGEVEC_SIZE) count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(call->mapping, first, count, - pv.pages); + pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, + first, count, pv.pages); ASSERTCMP(pv.nr, ==, count); - spin_lock(&vnode->writeback_lock); for (loop = 0; loop < count; loop++) { - struct page *page = pv.pages[loop]; - end_page_writeback(page); - if (page_private(page) == (unsigned long) wb) { - set_page_private(page, 0); - ClearPagePrivate(page); - wb->usage--; - } - } - free_wb = false; - if (wb->usage == 0) { - afs_unlink_writeback(wb); - free_wb = true; + priv = page_private(pv.pages[loop]); + trace_afs_page_dirty(vnode, tracepoint_string("clear"), + pv.pages[loop]->index, priv); + set_page_private(pv.pages[loop], 0); + end_page_writeback(pv.pages[loop]); } - spin_unlock(&vnode->writeback_lock); first += count; - if (free_wb) { - afs_free_writeback(wb); - wb = NULL; - } - __pagevec_release(&pv); } while (first <= last); + afs_prune_wb_keys(vnode); _leave(""); } @@ -677,28 +717,6 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) } /* - * flush the vnode to the fileserver - */ -int afs_writeback_all(struct afs_vnode *vnode) -{ - struct address_space *mapping = vnode->vfs_inode.i_mapping; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_cyclic = 1, - }; - int ret; - - _enter(""); - - ret = mapping->a_ops->writepages(mapping, &wbc); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - - _leave(" = %d", ret); - return ret; -} - -/* * flush any dirty pages for this process, and check for write errors. * - the return status from this call provides a reliable indication of * whether any write errors occurred for this process. @@ -706,61 +724,13 @@ int afs_writeback_all(struct afs_vnode *vnode) int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file_inode(file); - struct afs_writeback *wb, *xwb; struct afs_vnode *vnode = AFS_FS_I(inode); - int ret; _enter("{%x:%u},{n=%pD},%d", vnode->fid.vid, vnode->fid.vnode, file, datasync); - ret = file_write_and_wait_range(file, start, end); - if (ret) - return ret; - inode_lock(inode); - - /* use a writeback record as a marker in the queue - when this reaches - * the front of the queue, all the outstanding writes are either - * completed or rejected */ - wb = kzalloc(sizeof(*wb), GFP_KERNEL); - if (!wb) { - ret = -ENOMEM; - goto out; - } - wb->vnode = vnode; - wb->first = 0; - wb->last = -1; - wb->offset_first = 0; - wb->to_last = PAGE_SIZE; - wb->usage = 1; - wb->state = AFS_WBACK_SYNCING; - init_waitqueue_head(&wb->waitq); - - spin_lock(&vnode->writeback_lock); - list_for_each_entry(xwb, &vnode->writebacks, link) { - if (xwb->state == AFS_WBACK_PENDING) - xwb->state = AFS_WBACK_CONFLICTING; - } - list_add_tail(&wb->link, &vnode->writebacks); - spin_unlock(&vnode->writeback_lock); - - /* push all the outstanding writebacks to the server */ - ret = afs_writeback_all(vnode); - if (ret < 0) { - afs_put_writeback(wb); - _leave(" = %d [wb]", ret); - goto out; - } - - /* wait for the preceding writes to actually complete */ - ret = wait_event_interruptible(wb->waitq, - wb->state == AFS_WBACK_COMPLETE || - vnode->writebacks.next == &wb->link); - afs_put_writeback(wb); - _leave(" = %d", ret); -out: - inode_unlock(inode); - return ret; + return file_write_and_wait_range(file, start, end); } /* @@ -781,19 +751,114 @@ int afs_flush(struct file *file, fl_owner_t id) * notification that a previously read-only page is about to become writable * - if it returns an error, the caller will deliver a bus error signal */ -int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int afs_page_mkwrite(struct vm_fault *vmf) { - struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host); + struct file *file = vmf->vma->vm_file; + struct inode *inode = file_inode(file); + struct afs_vnode *vnode = AFS_FS_I(inode); + unsigned long priv; _enter("{{%x:%u}},{%lx}", - vnode->fid.vid, vnode->fid.vnode, page->index); + vnode->fid.vid, vnode->fid.vnode, vmf->page->index); - /* wait for the page to be written to the cache before we allow it to - * be modified */ + sb_start_pagefault(inode->i_sb); + + /* Wait for the page to be written to the cache before we allow it to + * be modified. We then assume the entire page will need writing back. + */ #ifdef CONFIG_AFS_FSCACHE - fscache_wait_on_page_write(vnode->cache, page); + fscache_wait_on_page_write(vnode->cache, vmf->page); #endif - _leave(" = 0"); - return 0; + if (PageWriteback(vmf->page) && + wait_on_page_bit_killable(vmf->page, PG_writeback) < 0) + return VM_FAULT_RETRY; + + if (lock_page_killable(vmf->page) < 0) + return VM_FAULT_RETRY; + + /* We mustn't change page->private until writeback is complete as that + * details the portion of the page we need to write back and we might + * need to redirty the page if there's a problem. + */ + wait_on_page_writeback(vmf->page); + + priv = (unsigned long)PAGE_SIZE << AFS_PRIV_SHIFT; /* To */ + priv |= 0; /* From */ + trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), + vmf->page->index, priv); + SetPagePrivate(vmf->page); + set_page_private(vmf->page, priv); + + sb_end_pagefault(inode->i_sb); + return VM_FAULT_LOCKED; +} + +/* + * Prune the keys cached for writeback. The caller must hold vnode->wb_lock. + */ +void afs_prune_wb_keys(struct afs_vnode *vnode) +{ + LIST_HEAD(graveyard); + struct afs_wb_key *wbk, *tmp; + + /* Discard unused keys */ + spin_lock(&vnode->wb_lock); + + if (!mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_WRITEBACK) && + !mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_DIRTY)) { + list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) { + if (refcount_read(&wbk->usage) == 1) + list_move(&wbk->vnode_link, &graveyard); + } + } + + spin_unlock(&vnode->wb_lock); + + while (!list_empty(&graveyard)) { + wbk = list_entry(graveyard.next, struct afs_wb_key, vnode_link); + list_del(&wbk->vnode_link); + afs_put_wb_key(wbk); + } +} + +/* + * Clean up a page during invalidation. + */ +int afs_launder_page(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct afs_vnode *vnode = AFS_FS_I(mapping->host); + unsigned long priv; + unsigned int f, t; + int ret = 0; + + _enter("{%lx}", page->index); + + priv = page_private(page); + if (clear_page_dirty_for_io(page)) { + f = 0; + t = PAGE_SIZE; + if (PagePrivate(page)) { + f = priv & AFS_PRIV_MAX; + t = priv >> AFS_PRIV_SHIFT; + } + + trace_afs_page_dirty(vnode, tracepoint_string("launder"), + page->index, priv); + ret = afs_store_data(mapping, page->index, page->index, t, f); + } + + trace_afs_page_dirty(vnode, tracepoint_string("laundered"), + page->index, priv); + set_page_private(page, 0); + ClearPagePrivate(page); + +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page)) { + fscache_wait_on_page_write(vnode->cache, page); + fscache_uncache_page(vnode->cache, page); + } +#endif + return ret; } diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 2830e4f48d85..cfcc674e64a5 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -45,7 +45,7 @@ static int afs_xattr_get_cell(const struct xattr_handler *handler, struct afs_cell *cell = vnode->volume->cell; size_t namelen; - namelen = strlen(cell->name); + namelen = cell->name_len; if (size == 0) return namelen; if (namelen > size) @@ -96,7 +96,7 @@ static int afs_xattr_get_volume(const struct xattr_handler *handler, void *buffer, size_t size) { struct afs_vnode *vnode = AFS_FS_I(inode); - const char *volname = vnode->volume->vlocation->vldb.name; + const char *volname = vnode->volume->name; size_t namelen; namelen = strlen(volname); @@ -576,7 +576,7 @@ static int kiocb_cancel(struct aio_kiocb *kiocb) * actually has a cancel function, hence the cmpxchg() */ - cancel = ACCESS_ONCE(kiocb->ki_cancel); + cancel = READ_ONCE(kiocb->ki_cancel); do { if (!cancel || cancel == KIOCB_CANCELLED) return -EINVAL; @@ -1297,20 +1297,10 @@ static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, - struct timespec __user *timeout) + ktime_t until) { - ktime_t until = KTIME_MAX; long ret = 0; - if (timeout) { - struct timespec ts; - - if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) - return -EFAULT; - - until = timespec_to_ktime(ts); - } - /* * Note that aio_read_events() is being called as the conditional - i.e. * we're calling it after prepare_to_wait() has set task state to @@ -1826,6 +1816,25 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return ret; } +static long do_io_getevents(aio_context_t ctx_id, + long min_nr, + long nr, + struct io_event __user *events, + struct timespec64 *ts) +{ + ktime_t until = ts ? timespec64_to_ktime(*ts) : KTIME_MAX; + struct kioctx *ioctx = lookup_ioctx(ctx_id); + long ret = -EINVAL; + + if (likely(ioctx)) { + if (likely(min_nr <= nr && min_nr >= 0)) + ret = read_events(ioctx, min_nr, nr, events, until); + percpu_ref_put(&ioctx->users); + } + + return ret; +} + /* io_getevents: * Attempts to read at least min_nr events and up to nr events from * the completion queue for the aio_context specified by ctx_id. If @@ -1844,15 +1853,14 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, struct io_event __user *, events, struct timespec __user *, timeout) { - struct kioctx *ioctx = lookup_ioctx(ctx_id); - long ret = -EINVAL; + struct timespec64 ts; - if (likely(ioctx)) { - if (likely(min_nr <= nr && min_nr >= 0)) - ret = read_events(ioctx, min_nr, nr, events, timeout); - percpu_ref_put(&ioctx->users); + if (timeout) { + if (unlikely(get_timespec64(&ts, timeout))) + return -EFAULT; } - return ret; + + return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); } #ifdef CONFIG_COMPAT @@ -1862,17 +1870,14 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, struct io_event __user *, events, struct compat_timespec __user *, timeout) { - struct timespec t; - struct timespec __user *ut = NULL; + struct timespec64 t; if (timeout) { - if (compat_get_timespec(&t, timeout)) + if (compat_get_timespec64(&t, timeout)) return -EFAULT; - ut = compat_alloc_user_space(sizeof(*ut)); - if (copy_to_user(ut, &t, sizeof(t))) - return -EFAULT; } - return sys_io_getevents(ctx_id, min_nr, nr, events, ut); + + return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); } #endif diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index d79ced925861..82e8f6edfb48 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -281,8 +281,8 @@ static int autofs4_mount_wait(const struct path *path, bool rcu_walk) pr_debug("waiting for mount name=%pd\n", path->dentry); status = autofs4_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); - ino->last_used = jiffies; } + ino->last_used = jiffies; return status; } @@ -321,21 +321,16 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) */ if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; + struct autofs_info *ino; struct dentry *new; new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; - if (new == dentry) - dput(new); - else { - struct autofs_info *ino; - - ino = autofs4_dentry_ino(new); - ino->last_used = jiffies; - dput(path->dentry); - path->dentry = new; - } + ino = autofs4_dentry_ino(new); + ino->last_used = jiffies; + dput(path->dentry); + path->dentry = new; } return path->dentry; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 4ac49d038bf3..961a12dc6dc8 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -81,7 +81,8 @@ static int autofs4_write(struct autofs_sb_info *sbi, spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - return (bytes > 0); + /* if 'wr' returned 0 (impossible) we assume -EIO (safe) */ + return bytes == 0 ? 0 : wr < 0 ? wr : -EIO; } static void autofs4_notify_daemon(struct autofs_sb_info *sbi, @@ -95,6 +96,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, } pkt; struct file *pipe = NULL; size_t pktsz; + int ret; pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", (unsigned long) wq->wait_queue_token, @@ -168,8 +170,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); - if (autofs4_write(sbi, pipe, &pkt, pktsz)) + switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { + case 0: + break; + case -ENOMEM: + case -ERESTARTSYS: + /* Just fail this one */ + autofs4_wait_release(sbi, wq->wait_queue_token, ret); + break; + default: autofs4_catatonic_mode(sbi); + break; + } fput(pipe); } diff --git a/fs/befs/ChangeLog b/fs/befs/ChangeLog index 75a461cfaca6..16f2dfe8c2f7 100644 --- a/fs/befs/ChangeLog +++ b/fs/befs/ChangeLog @@ -365,7 +365,7 @@ Version 0.4 (2001-10-28) (fs/befs/super.c) * Tell the kernel to only mount befs read-only. - By setting the MS_RDONLY flag in befs_read_super(). + By setting the SB_RDONLY flag in befs_read_super(). Not that it was possible to write before. But now the kernel won't even try. (fs/befs/super.c) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index a92355cc453b..ee236231cafa 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -841,7 +841,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { befs_warning(sb, "No write support. Marking filesystem read-only"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } /* @@ -948,7 +948,7 @@ static int befs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if (!(*flags & MS_RDONLY)) + if (!(*flags & SB_RDONLY)) return -EINVAL; return 0; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 73b01e474fdc..83732fef510d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -51,6 +51,11 @@ #define user_siginfo_t siginfo_t #endif +/* That's for binfmt_elf_fdpic to deal with */ +#ifndef elf_check_fdpic +#define elf_check_fdpic(ex) false +#endif + static int load_elf_binary(struct linux_binprm *bprm); static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long); @@ -541,7 +546,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, if (interp_elf_ex->e_type != ET_EXEC && interp_elf_ex->e_type != ET_DYN) goto out; - if (!elf_check_arch(interp_elf_ex)) + if (!elf_check_arch(interp_elf_ex) || + elf_check_fdpic(interp_elf_ex)) goto out; if (!interpreter->f_op->mmap) goto out; @@ -718,6 +724,8 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out; if (!elf_check_arch(&loc->elf_ex)) goto out; + if (elf_check_fdpic(&loc->elf_ex)) + goto out; if (!bprm->file->f_op->mmap) goto out; @@ -817,7 +825,8 @@ static int load_elf_binary(struct linux_binprm *bprm) if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0) goto out_free_dentry; /* Verify the interpreter has a valid arch */ - if (!elf_check_arch(&loc->interp_elf_ex)) + if (!elf_check_arch(&loc->interp_elf_ex) || + elf_check_fdpic(&loc->interp_elf_ex)) goto out_free_dentry; /* Load the interpreter program headers */ @@ -1190,6 +1199,8 @@ static int load_elf_library(struct file *file) if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 || !elf_check_arch(&elf_ex) || !file->f_op->mmap) goto out; + if (elf_check_fdpic(&elf_ex)) + goto out; /* Now read in all of the header information */ @@ -1699,7 +1710,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, long signr, size_t *total) { unsigned int i; - unsigned int regset_size = view->regsets[0].n * view->regsets[0].size; + unsigned int regset0_size = regset_size(t->task, &view->regsets[0]); /* * NT_PRSTATUS is the one special case, because the regset data @@ -1708,11 +1719,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * We assume that regset 0 is NT_PRSTATUS. */ fill_prstatus(&t->prstatus, t->task, signr); - (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset_size, + (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset0_size, &t->prstatus.pr_reg, NULL); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, - PRSTATUS_SIZE(t->prstatus, regset_size), &t->prstatus); + PRSTATUS_SIZE(t->prstatus, regset0_size), &t->prstatus); *total += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); @@ -1728,7 +1739,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, if (regset->core_note_type && regset->get && (!regset->active || regset->active(t->task, regset))) { int ret; - size_t size = regset->n * regset->size; + size_t size = regset_size(t->task, regset); void *data = kmalloc(size, GFP_KERNEL); if (unlikely(!data)) return 0; @@ -1743,7 +1754,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, size, data); else { SET_PR_FPVALID(&t->prstatus, - 1, regset_size); + 1, regset0_size); fill_note(&t->notes[i], "CORE", NT_PRFPREG, size, data); } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index e70c039ac190..429326b6e2e7 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -378,6 +378,11 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) executable_stack); if (retval < 0) goto error; +#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES + retval = arch_setup_additional_pages(bprm, !!interpreter_name); + if (retval < 0) + goto error; +#endif #endif /* load the executable and interpreter into memory */ @@ -831,6 +836,9 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, if (phdr->p_vaddr >= seg->p_vaddr && phdr->p_vaddr + phdr->p_memsz <= seg->p_vaddr + seg->p_memsz) { + Elf32_Dyn __user *dyn; + Elf32_Sword d_tag; + params->dynamic_addr = (phdr->p_vaddr - seg->p_vaddr) + seg->addr; @@ -843,8 +851,9 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, goto dynamic_error; tmp = phdr->p_memsz / sizeof(Elf32_Dyn); - if (((Elf32_Dyn *) - params->dynamic_addr)[tmp - 1].d_tag != 0) + dyn = (Elf32_Dyn __user *)params->dynamic_addr; + __get_user(d_tag, &dyn[tmp - 1].d_tag); + if (d_tag != 0) goto dynamic_error; break; } @@ -1489,7 +1498,9 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm) struct vm_area_struct *vma; for (vma = current->mm->mmap; vma; vma = vma->vm_next) { +#ifdef CONFIG_MMU unsigned long addr; +#endif if (!maydump(vma, cprm->mm_flags)) continue; diff --git a/fs/block_dev.c b/fs/block_dev.c index 789f55e851ae..4a181fcb5175 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -54,18 +54,6 @@ struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); -void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf); - va_end(args); -} - static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; @@ -249,7 +237,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(bio.bi_private)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); } __set_current_state(TASK_RUNNING); @@ -414,7 +402,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); } __set_current_state(TASK_RUNNING); @@ -674,7 +662,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_queue, false); + result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false); @@ -710,7 +698,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, false); + result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index a26c63b4ad68..2e558227931a 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -91,3 +91,14 @@ config BTRFS_ASSERT any of the assertions trip. This is meant for btrfs developers only. If unsure, say N. + +config BTRFS_FS_REF_VERIFY + bool "Btrfs with the ref verify tool compiled in" + depends on BTRFS_FS + default n + help + Enable run-time extent reference verification instrumentation. This + is meant to be used by btrfs developers for tracking down extent + reference problems or verifying they didn't break something. + + If unsure, say N. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index f2cd9dedb037..6fe881d5cb38 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -10,10 +10,11 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o hash.o free-space-tree.o + uuid-tree.o props.o hash.o free-space-tree.o tree-checker.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o +btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index e00c8a9fd5bb..d5540749f0e5 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -67,7 +67,7 @@ struct btrfs_workqueue { static void normal_work_helper(struct btrfs_work *work); #define BTRFS_WORK_HELPER(name) \ -void btrfs_##name(struct work_struct *arg) \ +noinline_for_stack void btrfs_##name(struct work_struct *arg) \ { \ struct btrfs_work *work = container_of(arg, struct btrfs_work, \ normal_work); \ diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b517ef1477ea..7d0dc100a09a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -40,12 +40,14 @@ static int check_extent_in_eb(const struct btrfs_key *key, const struct extent_buffer *eb, const struct btrfs_file_extent_item *fi, u64 extent_item_pos, - struct extent_inode_elem **eie) + struct extent_inode_elem **eie, + bool ignore_offset) { u64 offset = 0; struct extent_inode_elem *e; - if (!btrfs_file_extent_compression(eb, fi) && + if (!ignore_offset && + !btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; @@ -84,7 +86,8 @@ static void free_inode_elem_list(struct extent_inode_elem *eie) static int find_extent_in_eb(const struct extent_buffer *eb, u64 wanted_disk_byte, u64 extent_item_pos, - struct extent_inode_elem **eie) + struct extent_inode_elem **eie, + bool ignore_offset) { u64 disk_byte; struct btrfs_key key; @@ -113,7 +116,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb, if (disk_byte != wanted_disk_byte) continue; - ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset); if (ret < 0) return ret; } @@ -419,7 +422,7 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info, static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct prelim_ref *ref, int level, u64 time_seq, const u64 *extent_item_pos, - u64 total_refs) + u64 total_refs, bool ignore_offset) { int ret = 0; int slot; @@ -472,7 +475,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, - &eie); + &eie, ignore_offset); if (ret < 0) break; } @@ -510,7 +513,8 @@ next: static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos, u64 total_refs) + const u64 *extent_item_pos, u64 total_refs, + bool ignore_offset) { struct btrfs_root *root; struct btrfs_key root_key; @@ -581,7 +585,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, } ret = add_all_parents(root, path, parents, ref, level, time_seq, - extent_item_pos, total_refs); + extent_item_pos, total_refs, ignore_offset); out: path->lowest_level = 0; btrfs_release_path(path); @@ -616,7 +620,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, const u64 *extent_item_pos, u64 total_refs, - struct share_check *sc) + struct share_check *sc, bool ignore_offset) { int err; int ret = 0; @@ -661,7 +665,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, } err = resolve_indirect_ref(fs_info, path, time_seq, ref, parents, extent_item_pos, - total_refs); + total_refs, ignore_offset); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. @@ -769,6 +773,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_key tmp_op_key; struct btrfs_key *op_key = NULL; + struct rb_node *n; int count; int ret = 0; @@ -778,7 +783,9 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, } spin_lock(&head->lock); - list_for_each_entry(node, &head->ref_list, list) { + for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) { + node = rb_entry(n, struct btrfs_delayed_ref_node, + ref_node); if (node->seq > seq) continue; @@ -1107,13 +1114,17 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, * * Otherwise this returns 0 for success and <0 for an error. * + * If ignore_offset is set to false, only extent refs whose offsets match + * extent_item_pos are returned. If true, every extent ref is returned + * and extent_item_pos is ignored. + * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist *refs, struct ulist *roots, const u64 *extent_item_pos, - struct share_check *sc) + struct share_check *sc, bool ignore_offset) { struct btrfs_key key; struct btrfs_path *path; @@ -1178,7 +1189,7 @@ again: head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -1189,7 +1200,7 @@ again: */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); goto again; } spin_unlock(&delayed_refs->lock); @@ -1235,7 +1246,7 @@ again: WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root)); ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, total_refs, sc); + extent_item_pos, total_refs, sc, ignore_offset); if (ret) goto out; @@ -1282,7 +1293,7 @@ again: btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); ret = find_extent_in_eb(eb, bytenr, - *extent_item_pos, &eie); + *extent_item_pos, &eie, ignore_offset); btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); if (ret < 0) @@ -1350,7 +1361,7 @@ static void free_leaf_list(struct ulist *blocks) static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos) + const u64 *extent_item_pos, bool ignore_offset) { int ret; @@ -1359,7 +1370,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - *leafs, NULL, extent_item_pos, NULL); + *leafs, NULL, extent_item_pos, NULL, ignore_offset); if (ret < 0 && ret != -ENOENT) { free_leaf_list(*leafs); return ret; @@ -1383,7 +1394,8 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, */ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) + u64 time_seq, struct ulist **roots, + bool ignore_offset) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1402,7 +1414,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - tmp, *roots, NULL, NULL); + tmp, *roots, NULL, NULL, ignore_offset); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); @@ -1421,14 +1433,15 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) + u64 time_seq, struct ulist **roots, + bool ignore_offset) { int ret; if (!trans) down_read(&fs_info->commit_root_sem); ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, - time_seq, roots); + time_seq, roots, ignore_offset); if (!trans) up_read(&fs_info->commit_root_sem); return ret; @@ -1483,7 +1496,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, - roots, NULL, &shared); + roots, NULL, &shared, false); if (ret == BACKREF_FOUND_SHARED) { /* this is the only condition under which we return 1 */ ret = 1; @@ -1877,7 +1890,8 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, u64 extent_item_pos, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx) + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset) { int ret; struct btrfs_trans_handle *trans = NULL; @@ -1903,14 +1917,15 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, tree_mod_seq_elem.seq, &refs, - &extent_item_pos); + &extent_item_pos, ignore_offset); if (ret) goto out; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - tree_mod_seq_elem.seq, &roots); + tree_mod_seq_elem.seq, &roots, + ignore_offset); if (ret) break; ULIST_ITER_INIT(&root_uiter); @@ -1943,7 +1958,8 @@ out: int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx) + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset) { int ret; u64 extent_item_pos; @@ -1961,7 +1977,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, search_commit_root, - iterate, ctx); + iterate, ctx, ignore_offset); return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e410335841aa..0c2fab8514ff 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -43,17 +43,19 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, u64 extent_offset, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx); + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx); + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset); int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots); + u64 time_seq, struct ulist **roots, bool ignore_offset); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index eccadb5f62a5..63f0ccc92a71 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -36,14 +36,13 @@ #define BTRFS_INODE_ORPHAN_META_RESERVED 1 #define BTRFS_INODE_DUMMY 2 #define BTRFS_INODE_IN_DEFRAG 3 -#define BTRFS_INODE_DELALLOC_META_RESERVED 4 -#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 -#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 -#define BTRFS_INODE_NEEDS_FULL_SYNC 7 -#define BTRFS_INODE_COPY_EVERYTHING 8 -#define BTRFS_INODE_IN_DELALLOC_LIST 9 -#define BTRFS_INODE_READDIO_NEED_LOCK 10 -#define BTRFS_INODE_HAS_PROPS 11 +#define BTRFS_INODE_HAS_ORPHAN_ITEM 4 +#define BTRFS_INODE_HAS_ASYNC_EXTENT 5 +#define BTRFS_INODE_NEEDS_FULL_SYNC 6 +#define BTRFS_INODE_COPY_EVERYTHING 7 +#define BTRFS_INODE_IN_DELALLOC_LIST 8 +#define BTRFS_INODE_READDIO_NEED_LOCK 9 +#define BTRFS_INODE_HAS_PROPS 10 /* in memory btrfs inode */ struct btrfs_inode { @@ -176,7 +175,8 @@ struct btrfs_inode { * of extent items we've reserved metadata for. */ unsigned outstanding_extents; - unsigned reserved_extents; + + struct btrfs_block_rsv block_rsv; /* * Cached values of inode properties @@ -267,6 +267,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) return false; } +static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, + int mod) +{ + lockdep_assert_held(&inode->lock); + inode->outstanding_extents += mod; + if (btrfs_is_free_space_inode(inode)) + return; + trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode), + mod); +} + static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { int ret = 0; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7d5a9b51f0d7..7d51b5a5b505 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -613,7 +613,7 @@ static void btrfsic_dev_state_hashtable_add( struct btrfsic_dev_state_hashtable *h) { const unsigned int hashval = - (((unsigned int)((uintptr_t)ds->bdev)) & + (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); list_add(&ds->collision_resolving_node, h->table + hashval); @@ -2803,7 +2803,7 @@ static void __btrfsic_submit_bio(struct bio *bio) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio_dev(bio)); + dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { unsigned int i = 0; @@ -2913,7 +2913,7 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, state = kvzalloc(sizeof(*state), GFP_KERNEL); if (!state) { pr_info("btrfs check-integrity: allocation failed!\n"); - return -1; + return -ENOMEM; } if (!btrfsic_is_initialized) { @@ -2945,7 +2945,7 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, if (NULL == ds) { pr_info("btrfs check-integrity: kmalloc() failed!\n"); mutex_unlock(&btrfsic_mutex); - return -1; + return -ENOMEM; } ds->bdev = device->bdev; ds->state = state; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 280384bf34f1..5982c8a71f02 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -33,6 +33,8 @@ #include <linux/bit_spinlock.h> #include <linux/slab.h> #include <linux/sched/mm.h> +#include <linux/sort.h> +#include <linux/log2.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -255,7 +257,8 @@ static void end_compressed_bio_write(struct bio *bio) cb->start, cb->start + cb->len - 1, NULL, - bio->bi_status ? 0 : 1); + bio->bi_status ? + BLK_STS_OK : BLK_STS_NOTSUPP); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -292,7 +295,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, - unsigned long nr_pages) + unsigned long nr_pages, + unsigned int write_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio *bio = NULL; @@ -324,7 +328,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = fs_info->fs_devices->latest_bdev; bio = btrfs_bio_alloc(bdev, first_byte); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; refcount_set(&cb->pending_bios, 1); @@ -371,7 +375,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_put(bio); bio = btrfs_bio_alloc(bdev, first_byte); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; bio_add_page(bio, page, PAGE_SIZE, 0); @@ -706,7 +710,86 @@ out: return ret; } -static struct { +/* + * Heuristic uses systematic sampling to collect data from the input data + * range, the logic can be tuned by the following constants: + * + * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample + * @SAMPLING_INTERVAL - range from which the sampled data can be collected + */ +#define SAMPLING_READ_SIZE (16) +#define SAMPLING_INTERVAL (256) + +/* + * For statistical analysis of the input data we consider bytes that form a + * Galois Field of 256 objects. Each object has an attribute count, ie. how + * many times the object appeared in the sample. + */ +#define BUCKET_SIZE (256) + +/* + * The size of the sample is based on a statistical sampling rule of thumb. + * The common way is to perform sampling tests as long as the number of + * elements in each cell is at least 5. + * + * Instead of 5, we choose 32 to obtain more accurate results. + * If the data contain the maximum number of symbols, which is 256, we obtain a + * sample size bound by 8192. + * + * For a sample of at most 8KB of data per data range: 16 consecutive bytes + * from up to 512 locations. + */ +#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \ + SAMPLING_READ_SIZE / SAMPLING_INTERVAL) + +struct bucket_item { + u32 count; +}; + +struct heuristic_ws { + /* Partial copy of input data */ + u8 *sample; + u32 sample_size; + /* Buckets store counters for each byte value */ + struct bucket_item *bucket; + struct list_head list; +}; + +static void free_heuristic_ws(struct list_head *ws) +{ + struct heuristic_ws *workspace; + + workspace = list_entry(ws, struct heuristic_ws, list); + + kvfree(workspace->sample); + kfree(workspace->bucket); + kfree(workspace); +} + +static struct list_head *alloc_heuristic_ws(void) +{ + struct heuristic_ws *ws; + + ws = kzalloc(sizeof(*ws), GFP_KERNEL); + if (!ws) + return ERR_PTR(-ENOMEM); + + ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL); + if (!ws->sample) + goto fail; + + ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL); + if (!ws->bucket) + goto fail; + + INIT_LIST_HEAD(&ws->list); + return &ws->list; +fail: + free_heuristic_ws(&ws->list); + return ERR_PTR(-ENOMEM); +} + +struct workspaces_list { struct list_head idle_ws; spinlock_t ws_lock; /* Number of free workspaces */ @@ -715,7 +798,11 @@ static struct { atomic_t total_ws; /* Waiters for a free workspace */ wait_queue_head_t ws_wait; -} btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; +}; + +static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; + +static struct workspaces_list btrfs_heuristic_ws; static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, @@ -725,11 +812,25 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { void __init btrfs_init_compress(void) { + struct list_head *workspace; int i; - for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - struct list_head *workspace; + INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws); + spin_lock_init(&btrfs_heuristic_ws.ws_lock); + atomic_set(&btrfs_heuristic_ws.total_ws, 0); + init_waitqueue_head(&btrfs_heuristic_ws.ws_wait); + + workspace = alloc_heuristic_ws(); + if (IS_ERR(workspace)) { + pr_warn( + "BTRFS: cannot preallocate heuristic workspace, will try later\n"); + } else { + atomic_set(&btrfs_heuristic_ws.total_ws, 1); + btrfs_heuristic_ws.free_ws = 1; + list_add(workspace, &btrfs_heuristic_ws.idle_ws); + } + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); spin_lock_init(&btrfs_comp_ws[i].ws_lock); atomic_set(&btrfs_comp_ws[i].total_ws, 0); @@ -756,18 +857,32 @@ void __init btrfs_init_compress(void) * Preallocation makes a forward progress guarantees and we do not return * errors. */ -static struct list_head *find_workspace(int type) +static struct list_head *__find_workspace(int type, bool heuristic) { struct list_head *workspace; int cpus = num_online_cpus(); int idx = type - 1; unsigned nofs_flag; + struct list_head *idle_ws; + spinlock_t *ws_lock; + atomic_t *total_ws; + wait_queue_head_t *ws_wait; + int *free_ws; + + if (heuristic) { + idle_ws = &btrfs_heuristic_ws.idle_ws; + ws_lock = &btrfs_heuristic_ws.ws_lock; + total_ws = &btrfs_heuristic_ws.total_ws; + ws_wait = &btrfs_heuristic_ws.ws_wait; + free_ws = &btrfs_heuristic_ws.free_ws; + } else { + idle_ws = &btrfs_comp_ws[idx].idle_ws; + ws_lock = &btrfs_comp_ws[idx].ws_lock; + total_ws = &btrfs_comp_ws[idx].total_ws; + ws_wait = &btrfs_comp_ws[idx].ws_wait; + free_ws = &btrfs_comp_ws[idx].free_ws; + } - struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; - spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; - wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *free_ws = &btrfs_comp_ws[idx].free_ws; again: spin_lock(ws_lock); if (!list_empty(idle_ws)) { @@ -797,7 +912,10 @@ again: * context of btrfs_compress_bio/btrfs_compress_pages */ nofs_flag = memalloc_nofs_save(); - workspace = btrfs_compress_op[idx]->alloc_workspace(); + if (heuristic) + workspace = alloc_heuristic_ws(); + else + workspace = btrfs_compress_op[idx]->alloc_workspace(); memalloc_nofs_restore(nofs_flag); if (IS_ERR(workspace)) { @@ -828,18 +946,38 @@ again: return workspace; } +static struct list_head *find_workspace(int type) +{ + return __find_workspace(type, false); +} + /* * put a workspace struct back on the list or free it if we have enough * idle ones sitting around */ -static void free_workspace(int type, struct list_head *workspace) +static void __free_workspace(int type, struct list_head *workspace, + bool heuristic) { int idx = type - 1; - struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; - spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; - wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *free_ws = &btrfs_comp_ws[idx].free_ws; + struct list_head *idle_ws; + spinlock_t *ws_lock; + atomic_t *total_ws; + wait_queue_head_t *ws_wait; + int *free_ws; + + if (heuristic) { + idle_ws = &btrfs_heuristic_ws.idle_ws; + ws_lock = &btrfs_heuristic_ws.ws_lock; + total_ws = &btrfs_heuristic_ws.total_ws; + ws_wait = &btrfs_heuristic_ws.ws_wait; + free_ws = &btrfs_heuristic_ws.free_ws; + } else { + idle_ws = &btrfs_comp_ws[idx].idle_ws; + ws_lock = &btrfs_comp_ws[idx].ws_lock; + total_ws = &btrfs_comp_ws[idx].total_ws; + ws_wait = &btrfs_comp_ws[idx].ws_wait; + free_ws = &btrfs_comp_ws[idx].free_ws; + } spin_lock(ws_lock); if (*free_ws <= num_online_cpus()) { @@ -850,7 +988,10 @@ static void free_workspace(int type, struct list_head *workspace) } spin_unlock(ws_lock); - btrfs_compress_op[idx]->free_workspace(workspace); + if (heuristic) + free_heuristic_ws(workspace); + else + btrfs_compress_op[idx]->free_workspace(workspace); atomic_dec(total_ws); wake: /* @@ -861,6 +1002,11 @@ wake: wake_up(ws_wait); } +static void free_workspace(int type, struct list_head *ws) +{ + return __free_workspace(type, ws, false); +} + /* * cleanup function for module exit */ @@ -869,6 +1015,13 @@ static void free_workspaces(void) struct list_head *workspace; int i; + while (!list_empty(&btrfs_heuristic_ws.idle_ws)) { + workspace = btrfs_heuristic_ws.idle_ws.next; + list_del(workspace); + free_heuristic_ws(workspace); + atomic_dec(&btrfs_heuristic_ws.total_ws); + } + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { while (!list_empty(&btrfs_comp_ws[i].idle_ws)) { workspace = btrfs_comp_ws[i].idle_ws.next; @@ -883,6 +1036,11 @@ static void free_workspaces(void) * Given an address space and start and length, compress the bytes into @pages * that are allocated on demand. * + * @type_level is encoded algorithm and level, where level 0 means whatever + * default the algorithm chooses and is opaque here; + * - compression algo are 0-3 + * - the level are bits 4-7 + * * @out_pages is an in/out parameter, holds maximum number of pages to allocate * and returns number of actually allocated pages * @@ -897,7 +1055,7 @@ static void free_workspaces(void) * @max_out tells us the max number of bytes that we're allowed to * stuff into pages */ -int btrfs_compress_pages(int type, struct address_space *mapping, +int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, @@ -905,9 +1063,11 @@ int btrfs_compress_pages(int type, struct address_space *mapping, { struct list_head *workspace; int ret; + int type = type_level & 0xF; workspace = find_workspace(type); + btrfs_compress_op[type - 1]->set_level(workspace, type_level); ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, start, pages, out_pages, @@ -1066,6 +1226,211 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, } /* + * Shannon Entropy calculation + * + * Pure byte distribution analysis fails to determine compressiability of data. + * Try calculating entropy to estimate the average minimum number of bits + * needed to encode the sampled data. + * + * For convenience, return the percentage of needed bits, instead of amount of + * bits directly. + * + * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy + * and can be compressible with high probability + * + * @ENTROPY_LVL_HIGH - data are not compressible with high probability + * + * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate. + */ +#define ENTROPY_LVL_ACEPTABLE (65) +#define ENTROPY_LVL_HIGH (80) + +/* + * For increasead precision in shannon_entropy calculation, + * let's do pow(n, M) to save more digits after comma: + * + * - maximum int bit length is 64 + * - ilog2(MAX_SAMPLE_SIZE) -> 13 + * - 13 * 4 = 52 < 64 -> M = 4 + * + * So use pow(n, 4). + */ +static inline u32 ilog2_w(u64 n) +{ + return ilog2(n * n * n * n); +} + +static u32 shannon_entropy(struct heuristic_ws *ws) +{ + const u32 entropy_max = 8 * ilog2_w(2); + u32 entropy_sum = 0; + u32 p, p_base, sz_base; + u32 i; + + sz_base = ilog2_w(ws->sample_size); + for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) { + p = ws->bucket[i].count; + p_base = ilog2_w(p); + entropy_sum += p * (sz_base - p_base); + } + + entropy_sum /= ws->sample_size; + return entropy_sum * 100 / entropy_max; +} + +/* Compare buckets by size, ascending */ +static int bucket_comp_rev(const void *lv, const void *rv) +{ + const struct bucket_item *l = (const struct bucket_item *)lv; + const struct bucket_item *r = (const struct bucket_item *)rv; + + return r->count - l->count; +} + +/* + * Size of the core byte set - how many bytes cover 90% of the sample + * + * There are several types of structured binary data that use nearly all byte + * values. The distribution can be uniform and counts in all buckets will be + * nearly the same (eg. encrypted data). Unlikely to be compressible. + * + * Other possibility is normal (Gaussian) distribution, where the data could + * be potentially compressible, but we have to take a few more steps to decide + * how much. + * + * @BYTE_CORE_SET_LOW - main part of byte values repeated frequently, + * compression algo can easy fix that + * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high + * probability is not compressible + */ +#define BYTE_CORE_SET_LOW (64) +#define BYTE_CORE_SET_HIGH (200) + +static int byte_core_set_size(struct heuristic_ws *ws) +{ + u32 i; + u32 coreset_sum = 0; + const u32 core_set_threshold = ws->sample_size * 90 / 100; + struct bucket_item *bucket = ws->bucket; + + /* Sort in reverse order */ + sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL); + + for (i = 0; i < BYTE_CORE_SET_LOW; i++) + coreset_sum += bucket[i].count; + + if (coreset_sum > core_set_threshold) + return i; + + for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) { + coreset_sum += bucket[i].count; + if (coreset_sum > core_set_threshold) + break; + } + + return i; +} + +/* + * Count byte values in buckets. + * This heuristic can detect textual data (configs, xml, json, html, etc). + * Because in most text-like data byte set is restricted to limited number of + * possible characters, and that restriction in most cases makes data easy to + * compress. + * + * @BYTE_SET_THRESHOLD - consider all data within this byte set size: + * less - compressible + * more - need additional analysis + */ +#define BYTE_SET_THRESHOLD (64) + +static u32 byte_set_size(const struct heuristic_ws *ws) +{ + u32 i; + u32 byte_set_size = 0; + + for (i = 0; i < BYTE_SET_THRESHOLD; i++) { + if (ws->bucket[i].count > 0) + byte_set_size++; + } + + /* + * Continue collecting count of byte values in buckets. If the byte + * set size is bigger then the threshold, it's pointless to continue, + * the detection technique would fail for this type of data. + */ + for (; i < BUCKET_SIZE; i++) { + if (ws->bucket[i].count > 0) { + byte_set_size++; + if (byte_set_size > BYTE_SET_THRESHOLD) + return byte_set_size; + } + } + + return byte_set_size; +} + +static bool sample_repeated_patterns(struct heuristic_ws *ws) +{ + const u32 half_of_sample = ws->sample_size / 2; + const u8 *data = ws->sample; + + return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0; +} + +static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, + struct heuristic_ws *ws) +{ + struct page *page; + u64 index, index_end; + u32 i, curr_sample_pos; + u8 *in_data; + + /* + * Compression handles the input data by chunks of 128KiB + * (defined by BTRFS_MAX_UNCOMPRESSED) + * + * We do the same for the heuristic and loop over the whole range. + * + * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will + * process no more than BTRFS_MAX_UNCOMPRESSED at a time. + */ + if (end - start > BTRFS_MAX_UNCOMPRESSED) + end = start + BTRFS_MAX_UNCOMPRESSED; + + index = start >> PAGE_SHIFT; + index_end = end >> PAGE_SHIFT; + + /* Don't miss unaligned end */ + if (!IS_ALIGNED(end, PAGE_SIZE)) + index_end++; + + curr_sample_pos = 0; + while (index < index_end) { + page = find_get_page(inode->i_mapping, index); + in_data = kmap(page); + /* Handle case where the start is not aligned to PAGE_SIZE */ + i = start % PAGE_SIZE; + while (i < PAGE_SIZE - SAMPLING_READ_SIZE) { + /* Don't sample any garbage from the last page */ + if (start > end - SAMPLING_READ_SIZE) + break; + memcpy(&ws->sample[curr_sample_pos], &in_data[i], + SAMPLING_READ_SIZE); + i += SAMPLING_INTERVAL; + start += SAMPLING_INTERVAL; + curr_sample_pos += SAMPLING_READ_SIZE; + } + kunmap(page); + put_page(page); + + index++; + } + + ws->sample_size = curr_sample_pos; +} + +/* * Compression heuristic. * * For now is's a naive and optimistic 'return true', we'll extend the logic to @@ -1082,18 +1447,87 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, */ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) { - u64 index = start >> PAGE_SHIFT; - u64 end_index = end >> PAGE_SHIFT; - struct page *page; - int ret = 1; + struct list_head *ws_list = __find_workspace(0, true); + struct heuristic_ws *ws; + u32 i; + u8 byte; + int ret = 0; - while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); - kmap(page); - kunmap(page); - put_page(page); - index++; + ws = list_entry(ws_list, struct heuristic_ws, list); + + heuristic_collect_sample(inode, start, end, ws); + + if (sample_repeated_patterns(ws)) { + ret = 1; + goto out; + } + + memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE); + + for (i = 0; i < ws->sample_size; i++) { + byte = ws->sample[i]; + ws->bucket[byte].count++; + } + + i = byte_set_size(ws); + if (i < BYTE_SET_THRESHOLD) { + ret = 2; + goto out; + } + + i = byte_core_set_size(ws); + if (i <= BYTE_CORE_SET_LOW) { + ret = 3; + goto out; + } + + if (i >= BYTE_CORE_SET_HIGH) { + ret = 0; + goto out; } + i = shannon_entropy(ws); + if (i <= ENTROPY_LVL_ACEPTABLE) { + ret = 4; + goto out; + } + + /* + * For the levels below ENTROPY_LVL_HIGH, additional analysis would be + * needed to give green light to compression. + * + * For now just assume that compression at that level is not worth the + * resources because: + * + * 1. it is possible to defrag the data later + * + * 2. the data would turn out to be hardly compressible, eg. 150 byte + * values, every bucket has counter at level ~54. The heuristic would + * be confused. This can happen when data have some internal repeated + * patterns like "abbacbbc...". This can be detected by analyzing + * pairs of bytes, which is too costly. + */ + if (i < ENTROPY_LVL_HIGH) { + ret = 5; + goto out; + } else { + ret = 0; + goto out; + } + +out: + __free_workspace(0, ws_list, true); return ret; } + +unsigned int btrfs_compress_str2level(const char *str) +{ + if (strncmp(str, "zlib", 4) != 0) + return 0; + + /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */ + if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0) + return str[5] - '0'; + + return BTRFS_ZLIB_DEFAULT_LEVEL; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index d2781ff8f994..0868cc554f14 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -34,6 +34,8 @@ /* Maximum size of data before compression */ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) +#define BTRFS_ZLIB_DEFAULT_LEVEL 3 + struct compressed_bio { /* number of bios pending for this compressed extent */ refcount_t pending_bios; @@ -76,7 +78,7 @@ struct compressed_bio { void btrfs_init_compress(void); void btrfs_exit_compress(void); -int btrfs_compress_pages(int type, struct address_space *mapping, +int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, @@ -91,10 +93,13 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, - unsigned long nr_pages); + unsigned long nr_pages, + unsigned int write_flags); blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); +unsigned btrfs_compress_str2level(const char *str); + enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, @@ -124,6 +129,8 @@ struct btrfs_compress_op { struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); + + void (*set_level)(struct list_head *ws, unsigned int type); }; extern const struct btrfs_compress_op btrfs_zlib_compress; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6d49db7d86be..1e74cf826532 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -192,7 +192,7 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) * tree until you end up with a lock on the root. A locked buffer * is returned, with a reference held. */ -static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) { struct extent_buffer *eb; @@ -1032,14 +1032,17 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { ret = btrfs_inc_ref(trans, root, buf, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_dec_ref(trans, root, buf, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; ret = btrfs_inc_ref(trans, root, cow, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; } else { @@ -1049,7 +1052,8 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } if (new_flags != 0) { int level = btrfs_header_level(buf); @@ -1068,9 +1072,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; ret = btrfs_dec_ref(trans, root, buf, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } clean_tree_block(fs_info, buf); *last_ref = 1; @@ -5496,8 +5502,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } else if (left_end_reached) { if (right_level == 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, ctx); @@ -5508,8 +5513,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, continue; } else if (right_end_reached) { if (left_level == 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, ctx); @@ -5523,8 +5527,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, if (left_level == 0 && right_level == 0) { cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, ctx); @@ -5532,8 +5535,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; advance_left = ADVANCE; } else if (cmp > 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, ctx); @@ -5550,8 +5552,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, result = BTRFS_COMPARE_TREE_CHANGED; else result = BTRFS_COMPARE_TREE_SAME; - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &left_key, result, ctx); if (ret < 0) goto out; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8fc690384c58..13c260b525a1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -523,7 +523,7 @@ struct btrfs_caching_control { }; /* Once caching_thread() finds this much free space, it will wake up waiters. */ -#define CACHING_CTL_WAKE_UP (1024 * 1024 * 2) +#define CACHING_CTL_WAKE_UP SZ_2M struct btrfs_io_ctl { void *cur, *orig; @@ -763,8 +763,6 @@ struct btrfs_fs_info { * delayed dir index item */ struct btrfs_block_rsv global_block_rsv; - /* block reservation for delay allocation */ - struct btrfs_block_rsv delalloc_block_rsv; /* block reservation for metadata operations */ struct btrfs_block_rsv trans_block_rsv; /* block reservation for chunk tree */ @@ -790,6 +788,7 @@ struct btrfs_fs_info { */ unsigned long pending_changes; unsigned long compress_type:4; + unsigned int compress_level; int commit_interval; /* * It is a suggestive number, the read side is safe even it gets a @@ -878,9 +877,6 @@ struct btrfs_fs_info { rwlock_t tree_mod_log_lock; struct rb_root tree_mod_log; - atomic_t nr_async_submits; - atomic_t async_submit_draining; - atomic_t nr_async_bios; atomic_t async_delalloc_pages; atomic_t open_ioctl_trans; @@ -1100,6 +1096,11 @@ struct btrfs_fs_info { u32 nodesize; u32 sectorsize; u32 stripesize; + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + spinlock_t ref_verify_lock; + struct rb_root block_tree; +#endif }; static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) @@ -1338,6 +1339,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) #define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) #define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) +#define BTRFS_MOUNT_REF_VERIFY (1 << 28) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) @@ -2639,7 +2641,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref); int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - u64 root_objectid, u64 owner, + struct btrfs_root *root, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins); int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, @@ -2658,7 +2660,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 flags, int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset); @@ -2670,7 +2672,7 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset); @@ -2744,6 +2746,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, u64 *qgroup_reserved, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); + int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, @@ -2751,6 +2755,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode, void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + unsigned short type); void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv); @@ -2809,6 +2816,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); @@ -2821,9 +2829,7 @@ enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_CHANGED, BTRFS_COMPARE_TREE_SAME, }; -typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root, - struct btrfs_root *right_root, - struct btrfs_path *left_path, +typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, struct btrfs_path *right_path, struct btrfs_key *key, enum btrfs_compare_tree_result result, @@ -2951,7 +2957,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) */ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) { - return fs_info->sb->s_flags & MS_RDONLY || btrfs_fs_closing(fs_info); + return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info); } static inline void free_fs_info(struct btrfs_fs_info *fs_info) @@ -3174,6 +3180,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + unsigned int extra_bits, struct extent_state **cached_state, int dedupe); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 19e4ad2f3f2e..a6226cd6063c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -87,6 +87,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( spin_lock(&root->inode_lock); node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + if (node) { if (btrfs_inode->delayed_node) { refcount_inc(&node->refs); /* can be accessed */ @@ -94,9 +95,30 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( spin_unlock(&root->inode_lock); return node; } - btrfs_inode->delayed_node = node; - /* can be accessed and cached in the inode */ - refcount_add(2, &node->refs); + + /* + * It's possible that we're racing into the middle of removing + * this node from the radix tree. In this case, the refcount + * was zero and it should never go back to one. Just return + * NULL like it was never in the radix at all; our release + * function is in the process of removing it. + * + * Some implementations of refcount_inc refuse to bump the + * refcount once it has hit zero. If we don't do this dance + * here, refcount_inc() may decide to just WARN_ONCE() instead + * of actually bumping the refcount. + * + * If this node is properly in the radix, we want to bump the + * refcount twice, once for the inode and once for this get + * operation. + */ + if (refcount_inc_not_zero(&node->refs)) { + refcount_inc(&node->refs); + btrfs_inode->delayed_node = node; + } else { + node = NULL; + } + spin_unlock(&root->inode_lock); return node; } @@ -254,17 +276,18 @@ static void __btrfs_release_delayed_node( mutex_unlock(&delayed_node->mutex); if (refcount_dec_and_test(&delayed_node->refs)) { - bool free = false; struct btrfs_root *root = delayed_node->root; + spin_lock(&root->inode_lock); - if (refcount_read(&delayed_node->refs) == 0) { - radix_tree_delete(&root->delayed_nodes_tree, - delayed_node->inode_id); - free = true; - } + /* + * Once our refcount goes to zero, nobody is allowed to bump it + * back up. We can delete it now. + */ + ASSERT(refcount_read(&delayed_node->refs) == 0); + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); spin_unlock(&root->inode_lock); - if (free) - kmem_cache_free(delayed_node_cache, delayed_node); + kmem_cache_free(delayed_node_cache, delayed_node); } } @@ -581,7 +604,6 @@ static int btrfs_delayed_inode_reserve_metadata( struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; - bool release = false; src_rsv = trans->block_rsv; dst_rsv = &fs_info->delayed_block_rsv; @@ -589,36 +611,13 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); /* - * If our block_rsv is the delalloc block reserve then check and see if - * we have our extra reservation for updating the inode. If not fall - * through and try to reserve space quickly. - * - * We used to try and steal from the delalloc block rsv or the global - * reserve, but we'd steal a full reservation, which isn't kind. We are - * here through delalloc which means we've likely just cowed down close - * to the leaf that contains the inode, so we would steal less just - * doing the fallback inode update, so if we do end up having to steal - * from the global block rsv we hopefully only steal one or two blocks - * worth which is less likely to hurt us. - */ - if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { - spin_lock(&inode->lock); - if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &inode->runtime_flags)) - release = true; - else - src_rsv = NULL; - spin_unlock(&inode->lock); - } - - /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction * which doesn't reserve space for speed. This is a problem since we * still need to reserve space for this update, so try to reserve the * space. * * Now if src_rsv == delalloc_block_rsv we'll let it just steal since - * we're accounted for. + * we always reserve enough to update the inode item. */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { @@ -643,32 +642,12 @@ static int btrfs_delayed_inode_reserve_metadata( } ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); - - /* - * Migrate only takes a reservation, it doesn't touch the size of the - * block_rsv. This is to simplify people who don't normally have things - * migrated from their block rsv. If they go to release their - * reservation, that will decrease the size as well, so if migrate - * reduced size we'd end up with a negative size. But for the - * delalloc_meta_reserved stuff we will only know to drop 1 reservation, - * but we could in fact do this reserve/migrate dance several times - * between the time we did the original reservation and we'd clean it - * up. So to take care of this, release the space for the meta - * reservation here. I think it may be time for a documentation page on - * how block rsvs. work. - */ if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_inode", btrfs_ino(inode), num_bytes, 1); node->bytes_reserved = num_bytes; } - if (release) { - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), num_bytes, 0); - btrfs_block_rsv_release(fs_info, src_rsv, num_bytes); - } - return ret; } @@ -1654,28 +1633,18 @@ void btrfs_readdir_put_delayed_items(struct inode *inode, int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index) { - struct btrfs_delayed_item *curr, *next; - int ret; - - if (list_empty(del_list)) - return 0; + struct btrfs_delayed_item *curr; + int ret = 0; - list_for_each_entry_safe(curr, next, del_list, readdir_list) { + list_for_each_entry(curr, del_list, readdir_list) { if (curr->key.offset > index) break; - - list_del(&curr->readdir_list); - ret = (curr->key.offset == index); - - if (refcount_dec_and_test(&curr->refs)) - kfree(curr); - - if (ret) - return 1; - else - continue; + if (curr->key.offset == index) { + ret = 1; + break; + } } - return 0; + return ret; } /* diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 93ffa898df6d..83be8f9fd906 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -40,10 +40,10 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * compare two delayed tree backrefs with same bytenr and type */ -static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, - struct btrfs_delayed_tree_ref *ref1, int type) +static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1, + struct btrfs_delayed_tree_ref *ref2) { - if (type == BTRFS_TREE_BLOCK_REF_KEY) { + if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { if (ref1->root < ref2->root) return -1; if (ref1->root > ref2->root) @@ -60,8 +60,8 @@ static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, /* * compare two delayed data backrefs with same bytenr and type */ -static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, - struct btrfs_delayed_data_ref *ref1) +static int comp_data_refs(struct btrfs_delayed_data_ref *ref1, + struct btrfs_delayed_data_ref *ref2) { if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { if (ref1->root < ref2->root) @@ -85,6 +85,34 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, return 0; } +static int comp_refs(struct btrfs_delayed_ref_node *ref1, + struct btrfs_delayed_ref_node *ref2, + bool check_seq) +{ + int ret = 0; + + if (ref1->type < ref2->type) + return -1; + if (ref1->type > ref2->type) + return 1; + if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) + ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1), + btrfs_delayed_node_to_tree_ref(ref2)); + else + ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1), + btrfs_delayed_node_to_data_ref(ref2)); + if (ret) + return ret; + if (check_seq) { + if (ref1->seq < ref2->seq) + return -1; + if (ref1->seq > ref2->seq) + return 1; + } + return 0; +} + /* insert a new ref to head ref rbtree */ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, struct rb_node *node) @@ -96,15 +124,43 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, u64 bytenr; ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); - bytenr = ins->node.bytenr; + bytenr = ins->bytenr; while (*p) { parent_node = *p; entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, href_node); - if (bytenr < entry->node.bytenr) + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + +static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root, + struct btrfs_delayed_ref_node *ins) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *node = &ins->ref_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_node *entry; + + while (*p) { + int comp; + + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, + ref_node); + comp = comp_refs(ins, entry, true); + if (comp < 0) p = &(*p)->rb_left; - else if (bytenr > entry->node.bytenr) + else if (comp > 0) p = &(*p)->rb_right; else return entry; @@ -133,15 +189,15 @@ find_ref_head(struct rb_root *root, u64 bytenr, while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - if (bytenr < entry->node.bytenr) + if (bytenr < entry->bytenr) n = n->rb_left; - else if (bytenr > entry->node.bytenr) + else if (bytenr > entry->bytenr) n = n->rb_right; else return entry; } if (entry && return_bigger) { - if (bytenr > entry->node.bytenr) { + if (bytenr > entry->bytenr) { n = rb_next(&entry->href_node); if (!n) n = rb_first(root); @@ -164,17 +220,17 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, if (mutex_trylock(&head->mutex)) return 0; - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); spin_lock(&delayed_refs->lock); - if (!head->node.in_tree) { + if (RB_EMPTY_NODE(&head->href_node)) { mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return -EAGAIN; } - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return 0; } @@ -183,15 +239,11 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { - if (btrfs_delayed_ref_is_head(ref)) { - head = btrfs_delayed_node_to_head(ref); - rb_erase(&head->href_node, &delayed_refs->href_root); - } else { - assert_spin_locked(&head->lock); - list_del(&ref->list); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - } + assert_spin_locked(&head->lock); + rb_erase(&ref->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); @@ -206,36 +258,18 @@ static bool merge_ref(struct btrfs_trans_handle *trans, u64 seq) { struct btrfs_delayed_ref_node *next; + struct rb_node *node = rb_next(&ref->ref_node); bool done = false; - next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, - list); - while (!done && &next->list != &head->ref_list) { + while (!done && node) { int mod; - struct btrfs_delayed_ref_node *next2; - - next2 = list_next_entry(next, list); - - if (next == ref) - goto next; + next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); + node = rb_next(node); if (seq && next->seq >= seq) - goto next; - - if (next->type != ref->type) - goto next; - - if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY || - ref->type == BTRFS_SHARED_BLOCK_REF_KEY) && - comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref), - btrfs_delayed_node_to_tree_ref(next), - ref->type)) - goto next; - if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY || - ref->type == BTRFS_SHARED_DATA_REF_KEY) && - comp_data_refs(btrfs_delayed_node_to_data_ref(ref), - btrfs_delayed_node_to_data_ref(next))) - goto next; + break; + if (comp_refs(ref, next, false)) + break; if (ref->action == next->action) { mod = next->ref_mod; @@ -259,8 +293,6 @@ static bool merge_ref(struct btrfs_trans_handle *trans, WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || ref->type == BTRFS_SHARED_BLOCK_REF_KEY); } -next: - next = next2; } return done; @@ -272,11 +304,12 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; + struct rb_node *node; u64 seq = 0; assert_spin_locked(&head->lock); - if (list_empty(&head->ref_list)) + if (RB_EMPTY_ROOT(&head->ref_tree)) return; /* We don't have too many refs to merge for data. */ @@ -293,22 +326,13 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, } spin_unlock(&fs_info->tree_mod_seq_lock); - ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, - list); - while (&ref->list != &head->ref_list) { +again: + for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) - goto next; - - if (merge_ref(trans, delayed_refs, head, ref, seq)) { - if (list_empty(&head->ref_list)) - break; - ref = list_first_entry(&head->ref_list, - struct btrfs_delayed_ref_node, - list); continue; - } -next: - ref = list_next_entry(ref, list); + if (merge_ref(trans, delayed_refs, head, ref, seq)) + goto again; } } @@ -380,8 +404,8 @@ again: head->processing = 1; WARN_ON(delayed_refs->num_heads_ready == 0); delayed_refs->num_heads_ready--; - delayed_refs->run_delayed_start = head->node.bytenr + - head->node.num_bytes; + delayed_refs->run_delayed_start = head->bytenr + + head->num_bytes; return head; } @@ -391,37 +415,19 @@ again: * Return 0 for insert. * Return >0 for merge. */ -static int -add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *root, - struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *ref) +static int insert_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) { struct btrfs_delayed_ref_node *exist; int mod; int ret = 0; spin_lock(&href->lock); - /* Check whether we can merge the tail node with ref */ - if (list_empty(&href->ref_list)) - goto add_tail; - exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, - list); - /* No need to compare bytenr nor is_head */ - if (exist->type != ref->type || exist->seq != ref->seq) - goto add_tail; - - if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || - exist->type == BTRFS_SHARED_BLOCK_REF_KEY) && - comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist), - btrfs_delayed_node_to_tree_ref(ref), - ref->type)) - goto add_tail; - if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY || - exist->type == BTRFS_SHARED_DATA_REF_KEY) && - comp_data_refs(btrfs_delayed_node_to_data_ref(exist), - btrfs_delayed_node_to_data_ref(ref))) - goto add_tail; + exist = tree_insert(&href->ref_tree, ref); + if (!exist) + goto inserted; /* Now we are sure we can merge */ ret = 1; @@ -452,9 +458,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, drop_delayed_ref(trans, root, href, exist); spin_unlock(&href->lock); return ret; - -add_tail: - list_add_tail(&ref->list, &href->ref_list); +inserted: if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries); @@ -469,20 +473,16 @@ add_tail: */ static noinline void update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update, + struct btrfs_delayed_ref_head *existing, + struct btrfs_delayed_ref_head *update, int *old_ref_mod_ret) { - struct btrfs_delayed_ref_head *existing_ref; - struct btrfs_delayed_ref_head *ref; int old_ref_mod; - existing_ref = btrfs_delayed_node_to_head(existing); - ref = btrfs_delayed_node_to_head(update); - BUG_ON(existing_ref->is_data != ref->is_data); + BUG_ON(existing->is_data != update->is_data); - spin_lock(&existing_ref->lock); - if (ref->must_insert_reserved) { + spin_lock(&existing->lock); + if (update->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref * entries were processed, we can end up @@ -490,7 +490,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * the must_insert_reserved flag set. * Set it again here */ - existing_ref->must_insert_reserved = ref->must_insert_reserved; + existing->must_insert_reserved = update->must_insert_reserved; /* * update the num_bytes so we make sure the accounting @@ -500,22 +500,22 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, } - if (ref->extent_op) { - if (!existing_ref->extent_op) { - existing_ref->extent_op = ref->extent_op; + if (update->extent_op) { + if (!existing->extent_op) { + existing->extent_op = update->extent_op; } else { - if (ref->extent_op->update_key) { - memcpy(&existing_ref->extent_op->key, - &ref->extent_op->key, - sizeof(ref->extent_op->key)); - existing_ref->extent_op->update_key = true; + if (update->extent_op->update_key) { + memcpy(&existing->extent_op->key, + &update->extent_op->key, + sizeof(update->extent_op->key)); + existing->extent_op->update_key = true; } - if (ref->extent_op->update_flags) { - existing_ref->extent_op->flags_to_set |= - ref->extent_op->flags_to_set; - existing_ref->extent_op->update_flags = true; + if (update->extent_op->update_flags) { + existing->extent_op->flags_to_set |= + update->extent_op->flags_to_set; + existing->extent_op->update_flags = true; } - btrfs_free_delayed_extent_op(ref->extent_op); + btrfs_free_delayed_extent_op(update->extent_op); } } /* @@ -523,23 +523,23 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ - old_ref_mod = existing_ref->total_ref_mod; + old_ref_mod = existing->total_ref_mod; if (old_ref_mod_ret) *old_ref_mod_ret = old_ref_mod; existing->ref_mod += update->ref_mod; - existing_ref->total_ref_mod += update->ref_mod; + existing->total_ref_mod += update->ref_mod; /* * If we are going to from a positive ref mod to a negative or vice * versa we need to make sure to adjust pending_csums accordingly. */ - if (existing_ref->is_data) { - if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0) + if (existing->is_data) { + if (existing->total_ref_mod >= 0 && old_ref_mod < 0) delayed_refs->pending_csums -= existing->num_bytes; - if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0) + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) delayed_refs->pending_csums += existing->num_bytes; } - spin_unlock(&existing_ref->lock); + spin_unlock(&existing->lock); } /* @@ -550,14 +550,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, + struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, int action, int is_data, int *qrecord_inserted_ret, int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_ref_head *existing; - struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; int must_insert_reserved = 0; @@ -593,26 +592,21 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; - /* first set the basic ref node struct up */ - refcount_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->num_bytes = num_bytes; - ref->ref_mod = count_mod; - ref->type = 0; - ref->action = 0; - ref->is_head = 1; - ref->in_tree = 1; - ref->seq = 0; - - head_ref = btrfs_delayed_node_to_head(ref); + refcount_set(&head_ref->refs, 1); + head_ref->bytenr = bytenr; + head_ref->num_bytes = num_bytes; + head_ref->ref_mod = count_mod; head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; - INIT_LIST_HEAD(&head_ref->ref_list); + head_ref->ref_tree = RB_ROOT; INIT_LIST_HEAD(&head_ref->ref_add_list); + RB_CLEAR_NODE(&head_ref->href_node); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; head_ref->qgroup_reserved = 0; head_ref->qgroup_ref_root = 0; + spin_lock_init(&head_ref->lock); + mutex_init(&head_ref->mutex); /* Record qgroup extent info if provided */ if (qrecord) { @@ -632,17 +626,14 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord_inserted = 1; } - spin_lock_init(&head_ref->lock); - mutex_init(&head_ref->mutex); - - trace_add_delayed_ref_head(fs_info, ref, head_ref, action); + trace_add_delayed_ref_head(fs_info, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { WARN_ON(ref_root && reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, &existing->node, ref, + update_existing_head_ref(delayed_refs, existing, head_ref, old_ref_mod); /* * we've updated the existing ref, free the newly @@ -699,7 +690,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; ref->seq = seq; - INIT_LIST_HEAD(&ref->list); + RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -713,7 +704,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_tree_ref(fs_info, ref, full_ref, action); - ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref); /* * XXX: memory should be freed at the same level allocated. @@ -756,7 +747,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; ref->seq = seq; - INIT_LIST_HEAD(&ref->list); + RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -772,8 +763,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_data_ref(fs_info, ref, full_ref, action); - ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); - + ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref); if (ret > 0) kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } @@ -821,7 +811,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, + head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, 0, 0, action, 0, &qrecord_inserted, old_ref_mod, new_ref_mod); @@ -888,7 +878,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, + head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, ref_root, reserved, action, 1, &qrecord_inserted, old_ref_mod, new_ref_mod); @@ -920,7 +910,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, + add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr, num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data, NULL, NULL, NULL); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index ce88e4ac5276..a43af432f859 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -26,18 +26,8 @@ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ -/* - * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the - * same ref_node structure. - * Ref_head is in a higher logic level than tree/data ref, and duplicated - * bytenr/num_bytes in ref_node is really a waste or memory, they should be - * referred from ref_head. - * This gets more disgusting after we use list to store tree/data ref in - * ref_head. Must clean this mess up later. - */ struct btrfs_delayed_ref_node { - /*data/tree ref use list, stored in ref_head->ref_list. */ - struct list_head list; + struct rb_node ref_node; /* * If action is BTRFS_ADD_DELAYED_REF, also link this node to * ref_head->ref_add_list, then we do not need to iterate the @@ -91,8 +81,9 @@ struct btrfs_delayed_extent_op { * reference count modifications we've queued up. */ struct btrfs_delayed_ref_head { - struct btrfs_delayed_ref_node node; - + u64 bytenr; + u64 num_bytes; + refcount_t refs; /* * the mutex is held while running the refs, and it is also * held when checking the sum of reference modifications. @@ -100,7 +91,7 @@ struct btrfs_delayed_ref_head { struct mutex mutex; spinlock_t lock; - struct list_head ref_list; + struct rb_root ref_tree; /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */ struct list_head ref_add_list; @@ -116,6 +107,14 @@ struct btrfs_delayed_ref_head { int total_ref_mod; /* + * This is the current outstanding mod references for this bytenr. This + * is used with lookup_extent_info to get an accurate reference count + * for a bytenr, so it is adjusted as delayed refs are run so that any + * on disk reference count + ref_mod is accurate. + */ + int ref_mod; + + /* * For qgroup reserved space freeing. * * ref_root and reserved will be recorded after @@ -234,15 +233,18 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) case BTRFS_SHARED_DATA_REF_KEY: kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); break; - case 0: - kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); - break; default: BUG(); } } } +static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head) +{ + if (refcount_dec_and_test(&head->refs)) + kmem_cache_free(btrfs_delayed_ref_head_cachep, head); +} + int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, @@ -283,35 +285,17 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); /* - * a node might live in a head or a regular ref, this lets you - * test for the proper type to use. - */ -static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) -{ - return node->is_head; -} - -/* * helper functions to cast a node into its container */ static inline struct btrfs_delayed_tree_ref * btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) { - WARN_ON(btrfs_delayed_ref_is_head(node)); return container_of(node, struct btrfs_delayed_tree_ref, node); } static inline struct btrfs_delayed_data_ref * btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) { - WARN_ON(btrfs_delayed_ref_is_head(node)); return container_of(node, struct btrfs_delayed_data_ref, node); } - -static inline struct btrfs_delayed_ref_head * -btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) -{ - WARN_ON(!btrfs_delayed_ref_is_head(node)); - return container_of(node, struct btrfs_delayed_ref_head, node); -} #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dfdab849037b..a8ecccfc36de 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,6 +50,8 @@ #include "sysfs.h" #include "qgroup.h" #include "compression.h" +#include "tree-checker.h" +#include "ref-verify.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -543,146 +545,6 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, return ret; } -#define CORRUPT(reason, eb, root, slot) \ - btrfs_crit(root->fs_info, \ - "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \ - btrfs_header_level(eb) == 0 ? "leaf" : "node", \ - reason, btrfs_header_bytenr(eb), root->objectid, slot) - -static noinline int check_leaf(struct btrfs_root *root, - struct extent_buffer *leaf) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_key key; - struct btrfs_key leaf_key; - u32 nritems = btrfs_header_nritems(leaf); - int slot; - - /* - * Extent buffers from a relocation tree have a owner field that - * corresponds to the subvolume tree they are based on. So just from an - * extent buffer alone we can not find out what is the id of the - * corresponding subvolume tree, so we can not figure out if the extent - * buffer corresponds to the root of the relocation tree or not. So skip - * this check for relocation trees. - */ - if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { - struct btrfs_root *check_root; - - key.objectid = btrfs_header_owner(leaf); - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - check_root = btrfs_get_fs_root(fs_info, &key, false); - /* - * The only reason we also check NULL here is that during - * open_ctree() some roots has not yet been set up. - */ - if (!IS_ERR_OR_NULL(check_root)) { - struct extent_buffer *eb; - - eb = btrfs_root_node(check_root); - /* if leaf is the root, then it's fine */ - if (leaf != eb) { - CORRUPT("non-root leaf's nritems is 0", - leaf, check_root, 0); - free_extent_buffer(eb); - return -EIO; - } - free_extent_buffer(eb); - } - return 0; - } - - if (nritems == 0) - return 0; - - /* Check the 0 item */ - if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != - BTRFS_LEAF_DATA_SIZE(fs_info)) { - CORRUPT("invalid item offset size pair", leaf, root, 0); - return -EIO; - } - - /* - * Check to make sure each items keys are in the correct order and their - * offsets make sense. We only have to loop through nritems-1 because - * we check the current slot against the next slot, which verifies the - * next slot's offset+size makes sense and that the current's slot - * offset is correct. - */ - for (slot = 0; slot < nritems - 1; slot++) { - btrfs_item_key_to_cpu(leaf, &leaf_key, slot); - btrfs_item_key_to_cpu(leaf, &key, slot + 1); - - /* Make sure the keys are in the right order */ - if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { - CORRUPT("bad key order", leaf, root, slot); - return -EIO; - } - - /* - * Make sure the offset and ends are right, remember that the - * item data starts at the end of the leaf and grows towards the - * front. - */ - if (btrfs_item_offset_nr(leaf, slot) != - btrfs_item_end_nr(leaf, slot + 1)) { - CORRUPT("slot offset bad", leaf, root, slot); - return -EIO; - } - - /* - * Check to make sure that we don't point outside of the leaf, - * just in case all the items are consistent to each other, but - * all point outside of the leaf. - */ - if (btrfs_item_end_nr(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(fs_info)) { - CORRUPT("slot end outside of leaf", leaf, root, slot); - return -EIO; - } - } - - return 0; -} - -static int check_node(struct btrfs_root *root, struct extent_buffer *node) -{ - unsigned long nr = btrfs_header_nritems(node); - struct btrfs_key key, next_key; - int slot; - u64 bytenr; - int ret = 0; - - if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { - btrfs_crit(root->fs_info, - "corrupt node: block %llu root %llu nritems %lu", - node->start, root->objectid, nr); - return -EIO; - } - - for (slot = 0; slot < nr - 1; slot++) { - bytenr = btrfs_node_blockptr(node, slot); - btrfs_node_key_to_cpu(node, &key, slot); - btrfs_node_key_to_cpu(node, &next_key, slot + 1); - - if (!bytenr) { - CORRUPT("invalid item slot", node, root, slot); - ret = -EIO; - goto out; - } - - if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { - CORRUPT("bad key order", node, root, slot); - ret = -EIO; - goto out; - } - } -out: - return ret; -} - static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) @@ -748,12 +610,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && check_leaf(root, eb)) { + if (found_level == 0 && btrfs_check_leaf_full(root, eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } - if (found_level > 0 && check_node(root, eb)) + if (found_level > 0 && btrfs_check_node(root, eb)) ret = -EIO; if (!ret) @@ -879,22 +741,9 @@ static void run_one_async_start(struct btrfs_work *work) static void run_one_async_done(struct btrfs_work *work) { - struct btrfs_fs_info *fs_info; struct async_submit_bio *async; - int limit; async = container_of(work, struct async_submit_bio, work); - fs_info = async->fs_info; - - limit = btrfs_async_submit_limit(fs_info); - limit = limit * 2 / 3; - - /* - * atomic_dec_return implies a barrier for waitqueue_active - */ - if (atomic_dec_return(&fs_info->nr_async_submits) < limit && - waitqueue_active(&fs_info->async_submit_wait)) - wake_up(&fs_info->async_submit_wait); /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { @@ -942,19 +791,10 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, async->status = 0; - atomic_inc(&fs_info->nr_async_submits); - if (op_is_sync(bio->bi_opf)) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); - - while (atomic_read(&fs_info->async_submit_draining) && - atomic_read(&fs_info->nr_async_submits)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0)); - } - return 0; } @@ -1005,9 +845,9 @@ static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, return ret; } -static int check_async_write(unsigned long bio_flags) +static int check_async_write(struct btrfs_inode *bi) { - if (bio_flags & EXTENT_BIO_TREE_LOG) + if (atomic_read(&bi->sync_writers)) return 0; #ifdef CONFIG_X86 if (static_cpu_has(X86_FEATURE_XMM4_2)) @@ -1022,7 +862,7 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, { struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int async = check_async_write(bio_flags); + int async = check_async_write(BTRFS_I(inode)); blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { @@ -2607,14 +2447,6 @@ int open_ctree(struct super_block *sb, goto fail_delalloc_bytes; } - fs_info->btree_inode = new_inode(sb); - if (!fs_info->btree_inode) { - err = -ENOMEM; - goto fail_bio_counter; - } - - mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); @@ -2647,17 +2479,12 @@ int open_ctree(struct super_block *sb, btrfs_mapping_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); - btrfs_init_block_rsv(&fs_info->delalloc_block_rsv, - BTRFS_BLOCK_RSV_DELALLOC); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); - atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); - atomic_set(&fs_info->async_submit_draining, 0); - atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); @@ -2673,12 +2500,21 @@ int open_ctree(struct super_block *sb, /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); spin_lock_init(&fs_info->reada_lock); + btrfs_init_ref_verify(fs_info); fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); INIT_LIST_HEAD(&fs_info->ordered_roots); spin_lock_init(&fs_info->ordered_root_lock); + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail_bio_counter; + } + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), GFP_KERNEL); if (!fs_info->delayed_root) { @@ -2895,12 +2731,13 @@ int open_ctree(struct super_block *sb, sb->s_bdi->congested_fn = btrfs_congested_fn; sb->s_bdi->congested_data = fs_info; sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE; sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); + memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE); mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(fs_info); @@ -3083,6 +2920,9 @@ retry_root_backup: if (ret) goto fail_trans_kthread; + if (btrfs_build_ref_tree(fs_info)) + btrfs_err(fs_info, "couldn't build ref tree"); + /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && !btrfs_test_opt(fs_info, NOLOGREPLAY)) { @@ -3391,6 +3231,7 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; + int op_flags; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; @@ -3433,13 +3274,10 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - if (i == 0) { - ret = btrfsic_submit_bh(REQ_OP_WRITE, - REQ_SYNC | REQ_FUA | REQ_META | REQ_PRIO, bh); - } else { - ret = btrfsic_submit_bh(REQ_OP_WRITE, - REQ_SYNC | REQ_META | REQ_PRIO, bh); - } + op_flags = REQ_SYNC | REQ_META | REQ_PRIO; + if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) + op_flags |= REQ_FUA; + ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh); if (ret) errors++; } @@ -3948,6 +3786,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) cleanup_srcu_struct(&fs_info->subvol_srcu); btrfs_free_stripe_hash_table(fs_info); + btrfs_free_ref_cache(fs_info); __btrfs_free_block_rsv(root->orphan_block_rsv); root->orphan_block_rsv = NULL; @@ -4007,7 +3846,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) buf->len, fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { + /* + * Since btrfs_mark_buffer_dirty() can be called with item pointer set + * but item data not updated. + * So here we should only check item pointers, not item data. + */ + if (btrfs_header_level(buf) == 0 && + btrfs_check_leaf_relaxed(root, buf)) { btrfs_print_leaf(buf); ASSERT(0); } @@ -4272,26 +4117,28 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((node = rb_first(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_node *tmp; + struct rb_node *n; bool pin_bytes = false; head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); spin_lock(&delayed_refs->lock); continue; } spin_lock(&head->lock); - list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, - list) { + while ((n = rb_first(&head->ref_tree)) != NULL) { + ref = rb_entry(n, struct btrfs_delayed_ref_node, + ref_node); ref->in_tree = 0; - list_del(&ref->list); + rb_erase(&ref->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); atomic_dec(&delayed_refs->num_entries); @@ -4304,16 +4151,16 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (head->processing == 0) delayed_refs->num_heads_ready--; atomic_dec(&delayed_refs->num_entries); - head->node.in_tree = 0; rb_erase(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); if (pin_bytes) - btrfs_pin_extent(fs_info, head->node.bytenr, - head->node.num_bytes, 1); - btrfs_put_delayed_ref(&head->node); + btrfs_pin_extent(fs_info, head->bytenr, + head->num_bytes, 1); + btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e2d7e86b51d1..2f4328511ac8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -26,6 +26,7 @@ #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/percpu_counter.h> +#include <linux/lockdep.h> #include "hash.h" #include "tree-log.h" #include "disk-io.h" @@ -38,6 +39,7 @@ #include "math.h" #include "sysfs.h" #include "qgroup.h" +#include "ref-verify.h" #undef SCRAMBLE_DELAYED_REFS @@ -61,9 +63,6 @@ enum { CHUNK_ALLOC_FORCE = 2, }; -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *node, u64 parent, @@ -91,17 +90,8 @@ static int find_next_key(struct btrfs_path *path, int level, static void dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 ram_bytes, u64 num_bytes, int delalloc); -static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk); static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes); @@ -652,7 +642,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_FAST; spin_unlock(&cache->lock); - if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { + if (btrfs_test_opt(fs_info, SPACE_CACHE)) { mutex_lock(&caching_ctl->mutex); ret = load_free_space_cache(fs_info, cache); @@ -923,7 +913,7 @@ search_again: head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -934,7 +924,7 @@ search_again: */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); goto search_again; } spin_lock(&head->lock); @@ -943,7 +933,7 @@ search_again: else BUG_ON(num_refs == 0); - num_refs += head->node.ref_mod; + num_refs += head->ref_mod; spin_unlock(&head->lock); mutex_unlock(&head->mutex); } @@ -2189,16 +2179,20 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, /* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + struct btrfs_fs_info *fs_info = root->fs_info; int old_ref_mod, new_ref_mod; int ret; BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && root_objectid == BTRFS_TREE_LOG_OBJECTID); + btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid, + owner, offset, BTRFS_ADD_DELAYED_REF); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, @@ -2344,7 +2338,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_key key; @@ -2366,14 +2360,14 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - key.objectid = node->bytenr; + key.objectid = head->bytenr; if (metadata) { key.type = BTRFS_METADATA_ITEM_KEY; key.offset = extent_op->level; } else { key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = node->num_bytes; + key.offset = head->num_bytes; } again: @@ -2390,17 +2384,17 @@ again: path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == node->bytenr && + if (key.objectid == head->bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == node->num_bytes) + key.offset == head->num_bytes) ret = 0; } if (ret > 0) { btrfs_release_path(path); metadata = 0; - key.objectid = node->bytenr; - key.offset = node->num_bytes; + key.objectid = head->bytenr; + key.offset = head->num_bytes; key.type = BTRFS_EXTENT_ITEM_KEY; goto again; } @@ -2507,44 +2501,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return 0; } - if (btrfs_delayed_ref_is_head(node)) { - struct btrfs_delayed_ref_head *head; - /* - * we've hit the end of the chain and we were supposed - * to insert this extent into the tree. But, it got - * deleted before we ever needed to insert it, so all - * we have to do is clean up the accounting - */ - BUG_ON(extent_op); - head = btrfs_delayed_node_to_head(node); - trace_run_delayed_ref_head(fs_info, node, head, node->action); - - if (head->total_ref_mod < 0) { - struct btrfs_block_group_cache *cache; - - cache = btrfs_lookup_block_group(fs_info, node->bytenr); - ASSERT(cache); - percpu_counter_add(&cache->space_info->total_bytes_pinned, - -node->num_bytes); - btrfs_put_block_group(cache); - } - - if (insert_reserved) { - btrfs_pin_extent(fs_info, node->bytenr, - node->num_bytes, 1); - if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info, - node->bytenr, - node->num_bytes); - } - } - - /* Also free its reserved qgroup space */ - btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, - head->qgroup_reserved); - return ret; - } - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) ret = run_delayed_tree_ref(trans, fs_info, node, extent_op, @@ -2563,7 +2519,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; - if (list_empty(&head->ref_list)) + if (RB_EMPTY_ROOT(&head->ref_tree)) return NULL; /* @@ -2576,12 +2532,114 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) return list_first_entry(&head->ref_add_list, struct btrfs_delayed_ref_node, add_list); - ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, - list); + ref = rb_entry(rb_first(&head->ref_tree), + struct btrfs_delayed_ref_node, ref_node); ASSERT(list_empty(&ref->add_list)); return ref; } +static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + spin_lock(&delayed_refs->lock); + head->processing = 0; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(head); +} + +static int cleanup_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_extent_op *extent_op = head->extent_op; + int ret; + + if (!extent_op) + return 0; + head->extent_op = NULL; + if (head->must_insert_reserved) { + btrfs_free_delayed_extent_op(extent_op); + return 0; + } + spin_unlock(&head->lock); + ret = run_delayed_extent_op(trans, fs_info, head, extent_op); + btrfs_free_delayed_extent_op(extent_op); + return ret ? ret : 1; +} + +static int cleanup_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + delayed_refs = &trans->transaction->delayed_refs; + + ret = cleanup_extent_op(trans, fs_info, head); + if (ret < 0) { + unselect_delayed_ref_head(delayed_refs, head); + btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); + return ret; + } else if (ret) { + return ret; + } + + /* + * Need to drop our head ref lock and re-acquire the delayed ref lock + * and then re-check to make sure nobody got added. + */ + spin_unlock(&head->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&head->lock); + if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) { + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + return 1; + } + delayed_refs->num_heads--; + rb_erase(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); + spin_unlock(&delayed_refs->lock); + spin_unlock(&head->lock); + atomic_dec(&delayed_refs->num_entries); + + trace_run_delayed_ref_head(fs_info, head, 0); + + if (head->total_ref_mod < 0) { + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(fs_info, head->bytenr); + ASSERT(cache); + percpu_counter_add(&cache->space_info->total_bytes_pinned, + -head->num_bytes); + btrfs_put_block_group(cache); + + if (head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->num_bytes; + spin_unlock(&delayed_refs->lock); + } + } + + if (head->must_insert_reserved) { + btrfs_pin_extent(fs_info, head->bytenr, + head->num_bytes, 1); + if (head->is_data) { + ret = btrfs_del_csums(trans, fs_info, head->bytenr, + head->num_bytes); + } + } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, + head->qgroup_reserved); + btrfs_delayed_ref_unlock(head); + btrfs_put_delayed_ref_head(head); + return 0; +} + /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. @@ -2655,11 +2713,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (ref && ref->seq && btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { spin_unlock(&locked_ref->lock); - spin_lock(&delayed_refs->lock); - locked_ref->processing = 0; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(locked_ref); + unselect_delayed_ref_head(delayed_refs, locked_ref); locked_ref = NULL; cond_resched(); count++; @@ -2667,102 +2721,55 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } /* - * record the must insert reserved flag before we - * drop the spin lock. + * We're done processing refs in this ref_head, clean everything + * up and move on to the next ref_head. */ - must_insert_reserved = locked_ref->must_insert_reserved; - locked_ref->must_insert_reserved = 0; - - extent_op = locked_ref->extent_op; - locked_ref->extent_op = NULL; - if (!ref) { - - - /* All delayed refs have been processed, Go ahead - * and send the head node to run_one_delayed_ref, - * so that any accounting fixes can happen - */ - ref = &locked_ref->node; - - if (extent_op && must_insert_reserved) { - btrfs_free_delayed_extent_op(extent_op); - extent_op = NULL; - } - - if (extent_op) { - spin_unlock(&locked_ref->lock); - ret = run_delayed_extent_op(trans, fs_info, - ref, extent_op); - btrfs_free_delayed_extent_op(extent_op); - - if (ret) { - /* - * Need to reset must_insert_reserved if - * there was an error so the abort stuff - * can cleanup the reserved space - * properly. - */ - if (must_insert_reserved) - locked_ref->must_insert_reserved = 1; - spin_lock(&delayed_refs->lock); - locked_ref->processing = 0; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_debug(fs_info, - "run_delayed_extent_op returned %d", - ret); - btrfs_delayed_ref_unlock(locked_ref); - return ret; - } + ret = cleanup_ref_head(trans, fs_info, locked_ref); + if (ret > 0 ) { + /* We dropped our lock, we need to loop. */ + ret = 0; continue; + } else if (ret) { + return ret; } + locked_ref = NULL; + count++; + continue; + } - /* - * Need to drop our head ref lock and re-acquire the - * delayed ref lock and then re-check to make sure - * nobody got added. - */ - spin_unlock(&locked_ref->lock); - spin_lock(&delayed_refs->lock); - spin_lock(&locked_ref->lock); - if (!list_empty(&locked_ref->ref_list) || - locked_ref->extent_op) { - spin_unlock(&locked_ref->lock); - spin_unlock(&delayed_refs->lock); - continue; - } - ref->in_tree = 0; - delayed_refs->num_heads--; - rb_erase(&locked_ref->href_node, - &delayed_refs->href_root); - spin_unlock(&delayed_refs->lock); - } else { - actual_count++; - ref->in_tree = 0; - list_del(&ref->list); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); + actual_count++; + ref->in_tree = 0; + rb_erase(&ref->ref_node, &locked_ref->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); + /* + * When we play the delayed ref, also correct the ref_mod on + * head + */ + switch (ref->action) { + case BTRFS_ADD_DELAYED_REF: + case BTRFS_ADD_DELAYED_EXTENT: + locked_ref->ref_mod -= ref->ref_mod; + break; + case BTRFS_DROP_DELAYED_REF: + locked_ref->ref_mod += ref->ref_mod; + break; + default: + WARN_ON(1); } atomic_dec(&delayed_refs->num_entries); - if (!btrfs_delayed_ref_is_head(ref)) { - /* - * when we play the delayed ref, also correct the - * ref_mod on head - */ - switch (ref->action) { - case BTRFS_ADD_DELAYED_REF: - case BTRFS_ADD_DELAYED_EXTENT: - locked_ref->node.ref_mod -= ref->ref_mod; - break; - case BTRFS_DROP_DELAYED_REF: - locked_ref->node.ref_mod += ref->ref_mod; - break; - default: - WARN_ON(1); - } - } + /* + * Record the must-insert_reserved flag before we drop the spin + * lock. + */ + must_insert_reserved = locked_ref->must_insert_reserved; + locked_ref->must_insert_reserved = 0; + + extent_op = locked_ref->extent_op; + locked_ref->extent_op = NULL; spin_unlock(&locked_ref->lock); ret = run_one_delayed_ref(trans, fs_info, ref, extent_op, @@ -2770,33 +2777,13 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { - spin_lock(&delayed_refs->lock); - locked_ref->processing = 0; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(locked_ref); + unselect_delayed_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); return ret; } - /* - * If this node is a head, that means all the refs in this head - * have been dealt with, and we will pick the next head to deal - * with, so we must unlock the head and drop it from the cluster - * list before we release it. - */ - if (btrfs_delayed_ref_is_head(ref)) { - if (locked_ref->is_data && - locked_ref->total_ref_mod < 0) { - spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= ref->num_bytes; - spin_unlock(&delayed_refs->lock); - } - btrfs_delayed_ref_unlock(locked_ref); - locked_ref = NULL; - } btrfs_put_delayed_ref(ref); count++; cond_resched(); @@ -3100,33 +3087,16 @@ again: spin_unlock(&delayed_refs->lock); goto out; } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); - while (node) { - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - if (btrfs_delayed_ref_is_head(&head->node)) { - struct btrfs_delayed_ref_node *ref; - - ref = &head->node; - refcount_inc(&ref->refs); - - spin_unlock(&delayed_refs->lock); - /* - * Mutex was contended, block until it's - * released and try again - */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); + /* Mutex was contended, block until it's released and retry. */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - cond_resched(); - goto again; - } else { - WARN_ON(1); - } - node = rb_next(node); - } - spin_unlock(&delayed_refs->lock); + btrfs_put_delayed_ref_head(head); cond_resched(); goto again; } @@ -3169,6 +3139,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_transaction *cur_trans; + struct rb_node *node; int ret = 0; cur_trans = root->fs_info->running_transaction; @@ -3184,7 +3155,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -3195,13 +3166,18 @@ static noinline int check_delayed_ref(struct btrfs_root *root, */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return -EAGAIN; } spin_unlock(&delayed_refs->lock); spin_lock(&head->lock); - list_for_each_entry(ref, &head->ref_list, list) { + /* + * XXX: We should replace this with a proper search function in the + * future. + */ + for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { ret = 1; @@ -3351,7 +3327,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, - struct btrfs_fs_info *, + struct btrfs_root *, u64, u64, u64, u64, u64, u64); @@ -3391,7 +3367,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); key.offset -= btrfs_file_extent_offset(buf, fi); - ret = process_func(trans, fs_info, bytenr, num_bytes, + ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, key.offset); if (ret) @@ -3399,7 +3375,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = fs_info->nodesize; - ret = process_func(trans, fs_info, bytenr, num_bytes, + ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0); if (ret) goto fail; @@ -3526,13 +3502,6 @@ again: goto again; } - /* We've already setup this transaction, go ahead and exit */ - if (block_group->cache_generation == trans->transid && - i_size_read(inode)) { - dcs = BTRFS_DC_SETUP; - goto out_put; - } - /* * We want to set the generation to 0, that way if anything goes wrong * from here on out we know not to trust this cache when we load up next @@ -3556,6 +3525,13 @@ again: } WARN_ON(ret); + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + if (i_size_read(inode) > 0) { ret = btrfs_check_trunc_cache_free_space(fs_info, &fs_info->global_block_rsv); @@ -4016,16 +3992,9 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) btrfs_put_block_group(bg); } -static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) { - wait_on_atomic_t(&bg->nocow_writers, - btrfs_wait_nocow_writers_atomic_t, + wait_on_atomic_t(&bg->nocow_writers, atomic_t_wait, TASK_UNINTERRUPTIBLE); } @@ -4843,7 +4812,6 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, u64 orig, bool wait_ordered) { - struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; @@ -4859,8 +4827,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, to_reclaim = items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; - block_rsv = &fs_info->delalloc_block_rsv; - space_info = block_rsv->space_info; + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); @@ -4919,6 +4886,13 @@ skip_async: } } +struct reserve_ticket { + u64 bytes; + int error; + struct list_head list; + wait_queue_head_t wait; +}; + /** * maybe_commit_transaction - possibly commit the transaction if its ok to * @root - the root we're allocating for @@ -4930,18 +4904,29 @@ skip_async: * will return -ENOSPC. */ static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 bytes, int force) + struct btrfs_space_info *space_info) { + struct reserve_ticket *ticket = NULL; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; struct btrfs_trans_handle *trans; + u64 bytes; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) return -EAGAIN; - if (force) - goto commit; + spin_lock(&space_info->lock); + if (!list_empty(&space_info->priority_tickets)) + ticket = list_first_entry(&space_info->priority_tickets, + struct reserve_ticket, list); + else if (!list_empty(&space_info->tickets)) + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + bytes = (ticket) ? ticket->bytes : 0; + spin_unlock(&space_info->lock); + + if (!bytes) + return 0; /* See if there is enough pinned space to make this reservation */ if (percpu_counter_compare(&space_info->total_bytes_pinned, @@ -4956,8 +4941,12 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, return -ENOSPC; spin_lock(&delayed_rsv->lock); + if (delayed_rsv->size > bytes) + bytes = 0; + else + bytes -= delayed_rsv->size; if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes - delayed_rsv->size) < 0) { + bytes) < 0) { spin_unlock(&delayed_rsv->lock); return -ENOSPC; } @@ -4971,13 +4960,6 @@ commit: return btrfs_commit_transaction(trans); } -struct reserve_ticket { - u64 bytes; - int error; - struct list_head list; - wait_queue_head_t wait; -}; - /* * Try to flush some data based on policy set by @state. This is only advisory * and may fail for various reasons. The caller is supposed to examine the @@ -5027,8 +5009,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = 0; break; case COMMIT_TRANS: - ret = may_commit_transaction(fs_info, space_info, - num_bytes, 0); + ret = may_commit_transaction(fs_info, space_info); break; default: ret = -ENOSPC; @@ -5582,11 +5563,12 @@ again: } } -static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, +static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) { struct btrfs_space_info *space_info = block_rsv->space_info; + u64 ret; spin_lock(&block_rsv->lock); if (num_bytes == (u64)-1) @@ -5601,6 +5583,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } spin_unlock(&block_rsv->lock); + ret = num_bytes; if (num_bytes > 0) { if (dest) { spin_lock(&dest->lock); @@ -5620,6 +5603,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, space_info_add_old_bytes(fs_info, space_info, num_bytes); } + return ret; } int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, @@ -5643,6 +5627,15 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) rsv->type = type; } +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + unsigned short type) +{ + btrfs_init_block_rsv(rsv, type); + rsv->space_info = __find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); +} + struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type) { @@ -5652,9 +5645,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, if (!block_rsv) return NULL; - btrfs_init_block_rsv(block_rsv, type); - block_rsv->space_info = __find_space_info(fs_info, - BTRFS_BLOCK_GROUP_METADATA); + btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); return block_rsv; } @@ -5737,6 +5728,66 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } +/** + * btrfs_inode_rsv_refill - refill the inode block rsv. + * @inode - the inode we are refilling. + * @flush - the flusing restriction. + * + * Essentially the same as btrfs_block_rsv_refill, except it uses the + * block_rsv->size as the minimum size. We'll either refill the missing amount + * or return if we already have enough space. This will also handle the resreve + * tracepoint for the reserved amount. + */ +int btrfs_inode_rsv_refill(struct btrfs_inode *inode, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_root *root = inode->root; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 num_bytes = 0; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) + num_bytes = block_rsv->size - block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (num_bytes == 0) + return 0; + + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), num_bytes, 1); + } + return ret; +} + +/** + * btrfs_inode_rsv_release - release any excessive reservation. + * @inode - the inode we need to release from. + * + * This is the same as btrfs_block_rsv_release, except that it handles the + * tracepoint for the reservation. + */ +void btrfs_inode_rsv_release(struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 released = 0; + + /* + * Since we statically set the block_rsv->size we just want to say we + * are releasing 0 bytes, and then we'll just get the reservation over + * the size free'd. + */ + released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delalloc", + btrfs_ino(inode), released, 0); +} + void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes) @@ -5808,7 +5859,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); fs_info->global_block_rsv.space_info = space_info; - fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; fs_info->delayed_block_rsv.space_info = space_info; @@ -5828,8 +5878,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, (u64)-1); - WARN_ON(fs_info->delalloc_block_rsv.size > 0); - WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); WARN_ON(fs_info->trans_block_rsv.size > 0); WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); @@ -5841,12 +5889,15 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { - if (!trans->block_rsv) + if (!trans->block_rsv) { + ASSERT(!trans->bytes_reserved); return; + } if (!trans->bytes_reserved) return; + ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, trans->block_rsv, @@ -5968,104 +6019,37 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, btrfs_block_rsv_release(fs_info, rsv, (u64)-1); } -/** - * drop_outstanding_extent - drop an outstanding extent - * @inode: the inode we're dropping the extent for - * @num_bytes: the number of bytes we're releasing. - * - * This is called when we are freeing up an outstanding extent, either called - * after an error or after an extent is written. This will return the number of - * reserved extents that need to be freed. This must be called with - * BTRFS_I(inode)->lock held. - */ -static unsigned drop_outstanding_extent(struct btrfs_inode *inode, - u64 num_bytes) +static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) { - unsigned drop_inode_space = 0; - unsigned dropped_extents = 0; - unsigned num_extents; - - num_extents = count_max_extents(num_bytes); - ASSERT(num_extents); - ASSERT(inode->outstanding_extents >= num_extents); - inode->outstanding_extents -= num_extents; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 reserve_size = 0; + u64 csum_leaves; + unsigned outstanding_extents; - if (inode->outstanding_extents == 0 && - test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &inode->runtime_flags)) - drop_inode_space = 1; + lockdep_assert_held(&inode->lock); + outstanding_extents = inode->outstanding_extents; + if (outstanding_extents) + reserve_size = btrfs_calc_trans_metadata_size(fs_info, + outstanding_extents + 1); + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, + inode->csum_bytes); + reserve_size += btrfs_calc_trans_metadata_size(fs_info, + csum_leaves); - /* - * If we have more or the same amount of outstanding extents than we have - * reserved then we need to leave the reserved extents count alone. - */ - if (inode->outstanding_extents >= inode->reserved_extents) - return drop_inode_space; - - dropped_extents = inode->reserved_extents - inode->outstanding_extents; - inode->reserved_extents -= dropped_extents; - return dropped_extents + drop_inode_space; -} - -/** - * calc_csum_metadata_size - return the amount of metadata space that must be - * reserved/freed for the given bytes. - * @inode: the inode we're manipulating - * @num_bytes: the number of bytes in question - * @reserve: 1 if we are reserving space, 0 if we are freeing space - * - * This adjusts the number of csum_bytes in the inode and then returns the - * correct amount of metadata that must either be reserved or freed. We - * calculate how many checksums we can fit into one leaf and then divide the - * number of bytes that will need to be checksumed by this value to figure out - * how many checksums will be required. If we are adding bytes then the number - * may go up and we will return the number of additional bytes that must be - * reserved. If it is going down we will return the number of bytes that must - * be freed. - * - * This must be called with BTRFS_I(inode)->lock held. - */ -static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes, - int reserve) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - u64 old_csums, num_csums; - - if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0) - return 0; - - old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); - if (reserve) - inode->csum_bytes += num_bytes; - else - inode->csum_bytes -= num_bytes; - num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); - - /* No change, no need to reserve more */ - if (old_csums == num_csums) - return 0; - - if (reserve) - return btrfs_calc_trans_metadata_size(fs_info, - num_csums - old_csums); - - return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums); + spin_lock(&block_rsv->lock); + block_rsv->size = reserve_size; + spin_unlock(&block_rsv->lock); } int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); struct btrfs_root *root = inode->root; - struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv; - u64 to_reserve = 0; - u64 csum_bytes; unsigned nr_extents; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; - u64 to_free = 0; - unsigned dropped; - bool release_extra = false; /* If we are a free space inode we need to not flush since we will be in * the middle of a transaction commit. We also don't need the delalloc @@ -6091,19 +6075,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + /* Add our new extents and calculate the new rsv size. */ spin_lock(&inode->lock); nr_extents = count_max_extents(num_bytes); - inode->outstanding_extents += nr_extents; - - nr_extents = 0; - if (inode->outstanding_extents > inode->reserved_extents) - nr_extents += inode->outstanding_extents - - inode->reserved_extents; - - /* We always want to reserve a slot for updating the inode. */ - to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1); - to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); - csum_bytes = inode->csum_bytes; + btrfs_mod_outstanding_extents(inode, nr_extents); + inode->csum_bytes += num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { @@ -6113,92 +6090,26 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) goto out_fail; } - ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); + ret = btrfs_inode_rsv_refill(inode, flush); if (unlikely(ret)) { btrfs_qgroup_free_meta(root, nr_extents * fs_info->nodesize); goto out_fail; } - spin_lock(&inode->lock); - if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &inode->runtime_flags)) { - to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1); - release_extra = true; - } - inode->reserved_extents += nr_extents; - spin_unlock(&inode->lock); - if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); - - if (to_reserve) - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), to_reserve, 1); - if (release_extra) - btrfs_block_rsv_release(fs_info, block_rsv, - btrfs_calc_trans_metadata_size(fs_info, 1)); return 0; out_fail: spin_lock(&inode->lock); - dropped = drop_outstanding_extent(inode, num_bytes); - /* - * If the inodes csum_bytes is the same as the original - * csum_bytes then we know we haven't raced with any free()ers - * so we can just reduce our inodes csum bytes and carry on. - */ - if (inode->csum_bytes == csum_bytes) { - calc_csum_metadata_size(inode, num_bytes, 0); - } else { - u64 orig_csum_bytes = inode->csum_bytes; - u64 bytes; - - /* - * This is tricky, but first we need to figure out how much we - * freed from any free-ers that occurred during this - * reservation, so we reset ->csum_bytes to the csum_bytes - * before we dropped our lock, and then call the free for the - * number of bytes that were freed while we were trying our - * reservation. - */ - bytes = csum_bytes - inode->csum_bytes; - inode->csum_bytes = csum_bytes; - to_free = calc_csum_metadata_size(inode, bytes, 0); - - - /* - * Now we need to see how much we would have freed had we not - * been making this reservation and our ->csum_bytes were not - * artificially inflated. - */ - inode->csum_bytes = csum_bytes - num_bytes; - bytes = csum_bytes - orig_csum_bytes; - bytes = calc_csum_metadata_size(inode, bytes, 0); - - /* - * Now reset ->csum_bytes to what it should be. If bytes is - * more than to_free then we would have freed more space had we - * not had an artificially high ->csum_bytes, so we need to free - * the remainder. If bytes is the same or less then we don't - * need to do anything, the other free-ers did the correct - * thing. - */ - inode->csum_bytes = orig_csum_bytes - num_bytes; - if (bytes > to_free) - to_free = bytes - to_free; - else - to_free = 0; - } + nr_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, -nr_extents); + inode->csum_bytes -= num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - if (dropped) - to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); - if (to_free) { - btrfs_block_rsv_release(fs_info, block_rsv, to_free); - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), to_free, 0); - } + btrfs_inode_rsv_release(inode); if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); return ret; @@ -6206,36 +6117,55 @@ out_fail: /** * btrfs_delalloc_release_metadata - release a metadata reservation for an inode - * @inode: the inode to release the reservation for - * @num_bytes: the number of bytes we're releasing + * @inode: the inode to release the reservation for. + * @num_bytes: the number of bytes we are releasing. * * This will release the metadata reservation for an inode. This can be called * once we complete IO for a given set of bytes to release their metadata - * reservations. + * reservations, or on error for the same reason. */ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - u64 to_free = 0; - unsigned dropped; num_bytes = ALIGN(num_bytes, fs_info->sectorsize); spin_lock(&inode->lock); - dropped = drop_outstanding_extent(inode, num_bytes); - - if (num_bytes) - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + inode->csum_bytes -= num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - if (dropped > 0) - to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); if (btrfs_is_testing(fs_info)) return; - trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), - to_free, 0); + btrfs_inode_rsv_release(inode); +} - btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free); +/** + * btrfs_delalloc_release_extents - release our outstanding_extents + * @inode: the inode to balance the reservation for. + * @num_bytes: the number of bytes we originally reserved with + * + * When we reserve space we increase outstanding_extents for the extents we may + * add. Once we've set the range as delalloc or created our ordered extents we + * have outstanding_extents to track the real usage, so we use this to free our + * temporarily tracked outstanding_extents. This _must_ be used in conjunction + * with btrfs_delalloc_reserve_metadata. + */ +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + unsigned num_extents; + + spin_lock(&inode->lock); + num_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, -num_extents); + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode); } /** @@ -6282,10 +6212,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, * @inode: inode we're releasing space for * @start: start position of the space already reserved * @len: the len of the space already reserved - * - * This must be matched with a call to btrfs_delalloc_reserve_space. This is - * called in the case that we don't need the metadata AND data reservations - * anymore. So if there is an error or we insert an inline extent. + * @release_bytes: the len of the space we consumed or didn't use * * This function will release the metadata space that was not used and will * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes @@ -6293,7 +6220,8 @@ int btrfs_delalloc_reserve_space(struct inode *inode, * Also it will handle the qgroup reserved space. */ void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) + struct extent_changeset *reserved, + u64 start, u64 len) { btrfs_delalloc_release_metadata(BTRFS_I(inode), len); btrfs_free_reserved_data_space(inode, reserved, start, len); @@ -6595,12 +6523,6 @@ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, btrfs_put_block_group(bg); } -static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) { struct btrfs_space_info *space_info = bg->space_info; @@ -6623,8 +6545,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) down_write(&space_info->groups_sem); up_write(&space_info->groups_sem); - wait_on_atomic_t(&bg->reservations, - btrfs_wait_bg_reservations_atomic_t, + wait_on_atomic_t(&bg->reservations, atomic_t_wait, TASK_UNINTERRUPTIBLE); } @@ -6958,7 +6879,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(!is_data && refs_to_drop != 1); if (is_data) - skinny_metadata = 0; + skinny_metadata = false; ret = lookup_extent_backref(trans, info, path, &iref, bytenr, num_bytes, parent, @@ -7213,7 +7134,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out_delayed_unlock; spin_lock(&head->lock); - if (!list_empty(&head->ref_list)) + if (!RB_EMPTY_ROOT(&head->ref_tree)) goto out; if (head->extent_op) { @@ -7234,9 +7155,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, * at this point we have a head with no other entries. Go * ahead and process it. */ - head->node.in_tree = 0; rb_erase(&head->href_node, &delayed_refs->href_root); - + RB_CLEAR_NODE(&head->href_node); atomic_dec(&delayed_refs->num_entries); /* @@ -7255,7 +7175,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, ret = 1; mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return ret; out: spin_unlock(&head->lock); @@ -7277,6 +7197,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { int old_ref_mod, new_ref_mod; + btrfs_ref_tree_mod(root, buf->start, buf->len, parent, + root->root_key.objectid, + btrfs_header_level(buf), 0, + BTRFS_DROP_DELAYED_REF); ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start, buf->len, parent, root->root_key.objectid, @@ -7329,16 +7253,21 @@ out: /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + struct btrfs_fs_info *fs_info = root->fs_info; int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) return 0; + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) + btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, + root_objectid, owner, offset, + BTRFS_DROP_DELAYED_REF); /* * tree log blocks never actually go into the extent allocation @@ -8306,17 +8235,22 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - u64 root_objectid, u64 owner, + struct btrfs_root *root, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { - struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_fs_info *fs_info = root->fs_info; int ret; - BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + + btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0, + root->root_key.objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT); ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, - ins->offset, 0, root_objectid, owner, + ins->offset, 0, + root->root_key.objectid, owner, offset, ram_bytes, BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); return ret; @@ -8538,6 +8472,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->is_data = false; extent_op->level = level; + btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent, + root_objectid, level, 0, + BTRFS_ADD_DELAYED_EXTENT); ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid, ins.offset, parent, root_objectid, level, @@ -8894,7 +8831,7 @@ skip: ret); } } - ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize, + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, root->root_key.objectid, level - 1, 0); if (ret) @@ -9269,6 +9206,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_del_root(trans, fs_info, &root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); + err = ret; goto out_end_trans; } @@ -9311,7 +9249,7 @@ out: * don't have it in the radix (like when we recover after a power fail * or unmount) so we don't leak memory. */ - if (!for_reloc && root_dropped == false) + if (!for_reloc && !root_dropped) btrfs_add_dead_root(root); if (err && err != -EAGAIN) btrfs_handle_fs_error(fs_info, err, NULL); @@ -9968,9 +9906,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) return 0; } -static void __link_block_group(struct btrfs_space_info *space_info, - struct btrfs_block_group_cache *cache) +static void link_block_group(struct btrfs_block_group_cache *cache) { + struct btrfs_space_info *space_info = cache->space_info; int index = get_block_group_index(cache); bool first = false; @@ -10178,7 +10116,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) cache->space_info = space_info; - __link_block_group(space_info, cache); + link_block_group(cache); set_avail_alloc_bits(info, cache->flags); if (btrfs_chunk_readonly(info, cache->key.objectid)) { @@ -10337,7 +10275,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->bytes_super, &cache->space_info); update_global_block_rsv(fs_info); - __link_block_group(cache->space_info, cache); + link_block_group(cache); list_add_tail(&cache->bg_list, &trans->new_bgs); @@ -10387,6 +10325,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * remove it. */ free_excluded_extents(fs_info, block_group); + btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, + block_group->key.offset); memcpy(&key, &block_group->key, sizeof(key)); index = get_block_group_index(block_group); @@ -11106,12 +11046,6 @@ int btrfs_start_write_no_snapshotting(struct btrfs_root *root) return 1; } -static int wait_snapshotting_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) { while (true) { @@ -11120,8 +11054,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) ret = btrfs_start_write_no_snapshotting(root); if (ret) break; - wait_on_atomic_t(&root->will_be_snapshotted, - wait_snapshotting_atomic_t, + wait_on_atomic_t(&root->will_be_snapshotted, atomic_t_wait, TASK_UNINTERRUPTIBLE); } } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7fa50e12f18e..012d63870b99 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -110,7 +110,6 @@ struct extent_page_data { struct bio *bio; struct extent_io_tree *tree; get_extent_t *get_extent; - unsigned long bio_flags; /* tells writepage not to lock the state bits for this range * it still does the unlocking @@ -1985,7 +1984,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, struct btrfs_bio *bbio = NULL; int ret; - ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); + ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); bio = btrfs_io_bio_alloc(1); @@ -2762,8 +2761,8 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page, */ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, struct writeback_control *wbc, - struct page *page, sector_t sector, - size_t size, unsigned long offset, + struct page *page, u64 offset, + size_t size, unsigned long pg_offset, struct block_device *bdev, struct bio **bio_ret, bio_end_io_t end_io_func, @@ -2777,6 +2776,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, int contig = 0; int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; size_t page_size = min_t(size_t, size, PAGE_SIZE); + sector_t sector = offset >> 9; if (bio_ret && *bio_ret) { bio = *bio_ret; @@ -2787,8 +2787,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, if (prev_bio_flags != bio_flags || !contig || force_bio_submit || - merge_bio(tree, page, offset, page_size, bio, bio_flags) || - bio_add_page(bio, page, page_size, offset) < page_size) { + merge_bio(tree, page, pg_offset, page_size, bio, bio_flags) || + bio_add_page(bio, page, page_size, pg_offset) < page_size) { ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; @@ -2802,8 +2802,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, } } - bio = btrfs_bio_alloc(bdev, (u64)sector << 9); - bio_add_page(bio, page, page_size, offset); + bio = btrfs_bio_alloc(bdev, offset); + bio_add_page(bio, page, page_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; bio->bi_write_hint = page->mapping->host->i_write_hint; @@ -2893,7 +2893,6 @@ static int __do_readpage(struct extent_io_tree *tree, u64 last_byte = i_size_read(inode); u64 block_start; u64 cur_end; - sector_t sector; struct extent_map *em; struct block_device *bdev; int ret = 0; @@ -2929,6 +2928,7 @@ static int __do_readpage(struct extent_io_tree *tree, } while (cur <= end) { bool force_bio_submit = false; + u64 offset; if (cur >= last_byte) { char *userpage; @@ -2968,9 +2968,9 @@ static int __do_readpage(struct extent_io_tree *tree, iosize = ALIGN(iosize, blocksize); if (this_bio_flag & EXTENT_BIO_COMPRESSED) { disk_io_size = em->block_len; - sector = em->block_start >> 9; + offset = em->block_start; } else { - sector = (em->block_start + extent_offset) >> 9; + offset = em->block_start + extent_offset; disk_io_size = iosize; } bdev = em->bdev; @@ -3063,8 +3063,8 @@ static int __do_readpage(struct extent_io_tree *tree, } ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, - page, sector, disk_io_size, pg_offset, - bdev, bio, + page, offset, disk_io_size, + pg_offset, bdev, bio, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag, @@ -3253,7 +3253,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, delalloc_start, delalloc_end, &page_started, - nr_written); + nr_written, wbc); /* File system has been set read-only */ if (ret) { SetPageError(page); @@ -3325,7 +3325,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, u64 extent_offset; u64 block_start; u64 iosize; - sector_t sector; struct extent_map *em; struct block_device *bdev; size_t pg_offset = 0; @@ -3368,6 +3367,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, while (cur <= end) { u64 em_end; + u64 offset; if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) @@ -3389,7 +3389,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, BUG_ON(end < cur); iosize = min(em_end - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); - sector = (em->block_start + extent_offset) >> 9; + offset = em->block_start + extent_offset; bdev = em->bdev; block_start = em->block_start; compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -3432,7 +3432,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, } ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, - page, sector, iosize, pg_offset, + page, offset, iosize, pg_offset, bdev, &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); @@ -3716,7 +3716,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, u64 offset = eb->start; u32 nritems; unsigned long i, num_pages; - unsigned long bio_flags = 0; unsigned long start, end; unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; int ret = 0; @@ -3724,8 +3723,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); atomic_set(&eb->io_pages, num_pages); - if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) - bio_flags = EXTENT_BIO_TREE_LOG; /* set btree blocks beyond nritems with 0 to avoid stale content. */ nritems = btrfs_header_nritems(eb); @@ -3749,11 +3746,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, - p, offset >> 9, PAGE_SIZE, 0, bdev, + p, offset, PAGE_SIZE, 0, bdev, &epd->bio, end_bio_extent_buffer_writepage, - 0, epd->bio_flags, bio_flags, false); - epd->bio_flags = bio_flags; + 0, 0, 0, false); if (ret) { set_btree_ioerr(p); if (PageWriteback(p)) @@ -3790,7 +3786,6 @@ int btree_write_cache_pages(struct address_space *mapping, .tree = tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, - .bio_flags = 0, }; int ret = 0; int done = 0; @@ -3802,7 +3797,7 @@ int btree_write_cache_pages(struct address_space *mapping, int scanned = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -3819,8 +3814,8 @@ retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag))) { unsigned i; scanned = 1; @@ -3830,11 +3825,6 @@ retry: if (!PagePrivate(page)) continue; - if (!wbc->range_cyclic && page->index > end) { - done = 1; - break; - } - spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { spin_unlock(&mapping->private_lock); @@ -3946,7 +3936,7 @@ static int extent_write_cache_pages(struct address_space *mapping, if (!igrab(inode)) return 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -3966,8 +3956,8 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, + &index, end, tag))) { unsigned i; scanned = 1; @@ -3992,12 +3982,6 @@ retry: continue; } - if (!wbc->range_cyclic && page->index > end) { - done = 1; - unlock_page(page); - continue; - } - if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) flush_fn(data); @@ -4063,7 +4047,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd) if (epd->bio) { int ret; - ret = submit_one_bio(epd->bio, 0, epd->bio_flags); + ret = submit_one_bio(epd->bio, 0, 0); BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } @@ -4086,7 +4070,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, - .bio_flags = 0, }; ret = __extent_writepage(page, wbc, &epd); @@ -4111,7 +4094,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .get_extent = get_extent, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, - .bio_flags = 0, }; struct writeback_control wbc_writepages = { .sync_mode = mode, @@ -4151,7 +4133,6 @@ int extent_writepages(struct extent_io_tree *tree, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, - .bio_flags = 0, }; ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index e5535bbe6953..93dcae0c3183 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -34,7 +34,6 @@ * type for this bio */ #define EXTENT_BIO_COMPRESSED 1 -#define EXTENT_BIO_TREE_LOG 2 #define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ @@ -117,7 +116,8 @@ struct extent_io_ops { */ int (*fill_delalloc)(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written); + unsigned long *nr_written, + struct writeback_control *wbc); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, @@ -366,10 +366,11 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state); static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) + u64 end, unsigned int extra_bits, + struct extent_state **cached_state) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE, + EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, NULL, cached_state, GFP_NOFS); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index aafcc785f840..eb1bac7c8553 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -477,6 +477,47 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) } } +static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, + const u64 start, + const u64 len, + struct extent_state **cached_state) +{ + u64 search_start = start; + const u64 end = start + len - 1; + + while (search_start < end) { + const u64 search_len = end - search_start + 1; + struct extent_map *em; + u64 em_len; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, search_start, + search_len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start != EXTENT_MAP_HOLE) + goto next; + + em_len = em->len; + if (em->start < search_start) + em_len -= search_start - em->start; + if (em_len > search_len) + em_len = search_len; + + ret = set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, + NULL, cached_state, GFP_NOFS); +next: + search_start = extent_map_end(em); + free_extent_map(em); + if (ret) + return ret; + } + return 0; +} + /* * after copy_from_user, pages need to be dirtied and we need to make * sure holes are created between the current EOF and the start of @@ -497,14 +538,34 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, u64 end_of_last_block; u64 end_pos = pos + write_bytes; loff_t isize = i_size_read(inode); + unsigned int extra_bits = 0; start_pos = pos & ~((u64) fs_info->sectorsize - 1); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); end_of_last_block = start_pos + num_bytes - 1; + + if (!btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (start_pos >= isize && + !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) { + /* + * There can't be any extents following eof in this case + * so just set the delalloc new bit for the range + * directly. + */ + extra_bits |= EXTENT_DELALLOC_NEW; + } else { + err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode), + start_pos, + num_bytes, cached); + if (err) + return err; + } + } + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, - cached, 0); + extra_bits, cached, 0); if (err) return err; @@ -856,7 +917,7 @@ next_slot: btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) { - ret = btrfs_inc_extent_ref(trans, fs_info, + ret = btrfs_inc_extent_ref(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, @@ -940,7 +1001,7 @@ delete_extent_item: extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { - ret = btrfs_free_extent(trans, fs_info, + ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - @@ -1234,7 +1295,7 @@ again: extent_end - split); btrfs_mark_buffer_dirty(leaf); - ret = btrfs_inc_extent_ref(trans, fs_info, bytenr, num_bytes, + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset); if (ret) { @@ -1268,7 +1329,7 @@ again: extent_end = other_end; del_slot = path->slots[0] + 1; del_nr++; - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset); if (ret) { @@ -1288,7 +1349,7 @@ again: key.offset = other_start; del_slot = path->slots[0]; del_nr++; - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset); if (ret) { @@ -1404,47 +1465,6 @@ fail: } -static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, - const u64 start, - const u64 len, - struct extent_state **cached_state) -{ - u64 search_start = start; - const u64 end = start + len - 1; - - while (search_start < end) { - const u64 search_len = end - search_start + 1; - struct extent_map *em; - u64 em_len; - int ret = 0; - - em = btrfs_get_extent(inode, NULL, 0, search_start, - search_len, 0); - if (IS_ERR(em)) - return PTR_ERR(em); - - if (em->block_start != EXTENT_MAP_HOLE) - goto next; - - em_len = em->len; - if (em->start < search_start) - em_len -= search_start - em->start; - if (em_len > search_len) - em_len = search_len; - - ret = set_extent_bit(&inode->io_tree, search_start, - search_start + em_len - 1, - EXTENT_DELALLOC_NEW, - NULL, cached_state, GFP_NOFS); -next: - search_start = extent_map_end(em); - free_extent_map(em); - if (ret) - return ret; - } - return 0; -} - /* * This function locks the extent and properly waits for data=ordered extents * to finish before allowing the pages to be modified if need. @@ -1473,10 +1493,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, + round_up(pos + write_bytes - start_pos, fs_info->sectorsize) - 1; - if (start_pos < inode->vfs_inode.i_size || - (inode->flags & BTRFS_INODE_PREALLOC)) { + if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; - unsigned int clear_bits; lock_extent_bits(&inode->io_tree, start_pos, last_pos, cached_state); @@ -1498,19 +1516,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } if (ordered) btrfs_put_ordered_extent(ordered); - ret = btrfs_find_new_delalloc_bytes(inode, start_pos, - last_pos - start_pos + 1, - cached_state); - clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG; - if (ret) - clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED; - clear_extent_bit(&inode->io_tree, start_pos, - last_pos, clear_bits, - (clear_bits & EXTENT_LOCKED) ? 1 : 0, - 0, cached_state, GFP_NOFS); - if (ret) - return ret; + clear_extent_bit(&inode->io_tree, start_pos, last_pos, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, cached_state, GFP_NOFS); *lockstart = start_pos; *lockend = last_pos; ret = 1; @@ -1590,7 +1599,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, int ret = 0; bool only_release_metadata = false; bool force_page_uptodate = false; - bool need_unlock; nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), PAGE_SIZE / (sizeof(struct page *))); @@ -1613,6 +1621,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, size_t copied; size_t dirty_sectors; size_t num_sectors; + int extents_locked; WARN_ON(num_pages > nrptrs); @@ -1656,6 +1665,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } } + WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), reserve_bytes); if (ret) { @@ -1669,7 +1679,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } release_bytes = reserve_bytes; - need_unlock = false; again: /* * This is going to setup the pages array with the number of @@ -1679,19 +1688,23 @@ again: ret = prepare_pages(inode, pages, num_pages, pos, write_bytes, force_page_uptodate); - if (ret) + if (ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), + reserve_bytes); break; + } - ret = lock_and_cleanup_extent_if_need(BTRFS_I(inode), pages, + extents_locked = lock_and_cleanup_extent_if_need( + BTRFS_I(inode), pages, num_pages, pos, write_bytes, &lockstart, &lockend, &cached_state); - if (ret < 0) { - if (ret == -EAGAIN) + if (extents_locked < 0) { + if (extents_locked == -EAGAIN) goto again; + btrfs_delalloc_release_extents(BTRFS_I(inode), + reserve_bytes); + ret = extents_locked; break; - } else if (ret > 0) { - need_unlock = true; - ret = 0; } copied = btrfs_copy_from_user(pos, write_bytes, pages, i); @@ -1718,23 +1731,10 @@ again: PAGE_SIZE); } - /* - * If we had a short copy we need to release the excess delaloc - * bytes we reserved. We need to increment outstanding_extents - * because btrfs_delalloc_release_space and - * btrfs_delalloc_release_metadata will decrement it, but - * we still have an outstanding extent for the chunk we actually - * managed to copy. - */ if (num_sectors > dirty_sectors) { /* release everything except the sectors we dirtied */ release_bytes -= dirty_sectors << fs_info->sb->s_blocksize_bits; - if (copied > 0) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - } if (only_release_metadata) { btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes); @@ -1756,10 +1756,11 @@ again: if (copied > 0) ret = btrfs_dirty_pages(inode, pages, dirty_pages, pos, copied, NULL); - if (need_unlock) + if (extents_locked) unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { btrfs_drop_pages(pages, num_pages); break; @@ -2046,7 +2047,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; - bool full_sync = 0; + bool full_sync = false; u64 len; /* @@ -2056,6 +2057,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) len = (u64)end - (u64)start + 1; trace_btrfs_sync_file(file, datasync); + btrfs_init_log_ctx(&ctx, inode); + /* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by @@ -2202,8 +2205,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - btrfs_init_log_ctx(&ctx, inode); - ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ @@ -2261,6 +2262,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_end_transaction(trans); } out: + ASSERT(list_empty(&ctx.list)); err = file_check_and_advance_wb_err(file); if (!ret) ret = err; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cdc9f4015ec3..4426d1c73e50 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1264,7 +1264,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Lock all pages first so we can lock the extent safely. */ ret = io_ctl_prepare_pages(io_ctl, inode, 0); if (ret) - goto out; + goto out_unlock; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state); @@ -1358,6 +1358,7 @@ out_nospc_locked: out_nospc: cleanup_write_cache_enospc(inode, io_ctl, &cached_state); +out_unlock: if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 684f12247db7..fe5e0324dca9 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1286,12 +1286,8 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { - u64 start, end; int ret; - start = block_group->key.objectid; - end = block_group->key.objectid + block_group->key.offset; - block_group->needs_free_space = 0; ret = add_new_free_space_info(trans, fs_info, block_group, path); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d02019747d00..022b19336fee 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -500,11 +500,12 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); goto out_put; } ret = btrfs_write_out_ino_cache(root, trans, path, inode); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); out_put: iput(inode); out_release: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d94e3f68b9b1..e1a7f3cb5be9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -42,6 +42,7 @@ #include <linux/blkdev.h> #include <linux/posix_acl_xattr.h> #include <linux/uio.h> +#include <linux/magic.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -67,7 +68,6 @@ struct btrfs_iget_args { }; struct btrfs_dio_data { - u64 outstanding_extents; u64 reserve; u64 unsubmitted_oe_range_start; u64 unsubmitted_oe_range_end; @@ -316,7 +316,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_free_path(path); return PTR_ERR(trans); } - trans->block_rsv = &fs_info->delalloc_block_rsv; + trans->block_rsv = &BTRFS_I(inode)->block_rsv; if (compressed_size && compressed_pages) extent_item_size = btrfs_file_extent_calc_inline_size( @@ -348,7 +348,6 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, } set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - btrfs_delalloc_release_metadata(BTRFS_I(inode), end + 1 - start); btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0); out: /* @@ -379,6 +378,7 @@ struct async_cow { struct page *locked_page; u64 start; u64 end; + unsigned int write_flags; struct list_head extents; struct btrfs_work work; }; @@ -458,7 +458,6 @@ static noinline void compress_file_range(struct inode *inode, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - u64 num_bytes; u64 blocksize = fs_info->sectorsize; u64 actual_end; u64 isize = i_size_read(inode); @@ -508,8 +507,6 @@ again: total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); - num_bytes = ALIGN(end - start + 1, blocksize); - num_bytes = max(blocksize, num_bytes); total_in = 0; ret = 0; @@ -542,7 +539,10 @@ again: */ extent_range_clear_dirty_for_io(inode, start, end); redirty = 1; - ret = btrfs_compress_pages(compress_type, + + /* Compression level is applied here and only here */ + ret = btrfs_compress_pages( + compress_type | (fs_info->compress_level << 4), inode->i_mapping, start, pages, &nr_pages, @@ -570,7 +570,7 @@ again: cont: if (start == 0) { /* lets try to make an inline extent */ - if (ret || total_in < (actual_end - start)) { + if (ret || total_in < actual_end) { /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ @@ -584,16 +584,21 @@ cont: } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG; + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING; unsigned long page_error_op; - clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; /* * inline extent creation worked or returned error, * we don't need to create any more async work items. * Unlock and free up our temp pages. + * + * We use DO_ACCOUNTING here because we need the + * delalloc_release_metadata to be done _after_ we drop + * our outstanding extent for clearing delalloc for this + * range. */ extent_clear_unlock_delalloc(inode, start, end, end, NULL, clear_flags, @@ -602,10 +607,6 @@ cont: PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); - if (ret == 0) - btrfs_free_reserved_data_space_noquota(inode, - start, - end - start + 1); goto free_pages_out; } } @@ -625,7 +626,6 @@ cont: */ total_in = ALIGN(total_in, PAGE_SIZE); if (total_compressed + blocksize <= total_in) { - num_bytes = total_in; *num_added += 1; /* @@ -633,12 +633,12 @@ cont: * allocation on disk for these compressed pages, and * will submit them to the elevator. */ - add_async_extent(async_cow, start, num_bytes, + add_async_extent(async_cow, start, total_in, total_compressed, pages, nr_pages, compress_type); - if (start + num_bytes < end) { - start += num_bytes; + if (start + total_in < end) { + start += total_in; pages = NULL; cond_resched(); goto again; @@ -858,7 +858,8 @@ retry: async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, - async_extent->nr_pages)) { + async_extent->nr_pages, + async_cow->write_flags)) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; @@ -982,15 +983,19 @@ static noinline int cow_file_range(struct inode *inode, ret = cow_file_range_inline(root, inode, start, end, 0, BTRFS_COMPRESS_NONE, NULL); if (ret == 0) { + /* + * We use DO_ACCOUNTING here because we need the + * delalloc_release_metadata to be run _after_ we drop + * our outstanding extent for clearing delalloc for this + * range. + */ extent_clear_unlock_delalloc(inode, start, end, delalloc_end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG, PAGE_UNLOCK | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); - btrfs_free_reserved_data_space_noquota(inode, start, - end - start + 1); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; @@ -1188,7 +1193,8 @@ static noinline void async_cow_free(struct btrfs_work *work) static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written) + unsigned long *nr_written, + unsigned int write_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_cow *async_cow; @@ -1205,6 +1211,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->root = root; async_cow->locked_page = locked_page; async_cow->start = start; + async_cow->write_flags = write_flags; if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && !btrfs_test_opt(fs_info, FORCE_COMPRESS)) @@ -1226,13 +1233,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work); - while (atomic_read(&fs_info->async_submit_draining) && - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->async_delalloc_pages) == - 0)); - } - *nr_written += nr_pages; start = cur_end + 1; } @@ -1581,11 +1581,13 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) */ static int run_delalloc_range(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written) + unsigned long *nr_written, + struct writeback_control *wbc) { struct inode *inode = private_data; int ret; int force_cow = need_force_cow(inode, start, end); + unsigned int write_flags = wbc_to_write_flags(wbc); if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, @@ -1600,7 +1602,8 @@ static int run_delalloc_range(void *private_data, struct page *locked_page, set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); ret = cow_file_range_async(inode, locked_page, start, end, - page_started, nr_written); + page_started, nr_written, + write_flags); } if (ret) btrfs_cleanup_ordered_extents(inode, start, end - start + 1); @@ -1635,7 +1638,7 @@ static void btrfs_split_extent_hook(void *private_data, } spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; + btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); spin_unlock(&BTRFS_I(inode)->lock); } @@ -1665,7 +1668,7 @@ static void btrfs_merge_extent_hook(void *private_data, /* we're not bigger than the max, unreserve the space and go */ if (new_size <= BTRFS_MAX_EXTENT_SIZE) { spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents--; + btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); return; } @@ -1696,7 +1699,7 @@ static void btrfs_merge_extent_hook(void *private_data, return; spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents--; + btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); } @@ -1766,15 +1769,12 @@ static void btrfs_set_bit_hook(void *private_data, if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; + u32 num_extents = count_max_extents(len); bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); - if (*bits & EXTENT_FIRST_DELALLOC) { - *bits &= ~EXTENT_FIRST_DELALLOC; - } else { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - } + spin_lock(&BTRFS_I(inode)->lock); + btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); + spin_unlock(&BTRFS_I(inode)->lock); /* For sanity tests */ if (btrfs_is_testing(fs_info)) @@ -1828,13 +1828,9 @@ static void btrfs_clear_bit_hook(void *private_data, struct btrfs_root *root = inode->root; bool do_list = !btrfs_is_free_space_inode(inode); - if (*bits & EXTENT_FIRST_DELALLOC) { - *bits &= ~EXTENT_FIRST_DELALLOC; - } else if (!(*bits & EXTENT_CLEAR_META_RESV)) { - spin_lock(&inode->lock); - inode->outstanding_extents -= num_extents; - spin_unlock(&inode->lock); - } + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -num_extents); + spin_unlock(&inode->lock); /* * We don't reserve metadata space for space cache inodes so we @@ -2036,11 +2032,12 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, } int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + unsigned int extra_bits, struct extent_state **cached_state, int dedupe) { WARN_ON((end & (PAGE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - cached_state); + extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2101,10 +2098,11 @@ again: goto out; } - btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state, + btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state, 0); ClearPageChecked(page); set_page_dirty(page); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -2229,8 +2227,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret < 0) goto out; qg_released = ret; - ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins); + ret = btrfs_alloc_reserved_file_extent(trans, root, + btrfs_ino(BTRFS_I(inode)), + file_pos, qg_released, &ins); out: btrfs_free_path(path); @@ -2464,7 +2463,7 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path, ret = iterate_inodes_from_logical(old->bytenr + old->extent_offset, fs_info, path, record_one_backref, - old); + old, false); if (ret < 0 && ret != -ENOENT) return false; @@ -2682,7 +2681,7 @@ again: inode_add_bytes(inode, len); btrfs_release_path(path); - ret = btrfs_inc_extent_ref(trans, fs_info, new->bytenr, + ret = btrfs_inc_extent_ref(trans, root, new->bytenr, new->disk_len, 0, backref->root_id, backref->inum, new->file_pos); /* start - extent_offset */ @@ -2964,7 +2963,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans = NULL; goto out; } - trans->block_rsv = &fs_info->delalloc_block_rsv; + trans->block_rsv = &BTRFS_I(inode)->block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); @@ -3000,12 +2999,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - trans->block_rsv = &fs_info->delalloc_block_rsv; + trans->block_rsv = &BTRFS_I(inode)->block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); + btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, + ordered_extent->len); ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), ordered_extent->file_offset, ordered_extent->file_offset + @@ -3058,9 +3059,6 @@ out: 0, &cached_state, GFP_NOFS); } - if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - ordered_extent->len); if (trans) btrfs_end_transaction(trans); @@ -4372,47 +4370,11 @@ static int truncate_space_check(struct btrfs_trans_handle *trans, } -static int truncate_inline_extent(struct inode *inode, - struct btrfs_path *path, - struct btrfs_key *found_key, - const u64 item_end, - const u64 new_size) -{ - struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; - struct btrfs_file_extent_item *fi; - u32 size = (u32)(new_size - found_key->offset); - struct btrfs_root *root = BTRFS_I(inode)->root; - - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - - if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { - loff_t offset = new_size; - loff_t page_end = ALIGN(offset, PAGE_SIZE); - - /* - * Zero out the remaining of the last page of our inline extent, - * instead of directly truncating our inline extent here - that - * would be much more complex (decompressing all the data, then - * compressing the truncated data, which might be bigger than - * the size of the inline extent, resize the extent, etc). - * We release the path because to get the page we might need to - * read the extent item from disk (data not in the page cache). - */ - btrfs_release_path(path); - return btrfs_truncate_block(inode, offset, page_end - offset, - 0); - } - - btrfs_set_file_extent_ram_bytes(leaf, fi, size); - size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(root->fs_info, path, size, 1); - - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - inode_sub_bytes(inode, item_end + 1 - new_size); - - return 0; -} +/* + * Return this if we need to call truncate_block for the last bit of the + * truncate. + */ +#define NEED_TRUNCATE_BLOCK 1 /* * this can truncate away extent items, csum items and directory items. @@ -4451,9 +4413,9 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int err = 0; u64 ino = btrfs_ino(BTRFS_I(inode)); u64 bytes_deleted = 0; - bool be_nice = 0; - bool should_throttle = 0; - bool should_end = 0; + bool be_nice = false; + bool should_throttle = false; + bool should_end = false; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); @@ -4463,7 +4425,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, */ if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - be_nice = 1; + be_nice = true; path = btrfs_alloc_path(); if (!path) @@ -4573,11 +4535,6 @@ search_again: if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; - if (del_item) - last_size = found_key.offset; - else - last_size = new_size; - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); @@ -4619,40 +4576,30 @@ search_again: */ if (!del_item && btrfs_file_extent_encryption(leaf, fi) == 0 && - btrfs_file_extent_other_encoding(leaf, fi) == 0) { - + btrfs_file_extent_other_encoding(leaf, fi) == 0 && + btrfs_file_extent_compression(leaf, fi) == 0) { + u32 size = (u32)(new_size - found_key.offset); + + btrfs_set_file_extent_ram_bytes(leaf, fi, size); + size = btrfs_file_extent_calc_inline_size(size); + btrfs_truncate_item(root->fs_info, path, size, 1); + } else if (!del_item) { /* - * Need to release path in order to truncate a - * compressed extent. So delete any accumulated - * extent items so far. + * We have to bail so the last_size is set to + * just before this extent. */ - if (btrfs_file_extent_compression(leaf, fi) != - BTRFS_COMPRESS_NONE && pending_del_nr) { - err = btrfs_del_items(trans, root, path, - pending_del_slot, - pending_del_nr); - if (err) { - btrfs_abort_transaction(trans, - err); - goto error; - } - pending_del_nr = 0; - } + err = NEED_TRUNCATE_BLOCK; + break; + } - err = truncate_inline_extent(inode, path, - &found_key, - item_end, - new_size); - if (err) { - btrfs_abort_transaction(trans, err); - goto error; - } - } else if (test_bit(BTRFS_ROOT_REF_COWS, - &root->state)) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) inode_sub_bytes(inode, item_end + 1 - new_size); - } } delete: + if (del_item) + last_size = found_key.offset; + else + last_size = new_size; if (del_item) { if (!pending_del_nr) { /* no pending yet, add ourselves */ @@ -4669,14 +4616,14 @@ delete: } else { break; } - should_throttle = 0; + should_throttle = false; if (found_extent && (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root == fs_info->tree_root)) { btrfs_set_path_blocking(path); bytes_deleted += extent_num_bytes; - ret = btrfs_free_extent(trans, fs_info, extent_start, + ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), ino, extent_offset); @@ -4688,11 +4635,11 @@ delete: if (be_nice) { if (truncate_space_check(trans, root, extent_num_bytes)) { - should_end = 1; + should_end = true; } if (btrfs_should_throttle_delayed_refs(trans, fs_info)) - should_throttle = 1; + should_throttle = true; } } @@ -4801,8 +4748,11 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, (!len || ((len & (blocksize - 1)) == 0))) goto out; + block_start = round_down(from, blocksize); + block_end = block_start + blocksize - 1; + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, - round_down(from, blocksize), blocksize); + block_start, blocksize); if (ret) goto out; @@ -4810,15 +4760,12 @@ again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, data_reserved, - round_down(from, blocksize), - blocksize); + block_start, blocksize); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); ret = -ENOMEM; goto out; } - block_start = round_down(from, blocksize); - block_end = block_start + blocksize - 1; - if (!PageUptodate(page)) { ret = btrfs_readpage(NULL, page); lock_page(page); @@ -4853,7 +4800,7 @@ again: EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, block_start, block_end, + ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state, 0); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, @@ -4883,6 +4830,7 @@ out_unlock: if (ret) btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page); out: @@ -5500,6 +5448,14 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, goto out_err; btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); + if (location->type != BTRFS_INODE_ITEM_KEY && + location->type != BTRFS_ROOT_ITEM_KEY) { + btrfs_warn(root->fs_info, +"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", + __func__, name, btrfs_ino(BTRFS_I(dir)), + location->objectid, location->type, location->offset); + goto out_err; + } out: btrfs_free_path(path); return ret; @@ -5816,8 +5772,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } - BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); - index = srcu_read_lock(&fs_info->subvol_srcu); ret = fixup_tree_root_location(fs_info, dir, dentry, &location, &sub_root); @@ -7797,33 +7751,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, return em; } -static void adjust_dio_outstanding_extents(struct inode *inode, - struct btrfs_dio_data *dio_data, - const u64 len) -{ - unsigned num_extents = count_max_extents(len); - - /* - * If we have an outstanding_extents count still set then we're - * within our reservation, otherwise we need to adjust our inode - * counter appropriately. - */ - if (dio_data->outstanding_extents >= num_extents) { - dio_data->outstanding_extents -= num_extents; - } else { - /* - * If dio write length has been split due to no large enough - * contiguous space, we need to compensate our inode counter - * appropriately. - */ - u64 num_needed = num_extents - dio_data->outstanding_extents; - - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents += num_needed; - spin_unlock(&BTRFS_I(inode)->lock); - } -} - static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -7985,7 +7912,6 @@ unlock: if (!dio_data->overwrite && start + len > i_size_read(inode)) i_size_write(inode, start + len); - adjust_dio_outstanding_extents(inode, dio_data, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; dio_data->unsubmitted_oe_range_end = start + len; @@ -8015,14 +7941,6 @@ unlock_err: err: if (dio_data) current->journal_info = dio_data; - /* - * Compensate the delalloc release we do in btrfs_direct_IO() when we - * write less data then expected, so that we don't underflow our inode's - * outstanding extents counter. - */ - if (create && dio_data) - adjust_dio_outstanding_extents(inode, dio_data, len); - return ret; } @@ -8495,7 +8413,7 @@ static void btrfs_end_dio_bio(struct bio *bio) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - dip->dio_bio->bi_status = 0; + dip->dio_bio->bi_status = BLK_STS_OK; bio_endio(dip->orig_bio); } out: @@ -8577,7 +8495,7 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, goto err; } map: - ret = btrfs_map_bio(fs_info, bio, 0, async_submit); + ret = btrfs_map_bio(fs_info, bio, 0, 0); err: bio_put(bio); return ret; @@ -8786,7 +8704,6 @@ free_ordered: } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, - struct kiocb *iocb, const struct iov_iter *iter, loff_t offset) { int seg; @@ -8833,7 +8750,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) bool relock = false; ssize_t ret; - if (check_direct_IO(fs_info, iocb, iter, offset)) + if (check_direct_IO(fs_info, iter, offset)) return 0; inode_dio_begin(inode); @@ -8868,7 +8785,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) offset, count); if (ret) goto out; - dio_data.outstanding_extents = count_max_extents(count); /* * We need to know how many extents we reserved so that we can @@ -8915,6 +8831,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, data_reserved, offset, count - (size_t)ret); + btrfs_delalloc_release_extents(BTRFS_I(inode), count); } out: if (wakeup) @@ -9232,9 +9149,6 @@ again: fs_info->sectorsize); if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE - reserved_space); } @@ -9252,7 +9166,7 @@ again: EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, page_start, end, + ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state, 0); if (ret) { unlock_extent_cached(io_tree, page_start, page_end, @@ -9286,12 +9200,14 @@ again: out_unlock: if (!ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } unlock_page(page); out: + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); btrfs_delalloc_release_space(inode, data_reserved, page_start, reserved_space); out_noreserve: @@ -9387,12 +9303,12 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); + trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) { err = ret; break; } - trans->block_rsv = &fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); if (ret) { err = ret; @@ -9416,6 +9332,27 @@ static int btrfs_truncate(struct inode *inode) trans->block_rsv = rsv; } + /* + * We can't call btrfs_truncate_block inside a trans handle as we could + * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know + * we've truncated everything except the last little bit, and can do + * btrfs_truncate_block and then update the disk_i_size. + */ + if (ret == NEED_TRUNCATE_BLOCK) { + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); + + ret = btrfs_truncate_block(inode, inode->i_size, 0, 0); + if (ret) + goto out; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + } + if (ret == 0 && inode->i_nlink > 0) { trans->block_rsv = root->orphan_block_rsv; ret = btrfs_orphan_del(trans, BTRFS_I(inode)); @@ -9480,6 +9417,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct inode *btrfs_alloc_inode(struct super_block *sb) { + struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_inode *ei; struct inode *inode; @@ -9506,8 +9444,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) spin_lock_init(&ei->lock); ei->outstanding_extents = 0; - ei->reserved_extents = 0; - + if (sb->s_magic != BTRFS_TEST_MAGIC) + btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, + BTRFS_BLOCK_RSV_DELALLOC); ei->runtime_flags = 0; ei->prop_compress = BTRFS_COMPRESS_NONE; ei->defrag_compress = BTRFS_COMPRESS_NONE; @@ -9557,8 +9496,9 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!hlist_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); + WARN_ON(BTRFS_I(inode)->block_rsv.reserved); + WARN_ON(BTRFS_I(inode)->block_rsv.size); WARN_ON(BTRFS_I(inode)->outstanding_extents); - WARN_ON(BTRFS_I(inode)->reserved_extents); WARN_ON(BTRFS_I(inode)->delalloc_bytes); WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); WARN_ON(BTRFS_I(inode)->csum_bytes); @@ -10337,19 +10277,6 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) ret = __start_delalloc_inodes(root, delay_iput, -1); if (ret > 0) ret = 0; - /* - * the filemap_flush will queue IO into the worker threads, but - * we have to make sure the IO is actually started and that - * ordered extents get created before we return - */ - atomic_inc(&fs_info->async_submit_draining); - while (atomic_read(&fs_info->nr_async_submits) || - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0 && - atomic_read(&fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&fs_info->async_submit_draining); return ret; } @@ -10391,14 +10318,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, spin_unlock(&fs_info->delalloc_root_lock); ret = 0; - atomic_inc(&fs_info->async_submit_draining); - while (atomic_read(&fs_info->nr_async_submits) || - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0 && - atomic_read(&fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&fs_info->async_submit_draining); out: if (!list_empty_careful(&splice)) { spin_lock(&fs_info->delalloc_root_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6c7a49faf4e0..2ef8acaac688 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -86,6 +86,19 @@ struct btrfs_ioctl_received_subvol_args_32 { struct btrfs_ioctl_received_subvol_args_32) #endif +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) +struct btrfs_ioctl_send_args_32 { + __s64 send_fd; /* in */ + __u64 clone_sources_count; /* in */ + compat_uptr_t clone_sources; /* in */ + __u64 parent_root; /* in */ + __u64 flags; /* in */ + __u64 reserved[4]; /* in */ +} __attribute__ ((__packed__)); + +#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ + struct btrfs_ioctl_send_args_32) +#endif static int btrfs_clone(struct inode *src, struct inode *inode, u64 off, u64 olen, u64 olen_aligned, u64 destoff, @@ -609,23 +622,6 @@ fail_free: return ret; } -static void btrfs_wait_for_no_snapshotting_writes(struct btrfs_root *root) -{ - s64 writers; - DEFINE_WAIT(wait); - - do { - prepare_to_wait(&root->subv_writers->wait, &wait, - TASK_UNINTERRUPTIBLE); - - writers = percpu_counter_sum(&root->subv_writers->counter); - if (writers) - schedule(); - - finish_wait(&root->subv_writers->wait, &wait); - } while (writers); -} - static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, u64 *async_transid, bool readonly, @@ -654,7 +650,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, atomic_inc(&root->will_be_snapshotted); smp_mb__after_atomic(); - btrfs_wait_for_no_snapshotting_writes(root); + /* wait for no snapshot writes */ + wait_event(root->subv_writers->wait, + percpu_counter_sum(&root->subv_writers->counter) == 0); ret = btrfs_start_delalloc_inodes(root, 0); if (ret) @@ -1174,7 +1172,7 @@ again: if (!i_done || ret) goto out; - if (!(inode->i_sb->s_flags & MS_ACTIVE)) + if (!(inode->i_sb->s_flags & SB_ACTIVE)) goto out; /* @@ -1219,6 +1217,7 @@ again: unlock_page(pages[i]); put_page(pages[i]); } + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return i_done; out: @@ -1229,6 +1228,7 @@ out: btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return ret; @@ -1333,7 +1333,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * make sure we stop running if someone unmounts * the FS */ - if (!(inode->i_sb->s_flags & MS_ACTIVE)) + if (!(inode->i_sb->s_flags & SB_ACTIVE)) break; if (btrfs_defrag_cancelled(fs_info)) { @@ -1420,21 +1420,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, filemap_flush(inode->i_mapping); } - if (do_compress) { - /* the filemap_flush will queue IO into the worker threads, but - * we have to make sure the IO is actually started and that - * ordered extents get created before we return - */ - atomic_inc(&fs_info->async_submit_draining); - while (atomic_read(&fs_info->nr_async_submits) || - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0 && - atomic_read(&fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&fs_info->async_submit_draining); - } - if (range->compress_type == BTRFS_COMPRESS_LZO) { btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { @@ -1842,8 +1827,13 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); + if (ret < 0) { + btrfs_end_transaction(trans); + goto out_reset; + } + + ret = btrfs_commit_transaction(trans); - btrfs_commit_transaction(trans); out_reset: if (ret) btrfs_set_root_flags(&root->root_item, root_flags); @@ -2179,7 +2169,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, inode = file_inode(file); ret = search_ioctl(inode, &args.key, &buf_size, - (char *)(&uarg->buf[0])); + (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) ret = -EFAULT; else if (ret == -EOVERFLOW && @@ -2216,7 +2206,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, if (!path) return -ENOMEM; - ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX]; + ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1]; key.objectid = tree_id; key.type = BTRFS_ROOT_ITEM_KEY; @@ -3706,7 +3696,7 @@ process_slot: if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, - fs_info, + root, disko, diskl, 0, root->root_key.objectid, btrfs_ino(BTRFS_I(inode)), @@ -4129,10 +4119,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_space_info *dest_orig; struct btrfs_ioctl_space_info __user *user_dest; struct btrfs_space_info *info; - u64 types[] = {BTRFS_BLOCK_GROUP_DATA, - BTRFS_BLOCK_GROUP_SYSTEM, - BTRFS_BLOCK_GROUP_METADATA, - BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + static const u64 types[] = { + BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA + }; int num_types = 4; int alloc_size; int ret = 0; @@ -4504,8 +4496,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) ipath->fspath->val[i] = rel_ptr; } - ret = copy_to_user((void *)(unsigned long)ipa->fspath, - (void *)(unsigned long)ipath->fspath, size); + ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, + ipath->fspath, size); if (ret) { ret = -EFAULT; goto out; @@ -4540,13 +4532,14 @@ static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) } static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, - void __user *arg) + void __user *arg, int version) { int ret = 0; int size; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; + bool ignore_offset; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4555,13 +4548,30 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, if (IS_ERR(loi)) return PTR_ERR(loi); + if (version == 1) { + ignore_offset = false; + size = min_t(u32, loi->size, SZ_64K); + } else { + /* All reserved bits must be 0 for now */ + if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) { + ret = -EINVAL; + goto out_loi; + } + /* Only accept flags we have defined so far */ + if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) { + ret = -EINVAL; + goto out_loi; + } + ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET; + size = min_t(u32, loi->size, SZ_16M); + } + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } - size = min_t(u32, loi->size, SZ_64K); inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); @@ -4570,20 +4580,21 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - build_ino_list, inodes); + build_ino_list, inodes, ignore_offset); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) goto out; - ret = copy_to_user((void *)(unsigned long)loi->inodes, - (void *)(unsigned long)inodes, size); + ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes, + size); if (ret) ret = -EFAULT; out: btrfs_free_path(path); kvfree(inodes); +out_loi: kfree(loi); return ret; @@ -5160,15 +5171,11 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, root->root_key.objectid); if (ret < 0 && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); goto out; } } ret = btrfs_commit_transaction(trans); - if (ret < 0) { - btrfs_abort_transaction(trans, ret); - goto out; - } - out: up_write(&fs_info->subvol_sem); mnt_drop_write_file(file); @@ -5490,6 +5497,41 @@ out_drop_write: return ret; } +static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) +{ + struct btrfs_ioctl_send_args *arg; + int ret; + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_send_args_32 args32; + + ret = copy_from_user(&args32, argp, sizeof(args32)); + if (ret) + return -EFAULT; + arg = kzalloc(sizeof(*arg), GFP_KERNEL); + if (!arg) + return -ENOMEM; + arg->send_fd = args32.send_fd; + arg->clone_sources_count = args32.clone_sources_count; + arg->clone_sources = compat_ptr(args32.clone_sources); + arg->parent_root = args32.parent_root; + arg->flags = args32.flags; + memcpy(arg->reserved, args32.reserved, + sizeof(args32.reserved)); +#else + return -ENOTTY; +#endif + } else { + arg = memdup_user(argp, sizeof(*arg)); + if (IS_ERR(arg)) + return PTR_ERR(arg); + } + ret = btrfs_ioctl_send(file, arg); + kfree(arg); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -5554,7 +5596,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_INO_PATHS: return btrfs_ioctl_ino_to_path(root, argp); case BTRFS_IOC_LOGICAL_INO: - return btrfs_ioctl_logical_to_ino(fs_info, argp); + return btrfs_ioctl_logical_to_ino(fs_info, argp, 1); + case BTRFS_IOC_LOGICAL_INO_V2: + return btrfs_ioctl_logical_to_ino(fs_info, argp, 2); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(fs_info, argp); case BTRFS_IOC_SYNC: { @@ -5595,7 +5639,11 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return btrfs_ioctl_send(file, argp); + return _btrfs_ioctl_send(file, argp, false); +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_SEND_32: + return _btrfs_ioctl_send(file, argp, true); +#endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); case BTRFS_IOC_QUOTA_CTL: diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d433e75d489a..6c7f18cd3b61 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -430,10 +430,15 @@ out: return ret; } +static void lzo_set_level(struct list_head *ws, unsigned int type) +{ +} + const struct btrfs_compress_op btrfs_lzo_compress = { .alloc_workspace = lzo_alloc_workspace, .free_workspace = lzo_free_workspace, .compress_pages = lzo_compress_pages, .decompress_bio = lzo_decompress_bio, .decompress = lzo_decompress, + .set_level = lzo_set_level, }; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a3aca495e33e..5b311aeddcc8 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -242,6 +242,15 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, } spin_unlock(&root->ordered_extent_lock); + /* + * We don't need the count_max_extents here, we can assume that all of + * that work has been done at higher layers, so this is truly the + * smallest the extent is going to get. + */ + spin_lock(&BTRFS_I(inode)->lock); + btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); + spin_unlock(&BTRFS_I(inode)->lock); + return 0; } @@ -591,11 +600,19 @@ void btrfs_remove_ordered_extent(struct inode *inode, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_inode_tree *tree; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; struct rb_node *node; bool dec_pending_ordered = false; - tree = &BTRFS_I(inode)->ordered_tree; + /* This is paired with btrfs_add_ordered_extent. */ + spin_lock(&btrfs_inode->lock); + btrfs_mod_outstanding_extents(btrfs_inode, -1); + spin_unlock(&btrfs_inode->lock); + if (root != fs_info->tree_root) + btrfs_delalloc_release_metadata(btrfs_inode, entry->len); + + tree = &btrfs_inode->ordered_tree; spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e172d4843eae..168fd03ca3ac 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1441,7 +1441,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info, u64 bytenr = qrecord->bytenr; int ret; - ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root); + ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false); if (ret < 0) return ret; @@ -2031,7 +2031,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, /* Search commit root to find old_roots */ ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, - &record->old_roots); + &record->old_roots, false); if (ret < 0) goto cleanup; } @@ -2042,7 +2042,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, * root. It's safe inside commit_transaction(). */ ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, SEQ_LAST, &new_roots); + record->bytenr, SEQ_LAST, &new_roots, false); if (ret < 0) goto cleanup; if (qgroup_to_skip) { @@ -2570,7 +2570,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, num_bytes = found.offset; ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, - &roots); + &roots, false); if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 24a62224b24b..a7f79254ecca 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1326,6 +1326,9 @@ write_data: cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); } /* @@ -1582,6 +1585,10 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + return -EIO; finish: @@ -2107,6 +2114,10 @@ cleanup: if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + return -EIO; } @@ -2231,12 +2242,18 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, ASSERT(!bio->bi_iter.bi_size); rbio->operation = BTRFS_RBIO_PARITY_SCRUB; - for (i = 0; i < rbio->real_stripes; i++) { + /* + * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted + * to the end position, so this search can start from the first parity + * stripe. + */ + for (i = rbio->nr_data; i < rbio->real_stripes; i++) { if (bbio->stripes[i].dev == scrub_dev) { rbio->scrubp = i; break; } } + ASSERT(i < rbio->real_stripes); /* Now we just support the sectorsize equals to page size */ ASSERT(fs_info->sectorsize == PAGE_SIZE); @@ -2454,6 +2471,9 @@ submit_write: cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2563,12 +2583,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) int stripe; struct bio *bio; + bio_list_init(&bio_list); + ret = alloc_rbio_essential_pages(rbio); if (ret) goto cleanup; - bio_list_init(&bio_list); - atomic_set(&rbio->error, 0); /* * build a list of bios to read all the missing parts of this @@ -2636,6 +2656,10 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + return; finish: diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c new file mode 100644 index 000000000000..34878699d363 --- /dev/null +++ b/fs/btrfs/ref-verify.c @@ -0,0 +1,1031 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/stacktrace.h> +#include "ctree.h" +#include "disk-io.h" +#include "locking.h" +#include "delayed-ref.h" +#include "ref-verify.h" + +/* + * Used to keep track the roots and number of refs each root has for a given + * bytenr. This just tracks the number of direct references, no shared + * references. + */ +struct root_entry { + u64 root_objectid; + u64 num_refs; + struct rb_node node; +}; + +/* + * These are meant to represent what should exist in the extent tree, these can + * be used to verify the extent tree is consistent as these should all match + * what the extent tree says. + */ +struct ref_entry { + u64 root_objectid; + u64 parent; + u64 owner; + u64 offset; + u64 num_refs; + struct rb_node node; +}; + +#define MAX_TRACE 16 + +/* + * Whenever we add/remove a reference we record the action. The action maps + * back to the delayed ref action. We hold the ref we are changing in the + * action so we can account for the history properly, and we record the root we + * were called with since it could be different from ref_root. We also store + * stack traces because thats how I roll. + */ +struct ref_action { + int action; + u64 root; + struct ref_entry ref; + struct list_head list; + unsigned long trace[MAX_TRACE]; + unsigned int trace_len; +}; + +/* + * One of these for every block we reference, it holds the roots and references + * to it as well as all of the ref actions that have occured to it. We never + * free it until we unmount the file system in order to make sure re-allocations + * are happening properly. + */ +struct block_entry { + u64 bytenr; + u64 len; + u64 num_refs; + int metadata; + int from_disk; + struct rb_root roots; + struct rb_root refs; + struct rb_node node; + struct list_head actions; +}; + +static struct block_entry *insert_block_entry(struct rb_root *root, + struct block_entry *be) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct block_entry *entry; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct block_entry, node); + if (entry->bytenr > be->bytenr) + p = &(*p)->rb_left; + else if (entry->bytenr < be->bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(&be->node, parent_node, p); + rb_insert_color(&be->node, root); + return NULL; +} + +static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n; + struct block_entry *entry = NULL; + + n = root->rb_node; + while (n) { + entry = rb_entry(n, struct block_entry, node); + if (entry->bytenr < bytenr) + n = n->rb_right; + else if (entry->bytenr > bytenr) + n = n->rb_left; + else + return entry; + } + return NULL; +} + +static struct root_entry *insert_root_entry(struct rb_root *root, + struct root_entry *re) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct root_entry *entry; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct root_entry, node); + if (entry->root_objectid > re->root_objectid) + p = &(*p)->rb_left; + else if (entry->root_objectid < re->root_objectid) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(&re->node, parent_node, p); + rb_insert_color(&re->node, root); + return NULL; + +} + +static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2) +{ + if (ref1->root_objectid < ref2->root_objectid) + return -1; + if (ref1->root_objectid > ref2->root_objectid) + return 1; + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + if (ref1->owner < ref2->owner) + return -1; + if (ref1->owner > ref2->owner) + return 1; + if (ref1->offset < ref2->offset) + return -1; + if (ref1->offset > ref2->offset) + return 1; + return 0; +} + +static struct ref_entry *insert_ref_entry(struct rb_root *root, + struct ref_entry *ref) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct ref_entry *entry; + int cmp; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct ref_entry, node); + cmp = comp_refs(entry, ref); + if (cmp > 0) + p = &(*p)->rb_left; + else if (cmp < 0) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(&ref->node, parent_node, p); + rb_insert_color(&ref->node, root); + return NULL; + +} + +static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid) +{ + struct rb_node *n; + struct root_entry *entry = NULL; + + n = root->rb_node; + while (n) { + entry = rb_entry(n, struct root_entry, node); + if (entry->root_objectid < objectid) + n = n->rb_right; + else if (entry->root_objectid > objectid) + n = n->rb_left; + else + return entry; + } + return NULL; +} + +#ifdef CONFIG_STACKTRACE +static void __save_stack_trace(struct ref_action *ra) +{ + struct stack_trace stack_trace; + + stack_trace.max_entries = MAX_TRACE; + stack_trace.nr_entries = 0; + stack_trace.entries = ra->trace; + stack_trace.skip = 2; + save_stack_trace(&stack_trace); + ra->trace_len = stack_trace.nr_entries; +} + +static void __print_stack_trace(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + struct stack_trace trace; + + if (ra->trace_len == 0) { + btrfs_err(fs_info, " ref-verify: no stacktrace"); + return; + } + trace.nr_entries = ra->trace_len; + trace.entries = ra->trace; + print_stack_trace(&trace, 2); +} +#else +static void inline __save_stack_trace(struct ref_action *ra) +{ +} + +static void inline __print_stack_trace(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + btrfs_err(fs_info, " ref-verify: no stacktrace support"); +} +#endif + +static void free_block_entry(struct block_entry *be) +{ + struct root_entry *re; + struct ref_entry *ref; + struct ref_action *ra; + struct rb_node *n; + + while ((n = rb_first(&be->roots))) { + re = rb_entry(n, struct root_entry, node); + rb_erase(&re->node, &be->roots); + kfree(re); + } + + while((n = rb_first(&be->refs))) { + ref = rb_entry(n, struct ref_entry, node); + rb_erase(&ref->node, &be->refs); + kfree(ref); + } + + while (!list_empty(&be->actions)) { + ra = list_first_entry(&be->actions, struct ref_action, + list); + list_del(&ra->list); + kfree(ra); + } + kfree(be); +} + +static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 len, + u64 root_objectid) +{ + struct block_entry *be = NULL, *exist; + struct root_entry *re = NULL; + + re = kzalloc(sizeof(struct root_entry), GFP_KERNEL); + be = kzalloc(sizeof(struct block_entry), GFP_KERNEL); + if (!be || !re) { + kfree(re); + kfree(be); + return ERR_PTR(-ENOMEM); + } + be->bytenr = bytenr; + be->len = len; + + re->root_objectid = root_objectid; + re->num_refs = 0; + + spin_lock(&fs_info->ref_verify_lock); + exist = insert_block_entry(&fs_info->block_tree, be); + if (exist) { + if (root_objectid) { + struct root_entry *exist_re; + + exist_re = insert_root_entry(&exist->roots, re); + if (exist_re) + kfree(re); + } + kfree(be); + return exist; + } + + be->num_refs = 0; + be->metadata = 0; + be->from_disk = 0; + be->roots = RB_ROOT; + be->refs = RB_ROOT; + INIT_LIST_HEAD(&be->actions); + if (root_objectid) + insert_root_entry(&be->roots, re); + else + kfree(re); + return be; +} + +static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root, + u64 parent, u64 bytenr, int level) +{ + struct block_entry *be; + struct root_entry *re; + struct ref_entry *ref = NULL, *exist; + + ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL); + if (!ref) + return -ENOMEM; + + if (parent) + ref->root_objectid = 0; + else + ref->root_objectid = ref_root; + ref->parent = parent; + ref->owner = level; + ref->offset = 0; + ref->num_refs = 1; + + be = add_block_entry(fs_info, bytenr, fs_info->nodesize, ref_root); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs++; + be->from_disk = 1; + be->metadata = 1; + + if (!parent) { + ASSERT(ref_root); + re = lookup_root_entry(&be->roots, ref_root); + ASSERT(re); + re->num_refs++; + } + exist = insert_ref_entry(&be->refs, ref); + if (exist) { + exist->num_refs++; + kfree(ref); + } + spin_unlock(&fs_info->ref_verify_lock); + + return 0; +} + +static int add_shared_data_ref(struct btrfs_fs_info *fs_info, + u64 parent, u32 num_refs, u64 bytenr, + u64 num_bytes) +{ + struct block_entry *be; + struct ref_entry *ref; + + ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL); + if (!ref) + return -ENOMEM; + be = add_block_entry(fs_info, bytenr, num_bytes, 0); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs += num_refs; + + ref->parent = parent; + ref->num_refs = num_refs; + if (insert_ref_entry(&be->refs, ref)) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "existing shared ref when reading from disk?"); + kfree(ref); + return -EINVAL; + } + spin_unlock(&fs_info->ref_verify_lock); + return 0; +} + +static int add_extent_data_ref(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, + struct btrfs_extent_data_ref *dref, + u64 bytenr, u64 num_bytes) +{ + struct block_entry *be; + struct ref_entry *ref; + struct root_entry *re; + u64 ref_root = btrfs_extent_data_ref_root(leaf, dref); + u64 owner = btrfs_extent_data_ref_objectid(leaf, dref); + u64 offset = btrfs_extent_data_ref_offset(leaf, dref); + u32 num_refs = btrfs_extent_data_ref_count(leaf, dref); + + ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL); + if (!ref) + return -ENOMEM; + be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs += num_refs; + + ref->parent = 0; + ref->owner = owner; + ref->root_objectid = ref_root; + ref->offset = offset; + ref->num_refs = num_refs; + if (insert_ref_entry(&be->refs, ref)) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "existing ref when reading from disk?"); + kfree(ref); + return -EINVAL; + } + + re = lookup_root_entry(&be->roots, ref_root); + if (!re) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "missing root in new block entry?"); + return -EINVAL; + } + re->num_refs += num_refs; + spin_unlock(&fs_info->ref_verify_lock); + return 0; +} + +static int process_extent_item(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_key *key, + int slot, int *tree_block_level) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct extent_buffer *leaf = path->nodes[0]; + u32 item_size = btrfs_item_size_nr(leaf, slot); + unsigned long end, ptr; + u64 offset, flags, count; + int type, ret; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + if ((key->type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)(ei + 1); + *tree_block_level = btrfs_tree_block_level(leaf, info); + iref = (struct btrfs_extent_inline_ref *)(info + 1); + } else { + if (key->type == BTRFS_METADATA_ITEM_KEY) + *tree_block_level = key->offset; + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + offset = btrfs_extent_inline_ref_offset(leaf, iref); + switch (type) { + case BTRFS_TREE_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, offset, 0, key->objectid, + *tree_block_level); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, 0, offset, key->objectid, + *tree_block_level); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + ret = add_extent_data_ref(fs_info, leaf, dref, + key->objectid, key->offset); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + count = btrfs_shared_data_ref_count(leaf, sref); + ret = add_shared_data_ref(fs_info, offset, count, + key->objectid, key->offset); + break; + default: + btrfs_err(fs_info, "invalid key type in iref"); + ret = -EINVAL; + break; + } + if (ret) + break; + ptr += btrfs_extent_inline_ref_size(type); + } + return ret; +} + +static int process_leaf(struct btrfs_root *root, + struct btrfs_path *path, u64 *bytenr, u64 *num_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + u32 count; + int i = 0, tree_block_level = 0, ret; + struct btrfs_key key; + int nritems = btrfs_header_nritems(leaf); + + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + switch (key.type) { + case BTRFS_EXTENT_ITEM_KEY: + *num_bytes = key.offset; + case BTRFS_METADATA_ITEM_KEY: + *bytenr = key.objectid; + ret = process_extent_item(fs_info, path, &key, i, + &tree_block_level); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, key.offset, 0, + key.objectid, tree_block_level); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, 0, key.offset, + key.objectid, tree_block_level); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = btrfs_item_ptr(leaf, i, + struct btrfs_extent_data_ref); + ret = add_extent_data_ref(fs_info, leaf, dref, *bytenr, + *num_bytes); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = btrfs_item_ptr(leaf, i, + struct btrfs_shared_data_ref); + count = btrfs_shared_data_ref_count(leaf, sref); + ret = add_shared_data_ref(fs_info, key.offset, count, + *bytenr, *num_bytes); + break; + default: + break; + } + if (ret) + break; + } + return ret; +} + +/* Walk down to the leaf from the given level */ +static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, + int level, u64 *bytenr, u64 *num_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *eb; + u64 block_bytenr, gen; + int ret = 0; + + while (level >= 0) { + if (level) { + block_bytenr = btrfs_node_blockptr(path->nodes[level], + path->slots[level]); + gen = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + eb = read_tree_block(fs_info, block_bytenr, gen); + if (IS_ERR(eb)) + return PTR_ERR(eb); + if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->nodes[level-1] = eb; + path->slots[level-1] = 0; + path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING; + } else { + ret = process_leaf(root, path, bytenr, num_bytes); + if (ret) + break; + } + level--; + } + return ret; +} + +/* Walk up to the next node that needs to be processed */ +static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + int l; + + for (l = 0; l < BTRFS_MAX_LEVEL; l++) { + if (!path->nodes[l]) + continue; + if (l) { + path->slots[l]++; + if (path->slots[l] < + btrfs_header_nritems(path->nodes[l])) { + *level = l; + return 0; + } + } + btrfs_tree_unlock_rw(path->nodes[l], path->locks[l]); + free_extent_buffer(path->nodes[l]); + path->nodes[l] = NULL; + path->slots[l] = 0; + path->locks[l] = 0; + } + + return 1; +} + +static void dump_ref_action(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + btrfs_err(fs_info, +" Ref action %d, root %llu, ref_root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu", + ra->action, ra->root, ra->ref.root_objectid, ra->ref.parent, + ra->ref.owner, ra->ref.offset, ra->ref.num_refs); + __print_stack_trace(fs_info, ra); +} + +/* + * Dumps all the information from the block entry to printk, it's going to be + * awesome. + */ +static void dump_block_entry(struct btrfs_fs_info *fs_info, + struct block_entry *be) +{ + struct ref_entry *ref; + struct root_entry *re; + struct ref_action *ra; + struct rb_node *n; + + btrfs_err(fs_info, +"dumping block entry [%llu %llu], num_refs %llu, metadata %d, from disk %d", + be->bytenr, be->len, be->num_refs, be->metadata, + be->from_disk); + + for (n = rb_first(&be->refs); n; n = rb_next(n)) { + ref = rb_entry(n, struct ref_entry, node); + btrfs_err(fs_info, +" ref root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu", + ref->root_objectid, ref->parent, ref->owner, + ref->offset, ref->num_refs); + } + + for (n = rb_first(&be->roots); n; n = rb_next(n)) { + re = rb_entry(n, struct root_entry, node); + btrfs_err(fs_info, " root entry %llu, num_refs %llu", + re->root_objectid, re->num_refs); + } + + list_for_each_entry(ra, &be->actions, list) + dump_ref_action(fs_info, ra); +} + +/* + * btrfs_ref_tree_mod: called when we modify a ref for a bytenr + * @root: the root we are making this modification from. + * @bytenr: the bytenr we are modifying. + * @num_bytes: number of bytes. + * @parent: the parent bytenr. + * @ref_root: the original root owner of the bytenr. + * @owner: level in the case of metadata, inode in the case of data. + * @offset: 0 for metadata, file offset for data. + * @action: the action that we are doing, this is the same as the delayed ref + * action. + * + * This will add an action item to the given bytenr and do sanity checks to make + * sure we haven't messed something up. If we are making a new allocation and + * this block entry has history we will delete all previous actions as long as + * our sanity checks pass as they are no longer needed. + */ +int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, u64 owner, u64 offset, + int action) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct ref_entry *ref = NULL, *exist; + struct ref_action *ra = NULL; + struct block_entry *be = NULL; + struct root_entry *re = NULL; + int ret = 0; + bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + + if (!btrfs_test_opt(root->fs_info, REF_VERIFY)) + return 0; + + ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); + ra = kmalloc(sizeof(struct ref_action), GFP_NOFS); + if (!ra || !ref) { + kfree(ref); + kfree(ra); + ret = -ENOMEM; + goto out; + } + + if (parent) { + ref->parent = parent; + } else { + ref->root_objectid = ref_root; + ref->owner = owner; + ref->offset = offset; + } + ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1; + + memcpy(&ra->ref, ref, sizeof(struct ref_entry)); + /* + * Save the extra info from the delayed ref in the ref action to make it + * easier to figure out what is happening. The real ref's we add to the + * ref tree need to reflect what we save on disk so it matches any + * on-disk refs we pre-loaded. + */ + ra->ref.owner = owner; + ra->ref.offset = offset; + ra->ref.root_objectid = ref_root; + __save_stack_trace(ra); + + INIT_LIST_HEAD(&ra->list); + ra->action = action; + ra->root = root->objectid; + + /* + * This is an allocation, preallocate the block_entry in case we haven't + * used it before. + */ + ret = -EINVAL; + if (action == BTRFS_ADD_DELAYED_EXTENT) { + /* + * For subvol_create we'll just pass in whatever the parent root + * is and the new root objectid, so let's not treat the passed + * in root as if it really has a ref for this bytenr. + */ + be = add_block_entry(root->fs_info, bytenr, num_bytes, ref_root); + if (IS_ERR(be)) { + kfree(ra); + ret = PTR_ERR(be); + goto out; + } + be->num_refs++; + if (metadata) + be->metadata = 1; + + if (be->num_refs != 1) { + btrfs_err(fs_info, + "re-allocated a block that still has references to it!"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + goto out_unlock; + } + + while (!list_empty(&be->actions)) { + struct ref_action *tmp; + + tmp = list_first_entry(&be->actions, struct ref_action, + list); + list_del(&tmp->list); + kfree(tmp); + } + } else { + struct root_entry *tmp; + + if (!parent) { + re = kmalloc(sizeof(struct root_entry), GFP_NOFS); + if (!re) { + kfree(ref); + kfree(ra); + ret = -ENOMEM; + goto out; + } + /* + * This is the root that is modifying us, so it's the + * one we want to lookup below when we modify the + * re->num_refs. + */ + ref_root = root->objectid; + re->root_objectid = root->objectid; + re->num_refs = 0; + } + + spin_lock(&root->fs_info->ref_verify_lock); + be = lookup_block_entry(&root->fs_info->block_tree, bytenr); + if (!be) { + btrfs_err(fs_info, +"trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!", + action, (unsigned long long)bytenr, + (unsigned long long)num_bytes); + dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); + goto out_unlock; + } + + if (!parent) { + tmp = insert_root_entry(&be->roots, re); + if (tmp) { + kfree(re); + re = tmp; + } + } + } + + exist = insert_ref_entry(&be->refs, ref); + if (exist) { + if (action == BTRFS_DROP_DELAYED_REF) { + if (exist->num_refs == 0) { + btrfs_err(fs_info, +"dropping a ref for a existing root that doesn't have a ref on the block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + exist->num_refs--; + if (exist->num_refs == 0) { + rb_erase(&exist->node, &be->refs); + kfree(exist); + } + } else if (!be->metadata) { + exist->num_refs++; + } else { + btrfs_err(fs_info, +"attempting to add another ref for an existing ref on a tree block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + kfree(ref); + } else { + if (action == BTRFS_DROP_DELAYED_REF) { + btrfs_err(fs_info, +"dropping a ref for a root that doesn't have a ref on the block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + } + + if (!parent && !re) { + re = lookup_root_entry(&be->roots, ref_root); + if (!re) { + /* + * This shouldn't happen because we will add our re + * above when we lookup the be with !parent, but just in + * case catch this case so we don't panic because I + * didn't thik of some other corner case. + */ + btrfs_err(fs_info, "failed to find root %llu for %llu", + root->objectid, be->bytenr); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + } + if (action == BTRFS_DROP_DELAYED_REF) { + if (re) + re->num_refs--; + be->num_refs--; + } else if (action == BTRFS_ADD_DELAYED_REF) { + be->num_refs++; + if (re) + re->num_refs++; + } + list_add_tail(&ra->list, &be->actions); + ret = 0; +out_unlock: + spin_unlock(&root->fs_info->ref_verify_lock); +out: + if (ret) + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + return ret; +} + +/* Free up the ref cache */ +void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) +{ + struct block_entry *be; + struct rb_node *n; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return; + + spin_lock(&fs_info->ref_verify_lock); + while ((n = rb_first(&fs_info->block_tree))) { + be = rb_entry(n, struct block_entry, node); + rb_erase(&be->node, &fs_info->block_tree); + free_block_entry(be); + cond_resched_lock(&fs_info->ref_verify_lock); + } + spin_unlock(&fs_info->ref_verify_lock); +} + +void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, + u64 len) +{ + struct block_entry *be = NULL, *entry; + struct rb_node *n; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return; + + spin_lock(&fs_info->ref_verify_lock); + n = fs_info->block_tree.rb_node; + while (n) { + entry = rb_entry(n, struct block_entry, node); + if (entry->bytenr < start) { + n = n->rb_right; + } else if (entry->bytenr > start) { + n = n->rb_left; + } else { + be = entry; + break; + } + /* We want to get as close to start as possible */ + if (be == NULL || + (entry->bytenr < start && be->bytenr > start) || + (entry->bytenr < start && entry->bytenr > be->bytenr)) + be = entry; + } + + /* + * Could have an empty block group, maybe have something to check for + * this case to verify we were actually empty? + */ + if (!be) { + spin_unlock(&fs_info->ref_verify_lock); + return; + } + + n = &be->node; + while (n) { + be = rb_entry(n, struct block_entry, node); + n = rb_next(n); + if (be->bytenr < start && be->bytenr + be->len > start) { + btrfs_err(fs_info, + "block entry overlaps a block group [%llu,%llu]!", + start, len); + dump_block_entry(fs_info, be); + continue; + } + if (be->bytenr < start) + continue; + if (be->bytenr >= start + len) + break; + if (be->bytenr + be->len > start + len) { + btrfs_err(fs_info, + "block entry overlaps a block group [%llu,%llu]!", + start, len); + dump_block_entry(fs_info, be); + } + rb_erase(&be->node, &fs_info->block_tree); + free_block_entry(be); + } + spin_unlock(&fs_info->ref_verify_lock); +} + +/* Walk down all roots and build the ref tree, meant to be called at mount */ +int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_path *path; + struct btrfs_root *root; + struct extent_buffer *eb; + u64 bytenr = 0, num_bytes = 0; + int ret, level; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + eb = btrfs_read_lock_root_node(fs_info->extent_root); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + level = btrfs_header_level(eb); + path->nodes[level] = eb; + path->slots[level] = 0; + path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + + while (1) { + /* + * We have to keep track of the bytenr/num_bytes we last hit + * because we could have run out of space for an inline ref, and + * would have had to added a ref key item which may appear on a + * different leaf from the original extent item. + */ + ret = walk_down_tree(fs_info->extent_root, path, level, + &bytenr, &num_bytes); + if (ret) + break; + ret = walk_up_tree(root, path, &level); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + } + if (ret) { + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + btrfs_free_ref_cache(fs_info); + } + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h new file mode 100644 index 000000000000..3bf02ce0e1e2 --- /dev/null +++ b/fs/btrfs/ref-verify.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#ifndef __REF_VERIFY__ +#define __REF_VERIFY__ + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY +int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); +void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); +int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, u64 owner, u64 offset, + int action); +void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, + u64 len); + +static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->ref_verify_lock); + fs_info->block_tree = RB_ROOT; +} +#else +static inline int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) +{ + return 0; +} + +static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) +{ +} + +static inline int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, + u64 owner, u64 offset, int action) +{ + return 0; +} + +static inline void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, + u64 start, u64 len) +{ +} + +static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) +{ +} + +#endif /* CONFIG_BTRFS_FS_REF_VERIFY */ +#endif /* _REF_VERIFY__ */ diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9841faef08ea..f0c3f00e97cb 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1742,7 +1742,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); - ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr, + ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), key.objectid, key.offset); @@ -1751,7 +1751,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, break; } - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { @@ -1952,21 +1952,21 @@ again: path->slots[level], old_ptr_gen); btrfs_mark_buffer_dirty(path->nodes[level]); - ret = btrfs_inc_extent_ref(trans, fs_info, old_bytenr, + ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, src->root_key.objectid, level - 1, 0); BUG_ON(ret); - ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr, + ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, 0); BUG_ON(ret); - ret = btrfs_free_extent(trans, fs_info, new_bytenr, blocksize, + ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, src->root_key.objectid, level - 1, 0); BUG_ON(ret); - ret = btrfs_free_extent(trans, fs_info, old_bytenr, blocksize, + ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, 0); BUG_ON(ret); @@ -2808,7 +2808,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(upper->eb); - ret = btrfs_inc_extent_ref(trans, root->fs_info, + ret = btrfs_inc_extent_ref(trans, root, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), @@ -3246,6 +3246,8 @@ static int relocate_file_extent_cluster(struct inode *inode, put_page(page); btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE); ret = -EIO; goto out; } @@ -3266,7 +3268,8 @@ static int relocate_file_extent_cluster(struct inode *inode, nr++; } - btrfs_set_extent_delalloc(inode, page_start, page_end, NULL, 0); + btrfs_set_extent_delalloc(inode, page_start, page_end, 0, NULL, + 0); set_page_dirty(page); unlock_extent(&BTRFS_I(inode)->io_tree, @@ -3275,6 +3278,7 @@ static int relocate_file_extent_cluster(struct inode *inode, put_page(page); index++; + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 95bcc3cce78f..3338407ef0f0 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -226,10 +226,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) struct btrfs_root *root; int err = 0; int ret; - bool can_recover = true; - - if (sb_rdonly(fs_info->sb)) - can_recover = false; path = btrfs_alloc_path(); if (!path) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e3f6c49e5c4d..b2f871d80982 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -231,7 +231,7 @@ struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; const char *errstr; - sector_t sector; + u64 physical; u64 logical; struct btrfs_device *dev; }; @@ -797,10 +797,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", +"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), - (unsigned long long)swarn->sector, + swarn->physical, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, (char *)(unsigned long)ipath->fspath->val[i]); @@ -810,10 +810,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, err: btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", + "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), - (unsigned long long)swarn->sector, + swarn->physical, root, inum, offset, ret); free_ipath(ipath); @@ -845,7 +845,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (!path) return; - swarn.sector = (sblock->pagev[0]->physical) >> 9; + swarn.physical = sblock->pagev[0]->physical; swarn.logical = sblock->pagev[0]->logical; swarn.errstr = errstr; swarn.dev = NULL; @@ -868,10 +868,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) item_size, &ref_root, &ref_level); btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, sector %llu: metadata %s (level %d) in tree %llu", +"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, rcu_str_deref(dev->name), - (unsigned long long)swarn.sector, + swarn.physical, ref_level ? "node" : "leaf", ret < 0 ? -1 : ref_level, ret < 0 ? -1 : ref_root); @@ -883,7 +883,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, - scrub_print_warning_inode, &swarn); + scrub_print_warning_inode, &swarn, false); } out: @@ -1047,7 +1047,7 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) * can be found. */ ret = iterate_inodes_from_logical(fixup->logical, fs_info, path, - scrub_fixup_readpage, fixup); + scrub_fixup_readpage, fixup, false); if (ret < 0) { uncorrectable = 1; goto out; @@ -4390,7 +4390,7 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) } ret = iterate_inodes_from_logical(logical, fs_info, path, - record_inode_for_nocow, nocow_ctx); + record_inode_for_nocow, nocow_ctx, false); if (ret != 0 && ret != -ENOENT) { btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d", diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8fd195cfe81b..20d3300bd268 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -26,6 +26,7 @@ #include <linux/radix-tree.h> #include <linux/vmalloc.h> #include <linux/string.h> +#include <linux/compat.h> #include "send.h" #include "backref.h" @@ -992,7 +993,6 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, * path must point to the dir item when called. */ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *found_key, iterate_dir_item_t iterate, void *ctx) { int ret = 0; @@ -1271,12 +1271,6 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) */ if (ino >= bctx->cur_objectid) return 0; -#if 0 - if (ino > bctx->cur_objectid) - return 0; - if (offset + bctx->extent_len > bctx->cur_offset) - return 0; -#endif } bctx->found++; @@ -1429,7 +1423,7 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = 0; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, - backref_ctx); + backref_ctx, false); if (ret < 0) goto out; @@ -3527,7 +3521,40 @@ out: } /* - * Check if ino ino1 is an ancestor of inode ino2 in the given root. + * Check if inode ino2, or any of its ancestors, is inode ino1. + * Return 1 if true, 0 if false and < 0 on error. + */ +static int check_ino_in_path(struct btrfs_root *root, + const u64 ino1, + const u64 ino1_gen, + const u64 ino2, + const u64 ino2_gen, + struct fs_path *fs_path) +{ + u64 ino = ino2; + + if (ino1 == ino2) + return ino1_gen == ino2_gen; + + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + u64 parent; + u64 parent_gen; + int ret; + + fs_path_reset(fs_path); + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); + if (ret < 0) + return ret; + if (parent == ino1) + return parent_gen == ino1_gen; + ino = parent; + } + return 0; +} + +/* + * Check if ino ino1 is an ancestor of inode ino2 in the given root for any + * possible path (in case ino2 is not a directory and has multiple hard links). * Return 1 if true, 0 if false and < 0 on error. */ static int is_ancestor(struct btrfs_root *root, @@ -3536,36 +3563,91 @@ static int is_ancestor(struct btrfs_root *root, const u64 ino2, struct fs_path *fs_path) { - u64 ino = ino2; - bool free_path = false; + bool free_fs_path = false; int ret = 0; + struct btrfs_path *path = NULL; + struct btrfs_key key; if (!fs_path) { fs_path = fs_path_alloc(); if (!fs_path) return -ENOMEM; - free_path = true; + free_fs_path = true; } - while (ino > BTRFS_FIRST_FREE_OBJECTID) { - u64 parent; - u64 parent_gen; + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + } - fs_path_reset(fs_path); - ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); - if (ret < 0) { - if (ret == -ENOENT && ino == ino2) - ret = 0; - goto out; + key.objectid = ino2; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + u32 cur_offset = 0; + u32 item_size; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + continue; } - if (parent == ino1) { - ret = parent_gen == ino1_gen ? 1 : 0; - goto out; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino2) + break; + if (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY) + break; + + item_size = btrfs_item_size_nr(leaf, slot); + while (cur_offset < item_size) { + u64 parent; + u64 parent_gen; + + if (key.type == BTRFS_INODE_EXTREF_KEY) { + unsigned long ptr; + struct btrfs_inode_extref *extref; + + ptr = btrfs_item_ptr_offset(leaf, slot); + extref = (struct btrfs_inode_extref *) + (ptr + cur_offset); + parent = btrfs_inode_extref_parent(leaf, + extref); + cur_offset += sizeof(*extref); + cur_offset += btrfs_inode_extref_name_len(leaf, + extref); + } else { + parent = key.offset; + cur_offset = item_size; + } + + ret = get_inode_info(root, parent, NULL, &parent_gen, + NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = check_ino_in_path(root, ino1, ino1_gen, + parent, parent_gen, fs_path); + if (ret) + goto out; } - ino = parent; + path->slots[0]++; } + ret = 0; out: - if (free_path) + btrfs_free_path(path); + if (free_fs_path) fs_path_free(fs_path); return ret; } @@ -4106,8 +4188,8 @@ out: return ret; } -static int record_ref(struct btrfs_root *root, int num, u64 dir, int index, - struct fs_path *name, void *ctx, struct list_head *refs) +static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name, + void *ctx, struct list_head *refs) { int ret = 0; struct send_ctx *sctx = ctx; @@ -4143,8 +4225,7 @@ static int __record_new_ref(int num, u64 dir, int index, void *ctx) { struct send_ctx *sctx = ctx; - return record_ref(sctx->send_root, num, dir, index, name, - ctx, &sctx->new_refs); + return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs); } @@ -4153,8 +4234,8 @@ static int __record_deleted_ref(int num, u64 dir, int index, void *ctx) { struct send_ctx *sctx = ctx; - return record_ref(sctx->parent_root, num, dir, index, name, - ctx, &sctx->deleted_refs); + return record_ref(sctx->parent_root, dir, name, ctx, + &sctx->deleted_refs); } static int record_new_ref(struct send_ctx *sctx) @@ -4498,7 +4579,7 @@ static int process_new_xattr(struct send_ctx *sctx) int ret = 0; ret = iterate_dir_item(sctx->send_root, sctx->left_path, - sctx->cmp_key, __process_new_xattr, sctx); + __process_new_xattr, sctx); return ret; } @@ -4506,7 +4587,7 @@ static int process_new_xattr(struct send_ctx *sctx) static int process_deleted_xattr(struct send_ctx *sctx) { return iterate_dir_item(sctx->parent_root, sctx->right_path, - sctx->cmp_key, __process_deleted_xattr, sctx); + __process_deleted_xattr, sctx); } struct find_xattr_ctx { @@ -4551,7 +4632,7 @@ static int find_xattr(struct btrfs_root *root, ctx.found_data = NULL; ctx.found_data_len = 0; - ret = iterate_dir_item(root, path, key, __find_xattr, &ctx); + ret = iterate_dir_item(root, path, __find_xattr, &ctx); if (ret < 0) return ret; @@ -4621,11 +4702,11 @@ static int process_changed_xattr(struct send_ctx *sctx) int ret = 0; ret = iterate_dir_item(sctx->send_root, sctx->left_path, - sctx->cmp_key, __process_changed_new_xattr, sctx); + __process_changed_new_xattr, sctx); if (ret < 0) goto out; ret = iterate_dir_item(sctx->parent_root, sctx->right_path, - sctx->cmp_key, __process_changed_deleted_xattr, sctx); + __process_changed_deleted_xattr, sctx); out: return ret; @@ -4675,8 +4756,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx) goto out; } - ret = iterate_dir_item(root, path, &found_key, - __process_new_xattr, sctx); + ret = iterate_dir_item(root, path, __process_new_xattr, sctx); if (ret < 0) goto out; @@ -4723,16 +4803,27 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) /* initial readahead */ memset(&sctx->ra, 0, sizeof(struct file_ra_state)); file_ra_state_init(&sctx->ra, inode->i_mapping); - page_cache_sync_readahead(inode->i_mapping, &sctx->ra, NULL, index, - last_index - index + 1); while (index <= last_index) { unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); - page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); + + page = find_lock_page(inode->i_mapping, index); if (!page) { - ret = -ENOMEM; - break; + page_cache_sync_readahead(inode->i_mapping, &sctx->ra, + NULL, index, last_index + 1 - index); + + page = find_or_create_page(inode->i_mapping, index, + GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + break; + } + } + + if (PageReadahead(page)) { + page_cache_async_readahead(inode->i_mapping, &sctx->ra, + NULL, page, index, last_index + 1 - index); } if (!PageUptodate(page)) { @@ -6162,9 +6253,7 @@ out: * Updates compare related fields in sctx and simply forwards to the actual * changed_xxx functions. */ -static int changed_cb(struct btrfs_root *left_root, - struct btrfs_root *right_root, - struct btrfs_path *left_path, +static int changed_cb(struct btrfs_path *left_path, struct btrfs_path *right_path, struct btrfs_key *key, enum btrfs_compare_tree_result result, @@ -6246,8 +6335,8 @@ static int full_send_tree(struct send_ctx *sctx) slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); - ret = changed_cb(send_root, NULL, path, NULL, - &found_key, BTRFS_COMPARE_TREE_NEW, sctx); + ret = changed_cb(path, NULL, &found_key, + BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; @@ -6365,13 +6454,12 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) spin_unlock(&root->root_item_lock); } -long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) +long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) { int ret = 0; struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; - struct btrfs_ioctl_send_args *arg = NULL; struct btrfs_key key; struct send_ctx *sctx = NULL; u32 i; @@ -6407,13 +6495,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } - arg = memdup_user(arg_, sizeof(*arg)); - if (IS_ERR(arg)) { - ret = PTR_ERR(arg); - arg = NULL; - goto out; - } - /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside @@ -6654,7 +6735,6 @@ out: if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) btrfs_root_dec_send_in_progress(sctx->parent_root); - kfree(arg); kvfree(clone_sources_tmp); if (sctx) { diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 02e00166c4da..3aa4bc55754f 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -130,5 +130,5 @@ enum { #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) #ifdef __KERNEL__ -long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); +long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 161694b66038..3a4dce153645 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -107,7 +107,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) return; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; btrfs_info(fs_info, "forced readonly"); /* * Note that a running device replace operation is not @@ -137,7 +137,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function /* * Special case: if the error is EROFS, and we're already - * under MS_RDONLY, then it is safe here. + * under SB_RDONLY, then it is safe here. */ if (errno == -EROFS && sb_rdonly(sb)) return; @@ -168,7 +168,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); /* Don't go through full error handling during mount */ - if (sb->s_flags & MS_BORN) + if (sb->s_flags & SB_BORN) btrfs_handle_error(fs_info); } @@ -202,7 +202,6 @@ static struct ratelimit_state printk_limits[] = { void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { - struct super_block *sb = fs_info->sb; char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; va_list args; @@ -228,7 +227,8 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) vaf.va = &args; if (__ratelimit(ratelimit)) - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, + fs_info ? fs_info->sb->s_id : "<unknown>", &vaf); va_end(args); } @@ -292,7 +292,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, vaf.va = &args; errstr = btrfs_decode_error(errno); - if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) + if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", s_id, function, line, &vaf, errno, errstr); @@ -326,6 +326,9 @@ enum { #ifdef CONFIG_BTRFS_DEBUG Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, #endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + Opt_ref_verify, +#endif Opt_err, }; @@ -387,6 +390,9 @@ static const match_table_t tokens = { {Opt_fragment_metadata, "fragment=metadata"}, {Opt_fragment_all, "fragment=all"}, #endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + {Opt_ref_verify, "ref_verify"}, +#endif {Opt_err, NULL}, }; @@ -501,7 +507,18 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, token == Opt_compress_force || strncmp(args[0].from, "zlib", 4) == 0) { compress_type = "zlib"; + info->compress_type = BTRFS_COMPRESS_ZLIB; + info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; + /* + * args[0] contains uninitialized data since + * for these tokens we don't expect any + * parameter. + */ + if (token != Opt_compress && + token != Opt_compress_force) + info->compress_level = + btrfs_compress_str2level(args[0].from); btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); @@ -549,9 +566,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, compress_force != saved_compress_force)) || (!btrfs_test_opt(info, COMPRESS) && no_compress == 1)) { - btrfs_info(info, "%s %s compression", + btrfs_info(info, "%s %s compression, level %d", (compress_force) ? "force" : "use", - compress_type); + compress_type, info->compress_level); } compress_force = false; break; @@ -617,7 +634,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_acl: #ifdef CONFIG_BTRFS_FS_POSIX_ACL - info->sb->s_flags |= MS_POSIXACL; + info->sb->s_flags |= SB_POSIXACL; break; #else btrfs_err(info, "support for ACL not compiled in!"); @@ -625,7 +642,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; #endif case Opt_noacl: - info->sb->s_flags &= ~MS_POSIXACL; + info->sb->s_flags &= ~SB_POSIXACL; break; case Opt_notreelog: btrfs_set_and_info(info, NOTREELOG, @@ -825,6 +842,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); break; #endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + case Opt_ref_verify: + btrfs_info(info, "doing ref verification"); + btrfs_set_opt(info->mount_opt, REF_VERIFY); + break; +#endif case Opt_err: btrfs_info(info, "unrecognized mount option '%s'", p); ret = -EINVAL; @@ -837,7 +860,7 @@ check: /* * Extra check for current option against current flag */ - if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { + if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) { btrfs_err(info, "nologreplay must be used with ro mount option"); ret = -EINVAL; @@ -1133,7 +1156,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; #ifdef CONFIG_BTRFS_FS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif sb->s_flags |= SB_I_VERSION; sb->s_iflags |= SB_I_CGROUPWB; @@ -1166,7 +1189,7 @@ static int btrfs_fill_super(struct super_block *sb, } cleancache_init_fs(sb); - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; return 0; fail_close: @@ -1205,8 +1228,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) * happens. The pending operations are delayed to the * next commit after thawing. */ - if (__sb_start_write(sb, SB_FREEZE_WRITE, false)) - __sb_end_write(sb, SB_FREEZE_WRITE); + if (sb_start_write_trylock(sb)) + sb_end_write(sb); else return 0; trans = btrfs_start_transaction(root, 0); @@ -1246,6 +1269,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); + if (info->compress_level) + seq_printf(seq, ":%d", info->compress_level); } if (btrfs_test_opt(info, NOSSD)) seq_puts(seq, ",nossd"); @@ -1261,7 +1286,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD)) seq_puts(seq, ",discard"); - if (!(info->sb->s_flags & MS_POSIXACL)) + if (!(info->sb->s_flags & SB_POSIXACL)) seq_puts(seq, ",noacl"); if (btrfs_test_opt(info, SPACE_CACHE)) seq_puts(seq, ",space_cache"); @@ -1305,6 +1330,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, FRAGMENT_METADATA)) seq_puts(seq, ",fragment=metadata"); #endif + if (btrfs_test_opt(info, REF_VERIFY)) + seq_puts(seq, ",ref_verify"); seq_printf(seq, ",subvolid=%llu", BTRFS_I(d_inode(dentry))->root->root_key.objectid); seq_puts(seq, ",subvol="); @@ -1391,11 +1418,11 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { - if (flags & MS_RDONLY) { - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, + if (flags & SB_RDONLY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~SB_RDONLY, device_name, newargs); } else { - mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, + mnt = vfs_kern_mount(&btrfs_fs_type, flags | SB_RDONLY, device_name, newargs); if (IS_ERR(mnt)) { root = ERR_CAST(mnt); @@ -1547,7 +1574,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, u64 subvol_objectid = 0; int error = 0; - if (!(flags & MS_RDONLY)) + if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; error = btrfs_parse_early_options(data, mode, fs_type, @@ -1601,13 +1628,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error) goto error_fs_info; - if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) { error = -EACCES; goto error_close_devices; } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC, + s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC, fs_info); if (IS_ERR(s)) { error = PTR_ERR(s); @@ -1617,7 +1644,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (s->s_root) { btrfs_close_devices(fs_devices); free_fs_info(fs_info); - if ((flags ^ s->s_flags) & MS_RDONLY) + if ((flags ^ s->s_flags) & SB_RDONLY) error = -EBUSY; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); @@ -1684,11 +1711,11 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, { if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || - (flags & MS_RDONLY))) { + (flags & SB_RDONLY))) { /* wait for any defraggers to finish */ wait_event(fs_info->transaction_wait, (atomic_read(&fs_info->defrag_running) == 0)); - if (flags & MS_RDONLY) + if (flags & SB_RDONLY) sync_filesystem(fs_info->sb); } } @@ -1748,10 +1775,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { /* * this also happens on 'umount -rf' or on shutdown, when * the filesystem is busy. @@ -1763,10 +1790,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) /* avoid complains from lockdep et al. */ up(&fs_info->uuid_tree_rescan_sem); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* - * Setting MS_RDONLY will put the cleaner thread to + * Setting SB_RDONLY will put the cleaner thread to * sleep at the next loop if it's already active. * If it's already asleep, we'll leave unused block * groups on disk until we're mounted read-write again @@ -1838,7 +1865,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; set_bit(BTRFS_FS_OPEN, &fs_info->flags); } @@ -1848,9 +1875,9 @@ out: return 0; restore: - /* We've hit an error - don't reset MS_RDONLY */ + /* We've hit an error - don't reset SB_RDONLY */ if (sb_rdonly(sb)) - old_flags |= MS_RDONLY; + old_flags |= SB_RDONLY; sb->s_flags = old_flags; fs_info->mount_opt = old_opts; fs_info->compress_type = old_compress_type; @@ -2112,7 +2139,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) * succeed even if the Avail is zero. But this is better than the other * way around. */ - thresh = 4 * 1024 * 1024; + thresh = SZ_4M; if (!mixed && total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; @@ -2319,6 +2346,9 @@ static void btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY ", integrity-checker=on" #endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + ", ref-verify=on" +#endif "\n", btrfs_crc32c_impl()); } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 883881b16c86..a28bba801264 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -247,7 +247,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj, struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); } -BTRFS_ATTR(global_rsv_size, global_rsv_size_show); +BTRFS_ATTR(allocation, global_rsv_size, global_rsv_size_show); static ssize_t global_rsv_reserved_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -256,15 +256,15 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj, struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); } -BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show); +BTRFS_ATTR(allocation, global_rsv_reserved, global_rsv_reserved_show); #define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) #define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); -BTRFS_RAID_ATTR(total_bytes, raid_bytes_show); -BTRFS_RAID_ATTR(used_bytes, raid_bytes_show); +BTRFS_ATTR(raid, total_bytes, raid_bytes_show); +BTRFS_ATTR(raid, used_bytes, raid_bytes_show); static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -277,7 +277,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, down_read(&sinfo->groups_sem); list_for_each_entry(block_group, &sinfo->block_groups[index], list) { - if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes)) + if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes)) val += block_group->key.offset; else val += btrfs_block_group_used(&block_group->item); @@ -287,8 +287,8 @@ static ssize_t raid_bytes_show(struct kobject *kobj, } static struct attribute *raid_attributes[] = { - BTRFS_RAID_ATTR_PTR(total_bytes), - BTRFS_RAID_ATTR_PTR(used_bytes), + BTRFS_ATTR_PTR(raid, total_bytes), + BTRFS_ATTR_PTR(raid, used_bytes), NULL }; @@ -311,7 +311,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ struct btrfs_space_info *sinfo = to_space_info(kobj); \ return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ } \ -BTRFS_ATTR(field, btrfs_space_info_show_##field) +BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, struct kobj_attribute *a, @@ -331,19 +331,20 @@ SPACE_INFO_ATTR(bytes_may_use); SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); -BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned); +BTRFS_ATTR(space_info, total_bytes_pinned, + btrfs_space_info_show_total_bytes_pinned); static struct attribute *space_info_attrs[] = { - BTRFS_ATTR_PTR(flags), - BTRFS_ATTR_PTR(total_bytes), - BTRFS_ATTR_PTR(bytes_used), - BTRFS_ATTR_PTR(bytes_pinned), - BTRFS_ATTR_PTR(bytes_reserved), - BTRFS_ATTR_PTR(bytes_may_use), - BTRFS_ATTR_PTR(bytes_readonly), - BTRFS_ATTR_PTR(disk_used), - BTRFS_ATTR_PTR(disk_total), - BTRFS_ATTR_PTR(total_bytes_pinned), + BTRFS_ATTR_PTR(space_info, flags), + BTRFS_ATTR_PTR(space_info, total_bytes), + BTRFS_ATTR_PTR(space_info, bytes_used), + BTRFS_ATTR_PTR(space_info, bytes_pinned), + BTRFS_ATTR_PTR(space_info, bytes_reserved), + BTRFS_ATTR_PTR(space_info, bytes_may_use), + BTRFS_ATTR_PTR(space_info, bytes_readonly), + BTRFS_ATTR_PTR(space_info, disk_used), + BTRFS_ATTR_PTR(space_info, disk_total), + BTRFS_ATTR_PTR(space_info, total_bytes_pinned), NULL, }; @@ -361,8 +362,8 @@ struct kobj_type space_info_ktype = { }; static const struct attribute *allocation_attrs[] = { - BTRFS_ATTR_PTR(global_rsv_reserved), - BTRFS_ATTR_PTR(global_rsv_size), + BTRFS_ATTR_PTR(allocation, global_rsv_reserved), + BTRFS_ATTR_PTR(allocation, global_rsv_size), NULL, }; @@ -415,7 +416,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj, return len; } -BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); +BTRFS_ATTR_RW(, label, btrfs_label_show, btrfs_label_store); static ssize_t btrfs_nodesize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -425,7 +426,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); } -BTRFS_ATTR(nodesize, btrfs_nodesize_show); +BTRFS_ATTR(, nodesize, btrfs_nodesize_show); static ssize_t btrfs_sectorsize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -436,7 +437,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, fs_info->super_copy->sectorsize); } -BTRFS_ATTR(sectorsize, btrfs_sectorsize_show); +BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -447,7 +448,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, fs_info->super_copy->sectorsize); } -BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); +BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); static ssize_t quota_override_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -487,14 +488,14 @@ static ssize_t quota_override_store(struct kobject *kobj, return len; } -BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store); +BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store); static const struct attribute *btrfs_attrs[] = { - BTRFS_ATTR_PTR(label), - BTRFS_ATTR_PTR(nodesize), - BTRFS_ATTR_PTR(sectorsize), - BTRFS_ATTR_PTR(clone_alignment), - BTRFS_ATTR_PTR(quota_override), + BTRFS_ATTR_PTR(, label), + BTRFS_ATTR_PTR(, nodesize), + BTRFS_ATTR_PTR(, sectorsize), + BTRFS_ATTR_PTR(, clone_alignment), + BTRFS_ATTR_PTR(, quota_override), NULL, }; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 4cb908305e5d..80457f31c29f 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -21,21 +21,16 @@ enum btrfs_feature_set { .store = _store, \ } -#define BTRFS_ATTR_RW(_name, _show, _store) \ - static struct kobj_attribute btrfs_attr_##_name = \ +#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) -#define BTRFS_ATTR(_name, _show) \ - static struct kobj_attribute btrfs_attr_##_name = \ +#define BTRFS_ATTR(_prefix, _name, _show) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) -#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) - -#define BTRFS_RAID_ATTR(_name, _show) \ - static struct kobj_attribute btrfs_raid_attr_##_name = \ - __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) - -#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) +#define BTRFS_ATTR_PTR(_prefix, _name) \ + (&btrfs_attr_##_prefix##_##_name.attr) struct btrfs_feature_attr { @@ -44,15 +39,16 @@ struct btrfs_feature_attr { u64 feature_bit; }; -#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \ -static struct btrfs_feature_attr btrfs_attr_##_name = { \ +#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \ +static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ btrfs_feature_attr_show, \ btrfs_feature_attr_store), \ .feature_set = _feature_set, \ - .feature_bit = _prefix ##_## _feature_bit, \ + .feature_bit = _feature_prefix ##_## _feature_bit, \ } -#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr) +#define BTRFS_FEAT_ATTR_PTR(_name) \ + (&btrfs_attr_features_##_name.kobj_attr.attr) #define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index d06b1c931d05..2e7f64a3b22b 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -114,7 +114,7 @@ static int test_find_delalloc(u32 sectorsize) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, sectorsize - 1, NULL); + set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -145,7 +145,7 @@ static int test_find_delalloc(u32 sectorsize) test_msg("Couldn't find the locked page\n"); goto out_bits; } - set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, NULL); + set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -200,7 +200,7 @@ static int test_find_delalloc(u32 sectorsize) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL); + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 1458bb0ea124..8444a018cca2 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -500,7 +500,8 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, path = btrfs_alloc_path(); if (!path) { test_msg("Couldn't allocate path\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out; } ret = add_block_group_free_space(&trans, root->fs_info, cache); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 8c91d03cc82d..30affb60da51 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -770,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, 4096 * 1024, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -968,8 +968,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ - BTRFS_I(inode)->outstanding_extents++; - ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, + ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); @@ -983,10 +982,9 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1003,7 +1001,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DIRTY | - EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); @@ -1017,11 +1015,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1035,16 +1032,11 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize] - * - * I'm artificially adding 2 to outstanding_extents because in the - * buffered IO case we'd add things up as we go, but I don't feel like - * doing that here, this isn't the interesting case we want to test. */ - BTRFS_I(inode)->outstanding_extents += 2; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1059,10 +1051,9 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1079,7 +1070,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); @@ -1096,10 +1087,9 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) * Refill the hole again just for good measure, because I thought it * might fail and I'd rather satisfy my paranoia at this point. */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1114,7 +1104,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* Empty */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); @@ -1131,7 +1121,7 @@ out: if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); iput(inode); btrfs_free_dummy_root(root); diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 0f4ce970d195..90204b166643 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -240,7 +240,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, * we can only call btrfs_qgroup_account_extent() directly to test * quota. */ - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -252,7 +253,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -275,7 +277,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, old_roots = NULL; new_roots = NULL; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -286,7 +289,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) return -EINVAL; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -337,7 +341,8 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -349,7 +354,8 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -370,7 +376,8 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -382,7 +389,8 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -409,7 +417,8 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -421,7 +430,8 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f615d59b0489..5a8c2649af2f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -797,8 +797,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - if (fs_info->global_block_rsv.space_info->full && - btrfs_check_space_for_delayed_refs(trans, fs_info)) + if (btrfs_check_space_for_delayed_refs(trans, fs_info)) return 1; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); @@ -950,6 +949,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; + atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers); while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { bool wait_writeback = false; @@ -985,6 +985,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, cond_resched(); start = end + 1; } + atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers); return werr; } @@ -1915,8 +1916,17 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { + /* + * We use writeback_inodes_sb here because if we used + * btrfs_start_delalloc_roots we would deadlock with fs freeze. + * Currently are holding the fs freeze lock, if we do an async flush + * we'll do btrfs_join_transaction() and deadlock because we need to + * wait for the fs freeze lock. Using the direct flushing we benefit + * from already being in a transaction and our join_transaction doesn't + * have to re-take the fs freeze lock. + */ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - return btrfs_start_delalloc_roots(fs_info, 1, -1); + writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); return 0; } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c new file mode 100644 index 000000000000..ce4ed6ec8f39 --- /dev/null +++ b/fs/btrfs/tree-checker.c @@ -0,0 +1,442 @@ +/* + * Copyright (C) Qu Wenruo 2017. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. + */ + +/* + * The module is used to catch unexpected/corrupted tree block data. + * Such behavior can be caused either by a fuzzed image or bugs. + * + * The objective is to do leaf/node validation checks when tree block is read + * from disk, and check *every* possible member, so other code won't + * need to checking them again. + * + * Due to the potential and unwanted damage, every checker needs to be + * carefully reviewed otherwise so it does not prevent mount of valid images. + */ + +#include "ctree.h" +#include "tree-checker.h" +#include "disk-io.h" +#include "compression.h" + +/* + * Error message should follow the following format: + * corrupt <type>: <identifier>, <reason>[, <bad_value>] + * + * @type: leaf or node + * @identifier: the necessary info to locate the leaf/node. + * It's recommened to decode key.objecitd/offset if it's + * meaningful. + * @reason: describe the error + * @bad_value: optional, it's recommened to output bad value and its + * expected value (range). + * + * Since comma is used to separate the components, only space is allowed + * inside each component. + */ + +/* + * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt. + * Allows callers to customize the output. + */ +__printf(4, 5) +static void generic_err(const struct btrfs_root *root, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(root->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root->objectid, btrfs_header_bytenr(eb), slot, &vaf); + va_end(args); +} + +/* + * Customized reporter for extent data item, since its key objectid and + * offset has its own meaning. + */ +__printf(4, 5) +static void file_extent_err(const struct btrfs_root *root, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(root->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid, + btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf); + va_end(args); +} + +/* + * Return 0 if the btrfs_file_extent_##name is aligned to @alignment + * Else return 1 + */ +#define CHECK_FE_ALIGNED(root, leaf, slot, fi, name, alignment) \ +({ \ + if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \ + file_extent_err((root), (leaf), (slot), \ + "invalid %s for file extent, have %llu, should be aligned to %u", \ + (#name), btrfs_file_extent_##name((leaf), (fi)), \ + (alignment)); \ + (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \ +}) + +static int check_extent_data_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_file_extent_item *fi; + u32 sectorsize = root->fs_info->sectorsize; + u32 item_size = btrfs_item_size_nr(leaf, slot); + + if (!IS_ALIGNED(key->offset, sectorsize)) { + file_extent_err(root, leaf, slot, +"unaligned file_offset for file extent, have %llu should be aligned to %u", + key->offset, sectorsize); + return -EUCLEAN; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { + file_extent_err(root, leaf, slot, + "invalid type for file extent, have %u expect range [0, %u]", + btrfs_file_extent_type(leaf, fi), + BTRFS_FILE_EXTENT_TYPES); + return -EUCLEAN; + } + + /* + * Support for new compression/encrption must introduce incompat flag, + * and must be caught in open_ctree(). + */ + if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { + file_extent_err(root, leaf, slot, + "invalid compression for file extent, have %u expect range [0, %u]", + btrfs_file_extent_compression(leaf, fi), + BTRFS_COMPRESS_TYPES); + return -EUCLEAN; + } + if (btrfs_file_extent_encryption(leaf, fi)) { + file_extent_err(root, leaf, slot, + "invalid encryption for file extent, have %u expect 0", + btrfs_file_extent_encryption(leaf, fi)); + return -EUCLEAN; + } + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { + /* Inline extent must have 0 as key offset */ + if (key->offset) { + file_extent_err(root, leaf, slot, + "invalid file_offset for inline file extent, have %llu expect 0", + key->offset); + return -EUCLEAN; + } + + /* Compressed inline extent has no on-disk size, skip it */ + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE) + return 0; + + /* Uncompressed inline extent size must match item size */ + if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi)) { + file_extent_err(root, leaf, slot, + "invalid ram_bytes for uncompressed inline extent, have %u expect %llu", + item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi)); + return -EUCLEAN; + } + return 0; + } + + /* Regular or preallocated extent has fixed item size */ + if (item_size != sizeof(*fi)) { + file_extent_err(root, leaf, slot, + "invalid item size for reg/prealloc file extent, have %u expect %zu", + item_size, sizeof(*fi)); + return -EUCLEAN; + } + if (CHECK_FE_ALIGNED(root, leaf, slot, fi, ram_bytes, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_bytenr, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_num_bytes, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, offset, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, num_bytes, sectorsize)) + return -EUCLEAN; + return 0; +} + +static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + u32 sectorsize = root->fs_info->sectorsize; + u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy); + + if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { + generic_err(root, leaf, slot, + "invalid key objectid for csum item, have %llu expect %llu", + key->objectid, BTRFS_EXTENT_CSUM_OBJECTID); + return -EUCLEAN; + } + if (!IS_ALIGNED(key->offset, sectorsize)) { + generic_err(root, leaf, slot, + "unaligned key offset for csum item, have %llu should be aligned to %u", + key->offset, sectorsize); + return -EUCLEAN; + } + if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { + generic_err(root, leaf, slot, + "unaligned item size for csum item, have %u should be aligned to %u", + btrfs_item_size_nr(leaf, slot), csumsize); + return -EUCLEAN; + } + return 0; +} + +/* + * Common point to switch the item-specific validation. + */ +static int check_leaf_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + int ret = 0; + + switch (key->type) { + case BTRFS_EXTENT_DATA_KEY: + ret = check_extent_data_item(root, leaf, key, slot); + break; + case BTRFS_EXTENT_CSUM_KEY: + ret = check_csum_item(root, leaf, key, slot); + break; + } + return ret; +} + +static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, + bool check_item_data) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + /* No valid key type is 0, so all key should be larger than this key */ + struct btrfs_key prev_key = {0, 0, 0}; + struct btrfs_key key; + u32 nritems = btrfs_header_nritems(leaf); + int slot; + + /* + * Extent buffers from a relocation tree have a owner field that + * corresponds to the subvolume tree they are based on. So just from an + * extent buffer alone we can not find out what is the id of the + * corresponding subvolume tree, so we can not figure out if the extent + * buffer corresponds to the root of the relocation tree or not. So + * skip this check for relocation trees. + */ + if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { + struct btrfs_root *check_root; + + key.objectid = btrfs_header_owner(leaf); + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + check_root = btrfs_get_fs_root(fs_info, &key, false); + /* + * The only reason we also check NULL here is that during + * open_ctree() some roots has not yet been set up. + */ + if (!IS_ERR_OR_NULL(check_root)) { + struct extent_buffer *eb; + + eb = btrfs_root_node(check_root); + /* if leaf is the root, then it's fine */ + if (leaf != eb) { + generic_err(check_root, leaf, 0, + "invalid nritems, have %u should not be 0 for non-root leaf", + nritems); + free_extent_buffer(eb); + return -EUCLEAN; + } + free_extent_buffer(eb); + } + return 0; + } + + if (nritems == 0) + return 0; + + /* + * Check the following things to make sure this is a good leaf, and + * leaf users won't need to bother with similar sanity checks: + * + * 1) key ordering + * 2) item offset and size + * No overlap, no hole, all inside the leaf. + * 3) item content + * If possible, do comprehensive sanity check. + * NOTE: All checks must only rely on the item data itself. + */ + for (slot = 0; slot < nritems; slot++) { + u32 item_end_expected; + int ret; + + btrfs_item_key_to_cpu(leaf, &key, slot); + + /* Make sure the keys are in the right order */ + if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { + generic_err(root, leaf, slot, + "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", + prev_key.objectid, prev_key.type, + prev_key.offset, key.objectid, key.type, + key.offset); + return -EUCLEAN; + } + + /* + * Make sure the offset and ends are right, remember that the + * item data starts at the end of the leaf and grows towards the + * front. + */ + if (slot == 0) + item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); + else + item_end_expected = btrfs_item_offset_nr(leaf, + slot - 1); + if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { + generic_err(root, leaf, slot, + "unexpected item end, have %u expect %u", + btrfs_item_end_nr(leaf, slot), + item_end_expected); + return -EUCLEAN; + } + + /* + * Check to make sure that we don't point outside of the leaf, + * just in case all the items are consistent to each other, but + * all point outside of the leaf. + */ + if (btrfs_item_end_nr(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(fs_info)) { + generic_err(root, leaf, slot, + "slot end outside of leaf, have %u expect range [0, %u]", + btrfs_item_end_nr(leaf, slot), + BTRFS_LEAF_DATA_SIZE(fs_info)); + return -EUCLEAN; + } + + /* Also check if the item pointer overlaps with btrfs item. */ + if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > + btrfs_item_ptr_offset(leaf, slot)) { + generic_err(root, leaf, slot, + "slot overlaps with its data, item end %lu data start %lu", + btrfs_item_nr_offset(slot) + + sizeof(struct btrfs_item), + btrfs_item_ptr_offset(leaf, slot)); + return -EUCLEAN; + } + + if (check_item_data) { + /* + * Check if the item size and content meet other + * criteria + */ + ret = check_leaf_item(root, leaf, &key, slot); + if (ret < 0) + return ret; + } + + prev_key.objectid = key.objectid; + prev_key.type = key.type; + prev_key.offset = key.offset; + } + + return 0; +} + +int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf) +{ + return check_leaf(root, leaf, true); +} + +int btrfs_check_leaf_relaxed(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + return check_leaf(root, leaf, false); +} + +int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) +{ + unsigned long nr = btrfs_header_nritems(node); + struct btrfs_key key, next_key; + int slot; + u64 bytenr; + int ret = 0; + + if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { + btrfs_crit(root->fs_info, +"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", + root->objectid, node->start, + nr == 0 ? "small" : "large", nr, + BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)); + return -EUCLEAN; + } + + for (slot = 0; slot < nr - 1; slot++) { + bytenr = btrfs_node_blockptr(node, slot); + btrfs_node_key_to_cpu(node, &key, slot); + btrfs_node_key_to_cpu(node, &next_key, slot + 1); + + if (!bytenr) { + generic_err(root, node, slot, + "invalid NULL node pointer"); + ret = -EUCLEAN; + goto out; + } + if (!IS_ALIGNED(bytenr, root->fs_info->sectorsize)) { + generic_err(root, node, slot, + "unaligned pointer, have %llu should be aligned to %u", + bytenr, root->fs_info->sectorsize); + ret = -EUCLEAN; + goto out; + } + + if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { + generic_err(root, node, slot, + "bad key order, current (%llu %u %llu) next (%llu %u %llu)", + key.objectid, key.type, key.offset, + next_key.objectid, next_key.type, + next_key.offset); + ret = -EUCLEAN; + goto out; + } + } +out: + return ret; +} diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h new file mode 100644 index 000000000000..3d53e8d6fda0 --- /dev/null +++ b/fs/btrfs/tree-checker.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) Qu Wenruo 2017. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. + */ + +#ifndef __BTRFS_TREE_CHECKER__ +#define __BTRFS_TREE_CHECKER__ + +#include "ctree.h" +#include "extent_io.h" + +/* + * Comprehensive leaf checker. + * Will check not only the item pointers, but also every possible member + * in item data. + */ +int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf); + +/* + * Less strict leaf checker. + * Will only check item pointers, not reading item data. + */ +int btrfs_check_leaf_relaxed(struct btrfs_root *root, + struct extent_buffer *leaf); +int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node); + +#endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c800d067fcbf..7bf9b31561db 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -717,7 +717,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); if (ret == 0) { - ret = btrfs_inc_extent_ref(trans, fs_info, + ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, key->objectid, offset); @@ -2699,34 +2699,36 @@ static void wait_log_commit(struct btrfs_root *root, int transid) * so we know that if ours is more than 2 older than the * current transaction, we're done */ - do { + for (;;) { prepare_to_wait(&root->log_commit_wait[index], &wait, TASK_UNINTERRUPTIBLE); - mutex_unlock(&root->log_mutex); - if (root->log_transid_committed < transid && - atomic_read(&root->log_commit[index])) - schedule(); + if (!(root->log_transid_committed < transid && + atomic_read(&root->log_commit[index]))) + break; - finish_wait(&root->log_commit_wait[index], &wait); + mutex_unlock(&root->log_mutex); + schedule(); mutex_lock(&root->log_mutex); - } while (root->log_transid_committed < transid && - atomic_read(&root->log_commit[index])); + } + finish_wait(&root->log_commit_wait[index], &wait); } static void wait_for_writer(struct btrfs_root *root) { DEFINE_WAIT(wait); - while (atomic_read(&root->log_writers)) { - prepare_to_wait(&root->log_writer_wait, - &wait, TASK_UNINTERRUPTIBLE); + for (;;) { + prepare_to_wait(&root->log_writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (!atomic_read(&root->log_writers)) + break; + mutex_unlock(&root->log_mutex); - if (atomic_read(&root->log_writers)) - schedule(); - finish_wait(&root->log_writer_wait, &wait); + schedule(); mutex_lock(&root->log_mutex); } + finish_wait(&root->log_writer_wait, &wait); } static inline void btrfs_remove_log_ctx(struct btrfs_root *root, @@ -4100,7 +4102,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (ordered_io_err) { ctx->io_err = -EIO; - return 0; + return ctx->io_err; } btrfs_init_map_token(&token); @@ -4645,7 +4647,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = root->log_root; - struct extent_buffer *src = NULL; LIST_HEAD(logged_list); u64 last_extent = 0; int err = 0; @@ -4888,7 +4889,6 @@ again: goto next_slot; } - src = path->nodes[0]; if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; goto next_slot; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b39737568c22..a25684287501 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -189,6 +189,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) struct btrfs_device, dev_list); list_del(&device->dev_list); rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); } kfree(fs_devices); @@ -236,7 +237,6 @@ static struct btrfs_device *__alloc_device(void) kfree(dev); return ERR_PTR(-ENOMEM); } - bio_get(dev->flush_bio); INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); @@ -360,7 +360,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) int again = 0; unsigned long num_run; unsigned long batch_run = 0; - unsigned long limit; unsigned long last_waited = 0; int force_reg = 0; int sync_pending = 0; @@ -375,8 +374,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) blk_start_plug(&plug); bdi = device->bdev->bd_bdi; - limit = btrfs_async_submit_limit(fs_info); - limit = limit * 2 / 3; loop: spin_lock(&device->io_lock); @@ -443,13 +440,6 @@ loop_lock: pending = pending->bi_next; cur->bi_next = NULL; - /* - * atomic_dec_return implies a barrier for waitqueue_active - */ - if (atomic_dec_return(&fs_info->nr_async_bios) < limit && - waitqueue_active(&fs_info->async_submit_wait)) - wake_up(&fs_info->async_submit_wait); - BUG_ON(atomic_read(&cur->__bi_cnt) == 0); /* @@ -517,12 +507,6 @@ loop_lock: &device->work); goto done; } - /* unplug every 64 requests just for good measure */ - if (batch_run % 64 == 0) { - blk_finish_plug(&plug); - blk_start_plug(&plug); - sync_pending = 0; - } } cond_resched(); @@ -547,7 +531,7 @@ static void pending_bios_fn(struct btrfs_work *work) } -void btrfs_free_stale_device(struct btrfs_device *cur_dev) +static void btrfs_free_stale_device(struct btrfs_device *cur_dev) { struct btrfs_fs_devices *fs_devs; struct btrfs_device *dev; @@ -594,6 +578,7 @@ void btrfs_free_stale_device(struct btrfs_device *cur_dev) fs_devs->num_devices--; list_del(&dev->dev_list); rcu_string_free(dev->name); + bio_put(dev->flush_bio); kfree(dev); } break; @@ -646,6 +631,7 @@ static noinline int device_list_add(const char *path, name = rcu_string_strdup(path, GFP_NOFS); if (!name) { + bio_put(device->flush_bio); kfree(device); return -ENOMEM; } @@ -758,6 +744,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) name = rcu_string_strdup(orig_dev->name->str, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); goto error; } @@ -823,6 +810,7 @@ again: list_del_init(&device->dev_list); fs_devices->num_devices--; rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); } @@ -1068,14 +1056,15 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } -void btrfs_release_disk_super(struct page *page) +static void btrfs_release_disk_super(struct page *page) { kunmap(page); put_page(page); } -int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, - struct page **page, struct btrfs_super_block **disk_super) +static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, + struct page **page, + struct btrfs_super_block **disk_super) { void *p; pgoff_t index; @@ -1765,20 +1754,24 @@ static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, key.offset = device->devid; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = -ENOENT; + if (ret) { + if (ret > 0) + ret = -ENOENT; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) - goto out; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + } + out: btrfs_free_path(path); - btrfs_commit_transaction(trans); + if (!ret) + ret = btrfs_commit_transaction(trans); return ret; } @@ -1817,8 +1810,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, return 0; } -struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs, - struct btrfs_device *device) +static struct btrfs_device * btrfs_find_next_active_device( + struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) { struct btrfs_device *next_device; @@ -2008,7 +2001,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, fs_devices = srcdev->fs_devices; list_del_rcu(&srcdev->dev_list); - list_del_rcu(&srcdev->dev_alloc_list); + list_del(&srcdev->dev_alloc_list); fs_devices->num_devices--; if (srcdev->missing) fs_devices->missing_devices--; @@ -2031,19 +2024,20 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, } btrfs_close_bdev(srcdev); - call_rcu(&srcdev->rcu, free_device); - /* - * unless fs_devices is seed fs, num_devices shouldn't go - * zero - */ - BUG_ON(!fs_devices->num_devices && !fs_devices->seeding); - /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { struct btrfs_fs_devices *tmp_fs_devices; + /* + * On a mounted FS, num_devices can't be zero unless it's a + * seed. In case of a seed device being replaced, the replace + * target added to the sprout FS, so there will be no more + * device left under the seed FS. + */ + ASSERT(fs_devices->seeding); + tmp_fs_devices = fs_info->fs_devices; while (tmp_fs_devices) { if (tmp_fs_devices->seed == fs_devices) { @@ -2323,6 +2317,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path u64 tmp; int seeding_dev = 0; int ret = 0; + bool unlocked = false; if (sb_rdonly(sb) && !fs_info->fs_devices->seeding) return -EROFS; @@ -2362,6 +2357,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); ret = -ENOMEM; goto error; @@ -2371,6 +2367,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); ret = PTR_ERR(trans); goto error; @@ -2397,9 +2394,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; ret = btrfs_prepare_sprout(fs_info); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + goto error_trans; + } } device->fs_devices = fs_info->fs_devices; @@ -2445,14 +2445,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_unlock(&fs_info->chunk_mutex); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } } ret = btrfs_add_device(trans, fs_info, device); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } if (seeding_dev) { @@ -2461,7 +2461,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path ret = btrfs_finish_sprout(trans, fs_info); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } /* Sprouting would change fsid of the mounted root, @@ -2479,6 +2479,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); + unlocked = true; if (ret) /* transaction commit */ return ret; @@ -2491,7 +2492,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (IS_ERR(trans)) { if (PTR_ERR(trans) == -ENOENT) return 0; - return PTR_ERR(trans); + ret = PTR_ERR(trans); + trans = NULL; + goto error_sysfs; } ret = btrfs_commit_transaction(trans); } @@ -2500,14 +2503,19 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path update_dev_time(device_path); return ret; +error_sysfs: + btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); error_trans: - btrfs_end_transaction(trans); + if (seeding_dev) + sb->s_flags |= SB_RDONLY; + if (trans) + btrfs_end_transaction(trans); rcu_string_free(device->name); - btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); + bio_put(device->flush_bio); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); - if (seeding_dev) { + if (seeding_dev && !unlocked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } @@ -2570,6 +2578,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); ret = -ENOMEM; goto error; @@ -4813,16 +4822,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em_tree = &info->mapping_tree.map_tree; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); - if (!ret) { - list_add_tail(&em->list, &trans->transaction->pending_chunks); - refcount_inc(&em->refs); - } - write_unlock(&em_tree->lock); if (ret) { + write_unlock(&em_tree->lock); free_extent_map(em); goto error; } + list_add_tail(&em->list, &trans->transaction->pending_chunks); + refcount_inc(&em->refs); + write_unlock(&em_tree->lock); + ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes); if (ret) goto error_del_extent; @@ -5695,10 +5704,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS) + if (!need_full_stripe(op)) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -5711,7 +5720,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) { + if (need_full_stripe(op)) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5725,7 +5734,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= map->sub_stripes; - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) num_stripes = map->sub_stripes; else if (mirror_num) stripe_index += mirror_num - 1; @@ -5740,9 +5749,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (need_raid_map && - (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS || - mirror_num > 1)) { + if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, stripe_len * nr_data_stripes(map)); @@ -5769,9 +5776,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if ((op != BTRFS_MAP_WRITE && - op != BTRFS_MAP_GET_READ_MIRRORS) && - mirror_num <= 1) + if (!need_full_stripe(op) && mirror_num <= 1) mirror_num = 1; } } else { @@ -6033,7 +6038,7 @@ static void btrfs_end_bio(struct bio *bio) * this bio is actually up to date, we didn't * go over the max number of errors */ - bio->bi_status = 0; + bio->bi_status = BLK_STS_OK; } btrfs_end_bbio(bbio, bio); @@ -6069,13 +6074,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device, return; } - /* - * nr_async_bios allows us to reliably return congestion to the - * higher layers. Otherwise, the async bio makes it appear we have - * made progress against dirty pages when we've really just put it - * on a queue for later - */ - atomic_inc(&fs_info->nr_async_bios); WARN_ON(bio->bi_next); bio->bi_next = NULL; @@ -6144,7 +6142,10 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - bio->bi_status = BLK_STS_IOERR; + if (atomic_read(&bbio->error) > bbio->max_errors) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_status = BLK_STS_OK; btrfs_end_bbio(bbio, bio); } } @@ -6249,7 +6250,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, device = btrfs_alloc_device(NULL, &devid, dev_uuid); if (IS_ERR(device)) - return NULL; + return device; list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; @@ -6295,6 +6296,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, ret = find_next_devid(fs_info, &tmp); if (ret) { + bio_put(dev->flush_bio); kfree(dev); return ERR_PTR(ret); } @@ -6377,6 +6379,17 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, return 0; } +static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, + u64 devid, u8 *uuid, bool error) +{ + if (error) + btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", + devid, uuid); + else + btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", + devid, uuid); +} + static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) @@ -6447,18 +6460,21 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); - btrfs_report_missing_device(fs_info, devid, uuid); - return -EIO; + btrfs_report_missing_device(fs_info, devid, uuid, true); + return -ENOENT; } if (!map->stripes[i].dev) { map->stripes[i].dev = add_missing_dev(fs_info->fs_devices, devid, uuid); - if (!map->stripes[i].dev) { + if (IS_ERR(map->stripes[i].dev)) { free_extent_map(em); - return -EIO; + btrfs_err(fs_info, + "failed to init missing dev %llu: %ld", + devid, PTR_ERR(map->stripes[i].dev)); + return PTR_ERR(map->stripes[i].dev); } - btrfs_report_missing_device(fs_info, devid, uuid); + btrfs_report_missing_device(fs_info, devid, uuid, false); } map->stripes[i].dev->in_fs_metadata = 1; } @@ -6577,19 +6593,28 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { - btrfs_report_missing_device(fs_info, devid, dev_uuid); - return -EIO; + btrfs_report_missing_device(fs_info, devid, + dev_uuid, true); + return -ENOENT; } device = add_missing_dev(fs_devices, devid, dev_uuid); - if (!device) - return -ENOMEM; - btrfs_report_missing_device(fs_info, devid, dev_uuid); + if (IS_ERR(device)) { + btrfs_err(fs_info, + "failed to add missing dev %llu: %ld", + devid, PTR_ERR(device)); + return PTR_ERR(device); + } + btrfs_report_missing_device(fs_info, devid, dev_uuid, false); } else { if (!device->bdev) { - btrfs_report_missing_device(fs_info, devid, dev_uuid); - if (!btrfs_test_opt(fs_info, DEGRADED)) - return -EIO; + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_report_missing_device(fs_info, + devid, dev_uuid, true); + return -ENOENT; + } + btrfs_report_missing_device(fs_info, devid, + dev_uuid, false); } if(!device->bdev && !device->missing) { @@ -6756,12 +6781,6 @@ out_short_read: return -EIO; } -void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, - u8 *uuid) -{ - btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid); -} - /* * Check if all chunks in the fs are OK for read-write degraded mount * diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6108fdfec67f..ff15208344a7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -542,7 +542,5 @@ void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info); -void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, - u8 *uuid); #endif diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c248f9286366..2b52950dc2c6 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -37,6 +37,7 @@ struct workspace { z_stream strm; char *buf; struct list_head list; + int level; }; static void zlib_free_workspace(struct list_head *ws) @@ -96,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws, *total_out = 0; *total_in = 0; - if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) { + if (Z_OK != zlib_deflateInit(&workspace->strm, workspace->level)) { pr_warn("BTRFS: deflateInit failed\n"); ret = -EIO; goto out; @@ -402,10 +403,22 @@ next: return ret; } +static void zlib_set_level(struct list_head *ws, unsigned int type) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + unsigned level = (type & 0xF0) >> 4; + + if (level > 9) + level = 9; + + workspace->level = level > 0 ? level : 3; +} + const struct btrfs_compress_op btrfs_zlib_compress = { .alloc_workspace = zlib_alloc_workspace, .free_workspace = zlib_free_workspace, .compress_pages = zlib_compress_pages, .decompress_bio = zlib_decompress_bio, .decompress = zlib_decompress, + .set_level = zlib_set_level, }; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 607ce47b483a..17f2dd8fddb8 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -423,10 +423,15 @@ finish: return ret; } +static void zstd_set_level(struct list_head *ws, unsigned int type) +{ +} + const struct btrfs_compress_op btrfs_zstd_compress = { .alloc_workspace = zstd_alloc_workspace, .free_workspace = zstd_free_workspace, .compress_pages = zstd_compress_pages, .decompress_bio = zstd_decompress_bio, .decompress = zstd_decompress, + .set_level = zstd_set_level, }; diff --git a/fs/buffer.c b/fs/buffer.c index 170df856bdb9..0736a6a2e2f0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -253,27 +253,6 @@ out: } /* - * Kick the writeback threads then try to free up some ZONE_NORMAL memory. - */ -static void free_more_memory(void) -{ - struct zoneref *z; - int nid; - - wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); - yield(); - - for_each_online_node(nid) { - - z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), - gfp_zone(GFP_NOFS), NULL); - if (z->zone) - try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, - GFP_NOFS, NULL); - } -} - -/* * I/O completion handler for block_read_full_page() - pages * which come unlocked at the end of I/O. */ @@ -861,16 +840,19 @@ int remove_inode_buffers(struct inode *inode) * which may not fail from ordinary buffer allocations. */ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - int retry) + bool retry) { struct buffer_head *bh, *head; + gfp_t gfp = GFP_NOFS; long offset; -try_again: + if (retry) + gfp |= __GFP_NOFAIL; + head = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { - bh = alloc_buffer_head(GFP_NOFS); + bh = alloc_buffer_head(gfp); if (!bh) goto no_grow; @@ -896,23 +878,7 @@ no_grow: } while (head); } - /* - * Return failure for non-async IO requests. Async IO requests - * are not allowed to fail, so we have to wait until buffer heads - * become available. But we don't want tasks sleeping with - * partially complete buffers, so all were released above. - */ - if (!retry) - return NULL; - - /* We're _really_ low on memory. Now we just - * wait for old buffer heads to become free due to - * finishing IO. Since this is an async request and - * the reserve list is empty, we're sure there are - * async buffer heads in use. - */ - free_more_memory(); - goto try_again; + return NULL; } EXPORT_SYMBOL_GPL(alloc_page_buffers); @@ -1001,8 +967,6 @@ grow_dev_page(struct block_device *bdev, sector_t block, gfp_mask |= __GFP_NOFAIL; page = find_or_create_page(inode->i_mapping, index, gfp_mask); - if (!page) - return ret; BUG_ON(!PageLocked(page)); @@ -1021,9 +985,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, /* * Allocate some buffers for this page */ - bh = alloc_page_buffers(page, size, 0); - if (!bh) - goto failed; + bh = alloc_page_buffers(page, size, true); /* * Link the page to the buffers and initialise them. Take the @@ -1103,8 +1065,6 @@ __getblk_slow(struct block_device *bdev, sector_t block, ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; - if (ret == 0) - free_more_memory(); } } @@ -1575,7 +1535,7 @@ void create_empty_buffers(struct page *page, { struct buffer_head *bh, *head, *tail; - head = alloc_page_buffers(page, blocksize, 1); + head = alloc_page_buffers(page, blocksize, true); bh = head; do { bh->b_state |= b_state; @@ -1632,7 +1592,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) struct buffer_head *head; end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { count = pagevec_count(&pvec); for (i = 0; i < count; i++) { @@ -1692,7 +1652,8 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode * BUG_ON(!PageLocked(page)); if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state); + create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits), + b_state); return page_buffers(page); } @@ -1978,8 +1939,8 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, case IOMAP_MAPPED: if (offset >= i_size_read(inode)) set_buffer_new(bh); - bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) + - ((offset - iomap->offset) >> inode->i_blkbits); + bh->b_blocknr = (iomap->addr + offset - iomap->offset) >> + inode->i_blkbits; set_buffer_mapped(bh); break; } @@ -2638,7 +2599,7 @@ int nobh_write_begin(struct address_space *mapping, * Be careful: the buffer linked list is a NULL terminated one, rather * than the circular one we're used to. */ - head = alloc_page_buffers(page, blocksize, 0); + head = alloc_page_buffers(page, blocksize, false); if (!head) { ret = -ENOMEM; goto out_release; @@ -3055,8 +3016,16 @@ void guard_bio_eod(int op, struct bio *bio) sector_t maxsector; struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned truncated_bytes; + struct hd_struct *part; + + rcu_read_lock(); + part = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (part) + maxsector = part_nr_sects_read(part); + else + maxsector = get_capacity(bio->bi_disk); + rcu_read_unlock(); - maxsector = get_capacity(bio->bi_disk); if (!maxsector) return; @@ -3545,7 +3514,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, if (length <= 0) return -ENOENT; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); do { unsigned nr_pages, i; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 18d7aa61ef0f..883bc7bb12c5 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -256,8 +256,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, goto backing_page_already_present; if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp | - __GFP_COLD); + newpage = __page_cache_alloc(cachefiles_gfp); if (!newpage) goto nomem_monitor; } @@ -493,8 +492,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, goto backing_page_already_present; if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp | - __GFP_COLD); + newpage = __page_cache_alloc(cachefiles_gfp); if (!newpage) goto nomem; } @@ -710,7 +708,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, /* calculate the shift required to use bmap */ shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; - pagevec_init(&pagevec, 0); + pagevec_init(&pagevec); op->op.flags &= FSCACHE_OP_KEEP_FLAGS; op->op.flags |= FSCACHE_OP_ASYNC; @@ -844,7 +842,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval *op, ret = cachefiles_has_space(cache, 0, *nr_pages); if (ret == 0) { - pagevec_init(&pagevec, 0); + pagevec_init(&pagevec); list_for_each_entry(page, pages, lru) { if (pagevec_add(&pagevec, page) == 0) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4d622654bfbc..dbf07051aacd 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -680,7 +680,7 @@ static void ceph_release_pages(struct page **pages, int num) struct pagevec pvec; int i; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); for (i = 0; i < num; i++) { if (pagevec_add(&pvec, pages[i]) == 0) pagevec_release(&pvec); @@ -811,7 +811,7 @@ static int ceph_writepages_start(struct address_space *mapping, if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); start_index = wbc->range_cyclic ? mapping->writeback_index : 0; index = start_index; @@ -870,15 +870,10 @@ retry: max_pages = wsize >> PAGE_SHIFT; get_more_pages: - pvec_pages = min_t(unsigned, PAGEVEC_SIZE, - max_pages - locked_pages); - if (end - index < (u64)(pvec_pages - 1)) - pvec_pages = (unsigned)(end - index) + 1; - - pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - pvec_pages); - dout("pagevec_lookup_tag got %d\n", pvec_pages); + pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_DIRTY, + max_pages - locked_pages); + dout("pagevec_lookup_range_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { @@ -896,16 +891,6 @@ get_more_pages: unlock_page(page); continue; } - if (page->index > end) { - dout("end of range %p\n", page); - /* can't be range_cyclic (1st pass) because - * end == -1 in that case. */ - stop = true; - if (ceph_wbc.head_snapc) - done = true; - unlock_page(page); - break; - } if (strip_unit_end && (page->index > strip_unit_end)) { dout("end of strip unit %p\n", page); unlock_page(page); @@ -1177,8 +1162,7 @@ release_pvec_pages: index = 0; while ((index <= end) && (nr = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - PAGEVEC_SIZE))) { + PAGECACHE_TAG_WRITEBACK))) { for (i = 0; i < nr; i++) { page = pvec.pages[i]; if (page_snap_context(page) != snapc) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ff5d32cf9578..a14b2c974c9e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1160,7 +1160,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->vfs_inode; struct cap_msg_args arg; - int held, revoking, dropping; + int held, revoking; int wake = 0; int delayed = 0; int ret; @@ -1168,7 +1168,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; retain &= ~revoking; - dropping = cap->issued & ~retain; dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", inode, cap, cap->session, @@ -1712,7 +1711,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, /* if we are unmounting, flush any unused caps immediately. */ if (mdsc->stopping) - is_delayed = 1; + is_delayed = true; spin_lock(&ci->i_ceph_lock); @@ -3189,8 +3188,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, int dirty = le32_to_cpu(m->dirty); int cleaned = 0; bool drop = false; - bool wake_ci = 0; - bool wake_mdsc = 0; + bool wake_ci = false; + bool wake_mdsc = false; list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { if (cf->tid == flush_tid) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f2550a076edc..ab81652198c4 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -493,6 +493,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_wb_ref = 0; ci->i_wrbuffer_ref = 0; ci->i_wrbuffer_ref_head = 0; + atomic_set(&ci->i_filelock_ref, 0); ci->i_shared_gen = 0; ci->i_rdcache_gen = 0; ci->i_rdcache_revoking = 0; @@ -786,7 +787,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page, /* update inode */ ci->i_version = le64_to_cpu(info->version); - inode->i_version++; inode->i_rdev = le32_to_cpu(info->rdev); inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; @@ -1185,6 +1185,7 @@ retry_lookup: ceph_snap(d_inode(dn)) != tvino.snap)) { dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); + ceph_dir_clear_ordered(dir); d_delete(dn); dput(dn); goto retry_lookup; @@ -1322,6 +1323,7 @@ retry_lookup: dout(" %p links to %p %llx.%llx, not %llx.%llx\n", dn, d_inode(dn), ceph_vinop(d_inode(dn)), ceph_vinop(in)); + ceph_dir_clear_ordered(dir); d_invalidate(dn); have_lease = false; } @@ -1573,6 +1575,7 @@ retry_lookup: ceph_snap(d_inode(dn)) != tvino.snap)) { dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); + __ceph_dir_clear_ordered(ci); d_delete(dn); dput(dn); goto retry_lookup; @@ -1597,7 +1600,9 @@ retry_lookup: &req->r_caps_reservation); if (ret < 0) { pr_err("fill_inode badness on %p\n", in); - if (d_really_is_negative(dn)) + if (d_really_is_positive(dn)) + __ceph_dir_clear_ordered(ci); + else iput(in); d_drop(dn); err = ret; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index e7cce412f2cf..9e66f69ee8a5 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -30,19 +30,52 @@ void __init ceph_flock_init(void) get_random_bytes(&lock_secret, sizeof(lock_secret)); } +static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) +{ + struct inode *inode = file_inode(src->fl_file); + atomic_inc(&ceph_inode(inode)->i_filelock_ref); +} + +static void ceph_fl_release_lock(struct file_lock *fl) +{ + struct inode *inode = file_inode(fl->fl_file); + struct ceph_inode_info *ci = ceph_inode(inode); + if (atomic_dec_and_test(&ci->i_filelock_ref)) { + /* clear error when all locks are released */ + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK; + spin_unlock(&ci->i_ceph_lock); + } +} + +static const struct file_lock_operations ceph_fl_lock_ops = { + .fl_copy_lock = ceph_fl_copy_lock, + .fl_release_private = ceph_fl_release_lock, +}; + /** * Implement fcntl and flock locking functions. */ -static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, +static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, int cmd, u8 wait, struct file_lock *fl) { - struct inode *inode = file_inode(file); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; u64 length = 0; u64 owner; + if (operation == CEPH_MDS_OP_SETFILELOCK) { + /* + * increasing i_filelock_ref closes race window between + * handling request reply and adding file_lock struct to + * inode. Otherwise, auth caps may get trimmed in the + * window. Caller function will decrease the counter. + */ + fl->fl_ops = &ceph_fl_lock_ops; + atomic_inc(&ceph_inode(inode)->i_filelock_ref); + } + if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) wait = 0; @@ -180,10 +213,12 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, */ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) { - u8 lock_cmd; - int err; - u8 wait = 0; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + int err = 0; u16 op = CEPH_MDS_OP_SETFILELOCK; + u8 wait = 0; + u8 lock_cmd; if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; @@ -199,6 +234,26 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) else if (IS_SETLKW(cmd)) wait = 1; + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { + err = -EIO; + } else if (op == CEPH_MDS_OP_SETFILELOCK) { + /* + * increasing i_filelock_ref closes race window between + * handling request reply and adding file_lock struct to + * inode. Otherwise, i_auth_cap may get trimmed in the + * window. Caller function will decrease the counter. + */ + fl->fl_ops = &ceph_fl_lock_ops; + atomic_inc(&ci->i_filelock_ref); + } + spin_unlock(&ci->i_ceph_lock); + if (err < 0) { + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) + posix_lock_file(file, fl, NULL); + return err; + } + if (F_RDLCK == fl->fl_type) lock_cmd = CEPH_LOCK_SHARED; else if (F_WRLCK == fl->fl_type) @@ -206,16 +261,16 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) else lock_cmd = CEPH_LOCK_UNLOCK; - err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); if (!err) { - if (op != CEPH_MDS_OP_GETFILELOCK) { + if (op == CEPH_MDS_OP_SETFILELOCK) { dout("mds locked, locking locally"); err = posix_lock_file(file, fl, NULL); - if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { + if (err) { /* undo! This should only happen if * the kernel detects local * deadlock. */ - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on posix_lock_file, undid lock", err); @@ -227,9 +282,11 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) int ceph_flock(struct file *file, int cmd, struct file_lock *fl) { - u8 lock_cmd; - int err; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + int err = 0; u8 wait = 0; + u8 lock_cmd; if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; @@ -239,6 +296,21 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) dout("ceph_flock, fl_file: %p", fl->fl_file); + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { + err = -EIO; + } else { + /* see comment in ceph_lock */ + fl->fl_ops = &ceph_fl_lock_ops; + atomic_inc(&ci->i_filelock_ref); + } + spin_unlock(&ci->i_ceph_lock); + if (err < 0) { + if (F_UNLCK == fl->fl_type) + locks_lock_file_wait(file, fl); + return err; + } + if (IS_SETLKW(cmd)) wait = 1; @@ -250,13 +322,13 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) lock_cmd = CEPH_LOCK_UNLOCK; err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, lock_cmd, wait, fl); + inode, lock_cmd, wait, fl); if (!err) { err = locks_lock_file_wait(file, fl); if (err) { ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, CEPH_LOCK_UNLOCK, 0, fl); + inode, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on locks_lock_file_wait, undid lock", err); } } @@ -288,6 +360,37 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) *flock_count, *fcntl_count); } +/* + * Given a pointer to a lock, convert it to a ceph filelock + */ +static int lock_to_ceph_filelock(struct file_lock *lock, + struct ceph_filelock *cephlock) +{ + int err = 0; + cephlock->start = cpu_to_le64(lock->fl_start); + cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); + cephlock->client = cpu_to_le64(0); + cephlock->pid = cpu_to_le64((u64)lock->fl_pid); + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); + + switch (lock->fl_type) { + case F_RDLCK: + cephlock->type = CEPH_LOCK_SHARED; + break; + case F_WRLCK: + cephlock->type = CEPH_LOCK_EXCL; + break; + case F_UNLCK: + cephlock->type = CEPH_LOCK_UNLOCK; + break; + default: + dout("Have unknown lock type %d", lock->fl_type); + err = -EINVAL; + } + + return err; +} + /** * Encode the flock and fcntl locks for the given inode into the ceph_filelock * array. Must be called with inode->i_lock already held. @@ -356,50 +459,22 @@ int ceph_locks_to_pagelist(struct ceph_filelock *flocks, if (err) goto out_fail; - err = ceph_pagelist_append(pagelist, flocks, - num_fcntl_locks * sizeof(*flocks)); - if (err) - goto out_fail; + if (num_fcntl_locks > 0) { + err = ceph_pagelist_append(pagelist, flocks, + num_fcntl_locks * sizeof(*flocks)); + if (err) + goto out_fail; + } nlocks = cpu_to_le32(num_flock_locks); err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); if (err) goto out_fail; - err = ceph_pagelist_append(pagelist, - &flocks[num_fcntl_locks], - num_flock_locks * sizeof(*flocks)); -out_fail: - return err; -} - -/* - * Given a pointer to a lock, convert it to a ceph filelock - */ -int lock_to_ceph_filelock(struct file_lock *lock, - struct ceph_filelock *cephlock) -{ - int err = 0; - cephlock->start = cpu_to_le64(lock->fl_start); - cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); - cephlock->client = cpu_to_le64(0); - cephlock->pid = cpu_to_le64((u64)lock->fl_pid); - cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); - - switch (lock->fl_type) { - case F_RDLCK: - cephlock->type = CEPH_LOCK_SHARED; - break; - case F_WRLCK: - cephlock->type = CEPH_LOCK_EXCL; - break; - case F_UNLCK: - cephlock->type = CEPH_LOCK_UNLOCK; - break; - default: - dout("Have unknown lock type %d", lock->fl_type); - err = -EINVAL; + if (num_flock_locks > 0) { + err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks], + num_flock_locks * sizeof(*flocks)); } - +out_fail: return err; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0687ab3c3267..1b468250e947 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1039,22 +1039,23 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, * session caps */ -/* caller holds s_cap_lock, we drop it */ -static void cleanup_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) - __releases(session->s_cap_lock) +static void detach_cap_releases(struct ceph_mds_session *session, + struct list_head *target) { - LIST_HEAD(tmp_list); - list_splice_init(&session->s_cap_releases, &tmp_list); + lockdep_assert_held(&session->s_cap_lock); + + list_splice_init(&session->s_cap_releases, target); session->s_num_cap_releases = 0; - spin_unlock(&session->s_cap_lock); + dout("dispose_cap_releases mds%d\n", session->s_mds); +} - dout("cleanup_cap_releases mds%d\n", session->s_mds); - while (!list_empty(&tmp_list)) { +static void dispose_cap_releases(struct ceph_mds_client *mdsc, + struct list_head *dispose) +{ + while (!list_empty(dispose)) { struct ceph_cap *cap; /* zero out the in-progress message */ - cap = list_first_entry(&tmp_list, - struct ceph_cap, session_caps); + cap = list_first_entry(dispose, struct ceph_cap, session_caps); list_del(&cap->session_caps); ceph_put_cap(mdsc, cap); } @@ -1215,6 +1216,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, } spin_unlock(&mdsc->cap_dirty_lock); + if (atomic_read(&ci->i_filelock_ref) > 0) { + /* make further file lock syscall return -EIO */ + ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; + pr_warn_ratelimited(" dropping file locks for %p %lld\n", + inode, ceph_ino(inode)); + } + if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; @@ -1244,6 +1252,8 @@ static void remove_session_caps(struct ceph_mds_session *session) { struct ceph_fs_client *fsc = session->s_mdsc->fsc; struct super_block *sb = fsc->sb; + LIST_HEAD(dispose); + dout("remove_session_caps on %p\n", session); iterate_session_caps(session, remove_session_caps_cb, fsc); @@ -1278,10 +1288,12 @@ static void remove_session_caps(struct ceph_mds_session *session) } // drop cap expires and unlock s_cap_lock - cleanup_cap_releases(session->s_mdsc, session); + detach_cap_releases(session, &dispose); BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); + spin_unlock(&session->s_cap_lock); + dispose_cap_releases(session->s_mdsc, &dispose); } /* @@ -1428,6 +1440,29 @@ static int __close_session(struct ceph_mds_client *mdsc, return request_close_session(mdsc, session); } +static bool drop_negative_children(struct dentry *dentry) +{ + struct dentry *child; + bool all_negative = true; + + if (!d_is_dir(dentry)) + goto out; + + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_child) { + if (d_really_is_positive(child)) { + all_negative = false; + break; + } + } + spin_unlock(&dentry->d_lock); + + if (all_negative) + shrink_dcache_parent(dentry); +out: + return all_negative; +} + /* * Trim old(er) caps. * @@ -1462,6 +1497,11 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) goto out; if ((used | wanted) & CEPH_CAP_ANY_WR) goto out; + /* Note: it's possible that i_filelock_ref becomes non-zero + * after dropping auth caps. It doesn't hurt because reply + * of lock mds request will re-add auth caps. */ + if (atomic_read(&ci->i_filelock_ref) > 0) + goto out; } /* The inode has cached pages, but it's no longer used. * we can safely drop it */ @@ -1473,16 +1513,27 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ - session->s_trim_caps--; if (oissued) { /* we aren't the only cap.. just remove us */ __ceph_remove_cap(cap, true); + session->s_trim_caps--; } else { + struct dentry *dentry; /* try dropping referring dentries */ spin_unlock(&ci->i_ceph_lock); - d_prune_aliases(inode); - dout("trim_caps_cb %p cap %p pruned, count now %d\n", - inode, cap, atomic_read(&inode->i_count)); + dentry = d_find_any_alias(inode); + if (dentry && drop_negative_children(dentry)) { + int count; + dput(dentry); + d_prune_aliases(inode); + count = atomic_read(&inode->i_count); + if (count == 1) + session->s_trim_caps--; + dout("trim_caps_cb %p cap %p pruned, count now %d\n", + inode, cap, count); + } else { + dput(dentry); + } return 0; } @@ -2827,7 +2878,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_mds_cap_reconnect v2; struct ceph_mds_cap_reconnect_v1 v1; } rec; - struct ceph_inode_info *ci; + struct ceph_inode_info *ci = cap->ci; struct ceph_reconnect_state *recon_state = arg; struct ceph_pagelist *pagelist = recon_state->pagelist; char *path; @@ -2836,8 +2887,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, u64 snap_follows; struct dentry *dentry; - ci = cap->ci; - dout(" adding %p ino %llx.%llx cap %p %lld %s\n", inode, ceph_vinop(inode), cap, cap->cap_id, ceph_cap_string(cap->issued)); @@ -2870,7 +2919,8 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v2.pathbase = cpu_to_le64(pathbase); - rec.v2.flock_len = 0; + rec.v2.flock_len = (__force __le32) + ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); @@ -2894,26 +2944,37 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->msg_version >= 2) { int num_fcntl_locks, num_flock_locks; - struct ceph_filelock *flocks; + struct ceph_filelock *flocks = NULL; size_t struct_len, total_len = 0; u8 struct_v = 0; encode_again: - ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); - flocks = kmalloc((num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock), GFP_NOFS); - if (!flocks) { - err = -ENOMEM; - goto out_free; + if (rec.v2.flock_len) { + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + } else { + num_fcntl_locks = 0; + num_flock_locks = 0; } - err = ceph_encode_locks_to_buffer(inode, flocks, - num_fcntl_locks, - num_flock_locks); - if (err) { + if (num_fcntl_locks + num_flock_locks > 0) { + flocks = kmalloc((num_fcntl_locks + num_flock_locks) * + sizeof(struct ceph_filelock), GFP_NOFS); + if (!flocks) { + err = -ENOMEM; + goto out_free; + } + err = ceph_encode_locks_to_buffer(inode, flocks, + num_fcntl_locks, + num_flock_locks); + if (err) { + kfree(flocks); + flocks = NULL; + if (err == -ENOSPC) + goto encode_again; + goto out_free; + } + } else { kfree(flocks); - if (err == -ENOSPC) - goto encode_again; - goto out_free; + flocks = NULL; } if (recon_state->msg_version >= 3) { @@ -2993,6 +3054,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int s_nr_caps; struct ceph_pagelist *pagelist; struct ceph_reconnect_state recon_state; + LIST_HEAD(dispose); pr_info("mds%d reconnect start\n", mds); @@ -3026,7 +3088,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, */ session->s_cap_reconnect = 1; /* drop old cap expires; we're about to reestablish that state */ - cleanup_cap_releases(mdsc, session); + detach_cap_releases(session, &dispose); + spin_unlock(&session->s_cap_lock); + dispose_cap_releases(mdsc, &dispose); /* trim unused caps to reduce MDS's cache rejoin time */ if (mdsc->fsc->sb->s_root) @@ -3857,14 +3921,14 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) goto err_out; } return; + bad: pr_err("error decoding fsmap\n"); err_out: mutex_lock(&mdsc->mutex); - mdsc->mdsmap_err = -ENOENT; + mdsc->mdsmap_err = err; __wake_requests(mdsc, &mdsc->waiting_for_map); mutex_unlock(&mdsc->mutex); - return; } /* diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e4082afedcb1..a62d2a9841dc 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -84,8 +84,9 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_ffree = -1; buf->f_namelen = NAME_MAX; - /* leave fsid little-endian, regardless of host endianness */ - fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); + /* Must convert the fsid, for consistent values across arches */ + fsid = le64_to_cpu(*(__le64 *)(&monmap->fsid)) ^ + le64_to_cpu(*((__le64 *)&monmap->fsid + 1)); buf->f_fsid.val[0] = fsid & 0xffffffff; buf->f_fsid.val[1] = fsid >> 32; @@ -330,11 +331,11 @@ static int parse_fsopt_token(char *c, void *private) break; #ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: - fsopt->sb_flags |= MS_POSIXACL; + fsopt->sb_flags |= SB_POSIXACL; break; #endif case Opt_noacl: - fsopt->sb_flags &= ~MS_POSIXACL; + fsopt->sb_flags &= ~SB_POSIXACL; break; default: BUG_ON(token); @@ -519,7 +520,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",nopoolperm"); #ifdef CONFIG_CEPH_FS_POSIX_ACL - if (fsopt->sb_flags & MS_POSIXACL) + if (fsopt->sb_flags & SB_POSIXACL) seq_puts(m, ",acl"); else seq_puts(m, ",noacl"); @@ -987,7 +988,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, dout("ceph_mount\n"); #ifdef CONFIG_CEPH_FS_POSIX_ACL - flags |= MS_POSIXACL; + flags |= SB_POSIXACL; #endif err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); if (err < 0) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3e27a28aa44a..2beeec07fa76 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -352,6 +352,7 @@ struct ceph_inode_info { int i_pin_ref; int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; + atomic_t i_filelock_ref; u32 i_shared_gen; /* increment each time we get FILE_SHARED */ u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ @@ -487,6 +488,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ #define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ #define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ +#define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ + /* * We set the ERROR_WRITE bit when we start seeing write errors on an inode @@ -1011,7 +1014,6 @@ extern int ceph_encode_locks_to_buffer(struct inode *inode, extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, struct ceph_pagelist *pagelist, int num_fcntl_locks, int num_flock_locks); -extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); /* debugfs.c */ extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index cbd216b57239..350fa55a1bf7 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -42,7 +42,7 @@ #define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ #define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ #define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ -#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ +#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of SB_POSIXACL in mnt_cifs_flags */ #define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ #define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ #define CIFS_MOUNT_MAP_SFM_CHR 0x800000 /* SFM/MAC mapping for illegal chars */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8c8b75d33f31..31b7565b1617 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -125,7 +125,7 @@ cifs_read_super(struct super_block *sb) tcon = cifs_sb_master_tcon(cifs_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files) sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -497,7 +497,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",cifsacl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) seq_puts(s, ",dynperm"); - if (root->d_sb->s_flags & MS_POSIXACL) + if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(s, ",acl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) seq_puts(s, ",mfsymlinks"); @@ -573,7 +573,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) static int cifs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; return 0; } @@ -708,7 +708,7 @@ cifs_do_mount(struct file_system_type *fs_type, rc = cifs_mount(cifs_sb, volume_info); if (rc) { - if (!(flags & MS_SILENT)) + if (!(flags & SB_SILENT)) cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n", rc); root = ERR_PTR(rc); @@ -720,7 +720,7 @@ cifs_do_mount(struct file_system_type *fs_type, mnt_data.flags = flags; /* BB should we make this contingent on mount parm? */ - flags |= MS_NODIRATIME | MS_NOATIME; + flags |= SB_NODIRATIME | SB_NOATIME; sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data); if (IS_ERR(sb)) { @@ -739,7 +739,7 @@ cifs_do_mount(struct file_system_type *fs_type, goto out_super; } - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; } root = cifs_get_root(volume_info, sb); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index e185b2853eab..b16583594d1a 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -559,8 +559,8 @@ struct smb_vol { CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \ CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID) -#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ - MS_NODEV | MS_SYNCHRONOUS) +#define CIFS_MS_MASK (SB_RDONLY | SB_MANDLOCK | SB_NOEXEC | SB_NOSUID | \ + SB_NODEV | SB_SYNCHRONOUS) struct cifs_mnt_data { struct cifs_sb_info *cifs_sb; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 92fdf9c35de2..df9f682708c6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1963,8 +1963,6 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, pgoff_t end, pgoff_t *index, unsigned int *found_pages) { - unsigned int nr_pages; - struct page **pages; struct cifs_writedata *wdata; wdata = cifs_writedata_alloc((unsigned int)tofind, @@ -1972,23 +1970,8 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, if (!wdata) return NULL; - /* - * find_get_pages_tag seems to return a max of 256 on each - * iteration, so we must call it several times in order to - * fill the array or the wsize is effectively limited to - * 256 * PAGE_SIZE. - */ - *found_pages = 0; - pages = wdata->pages; - do { - nr_pages = find_get_pages_tag(mapping, index, - PAGECACHE_TAG_DIRTY, tofind, - pages); - *found_pages += nr_pages; - tofind -= nr_pages; - pages += nr_pages; - } while (nr_pages && tofind && *index <= end); - + *found_pages = find_get_pages_range_tag(mapping, index, end, + PAGECACHE_TAG_DIRTY, tofind, wdata->pages); return wdata; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 7c732cb44164..ecb99079363a 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -985,7 +985,7 @@ retry_iget5_locked: } cifs_fattr_to_inode(inode, fattr); - if (sb->s_flags & MS_NOATIME) + if (sb->s_flags & SB_NOATIME) inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index bdb963d0ba32..ed88ab8a4774 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1406,7 +1406,8 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, } while (rc == -EAGAIN); if (rc) { - cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc); + if (rc != -ENOENT) + cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc); goto out; } @@ -2087,22 +2088,6 @@ init_sg(struct smb_rqst *rqst, u8 *sign) return sg; } -struct cifs_crypt_result { - int err; - struct completion completion; -}; - -static void cifs_crypt_complete(struct crypto_async_request *req, int err) -{ - struct cifs_crypt_result *res = req->data; - - if (err == -EINPROGRESS) - return; - - res->err = err; - complete(&res->completion); -} - static int smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) { @@ -2143,12 +2128,10 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) struct aead_request *req; char *iv; unsigned int iv_len; - struct cifs_crypt_result result = {0, }; + DECLARE_CRYPTO_WAIT(wait); struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); - init_completion(&result.completion); - rc = smb2_get_enc_key(server, tr_hdr->SessionId, enc, key); if (rc) { cifs_dbg(VFS, "%s: Could not get %scryption key\n", __func__, @@ -2208,14 +2191,10 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) aead_request_set_ad(req, assoc_data_len); aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - cifs_crypt_complete, &result); + crypto_req_done, &wait); - rc = enc ? crypto_aead_encrypt(req) : crypto_aead_decrypt(req); - - if (rc == -EINPROGRESS || rc == -EBUSY) { - wait_for_completion(&result.completion); - rc = result.err; - } + rc = crypto_wait_req(enc ? crypto_aead_encrypt(req) + : crypto_aead_decrypt(req), &wait); if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5331631386a2..01346b8b6edb 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2678,27 +2678,27 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_small_buf_release(req); rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; - shdr = get_sync_hdr(rsp); - if (shdr->Status == STATUS_END_OF_FILE) { + if (rc) { + if (rc != -ENODATA) { + cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); + cifs_dbg(VFS, "Send error in read = %d\n", rc); + } free_rsp_buf(resp_buftype, rsp_iov.iov_base); - return 0; + return rc == -ENODATA ? 0 : rc; } - if (rc) { - cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); - cifs_dbg(VFS, "Send error in read = %d\n", rc); - } else { - *nbytes = le32_to_cpu(rsp->DataLength); - if ((*nbytes > CIFS_MAX_MSGSIZE) || - (*nbytes > io_parms->length)) { - cifs_dbg(FYI, "bad length %d for count %d\n", - *nbytes, io_parms->length); - rc = -EIO; - *nbytes = 0; - } + *nbytes = le32_to_cpu(rsp->DataLength); + if ((*nbytes > CIFS_MAX_MSGSIZE) || + (*nbytes > io_parms->length)) { + cifs_dbg(FYI, "bad length %d for count %d\n", + *nbytes, io_parms->length); + rc = -EIO; + *nbytes = 0; } + shdr = get_sync_hdr(rsp); + if (*buf) { memcpy(*buf, (char *)shdr + rsp->DataOffset, *nbytes); free_rsp_buf(resp_buftype, rsp_iov.iov_base); diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 52f975d848a0..316af84674f1 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -117,7 +117,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, #ifdef CONFIG_CIFS_POSIX if (!value) goto out; - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, value, (const int)size, ACL_TYPE_ACCESS, cifs_sb->local_nls, @@ -129,7 +129,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, #ifdef CONFIG_CIFS_POSIX if (!value) goto out; - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, value, (const int)size, ACL_TYPE_DEFAULT, cifs_sb->local_nls, @@ -266,7 +266,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, case XATTR_ACL_ACCESS: #ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, value, size, ACL_TYPE_ACCESS, cifs_sb->local_nls, @@ -276,7 +276,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, case XATTR_ACL_DEFAULT: #ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, value, size, ACL_TYPE_DEFAULT, cifs_sb->local_nls, diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 6f0a6a4d5faa..97424cf206c0 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -96,7 +96,7 @@ void coda_destroy_inodecache(void) static int coda_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; return 0; } @@ -188,7 +188,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) mutex_unlock(&vc->vc_mutex); sb->s_fs_info = vc; - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; sb->s_blocksize = 4096; /* XXXXX what do we put here?? */ sb->s_blocksize_bits = 12; sb->s_magic = CODA_SUPER_MAGIC; diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index a37f003530d7..1175a1722411 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -447,8 +447,7 @@ int venus_fsync(struct super_block *sb, struct CodaFid *fid) UPARG(CODA_FSYNC); inp->coda_fsync.VFid = *fid; - error = coda_upcall(coda_vcp(sb), sizeof(union inputArgs), - &outsize, inp); + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); CODA_FREE(inp, insize); return error; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index bd5d91e119ca..5fc5dc660600 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -54,8 +54,6 @@ #include <linux/if_tun.h> #include <linux/ctype.h> #include <linux/syscalls.h> -#include <linux/i2c.h> -#include <linux/i2c-dev.h> #include <linux/atalk.h> #include <linux/gfp.h> #include <linux/cec.h> @@ -137,22 +135,6 @@ static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return vfs_ioctl(file, cmd, arg); } -static int w_long(struct file *file, - unsigned int cmd, compat_ulong_t __user *argp) -{ - int err; - unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp)); - - if (valp == NULL) - return -EFAULT; - err = do_ioctl(file, cmd, (unsigned long)valp); - if (err) - return err; - if (convert_in_user(valp, argp)) - return -EFAULT; - return 0; -} - struct compat_video_event { int32_t type; compat_time_t timestamp; @@ -671,96 +653,6 @@ static int serial_struct_ioctl(struct file *file, return err; } -/* - * I2C layer ioctls - */ - -struct i2c_msg32 { - u16 addr; - u16 flags; - u16 len; - compat_caddr_t buf; -}; - -struct i2c_rdwr_ioctl_data32 { - compat_caddr_t msgs; /* struct i2c_msg __user *msgs */ - u32 nmsgs; -}; - -struct i2c_smbus_ioctl_data32 { - u8 read_write; - u8 command; - u32 size; - compat_caddr_t data; /* union i2c_smbus_data *data */ -}; - -struct i2c_rdwr_aligned { - struct i2c_rdwr_ioctl_data cmd; - struct i2c_msg msgs[0]; -}; - -static int do_i2c_rdwr_ioctl(struct file *file, - unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata) -{ - struct i2c_rdwr_aligned __user *tdata; - struct i2c_msg __user *tmsgs; - struct i2c_msg32 __user *umsgs; - compat_caddr_t datap; - u32 nmsgs; - int i; - - if (get_user(nmsgs, &udata->nmsgs)) - return -EFAULT; - if (nmsgs > I2C_RDWR_IOCTL_MAX_MSGS) - return -EINVAL; - - if (get_user(datap, &udata->msgs)) - return -EFAULT; - umsgs = compat_ptr(datap); - - tdata = compat_alloc_user_space(sizeof(*tdata) + - nmsgs * sizeof(struct i2c_msg)); - tmsgs = &tdata->msgs[0]; - - if (put_user(nmsgs, &tdata->cmd.nmsgs) || - put_user(tmsgs, &tdata->cmd.msgs)) - return -EFAULT; - - for (i = 0; i < nmsgs; i++) { - if (copy_in_user(&tmsgs[i].addr, &umsgs[i].addr, 3*sizeof(u16))) - return -EFAULT; - if (get_user(datap, &umsgs[i].buf) || - put_user(compat_ptr(datap), &tmsgs[i].buf)) - return -EFAULT; - } - return do_ioctl(file, cmd, (unsigned long)tdata); -} - -static int do_i2c_smbus_ioctl(struct file *file, - unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata) -{ - struct i2c_smbus_ioctl_data __user *tdata; - union { - /* beginnings of those have identical layouts */ - struct i2c_smbus_ioctl_data32 data32; - struct i2c_smbus_ioctl_data data; - } v; - - tdata = compat_alloc_user_space(sizeof(*tdata)); - if (tdata == NULL) - return -ENOMEM; - - memset(&v, 0, sizeof(v)); - if (copy_from_user(&v.data32, udata, sizeof(v.data32))) - return -EFAULT; - v.data.data = compat_ptr(v.data32.data); - - if (copy_to_user(tdata, &v.data, sizeof(v.data))) - return -EFAULT; - - return do_ioctl(file, cmd, (unsigned long)tdata); -} - #define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) #define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) @@ -1283,13 +1175,6 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) -/* i2c */ -COMPATIBLE_IOCTL(I2C_SLAVE) -COMPATIBLE_IOCTL(I2C_SLAVE_FORCE) -COMPATIBLE_IOCTL(I2C_TENBIT) -COMPATIBLE_IOCTL(I2C_PEC) -COMPATIBLE_IOCTL(I2C_RETRIES) -COMPATIBLE_IOCTL(I2C_TIMEOUT) /* hiddev */ COMPATIBLE_IOCTL(HIDIOCGVERSION) COMPATIBLE_IOCTL(HIDIOCAPPLICATION) @@ -1464,13 +1349,6 @@ static long do_ioctl_trans(unsigned int cmd, case TIOCGSERIAL: case TIOCSSERIAL: return serial_struct_ioctl(file, cmd, argp); - /* i2c */ - case I2C_FUNCS: - return w_long(file, cmd, argp); - case I2C_RDWR: - return do_i2c_rdwr_ioctl(file, cmd, argp); - case I2C_SMBUS: - return do_i2c_smbus_ioctl(file, cmd, argp); /* Not implemented in the native kernel */ case RTC_IRQP_READ32: case RTC_IRQP_SET32: @@ -1580,6 +1458,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, case FICLONE: case FICLONERANGE: case FIDEDUPERANGE: + case FS_IOC_FIEMAP: goto do_ioctl; case FIBMAP: diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 56fb26127fef..577cff24707b 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -584,7 +584,7 @@ static void detach_attrs(struct config_item * item) static int populate_attrs(struct config_item *item) { - struct config_item_type *t = item->ci_type; + const struct config_item_type *t = item->ci_type; struct configfs_attribute *attr; struct configfs_bin_attribute *bin_attr; int error = 0; @@ -901,7 +901,7 @@ static void configfs_detach_group(struct config_item *item) static void client_disconnect_notify(struct config_item *parent_item, struct config_item *item) { - struct config_item_type *type; + const struct config_item_type *type; type = parent_item->ci_type; BUG_ON(!type); @@ -920,7 +920,7 @@ static void client_disconnect_notify(struct config_item *parent_item, static void client_drop_item(struct config_item *parent_item, struct config_item *item) { - struct config_item_type *type; + const struct config_item_type *type; type = parent_item->ci_type; BUG_ON(!type); @@ -1260,7 +1260,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct config_item *parent_item; struct configfs_subsystem *subsys; struct configfs_dirent *sd; - struct config_item_type *type; + const struct config_item_type *type; struct module *subsys_owner = NULL, *new_item_owner = NULL; char *name; @@ -1810,7 +1810,7 @@ EXPORT_SYMBOL(configfs_unregister_group); struct config_group * configfs_register_default_group(struct config_group *parent_group, const char *name, - struct config_item_type *item_type) + const struct config_item_type *item_type) { int ret; struct config_group *group; diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 39da1103d341..62580dba3552 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -166,7 +166,7 @@ configfs_read_bin_file(struct file *file, char __user *buf, retval = -ETXTBSY; goto out; } - buffer->read_in_progress = 1; + buffer->read_in_progress = true; if (buffer->needs_read_fill) { /* perform first read with buf == NULL to get extent */ @@ -325,7 +325,7 @@ configfs_write_bin_file(struct file *file, const char __user *buf, len = -ETXTBSY; goto out; } - buffer->write_in_progress = 1; + buffer->write_in_progress = true; /* buffer grows? */ if (*ppos + count > buffer->bin_buffer_size) { @@ -429,8 +429,8 @@ static int check_perm(struct inode * inode, struct file * file, int type) } mutex_init(&buffer->mutex); buffer->needs_read_fill = 1; - buffer->read_in_progress = 0; - buffer->write_in_progress = 0; + buffer->read_in_progress = false; + buffer->write_in_progress = false; buffer->ops = ops; file->private_data = buffer; goto Done; @@ -488,10 +488,10 @@ static int configfs_release_bin_file(struct inode *inode, struct file *filp) ssize_t len = 0; int ret; - buffer->read_in_progress = 0; + buffer->read_in_progress = false; if (buffer->write_in_progress) { - buffer->write_in_progress = 0; + buffer->write_in_progress = false; len = bin_attr->write(item, buffer->bin_buffer, buffer->bin_buffer_size); diff --git a/fs/configfs/item.c b/fs/configfs/item.c index a66f6624d899..88f266efc09b 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -113,7 +113,7 @@ EXPORT_SYMBOL(config_item_set_name); void config_item_init_type_name(struct config_item *item, const char *name, - struct config_item_type *type) + const struct config_item_type *type) { config_item_set_name(item, "%s", name); item->ci_type = type; @@ -122,7 +122,7 @@ void config_item_init_type_name(struct config_item *item, EXPORT_SYMBOL(config_item_init_type_name); void config_group_init_type_name(struct config_group *group, const char *name, - struct config_item_type *type) + const struct config_item_type *type) { config_item_set_name(&group->cg_item, "%s", name); group->cg_item.ci_type = type; @@ -148,7 +148,7 @@ EXPORT_SYMBOL(config_item_get_unless_zero); static void config_item_cleanup(struct config_item *item) { - struct config_item_type *t = item->ci_type; + const struct config_item_type *t = item->ci_type; struct config_group *s = item->ci_group; struct config_item *parent = item->ci_parent; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index c8aabba502f6..78ffc2699993 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -138,7 +138,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna struct configfs_dirent *sd; struct config_item *parent_item; struct config_item *target_item = NULL; - struct config_item_type *type; + const struct config_item_type *type; sd = dentry->d_parent->d_fsdata; /* @@ -186,7 +186,7 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry) struct configfs_dirent *sd = dentry->d_fsdata; struct configfs_symlink *sl; struct config_item *parent_item; - struct config_item_type *type; + const struct config_item_type *type; int ret; ret = -EPERM; /* What lack-of-symlink returns */ diff --git a/fs/coredump.c b/fs/coredump.c index 52c63d6c9143..1e2c87acac9b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -680,16 +680,11 @@ void do_coredump(const siginfo_t *siginfo) * privs and don't want to unlink another user's coredump. */ if (!need_suid_safe) { - mm_segment_t old_fs; - - old_fs = get_fs(); - set_fs(KERNEL_DS); /* * If it doesn't exist, that's fine. If there's some * other problem, we'll catch it at the filp_open(). */ - (void) sys_unlink((const char __user *)cn.corename); - set_fs(old_fs); + do_unlinkat(AT_FDCWD, getname_kernel(cn.corename)); } /* diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index 11b29d491b7c..58e2fe40b2a0 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -1,6 +1,5 @@ config CRAMFS - tristate "Compressed ROM file system support (cramfs) (OBSOLETE)" - depends on BLOCK + tristate "Compressed ROM file system support (cramfs)" select ZLIB_INFLATE help Saying Y here includes support for CramFs (Compressed ROM File @@ -16,7 +15,40 @@ config CRAMFS cramfs. Note that the root file system (the one containing the directory /) cannot be compiled as a module. - This filesystem is obsoleted by SquashFS, which is much better - in terms of performance and features. + This filesystem is limited in capabilities and performance on + purpose to remain small and low on RAM usage. It is most suitable + for small embedded systems. If you have ample RAM to spare, you may + consider a more capable compressed filesystem such as SquashFS + which is much better in terms of performance and features. + + If unsure, say N. + +config CRAMFS_BLOCKDEV + bool "Support CramFs image over a regular block device" if EXPERT + depends on CRAMFS && BLOCK + default y + help + This option allows the CramFs driver to load data from a regular + block device such a disk partition or a ramdisk. + +config CRAMFS_MTD + bool "Support CramFs image directly mapped in physical memory" + depends on CRAMFS && MTD + depends on CRAMFS=m || MTD=y + default y if !CRAMFS_BLOCKDEV + help + This option allows the CramFs driver to load data directly from + a linear adressed memory range (usually non volatile memory + like flash) instead of going through the block device layer. + This saves some memory since no intermediate buffering is + necessary. + + The location of the CramFs image is determined by a + MTD device capable of direct memory mapping e.g. from + the 'physmap' map driver or a resulting MTD partition. + For example, this would mount the cramfs image stored in + the MTD partition named "xip_fs" on the /mnt mountpoint: + + mount -t cramfs mtd:xip_fs /mnt If unsure, say N. diff --git a/fs/cramfs/README b/fs/cramfs/README index 9d4e7ea311f4..d71b27e0ff15 100644 --- a/fs/cramfs/README +++ b/fs/cramfs/README @@ -49,17 +49,46 @@ same as the start of the (i+1)'th <block> if there is one). The first <block> immediately follows the last <block_pointer> for the file. <block_pointer>s are each 32 bits long. +When the CRAMFS_FLAG_EXT_BLOCK_POINTERS capability bit is set, each +<block_pointer>'s top bits may contain special flags as follows: + +CRAMFS_BLK_FLAG_UNCOMPRESSED (bit 31): + The block data is not compressed and should be copied verbatim. + +CRAMFS_BLK_FLAG_DIRECT_PTR (bit 30): + The <block_pointer> stores the actual block start offset and not + its end, shifted right by 2 bits. The block must therefore be + aligned to a 4-byte boundary. The block size is either blksize + if CRAMFS_BLK_FLAG_UNCOMPRESSED is also specified, otherwise + the compressed data length is included in the first 2 bytes of + the block data. This is used to allow discontiguous data layout + and specific data block alignments e.g. for XIP applications. + + The order of <file_data>'s is a depth-first descent of the directory tree, i.e. the same order as `find -size +0 \( -type f -o -type l \) -print'. <block>: The i'th <block> is the output of zlib's compress function -applied to the i'th blksize-sized chunk of the input data. +applied to the i'th blksize-sized chunk of the input data if the +corresponding CRAMFS_BLK_FLAG_UNCOMPRESSED <block_ptr> bit is not set, +otherwise it is the input data directly. (For the last <block> of the file, the input may of course be smaller.) Each <block> may be a different size. (See <block_pointer> above.) + <block>s are merely byte-aligned, not generally u32-aligned. +When CRAMFS_BLK_FLAG_DIRECT_PTR is specified then the corresponding +<block> may be located anywhere and not necessarily contiguous with +the previous/next blocks. In that case it is minimally u32-aligned. +If CRAMFS_BLK_FLAG_UNCOMPRESSED is also specified then the size is always +blksize except for the last block which is limited by the file length. +If CRAMFS_BLK_FLAG_DIRECT_PTR is set and CRAMFS_BLK_FLAG_UNCOMPRESSED +is not set then the first 2 bytes of the block contains the size of the +remaining block data as this cannot be determined from the placement of +logically adjacent blocks. + Holes ----- diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 7919967488cb..017b0ab19bc4 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -15,10 +15,15 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/file.h> #include <linux/pagemap.h> +#include <linux/pfn_t.h> +#include <linux/ramfs.h> #include <linux/init.h> #include <linux/string.h> #include <linux/blkdev.h> +#include <linux/mtd/mtd.h> +#include <linux/mtd/super.h> #include <linux/slab.h> #include <linux/vfs.h> #include <linux/mutex.h> @@ -36,6 +41,9 @@ struct cramfs_sb_info { unsigned long blocks; unsigned long files; unsigned long flags; + void *linear_virt_addr; + resource_size_t linear_phys_addr; + size_t mtd_point_size; }; static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) @@ -46,6 +54,7 @@ static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) static const struct super_operations cramfs_ops; static const struct inode_operations cramfs_dir_inode_operations; static const struct file_operations cramfs_directory_operations; +static const struct file_operations cramfs_physmem_fops; static const struct address_space_operations cramfs_aops; static DEFINE_MUTEX(read_mutex); @@ -93,6 +102,10 @@ static struct inode *get_cramfs_inode(struct super_block *sb, case S_IFREG: inode->i_fop = &generic_ro_fops; inode->i_data.a_ops = &cramfs_aops; + if (IS_ENABLED(CONFIG_CRAMFS_MTD) && + CRAMFS_SB(sb)->flags & CRAMFS_FLAG_EXT_BLOCK_POINTERS && + CRAMFS_SB(sb)->linear_phys_addr) + inode->i_fop = &cramfs_physmem_fops; break; case S_IFDIR: inode->i_op = &cramfs_dir_inode_operations; @@ -140,6 +153,9 @@ static struct inode *get_cramfs_inode(struct super_block *sb, * BLKS_PER_BUF*PAGE_SIZE, so that the caller doesn't need to * worry about end-of-buffer issues even when decompressing a full * page cache. + * + * Note: This is all optimized away at compile time when + * CONFIG_CRAMFS_BLOCKDEV=n. */ #define READ_BUFFERS (2) /* NEXT_BUFFER(): Loop over [0..(READ_BUFFERS-1)]. */ @@ -160,10 +176,10 @@ static struct super_block *buffer_dev[READ_BUFFERS]; static int next_buffer; /* - * Returns a pointer to a buffer containing at least LEN bytes of - * filesystem starting at byte offset OFFSET into the filesystem. + * Populate our block cache and return a pointer to it. */ -static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned int len) +static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, + unsigned int len) { struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; struct page *pages[BLKS_PER_BUF]; @@ -239,49 +255,278 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i return read_buffers[buffer] + offset; } +/* + * Return a pointer to the linearly addressed cramfs image in memory. + */ +static void *cramfs_direct_read(struct super_block *sb, unsigned int offset, + unsigned int len) +{ + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + + if (!len) + return NULL; + if (len > sbi->size || offset > sbi->size - len) + return page_address(ZERO_PAGE(0)); + return sbi->linear_virt_addr + offset; +} + +/* + * Returns a pointer to a buffer containing at least LEN bytes of + * filesystem starting at byte offset OFFSET into the filesystem. + */ +static void *cramfs_read(struct super_block *sb, unsigned int offset, + unsigned int len) +{ + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + + if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sbi->linear_virt_addr) + return cramfs_direct_read(sb, offset, len); + else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) + return cramfs_blkdev_read(sb, offset, len); + else + return NULL; +} + +/* + * For a mapping to be possible, we need a range of uncompressed and + * contiguous blocks. Return the offset for the first block and number of + * valid blocks for which that is true, or zero otherwise. + */ +static u32 cramfs_get_block_range(struct inode *inode, u32 pgoff, u32 *pages) +{ + struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); + int i; + u32 *blockptrs, first_block_addr; + + /* + * We can dereference memory directly here as this code may be + * reached only when there is a direct filesystem image mapping + * available in memory. + */ + blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode) + pgoff * 4); + first_block_addr = blockptrs[0] & ~CRAMFS_BLK_FLAGS; + i = 0; + do { + u32 block_off = i * (PAGE_SIZE >> CRAMFS_BLK_DIRECT_PTR_SHIFT); + u32 expect = (first_block_addr + block_off) | + CRAMFS_BLK_FLAG_DIRECT_PTR | + CRAMFS_BLK_FLAG_UNCOMPRESSED; + if (blockptrs[i] != expect) { + pr_debug("range: block %d/%d got %#x expects %#x\n", + pgoff+i, pgoff + *pages - 1, + blockptrs[i], expect); + if (i == 0) + return 0; + break; + } + } while (++i < *pages); + + *pages = i; + return first_block_addr << CRAMFS_BLK_DIRECT_PTR_SHIFT; +} + +#ifdef CONFIG_MMU + +/* + * Return true if the last page of a file in the filesystem image contains + * some other data that doesn't belong to that file. It is assumed that the + * last block is CRAMFS_BLK_FLAG_DIRECT_PTR | CRAMFS_BLK_FLAG_UNCOMPRESSED + * (verified by cramfs_get_block_range() and directly accessible in memory. + */ +static bool cramfs_last_page_is_shared(struct inode *inode) +{ + struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); + u32 partial, last_page, blockaddr, *blockptrs; + char *tail_data; + + partial = offset_in_page(inode->i_size); + if (!partial) + return false; + last_page = inode->i_size >> PAGE_SHIFT; + blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode)); + blockaddr = blockptrs[last_page] & ~CRAMFS_BLK_FLAGS; + blockaddr <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; + tail_data = sbi->linear_virt_addr + blockaddr + partial; + return memchr_inv(tail_data, 0, PAGE_SIZE - partial) ? true : false; +} + +static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(file); + struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); + unsigned int pages, max_pages, offset; + unsigned long address, pgoff = vma->vm_pgoff; + char *bailout_reason; + int ret; + + ret = generic_file_readonly_mmap(file, vma); + if (ret) + return ret; + + /* + * Now try to pre-populate ptes for this vma with a direct + * mapping avoiding memory allocation when possible. + */ + + /* Could COW work here? */ + bailout_reason = "vma is writable"; + if (vma->vm_flags & VM_WRITE) + goto bailout; + + max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + bailout_reason = "beyond file limit"; + if (pgoff >= max_pages) + goto bailout; + pages = min(vma_pages(vma), max_pages - pgoff); + + offset = cramfs_get_block_range(inode, pgoff, &pages); + bailout_reason = "unsuitable block layout"; + if (!offset) + goto bailout; + address = sbi->linear_phys_addr + offset; + bailout_reason = "data is not page aligned"; + if (!PAGE_ALIGNED(address)) + goto bailout; + + /* Don't map the last page if it contains some other data */ + if (pgoff + pages == max_pages && cramfs_last_page_is_shared(inode)) { + pr_debug("mmap: %s: last page is shared\n", + file_dentry(file)->d_name.name); + pages--; + } + + if (!pages) { + bailout_reason = "no suitable block remaining"; + goto bailout; + } + + if (pages == vma_pages(vma)) { + /* + * The entire vma is mappable. remap_pfn_range() will + * make it distinguishable from a non-direct mapping + * in /proc/<pid>/maps by substituting the file offset + * with the actual physical address. + */ + ret = remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, + pages * PAGE_SIZE, vma->vm_page_prot); + } else { + /* + * Let's create a mixed map if we can't map it all. + * The normal paging machinery will take care of the + * unpopulated ptes via cramfs_readpage(). + */ + int i; + vma->vm_flags |= VM_MIXEDMAP; + for (i = 0; i < pages && !ret; i++) { + unsigned long off = i * PAGE_SIZE; + pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV); + ret = vm_insert_mixed(vma, vma->vm_start + off, pfn); + } + } + + if (!ret) + pr_debug("mapped %s[%lu] at 0x%08lx (%u/%lu pages) " + "to vma 0x%08lx, page_prot 0x%llx\n", + file_dentry(file)->d_name.name, pgoff, + address, pages, vma_pages(vma), vma->vm_start, + (unsigned long long)pgprot_val(vma->vm_page_prot)); + return ret; + +bailout: + pr_debug("%s[%lu]: direct mmap impossible: %s\n", + file_dentry(file)->d_name.name, pgoff, bailout_reason); + /* Didn't manage any direct map, but normal paging is still possible */ + return 0; +} + +#else /* CONFIG_MMU */ + +static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; +} + +static unsigned long cramfs_physmem_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + unsigned int pages, block_pages, max_pages, offset; + + pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= max_pages || pages > max_pages - pgoff) + return -EINVAL; + block_pages = pages; + offset = cramfs_get_block_range(inode, pgoff, &block_pages); + if (!offset || block_pages != pages) + return -ENOSYS; + addr = sbi->linear_phys_addr + offset; + pr_debug("get_unmapped for %s ofs %#lx siz %lu at 0x%08lx\n", + file_dentry(file)->d_name.name, pgoff*PAGE_SIZE, len, addr); + return addr; +} + +static unsigned int cramfs_physmem_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | + NOMMU_MAP_READ | NOMMU_MAP_EXEC; +} + +#endif /* CONFIG_MMU */ + +static const struct file_operations cramfs_physmem_fops = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .splice_read = generic_file_splice_read, + .mmap = cramfs_physmem_mmap, +#ifndef CONFIG_MMU + .get_unmapped_area = cramfs_physmem_get_unmapped_area, + .mmap_capabilities = cramfs_physmem_mmap_capabilities, +#endif +}; + static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); - kill_block_super(sb); + if (IS_ENABLED(CCONFIG_CRAMFS_MTD) && sb->s_mtd) { + if (sbi && sbi->mtd_point_size) + mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size); + kill_mtd_super(sb); + } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { + kill_block_super(sb); + } kfree(sbi); } static int cramfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } -static int cramfs_fill_super(struct super_block *sb, void *data, int silent) +static int cramfs_read_super(struct super_block *sb, + struct cramfs_super *super, int silent) { - int i; - struct cramfs_super super; + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); unsigned long root_offset; - struct cramfs_sb_info *sbi; - struct inode *root; - - sb->s_flags |= MS_RDONLY; - - sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - sb->s_fs_info = sbi; - /* Invalidate the read buffers on mount: think disk change.. */ - mutex_lock(&read_mutex); - for (i = 0; i < READ_BUFFERS; i++) - buffer_blocknr[i] = -1; + /* We don't know the real size yet */ + sbi->size = PAGE_SIZE; /* Read the first block and get the superblock from it */ - memcpy(&super, cramfs_read(sb, 0, sizeof(super)), sizeof(super)); + mutex_lock(&read_mutex); + memcpy(super, cramfs_read(sb, 0, sizeof(*super)), sizeof(*super)); mutex_unlock(&read_mutex); /* Do sanity checks on the superblock */ - if (super.magic != CRAMFS_MAGIC) { + if (super->magic != CRAMFS_MAGIC) { /* check for wrong endianness */ - if (super.magic == CRAMFS_MAGIC_WEND) { + if (super->magic == CRAMFS_MAGIC_WEND) { if (!silent) pr_err("wrong endianness\n"); return -EINVAL; @@ -289,10 +534,12 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) /* check at 512 byte offset */ mutex_lock(&read_mutex); - memcpy(&super, cramfs_read(sb, 512, sizeof(super)), sizeof(super)); + memcpy(super, + cramfs_read(sb, 512, sizeof(*super)), + sizeof(*super)); mutex_unlock(&read_mutex); - if (super.magic != CRAMFS_MAGIC) { - if (super.magic == CRAMFS_MAGIC_WEND && !silent) + if (super->magic != CRAMFS_MAGIC) { + if (super->magic == CRAMFS_MAGIC_WEND && !silent) pr_err("wrong endianness\n"); else if (!silent) pr_err("wrong magic\n"); @@ -301,34 +548,34 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) } /* get feature flags first */ - if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { + if (super->flags & ~CRAMFS_SUPPORTED_FLAGS) { pr_err("unsupported filesystem features\n"); return -EINVAL; } /* Check that the root inode is in a sane state */ - if (!S_ISDIR(super.root.mode)) { + if (!S_ISDIR(super->root.mode)) { pr_err("root is not a directory\n"); return -EINVAL; } /* correct strange, hard-coded permissions of mkcramfs */ - super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + super->root.mode |= 0555; - root_offset = super.root.offset << 2; - if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { - sbi->size = super.size; - sbi->blocks = super.fsid.blocks; - sbi->files = super.fsid.files; + root_offset = super->root.offset << 2; + if (super->flags & CRAMFS_FLAG_FSID_VERSION_2) { + sbi->size = super->size; + sbi->blocks = super->fsid.blocks; + sbi->files = super->fsid.files; } else { sbi->size = 1<<28; sbi->blocks = 0; sbi->files = 0; } - sbi->magic = super.magic; - sbi->flags = super.flags; + sbi->magic = super->magic; + sbi->flags = super->flags; if (root_offset == 0) pr_info("empty filesystem"); - else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && + else if (!(super->flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && ((root_offset != sizeof(struct cramfs_super)) && (root_offset != 512 + sizeof(struct cramfs_super)))) { @@ -336,9 +583,18 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) return -EINVAL; } + return 0; +} + +static int cramfs_finalize_super(struct super_block *sb, + struct cramfs_inode *cramfs_root) +{ + struct inode *root; + /* Set it all up.. */ + sb->s_flags |= SB_RDONLY; sb->s_op = &cramfs_ops; - root = get_cramfs_inode(sb, &super.root, 0); + root = get_cramfs_inode(sb, cramfs_root, 0); if (IS_ERR(root)) return PTR_ERR(root); sb->s_root = d_make_root(root); @@ -347,10 +603,79 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } +static int cramfs_blkdev_fill_super(struct super_block *sb, void *data, + int silent) +{ + struct cramfs_sb_info *sbi; + struct cramfs_super super; + int i, err; + + sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + /* Invalidate the read buffers on mount: think disk change.. */ + for (i = 0; i < READ_BUFFERS; i++) + buffer_blocknr[i] = -1; + + err = cramfs_read_super(sb, &super, silent); + if (err) + return err; + return cramfs_finalize_super(sb, &super.root); +} + +static int cramfs_mtd_fill_super(struct super_block *sb, void *data, + int silent) +{ + struct cramfs_sb_info *sbi; + struct cramfs_super super; + int err; + + sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + /* Map only one page for now. Will remap it when fs size is known. */ + err = mtd_point(sb->s_mtd, 0, PAGE_SIZE, &sbi->mtd_point_size, + &sbi->linear_virt_addr, &sbi->linear_phys_addr); + if (err || sbi->mtd_point_size != PAGE_SIZE) { + pr_err("unable to get direct memory access to mtd:%s\n", + sb->s_mtd->name); + return err ? : -ENODATA; + } + + pr_info("checking physical address %pap for linear cramfs image\n", + &sbi->linear_phys_addr); + err = cramfs_read_super(sb, &super, silent); + if (err) + return err; + + /* Remap the whole filesystem now */ + pr_info("linear cramfs image on mtd:%s appears to be %lu KB in size\n", + sb->s_mtd->name, sbi->size/1024); + mtd_unpoint(sb->s_mtd, 0, PAGE_SIZE); + err = mtd_point(sb->s_mtd, 0, sbi->size, &sbi->mtd_point_size, + &sbi->linear_virt_addr, &sbi->linear_phys_addr); + if (err || sbi->mtd_point_size != sbi->size) { + pr_err("unable to get direct memory access to mtd:%s\n", + sb->s_mtd->name); + return err ? : -ENODATA; + } + + return cramfs_finalize_super(sb, &super.root); +} + static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + u64 id = 0; + + if (sb->s_bdev) + id = huge_encode_dev(sb->s_bdev->bd_dev); + else if (sb->s_dev) + id = huge_encode_dev(sb->s_dev); buf->f_type = CRAMFS_MAGIC; buf->f_bsize = PAGE_SIZE; @@ -502,34 +827,86 @@ static int cramfs_readpage(struct file *file, struct page *page) if (page->index < maxblock) { struct super_block *sb = inode->i_sb; - u32 blkptr_offset = OFFSET(inode) + page->index*4; - u32 start_offset, compr_len; + u32 blkptr_offset = OFFSET(inode) + page->index * 4; + u32 block_ptr, block_start, block_len; + bool uncompressed, direct; - start_offset = OFFSET(inode) + maxblock*4; mutex_lock(&read_mutex); - if (page->index) - start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, - 4); - compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - - start_offset); - mutex_unlock(&read_mutex); + block_ptr = *(u32 *) cramfs_read(sb, blkptr_offset, 4); + uncompressed = (block_ptr & CRAMFS_BLK_FLAG_UNCOMPRESSED); + direct = (block_ptr & CRAMFS_BLK_FLAG_DIRECT_PTR); + block_ptr &= ~CRAMFS_BLK_FLAGS; + + if (direct) { + /* + * The block pointer is an absolute start pointer, + * shifted by 2 bits. The size is included in the + * first 2 bytes of the data block when compressed, + * or PAGE_SIZE otherwise. + */ + block_start = block_ptr << CRAMFS_BLK_DIRECT_PTR_SHIFT; + if (uncompressed) { + block_len = PAGE_SIZE; + /* if last block: cap to file length */ + if (page->index == maxblock - 1) + block_len = + offset_in_page(inode->i_size); + } else { + block_len = *(u16 *) + cramfs_read(sb, block_start, 2); + block_start += 2; + } + } else { + /* + * The block pointer indicates one past the end of + * the current block (start of next block). If this + * is the first block then it starts where the block + * pointer table ends, otherwise its start comes + * from the previous block's pointer. + */ + block_start = OFFSET(inode) + maxblock * 4; + if (page->index) + block_start = *(u32 *) + cramfs_read(sb, blkptr_offset - 4, 4); + /* Beware... previous ptr might be a direct ptr */ + if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) { + /* See comments on earlier code. */ + u32 prev_start = block_start; + block_start = prev_start & ~CRAMFS_BLK_FLAGS; + block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; + if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) { + block_start += PAGE_SIZE; + } else { + block_len = *(u16 *) + cramfs_read(sb, block_start, 2); + block_start += 2 + block_len; + } + } + block_start &= ~CRAMFS_BLK_FLAGS; + block_len = block_ptr - block_start; + } - if (compr_len == 0) + if (block_len == 0) ; /* hole */ - else if (unlikely(compr_len > (PAGE_SIZE << 1))) { - pr_err("bad compressed blocksize %u\n", - compr_len); + else if (unlikely(block_len > 2*PAGE_SIZE || + (uncompressed && block_len > PAGE_SIZE))) { + mutex_unlock(&read_mutex); + pr_err("bad data blocksize %u\n", block_len); goto err; + } else if (uncompressed) { + memcpy(pgdata, + cramfs_read(sb, block_start, block_len), + block_len); + bytes_filled = block_len; } else { - mutex_lock(&read_mutex); bytes_filled = cramfs_uncompress_block(pgdata, PAGE_SIZE, - cramfs_read(sb, start_offset, compr_len), - compr_len); - mutex_unlock(&read_mutex); - if (unlikely(bytes_filled < 0)) - goto err; + cramfs_read(sb, block_start, block_len), + block_len); } + mutex_unlock(&read_mutex); + if (unlikely(bytes_filled < 0)) + goto err; } memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled); @@ -573,10 +950,22 @@ static const struct super_operations cramfs_ops = { .statfs = cramfs_statfs, }; -static struct dentry *cramfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static struct dentry *cramfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { - return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super); + struct dentry *ret = ERR_PTR(-ENOPROTOOPT); + + if (IS_ENABLED(CONFIG_CRAMFS_MTD)) { + ret = mount_mtd(fs_type, flags, dev_name, data, + cramfs_mtd_fill_super); + if (!IS_ERR(ret)) + return ret; + } + if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) { + ret = mount_bdev(fs_type, flags, dev_name, data, + cramfs_blkdev_fill_super); + } + return ret; } static struct file_system_type cramfs_fs_type = { diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index 9f6607f17b53..cb496989a6b6 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o -fscrypto-y := crypto.o fname.o policy.o keyinfo.o +fscrypto-y := crypto.o fname.o hooks.o keyinfo.o policy.o fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index c7835df7e7b8..732a786cce9d 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -126,21 +126,6 @@ struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) } EXPORT_SYMBOL(fscrypt_get_ctx); -/** - * page_crypt_complete() - completion callback for page crypto - * @req: The asynchronous cipher request context - * @res: The result of the cipher operation - */ -static void page_crypt_complete(struct crypto_async_request *req, int res) -{ - struct fscrypt_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, struct page *dest_page, unsigned int len, @@ -151,7 +136,7 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, u8 padding[FS_IV_SIZE - sizeof(__le64)]; } iv; struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct scatterlist dst, src; struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; @@ -179,7 +164,7 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - page_crypt_complete, &ecr); + crypto_req_done, &wait); sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, len, offs); @@ -187,14 +172,9 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, sg_set_page(&src, src_page, len, offs); skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) - res = crypto_skcipher_decrypt(req); + res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); else - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res) { printk_ratelimited(KERN_ERR @@ -340,7 +320,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; dir = dget_parent(dentry); - if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) { + if (!IS_ENCRYPTED(d_inode(dir))) { dput(dir); return 0; } @@ -410,11 +390,8 @@ int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - /* - * No need to allocate a bounce page pool if there already is one or - * this FS won't use it. - */ - if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) + /* No need to allocate a bounce page pool if this FS won't use it. */ + if (cop_flags & FS_CFLG_OWN_PAGES) return 0; mutex_lock(&fscrypt_init_mutex); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 8606da1df0aa..305541bcd108 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -16,21 +16,6 @@ #include "fscrypt_private.h" /** - * fname_crypt_complete() - completion callback for filename crypto - * @req: The asynchronous cipher request context - * @res: The result of the cipher operation - */ -static void fname_crypt_complete(struct crypto_async_request *req, int res) -{ - struct fscrypt_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - -/** * fname_encrypt() - encrypt a filename * * The caller must have allocated sufficient memory for the @oname string. @@ -41,7 +26,7 @@ static int fname_encrypt(struct inode *inode, const struct qstr *iname, struct fscrypt_str *oname) { struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; @@ -77,17 +62,12 @@ static int fname_encrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fname_crypt_complete, &ecr); + crypto_req_done, &wait); sg_init_one(&sg, oname->name, cryptlen); skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv); /* Do the encryption */ - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - /* Request is being completed asynchronously; wait for it */ - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR @@ -111,7 +91,7 @@ static int fname_decrypt(struct inode *inode, struct fscrypt_str *oname) { struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; @@ -132,7 +112,7 @@ static int fname_decrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fname_crypt_complete, &ecr); + crypto_req_done, &wait); /* Initialize IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); @@ -141,11 +121,7 @@ static int fname_decrypt(struct inode *inode, sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); - res = crypto_skcipher_decrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR @@ -383,8 +359,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; - if (!dir->i_sb->s_cop->is_encrypted(dir) || - fscrypt_is_dot_dotdot(iname)) { + if (!IS_ENCRYPTED(dir) || fscrypt_is_dot_dotdot(iname)) { fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 092e9dad1414..c0b4f5597e1a 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -12,7 +12,8 @@ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H -#include <linux/fscrypt_supp.h> +#define __FS_HAS_ENCRYPTION 1 +#include <linux/fscrypt.h> #include <crypto/hash.h> /* Encryption parameters */ @@ -70,16 +71,6 @@ typedef enum { #define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 #define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 -struct fscrypt_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_FS_COMPLETION_RESULT(ecr) \ - struct fscrypt_completion_result ecr = { \ - COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 } - - /* crypto.c */ extern int fscrypt_initialize(unsigned int cop_flags); extern struct workqueue_struct *fscrypt_read_workqueue; diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c new file mode 100644 index 000000000000..9f5fb2eb9cf7 --- /dev/null +++ b/fs/crypto/hooks.c @@ -0,0 +1,112 @@ +/* + * fs/crypto/hooks.c + * + * Encryption hooks for higher-level filesystem operations. + */ + +#include <linux/ratelimit.h> +#include "fscrypt_private.h" + +/** + * fscrypt_file_open - prepare to open a possibly-encrypted regular file + * @inode: the inode being opened + * @filp: the struct file being set up + * + * Currently, an encrypted regular file can only be opened if its encryption key + * is available; access to the raw encrypted contents is not supported. + * Therefore, we first set up the inode's encryption key (if not already done) + * and return an error if it's unavailable. + * + * We also verify that if the parent directory (from the path via which the file + * is being opened) is encrypted, then the inode being opened uses the same + * encryption policy. This is needed as part of the enforcement that all files + * in an encrypted directory tree use the same encryption policy, as a + * protection against certain types of offline attacks. Note that this check is + * needed even when opening an *unencrypted* file, since it's forbidden to have + * an unencrypted file in an encrypted directory. + * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + */ +int fscrypt_file_open(struct inode *inode, struct file *filp) +{ + int err; + struct dentry *dir; + + err = fscrypt_require_key(inode); + if (err) + return err; + + dir = dget_parent(file_dentry(filp)); + if (IS_ENCRYPTED(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + pr_warn_ratelimited("fscrypt: inconsistent encryption contexts: %lu/%lu", + d_inode(dir)->i_ino, inode->i_ino); + err = -EPERM; + } + dput(dir); + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_file_open); + +int __fscrypt_prepare_link(struct inode *inode, struct inode *dir) +{ + int err; + + err = fscrypt_require_key(dir); + if (err) + return err; + + if (!fscrypt_has_permitted_context(dir, inode)) + return -EPERM; + + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_link); + +int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + int err; + + err = fscrypt_require_key(old_dir); + if (err) + return err; + + err = fscrypt_require_key(new_dir); + if (err) + return err; + + if (old_dir != new_dir) { + if (IS_ENCRYPTED(new_dir) && + !fscrypt_has_permitted_context(new_dir, + d_inode(old_dentry))) + return -EPERM; + + if ((flags & RENAME_EXCHANGE) && + IS_ENCRYPTED(old_dir) && + !fscrypt_has_permitted_context(old_dir, + d_inode(new_dentry))) + return -EPERM; + } + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename); + +int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry) +{ + int err = fscrypt_get_encryption_info(dir); + + if (err) + return err; + + if (fscrypt_has_encryption_key(dir)) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); + } + + d_set_d_op(dentry, &fscrypt_d_ops); + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index a38630214058..5e6e846f5a24 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -18,17 +18,6 @@ static struct crypto_shash *essiv_hash_tfm; -static void derive_crypt_complete(struct crypto_async_request *req, int rc) -{ - struct fscrypt_completion_result *ecr = req->data; - - if (rc == -EINPROGRESS) - return; - - ecr->res = rc; - complete(&ecr->completion); -} - /** * derive_key_aes() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. @@ -43,7 +32,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], { int res = 0; struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); @@ -60,7 +49,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - derive_crypt_complete, &ecr); + crypto_req_done, &wait); res = crypto_skcipher_setkey(tfm, deriving_key, FS_AES_128_ECB_KEY_SIZE); if (res < 0) @@ -70,11 +59,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], sg_init_one(&dst_sg, derived_raw_key, source_key->size); skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, NULL); - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); out: skcipher_request_free(req); crypto_free_skcipher(tfm); @@ -274,7 +259,7 @@ int fscrypt_get_encryption_info(struct inode *inode) res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (!fscrypt_dummy_context_enabled(inode) || - inode->i_sb->s_cop->is_encrypted(inode)) + IS_ENCRYPTED(inode)) return res; /* Fake up a context for an unencrypted directory */ memset(&ctx, 0, sizeof(ctx)); @@ -374,7 +359,7 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) struct fscrypt_info *prev; if (ci == NULL) - ci = ACCESS_ONCE(inode->i_crypt_info); + ci = READ_ONCE(inode->i_crypt_info); if (ci == NULL) return; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index a120649beeca..c6d431a5cce9 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -110,7 +110,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) struct fscrypt_policy policy; int res; - if (!inode->i_sb->s_cop->is_encrypted(inode)) + if (!IS_ENCRYPTED(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); @@ -167,11 +167,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 1; /* No restrictions if the parent directory is unencrypted */ - if (!cops->is_encrypted(parent)) + if (!IS_ENCRYPTED(parent)) return 1; /* Encrypted directories must not contain unencrypted files */ - if (!cops->is_encrypted(child)) + if (!IS_ENCRYPTED(child)) return 0; /* @@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, static void *dax_insert_mapping_entry(struct address_space *mapping, struct vm_fault *vmf, void *entry, sector_t sector, - unsigned long flags) + unsigned long flags, bool dirty) { struct radix_tree_root *page_tree = &mapping->page_tree; void *new_entry; pgoff_t index = vmf->pgoff; - if (vmf->flags & FAULT_FLAG_WRITE) + if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { @@ -565,11 +565,11 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, ret = __radix_tree_lookup(page_tree, index, &node, &slot); WARN_ON_ONCE(ret != entry); __radix_tree_replace(page_tree, node, slot, - new_entry, NULL, NULL); + new_entry, NULL); entry = new_entry; } - if (vmf->flags & FAULT_FLAG_WRITE) + if (dirty) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); @@ -614,6 +614,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) continue; + /* + * No need to call mmu_notifier_invalidate_range() as we are + * downgrading page table protection not changing it to point + * to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ if (pmdp) { #ifdef CONFIG_FS_DAX_PMD pmd_t pmd; @@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pmd: spin_unlock(ptl); #endif @@ -643,7 +649,6 @@ unlock_pmd: pte = pte_wrprotect(pte); pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, address, ptep, pte); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pte: pte_unmap_unlock(ptep, ptl); } @@ -789,7 +794,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, tag_pages_for_writeback(mapping, start_index, end_index); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (!done) { pvec.nr = find_get_entries_tag(mapping, start_index, PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, @@ -820,38 +825,42 @@ out: } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static int dax_insert_mapping(struct address_space *mapping, - struct block_device *bdev, struct dax_device *dax_dev, - sector_t sector, size_t size, void *entry, - struct vm_area_struct *vma, struct vm_fault *vmf) +static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) { - unsigned long vaddr = vmf->address; - void *ret, *kaddr; + return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; +} + +static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size, + pfn_t *pfnp) +{ + const sector_t sector = dax_iomap_sector(iomap, pos); pgoff_t pgoff; + void *kaddr; int id, rc; - pfn_t pfn; + long length; - rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff); if (rc) return rc; - id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); - if (rc < 0) { - dax_read_unlock(id); - return rc; + length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), + &kaddr, pfnp); + if (length < 0) { + rc = length; + goto out; } + rc = -EINVAL; + if (PFN_PHYS(length) < size) + goto out; + if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) + goto out; + /* For larger pages we need devmap */ + if (length > 1 && !pfn_t_devmap(*pfnp)) + goto out; + rc = 0; +out: dax_read_unlock(id); - - ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); - if (IS_ERR(ret)) - return PTR_ERR(ret); - - trace_dax_insert_mapping(mapping->host, vmf, ret); - if (vmf->flags & FAULT_FLAG_WRITE) - return vm_insert_mixed_mkwrite(vma, vaddr, pfn); - else - return vm_insert_mixed(vma, vaddr, pfn); + return rc; } /* @@ -877,7 +886,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry, } entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, - RADIX_DAX_ZERO_PAGE); + RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(entry2)) { ret = VM_FAULT_SIGBUS; goto out; @@ -936,11 +945,6 @@ int __dax_zero_page_range(struct block_device *bdev, } EXPORT_SYMBOL_GPL(__dax_zero_page_range); -static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) -{ - return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); -} - static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) @@ -1080,19 +1084,33 @@ static int dax_fault_return(int error) return VM_FAULT_SIGBUS; } -static int dax_iomap_pte_fault(struct vm_fault *vmf, +/* + * MAP_SYNC on a dax mapping guarantees dirty metadata is + * flushed on write-faults (non-cow), but not read-faults. + */ +static bool dax_fault_is_synchronous(unsigned long flags, + struct vm_area_struct *vma, struct iomap *iomap) +{ + return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) + && (iomap->flags & IOMAP_F_DIRTY); +} + +static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { - struct address_space *mapping = vmf->vma->vm_file->f_mapping; + struct vm_area_struct *vma = vmf->vma; + struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; - sector_t sector; struct iomap iomap = { 0 }; unsigned flags = IOMAP_FAULT; int error, major = 0; + bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync; int vmf_ret = 0; void *entry; + pfn_t pfn; trace_dax_pte_fault(inode, vmf, vmf_ret); /* @@ -1105,7 +1123,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto out; } - if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) + if (write && !vmf->cow_page) flags |= IOMAP_WRITE; entry = grab_mapping_entry(mapping, vmf->pgoff, 0); @@ -1140,9 +1158,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto error_finish_iomap; } - sector = dax_iomap_sector(&iomap, pos); - if (vmf->cow_page) { + sector_t sector = dax_iomap_sector(&iomap, pos); + switch (iomap.type) { case IOMAP_HOLE: case IOMAP_UNWRITTEN: @@ -1168,22 +1186,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto finish_iomap; } + sync = dax_fault_is_synchronous(flags, vma, &iomap); + switch (iomap.type) { case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } - error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, - sector, PAGE_SIZE, entry, vmf->vma, vmf); + error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn); + if (error < 0) + goto error_finish_iomap; + + entry = dax_insert_mapping_entry(mapping, vmf, entry, + dax_iomap_sector(&iomap, pos), + 0, write && !sync); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto error_finish_iomap; + } + + /* + * If we are doing synchronous page fault and inode needs fsync, + * we can insert PTE into page tables only after that happens. + * Skip insertion for now and return the pfn so that caller can + * insert it after fsync is done. + */ + if (sync) { + if (WARN_ON_ONCE(!pfnp)) { + error = -EIO; + goto error_finish_iomap; + } + *pfnp = pfn; + vmf_ret = VM_FAULT_NEEDDSYNC | major; + goto finish_iomap; + } + trace_dax_insert_mapping(inode, vmf, entry); + if (write) + error = vm_insert_mixed_mkwrite(vma, vaddr, pfn); + else + error = vm_insert_mixed(vma, vaddr, pfn); + /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: - if (!(vmf->flags & FAULT_FLAG_WRITE)) { + if (!write) { vmf_ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } @@ -1218,53 +1269,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, } #ifdef CONFIG_FS_DAX_PMD -static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void *entry) -{ - struct address_space *mapping = vmf->vma->vm_file->f_mapping; - const sector_t sector = dax_iomap_sector(iomap, pos); - struct dax_device *dax_dev = iomap->dax_dev; - struct block_device *bdev = iomap->bdev; - struct inode *inode = mapping->host; - const size_t size = PMD_SIZE; - void *ret = NULL, *kaddr; - long length = 0; - pgoff_t pgoff; - pfn_t pfn = {}; - int id; - - if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) - goto fallback; - - id = dax_read_lock(); - length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); - if (length < 0) - goto unlock_fallback; - length = PFN_PHYS(length); - - if (length < size) - goto unlock_fallback; - if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR) - goto unlock_fallback; - if (!pfn_t_devmap(pfn)) - goto unlock_fallback; - dax_read_unlock(id); - - ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, - RADIX_DAX_PMD); - if (IS_ERR(ret)) - goto fallback; - - trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); - return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, - pfn, vmf->flags & FAULT_FLAG_WRITE); - -unlock_fallback: - dax_read_unlock(id); -fallback: - trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); - return VM_FAULT_FALLBACK; -} +/* + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up + * more often than one might expect in the below functions. + */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) @@ -1283,7 +1292,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, goto fallback; ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, - RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); + RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(ret)) goto fallback; @@ -1305,13 +1314,14 @@ fallback: return VM_FAULT_FALLBACK; } -static int dax_iomap_pmd_fault(struct vm_fault *vmf, +static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; int result = VM_FAULT_FALLBACK; @@ -1320,6 +1330,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, void *entry; loff_t pos; int error; + pfn_t pfn; /* * Check whether offset isn't beyond end of file now. Caller is @@ -1327,7 +1338,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, * this is a reliable test. */ pgoff = linear_page_index(vma, pmd_addr); - max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; + max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); @@ -1351,13 +1362,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, if ((pmd_addr + PMD_SIZE) > vma->vm_end) goto fallback; - if (pgoff > max_pgoff) { + if (pgoff >= max_pgoff) { result = VM_FAULT_SIGBUS; goto out; } /* If the PMD would extend beyond the file size */ - if ((pgoff | PG_PMD_COLOUR) > max_pgoff) + if ((pgoff | PG_PMD_COLOUR) >= max_pgoff) goto fallback; /* @@ -1395,9 +1406,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; + sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap); + switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); + error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn); + if (error < 0) + goto finish_iomap; + + entry = dax_insert_mapping_entry(mapping, vmf, entry, + dax_iomap_sector(&iomap, pos), + RADIX_DAX_PMD, write && !sync); + if (IS_ERR(entry)) + goto finish_iomap; + + /* + * If we are doing synchronous page fault and inode needs fsync, + * we can insert PMD into page tables only after that happens. + * Skip insertion for now and return the pfn so that caller can + * insert it after fsync is done. + */ + if (sync) { + if (WARN_ON_ONCE(!pfnp)) + goto finish_iomap; + *pfnp = pfn; + result = VM_FAULT_NEEDDSYNC; + goto finish_iomap; + } + + trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); + result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, + write); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: @@ -1437,7 +1476,7 @@ out: return result; } #else -static int dax_iomap_pmd_fault(struct vm_fault *vmf, +static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; @@ -1447,7 +1486,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, /** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault - * @ops: iomap ops passed from the file system + * @pe_size: Size of the page to fault in + * @pfnp: PFN to insert for synchronous faults if fsync is required + * @ops: Iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in * their fault handler for DAX files. dax_iomap_fault() assumes the caller @@ -1455,15 +1496,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, * successfully. */ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - const struct iomap_ops *ops) + pfn_t *pfnp, const struct iomap_ops *ops) { switch (pe_size) { case PE_SIZE_PTE: - return dax_iomap_pte_fault(vmf, ops); + return dax_iomap_pte_fault(vmf, pfnp, ops); case PE_SIZE_PMD: - return dax_iomap_pmd_fault(vmf, ops); + return dax_iomap_pmd_fault(vmf, pfnp, ops); default: return VM_FAULT_FALLBACK; } } EXPORT_SYMBOL_GPL(dax_iomap_fault); + +/** + * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables + * @vmf: The description of the fault + * @pe_size: Size of entry to be inserted + * @pfn: PFN to insert + * + * This function inserts writeable PTE or PMD entry into page tables for mmaped + * DAX file. It takes care of marking corresponding radix tree entry as dirty + * as well. + */ +static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, + enum page_entry_size pe_size, + pfn_t pfn) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + void *entry, **slot; + pgoff_t index = vmf->pgoff; + int vmf_ret, error; + + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, &slot); + /* Did we race with someone splitting entry or so? */ + if (!entry || + (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || + (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { + put_unlocked_mapping_entry(mapping, index, entry); + spin_unlock_irq(&mapping->tree_lock); + trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, + VM_FAULT_NOPAGE); + return VM_FAULT_NOPAGE; + } + radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + entry = lock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + switch (pe_size) { + case PE_SIZE_PTE: + error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); + vmf_ret = dax_fault_return(error); + break; +#ifdef CONFIG_FS_DAX_PMD + case PE_SIZE_PMD: + vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + pfn, true); + break; +#endif + default: + vmf_ret = VM_FAULT_FALLBACK; + } + put_locked_mapping_entry(mapping, index); + trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret); + return vmf_ret; +} + +/** + * dax_finish_sync_fault - finish synchronous page fault + * @vmf: The description of the fault + * @pe_size: Size of entry to be inserted + * @pfn: PFN to insert + * + * This function ensures that the file range touched by the page fault is + * stored persistently on the media and handles inserting of appropriate page + * table entry. + */ +int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + pfn_t pfn) +{ + int err; + loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; + size_t len = 0; + + if (pe_size == PE_SIZE_PTE) + len = PAGE_SIZE; + else if (pe_size == PE_SIZE_PMD) + len = PMD_SIZE; + else + WARN_ON_ONCE(1); + err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); + if (err) + return VM_FAULT_SIGBUS; + return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); +} +EXPORT_SYMBOL_GPL(dax_finish_sync_fault); diff --git a/fs/dcache.c b/fs/dcache.c index 34c852af215c..5c7df1df81ff 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -630,7 +630,7 @@ static inline struct dentry *lock_parent(struct dentry *dentry) rcu_read_lock(); spin_unlock(&dentry->d_lock); again: - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); spin_lock(&parent->d_lock); /* * We can't blindly lock dentry until we are sure @@ -721,7 +721,7 @@ static inline bool fast_dput(struct dentry *dentry) * around with a zero refcount. */ smp_rmb(); - d_flags = ACCESS_ONCE(dentry->d_flags); + d_flags = READ_ONCE(dentry->d_flags); d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED; /* Nothing to do? Dropping the reference was all we needed? */ @@ -850,11 +850,11 @@ struct dentry *dget_parent(struct dentry *dentry) * locking. */ rcu_read_lock(); - ret = ACCESS_ONCE(dentry->d_parent); + ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { - if (likely(ret == ACCESS_ONCE(dentry->d_parent))) + if (likely(ret == READ_ONCE(dentry->d_parent))) return ret; dput(ret); } @@ -2705,8 +2705,6 @@ static void swap_names(struct dentry *dentry, struct dentry *target) */ unsigned int i; BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); - kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN); - kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN); for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { swap(((long *) &dentry->d_iname)[i], ((long *) &target->d_iname)[i]); @@ -3040,7 +3038,7 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) * @buflen: allocated length of the buffer * @name: name string and length qstr structure * - * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to + * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are * fetched. However, there may be mismatch between length and pointer. * The length cannot be trusted, we need to copy it byte-by-byte until @@ -3054,8 +3052,8 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) */ static int prepend_name(char **buffer, int *buflen, const struct qstr *name) { - const char *dname = ACCESS_ONCE(name->name); - u32 dlen = ACCESS_ONCE(name->len); + const char *dname = READ_ONCE(name->name); + u32 dlen = READ_ONCE(name->len); char *p; smp_read_barrier_depends(); @@ -3120,7 +3118,7 @@ restart: struct dentry * parent; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { - struct mount *parent = ACCESS_ONCE(mnt->mnt_parent); + struct mount *parent = READ_ONCE(mnt->mnt_parent); /* Escaped? */ if (dentry != vfsmnt->mnt_root) { bptr = *buffer; @@ -3130,7 +3128,7 @@ restart: } /* Global root? */ if (mnt != parent) { - dentry = ACCESS_ONCE(mnt->mnt_mountpoint); + dentry = READ_ONCE(mnt->mnt_mountpoint); mnt = parent; vfsmnt = &mnt->mnt; continue; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 6dabc4a10396..cd12e6576b48 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1,16 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* * file.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * * debugfs is for people to use instead of /proc or /sys. * See Documentation/filesystems/ for more details. - * */ #include <linux/module.h> @@ -22,7 +18,6 @@ #include <linux/slab.h> #include <linux/atomic.h> #include <linux/device.h> -#include <linux/srcu.h> #include <asm/poll.h> #include "internal.h" @@ -48,66 +43,108 @@ const struct file_operations debugfs_noop_file_operations = { .llseek = noop_llseek, }; +#define F_DENTRY(filp) ((filp)->f_path.dentry) + +const struct file_operations *debugfs_real_fops(const struct file *filp) +{ + struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata; + + if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) { + /* + * Urgh, we've been called w/o a protecting + * debugfs_file_get(). + */ + WARN_ON(1); + return NULL; + } + + return fsd->real_fops; +} +EXPORT_SYMBOL_GPL(debugfs_real_fops); + /** - * debugfs_use_file_start - mark the beginning of file data access + * debugfs_file_get - mark the beginning of file data access * @dentry: the dentry object whose data is being accessed. - * @srcu_idx: a pointer to some memory to store a SRCU index in. * - * Up to a matching call to debugfs_use_file_finish(), any - * successive call into the file removing functions debugfs_remove() - * and debugfs_remove_recursive() will block. Since associated private + * Up to a matching call to debugfs_file_put(), any successive call + * into the file removing functions debugfs_remove() and + * debugfs_remove_recursive() will block. Since associated private * file data may only get freed after a successful return of any of * the removal functions, you may safely access it after a successful - * call to debugfs_use_file_start() without worrying about - * lifetime issues. + * call to debugfs_file_get() without worrying about lifetime issues. * * If -%EIO is returned, the file has already been removed and thus, * it is not safe to access any of its data. If, on the other hand, * it is allowed to access the file data, zero is returned. - * - * Regardless of the return code, any call to - * debugfs_use_file_start() must be followed by a matching call - * to debugfs_use_file_finish(). */ -int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx) - __acquires(&debugfs_srcu) +int debugfs_file_get(struct dentry *dentry) { - *srcu_idx = srcu_read_lock(&debugfs_srcu); - barrier(); + struct debugfs_fsdata *fsd; + void *d_fsd; + + d_fsd = READ_ONCE(dentry->d_fsdata); + if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) { + fsd = d_fsd; + } else { + fsd = kmalloc(sizeof(*fsd), GFP_KERNEL); + if (!fsd) + return -ENOMEM; + + fsd->real_fops = (void *)((unsigned long)d_fsd & + ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); + refcount_set(&fsd->active_users, 1); + init_completion(&fsd->active_users_drained); + if (cmpxchg(&dentry->d_fsdata, d_fsd, fsd) != d_fsd) { + kfree(fsd); + fsd = READ_ONCE(dentry->d_fsdata); + } + } + + /* + * In case of a successful cmpxchg() above, this check is + * strictly necessary and must follow it, see the comment in + * __debugfs_remove_file(). + * OTOH, if the cmpxchg() hasn't been executed or wasn't + * successful, this serves the purpose of not starving + * removers. + */ if (d_unlinked(dentry)) return -EIO; + + if (!refcount_inc_not_zero(&fsd->active_users)) + return -EIO; + return 0; } -EXPORT_SYMBOL_GPL(debugfs_use_file_start); +EXPORT_SYMBOL_GPL(debugfs_file_get); /** - * debugfs_use_file_finish - mark the end of file data access - * @srcu_idx: the SRCU index "created" by a former call to - * debugfs_use_file_start(). + * debugfs_file_put - mark the end of file data access + * @dentry: the dentry object formerly passed to + * debugfs_file_get(). * * Allow any ongoing concurrent call into debugfs_remove() or * debugfs_remove_recursive() blocked by a former call to - * debugfs_use_file_start() to proceed and return to its caller. + * debugfs_file_get() to proceed and return to its caller. */ -void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu) +void debugfs_file_put(struct dentry *dentry) { - srcu_read_unlock(&debugfs_srcu, srcu_idx); -} -EXPORT_SYMBOL_GPL(debugfs_use_file_finish); + struct debugfs_fsdata *fsd = READ_ONCE(dentry->d_fsdata); -#define F_DENTRY(filp) ((filp)->f_path.dentry) + if (refcount_dec_and_test(&fsd->active_users)) + complete(&fsd->active_users_drained); +} +EXPORT_SYMBOL_GPL(debugfs_file_put); static int open_proxy_open(struct inode *inode, struct file *filp) { - const struct dentry *dentry = F_DENTRY(filp); + struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = NULL; - int srcu_idx, r; + int r; - r = debugfs_use_file_start(dentry, &srcu_idx); - if (r) { - r = -ENOENT; - goto out; - } + r = debugfs_file_get(dentry); + if (r) + return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); real_fops = fops_get(real_fops); @@ -124,7 +161,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) r = real_fops->open(inode, filp); out: - debugfs_use_file_finish(srcu_idx); + debugfs_file_put(dentry); return r; } @@ -138,16 +175,16 @@ const struct file_operations debugfs_open_proxy_file_operations = { #define FULL_PROXY_FUNC(name, ret_type, filp, proto, args) \ static ret_type full_proxy_ ## name(proto) \ { \ - const struct dentry *dentry = F_DENTRY(filp); \ - const struct file_operations *real_fops = \ - debugfs_real_fops(filp); \ - int srcu_idx; \ + struct dentry *dentry = F_DENTRY(filp); \ + const struct file_operations *real_fops; \ ret_type r; \ \ - r = debugfs_use_file_start(dentry, &srcu_idx); \ - if (likely(!r)) \ - r = real_fops->name(args); \ - debugfs_use_file_finish(srcu_idx); \ + r = debugfs_file_get(dentry); \ + if (unlikely(r)) \ + return r; \ + real_fops = debugfs_real_fops(filp); \ + r = real_fops->name(args); \ + debugfs_file_put(dentry); \ return r; \ } @@ -172,18 +209,16 @@ FULL_PROXY_FUNC(unlocked_ioctl, long, filp, static unsigned int full_proxy_poll(struct file *filp, struct poll_table_struct *wait) { - const struct dentry *dentry = F_DENTRY(filp); - const struct file_operations *real_fops = debugfs_real_fops(filp); - int srcu_idx; + struct dentry *dentry = F_DENTRY(filp); unsigned int r = 0; + const struct file_operations *real_fops; - if (debugfs_use_file_start(dentry, &srcu_idx)) { - debugfs_use_file_finish(srcu_idx); + if (debugfs_file_get(dentry)) return POLLHUP; - } + real_fops = debugfs_real_fops(filp); r = real_fops->poll(filp, wait); - debugfs_use_file_finish(srcu_idx); + debugfs_file_put(dentry); return r; } @@ -227,16 +262,14 @@ static void __full_proxy_fops_init(struct file_operations *proxy_fops, static int full_proxy_open(struct inode *inode, struct file *filp) { - const struct dentry *dentry = F_DENTRY(filp); + struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = NULL; struct file_operations *proxy_fops = NULL; - int srcu_idx, r; + int r; - r = debugfs_use_file_start(dentry, &srcu_idx); - if (r) { - r = -ENOENT; - goto out; - } + r = debugfs_file_get(dentry); + if (r) + return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); real_fops = fops_get(real_fops); @@ -274,7 +307,7 @@ free_proxy: kfree(proxy_fops); fops_put(real_fops); out: - debugfs_use_file_finish(srcu_idx); + debugfs_file_put(dentry); return r; } @@ -285,13 +318,14 @@ const struct file_operations debugfs_full_proxy_file_operations = { ssize_t debugfs_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { + struct dentry *dentry = F_DENTRY(file); ssize_t ret; - int srcu_idx; - ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); - if (likely(!ret)) - ret = simple_attr_read(file, buf, len, ppos); - debugfs_use_file_finish(srcu_idx); + ret = debugfs_file_get(dentry); + if (unlikely(ret)) + return ret; + ret = simple_attr_read(file, buf, len, ppos); + debugfs_file_put(dentry); return ret; } EXPORT_SYMBOL_GPL(debugfs_attr_read); @@ -299,13 +333,14 @@ EXPORT_SYMBOL_GPL(debugfs_attr_read); ssize_t debugfs_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { + struct dentry *dentry = F_DENTRY(file); ssize_t ret; - int srcu_idx; - ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); - if (likely(!ret)) - ret = simple_attr_write(file, buf, len, ppos); - debugfs_use_file_finish(srcu_idx); + ret = debugfs_file_get(dentry); + if (unlikely(ret)) + return ret; + ret = simple_attr_write(file, buf, len, ppos); + debugfs_file_put(dentry); return ret; } EXPORT_SYMBOL_GPL(debugfs_attr_write); @@ -739,14 +774,14 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, { char buf[3]; bool val; - int r, srcu_idx; + int r; + struct dentry *dentry = F_DENTRY(file); - r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); - if (likely(!r)) - val = *(bool *)file->private_data; - debugfs_use_file_finish(srcu_idx); - if (r) + r = debugfs_file_get(dentry); + if (unlikely(r)) return r; + val = *(bool *)file->private_data; + debugfs_file_put(dentry); if (val) buf[0] = 'Y'; @@ -764,8 +799,9 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, char buf[32]; size_t buf_size; bool bv; - int r, srcu_idx; + int r; bool *val = file->private_data; + struct dentry *dentry = F_DENTRY(file); buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) @@ -773,12 +809,11 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, buf[buf_size] = '\0'; if (strtobool(buf, &bv) == 0) { - r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); - if (likely(!r)) - *val = bv; - debugfs_use_file_finish(srcu_idx); - if (r) + r = debugfs_file_get(dentry); + if (unlikely(r)) return r; + *val = bv; + debugfs_file_put(dentry); } return count; @@ -840,14 +875,15 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct debugfs_blob_wrapper *blob = file->private_data; + struct dentry *dentry = F_DENTRY(file); ssize_t r; - int srcu_idx; - r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); - if (likely(!r)) - r = simple_read_from_buffer(user_buf, count, ppos, blob->data, - blob->size); - debugfs_use_file_finish(srcu_idx); + r = debugfs_file_get(dentry); + if (unlikely(r)) + return r; + r = simple_read_from_buffer(user_buf, count, ppos, blob->data, + blob->size); + debugfs_file_put(dentry); return r; } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index c59f015f386e..63a998c3f252 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -1,16 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* * inode.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * * debugfs is for people to use instead of /proc or /sys. * See ./Documentation/core-api/kernel-api.rst for more details. - * */ #include <linux/module.h> @@ -27,14 +23,11 @@ #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> -#include <linux/srcu.h> #include "internal.h" #define DEBUGFS_DEFAULT_MODE 0700 -DEFINE_SRCU(debugfs_srcu); - static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; @@ -185,6 +178,14 @@ static const struct super_operations debugfs_super_operations = { .evict_inode = debugfs_evict_inode, }; +static void debugfs_release_dentry(struct dentry *dentry) +{ + void *fsd = dentry->d_fsdata; + + if (!((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) + kfree(dentry->d_fsdata); +} + static struct vfsmount *debugfs_automount(struct path *path) { debugfs_automount_t f; @@ -194,6 +195,7 @@ static struct vfsmount *debugfs_automount(struct path *path) static const struct dentry_operations debugfs_dops = { .d_delete = always_delete_dentry, + .d_release = debugfs_release_dentry, .d_automount = debugfs_automount, }; @@ -358,7 +360,8 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, inode->i_private = data; inode->i_fop = proxy_fops; - dentry->d_fsdata = (void *)real_fops; + dentry->d_fsdata = (void *)((unsigned long)real_fops | + DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); @@ -615,18 +618,43 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, } EXPORT_SYMBOL_GPL(debugfs_create_symlink); +static void __debugfs_remove_file(struct dentry *dentry, struct dentry *parent) +{ + struct debugfs_fsdata *fsd; + + simple_unlink(d_inode(parent), dentry); + d_delete(dentry); + + /* + * Paired with the closing smp_mb() implied by a successful + * cmpxchg() in debugfs_file_get(): either + * debugfs_file_get() must see a dead dentry or we must see a + * debugfs_fsdata instance at ->d_fsdata here (or both). + */ + smp_mb(); + fsd = READ_ONCE(dentry->d_fsdata); + if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) + return; + if (!refcount_dec_and_test(&fsd->active_users)) + wait_for_completion(&fsd->active_users_drained); +} + static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) { int ret = 0; if (simple_positive(dentry)) { dget(dentry); - if (d_is_dir(dentry)) - ret = simple_rmdir(d_inode(parent), dentry); - else - simple_unlink(d_inode(parent), dentry); - if (!ret) - d_delete(dentry); + if (!d_is_reg(dentry)) { + if (d_is_dir(dentry)) + ret = simple_rmdir(d_inode(parent), dentry); + else + simple_unlink(d_inode(parent), dentry); + if (!ret) + d_delete(dentry); + } else { + __debugfs_remove_file(dentry, parent); + } dput(dentry); } return ret; @@ -660,8 +688,6 @@ void debugfs_remove(struct dentry *dentry) inode_unlock(d_inode(parent)); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); - - synchronize_srcu(&debugfs_srcu); } EXPORT_SYMBOL_GPL(debugfs_remove); @@ -735,8 +761,6 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); inode_unlock(d_inode(parent)); - - synchronize_srcu(&debugfs_srcu); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index b3e8443a1f47..f0d73d86cc1a 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * internal.h - declarations internal to debugfs * * Copyright (C) 2016 Nicolai Stange <nicstange@gmail.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * */ #ifndef _DEBUGFS_INTERNAL_H_ @@ -19,4 +15,18 @@ extern const struct file_operations debugfs_noop_file_operations; extern const struct file_operations debugfs_open_proxy_file_operations; extern const struct file_operations debugfs_full_proxy_file_operations; +struct debugfs_fsdata { + const struct file_operations *real_fops; + refcount_t active_users; + struct completion active_users_drained; +}; + +/* + * A dentry's ->d_fsdata either points to the real fops or to a + * dynamically allocated debugfs_fsdata instance. + * In order to distinguish between these two cases, a real fops + * pointer gets its lowest bit set. + */ +#define DEBUGFS_FSDATA_IS_REAL_FOPS_BIT BIT(0) + #endif /* _DEBUGFS_INTERNAL_H_ */ diff --git a/fs/direct-io.c b/fs/direct-io.c index b53e66d9abd7..3aafb3343a65 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -497,7 +497,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) + !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); @@ -1152,7 +1152,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { - unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); + unsigned i_blkbits = READ_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 07fed838d8fd..562fa8c3edff 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -181,6 +181,8 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, spin_lock(&dlm_cb_seq_spin); new_seq = ++dlm_cb_seq; + if (!dlm_cb_seq) + new_seq = ++dlm_cb_seq; spin_unlock(&dlm_cb_seq_spin); if (lkb->lkb_flags & DLM_IFL_USER) { diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 7211e826d90d..1270551d24e3 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -282,44 +282,44 @@ static struct configfs_item_operations node_ops = { .release = release_node, }; -static struct config_item_type clusters_type = { +static const struct config_item_type clusters_type = { .ct_group_ops = &clusters_ops, .ct_owner = THIS_MODULE, }; -static struct config_item_type cluster_type = { +static const struct config_item_type cluster_type = { .ct_item_ops = &cluster_ops, .ct_attrs = cluster_attrs, .ct_owner = THIS_MODULE, }; -static struct config_item_type spaces_type = { +static const struct config_item_type spaces_type = { .ct_group_ops = &spaces_ops, .ct_owner = THIS_MODULE, }; -static struct config_item_type space_type = { +static const struct config_item_type space_type = { .ct_item_ops = &space_ops, .ct_owner = THIS_MODULE, }; -static struct config_item_type comms_type = { +static const struct config_item_type comms_type = { .ct_group_ops = &comms_ops, .ct_owner = THIS_MODULE, }; -static struct config_item_type comm_type = { +static const struct config_item_type comm_type = { .ct_item_ops = &comm_ops, .ct_attrs = comm_attrs, .ct_owner = THIS_MODULE, }; -static struct config_item_type nodes_type = { +static const struct config_item_type nodes_type = { .ct_group_ops = &nodes_ops, .ct_owner = THIS_MODULE, }; -static struct config_item_type node_type = { +static const struct config_item_type node_type = { .ct_item_ops = &node_ops, .ct_attrs = node_attrs, .ct_owner = THIS_MODULE, diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index d4aaddec1b16..cc91963683de 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1003,7 +1003,6 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len, if (r->res_master_nodeid == our_nodeid) { log_error(ls, "from_master %d our_master", from_nodeid); dlm_dump_rsb(r); - dlm_send_rcom_lookup_dump(r, from_nodeid); goto out_found; } @@ -2465,14 +2464,12 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) { lkb->lkb_grmode = DLM_LOCK_NL; lkb->lkb_sbflags |= DLM_SBF_DEMOTED; - } else if (!(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { - if (err) - *err = -EDEADLK; - else { - log_print("can_be_granted deadlock %x now %d", - lkb->lkb_id, now); - dlm_dump_rsb(r); - } + } else if (err) { + *err = -EDEADLK; + } else { + log_print("can_be_granted deadlock %x now %d", + lkb->lkb_id, now); + dlm_dump_rsb(r); } goto out; } @@ -2501,13 +2498,6 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, return rv; } -/* FIXME: I don't think that can_be_granted() can/will demote or find deadlock - for locks pending on the convert list. Once verified (watch for these - log_prints), we should be able to just call _can_be_granted() and not - bother with the demote/deadlk cases here (and there's no easy way to deal - with a deadlk here, we'd have to generate something like grant_lock with - the deadlk error.) */ - /* Returns the highest requested mode of all blocked conversions; sets cw if there's a blocked conversion to DLM_LOCK_CW. */ @@ -2545,9 +2535,22 @@ static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw, } if (deadlk) { - log_print("WARN: pending deadlock %x node %d %s", - lkb->lkb_id, lkb->lkb_nodeid, r->res_name); - dlm_dump_rsb(r); + /* + * If DLM_LKB_NODLKWT flag is set and conversion + * deadlock is detected, we request blocking AST and + * down (or cancel) conversion. + */ + if (lkb->lkb_exflags & DLM_LKF_NODLCKWT) { + if (lkb->lkb_highbast < lkb->lkb_rqmode) { + queue_bast(r, lkb, lkb->lkb_rqmode); + lkb->lkb_highbast = lkb->lkb_rqmode; + } + } else { + log_print("WARN: pending deadlock %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, + r->res_name); + dlm_dump_rsb(r); + } continue; } @@ -3123,7 +3126,7 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) deadlock, so we leave it on the granted queue and return EDEADLK in the ast for the convert. */ - if (deadlk) { + if (deadlk && !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { /* it's left on the granted queue */ revert_lock(r, lkb); queue_cast(r, lkb, -EDEADLK); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 4813d0e0cd9b..05707850f93a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -107,11 +107,11 @@ struct connection { unsigned long flags; #define CF_READ_PENDING 1 #define CF_WRITE_PENDING 2 -#define CF_CONNECT_PENDING 3 #define CF_INIT_PENDING 4 #define CF_IS_OTHERCON 5 #define CF_CLOSE 6 #define CF_APP_LIMITED 7 +#define CF_CLOSING 8 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; int (*rx_action) (struct connection *); /* What to do when active */ @@ -124,10 +124,6 @@ struct connection { struct connection *othercon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ - void (*orig_error_report)(struct sock *); - void (*orig_data_ready)(struct sock *); - void (*orig_state_change)(struct sock *); - void (*orig_write_space)(struct sock *); }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -150,6 +146,13 @@ struct dlm_node_addr { struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; }; +static struct listen_sock_callbacks { + void (*sk_error_report)(struct sock *); + void (*sk_data_ready)(struct sock *); + void (*sk_state_change)(struct sock *); + void (*sk_write_space)(struct sock *); +} listen_sock; + static LIST_HEAD(dlm_node_addrs); static DEFINE_SPINLOCK(dlm_node_addrs_spin); @@ -408,17 +411,23 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) /* Data available on socket or listen socket received a connect */ static void lowcomms_data_ready(struct sock *sk) { - struct connection *con = sock2con(sk); + struct connection *con; + + read_lock_bh(&sk->sk_callback_lock); + con = sock2con(sk); if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) queue_work(recv_workqueue, &con->rwork); + read_unlock_bh(&sk->sk_callback_lock); } static void lowcomms_write_space(struct sock *sk) { - struct connection *con = sock2con(sk); + struct connection *con; + read_lock_bh(&sk->sk_callback_lock); + con = sock2con(sk); if (!con) - return; + goto out; clear_bit(SOCK_NOSPACE, &con->sock->flags); @@ -427,16 +436,17 @@ static void lowcomms_write_space(struct sock *sk) clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags); } - if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) - queue_work(send_workqueue, &con->swork); + queue_work(send_workqueue, &con->swork); +out: + read_unlock_bh(&sk->sk_callback_lock); } static inline void lowcomms_connect_sock(struct connection *con) { if (test_bit(CF_CLOSE, &con->flags)) return; - if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) - queue_work(send_workqueue, &con->swork); + queue_work(send_workqueue, &con->swork); + cond_resched(); } static void lowcomms_state_change(struct sock *sk) @@ -480,7 +490,7 @@ static void lowcomms_error_report(struct sock *sk) if (con == NULL) goto out; - orig_report = con->orig_error_report; + orig_report = listen_sock.sk_error_report; if (con->sock == NULL || kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) { printk_ratelimited(KERN_ERR "dlm: node %d: socket error " @@ -517,27 +527,31 @@ out: } /* Note: sk_callback_lock must be locked before calling this function. */ -static void save_callbacks(struct connection *con, struct sock *sk) +static void save_listen_callbacks(struct socket *sock) { - con->orig_data_ready = sk->sk_data_ready; - con->orig_state_change = sk->sk_state_change; - con->orig_write_space = sk->sk_write_space; - con->orig_error_report = sk->sk_error_report; + struct sock *sk = sock->sk; + + listen_sock.sk_data_ready = sk->sk_data_ready; + listen_sock.sk_state_change = sk->sk_state_change; + listen_sock.sk_write_space = sk->sk_write_space; + listen_sock.sk_error_report = sk->sk_error_report; } -static void restore_callbacks(struct connection *con, struct sock *sk) +static void restore_callbacks(struct socket *sock) { + struct sock *sk = sock->sk; + write_lock_bh(&sk->sk_callback_lock); sk->sk_user_data = NULL; - sk->sk_data_ready = con->orig_data_ready; - sk->sk_state_change = con->orig_state_change; - sk->sk_write_space = con->orig_write_space; - sk->sk_error_report = con->orig_error_report; + sk->sk_data_ready = listen_sock.sk_data_ready; + sk->sk_state_change = listen_sock.sk_state_change; + sk->sk_write_space = listen_sock.sk_write_space; + sk->sk_error_report = listen_sock.sk_error_report; write_unlock_bh(&sk->sk_callback_lock); } /* Make a socket active */ -static void add_sock(struct socket *sock, struct connection *con, bool save_cb) +static void add_sock(struct socket *sock, struct connection *con) { struct sock *sk = sock->sk; @@ -545,8 +559,6 @@ static void add_sock(struct socket *sock, struct connection *con, bool save_cb) con->sock = sock; sk->sk_user_data = con; - if (save_cb) - save_callbacks(con, sk); /* Install a data_ready callback */ sk->sk_data_ready = lowcomms_data_ready; sk->sk_write_space = lowcomms_write_space; @@ -579,17 +591,20 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, static void close_connection(struct connection *con, bool and_other, bool tx, bool rx) { - clear_bit(CF_CONNECT_PENDING, &con->flags); - clear_bit(CF_WRITE_PENDING, &con->flags); - if (tx && cancel_work_sync(&con->swork)) + bool closing = test_and_set_bit(CF_CLOSING, &con->flags); + + if (tx && !closing && cancel_work_sync(&con->swork)) { log_print("canceled swork for node %d", con->nodeid); - if (rx && cancel_work_sync(&con->rwork)) + clear_bit(CF_WRITE_PENDING, &con->flags); + } + if (rx && !closing && cancel_work_sync(&con->rwork)) { log_print("canceled rwork for node %d", con->nodeid); + clear_bit(CF_READ_PENDING, &con->flags); + } mutex_lock(&con->sock_mutex); if (con->sock) { - if (!test_bit(CF_IS_OTHERCON, &con->flags)) - restore_callbacks(con, con->sock->sk); + restore_callbacks(con->sock); sock_release(con->sock); con->sock = NULL; } @@ -604,6 +619,7 @@ static void close_connection(struct connection *con, bool and_other, con->retries = 0; mutex_unlock(&con->sock_mutex); + clear_bit(CF_CLOSING, &con->flags); } /* Data received from remote end */ @@ -700,7 +716,7 @@ out_resched: out_close: mutex_unlock(&con->sock_mutex); if (ret != -EAGAIN) { - close_connection(con, false, true, false); + close_connection(con, true, true, false); /* Reconnect when there is something to send */ } /* Don't return success if we really got EOF */ @@ -728,22 +744,14 @@ static int tcp_accept_from_sock(struct connection *con) } mutex_unlock(&connections_lock); - memset(&peeraddr, 0, sizeof(peeraddr)); - result = sock_create_lite(dlm_local_addr[0]->ss_family, - SOCK_STREAM, IPPROTO_TCP, &newsock); - if (result < 0) - return -ENOMEM; - mutex_lock_nested(&con->sock_mutex, 0); - result = -ENOTCONN; - if (con->sock == NULL) - goto accept_err; - - newsock->type = con->sock->type; - newsock->ops = con->sock->ops; + if (!con->sock) { + mutex_unlock(&con->sock_mutex); + return -ENOTCONN; + } - result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK, true); + result = kernel_accept(con->sock, &newsock, O_NONBLOCK); if (result < 0) goto accept_err; @@ -794,31 +802,33 @@ static int tcp_accept_from_sock(struct connection *con) othercon->nodeid = nodeid; othercon->rx_action = receive_from_sock; mutex_init(&othercon->sock_mutex); + INIT_LIST_HEAD(&othercon->writequeue); + spin_lock_init(&othercon->writequeue_lock); INIT_WORK(&othercon->swork, process_send_sockets); INIT_WORK(&othercon->rwork, process_recv_sockets); set_bit(CF_IS_OTHERCON, &othercon->flags); } + mutex_lock_nested(&othercon->sock_mutex, 2); if (!othercon->sock) { newcon->othercon = othercon; - othercon->sock = newsock; - newsock->sk->sk_user_data = othercon; - add_sock(newsock, othercon, false); + add_sock(newsock, othercon); addcon = othercon; + mutex_unlock(&othercon->sock_mutex); } else { printk("Extra connection from node %d attempted\n", nodeid); result = -EAGAIN; + mutex_unlock(&othercon->sock_mutex); mutex_unlock(&newcon->sock_mutex); goto accept_err; } } else { - newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; /* accept copies the sk after we've saved the callbacks, so we don't want to save them a second time or comm errors will result in calling sk_error_report recursively. */ - add_sock(newsock, newcon, false); + add_sock(newsock, newcon); addcon = newcon; } @@ -837,7 +847,8 @@ static int tcp_accept_from_sock(struct connection *con) accept_err: mutex_unlock(&con->sock_mutex); - sock_release(newsock); + if (newsock) + sock_release(newsock); if (result != -EAGAIN) log_print("error accepting connection from node: %d", result); @@ -911,26 +922,28 @@ static int sctp_accept_from_sock(struct connection *con) othercon->nodeid = nodeid; othercon->rx_action = receive_from_sock; mutex_init(&othercon->sock_mutex); + INIT_LIST_HEAD(&othercon->writequeue); + spin_lock_init(&othercon->writequeue_lock); INIT_WORK(&othercon->swork, process_send_sockets); INIT_WORK(&othercon->rwork, process_recv_sockets); set_bit(CF_IS_OTHERCON, &othercon->flags); } + mutex_lock_nested(&othercon->sock_mutex, 2); if (!othercon->sock) { newcon->othercon = othercon; - othercon->sock = newsock; - newsock->sk->sk_user_data = othercon; - add_sock(newsock, othercon, false); + add_sock(newsock, othercon); addcon = othercon; + mutex_unlock(&othercon->sock_mutex); } else { printk("Extra connection from node %d attempted\n", nodeid); ret = -EAGAIN; + mutex_unlock(&othercon->sock_mutex); mutex_unlock(&newcon->sock_mutex); goto accept_err; } } else { - newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; - add_sock(newsock, newcon, false); + add_sock(newsock, newcon); addcon = newcon; } @@ -1055,10 +1068,9 @@ static void sctp_connect_to_sock(struct connection *con) if (result < 0) goto socket_err; - sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; con->connect_action = sctp_connect_to_sock; - add_sock(sock, con, true); + add_sock(sock, con); /* Bind to all addresses. */ if (sctp_bind_addrs(con, 0)) @@ -1079,7 +1091,6 @@ static void sctp_connect_to_sock(struct connection *con) if (result == 0) goto out; - bind_err: con->sock = NULL; sock_release(sock); @@ -1098,14 +1109,12 @@ socket_err: con->retries, result); mutex_unlock(&con->sock_mutex); msleep(1000); - clear_bit(CF_CONNECT_PENDING, &con->flags); lowcomms_connect_sock(con); return; } out: mutex_unlock(&con->sock_mutex); - set_bit(CF_WRITE_PENDING, &con->flags); } /* Connect a new socket to its peer */ @@ -1143,10 +1152,9 @@ static void tcp_connect_to_sock(struct connection *con) goto out_err; } - sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; con->connect_action = tcp_connect_to_sock; - add_sock(sock, con, true); + add_sock(sock, con); /* Bind to our cluster-known address connecting to avoid routing problems */ @@ -1194,13 +1202,11 @@ out_err: con->retries, result); mutex_unlock(&con->sock_mutex); msleep(1000); - clear_bit(CF_CONNECT_PENDING, &con->flags); lowcomms_connect_sock(con); return; } out: mutex_unlock(&con->sock_mutex); - set_bit(CF_WRITE_PENDING, &con->flags); return; } @@ -1235,10 +1241,12 @@ static struct socket *tcp_create_listen_sock(struct connection *con, if (result < 0) { log_print("Failed to set SO_REUSEADDR on socket: %d", result); } + write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = con; - + save_listen_callbacks(sock); con->rx_action = tcp_accept_from_sock; con->connect_action = tcp_connect_to_sock; + write_unlock_bh(&sock->sk->sk_callback_lock); /* Bind to our port */ make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); @@ -1320,6 +1328,7 @@ static int sctp_listen_for_all(void) write_lock_bh(&sock->sk->sk_callback_lock); /* Init con struct */ sock->sk->sk_user_data = con; + save_listen_callbacks(sock); con->sock = sock; con->sock->sk->sk_data_ready = lowcomms_data_ready; con->rx_action = sctp_accept_from_sock; @@ -1366,7 +1375,7 @@ static int tcp_listen_for_all(void) sock = tcp_create_listen_sock(con, dlm_local_addr[0]); if (sock) { - add_sock(sock, con, true); + add_sock(sock, con); result = 0; } else { @@ -1456,9 +1465,7 @@ void dlm_lowcomms_commit_buffer(void *mh) e->len = e->end - e->offset; spin_unlock(&con->writequeue_lock); - if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { - queue_work(send_workqueue, &con->swork); - } + queue_work(send_workqueue, &con->swork); return; out: @@ -1527,13 +1534,16 @@ out: send_error: mutex_unlock(&con->sock_mutex); - close_connection(con, false, false, true); - lowcomms_connect_sock(con); + close_connection(con, true, false, true); + /* Requeue the send work. When the work daemon runs again, it will try + a new connection, then call this function again. */ + queue_work(send_workqueue, &con->swork); return; out_connect: mutex_unlock(&con->sock_mutex); - lowcomms_connect_sock(con); + queue_work(send_workqueue, &con->swork); + cond_resched(); } static void clean_one_writequeue(struct connection *con) @@ -1593,9 +1603,10 @@ static void process_send_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, swork); - if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) + clear_bit(CF_WRITE_PENDING, &con->flags); + if (con->sock == NULL) /* not mutex protected so check it inside too */ con->connect_action(con); - if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags)) + if (!list_empty(&con->writequeue)) send_to_sock(con); } @@ -1632,11 +1643,25 @@ static int work_start(void) return 0; } -static void stop_conn(struct connection *con) +static void _stop_conn(struct connection *con, bool and_other) { - con->flags |= 0x0F; - if (con->sock && con->sock->sk) + mutex_lock(&con->sock_mutex); + set_bit(CF_CLOSE, &con->flags); + set_bit(CF_READ_PENDING, &con->flags); + set_bit(CF_WRITE_PENDING, &con->flags); + if (con->sock && con->sock->sk) { + write_lock_bh(&con->sock->sk->sk_callback_lock); con->sock->sk->sk_user_data = NULL; + write_unlock_bh(&con->sock->sk->sk_callback_lock); + } + if (con->othercon && and_other) + _stop_conn(con->othercon, false); + mutex_unlock(&con->sock_mutex); +} + +static void stop_conn(struct connection *con) +{ + _stop_conn(con, true); } static void free_conn(struct connection *con) @@ -1648,6 +1673,36 @@ static void free_conn(struct connection *con) kmem_cache_free(con_cache, con); } +static void work_flush(void) +{ + int ok; + int i; + struct hlist_node *n; + struct connection *con; + + flush_workqueue(recv_workqueue); + flush_workqueue(send_workqueue); + do { + ok = 1; + foreach_conn(stop_conn); + flush_workqueue(recv_workqueue); + flush_workqueue(send_workqueue); + for (i = 0; i < CONN_HASH_SIZE && ok; i++) { + hlist_for_each_entry_safe(con, n, + &connection_hash[i], list) { + ok &= test_bit(CF_READ_PENDING, &con->flags); + ok &= test_bit(CF_WRITE_PENDING, &con->flags); + if (con->othercon) { + ok &= test_bit(CF_READ_PENDING, + &con->othercon->flags); + ok &= test_bit(CF_WRITE_PENDING, + &con->othercon->flags); + } + } + } + } while (!ok); +} + void dlm_lowcomms_stop(void) { /* Set all the flags to prevent any @@ -1655,11 +1710,10 @@ void dlm_lowcomms_stop(void) */ mutex_lock(&connections_lock); dlm_allow_conn = 0; - foreach_conn(stop_conn); + mutex_unlock(&connections_lock); + work_flush(); clean_writequeues(); foreach_conn(free_conn); - mutex_unlock(&connections_lock); - work_stop(); kmem_cache_destroy(con_cache); diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f3f5e72a29ba..70c625999d36 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -155,6 +155,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) goto out; } +retry: error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, sizeof(struct rcom_status), &rc, &mh); if (error) @@ -169,6 +170,8 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); + if (error == -ETIMEDOUT) + goto retry; if (error) goto out; @@ -276,6 +279,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) ls->ls_recover_nodeid = nodeid; +retry: error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh); if (error) goto out; @@ -288,6 +292,8 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); + if (error == -ETIMEDOUT) + goto retry; out: return error; } @@ -332,25 +338,6 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) return error; } -int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid) -{ - struct dlm_rcom *rc; - struct dlm_mhandle *mh; - struct dlm_ls *ls = r->res_ls; - int error; - - error = create_rcom(ls, to_nodeid, DLM_RCOM_LOOKUP, r->res_length, - &rc, &mh); - if (error) - goto out; - memcpy(rc->rc_buf, r->res_name, r->res_length); - rc->rc_id = 0xFFFFFFFF; - - send_rcom(ls, mh, rc); - out: - return error; -} - static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; @@ -362,6 +349,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) if (error) return; + /* Old code would send this special id to trigger a debug dump. */ if (rc_in->rc_id == 0xFFFFFFFF) { log_error(ls, "receive_rcom_lookup dump from %d", nodeid); dlm_dump_rsb_name(ls, rc_in->rc_buf, len); diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h index f8e243463c15..206723ab744d 100644 --- a/fs/dlm/rcom.h +++ b/fs/dlm/rcom.h @@ -17,7 +17,6 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags); int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); -int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid); int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid); int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in); diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index eaea789bf97d..ce2aa54ca2e2 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -52,6 +52,10 @@ int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)) dlm_config.ci_recover_timer * HZ); if (rv) break; + if (test_bit(LSFL_RCOM_WAIT, &ls->ls_flags)) { + log_debug(ls, "dlm_wait_function timed out"); + return -ETIMEDOUT; + } } if (dlm_recovery_stopped(ls)) { diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 6859b4bf971e..6f4e1d42d733 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -287,11 +287,23 @@ static int dlm_recoverd(void *arg) set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags); wake_up(&ls->ls_recover_lock_wait); - while (!kthread_should_stop()) { + while (1) { + /* + * We call kthread_should_stop() after set_current_state(). + * This is because it works correctly if kthread_stop() is + * called just before set_current_state(). + */ set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + break; + } if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) && - !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) + !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) { + if (kthread_should_stop()) + break; schedule(); + } set_current_state(TASK_RUNNING); if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) { diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index e5e29f8c920b..846ca150d52e 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -36,27 +36,13 @@ #include <linux/scatterlist.h> #include <linux/slab.h> #include <asm/unaligned.h> +#include <linux/kernel.h> #include "ecryptfs_kernel.h" #define DECRYPT 0 #define ENCRYPT 1 /** - * ecryptfs_to_hex - * @dst: Buffer to take hex character representation of contents of - * src; must be at least of size (src_size * 2) - * @src: Buffer to be converted to a hex string representation - * @src_size: number of bytes to convert - */ -void ecryptfs_to_hex(char *dst, char *src, size_t src_size) -{ - int x; - - for (x = 0; x < src_size; x++) - sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]); -} - -/** * ecryptfs_from_hex * @dst: Buffer to take the bytes from src hex; must be at least of * size (src_size / 2) @@ -899,8 +885,7 @@ static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat, u32 flags; flags = get_unaligned_be32(page_virt); - for (i = 0; i < ((sizeof(ecryptfs_flag_map) - / sizeof(struct ecryptfs_flag_map_elem))); i++) + for (i = 0; i < ARRAY_SIZE(ecryptfs_flag_map); i++) if (flags & ecryptfs_flag_map[i].file_flag) { crypt_stat->flags |= ecryptfs_flag_map[i].local_flag; } else @@ -937,8 +922,7 @@ void ecryptfs_write_crypt_stat_flags(char *page_virt, u32 flags = 0; int i; - for (i = 0; i < ((sizeof(ecryptfs_flag_map) - / sizeof(struct ecryptfs_flag_map_elem))); i++) + for (i = 0; i < ARRAY_SIZE(ecryptfs_flag_map); i++) if (crypt_stat->flags & ecryptfs_flag_map[i].local_flag) flags |= ecryptfs_flag_map[i].file_flag; /* Version is in top 8 bits of the 32-bit flag vector */ @@ -1434,8 +1418,6 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER); if (!page_virt) { rc = -ENOMEM; - printk(KERN_ERR "%s: Unable to allocate page_virt\n", - __func__); goto out; } rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size, @@ -1522,9 +1504,6 @@ ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, filename->encrypted_filename = kmalloc(filename->encrypted_filename_size, GFP_KERNEL); if (!filename->encrypted_filename) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc [%zd] bytes\n", __func__, - filename->encrypted_filename_size); rc = -ENOMEM; goto out; } @@ -1669,12 +1648,10 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name, BUG_ON(!mutex_is_locked(&key_tfm_list_mutex)); tmp_tfm = kmem_cache_alloc(ecryptfs_key_tfm_cache, GFP_KERNEL); - if (key_tfm != NULL) + if (key_tfm) (*key_tfm) = tmp_tfm; if (!tmp_tfm) { rc = -ENOMEM; - printk(KERN_ERR "Error attempting to allocate from " - "ecryptfs_key_tfm_cache\n"); goto out; } mutex_init(&tmp_tfm->key_tfm_mutex); @@ -1690,7 +1667,7 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name, "cipher with name = [%s]; rc = [%d]\n", tmp_tfm->cipher_name, rc); kmem_cache_free(ecryptfs_key_tfm_cache, tmp_tfm); - if (key_tfm != NULL) + if (key_tfm) (*key_tfm) = NULL; goto out; } @@ -1881,7 +1858,7 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, size_t src_byte_offset = 0; size_t dst_byte_offset = 0; - if (dst == NULL) { + if (!dst) { (*dst_size) = ecryptfs_max_decoded_size(src_size); goto out; } @@ -1949,9 +1926,6 @@ int ecryptfs_encrypt_and_encode_filename( filename = kzalloc(sizeof(*filename), GFP_KERNEL); if (!filename) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kzalloc [%zd] bytes\n", __func__, - sizeof(*filename)); rc = -ENOMEM; goto out; } @@ -1980,9 +1954,6 @@ int ecryptfs_encrypt_and_encode_filename( + encoded_name_no_prefix_size); (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL); if (!(*encoded_name)) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kzalloc [%zd] bytes\n", __func__, - (*encoded_name_size)); rc = -ENOMEM; kfree(filename->encrypted_filename); kfree(filename); @@ -2064,9 +2035,6 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, name, name_size); decoded_name = kmalloc(decoded_name_size, GFP_KERNEL); if (!decoded_name) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc [%zd] bytes\n", __func__, - decoded_name_size); rc = -ENOMEM; goto out; } diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 3fbc0ff79699..e74cb2a0b299 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -31,6 +31,7 @@ #include <crypto/skcipher.h> #include <keys/user-type.h> #include <keys/encrypted-type.h> +#include <linux/kernel.h> #include <linux/fs.h> #include <linux/fs_stack.h> #include <linux/namei.h> @@ -51,7 +52,13 @@ #define ECRYPTFS_XATTR_NAME "user.ecryptfs" void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok); -extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size); +static inline void +ecryptfs_to_hex(char *dst, char *src, size_t src_size) +{ + char *end = bin2hex(dst, src, src_size); + *end = '\0'; +} + extern void ecryptfs_from_hex(char *dst, char *src, int dst_size); struct ecryptfs_key_record { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index efc2db42d175..847904aa63a9 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -64,7 +64,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque) /* i_size will be overwritten for encrypted regular files */ fsstack_copy_inode_size(inode, lower_inode); inode->i_ino = lower_inode->i_ino; - inode->i_version++; inode->i_mapping->a_ops = &ecryptfs_aops; if (S_ISLNK(inode->i_mode)) @@ -334,9 +333,6 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); if (!dentry_info) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to allocate ecryptfs_dentry_info struct\n", - __func__); dput(lower_dentry); return ERR_PTR(-ENOMEM); } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index fa218cd64f74..c89a58cfc991 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -639,11 +639,9 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, int rc = 0; s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) { - printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " - "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + if (!s) return -ENOMEM; - } + (*packet_size) = 0; rc = ecryptfs_find_auth_tok_for_sig( &auth_tok_key, @@ -687,7 +685,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, * separator, and then the filename */ s->max_packet_size = (ECRYPTFS_TAG_70_MAX_METADATA_SIZE + s->block_aligned_filename_size); - if (dest == NULL) { + if (!dest) { (*packet_size) = s->max_packet_size; goto out_unlock; } @@ -714,9 +712,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, s->block_aligned_filename = kzalloc(s->block_aligned_filename_size, GFP_KERNEL); if (!s->block_aligned_filename) { - printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " - "kzalloc [%zd] bytes\n", __func__, - s->block_aligned_filename_size); rc = -ENOMEM; goto out_unlock; } @@ -769,10 +764,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, s->hash_desc = kmalloc(sizeof(*s->hash_desc) + crypto_shash_descsize(s->hash_tfm), GFP_KERNEL); if (!s->hash_desc) { - printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " - "kmalloc [%zd] bytes\n", __func__, - sizeof(*s->hash_desc) + - crypto_shash_descsize(s->hash_tfm)); rc = -ENOMEM; goto out_release_free_unlock; } @@ -925,11 +916,9 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, (*filename_size) = 0; (*filename) = NULL; s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) { - printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " - "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + if (!s) return -ENOMEM; - } + if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) { printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " "at least [%d]\n", __func__, max_packet_size, @@ -1015,9 +1004,6 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, s->decrypted_filename = kmalloc(s->block_aligned_filename_size, GFP_KERNEL); if (!s->decrypted_filename) { - printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc [%zd] bytes\n", __func__, - s->block_aligned_filename_size); rc = -ENOMEM; goto out_unlock; } @@ -1097,9 +1083,6 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, } (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL); if (!(*filename)) { - printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc [%zd] bytes\n", __func__, - ((*filename_size) + 1)); rc = -ENOMEM; goto out_free_unlock; } @@ -1333,7 +1316,7 @@ parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat, if ((*new_auth_tok)->session_key.encrypted_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { printk(KERN_WARNING "Tag 1 packet contains key larger " - "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES"); + "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n"); rc = -EINVAL; goto out; } @@ -2525,11 +2508,9 @@ int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig) struct ecryptfs_key_sig *new_key_sig; new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL); - if (!new_key_sig) { - printk(KERN_ERR - "Error allocating from ecryptfs_key_sig_cache\n"); + if (!new_key_sig) return -ENOMEM; - } + memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX); new_key_sig->keysig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; /* Caller must hold keysig_list_mutex */ @@ -2545,16 +2526,12 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig, u32 global_auth_tok_flags) { struct ecryptfs_global_auth_tok *new_auth_tok; - int rc = 0; new_auth_tok = kmem_cache_zalloc(ecryptfs_global_auth_tok_cache, GFP_KERNEL); - if (!new_auth_tok) { - rc = -ENOMEM; - printk(KERN_ERR "Error allocating from " - "ecryptfs_global_auth_tok_cache\n"); - goto out; - } + if (!new_auth_tok) + return -ENOMEM; + memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX); new_auth_tok->flags = global_auth_tok_flags; new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; @@ -2562,7 +2539,6 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, list_add(&new_auth_tok->mount_crypt_stat_list, &mount_crypt_stat->global_auth_tok_list); mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); -out: - return rc; + return 0; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 6b801186baa5..025d66a705db 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -426,7 +426,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, mount_crypt_stat->global_default_cipher_key_size); if (!cipher_code) { ecryptfs_printk(KERN_ERR, - "eCryptfs doesn't support cipher: %s", + "eCryptfs doesn't support cipher: %s\n", mount_crypt_stat->global_default_cipher_name); rc = -EINVAL; goto out; @@ -560,8 +560,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags * Set the POSIX ACL flag based on whether they're enabled in the lower * mount. */ - s->s_flags = flags & ~MS_POSIXACL; - s->s_flags |= path.dentry->d_sb->s_flags & MS_POSIXACL; + s->s_flags = flags & ~SB_POSIXACL; + s->s_flags |= path.dentry->d_sb->s_flags & SB_POSIXACL; /** * Force a read-only eCryptfs mount when: @@ -569,7 +569,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags * 2) The ecryptfs_encrypted_view mount option is specified */ if (sb_rdonly(path.dentry->d_sb) || mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; @@ -602,7 +602,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags ecryptfs_set_dentry_private(s->s_root, root_info); root_info->lower_path = path; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; return dget(s->s_root); out_free: @@ -660,7 +660,7 @@ static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; - unsigned long flags; + slab_flags_t flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { @@ -781,7 +781,7 @@ static struct attribute *attributes[] = { NULL, }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = attributes, }; diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 286f10b0363b..9fdd5bcf4564 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -147,8 +147,6 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file) (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); if (!(*daemon)) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " - "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); goto out; } (*daemon)->file = file; @@ -250,8 +248,6 @@ int ecryptfs_process_response(struct ecryptfs_daemon *daemon, msg_ctx->msg = kmemdup(msg, msg_size, GFP_KERNEL); if (!msg_ctx->msg) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " - "GFP_KERNEL memory\n", __func__, msg_size); goto unlock; } msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE; @@ -386,7 +382,6 @@ int __init ecryptfs_init_messaging(void) GFP_KERNEL); if (!ecryptfs_daemon_hash) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); mutex_unlock(&ecryptfs_daemon_hash_mux); goto out; } @@ -398,7 +393,6 @@ int __init ecryptfs_init_messaging(void) GFP_KERNEL); if (!ecryptfs_msg_ctx_arr) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); goto out; } mutex_init(&ecryptfs_msg_ctx_lists_mux); @@ -442,15 +436,16 @@ void ecryptfs_release_messaging(void) } if (ecryptfs_daemon_hash) { struct ecryptfs_daemon *daemon; + struct hlist_node *n; int i; mutex_lock(&ecryptfs_daemon_hash_mux); for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { int rc; - hlist_for_each_entry(daemon, - &ecryptfs_daemon_hash[i], - euid_chain) { + hlist_for_each_entry_safe(daemon, n, + &ecryptfs_daemon_hash[i], + euid_chain) { rc = ecryptfs_exorcise_daemon(daemon); if (rc) printk(KERN_ERR "%s: Error whilst " diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index e4141f257495..f09cacaf8c80 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -163,12 +163,8 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, struct ecryptfs_message *msg; msg = kmalloc((sizeof(*msg) + data_size), GFP_KERNEL); - if (!msg) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc(%zd, GFP_KERNEL)\n", __func__, - (sizeof(*msg) + data_size)); + if (!msg) return -ENOMEM; - } mutex_lock(&msg_ctx->mux); msg_ctx->msg = msg; @@ -383,7 +379,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, goto memdup; } else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) { printk(KERN_WARNING "%s: Acceptable packet size range is " - "[%d-%zu], but amount of data written is [%zu].", + "[%d-%zu], but amount of data written is [%zu].\n", __func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count); return -EINVAL; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 1f0c471b4ba3..cdf358b209d9 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -431,8 +431,6 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) } xattr_virt = kmem_cache_alloc(ecryptfs_xattr_cache, GFP_KERNEL); if (!xattr_virt) { - printk(KERN_ERR "Out of memory whilst attempting to write " - "inode size to xattr\n"); rc = -ENOMEM; goto out; } diff --git a/fs/efs/super.c b/fs/efs/super.c index 65b59009555b..6ffb7ba1547a 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -116,7 +116,7 @@ static void destroy_inodecache(void) static int efs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -311,7 +311,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) #ifdef DEBUG pr_info("forcing read-only mode\n"); #endif - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; } s->s_op = &efs_superblock_operations; s->s_export_op = &efs_export_ops; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea..afd548ebc328 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -276,12 +276,6 @@ static DEFINE_MUTEX(epmutex); /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls poll_loop_ncalls; -/* Used for safe wake up implementation */ -static struct nested_calls poll_safewake_ncalls; - -/* Used to call file's f_op->poll() under the nested calls boundaries */ -static struct nested_calls poll_readywalk_ncalls; - /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __read_mostly; @@ -551,40 +545,21 @@ out_unlock: * this special case of epoll. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, - unsigned long events, int subclass) + +static struct nested_calls poll_safewake_ncalls; + +static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) { unsigned long flags; + wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie; - spin_lock_irqsave_nested(&wqueue->lock, flags, subclass); - wake_up_locked_poll(wqueue, events); + spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1); + wake_up_locked_poll(wqueue, POLLIN); spin_unlock_irqrestore(&wqueue->lock, flags); -} -#else -static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, - unsigned long events, int subclass) -{ - wake_up_poll(wqueue, events); -} -#endif -static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) -{ - ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN, - 1 + call_nests); return 0; } -/* - * Perform a safe wake up of the poll wait list. The problem is that - * with the new callback'd wake up system, it is possible that the - * poll callback is reentered from inside the call to wake_up() done - * on the poll wait queue head. The rule is that we cannot reenter the - * wake up code from the same task more than EP_MAX_NESTS times, - * and we cannot reenter the same wait queue head at all. This will - * enable to have a hierarchy of epoll file descriptor of no more than - * EP_MAX_NESTS deep. - */ static void ep_poll_safewake(wait_queue_head_t *wq) { int this_cpu = get_cpu(); @@ -595,6 +570,15 @@ static void ep_poll_safewake(wait_queue_head_t *wq) put_cpu(); } +#else + +static void ep_poll_safewake(wait_queue_head_t *wq) +{ + wake_up_poll(wq, POLLIN); +} + +#endif + static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; @@ -880,11 +864,33 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } -static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt) +static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv); +static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, + poll_table *pt); + +/* + * Differs from ep_eventpoll_poll() in that internal callers already have + * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() + * is correctly annotated. + */ +static unsigned int ep_item_poll(struct epitem *epi, poll_table *pt, int depth) { + struct eventpoll *ep; + bool locked; + pt->_key = epi->event.events; + if (!is_file_epoll(epi->ffd.file)) + return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & + epi->event.events; + + ep = epi->ffd.file->private_data; + poll_wait(epi->ffd.file, &ep->poll_wait, pt); + locked = pt && (pt->_qproc == ep_ptable_queue_proc); - return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events; + return ep_scan_ready_list(epi->ffd.file->private_data, + ep_read_events_proc, &depth, depth, + locked) & epi->event.events; } static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, @@ -892,13 +898,15 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, { struct epitem *epi, *tmp; poll_table pt; + int depth = *(int *)priv; init_poll_funcptr(&pt, NULL); + depth++; list_for_each_entry_safe(epi, tmp, head, rdllink) { - if (ep_item_poll(epi, &pt)) + if (ep_item_poll(epi, &pt, depth)) { return POLLIN | POLLRDNORM; - else { + } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as @@ -912,48 +920,20 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, return 0; } -static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, - poll_table *pt); - -struct readyevents_arg { - struct eventpoll *ep; - bool locked; -}; - -static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) -{ - struct readyevents_arg *arg = priv; - - return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL, - call_nests + 1, arg->locked); -} - static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) { - int pollflags; struct eventpoll *ep = file->private_data; - struct readyevents_arg arg; - - /* - * During ep_insert() we already hold the ep->mtx for the tfile. - * Prevent re-aquisition. - */ - arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc); - arg.ep = ep; + int depth = 0; /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside - * the ready list. This need to be done under ep_call_nested() - * supervision, since the call to f_op->poll() done on listed files - * could re-enter here. + * the ready list. */ - pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, - ep_poll_readyevents_proc, &arg, ep, current); - - return pollflags != -1 ? pollflags : 0; + return ep_scan_ready_list(ep, ep_read_events_proc, + &depth, depth, false); } #ifdef CONFIG_PROC_FS @@ -1472,7 +1452,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * this operation completes, the poll callback can start hitting * the new item. */ - revents = ep_item_poll(epi, &epq.pt); + revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue @@ -1606,7 +1586,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. */ - revents = ep_item_poll(epi, &pt); + revents = ep_item_poll(epi, &pt, 1); /* * If the item is "hot" and it is not registered inside the ready @@ -1674,7 +1654,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, list_del_init(&epi->rdllink); - revents = ep_item_poll(epi, &pt); + revents = ep_item_poll(epi, &pt, 1); /* * If the event mask intersect the caller-requested one, @@ -2259,7 +2239,6 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, compat_size_t, sigsetsize) { long err; - compat_sigset_t csigmask; sigset_t ksigmask, sigsaved; /* @@ -2269,9 +2248,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, if (sigmask) { if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) + if (get_compat_sigset(&ksigmask, sigmask)) return -EFAULT; - sigset_from_compat(&ksigmask, &csigmask); sigsaved = current->blocked; set_current_blocked(&ksigmask); } @@ -2315,11 +2293,10 @@ static int __init eventpoll_init(void) */ ep_nested_calls_init(&poll_loop_ncalls); +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); - - /* Initialize the structure used to perform file's f_op->poll() calls */ - ep_nested_calls_init(&poll_readywalk_ncalls); +#endif /* * We can have many thousands of epitems, so prevent this from @@ -2329,11 +2306,11 @@ static int __init eventpoll_init(void) /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", - sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL); + sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); return 0; } diff --git a/fs/exec.c b/fs/exec.c index 3e14ba25f678..7eb8d21bcab9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1216,15 +1216,14 @@ killed: return -EAGAIN; } -char *get_task_comm(char *buf, struct task_struct *tsk) +char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { - /* buf must be at least sizeof(tsk->comm) in size */ task_lock(tsk); - strncpy(buf, tsk->comm, sizeof(tsk->comm)); + strncpy(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } -EXPORT_SYMBOL_GPL(get_task_comm); +EXPORT_SYMBOL_GPL(__get_task_comm); /* * These functions flushes out all traces of the currently running executable @@ -1350,9 +1349,14 @@ void setup_new_exec(struct linux_binprm * bprm) current->sas_ss_sp = current->sas_ss_size = 0; - /* Figure out dumpability. */ + /* + * Figure out dumpability. Note that this checking only of current + * is wrong, but userspace depends on it. This should be testing + * bprm->secureexec instead. + */ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || - bprm->secureexec) + !(uid_eq(current_euid(), current_uid()) && + gid_eq(current_egid(), current_gid()))) set_dumpable(current->mm, suid_dumpable); else set_dumpable(current->mm, SUID_DUMP_USER); @@ -1911,7 +1915,7 @@ void set_dumpable(struct mm_struct *mm, int value) return; do { - old = ACCESS_ONCE(mm->flags); + old = READ_ONCE(mm->flags); new = (old & ~MMF_DUMPABLE_MASK) | value; } while (cmpxchg(&mm->flags, old, new) != old); } diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index e1b3724bebf2..33db13365c5e 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -548,7 +548,7 @@ do_more: } mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); group_adjust_blocks(sb, block_group, desc, bh2, group_freed); @@ -1424,7 +1424,7 @@ allocated: percpu_counter_sub(&sbi->s_freeblocks_counter, num); mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); *errp = 0; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index c67b486488fd..2da67699dc33 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -100,7 +100,7 @@ static int ext2_dax_fault(struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index a1fc3dabca41..6484199b35d1 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -145,7 +145,7 @@ void ext2_free_inode (struct inode * inode) else ext2_release_inode(sb, block_group, is_directory); mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); brelse(bitmap_bh); @@ -517,7 +517,7 @@ repeat_in_this_group: goto fail; got: mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); brelse(bitmap_bh); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 1442a4c734c8..9b2ac55ac34f 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -821,11 +821,11 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (ret == 0) { iomap->type = IOMAP_HOLE; - iomap->blkno = IOMAP_NULL_BLOCK; + iomap->addr = IOMAP_NULL_ADDR; iomap->length = 1 << blkbits; } else { iomap->type = IOMAP_MAPPED; - iomap->blkno = (sector_t)bno << (blkbits - 9); + iomap->addr = (u64)bno << blkbits; iomap->length = (u64)ret << blkbits; iomap->flags |= IOMAP_F_MERGED; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 1458706bd2ec..7646818ab266 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -75,7 +75,7 @@ void ext2_error(struct super_block *sb, const char *function, if (test_opt(sb, ERRORS_RO)) { ext2_msg(sb, KERN_CRIT, "error: remounting filesystem read-only"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } } @@ -479,10 +479,10 @@ static const match_table_t tokens = { {Opt_err, NULL} }; -static int parse_options(char *options, struct super_block *sb) +static int parse_options(char *options, struct super_block *sb, + struct ext2_mount_options *opts) { char *p; - struct ext2_sb_info *sbi = EXT2_SB(sb); substring_t args[MAX_OPT_ARGS]; int option; kuid_t uid; @@ -499,16 +499,16 @@ static int parse_options(char *options, struct super_block *sb) token = match_token(p, tokens, args); switch (token) { case Opt_bsd_df: - clear_opt (sbi->s_mount_opt, MINIX_DF); + clear_opt (opts->s_mount_opt, MINIX_DF); break; case Opt_minix_df: - set_opt (sbi->s_mount_opt, MINIX_DF); + set_opt (opts->s_mount_opt, MINIX_DF); break; case Opt_grpid: - set_opt (sbi->s_mount_opt, GRPID); + set_opt (opts->s_mount_opt, GRPID); break; case Opt_nogrpid: - clear_opt (sbi->s_mount_opt, GRPID); + clear_opt (opts->s_mount_opt, GRPID); break; case Opt_resuid: if (match_int(&args[0], &option)) @@ -519,7 +519,7 @@ static int parse_options(char *options, struct super_block *sb) return 0; } - sbi->s_resuid = uid; + opts->s_resuid = uid; break; case Opt_resgid: if (match_int(&args[0], &option)) @@ -529,51 +529,51 @@ static int parse_options(char *options, struct super_block *sb) ext2_msg(sb, KERN_ERR, "Invalid gid value %d", option); return 0; } - sbi->s_resgid = gid; + opts->s_resgid = gid; break; case Opt_sb: /* handled by get_sb_block() instead of here */ /* *sb_block = match_int(&args[0]); */ break; case Opt_err_panic: - clear_opt (sbi->s_mount_opt, ERRORS_CONT); - clear_opt (sbi->s_mount_opt, ERRORS_RO); - set_opt (sbi->s_mount_opt, ERRORS_PANIC); + clear_opt (opts->s_mount_opt, ERRORS_CONT); + clear_opt (opts->s_mount_opt, ERRORS_RO); + set_opt (opts->s_mount_opt, ERRORS_PANIC); break; case Opt_err_ro: - clear_opt (sbi->s_mount_opt, ERRORS_CONT); - clear_opt (sbi->s_mount_opt, ERRORS_PANIC); - set_opt (sbi->s_mount_opt, ERRORS_RO); + clear_opt (opts->s_mount_opt, ERRORS_CONT); + clear_opt (opts->s_mount_opt, ERRORS_PANIC); + set_opt (opts->s_mount_opt, ERRORS_RO); break; case Opt_err_cont: - clear_opt (sbi->s_mount_opt, ERRORS_RO); - clear_opt (sbi->s_mount_opt, ERRORS_PANIC); - set_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (opts->s_mount_opt, ERRORS_RO); + clear_opt (opts->s_mount_opt, ERRORS_PANIC); + set_opt (opts->s_mount_opt, ERRORS_CONT); break; case Opt_nouid32: - set_opt (sbi->s_mount_opt, NO_UID32); + set_opt (opts->s_mount_opt, NO_UID32); break; case Opt_nocheck: - clear_opt (sbi->s_mount_opt, CHECK); + clear_opt (opts->s_mount_opt, CHECK); break; case Opt_debug: - set_opt (sbi->s_mount_opt, DEBUG); + set_opt (opts->s_mount_opt, DEBUG); break; case Opt_oldalloc: - set_opt (sbi->s_mount_opt, OLDALLOC); + set_opt (opts->s_mount_opt, OLDALLOC); break; case Opt_orlov: - clear_opt (sbi->s_mount_opt, OLDALLOC); + clear_opt (opts->s_mount_opt, OLDALLOC); break; case Opt_nobh: - set_opt (sbi->s_mount_opt, NOBH); + set_opt (opts->s_mount_opt, NOBH); break; #ifdef CONFIG_EXT2_FS_XATTR case Opt_user_xattr: - set_opt (sbi->s_mount_opt, XATTR_USER); + set_opt (opts->s_mount_opt, XATTR_USER); break; case Opt_nouser_xattr: - clear_opt (sbi->s_mount_opt, XATTR_USER); + clear_opt (opts->s_mount_opt, XATTR_USER); break; #else case Opt_user_xattr: @@ -584,10 +584,10 @@ static int parse_options(char *options, struct super_block *sb) #endif #ifdef CONFIG_EXT2_FS_POSIX_ACL case Opt_acl: - set_opt(sbi->s_mount_opt, POSIX_ACL); + set_opt(opts->s_mount_opt, POSIX_ACL); break; case Opt_noacl: - clear_opt(sbi->s_mount_opt, POSIX_ACL); + clear_opt(opts->s_mount_opt, POSIX_ACL); break; #else case Opt_acl: @@ -598,13 +598,13 @@ static int parse_options(char *options, struct super_block *sb) #endif case Opt_xip: ext2_msg(sb, KERN_INFO, "use dax instead of xip"); - set_opt(sbi->s_mount_opt, XIP); + set_opt(opts->s_mount_opt, XIP); /* Fall through */ case Opt_dax: #ifdef CONFIG_FS_DAX ext2_msg(sb, KERN_WARNING, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - set_opt(sbi->s_mount_opt, DAX); + set_opt(opts->s_mount_opt, DAX); #else ext2_msg(sb, KERN_INFO, "dax option not supported"); #endif @@ -613,11 +613,11 @@ static int parse_options(char *options, struct super_block *sb) #if defined(CONFIG_QUOTA) case Opt_quota: case Opt_usrquota: - set_opt(sbi->s_mount_opt, USRQUOTA); + set_opt(opts->s_mount_opt, USRQUOTA); break; case Opt_grpquota: - set_opt(sbi->s_mount_opt, GRPQUOTA); + set_opt(opts->s_mount_opt, GRPQUOTA); break; #else case Opt_quota: @@ -629,11 +629,11 @@ static int parse_options(char *options, struct super_block *sb) #endif case Opt_reservation: - set_opt(sbi->s_mount_opt, RESERVATION); + set_opt(opts->s_mount_opt, RESERVATION); ext2_msg(sb, KERN_INFO, "reservations ON"); break; case Opt_noreservation: - clear_opt(sbi->s_mount_opt, RESERVATION); + clear_opt(opts->s_mount_opt, RESERVATION); ext2_msg(sb, KERN_INFO, "reservations OFF"); break; case Opt_ignore: @@ -656,7 +656,7 @@ static int ext2_setup_super (struct super_block * sb, ext2_msg(sb, KERN_ERR, "error: revision level too high, " "forcing read-only mode"); - res = MS_RDONLY; + res = SB_RDONLY; } if (read_only) return res; @@ -830,6 +830,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) int i, j; __le32 features; int err; + struct ext2_mount_options opts; err = -ENOMEM; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); @@ -890,38 +891,42 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); if (def_mount_opts & EXT2_DEFM_DEBUG) - set_opt(sbi->s_mount_opt, DEBUG); + set_opt(opts.s_mount_opt, DEBUG); if (def_mount_opts & EXT2_DEFM_BSDGROUPS) - set_opt(sbi->s_mount_opt, GRPID); + set_opt(opts.s_mount_opt, GRPID); if (def_mount_opts & EXT2_DEFM_UID16) - set_opt(sbi->s_mount_opt, NO_UID32); + set_opt(opts.s_mount_opt, NO_UID32); #ifdef CONFIG_EXT2_FS_XATTR if (def_mount_opts & EXT2_DEFM_XATTR_USER) - set_opt(sbi->s_mount_opt, XATTR_USER); + set_opt(opts.s_mount_opt, XATTR_USER); #endif #ifdef CONFIG_EXT2_FS_POSIX_ACL if (def_mount_opts & EXT2_DEFM_ACL) - set_opt(sbi->s_mount_opt, POSIX_ACL); + set_opt(opts.s_mount_opt, POSIX_ACL); #endif if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC) - set_opt(sbi->s_mount_opt, ERRORS_PANIC); + set_opt(opts.s_mount_opt, ERRORS_PANIC); else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_CONTINUE) - set_opt(sbi->s_mount_opt, ERRORS_CONT); + set_opt(opts.s_mount_opt, ERRORS_CONT); else - set_opt(sbi->s_mount_opt, ERRORS_RO); + set_opt(opts.s_mount_opt, ERRORS_RO); - sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); - sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); + opts.s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + opts.s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); - set_opt(sbi->s_mount_opt, RESERVATION); + set_opt(opts.s_mount_opt, RESERVATION); - if (!parse_options((char *) data, sb)) + if (!parse_options((char *) data, sb, &opts)) goto failed_mount; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sbi->s_mount_opt = opts.s_mount_opt; + sbi->s_resuid = opts.s_resuid; + sbi->s_resgid = opts.s_resgid; + + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? - MS_POSIXACL : 0); + SB_POSIXACL : 0); sb->s_iflags |= SB_I_CGROUPWB; if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && @@ -1173,7 +1178,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_msg(sb, KERN_WARNING, "warning: mounting ext3 filesystem as ext2"); if (ext2_setup_super (sb, es, sb_rdonly(sb))) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ext2_write_super(sb); return 0; @@ -1312,46 +1317,36 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) { struct ext2_sb_info * sbi = EXT2_SB(sb); struct ext2_super_block * es; - struct ext2_mount_options old_opts; - unsigned long old_sb_flags; + struct ext2_mount_options new_opts; int err; sync_filesystem(sb); - spin_lock(&sbi->s_lock); - /* Store the old options */ - old_sb_flags = sb->s_flags; - old_opts.s_mount_opt = sbi->s_mount_opt; - old_opts.s_resuid = sbi->s_resuid; - old_opts.s_resgid = sbi->s_resgid; + spin_lock(&sbi->s_lock); + new_opts.s_mount_opt = sbi->s_mount_opt; + new_opts.s_resuid = sbi->s_resuid; + new_opts.s_resgid = sbi->s_resgid; + spin_unlock(&sbi->s_lock); /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb)) { - err = -EINVAL; - goto restore_opts; - } - - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + if (!parse_options(data, sb, &new_opts)) + return -EINVAL; + spin_lock(&sbi->s_lock); es = sbi->s_es; - if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) { + if ((sbi->s_mount_opt ^ new_opts.s_mount_opt) & EXT2_MOUNT_DAX) { ext2_msg(sb, KERN_WARNING, "warning: refusing change of " "dax flag with busy inodes while remounting"); - sbi->s_mount_opt ^= EXT2_MOUNT_DAX; - } - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) { - spin_unlock(&sbi->s_lock); - return 0; + new_opts.s_mount_opt ^= EXT2_MOUNT_DAX; } - if (*flags & MS_RDONLY) { + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) + goto out_set; + if (*flags & SB_RDONLY) { if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || - !(sbi->s_mount_state & EXT2_VALID_FS)) { - spin_unlock(&sbi->s_lock); - return 0; - } + !(sbi->s_mount_state & EXT2_VALID_FS)) + goto out_set; /* * OK, we are remounting a valid rw partition rdonly, so set @@ -1362,22 +1357,20 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) spin_unlock(&sbi->s_lock); err = dquot_suspend(sb, -1); - if (err < 0) { - spin_lock(&sbi->s_lock); - goto restore_opts; - } + if (err < 0) + return err; ext2_sync_super(sb, es, 1); } else { __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP); if (ret) { + spin_unlock(&sbi->s_lock); ext2_msg(sb, KERN_WARNING, "warning: couldn't remount RDWR because of " "unsupported optional features (%x).", le32_to_cpu(ret)); - err = -EROFS; - goto restore_opts; + return -EROFS; } /* * Mounting a RDONLY partition read-write, so reread and @@ -1386,7 +1379,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) */ sbi->s_mount_state = le16_to_cpu(es->s_state); if (!ext2_setup_super (sb, es, 0)) - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; spin_unlock(&sbi->s_lock); ext2_write_super(sb); @@ -1394,14 +1387,16 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) dquot_resume(sb, -1); } - return 0; -restore_opts: - sbi->s_mount_opt = old_opts.s_mount_opt; - sbi->s_resuid = old_opts.s_resuid; - sbi->s_resgid = old_opts.s_resgid; - sb->s_flags = old_sb_flags; + spin_lock(&sbi->s_lock); +out_set: + sbi->s_mount_opt = new_opts.s_mount_opt; + sbi->s_resuid = new_opts.s_resuid; + sbi->s_resgid = new_opts.s_resgid; + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? SB_POSIXACL : 0); spin_unlock(&sbi->s_lock); - return err; + + return 0; } static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index e38039fd96ff..73b850f5659c 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -37,6 +37,7 @@ config EXT4_FS select CRC16 select CRYPTO select CRYPTO_CRC32C + select FS_IOMAP help This is the next generation of the ext3 filesystem. diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index d5ddfb96c83c..a943e568292e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -601,22 +601,21 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi, * ext4_should_retry_alloc() is called when ENOSPC is returned, and if * it is profitable to retry the operation, this function will wait * for the current or committing transaction to complete, and then - * return TRUE. - * - * if the total number of retries exceed three times, return FALSE. + * return TRUE. We will only retry once. */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) || - (*retries)++ > 3 || + (*retries)++ > 1 || !EXT4_SB(sb)->s_journal) return 0; - jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); - smp_mb(); - if (EXT4_SB(sb)->s_mb_free_pending) - jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); + if (EXT4_SB(sb)->s_mb_free_pending == 0) + return 0; + + jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); + jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); return 1; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 58a0304566db..4e091eae38b1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -34,17 +34,15 @@ #include <linux/percpu_counter.h> #include <linux/ratelimit.h> #include <crypto/hash.h> -#ifdef CONFIG_EXT4_FS_ENCRYPTION -#include <linux/fscrypt_supp.h> -#else -#include <linux/fscrypt_notsupp.h> -#endif #include <linux/falloc.h> #include <linux/percpu-rwsem.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif +#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION) +#include <linux/fscrypt.h> + /* * The fourth extended filesystem constants/structures */ @@ -546,8 +544,8 @@ struct ext4_new_group_data { __u64 inode_table; __u32 blocks_count; __u16 reserved_blocks; - __u16 unused; - __u32 free_blocks_count; + __u16 mdata_blocks; + __u32 free_clusters_count; }; /* Indexes used to index group tables in ext4_new_group_data */ @@ -645,43 +643,6 @@ enum { #define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT #define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY -#ifndef FS_IOC_FSGETXATTR -/* Until the uapi changes get merged for project quota... */ - -#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) -#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) - -/* - * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. - */ -struct fsxattr { - __u32 fsx_xflags; /* xflags field value (get/set) */ - __u32 fsx_extsize; /* extsize field value (get/set)*/ - __u32 fsx_nextents; /* nextents field value (get) */ - __u32 fsx_projid; /* project identifier (get/set) */ - unsigned char fsx_pad[12]; -}; - -/* - * Flags for the fsx_xflags field - */ -#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ -#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ -#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ -#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ -#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ -#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ -#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ -#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ -#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ -#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ -#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ -#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ -#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ -#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ -#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ -#endif /* !defined(FS_IOC_FSGETXATTR) */ - #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR @@ -1393,8 +1354,6 @@ struct ext4_sb_info { int s_first_ino; unsigned int s_inode_readahead_blks; unsigned int s_inode_goal; - spinlock_t s_next_gen_lock; - u32 s_next_generation; u32 s_hash_seed[4]; int s_def_hash_version; int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ @@ -2516,9 +2475,6 @@ extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); -extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, - unsigned int map_len, - struct extent_status *result); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, @@ -3049,6 +3005,10 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, extern int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int *has_inline, __u64 start, __u64 len); + +struct iomap; +extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); + extern int ext4_try_to_evict_inline_data(handle_t *handle, struct inode *inode, int needed); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 97f0fd06728d..c941251ac0c0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4722,6 +4722,7 @@ retry: EXT4_INODE_EOFBLOCKS); } ext4_mark_inode_dirty(handle, inode); + ext4_update_inode_fsync_trans(handle, inode, 1); ret2 = ext4_journal_stop(handle); if (ret2) break; @@ -4794,7 +4795,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, } if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { + (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) @@ -4965,7 +4967,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) } if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { + (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 5cb9aa3ad249..a0ae27b1bc66 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -21,12 +21,14 @@ #include <linux/time.h> #include <linux/fs.h> +#include <linux/iomap.h> #include <linux/mount.h> #include <linux/path.h> #include <linux/dax.h> #include <linux/quotaops.h> #include <linux/pagevec.h> #include <linux/uio.h> +#include <linux/mman.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -296,6 +298,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); + pfn_t pfn; if (write) { sb_start_pagefault(sb); @@ -303,16 +306,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); + if (IS_ERR(handle)) { + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); + return VM_FAULT_SIGBUS; + } } else { down_read(&EXT4_I(inode)->i_mmap_sem); } - if (!IS_ERR(handle)) - result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); - else - result = VM_FAULT_SIGBUS; + result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops); if (write) { - if (!IS_ERR(handle)) - ext4_journal_stop(handle); + ext4_journal_stop(handle); + /* Handling synchronous page fault? */ + if (result & VM_FAULT_NEEDDSYNC) + result = dax_finish_sync_fault(vmf, pe_size, pfn); up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); } else { @@ -350,6 +357,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; + /* + * We don't support synchronous mappings for non-DAX files. At least + * until someone comes with a sensible use case. + */ + if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC)) + return -EOPNOTSUPP; + file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -365,7 +379,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp) struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct vfsmount *mnt = filp->f_path.mnt; - struct dentry *dir; struct path path; char buf[64], *cp; int ret; @@ -405,25 +418,11 @@ static int ext4_file_open(struct inode * inode, struct file * filp) ext4_journal_stop(handle); } } - if (ext4_encrypted_inode(inode)) { - ret = fscrypt_get_encryption_info(inode); - if (ret) - return -EACCES; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } - dir = dget_parent(file_dentry(filp)); - if (ext4_encrypted_inode(d_inode(dir)) && - !fscrypt_has_permitted_context(d_inode(dir), inode)) { - ext4_warning(inode->i_sb, - "Inconsistent encryption contexts: %lu/%lu", - (unsigned long) d_inode(dir)->i_ino, - (unsigned long) inode->i_ino); - dput(dir); - return -EPERM; - } - dput(dir); + ret = fscrypt_file_open(inode, filp); + if (ret) + return ret; + /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present @@ -439,248 +438,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp) } /* - * Here we use ext4_map_blocks() to get a block mapping for a extent-based - * file rather than ext4_ext_walk_space() because we can introduce - * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same - * function. When extent status tree has been fully implemented, it will - * track all extent status for a file and we can directly use it to - * retrieve the offset for SEEK_DATA/SEEK_HOLE. - */ - -/* - * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to - * lookup page cache to check whether or not there has some data between - * [startoff, endoff] because, if this range contains an unwritten extent, - * we determine this extent as a data or a hole according to whether the - * page cache has data or not. - */ -static int ext4_find_unwritten_pgoff(struct inode *inode, - int whence, - ext4_lblk_t end_blk, - loff_t *offset) -{ - struct pagevec pvec; - unsigned int blkbits; - pgoff_t index; - pgoff_t end; - loff_t endoff; - loff_t startoff; - loff_t lastoff; - int found = 0; - - blkbits = inode->i_sb->s_blocksize_bits; - startoff = *offset; - lastoff = startoff; - endoff = (loff_t)end_blk << blkbits; - - index = startoff >> PAGE_SHIFT; - end = (endoff - 1) >> PAGE_SHIFT; - - pagevec_init(&pvec, 0); - do { - int i; - unsigned long nr_pages; - - nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, - &index, end); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; - - /* - * If current offset is smaller than the page offset, - * there is a hole at this offset. - */ - if (whence == SEEK_HOLE && lastoff < endoff && - lastoff < page_offset(pvec.pages[i])) { - found = 1; - *offset = lastoff; - goto out; - } - - lock_page(page); - - if (unlikely(page->mapping != inode->i_mapping)) { - unlock_page(page); - continue; - } - - if (!page_has_buffers(page)) { - unlock_page(page); - continue; - } - - if (page_has_buffers(page)) { - lastoff = page_offset(page); - bh = head = page_buffers(page); - do { - if (lastoff + bh->b_size <= startoff) - goto next; - if (buffer_uptodate(bh) || - buffer_unwritten(bh)) { - if (whence == SEEK_DATA) - found = 1; - } else { - if (whence == SEEK_HOLE) - found = 1; - } - if (found) { - *offset = max_t(loff_t, - startoff, lastoff); - unlock_page(page); - goto out; - } -next: - lastoff += bh->b_size; - bh = bh->b_this_page; - } while (bh != head); - } - - lastoff = page_offset(page) + PAGE_SIZE; - unlock_page(page); - } - - pagevec_release(&pvec); - } while (index <= end); - - /* There are no pages upto endoff - that would be a hole in there. */ - if (whence == SEEK_HOLE && lastoff < endoff) { - found = 1; - *offset = lastoff; - } -out: - pagevec_release(&pvec); - return found; -} - -/* - * ext4_seek_data() retrieves the offset for SEEK_DATA. - */ -static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) -{ - struct inode *inode = file->f_mapping->host; - struct extent_status es; - ext4_lblk_t start, last, end; - loff_t dataoff, isize; - int blkbits; - int ret; - - inode_lock(inode); - - isize = i_size_read(inode); - if (offset < 0 || offset >= isize) { - inode_unlock(inode); - return -ENXIO; - } - - blkbits = inode->i_sb->s_blocksize_bits; - start = offset >> blkbits; - last = start; - end = isize >> blkbits; - dataoff = offset; - - do { - ret = ext4_get_next_extent(inode, last, end - last + 1, &es); - if (ret <= 0) { - /* No extent found -> no data */ - if (ret == 0) - ret = -ENXIO; - inode_unlock(inode); - return ret; - } - - last = es.es_lblk; - if (last != start) - dataoff = (loff_t)last << blkbits; - if (!ext4_es_is_unwritten(&es)) - break; - - /* - * If there is a unwritten extent at this offset, - * it will be as a data or a hole according to page - * cache that has data or not. - */ - if (ext4_find_unwritten_pgoff(inode, SEEK_DATA, - es.es_lblk + es.es_len, &dataoff)) - break; - last += es.es_len; - dataoff = (loff_t)last << blkbits; - cond_resched(); - } while (last <= end); - - inode_unlock(inode); - - if (dataoff > isize) - return -ENXIO; - - return vfs_setpos(file, dataoff, maxsize); -} - -/* - * ext4_seek_hole() retrieves the offset for SEEK_HOLE. - */ -static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) -{ - struct inode *inode = file->f_mapping->host; - struct extent_status es; - ext4_lblk_t start, last, end; - loff_t holeoff, isize; - int blkbits; - int ret; - - inode_lock(inode); - - isize = i_size_read(inode); - if (offset < 0 || offset >= isize) { - inode_unlock(inode); - return -ENXIO; - } - - blkbits = inode->i_sb->s_blocksize_bits; - start = offset >> blkbits; - last = start; - end = isize >> blkbits; - holeoff = offset; - - do { - ret = ext4_get_next_extent(inode, last, end - last + 1, &es); - if (ret < 0) { - inode_unlock(inode); - return ret; - } - /* Found a hole? */ - if (ret == 0 || es.es_lblk > last) { - if (last != start) - holeoff = (loff_t)last << blkbits; - break; - } - /* - * If there is a unwritten extent at this offset, - * it will be as a data or a hole according to page - * cache that has data or not. - */ - if (ext4_es_is_unwritten(&es) && - ext4_find_unwritten_pgoff(inode, SEEK_HOLE, - last + es.es_len, &holeoff)) - break; - - last += es.es_len; - holeoff = (loff_t)last << blkbits; - cond_resched(); - } while (last <= end); - - inode_unlock(inode); - - if (holeoff > isize) - holeoff = isize; - - return vfs_setpos(file, holeoff, maxsize); -} - -/* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes * value for each. @@ -696,18 +453,24 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) maxbytes = inode->i_sb->s_maxbytes; switch (whence) { - case SEEK_SET: - case SEEK_CUR: - case SEEK_END: + default: return generic_file_llseek_size(file, offset, whence, maxbytes, i_size_read(inode)); - case SEEK_DATA: - return ext4_seek_data(file, offset, maxbytes); case SEEK_HOLE: - return ext4_seek_hole(file, offset, maxbytes); + inode_lock_shared(inode); + offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops); + inode_unlock_shared(inode); + break; + case SEEK_DATA: + inode_lock_shared(inode); + offset = iomap_seek_data(inode, offset, &ext4_iomap_ops); + inode_unlock_shared(inode); + break; } - return -EINVAL; + if (offset < 0) + return offset; + return vfs_setpos(file, offset, maxbytes); } const struct file_operations ext4_file_operations = { @@ -719,6 +482,7 @@ const struct file_operations ext4_file_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, + .mmap_supported_flags = MAP_SYNC, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index c5f697a3fad4..b32cf263750d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -816,6 +816,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, #ifdef CONFIG_EXT4_FS_POSIX_ACL struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(p)) + return ERR_CAST(p); if (p) { int acl_size = p->a_count * sizeof(ext4_acl_entry); @@ -1139,9 +1141,7 @@ got: inode->i_ino); goto out; } - spin_lock(&sbi->s_next_gen_lock); - inode->i_generation = sbi->s_next_generation++; - spin_unlock(&sbi->s_next_gen_lock); + inode->i_generation = prandom_u32(); /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 28c5c3abddb3..1367553c43bb 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -12,6 +12,7 @@ * GNU General Public License for more details. */ +#include <linux/iomap.h> #include <linux/fiemap.h> #include "ext4_jbd2.h" @@ -302,11 +303,6 @@ static int ext4_create_inline_data(handle_t *handle, EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); - /* - * Propagate changes to inode->i_flags as well - e.g. S_DAX may - * get cleared - */ - ext4_set_inode_flags(inode); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -451,11 +447,6 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, } } ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); - /* - * Propagate changes to inode->i_flags as well - e.g. S_DAX may - * get set. - */ - ext4_set_inode_flags(inode); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -1827,6 +1818,38 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) return ret; } +int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap) +{ + __u64 addr; + int error = -EAGAIN; + struct ext4_iloc iloc; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) + goto out; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + goto out; + + addr = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; + addr += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; + addr += offsetof(struct ext4_inode, i_block); + + brelse(iloc.bh); + + iomap->addr = addr; + iomap->offset = 0; + iomap->length = min_t(loff_t, ext4_get_inline_size(inode), + i_size_read(inode)); + iomap->type = 0; + iomap->flags = IOMAP_F_DATA_INLINE; + +out: + up_read(&EXT4_I(inode)->xattr_sem); + return error; +} + int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int *has_inline, __u64 start, __u64 len) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 90afeb7293a6..534a9130f625 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -149,6 +149,15 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ int ext4_inode_is_fast_symlink(struct inode *inode) { + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + int ea_blocks = EXT4_I(inode)->i_file_acl ? + EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; + + if (ext4_has_inline_data(inode)) + return 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); + } return S_ISLNK(inode->i_mode) && inode->i_size && (inode->i_size < EXT4_N_BLOCKS * 4); } @@ -1719,7 +1728,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, ext4_es_remove_extent(inode, start, last - start + 1); } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (index <= end) { nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); if (nr_pages == 0) @@ -2345,7 +2354,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) lblk = start << bpp_bits; pblock = mpd->map.m_pblk; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (start <= end) { nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &start, end); @@ -2616,12 +2625,12 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) else tag = PAGECACHE_TAG_DIRTY; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); mpd->map.m_len = 0; mpd->next_page = index; while (index <= end) { - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag); if (nr_pages == 0) goto out; @@ -2629,16 +2638,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) struct page *page = pvec.pages[i]; /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) - goto out; - - /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. For integrity sync we have to * keep going because someone may be concurrently @@ -2752,7 +2751,7 @@ static int ext4_writepages(struct address_space *mapping, * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test - * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because + * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want @@ -3394,7 +3393,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } -#ifdef CONFIG_FS_DAX +static bool ext4_inode_datasync_dirty(struct inode *inode) +{ + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal) + return !jbd2_transaction_committed(journal, + EXT4_I(inode)->i_datasync_tid); + /* Any metadata buffers to write? */ + if (!list_empty(&inode->i_mapping->private_list)) + return true; + return inode->i_state & I_DIRTY_DATASYNC; +} + static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { @@ -3403,17 +3414,54 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned long first_block = offset >> blkbits; unsigned long last_block = (offset + length - 1) >> blkbits; struct ext4_map_blocks map; + bool delalloc = false; int ret; - if (WARN_ON_ONCE(ext4_has_inline_data(inode))) - return -ERANGE; + + if (flags & IOMAP_REPORT) { + if (ext4_has_inline_data(inode)) { + ret = ext4_inline_data_iomap(inode, iomap); + if (ret != -EAGAIN) { + if (ret == 0 && offset >= iomap->length) + ret = -ENOENT; + return ret; + } + } + } else { + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; + } map.m_lblk = first_block; map.m_len = last_block - first_block + 1; - if (!(flags & IOMAP_WRITE)) { + if (flags & IOMAP_REPORT) { ret = ext4_map_blocks(NULL, inode, &map, 0); - } else { + if (ret < 0) + return ret; + + if (ret == 0) { + ext4_lblk_t end = map.m_lblk + map.m_len - 1; + struct extent_status es; + + ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es); + + if (!es.es_len || es.es_lblk > end) { + /* entire range is a hole */ + } else if (es.es_lblk > map.m_lblk) { + /* range starts with a hole */ + map.m_len = es.es_lblk - map.m_lblk; + } else { + ext4_lblk_t offs = 0; + + if (es.es_lblk < map.m_lblk) + offs = map.m_lblk - es.es_lblk; + map.m_lblk = es.es_lblk + offs; + map.m_len = es.es_len - offs; + delalloc = true; + } + } + } else if (flags & IOMAP_WRITE) { int dio_credits; handle_t *handle; int retries = 0; @@ -3464,17 +3512,23 @@ retry: } } ext4_journal_stop(handle); + } else { + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; } iomap->flags = 0; + if (ext4_inode_datasync_dirty(inode)) + iomap->flags |= IOMAP_F_DIRTY; iomap->bdev = inode->i_sb->s_bdev; iomap->dax_dev = sbi->s_daxdev; iomap->offset = first_block << blkbits; + iomap->length = (u64)map.m_len << blkbits; if (ret == 0) { - iomap->type = IOMAP_HOLE; - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->length = (u64)map.m_len << blkbits; + iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; } else { if (map.m_flags & EXT4_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; @@ -3484,12 +3538,12 @@ retry: WARN_ON_ONCE(1); return -EIO; } - iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9); - iomap->length = (u64)map.m_len << blkbits; + iomap->addr = (u64)map.m_pblk << blkbits; } if (map.m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; + return 0; } @@ -3550,8 +3604,6 @@ const struct iomap_ops ext4_iomap_ops = { .iomap_end = ext4_iomap_end, }; -#endif - static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) { @@ -4573,6 +4625,21 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); } +static bool ext4_should_use_dax(struct inode *inode) +{ + if (!test_opt(inode->i_sb, DAX)) + return false; + if (!S_ISREG(inode->i_mode)) + return false; + if (ext4_should_journal_data(inode)) + return false; + if (ext4_has_inline_data(inode)) + return false; + if (ext4_encrypted_inode(inode)) + return false; + return true; +} + void ext4_set_inode_flags(struct inode *inode) { unsigned int flags = EXT4_I(inode)->i_flags; @@ -4588,12 +4655,13 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) && - !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) && - !ext4_encrypted_inode(inode)) + if (ext4_should_use_dax(inode)) new_fl |= S_DAX; + if (flags & EXT4_ENCRYPT_FL) + new_fl |= S_ENCRYPTED; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| + S_ENCRYPTED); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, @@ -5124,7 +5192,7 @@ static int ext4_do_update_inode(handle_t *handle, ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); - if (inode->i_sb->s_flags & MS_LAZYTIME) + if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); @@ -5309,6 +5377,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + error = fscrypt_prepare_setattr(dentry, attr); + if (error) + return error; + if (is_quota_modification(inode, attr)) { error = dquot_initialize(inode); if (error) @@ -5354,14 +5426,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) loff_t oldsize = inode->i_size; int shrink = (attr->ia_size <= inode->i_size); - if (ext4_encrypted_inode(inode)) { - error = fscrypt_get_encryption_info(inode); - if (error) - return error; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -5967,11 +6031,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); - /* - * Update inode->i_flags after EXT4_INODE_JOURNAL_DATA was updated. - * E.g. S_DAX may get cleared / set. - */ - ext4_set_inode_flags(inode); jbd2_journal_unlock_updates(journal); percpu_up_write(&sbi->s_journal_flag_rwsem); @@ -6107,70 +6166,3 @@ int ext4_filemap_fault(struct vm_fault *vmf) return err; } - -/* - * Find the first extent at or after @lblk in an inode that is not a hole. - * Search for @map_len blocks at most. The extent is returned in @result. - * - * The function returns 1 if we found an extent. The function returns 0 in - * case there is no extent at or after @lblk and in that case also sets - * @result->es_len to 0. In case of error, the error code is returned. - */ -int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, - unsigned int map_len, struct extent_status *result) -{ - struct ext4_map_blocks map; - struct extent_status es = {}; - int ret; - - map.m_lblk = lblk; - map.m_len = map_len; - - /* - * For non-extent based files this loop may iterate several times since - * we do not determine full hole size. - */ - while (map.m_len > 0) { - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) - return ret; - /* There's extent covering m_lblk? Just return it. */ - if (ret > 0) { - int status; - - ext4_es_store_pblock(result, map.m_pblk); - result->es_lblk = map.m_lblk; - result->es_len = map.m_len; - if (map.m_flags & EXT4_MAP_UNWRITTEN) - status = EXTENT_STATUS_UNWRITTEN; - else - status = EXTENT_STATUS_WRITTEN; - ext4_es_store_status(result, status); - return 1; - } - ext4_es_find_delayed_extent_range(inode, map.m_lblk, - map.m_lblk + map.m_len - 1, - &es); - /* Is delalloc data before next block in extent tree? */ - if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) { - ext4_lblk_t offset = 0; - - if (es.es_lblk < lblk) - offset = lblk - es.es_lblk; - result->es_lblk = es.es_lblk + offset; - ext4_es_store_pblock(result, - ext4_es_pblock(&es) + offset); - result->es_len = es.es_len - offset; - ext4_es_store_status(result, ext4_es_status(&es)); - - return 1; - } - /* There's a hole at m_lblk, advance us after it */ - map.m_lblk += map.m_len; - map_len -= map.m_len; - map.m_len = map_len; - cond_resched(); - } - result->es_len = 0; - return 0; -} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 75d83471f65c..1eec25014f62 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -15,6 +15,7 @@ #include <linux/mount.h> #include <linux/file.h> #include <linux/quotaops.h> +#include <linux/random.h> #include <linux/uuid.h> #include <linux/uaccess.h> #include <linux/delay.h> @@ -99,7 +100,6 @@ static long swap_inode_boot_loader(struct super_block *sb, int err; struct inode *inode_bl; struct ext4_inode_info *ei_bl; - struct ext4_sb_info *sbi = EXT4_SB(sb); if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) return -EINVAL; @@ -158,10 +158,8 @@ static long swap_inode_boot_loader(struct super_block *sb, inode->i_ctime = inode_bl->i_ctime = current_time(inode); - spin_lock(&sbi->s_next_gen_lock); - inode->i_generation = sbi->s_next_generation++; - inode_bl->i_generation = sbi->s_next_generation++; - spin_unlock(&sbi->s_next_gen_lock); + inode->i_generation = prandom_u32(); + inode_bl->i_generation = prandom_u32(); ext4_discard_preallocations(inode); @@ -291,10 +289,20 @@ flags_err: if (err) goto flags_out; - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + /* + * Changes to the journaling mode can cause unsafe changes to + * S_DAX if we are using the DAX mount option. + */ + if (test_opt(inode->i_sb, DAX)) { + err = -EBUSY; + goto flags_out; + } + err = ext4_change_inode_journal_flag(inode, jflag); - if (err) - goto flags_out; + if (err) + goto flags_out; + } if (migrate) { if (flags & EXT4_EXTENTS_FL) err = ext4_ext_migrate(inode); @@ -584,6 +592,44 @@ static int ext4_ioc_getfsmap(struct super_block *sb, return 0; } +static long ext4_ioctl_group_add(struct file *file, + struct ext4_new_group_data *input) +{ + struct super_block *sb = file_inode(file)->i_sb; + int err, err2=0; + + err = ext4_resize_begin(sb); + if (err) + return err; + + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_add_out; + } + + err = mnt_want_write_file(file); + if (err) + goto group_add_out; + + err = ext4_group_add(sb, input); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(file); + if (!err && ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, input->group); +group_add_out: + ext4_resize_end(sb); + return err; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -768,44 +814,12 @@ mext_out: case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; - int err, err2=0; - - err = ext4_resize_begin(sb); - if (err) - return err; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, - sizeof(input))) { - err = -EFAULT; - goto group_add_out; - } - - if (ext4_has_feature_bigalloc(sb)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not supported with bigalloc"); - err = -EOPNOTSUPP; - goto group_add_out; - } - - err = mnt_want_write_file(filp); - if (err) - goto group_add_out; + sizeof(input))) + return -EFAULT; - err = ext4_group_add(sb, &input); - if (EXT4_SB(sb)->s_journal) { - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - } - if (err == 0) - err = err2; - mnt_drop_write_file(filp); - if (!err && ext4_has_group_desc_csum(sb) && - test_opt(sb, INIT_INODE_TABLE)) - err = ext4_register_li_request(sb, input.group); -group_add_out: - ext4_resize_end(sb); - return err; + return ext4_ioctl_group_add(filp, &input); } case EXT4_IOC_MIGRATE: @@ -862,12 +876,6 @@ group_add_out: int err = 0, err2 = 0; ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; - if (ext4_has_feature_bigalloc(sb)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not (yet) supported with bigalloc"); - return -EOPNOTSUPP; - } - if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, sizeof(__u64))) { return -EFAULT; @@ -1076,8 +1084,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) break; case EXT4_IOC32_GROUP_ADD: { struct compat_ext4_new_group_input __user *uinput; - struct ext4_new_group_input input; - mm_segment_t old_fs; + struct ext4_new_group_data input; int err; uinput = compat_ptr(arg); @@ -1090,12 +1097,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) &uinput->reserved_blocks); if (err) return -EFAULT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, - (unsigned long) &input); - set_fs(old_fs); - return err; + return ext4_ioctl_group_add(file, &input); } case EXT4_IOC_MOVE_EXT: case EXT4_IOC_RESIZE_FS: diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 701085620cd8..d9f8b90a93ed 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4994,8 +4994,11 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; - int err = 0, ret, blk_free_count; - ext4_grpblk_t blocks_freed; + int err = 0, ret, free_clusters_count; + ext4_grpblk_t clusters_freed; + ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); + ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); + unsigned long cluster_count = last_cluster - first_cluster + 1; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); @@ -5007,8 +5010,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * Check to see if we are freeing blocks across a group * boundary. */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - ext4_warning(sb, "too much blocks added to group %u", + if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) { + ext4_warning(sb, "too many blocks added to group %u", block_group); err = -EINVAL; goto error_return; @@ -5054,14 +5057,14 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, if (err) goto error_return; - for (i = 0, blocks_freed = 0; i < count; i++) { + for (i = 0, clusters_freed = 0; i < cluster_count; i++) { BUFFER_TRACE(bitmap_bh, "clear bit"); if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { ext4_error(sb, "bit already cleared for block %llu", (ext4_fsblk_t)(block + i)); BUFFER_TRACE(bitmap_bh, "bit already cleared"); } else { - blocks_freed++; + clusters_freed++; } } @@ -5075,19 +5078,20 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * them with group lock_held */ ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); - mb_free_blocks(NULL, &e4b, bit, count); - blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); - ext4_free_group_clusters_set(sb, desc, blk_free_count); + mb_clear_bits(bitmap_bh->b_data, bit, cluster_count); + mb_free_blocks(NULL, &e4b, bit, cluster_count); + free_clusters_count = clusters_freed + + ext4_free_group_clusters(sb, desc); + ext4_free_group_clusters_set(sb, desc, free_clusters_count); ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_NUM_B2C(sbi, blocks_freed)); + clusters_freed); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + atomic64_add(clusters_freed, &sbi->s_flex_groups[flex_group].free_clusters); } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index bd48a8d83961..e750d68fbcb5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1399,6 +1399,10 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, "falling back\n")); } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + if (!nblocks) { + ret = NULL; + goto cleanup_and_exit; + } start = EXT4_I(dir)->i_dir_start_lookup; if (start >= nblocks) start = 0; @@ -1539,24 +1543,14 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi struct inode *inode; struct ext4_dir_entry_2 *de; struct buffer_head *bh; + int err; - if (ext4_encrypted_inode(dir)) { - int res = fscrypt_get_encryption_info(dir); - - /* - * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is - * created while the directory was encrypted and we - * have access to the key. - */ - if (fscrypt_has_encryption_key(dir)) - fscrypt_set_encrypted_dentry(dentry); - fscrypt_set_d_op(dentry); - if (res && res != -ENOKEY) - return ERR_PTR(res); - } + err = fscrypt_prepare_lookup(dir, dentry, flags); + if (err) + return ERR_PTR(err); - if (dentry->d_name.len > EXT4_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > EXT4_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (IS_ERR(bh)) @@ -3222,9 +3216,10 @@ static int ext4_link(struct dentry *old_dentry, if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; - if (ext4_encrypted_inode(dir) && - !fscrypt_has_permitted_context(dir, inode)) - return -EPERM; + + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && (!projid_eq(EXT4_I(dir)->i_projid, @@ -3516,12 +3511,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, EXT4_I(old_dentry->d_inode)->i_projid))) return -EXDEV; - if ((ext4_encrypted_inode(old_dir) && - !fscrypt_has_encryption_key(old_dir)) || - (ext4_encrypted_inode(new_dir) && - !fscrypt_has_encryption_key(new_dir))) - return -ENOKEY; - retval = dquot_initialize(old.dir); if (retval) return retval; @@ -3550,13 +3539,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto end_rename; - if ((old.dir != new.dir) && - ext4_encrypted_inode(new.dir) && - !fscrypt_has_permitted_context(new.dir, old.inode)) { - retval = -EPERM; - goto end_rename; - } - new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { @@ -3722,19 +3704,6 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int retval; struct timespec ctime; - if ((ext4_encrypted_inode(old_dir) && - !fscrypt_has_encryption_key(old_dir)) || - (ext4_encrypted_inode(new_dir) && - !fscrypt_has_encryption_key(new_dir))) - return -ENOKEY; - - if ((ext4_encrypted_inode(old_dir) || - ext4_encrypted_inode(new_dir)) && - (old_dir != new_dir) && - (!fscrypt_has_permitted_context(new_dir, old.inode) || - !fscrypt_has_permitted_context(old_dir, new.inode))) - return -EPERM; - if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(new_dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid)) || @@ -3861,12 +3830,19 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + int err; + if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb)))) return -EIO; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (err) + return err; + if (flags & RENAME_EXCHANGE) { return ext4_cross_rename(old_dir, old_dentry, new_dir, new_dentry); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 1dac59c24792..50443bda8e98 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -107,7 +107,7 @@ static int verify_group_input(struct super_block *sb, overhead = ext4_group_overhead_blocks(sb, group); metaend = start + overhead; - input->free_blocks_count = free_blocks_count = + input->free_clusters_count = free_blocks_count = input->blocks_count - 2 - overhead - sbi->s_itb_per_group; if (test_opt(sb, DEBUG)) @@ -258,6 +258,7 @@ static int ext4_alloc_group_tables(struct super_block *sb, ext4_group_t last_group; unsigned overhead; __u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0; + int i; BUG_ON(flex_gd->count == 0 || group_data == NULL); @@ -294,7 +295,7 @@ next_group: group_data[bb_index].block_bitmap = start_blk++; group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; - group_data[group].free_blocks_count--; + group_data[group].mdata_blocks++; flex_gd->bg_flags[group] &= uninit_mask; } @@ -305,7 +306,7 @@ next_group: group_data[ib_index].inode_bitmap = start_blk++; group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; - group_data[group].free_blocks_count--; + group_data[group].mdata_blocks++; flex_gd->bg_flags[group] &= uninit_mask; } @@ -324,15 +325,22 @@ next_group: if (start_blk + itb > next_group_start) { flex_gd->bg_flags[group + 1] &= uninit_mask; overhead = start_blk + itb - next_group_start; - group_data[group + 1].free_blocks_count -= overhead; + group_data[group + 1].mdata_blocks += overhead; itb -= overhead; } - group_data[group].free_blocks_count -= itb; + group_data[group].mdata_blocks += itb; flex_gd->bg_flags[group] &= uninit_mask; start_blk += EXT4_SB(sb)->s_itb_per_group; } + /* Update free clusters count to exclude metadata blocks */ + for (i = 0; i < flex_gd->count; i++) { + group_data[i].free_clusters_count -= + EXT4_NUM_B2C(EXT4_SB(sb), + group_data[i].mdata_blocks); + } + if (test_opt(sb, DEBUG)) { int i; group = group_data[0].group; @@ -342,12 +350,13 @@ next_group: flexbg_size); for (i = 0; i < flex_gd->count; i++) { - printk(KERN_DEBUG "adding %s group %u: %u " - "blocks (%d free)\n", + ext4_debug( + "adding %s group %u: %u blocks (%d free, %d mdata blocks)\n", ext4_bg_has_super(sb, group + i) ? "normal" : "no-super", group + i, group_data[i].blocks_count, - group_data[i].free_blocks_count); + group_data[i].free_clusters_count, + group_data[i].mdata_blocks); } } return 0; @@ -399,7 +408,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh) } /* - * set_flexbg_block_bitmap() mark @count blocks starting from @block used. + * set_flexbg_block_bitmap() mark clusters [@first_cluster, @last_cluster] used. * * Helper function for ext4_setup_new_group_blocks() which set . * @@ -409,22 +418,26 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh) */ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, struct ext4_new_flex_group_data *flex_gd, - ext4_fsblk_t block, ext4_group_t count) + ext4_fsblk_t first_cluster, ext4_fsblk_t last_cluster) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t count = last_cluster - first_cluster + 1; ext4_group_t count2; - ext4_debug("mark blocks [%llu/%u] used\n", block, count); - for (count2 = count; count > 0; count -= count2, block += count2) { + ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster, + last_cluster); + for (count2 = count; count > 0; + count -= count2, first_cluster += count2) { ext4_fsblk_t start; struct buffer_head *bh; ext4_group_t group; int err; - group = ext4_get_group_number(sb, block); - start = ext4_group_first_block_no(sb, group); + group = ext4_get_group_number(sb, EXT4_C2B(sbi, first_cluster)); + start = EXT4_B2C(sbi, ext4_group_first_block_no(sb, group)); group -= flex_gd->groups[0].group; - count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start); + count2 = EXT4_CLUSTERS_PER_GROUP(sb) - (first_cluster - start); if (count2 > count) count2 = count; @@ -445,9 +458,9 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, err = ext4_journal_get_write_access(handle, bh); if (err) return err; - ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block, - block - start, count2); - ext4_set_bits(bh->b_data, block - start, count2); + ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", + first_cluster, first_cluster - start, count2); + ext4_set_bits(bh->b_data, first_cluster - start, count2); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (unlikely(err)) @@ -596,9 +609,10 @@ handle_bb: if (overhead != 0) { ext4_debug("mark backup superblock %#04llx (+0)\n", start); - ext4_set_bits(bh->b_data, 0, overhead); + ext4_set_bits(bh->b_data, 0, + EXT4_NUM_B2C(sbi, overhead)); } - ext4_mark_bitmap_end(group_data[i].blocks_count, + ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count), sb->s_blocksize * 8, bh->b_data); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) @@ -643,7 +657,11 @@ handle_ib: continue; } err = set_flexbg_block_bitmap(sb, handle, - flex_gd, start, count); + flex_gd, + EXT4_B2C(sbi, start), + EXT4_B2C(sbi, + start + count + - 1)); if (err) goto out; count = group_table_count[j]; @@ -653,7 +671,11 @@ handle_ib: if (count) { err = set_flexbg_block_bitmap(sb, handle, - flex_gd, start, count); + flex_gd, + EXT4_B2C(sbi, start), + EXT4_B2C(sbi, + start + count + - 1)); if (err) goto out; } @@ -841,7 +863,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, ext4_std_error(sb, err); goto exit_inode; } - inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> + (9 - EXT4_SB(sb)->s_cluster_bits); ext4_mark_iloc_dirty(handle, inode, &iloc); memset(gdb_bh->b_data, 0, sb->s_blocksize); err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); @@ -936,6 +959,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, { struct super_block *sb = inode->i_sb; int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + int cluster_bits = EXT4_SB(sb)->s_cluster_bits; struct buffer_head **primary; struct buffer_head *dind; struct ext4_iloc iloc; @@ -1011,7 +1035,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, if (!err) err = err2; } - inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; + + inode->i_blocks += reserved_gdb * sb->s_blocksize >> (9 - cluster_bits); ext4_mark_iloc_dirty(handle, inode, &iloc); exit_bh: @@ -1245,7 +1270,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, ext4_group_t group; __u16 *bg_flags = flex_gd->bg_flags; int i, gdb_off, gdb_num, err = 0; - + for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) { group = group_data->group; @@ -1272,7 +1297,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, ext4_inode_table_set(sb, gdp, group_data->inode_table); ext4_free_group_clusters_set(sb, gdp, - EXT4_NUM_B2C(sbi, group_data->free_blocks_count)); + group_data->free_clusters_count); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); if (ext4_has_group_desc_csum(sb)) ext4_itable_unused_set(sb, gdp, @@ -1328,7 +1353,7 @@ static void ext4_update_super(struct super_block *sb, */ for (i = 0; i < flex_gd->count; i++) { blocks_count += group_data[i].blocks_count; - free_blocks += group_data[i].free_blocks_count; + free_blocks += EXT4_C2B(sbi, group_data[i].free_clusters_count); } reserved_blocks = ext4_r_blocks_count(es) * 100; @@ -1500,17 +1525,18 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, ext4_fsblk_t n_blocks_count, unsigned long flexbg_size) { - struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; struct ext4_new_group_data *group_data = flex_gd->groups; ext4_fsblk_t o_blocks_count; ext4_group_t n_group; ext4_group_t group; ext4_group_t last_group; ext4_grpblk_t last; - ext4_grpblk_t blocks_per_group; + ext4_grpblk_t clusters_per_group; unsigned long i; - blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb); + clusters_per_group = EXT4_CLUSTERS_PER_GROUP(sb); o_blocks_count = ext4_blocks_count(es); @@ -1531,9 +1557,10 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, int overhead; group_data[i].group = group + i; - group_data[i].blocks_count = blocks_per_group; + group_data[i].blocks_count = EXT4_BLOCKS_PER_GROUP(sb); overhead = ext4_group_overhead_blocks(sb, group + i); - group_data[i].free_blocks_count = blocks_per_group - overhead; + group_data[i].mdata_blocks = overhead; + group_data[i].free_clusters_count = EXT4_CLUSTERS_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) { flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | EXT4_BG_INODE_UNINIT; @@ -1547,10 +1574,10 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, /* We need to initialize block bitmap of last group. */ flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; - if ((last_group == n_group) && (last != blocks_per_group - 1)) { - group_data[i - 1].blocks_count = last + 1; - group_data[i - 1].free_blocks_count -= blocks_per_group- - last - 1; + if ((last_group == n_group) && (last != clusters_per_group - 1)) { + group_data[i - 1].blocks_count = EXT4_C2B(sbi, last + 1); + group_data[i - 1].free_clusters_count -= clusters_per_group - + last - 1; } return 1; @@ -1797,7 +1824,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) } /* Do a quick sanity check of the resize inode */ - if (inode->i_blocks != 1 << (inode->i_blkbits - 9)) + if (inode->i_blocks != 1 << (inode->i_blkbits - + (9 - sbi->s_cluster_bits))) goto invalid_resize_inode; for (i = 0; i < EXT4_N_BLOCKS; i++) { if (i == EXT4_DIND_BLOCK) { @@ -1960,7 +1988,7 @@ retry: if (n_group == o_group) add = n_blocks_count - o_blocks_count; else - add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1); + add = EXT4_C2B(sbi, EXT4_CLUSTERS_PER_GROUP(sb) - (offset + 1)); if (add > 0) { err = ext4_group_extend_no_check(sb, o_blocks_count, add); if (err) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b0915b734a38..7c46693a14d7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -422,7 +422,7 @@ static void ext4_handle_error(struct super_block *sb) * before ->s_flags update */ smp_wmb(); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } if (test_opt(sb, ERRORS_PANIC)) { if (EXT4_SB(sb)->s_journal && @@ -635,7 +635,7 @@ void __ext4_abort(struct super_block *sb, const char *function, * before ->s_flags update */ smp_wmb(); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; if (EXT4_SB(sb)->s_journal) jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); save_error_info(sb, function, line); @@ -1159,6 +1159,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, if (inode->i_ino == EXT4_ROOT_INO) return -EPERM; + if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) + return -EINVAL; + res = ext4_convert_inline_data(inode); if (res) return res; @@ -1181,7 +1184,8 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); /* - * Update inode->i_flags - e.g. S_DAX may get disabled + * Update inode->i_flags - S_ENCRYPTED will be enabled, + * S_DAX may be disabled */ ext4_set_inode_flags(inode); } @@ -1206,7 +1210,10 @@ retry: ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); - /* Update inode->i_flags - e.g. S_DAX may get disabled */ + /* + * Update inode->i_flags - S_ENCRYPTED will be enabled, + * S_DAX may be disabled + */ ext4_set_inode_flags(inode); res = ext4_mark_inode_dirty(handle, inode); if (res) @@ -1237,14 +1244,9 @@ static const struct fscrypt_operations ext4_cryptops = { .get_context = ext4_get_context, .set_context = ext4_set_context, .dummy_context = ext4_dummy_context, - .is_encrypted = ext4_encrypted_inode, .empty_dir = ext4_empty_dir, .max_namelen = ext4_max_namelen, }; -#else -static const struct fscrypt_operations ext4_cryptops = { - .is_encrypted = ext4_encrypted_inode, -}; #endif #ifdef CONFIG_QUOTA @@ -1680,10 +1682,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, sb->s_flags |= SB_I_VERSION; return 1; case Opt_lazytime: - sb->s_flags |= MS_LAZYTIME; + sb->s_flags |= SB_LAZYTIME; return 1; case Opt_nolazytime: - sb->s_flags &= ~MS_LAZYTIME; + sb->s_flags &= ~SB_LAZYTIME; return 1; } @@ -2114,7 +2116,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); - res = MS_RDONLY; + res = SB_RDONLY; } if (read_only) goto done; @@ -2427,7 +2429,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { /* don't clear list on RO mount w/ errors */ - if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { + if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " "clearing orphan list.\n"); es->s_last_orphan = 0; @@ -2436,19 +2438,19 @@ static void ext4_orphan_cleanup(struct super_block *sb, return; } - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. */ - if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) { + if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { int ret = ext4_enable_quotas(sb); if (!ret) @@ -2537,7 +2539,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, } } #endif - sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sb->s_flags = s_flags; /* Restore SB_RDONLY status */ } /* @@ -2739,7 +2741,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) if (ext4_has_feature_readonly(sb)) { ext4_msg(sb, KERN_INFO, "filesystem is read-only"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; return 1; } @@ -2791,14 +2793,11 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) * This function is called once a day if we have errors logged * on the file system */ -static void print_daily_error_info(unsigned long arg) +static void print_daily_error_info(struct timer_list *t) { - struct super_block *sb = (struct super_block *) arg; - struct ext4_sb_info *sbi; - struct ext4_super_block *es; - - sbi = EXT4_SB(sb); - es = sbi->s_es; + struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report); + struct super_block *sb = sbi->s_sb; + struct ext4_super_block *es = sbi->s_es; if (es->s_error_count) /* fsck newer than v1.41.13 is needed to clean this condition. */ @@ -3624,8 +3623,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_iflags |= SB_I_CGROUPWB; } - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || @@ -3708,6 +3707,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (ext4_has_feature_inline_data(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" + " that may contain inline data"); + goto failed_mount; + } err = bdev_dax_supported(sb, blocksize); if (err) goto failed_mount; @@ -3977,11 +3981,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_gdb_count = db_count; - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); - spin_lock_init(&sbi->s_next_gen_lock); - setup_timer(&sbi->s_err_report, print_daily_error_info, - (unsigned long) sb); + timer_setup(&sbi->s_err_report, print_daily_error_info, 0); /* Register extent status tree shrinker */ if (ext4_es_register_shrinker(sbi)) @@ -3996,7 +3997,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; +#ifdef CONFIG_EXT4_FS_ENCRYPTION sb->s_cop = &ext4_cryptops; +#endif #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) @@ -4196,7 +4199,7 @@ no_journal: } if (ext4_setup_super(sb, es, sb_rdonly(sb))) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && @@ -4612,7 +4615,8 @@ static int ext4_load_journal(struct super_block *sb, "required on readonly filesystem"); if (really_read_only) { ext4_msg(sb, KERN_ERR, "write access " - "unavailable, cannot proceed"); + "unavailable, cannot proceed " + "(try mounting with noload)"); return -EROFS; } ext4_msg(sb, KERN_INFO, "write access will " @@ -4689,7 +4693,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. */ - if (!(sb->s_flags & MS_RDONLY)) + if (!(sb->s_flags & SB_RDONLY)) es->s_wtime = cpu_to_le32(get_seconds()); if (sb->s_bdev->bd_part) es->s_kbytes_written = @@ -5043,8 +5047,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ext4_abort(sb, "Abort forced by user"); - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); es = sbi->s_es; @@ -5053,16 +5057,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } - if (*flags & MS_LAZYTIME) - sb->s_flags |= MS_LAZYTIME; + if (*flags & SB_LAZYTIME) + sb->s_flags |= SB_LAZYTIME; - if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) { + if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; goto restore_opts; } - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { err = sync_filesystem(sb); if (err < 0) goto restore_opts; @@ -5074,7 +5078,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount */ - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* * OK, test if we are remounting a valid rw partition @@ -5136,7 +5140,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); if (!ext4_setup_super(sb, es, 0)) - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) { @@ -5160,7 +5164,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } ext4_setup_system_zone(sb); - if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY)) + if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) ext4_commit_super(sb, 1); #ifdef CONFIG_QUOTA @@ -5178,7 +5182,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } #endif - *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME); + *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); kfree(orig_data); return 0; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 436b3a1464d9..2bb7c9fc5144 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -250,6 +250,9 @@ static int __f2fs_set_acl(struct inode *inode, int type, int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + return __f2fs_set_acl(inode, type, acl, NULL); } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 04fe1df052b2..4aa69bc1c70a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -29,7 +29,6 @@ struct kmem_cache *inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { set_ckpt_flags(sbi, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; if (!end_io) f2fs_flush_merged_writes(sbi); } @@ -305,25 +304,22 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; + pgoff_t index = 0, prev = ULONG_MAX; struct pagevec pvec; long nwritten = 0; + int nr_pages; struct writeback_control wbc = { .for_reclaim = 0, }; struct blk_plug plug; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); blk_start_plug(&plug); - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (unlikely(nr_pages == 0)) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -401,24 +397,23 @@ const struct address_space_operations f2fs_meta_aops = { #endif }; -static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) { struct inode_management *im = &sbi->im[type]; struct ino_entry *e, *tmp; tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); -retry: + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&im->ino_lock); e = radix_tree_lookup(&im->ino_root, ino); if (!e) { e = tmp; - if (radix_tree_insert(&im->ino_root, ino, e)) { - spin_unlock(&im->ino_lock); - radix_tree_preload_end(); - goto retry; - } + if (unlikely(radix_tree_insert(&im->ino_root, ino, e))) + f2fs_bug_on(sbi, 1); + memset(e, 0, sizeof(struct ino_entry)); e->ino = ino; @@ -426,6 +421,10 @@ retry: if (type != ORPHAN_INO) im->ino_num++; } + + if (type == FLUSH_INO) + f2fs_set_bit(devidx, (char *)&e->dirty_device); + spin_unlock(&im->ino_lock); radix_tree_preload_end(); @@ -454,7 +453,7 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ - __add_ino_entry(sbi, ino, type); + __add_ino_entry(sbi, ino, 0, type); } void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -480,7 +479,7 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all) struct ino_entry *e, *tmp; int i; - for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) { + for (i = all ? ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) { struct inode_management *im = &sbi->im[i]; spin_lock(&im->ino_lock); @@ -494,6 +493,27 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all) } } +void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + __add_ino_entry(sbi, ino, devidx, type); +} + +bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e; + bool is_dirty = false; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device)) + is_dirty = true; + spin_unlock(&im->ino_lock); + return is_dirty; +} + int acquire_orphan_inode(struct f2fs_sb_info *sbi) { struct inode_management *im = &sbi->im[ORPHAN_INO]; @@ -530,7 +550,7 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) void add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ - __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO); + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO); update_inode_page(inode); } @@ -554,7 +574,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) return err; } - __add_ino_entry(sbi, ino, ORPHAN_INO); + __add_ino_entry(sbi, ino, 0, ORPHAN_INO); inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { @@ -590,20 +610,24 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) block_t start_blk, orphan_blocks, i, j; unsigned int s_flags = sbi->sb->s_flags; int err = 0; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); - sbi->sb->s_flags &= ~MS_RDONLY; + sbi->sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= MS_ACTIVE; + sbi->sb->s_flags |= SB_ACTIVE; + /* Turn on quotas so that they are updated correctly */ - f2fs_enable_quota_files(sbi); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -631,9 +655,10 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) out: #ifdef CONFIG_QUOTA /* Turn quotas off */ - f2fs_quota_off_umount(sbi->sb); + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); #endif - sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return err; } @@ -986,7 +1011,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) update_inode_page(inode); iput(inode); } - }; + } return 0; } @@ -1146,6 +1171,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct super_block *sb = sbi->sb; struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); u64 kbytes_written; + int err; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { @@ -1239,6 +1265,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; + /* flush all device cache */ + err = f2fs_flush_device_cache(sbi); + if (err) + return err; + /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 36b535207c88..516fa0d3ff9c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -173,7 +173,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, { struct bio *bio; - bio = f2fs_bio_alloc(npages); + bio = f2fs_bio_alloc(sbi, npages, true); f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; @@ -418,8 +418,8 @@ next: bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - /* set submitted = 1 as a return value */ - fio->submitted = 1; + /* set submitted = true as a return value */ + fio->submitted = true; inc_page_count(sbi, WB_DATA_TYPE(bio_page)); @@ -473,7 +473,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, f2fs_wait_on_block_writeback(sbi, blkaddr); } - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); if (!bio) { if (ctx) fscrypt_release_ctx(ctx); @@ -833,6 +833,13 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; int err = 0; + /* convert inline data for Direct I/O*/ + if (iocb->ki_flags & IOCB_DIRECT) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; @@ -845,15 +852,11 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; - if (iocb->ki_flags & IOCB_DIRECT) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; + if (iocb->ki_flags & IOCB_DIRECT) return f2fs_map_blocks(inode, &map, 1, __force_buffered_io(inode, WRITE) ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); - } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) @@ -1334,7 +1337,7 @@ static int f2fs_read_data_pages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; struct page *page = list_last_entry(pages, struct page, lru); trace_f2fs_readpages(inode, page, nr_pages); @@ -1495,6 +1498,7 @@ static int __write_data_page(struct page *page, bool *submitted, int err = 0; struct f2fs_io_info fio = { .sbi = sbi, + .ino = inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), @@ -1566,8 +1570,11 @@ write: err = do_write_data_page(&fio); } } + + down_write(&F2FS_I(inode)->i_sem); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; + up_write(&F2FS_I(inode)->i_sem); done: if (err && err != -ENOENT) @@ -1635,7 +1642,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int range_whole = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) @@ -1669,8 +1676,8 @@ retry: while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag); if (nr_pages == 0) break; @@ -1678,11 +1685,6 @@ retry: struct page *page = pvec.pages[i]; bool submitted = false; - if (page->index > end) { - done = 1; - break; - } - done_index = page->index; retry_write: lock_page(page); @@ -1937,6 +1939,12 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, trace_f2fs_write_begin(inode, pos, len, flags); + if (f2fs_is_atomic_file(inode) && + !available_free_memory(sbi, INMEM_PAGES)) { + err = -ENOMEM; + goto fail; + } + /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -1952,7 +1960,7 @@ repeat: * Do not use grab_cache_page_write_begin() to avoid deadlock due to * wait_for_stable_page. Will wait that below with our IO control. */ - page = pagecache_get_page(mapping, index, + page = f2fs_pagecache_get_page(mapping, index, FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); if (!page) { err = -ENOMEM; @@ -2014,6 +2022,8 @@ repeat: fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); + if (f2fs_is_atomic_file(inode)) + drop_inmem_pages_all(sbi); return err; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 87f449845f5f..ecada8425268 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -45,9 +45,18 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_qdata = get_pages(sbi, F2FS_DIRTY_QDATA); si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + + si->nquota_files = 0; + if (f2fs_sb_has_quota_ino(sbi->sb)) { + for (i = 0; i < MAXQUOTAS; i++) { + if (f2fs_qf_ino(sbi->sb, i)) + si->nquota_files++; + } + } si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); @@ -61,6 +70,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->fcc_info->issued_flush); si->nr_flushing = atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + si->flush_list_empty = + llist_empty(&SM_I(sbi)->fcc_info->issue_list); } if (SM_I(sbi) && SM_I(sbi)->dcc_info) { si->nr_discarded = @@ -96,9 +107,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; si->avail_nids = NM_I(sbi)->available_nids; - si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; + si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -231,14 +242,14 @@ get_cache: } /* free nids */ - si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + - NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] + + NM_I(sbi)->nid_cnt[PREALLOC_NID]) * sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - for (i = 0; i <= ORPHAN_INO; i++) + for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * sizeof(struct extent_tree); @@ -262,9 +273,10 @@ static int stat_show(struct seq_file *s, void *v) list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n", + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, - f2fs_readonly(si->sbi->sb) ? "RO": "RW"); + f2fs_readonly(si->sbi->sb) ? "RO": "RW", + f2fs_cp_error(si->sbi) ? "Error": "Good"); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", @@ -349,10 +361,11 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, + si->flush_list_empty, si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " @@ -365,6 +378,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); seq_printf(s, " - datas: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); + seq_printf(s, " - quota datas: %4d in quota files:%4d\n", + si->ndirty_qdata, si->nquota_files); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); seq_printf(s, " - imeta: %4d\n", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c0c933ad43c8..2d98d877c09d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -10,10 +10,12 @@ */ #include <linux/fs.h> #include <linux/f2fs_fs.h> +#include <linux/sched/signal.h> #include "f2fs.h" #include "node.h" #include "acl.h" #include "xattr.h" +#include <trace/events/f2fs.h> static unsigned long dir_blocks(struct inode *inode) { @@ -847,6 +849,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct f2fs_dentry_block *dentry_blk = NULL; struct page *dentry_page = NULL; struct file_ra_state *ra = &file->f_ra; + loff_t start_pos = ctx->pos; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); struct f2fs_dentry_ptr d; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); @@ -855,24 +858,32 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) if (f2fs_encrypted_inode(inode)) { err = fscrypt_get_encryption_info(inode); if (err && err != -ENOKEY) - return err; + goto out; err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); if (err < 0) - return err; + goto out; } if (f2fs_has_inline_dentry(inode)) { err = f2fs_read_inline_dir(file, ctx, &fstr); - goto out; + goto out_free; } - /* readahead for multi pages of dir */ - if (npages - n > 1 && !ra_has_index(ra, n)) - page_cache_sync_readahead(inode->i_mapping, ra, file, n, + for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) { + + /* allow readdir() to be interrupted */ + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto out_free; + } + cond_resched(); + + /* readahead for multi pages of dir */ + if (npages - n > 1 && !ra_has_index(ra, n)) + page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); @@ -880,7 +891,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) err = 0; continue; } else { - goto out; + goto out_free; } } @@ -896,12 +907,13 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) break; } - ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } -out: +out_free: fscrypt_fname_free_buffer(&fstr); +out: + trace_f2fs_readdir(inode, start_pos, ctx->pos, err); return err < 0 ? err : 0; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4b4a72f392be..6abf26c31d01 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -23,13 +23,11 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/quotaops.h> -#ifdef CONFIG_F2FS_FS_ENCRYPTION -#include <linux/fscrypt_supp.h> -#else -#include <linux/fscrypt_notsupp.h> -#endif #include <crypto/hash.h> +#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION) +#include <linux/fscrypt.h> + #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) #else @@ -46,6 +44,8 @@ enum { FAULT_KMALLOC, FAULT_PAGE_ALLOC, + FAULT_PAGE_GET, + FAULT_ALLOC_BIO, FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, @@ -93,6 +93,7 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_GRPQUOTA 0x00100000 #define F2FS_MOUNT_PRJQUOTA 0x00200000 #define F2FS_MOUNT_QUOTA 0x00400000 +#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -118,6 +119,8 @@ struct f2fs_mount_info { #define F2FS_FEATURE_EXTRA_ATTR 0x0008 #define F2FS_FEATURE_PRJQUOTA 0x0010 #define F2FS_FEATURE_INODE_CHKSUM 0x0020 +#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 +#define F2FS_FEATURE_QUOTA_INO 0x0080 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -147,7 +150,7 @@ enum { #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) -#define DISCARD_ISSUE_RATE 8 +#define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ @@ -158,7 +161,6 @@ struct cp_control { __u64 trim_start; __u64 trim_end; __u64 trim_minlen; - __u64 trimmed; }; /* @@ -177,12 +179,14 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ + FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. list */ }; struct ino_entry { - struct list_head list; /* list head */ - nid_t ino; /* inode number */ + struct list_head list; /* list head */ + nid_t ino; /* inode number */ + unsigned int dirty_device; /* dirty device bitmap */ }; /* for the list of inodes to be GCed */ @@ -206,10 +210,6 @@ struct discard_entry { #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ (MAX_PLIST_NUM - 1) : (blk_num - 1)) -#define P_ACTIVE 0x01 -#define P_TRIM 0x02 -#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) - enum { D_PREP, D_SUBMIT, @@ -241,12 +241,32 @@ struct discard_cmd { int error; /* bio error */ }; +enum { + DPOLICY_BG, + DPOLICY_FORCE, + DPOLICY_FSTRIM, + DPOLICY_UMOUNT, + MAX_DPOLICY, +}; + +struct discard_policy { + int type; /* type of discard */ + unsigned int min_interval; /* used for candidates exist */ + unsigned int max_interval; /* used for candidates not exist */ + unsigned int max_requests; /* # of discards issued per round */ + unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ + bool io_aware; /* issue discard in idle time */ + bool sync; /* submit discard with REQ_SYNC flag */ + unsigned int granularity; /* discard granularity */ +}; + struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ struct list_head wait_list; /* store on-flushing entries */ + struct list_head fstrim_list; /* in-flight discard from fstrim */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ unsigned int discard_wake; /* to wake up discard thread */ struct mutex cmd_lock; @@ -379,11 +399,14 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 +#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); -#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ - (CUR_ADDRS_PER_INODE(inode) - \ - DEF_INLINE_RESERVED_SIZE - \ - F2FS_INLINE_XATTR_ADDRS)) +static inline int get_inline_xattr_addrs(struct inode *inode); +#define F2FS_INLINE_XATTR_ADDRS(inode) get_inline_xattr_addrs(inode) +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + F2FS_INLINE_XATTR_ADDRS(inode) - \ + DEF_INLINE_RESERVED_SIZE)) /* for inline dir */ #define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ @@ -583,6 +606,7 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ + struct list_head inmem_ilist; /* list for inmem inodes */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ @@ -593,6 +617,7 @@ struct f2fs_inode_info { int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ + int i_inline_xattr_size; /* inline xattr size */ }; static inline void get_extent_info(struct extent_info *ext, @@ -666,10 +691,13 @@ static inline void __try_update_largest_extent(struct inode *inode, } } -enum nid_list { - FREE_NID_LIST, - ALLOC_NID_LIST, - MAX_NID_LIST, +/* + * For free nid management + */ +enum nid_state { + FREE_NID, /* newly added to free nid list */ + PREALLOC_NID, /* it is preallocated */ + MAX_NID_STATE, }; struct f2fs_nm_info { @@ -692,8 +720,8 @@ struct f2fs_nm_info { /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ - struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */ - unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ + struct list_head free_nid_list; /* list for free nids excluding preallocated nids */ + unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; @@ -771,6 +799,7 @@ enum { struct flush_cmd { struct completion wait; struct llist_node llnode; + nid_t ino; int ret; }; @@ -789,6 +818,8 @@ struct f2fs_sm_info { struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ + struct rw_semaphore curseg_lock; /* for preventing curseg change */ + block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ block_t ssa_blkaddr; /* start block address of SSA area */ @@ -810,6 +841,7 @@ struct f2fs_sm_info { unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ unsigned int min_hot_blocks; /* threshold for hot block allocation */ + unsigned int min_ssr_sections; /* threshold to trigger SSR allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; @@ -831,6 +863,7 @@ struct f2fs_sm_info { enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, + F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, @@ -879,6 +912,18 @@ enum need_lock_type { LOCK_RETRY, }; +enum cp_reason_type { + CP_NO_NEEDED, + CP_NON_REGULAR, + CP_HARDLINK, + CP_SB_NEED_CP, + CP_WRONG_PINO, + CP_NO_SPC_ROLL, + CP_NODE_NEED_CP, + CP_FASTBOOT_MODE, + CP_SPEC_LOG_NUM, +}; + enum iostat_type { APP_DIRECT_IO, /* app direct IOs */ APP_BUFFERED_IO, /* app buffered IOs */ @@ -898,6 +943,7 @@ enum iostat_type { struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ + nid_t ino; /* inode number */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ enum temp_type temp; /* contains HOT/WARM/COLD */ int op; /* contains REQ_OP_ */ @@ -942,6 +988,7 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ + ATOMIC_FILE, /* for all atomic files */ NR_INODE_TYPE, }; @@ -1044,12 +1091,15 @@ struct f2fs_sb_info { loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ + int inline_xattr_size; /* inline xattr size */ + unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ block_t reserved_blocks; /* configurable reserved blocks */ + block_t current_reserved_blocks; /* current reserved blocks */ u32 s_next_generation; /* for NFS support */ @@ -1115,6 +1165,8 @@ struct f2fs_sb_info { struct list_head s_list; int s_ndevs; /* number of devices */ struct f2fs_dev_info *devs; /* for device list */ + unsigned int dirty_device; /* for checkpoint data flush */ + spinlock_t dev_lock; /* protect dirty_device */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -1178,8 +1230,7 @@ static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) { - struct timespec ts = {sbi->interval_time[type], 0}; - unsigned long interval = timespec_to_jiffies(&ts); + unsigned long interval = sbi->interval_time[type] * HZ; return time_after(jiffies, sbi->last_time[type] + interval); } @@ -1346,6 +1397,13 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } +static inline unsigned long f2fs_qf_ino(struct super_block *sb, int type) +{ + if (type < F2FS_MAX_QUOTAS) + return le32_to_cpu(F2FS_SB(sb)->raw_super->qf_ino[type]); + return 0; +} + static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) { size_t crc_offset = le32_to_cpu(cp->checksum_offset); @@ -1524,7 +1582,8 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); sbi->total_valid_block_count += (block_t)(*count); - avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks; + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; @@ -1558,6 +1617,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; + if (sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->current_reserved_blocks + count); spin_unlock(&sbi->stat_lock); f2fs_i_blocks_write(inode, count, false, true); } @@ -1578,6 +1641,8 @@ static inline void inode_inc_dirty_pages(struct inode *inode) atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1594,6 +1659,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode) atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) @@ -1701,10 +1768,17 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, return ret; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); + goto enospc; + } +#endif + spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - if (unlikely(valid_block_count + sbi->reserved_blocks > + if (unlikely(valid_block_count + sbi->current_reserved_blocks > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; @@ -1747,6 +1821,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, sbi->total_valid_node_count--; sbi->total_valid_block_count--; + if (sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks++; spin_unlock(&sbi->stat_lock); @@ -1793,6 +1870,19 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); } +static inline struct page *f2fs_pagecache_get_page( + struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp_mask) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { + f2fs_show_injection_info(FAULT_PAGE_GET); + return NULL; + } +#endif + return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); +} + static inline void f2fs_copy_page(struct page *src, struct page *dst) { char *src_kaddr = kmap(src); @@ -1842,15 +1932,25 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, return entry; } -static inline struct bio *f2fs_bio_alloc(int npages) +static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, + int npages, bool no_fail) { struct bio *bio; - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - if (!bio) - bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); - return bio; + if (no_fail) { + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + if (!bio) + bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); + return bio; + } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { + f2fs_show_injection_info(FAULT_ALLOC_BIO); + return NULL; + } +#endif + return bio_alloc(GFP_KERNEL, npages); } static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, @@ -2160,25 +2260,20 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { - if (f2fs_has_inline_xattr(inode)) - return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; - return CUR_ADDRS_PER_INODE(inode); + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS(inode); } -static inline void *inline_xattr_addr(struct page *page) +static inline void *inline_xattr_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - - F2FS_INLINE_XATTR_ADDRS]); + F2FS_INLINE_XATTR_ADDRS(inode)]); } static inline int inline_xattr_size(struct inode *inode) { - if (f2fs_has_inline_xattr(inode)) - return F2FS_INLINE_XATTR_ADDRS << 2; - else - return 0; + return get_inline_xattr_addrs(inode) * sizeof(__le32); } static inline int f2fs_has_inline_data(struct inode *inode) @@ -2259,9 +2354,10 @@ static inline void clear_file(struct inode *inode, int type) static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { + bool ret; + if (dsync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool ret; spin_lock(&sbi->inode_lock[DIRTY_META]); ret = list_empty(&F2FS_I(inode)->gdirty_list); @@ -2272,12 +2368,17 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) file_keep_isize(inode) || i_size_read(inode) & PAGE_MASK) return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); + + down_read(&F2FS_I(inode)->i_sem); + ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); + up_read(&F2FS_I(inode)->i_sem); + + return ret; } static inline int f2fs_readonly(struct super_block *sb) { - return sb->s_flags & MS_RDONLY; + return sb->s_flags & SB_RDONLY; } static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) @@ -2322,6 +2423,12 @@ static inline int get_extra_isize(struct inode *inode) return F2FS_I(inode)->i_extra_isize / sizeof(__le32); } +static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb); +static inline int get_inline_xattr_addrs(struct inode *inode) +{ + return F2FS_I(inode)->i_inline_xattr_size; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -2450,7 +2557,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); -void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); @@ -2478,7 +2585,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int truncate_inode_blocks(struct inode *inode, pgoff_t from); -int truncate_xattr_node(struct inode *inode, struct page *page); +int truncate_xattr_node(struct inode *inode); int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); int remove_inode_page(struct inode *inode); struct page *new_inode_page(struct inode *inode); @@ -2513,19 +2620,22 @@ void destroy_node_manager_caches(void); */ bool need_SSR(struct f2fs_sb_info *sbi); void register_inmem_page(struct inode *inode, struct page *page); +void drop_inmem_pages_all(struct f2fs_sb_info *sbi); void drop_inmem_pages(struct inode *inode); void drop_inmem_page(struct inode *inode, struct page *page); int commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); -int f2fs_issue_flush(struct f2fs_sb_info *sbi); +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); int create_flush_cmd_control(struct f2fs_sb_info *sbi); +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); -void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); +void init_discard_policy(struct discard_policy *dpolicy, int discard_type, + unsigned int granularity); void stop_discard_thread(struct f2fs_sb_info *sbi); -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount); +bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); @@ -2580,6 +2690,10 @@ void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void release_ino_entry(struct f2fs_sb_info *sbi, bool all); bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); +bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); int acquire_orphan_inode(struct f2fs_sb_info *sbi); void release_orphan_inode(struct f2fs_sb_info *sbi); @@ -2667,14 +2781,16 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; + int ndirty_data, ndirty_qdata; int inmem_pages; - unsigned int ndirty_dirs, ndirty_files, ndirty_all; + unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; - int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_flushing, nr_flushed, flush_list_empty; + int nr_discarding, nr_discarded; int nr_discard_cmd; unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; @@ -2949,6 +3065,7 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION file_set_encrypt(inode); + inode->i_flags |= S_ENCRYPTED; #endif } @@ -2982,6 +3099,16 @@ static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); } +static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); +} + +static inline int f2fs_sb_has_quota_ino(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 517e112c8a9a..7874bbd7311d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -53,6 +53,11 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf) struct dnode_of_data dn; int err; + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto err; + } + sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -114,6 +119,7 @@ out_sem: out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); +err: return block_page_mkwrite_return(err); } @@ -138,27 +144,29 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) return 1; } -static inline bool need_do_checkpoint(struct inode *inode) +static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool need_cp = false; + enum cp_reason_type cp_reason = CP_NO_NEEDED; - if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) - need_cp = true; + if (!S_ISREG(inode->i_mode)) + cp_reason = CP_NON_REGULAR; + else if (inode->i_nlink != 1) + cp_reason = CP_HARDLINK; else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) - need_cp = true; + cp_reason = CP_SB_NEED_CP; else if (file_wrong_pino(inode)) - need_cp = true; + cp_reason = CP_WRONG_PINO; else if (!space_for_roll_forward(sbi)) - need_cp = true; + cp_reason = CP_NO_SPC_ROLL; else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) - need_cp = true; + cp_reason = CP_NODE_NEED_CP; else if (test_opt(sbi, FASTBOOT)) - need_cp = true; + cp_reason = CP_FASTBOOT_MODE; else if (sbi->active_logs == 2) - need_cp = true; + cp_reason = CP_SPEC_LOG_NUM; - return need_cp; + return cp_reason; } static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino) @@ -193,7 +201,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t ino = inode->i_ino; int ret = 0; - bool need_cp = false; + enum cp_reason_type cp_reason = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, @@ -212,7 +220,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, clear_inode_flag(inode, FI_NEED_IPU); if (ret) { - trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); return ret; } @@ -243,10 +251,10 @@ go_write: * sudden-power-off. */ down_read(&F2FS_I(inode)->i_sem); - need_cp = need_do_checkpoint(inode); + cp_reason = need_do_checkpoint(inode); up_read(&F2FS_I(inode)->i_sem); - if (need_cp) { + if (cp_reason) { /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); @@ -294,37 +302,43 @@ sync_nodes: remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - remove_ino_entry(sbi, ino, UPDATE_INO); - clear_inode_flag(inode, FI_UPDATE_WRITE); if (!atomic) - ret = f2fs_issue_flush(sbi); + ret = f2fs_issue_flush(sbi, inode->i_ino); + if (!ret) { + remove_ino_entry(sbi, ino, UPDATE_INO); + clear_inode_flag(inode, FI_UPDATE_WRITE); + remove_ino_entry(sbi, ino, FLUSH_INO); + } f2fs_update_time(sbi, REQ_TIME); out: - trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); f2fs_trace_ios(NULL, 1); return ret; } int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file))))) + return -EIO; return f2fs_do_sync_file(file, start, end, datasync, false); } static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { - struct pagevec pvec; + struct page *page; int nr_pages; if (whence != SEEK_DATA) return 0; /* find first dirty page index */ - pagevec_init(&pvec, 0); - nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, - PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX; - pagevec_release(&pvec); + nr_pages = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY, + 1, &page); + if (!nr_pages) + return ULONG_MAX; + pgofs = page->index; + put_page(page); return pgofs; } @@ -443,6 +457,9 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file_inode(file); int err; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + /* we don't need to use inline_data strictly */ err = f2fs_convert_inline_inode(inode); if (err) @@ -629,6 +646,9 @@ int f2fs_truncate(struct inode *inode) { int err; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) return 0; @@ -683,6 +703,12 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, STATX_ATTR_NODUMP); generic_fillattr(inode, stat); + + /* we need to show initial sectors used for inline_data/dentries */ + if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || + f2fs_has_inline_dentry(inode)) + stat->blocks += (stat->size + 511) >> 9; + return 0; } @@ -722,6 +748,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) int err; bool size_changed = false; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + err = setattr_prepare(dentry, attr); if (err) return err; @@ -774,6 +803,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) inode->i_mtime = inode->i_ctime = current_time(inode); } + down_write(&F2FS_I(inode)->i_sem); + F2FS_I(inode)->last_disk_size = i_size_read(inode); + up_write(&F2FS_I(inode)->i_sem); + size_changed = true; } @@ -844,7 +877,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); if (err) { if (err == -ENOENT) { - pg_start++; + pg_start = get_next_page_offset(&dn, pg_start); continue; } return err; @@ -1159,11 +1192,14 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + truncate_pagecache(inode, offset); ret = f2fs_do_collapse(inode, pg_start, pg_end); if (ret) - goto out; + goto out_unlock; /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1175,7 +1211,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = truncate_blocks(inode, new_size, true); if (!ret) f2fs_i_size_write(inode, new_size); - +out_unlock: + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); out: up_write(&F2FS_I(inode)->i_mmap_sem); return ret; @@ -1358,6 +1395,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + truncate_pagecache(inode, offset); pg_start = offset >> PAGE_SHIFT; @@ -1385,6 +1425,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); + + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); out: up_write(&F2FS_I(inode)->i_mmap_sem); return ret; @@ -1434,8 +1476,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; } - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) - f2fs_i_size_write(inode, new_size); + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); + else + f2fs_i_size_write(inode, new_size); + } return err; } @@ -1446,6 +1492,9 @@ static long f2fs_fallocate(struct file *file, int mode, struct inode *inode = file_inode(file); long ret = 0; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + /* f2fs only support ->fallocate for regular file */ if (!S_ISREG(inode->i_mode)) return -EINVAL; @@ -1479,8 +1528,6 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); - if (mode & FALLOC_FL_KEEP_SIZE) - file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1882,6 +1929,9 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); + if (!f2fs_sb_has_crypto(inode->i_sb)) + return -EOPNOTSUPP; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); @@ -1889,6 +1939,8 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { + if (!f2fs_sb_has_crypto(file_inode(filp)->i_sb)) + return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } @@ -2244,9 +2296,13 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); + down_write(&F2FS_I(src)->dio_rwsem[WRITE]); if (src != dst) { - if (!inode_trylock(dst)) { - ret = -EBUSY; + ret = -EBUSY; + if (!inode_trylock(dst)) + goto out; + if (!down_write_trylock(&F2FS_I(dst)->dio_rwsem[WRITE])) { + inode_unlock(dst); goto out; } } @@ -2306,9 +2362,12 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } f2fs_unlock_op(sbi); out_unlock: - if (src != dst) + if (src != dst) { + up_write(&F2FS_I(dst)->dio_rwsem[WRITE]); inode_unlock(dst); + } out: + up_write(&F2FS_I(src)->dio_rwsem[WRITE]); inode_unlock(src); return ret; } @@ -2624,6 +2683,9 @@ static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) + return -EIO; + switch (cmd) { case F2FS_IOC_GETFLAGS: return f2fs_ioc_getflags(filp, arg); @@ -2681,6 +2743,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct blk_plug plug; ssize_t ret; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { @@ -2691,6 +2756,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) err = f2fs_preallocate_blocks(iocb, from); if (err) { + clear_inode_flag(inode, FI_NO_PREALLOC); inode_unlock(inode); return err; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index bfe6a8ccc3a0..d844dcb80570 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -267,16 +267,6 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } -static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, - unsigned int segno) -{ - unsigned int valid_blocks = - get_valid_blocks(sbi, segno, true); - - return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? - valid_blocks * 2 : valid_blocks; -} - static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { @@ -285,7 +275,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) - return get_greedy_cost(sbi, segno); + return get_valid_blocks(sbi, segno, true); else return get_cb_cost(sbi, segno); } @@ -466,10 +456,10 @@ static int check_valid_map(struct f2fs_sb_info *sbi, struct seg_entry *sentry; int ret; - mutex_lock(&sit_i->sentry_lock); + down_read(&sit_i->sentry_lock); sentry = get_seg_entry(sbi, segno); ret = f2fs_test_bit(offset, sentry->cur_valid_map); - mutex_unlock(&sit_i->sentry_lock); + up_read(&sit_i->sentry_lock); return ret; } @@ -608,6 +598,7 @@ static void move_data_block(struct inode *inode, block_t bidx, { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, .type = DATA, .temp = COLD, .op = REQ_OP_READ, @@ -659,8 +650,8 @@ static void move_data_block(struct inode *inode, block_t bidx, allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, CURSEG_COLD_DATA, NULL, false); - fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, - FGP_LOCK | FGP_CREAT, GFP_NOFS); + fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), + newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); if (!fio.encrypted_page) { err = -ENOMEM; goto recover_block; @@ -738,6 +729,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, .type = DATA, .temp = COLD, .op = REQ_OP_WRITE, @@ -840,10 +832,17 @@ next_step: continue; } + if (!down_write_trylock( + &F2FS_I(inode)->dio_rwsem[WRITE])) { + iput(inode); + continue; + } + start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, start_bidx + ofs_in_node, REQ_RAHEAD, true); + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); continue; @@ -901,10 +900,10 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, struct sit_info *sit_i = SIT_I(sbi); int ret; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, NO_CHECK_TYPE, LFS); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); return ret; } @@ -952,8 +951,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, /* * this is to avoid deadlock: * - lock_page(sum_page) - f2fs_replace_block - * - check_valid_map() - mutex_lock(sentry_lock) - * - mutex_lock(sentry_lock) - change_curseg() + * - check_valid_map() - down_write(sentry_lock) + * - down_read(sentry_lock) - change_curseg() * - lock_page(sum_page) */ if (type == SUM_TYPE_NODE) @@ -1006,7 +1005,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); gc_more: - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { + if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; goto stop; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 8322e4e7bb3f..90e38d8ea688 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -112,6 +112,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), + .ino = dn->inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 50c88e37ed66..b4c4f2b25304 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -43,8 +43,11 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & FS_DIRSYNC_FL) new_fl |= S_DIRSYNC; + if (f2fs_encrypted_inode(inode)) + new_fl |= S_ENCRYPTED; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC| + S_ENCRYPTED); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -232,6 +235,23 @@ static int do_read_inode(struct inode *inode) fi->i_extra_isize = f2fs_has_extra_attr(inode) ? le16_to_cpu(ri->i_extra_isize) : 0; + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size); + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + fi->i_inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } else { + + /* + * Previous inline data or directory always reserved 200 bytes + * in inode layout, even if inline_xattr is disabled. In order + * to keep inline_dentry's structure for backward compatibility, + * we get the space back only from inline_data. + */ + fi->i_inline_xattr_size = 0; + } + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -384,6 +404,10 @@ int update_inode(struct inode *inode, struct page *node_page) if (f2fs_has_extra_attr(inode)) { ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)->sb)) + ri->i_inline_xattr_size = + cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size); + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_projid)) { @@ -480,6 +504,7 @@ void f2fs_evict_inode(struct inode *inode) remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); @@ -519,8 +544,10 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG))) f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + else + f2fs_inode_synced(inode); /* ino == 0, if f2fs_new_inode() was failed t*/ if (inode->i_ino) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a4dab98c4b7b..28bdf8828e73 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -29,6 +29,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_t ino; struct inode *inode; bool nid_free = false; + int xattr_size = 0; int err; inode = new_inode(dir->i_sb); @@ -86,11 +87,23 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(inode, FI_INLINE_DENTRY); + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + if (f2fs_has_inline_xattr(inode)) + xattr_size = sbi->inline_xattr_size; + /* Otherwise, will be 0 */ + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } + F2FS_I(inode)->i_inline_xattr_size = xattr_size; + f2fs_init_extent_tree(inode, NULL); stat_inc_inline_xattr(inode); @@ -177,6 +190,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -221,6 +237,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct f2fs_sb_info *sbi = F2FS_I_SB(dir); int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (f2fs_encrypted_inode(dir) && !fscrypt_has_permitted_context(dir, inode)) return -EPERM; @@ -331,12 +350,15 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct f2fs_dir_entry *de; struct page *page; - nid_t ino; + struct dentry *new; + nid_t ino = -1; int err = 0; unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir)); + trace_f2fs_lookup_start(dir, dentry, flags); + if (f2fs_encrypted_inode(dir)) { - int res = fscrypt_get_encryption_info(dir); + err = fscrypt_get_encryption_info(dir); /* * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is @@ -346,18 +368,22 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (fscrypt_has_encryption_key(dir)) fscrypt_set_encrypted_dentry(dentry); fscrypt_set_d_op(dentry); - if (res && res != -ENOKEY) - return ERR_PTR(res); + if (err && err != -ENOKEY) + goto out; } - if (dentry->d_name.len > F2FS_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > F2FS_NAME_LEN) { + err = -ENAMETOOLONG; + goto out; + } de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { - if (IS_ERR(page)) - return (struct dentry *)page; - return d_splice_alias(inode, dentry); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + goto out_splice; } ino = le32_to_cpu(de->ino); @@ -365,19 +391,21 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, f2fs_put_page(page, 0); inode = f2fs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) { err = __recover_dot_dentries(dir, root_ino); if (err) - goto err_out; + goto out_iput; } if (f2fs_has_inline_dots(inode)) { err = __recover_dot_dentries(inode, dir->i_ino); if (err) - goto err_out; + goto out_iput; } if (f2fs_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && @@ -386,12 +414,18 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, "Inconsistent encryption contexts: %lu/%lu", dir->i_ino, inode->i_ino); err = -EPERM; - goto err_out; + goto out_iput; } - return d_splice_alias(inode, dentry); - -err_out: +out_splice: + new = d_splice_alias(inode, dentry); + if (IS_ERR(new)) + err = PTR_ERR(new); + trace_f2fs_lookup_end(dir, dentry, ino, err); + return new; +out_iput: iput(inode); +out: + trace_f2fs_lookup_end(dir, dentry, ino, err); return ERR_PTR(err); } @@ -405,9 +439,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; + err = dquot_initialize(inode); + if (err) + return err; de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { @@ -460,6 +500,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct fscrypt_symlink_data *sd = NULL; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (f2fs_encrypted_inode(dir)) { err = fscrypt_get_encryption_info(dir); if (err) @@ -566,6 +609,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -618,6 +664,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -712,6 +761,9 @@ out: static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + return -EIO; + if (f2fs_encrypted_inode(dir)) { int err = fscrypt_get_encryption_info(dir); if (err) @@ -723,6 +775,9 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + return -EIO; + return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout); } @@ -742,6 +797,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if ((f2fs_encrypted_inode(old_dir) && !fscrypt_has_encryption_key(old_dir)) || (f2fs_encrypted_inode(new_dir) && @@ -767,6 +825,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto out; + if (new_inode) { + err = dquot_initialize(new_inode); + if (err) + goto out; + } + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -935,6 +999,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if ((f2fs_encrypted_inode(old_dir) && !fscrypt_has_encryption_key(old_dir)) || (f2fs_encrypted_inode(new_dir) && diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index fca87835a1da..d3322752426f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -46,7 +46,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + mem_size = (nm_i->nid_cnt[FREE_NID] * sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { @@ -63,7 +63,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) } else if (type == INO_ENTRIES) { int i; - for (i = 0; i <= UPDATE_INO; i++) + for (i = 0; i < MAX_INO_ENTRY; i++) mem_size += sbi->im[i].ino_num * sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; @@ -74,6 +74,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == INMEM_PAGES) { + /* it allows 20% / total_ram for inmemory pages */ + mem_size = get_pages(sbi, F2FS_INMEM_PAGES); + res = mem_size < (val.totalram / 5); } else { if (!sbi->sb->s_bdi->wb.dirty_exceeded) return true; @@ -134,6 +138,44 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } +static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail) +{ + struct nat_entry *new; + + if (no_fail) + new = f2fs_kmem_cache_alloc(nat_entry_slab, + GFP_NOFS | __GFP_ZERO); + else + new = kmem_cache_alloc(nat_entry_slab, + GFP_NOFS | __GFP_ZERO); + if (new) { + nat_set_nid(new, nid); + nat_reset_flag(new); + } + return new; +} + +static void __free_nat_entry(struct nat_entry *e) +{ + kmem_cache_free(nat_entry_slab, e); +} + +/* must be locked by nat_tree_lock */ +static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, + struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail) +{ + if (no_fail) + f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne); + else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne)) + return NULL; + + if (raw_ne) + node_info_from_raw_nat(&ne->ni, raw_ne); + list_add_tail(&ne->list, &nm_i->nat_entries); + nm_i->nat_cnt++; + return ne; +} + static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) { return radix_tree_lookup(&nm_i->nat_root, n); @@ -150,7 +192,7 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) list_del(&e->list); radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); nm_i->nat_cnt--; - kmem_cache_free(nat_entry_slab, e); + __free_nat_entry(e); } static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, @@ -246,49 +288,29 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) return need_update; } -static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, - bool no_fail) -{ - struct nat_entry *new; - - if (no_fail) { - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); - } else { - new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - if (!new) - return NULL; - if (radix_tree_insert(&nm_i->nat_root, nid, new)) { - kmem_cache_free(nat_entry_slab, new); - return NULL; - } - } - - memset(new, 0, sizeof(struct nat_entry)); - nat_set_nid(new, nid); - nat_reset_flag(new); - list_add_tail(&new->list, &nm_i->nat_entries); - nm_i->nat_cnt++; - return new; -} - +/* must be locked by nat_tree_lock */ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nat_entry *ne) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct nat_entry *e; + struct nat_entry *new, *e; + new = __alloc_nat_entry(nid, false); + if (!new) + return; + + down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); - if (!e) { - e = grab_nat_entry(nm_i, nid, false); - if (e) - node_info_from_raw_nat(&e->ni, ne); - } else { + if (!e) + e = __init_nat_entry(nm_i, new, ne, false); + else f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || nat_get_blkaddr(e) != le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); - } + up_write(&nm_i->nat_tree_lock); + if (e != new) + __free_nat_entry(new); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -296,11 +318,12 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; + struct nat_entry *new = __alloc_nat_entry(ni->nid, true); down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { - e = grab_nat_entry(nm_i, ni->nid, true); + e = __init_nat_entry(nm_i, new, NULL, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -312,6 +335,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR); } + /* let's free early to reduce memory consumption */ + if (e != new) + __free_nat_entry(new); /* sanity check */ f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr); @@ -327,10 +353,6 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); nat_set_version(e, inc_node_version(version)); - - /* in order to reuse the nid */ - if (nm_i->next_scan_nid > ni->nid) - nm_i->next_scan_nid = ni->nid; } /* change address */ @@ -424,9 +446,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) f2fs_put_page(page, 1); cache: /* cache nat entry */ - down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); - up_write(&nm_i->nat_tree_lock); } /* @@ -962,7 +982,8 @@ fail: return err > 0 ? 0 : err; } -int truncate_xattr_node(struct inode *inode, struct page *page) +/* caller must lock inode page */ +int truncate_xattr_node(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t nid = F2FS_I(inode)->i_xattr_nid; @@ -978,10 +999,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page) f2fs_i_xnid_write(inode, 0); - set_new_dnode(&dn, inode, page, npage, nid); - - if (page) - dn.inode_page_locked = true; + set_new_dnode(&dn, inode, NULL, npage, nid); truncate_node(&dn); return 0; } @@ -1000,7 +1018,7 @@ int remove_inode_page(struct inode *inode) if (err) return err; - err = truncate_xattr_node(inode, dn.inode_page); + err = truncate_xattr_node(inode); if (err) { f2fs_put_dnode(&dn); return err; @@ -1220,7 +1238,8 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) if (!inode) return; - page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0); + page = f2fs_pagecache_get_page(inode->i_mapping, 0, + FGP_LOCK|FGP_NOWAIT, 0); if (!page) goto iput_out; @@ -1244,54 +1263,19 @@ iput_out: iput(inode); } -void move_node_page(struct page *node_page, int gc_type) -{ - if (gc_type == FG_GC) { - struct f2fs_sb_info *sbi = F2FS_P_SB(node_page); - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 1, - .for_reclaim = 0, - }; - - set_page_dirty(node_page); - f2fs_wait_on_page_writeback(node_page, NODE, true); - - f2fs_bug_on(sbi, PageWriteback(node_page)); - if (!clear_page_dirty_for_io(node_page)) - goto out_page; - - if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc)) - unlock_page(node_page); - goto release_page; - } else { - /* set page dirty and write it */ - if (!PageWriteback(node_page)) - set_page_dirty(node_page); - } -out_page: - unlock_page(node_page); -release_page: - f2fs_put_page(node_page, 0); -} - static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index, end; + pgoff_t index; struct pagevec pvec; struct page *last_page = NULL; + int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = 0; - end = ULONG_MAX; - - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1344,6 +1328,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, + .ino = ino_of_node(page), .type = NODE, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), @@ -1416,6 +1401,37 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } +void move_node_page(struct page *node_page, int gc_type) +{ + if (gc_type == FG_GC) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + set_page_dirty(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); + + f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page)); + if (!clear_page_dirty_for_io(node_page)) + goto out_page; + + if (__write_node_page(node_page, false, NULL, + &wbc, false, FS_GC_NODE_IO)) + unlock_page(node_page); + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); +} + static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { @@ -1425,13 +1441,14 @@ static int f2fs_write_node_page(struct page *page, int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { - pgoff_t index, end; + pgoff_t index; pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; + int nr_pages; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1439,17 +1456,12 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, return PTR_ERR_OR_ZERO(last_page); } retry: - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = 0; - end = ULONG_MAX; - - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1548,25 +1560,21 @@ out: int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type) { - pgoff_t index, end; + pgoff_t index; struct pagevec pvec; int step = 0; int nwritten = 0; int ret = 0; + int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next_step: index = 0; - end = ULONG_MAX; - - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1655,27 +1663,20 @@ out: int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index = 0, end = ULONG_MAX; + pgoff_t index = 0; struct pagevec pvec; int ret2, ret = 0; + int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_WRITEBACK))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* until radix tree lookup accepts end_index */ - if (unlikely(page->index > end)) - continue; - if (ino && ino_of_node(page) == ino) { f2fs_wait_on_page_writeback(page, NODE, true); if (TestClearPageError(page)) @@ -1761,35 +1762,54 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static int __insert_nid_to_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list, bool new) +static int __insert_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_state state) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (new) { - int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); - if (err) - return err; - } + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; - f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : - i->state != NID_ALLOC); - nm_i->nid_cnt[list]++; - list_add_tail(&i->list, &nm_i->nid_list[list]); + f2fs_bug_on(sbi, state != i->state); + nm_i->nid_cnt[state]++; + if (state == FREE_NID) + list_add_tail(&i->list, &nm_i->free_nid_list); return 0; } -static void __remove_nid_from_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list, bool reuse) +static void __remove_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_state state) { struct f2fs_nm_info *nm_i = NM_I(sbi); - f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : - i->state != NID_ALLOC); - nm_i->nid_cnt[list]--; - list_del(&i->list); - if (!reuse) - radix_tree_delete(&nm_i->free_nid_root, i->nid); + f2fs_bug_on(sbi, state != i->state); + nm_i->nid_cnt[state]--; + if (state == FREE_NID) + list_del(&i->list); + radix_tree_delete(&nm_i->free_nid_root, i->nid); +} + +static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, + enum nid_state org_state, enum nid_state dst_state) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, org_state != i->state); + i->state = dst_state; + nm_i->nid_cnt[org_state]--; + nm_i->nid_cnt[dst_state]++; + + switch (dst_state) { + case PREALLOC_NID: + list_del(&i->list); + break; + case FREE_NID: + list_add_tail(&i->list, &nm_i->free_nid_list); + break; + default: + BUG_ON(1); + } } /* return if the nid is recognized as free */ @@ -1807,7 +1827,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; - i->state = NID_NEW; + i->state = FREE_NID; if (radix_tree_preload(GFP_NOFS)) goto err; @@ -1820,7 +1840,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) * - f2fs_create * - f2fs_new_inode * - alloc_nid - * - __insert_nid_to_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(PREALLOC_NID) * - f2fs_balance_fs_bg * - build_free_nids * - __build_free_nids @@ -1833,8 +1853,8 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) * - new_node_page * - set_node_addr * - alloc_nid_done - * - __remove_nid_from_list(ALLOC_NID_LIST) - * - __insert_nid_to_list(FREE_NID_LIST) + * - __remove_nid_from_list(PREALLOC_NID) + * - __insert_nid_to_list(FREE_NID) */ ne = __lookup_nat_cache(nm_i, nid); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || @@ -1843,13 +1863,13 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) e = __lookup_free_nid_list(nm_i, nid); if (e) { - if (e->state == NID_NEW) + if (e->state == FREE_NID) ret = true; goto err_out; } } ret = true; - err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); + err = __insert_free_nid(sbi, i, FREE_NID); err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); @@ -1867,8 +1887,8 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - if (i && i->state == NID_NEW) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + if (i && i->state == FREE_NID) { + __remove_free_nid(sbi, i, FREE_NID); need_free = true; } spin_unlock(&nm_i->nid_list_lock); @@ -1887,15 +1907,18 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) return; - if (set) + if (set) { + if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - else - __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - - if (set) nm_i->free_nid_count[nat_ofs]++; - else if (!build) - nm_i->free_nid_count[nat_ofs]--; + } else { + if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + if (!build) + nm_i->free_nid_count[nat_ofs]--; + } } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1930,12 +1953,32 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +static void scan_curseg_cache(struct f2fs_sb_info *sbi) { - struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; + int i; + + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); +} + +static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int i, idx; + nid_t nid; down_read(&nm_i->nat_tree_lock); @@ -1945,40 +1988,27 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) if (!nm_i->free_nid_count[i]) continue; for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { - nid_t nid; - - if (!test_bit_le(idx, nm_i->free_nid_bitmap[i])) - continue; + idx = find_next_bit_le(nm_i->free_nid_bitmap[i], + NAT_ENTRY_PER_BLOCK, idx); + if (idx >= NAT_ENTRY_PER_BLOCK) + break; nid = i * NAT_ENTRY_PER_BLOCK + idx; add_free_nid(sbi, nid, true); - if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS) goto out; } } out: - down_read(&curseg->journal_rwsem); - for (i = 0; i < nats_in_cursum(journal); i++) { - block_t addr; - nid_t nid; + scan_curseg_cache(sbi); - addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); - nid = le32_to_cpu(nid_in_journal(journal, i)); - if (addr == NULL_ADDR) - add_free_nid(sbi, nid, true); - else - remove_free_nid(sbi, nid); - } - up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); } static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_journal *journal = curseg->journal; int i = 0; nid_t nid = nm_i->next_scan_nid; @@ -1986,7 +2016,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) nid = 0; /* Enough entries */ - if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return; if (!sync && !available_free_memory(sbi, FREE_NIDS)) @@ -1996,7 +2026,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) /* try to find free nids in free_nid_bitmap */ scan_free_nid_bits(sbi); - if (nm_i->nid_cnt[FREE_NID_LIST]) + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return; } @@ -2024,18 +2054,8 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) nm_i->next_scan_nid = nid; /* find free nids from current sum_pages */ - down_read(&curseg->journal_rwsem); - for (i = 0; i < nats_in_cursum(journal); i++) { - block_t addr; + scan_curseg_cache(sbi); - addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); - nid = le32_to_cpu(nid_in_journal(journal, i)); - if (addr == NULL_ADDR) - add_free_nid(sbi, nid, true); - else - remove_free_nid(sbi, nid); - } - up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), @@ -2073,15 +2093,13 @@ retry: } /* We should not use stale free nids created by build_free_nids */ - if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); - i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + if (nm_i->nid_cnt[FREE_NID] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); + i = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); *nid = i->nid; - __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); - i->state = NID_ALLOC; - __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID); nm_i->available_nids--; update_free_nid_bitmap(sbi, *nid, false, false); @@ -2107,7 +2125,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); + __remove_free_nid(sbi, i, PREALLOC_NID); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); @@ -2130,12 +2148,10 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(sbi, !i); if (!available_free_memory(sbi, FREE_NIDS)) { - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); + __remove_free_nid(sbi, i, PREALLOC_NID); need_free = true; } else { - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true); - i->state = NID_NEW; - __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); + __move_free_nid(sbi, i, PREALLOC_NID, FREE_NID); } nm_i->available_nids++; @@ -2154,20 +2170,19 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; spin_lock(&nm_i->nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], - list) { + list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { if (nr_shrink <= 0 || - nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) + nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) break; - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + __remove_free_nid(sbi, i, FREE_NID); kmem_cache_free(free_nid_slab, i); nr_shrink--; } @@ -2193,8 +2208,8 @@ void recover_inline_xattr(struct inode *inode, struct page *page) goto update_inode; } - dst_addr = inline_xattr_addr(ipage); - src_addr = inline_xattr_addr(page); + dst_addr = inline_xattr_addr(inode, ipage); + src_addr = inline_xattr_addr(inode, page); inline_size = inline_xattr_size(inode); f2fs_wait_on_page_writeback(ipage, NODE, true); @@ -2283,6 +2298,12 @@ retry: dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); if (dst->i_inline & F2FS_EXTRA_ATTR) { dst->i_extra_isize = src->i_extra_isize; + + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_inline_xattr_size)) + dst->i_inline_xattr_size = src->i_inline_xattr_size; + if (f2fs_sb_has_project_quota(sbi->sb) && F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) @@ -2354,8 +2375,8 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = grab_nat_entry(nm_i, nid, true); - node_info_from_raw_nat(&ne->ni, &raw_ne); + ne = __alloc_nat_entry(nid, true); + __init_nat_entry(nm_i, ne, &raw_ne, true); } /* @@ -2401,15 +2422,17 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; struct f2fs_nat_block *nat_blk = page_address(page); int valid = 0; - int i; + int i = 0; if (!enabled_nat_bits(sbi, NULL)) return; - for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) { - if (start_nid == 0 && i == 0) - valid++; - if (nat_blk->entries[i].block_addr) + if (nat_index == 0) { + valid = 1; + i = 1; + } + for (; i < NAT_ENTRY_PER_BLOCK; i++) { + if (nat_blk->entries[i].block_addr != NULL_ADDR) valid++; } if (valid == 0) { @@ -2604,7 +2627,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) __set_bit_le(i, nm_i->nat_block_bitmap); nid = i * NAT_ENTRY_PER_BLOCK; - last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; + last_nid = nid + NAT_ENTRY_PER_BLOCK; spin_lock(&NM_I(sbi)->nid_list_lock); for (; nid < last_nid; nid++) @@ -2639,16 +2662,15 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - F2FS_RESERVED_NODE_NUM; - nm_i->nid_cnt[FREE_NID_LIST] = 0; - nm_i->nid_cnt[ALLOC_NID_LIST] = 0; + nm_i->nid_cnt[FREE_NID] = 0; + nm_i->nid_cnt[PREALLOC_NID] = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); - INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); + INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); @@ -2740,16 +2762,15 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy free nid list */ spin_lock(&nm_i->nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], - list) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { + __remove_free_nid(sbi, i, FREE_NID); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); - f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); - f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]); + f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]); + f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list)); spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index bb53e9955ff2..0ee3e5ff49a3 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -140,6 +140,7 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ + INMEM_PAGES, /* indicates inmemory pages */ BASE_CHECK, /* check kernel status */ }; @@ -150,18 +151,10 @@ struct nat_entry_set { unsigned int entry_cnt; /* the # of nat entries in set */ }; -/* - * For free nid mangement - */ -enum nid_state { - NID_NEW, /* newly added to free nid list */ - NID_ALLOC /* it is allocated */ -}; - struct free_nid { struct list_head list; /* for free node id list */ nid_t nid; /* node id */ - int state; /* in use or not: NID_NEW or NID_ALLOC */ + int state; /* in use or not: FREE_NID or PREALLOC_NID */ }; static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) @@ -170,12 +163,11 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *fnid; spin_lock(&nm_i->nid_list_lock); - if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + if (nm_i->nid_cnt[FREE_NID] <= 0) { spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], - struct free_nid, list); + fnid = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); *nid = fnid->nid; spin_unlock(&nm_i->nid_list_lock); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 9626758bc762..b3a14b0429f2 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -594,17 +594,20 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) int ret = 0; unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); - sbi->sb->s_flags &= ~MS_RDONLY; + sbi->sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= MS_ACTIVE; + sbi->sb->s_flags |= SB_ACTIVE; /* Turn on quotas so that they are updated correctly */ - f2fs_enable_quota_files(sbi); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", @@ -665,9 +668,10 @@ skip: out: #ifdef CONFIG_QUOTA /* Turn quotas off */ - f2fs_quota_off_umount(sbi->sb); + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); #endif - sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c695ff462ee6..c117e0913f2a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -181,11 +181,12 @@ bool need_SSR(struct f2fs_sb_info *sbi) return true; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - 2 * reserved_sections(sbi)); + SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } void register_inmem_page(struct inode *inode, struct page *page) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct inmem_pages *new; @@ -204,6 +205,10 @@ void register_inmem_page(struct inode *inode, struct page *page) mutex_lock(&fi->inmem_lock); get_page(page); list_add_tail(&new->list, &fi->inmem_pages); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (list_empty(&fi->inmem_ilist)) + list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); mutex_unlock(&fi->inmem_lock); @@ -262,12 +267,41 @@ next: return err; } +void drop_inmem_pages_all(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; + struct inode *inode; + struct f2fs_inode_info *fi; +next: + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + return; + } + fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + + if (inode) { + drop_inmem_pages(inode); + iput(inode); + } + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto next; +} + void drop_inmem_pages(struct inode *inode) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); @@ -313,6 +347,7 @@ static int __commit_inmem_pages(struct inode *inode, struct inmem_pages *cur, *tmp; struct f2fs_io_info fio = { .sbi = sbi, + .ino = inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, @@ -398,6 +433,10 @@ int commit_inmem_pages(struct inode *inode) /* drop all uncommitted pages */ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); } + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_COMMIT); @@ -472,7 +511,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio = f2fs_bio_alloc(0); + struct bio *bio = f2fs_bio_alloc(sbi, 0, true); int ret; bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; @@ -485,15 +524,17 @@ static int __submit_flush_wait(struct f2fs_sb_info *sbi, return ret; } -static int submit_flush_wait(struct f2fs_sb_info *sbi) +static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) { - int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); + int ret = 0; int i; - if (!sbi->s_ndevs || ret) - return ret; + if (!sbi->s_ndevs) + return __submit_flush_wait(sbi, sbi->sb->s_bdev); - for (i = 1; i < sbi->s_ndevs; i++) { + for (i = 0; i < sbi->s_ndevs; i++) { + if (!is_dirty_device(sbi, ino, i, FLUSH_INO)) + continue; ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) break; @@ -519,7 +560,9 @@ repeat: fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - ret = submit_flush_wait(sbi); + cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode); + + ret = submit_flush_wait(sbi, cmd->ino); atomic_inc(&fcc->issued_flush); llist_for_each_entry_safe(cmd, next, @@ -537,7 +580,7 @@ repeat: goto repeat; } -int f2fs_issue_flush(struct f2fs_sb_info *sbi) +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; @@ -547,19 +590,20 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return 0; if (!test_opt(sbi, FLUSH_MERGE)) { - ret = submit_flush_wait(sbi); + ret = submit_flush_wait(sbi, ino); atomic_inc(&fcc->issued_flush); return ret; } - if (atomic_inc_return(&fcc->issing_flush) == 1) { - ret = submit_flush_wait(sbi); + if (atomic_inc_return(&fcc->issing_flush) == 1 || sbi->s_ndevs > 1) { + ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->issing_flush); atomic_inc(&fcc->issued_flush); return ret; } + cmd.ino = ino; init_completion(&cmd.wait); llist_add(&cmd.llnode, &fcc->issue_list); @@ -583,7 +627,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) } else { struct flush_cmd *tmp, *next; - ret = submit_flush_wait(sbi); + ret = submit_flush_wait(sbi, ino); llist_for_each_entry_safe(tmp, next, list, llnode) { if (tmp == &cmd) { @@ -653,6 +697,28 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) } } +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) +{ + int ret = 0, i; + + if (!sbi->s_ndevs) + return 0; + + for (i = 1; i < sbi->s_ndevs; i++) { + if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) + continue; + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; + + spin_lock(&sbi->dev_lock); + f2fs_clear_bit(i, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + return ret; +} + static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { @@ -794,6 +860,8 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len); + f2fs_bug_on(sbi, dc->ref); if (dc->error == -EOPNOTSUPP) @@ -845,10 +913,14 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi, /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, - struct discard_cmd *dc) + struct discard_policy *dpolicy, + struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? + &(dcc->fstrim_list) : &(dcc->wait_list); struct bio *bio = NULL; + int flag = dpolicy->sync ? REQ_SYNC : 0; if (dc->state != D_PREP) return; @@ -867,9 +939,9 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, if (bio) { bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; - bio->bi_opf |= REQ_SYNC; + bio->bi_opf |= flag; submit_bio(bio); - list_move_tail(&dc->list, &dcc->wait_list); + list_move_tail(&dc->list, wait_list); __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); f2fs_update_iostat(sbi, FS_DISCARD, 1); @@ -886,7 +958,7 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, struct rb_node *insert_parent) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct rb_node **p = &dcc->root.rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct discard_cmd *dc = NULL; @@ -1054,58 +1126,107 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } -static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + unsigned int start, unsigned int end) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + int issued; + +next: + issued = 0; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, start, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc && dc->lstart <= end) { + struct rb_node *node; + + if (dc->len < dpolicy->granularity) + goto skip; + + if (dc->state != D_PREP) { + list_move_tail(&dc->list, &dcc->fstrim_list); + goto skip; + } + + __submit_discard_cmd(sbi, dpolicy, dc); + + if (++issued >= dpolicy->max_requests) { + start = dc->lstart + dc->len; + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + schedule(); + + goto next; + } +skip: + node = rb_next(&dc->rb_node); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + + if (fatal_signal_pending(current)) + break; + } + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} + +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int iter = 0, issued = 0; - int i; + int i, iter = 0, issued = 0; bool io_interrupted = false; - mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); - blk_start_plug(&plug); - for (i = MAX_PLIST_NUM - 1; - i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (i + 1 < dpolicy->granularity) + break; pend_list = &dcc->pend_list[i]; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); - /* Hurry up to finish fstrim */ - if (dcc->pend_list_tag[i] & P_TRIM) { - __submit_discard_cmd(sbi, dc); - issued++; - - if (fatal_signal_pending(current)) - break; - continue; - } - - if (!issue_cond) { - __submit_discard_cmd(sbi, dc); - issued++; - continue; - } - - if (is_idle(sbi)) { - __submit_discard_cmd(sbi, dc); - issued++; - } else { + if (dpolicy->io_aware && i < dpolicy->io_aware_gran && + !is_idle(sbi)) { io_interrupted = true; + goto skip; } - if (++iter >= DISCARD_ISSUE_RATE) - goto out; + __submit_discard_cmd(sbi, dpolicy, dc); + issued++; +skip: + if (++iter >= dpolicy->max_requests) + break; } - if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) - dcc->pend_list_tag[i] &= (~P_TRIM); + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + if (iter >= dpolicy->max_requests) + break; } -out: - blk_finish_plug(&plug); - mutex_unlock(&dcc->cmd_lock); if (!issued && io_interrupted) issued = -1; @@ -1113,12 +1234,13 @@ out: return issued; } -static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; int i; + bool dropped = false; mutex_lock(&dcc->cmd_lock); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { @@ -1126,39 +1248,58 @@ static void __drop_discard_cmd(struct f2fs_sb_info *sbi) list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); __remove_discard_cmd(sbi, dc); + dropped = true; } } mutex_unlock(&dcc->cmd_lock); + + return dropped; } -static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, +static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + unsigned int len = 0; wait_for_completion_io(&dc->wait); mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, dc->state != D_DONE); dc->ref--; - if (!dc->ref) + if (!dc->ref) { + if (!dc->error) + len = dc->len; __remove_discard_cmd(sbi, dc); + } mutex_unlock(&dcc->cmd_lock); + + return len; } -static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + block_t start, block_t end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = &(dcc->wait_list); + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? + &(dcc->fstrim_list) : &(dcc->wait_list); struct discard_cmd *dc, *tmp; bool need_wait; + unsigned int trimmed = 0; next: need_wait = false; mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { + if (dc->lstart + dc->len <= start || end <= dc->lstart) + continue; + if (dc->len < dpolicy->granularity) + continue; + if (dc->state == D_DONE && !dc->ref) { wait_for_completion_io(&dc->wait); + if (!dc->error) + trimmed += dc->len; __remove_discard_cmd(sbi, dc); } else { dc->ref++; @@ -1169,9 +1310,17 @@ next: mutex_unlock(&dcc->cmd_lock); if (need_wait) { - __wait_one_discard_bio(sbi, dc); + trimmed += __wait_one_discard_bio(sbi, dc); goto next; } + + return trimmed; +} + +static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) +{ + __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); } /* This should be covered by global mutex, &sit_i->sentry_lock */ @@ -1209,23 +1358,19 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } } -/* This comes from f2fs_put_super and f2fs_trim_fs */ -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount) -{ - __issue_discard_cmd(sbi, false); - __drop_discard_cmd(sbi); - __wait_discard_cmd(sbi, !umount); -} - -static void mark_discard_range_all(struct f2fs_sb_info *sbi) +/* This comes from f2fs_put_super */ +bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - int i; + struct discard_policy dpolicy; + bool dropped; - mutex_lock(&dcc->cmd_lock); - for (i = 0; i < MAX_PLIST_NUM; i++) - dcc->pend_list_tag[i] |= P_TRIM; - mutex_unlock(&dcc->cmd_lock); + init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + __issue_discard_cmd(sbi, &dpolicy); + dropped = __drop_discard_cmd(sbi); + __wait_all_discard_cmd(sbi, &dpolicy); + + return dropped; } static int issue_discard_thread(void *data) @@ -1233,12 +1378,16 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; + struct discard_policy dpolicy; unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; int issued; set_freezable(); do { + init_discard_policy(&dpolicy, DPOLICY_BG, + dcc->discard_granularity); + wait_event_interruptible_timeout(*q, kthread_should_stop() || freezing(current) || dcc->discard_wake, @@ -1251,17 +1400,18 @@ static int issue_discard_thread(void *data) if (dcc->discard_wake) { dcc->discard_wake = 0; if (sbi->gc_thread && sbi->gc_thread->gc_urgent) - mark_discard_range_all(sbi); + init_discard_policy(&dpolicy, + DPOLICY_FORCE, 1); } sb_start_intwrite(sbi->sb); - issued = __issue_discard_cmd(sbi, true); + issued = __issue_discard_cmd(sbi, &dpolicy); if (issued) { - __wait_discard_cmd(sbi, true); - wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + __wait_all_discard_cmd(sbi, &dpolicy); + wait_ms = dpolicy.min_interval; } else { - wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + wait_ms = dpolicy.max_interval; } sb_end_intwrite(sbi->sb); @@ -1525,7 +1675,6 @@ find_next: f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, len); - cpc->trimmed += len; total_len += len; } else { next_pos = find_next_bit_le(entry->discard_map, @@ -1546,6 +1695,37 @@ skip: wake_up_discard_thread(sbi, false); } +void init_discard_policy(struct discard_policy *dpolicy, + int discard_type, unsigned int granularity) +{ + /* common policy */ + dpolicy->type = discard_type; + dpolicy->sync = true; + dpolicy->granularity = granularity; + + if (discard_type == DPOLICY_BG) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = true; + } else if (discard_type == DPOLICY_FORCE) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = true; + } else if (discard_type == DPOLICY_FSTRIM) { + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_UMOUNT) { + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = false; + } +} + static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; @@ -1563,12 +1743,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; INIT_LIST_HEAD(&dcc->entry_list); - for (i = 0; i < MAX_PLIST_NUM; i++) { + for (i = 0; i < MAX_PLIST_NUM; i++) INIT_LIST_HEAD(&dcc->pend_list[i]); - if (i >= dcc->discard_granularity - 1) - dcc->pend_list_tag[i] |= P_ACTIVE; - } INIT_LIST_HEAD(&dcc->wait_list); + INIT_LIST_HEAD(&dcc->fstrim_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->issing_discard, 0); @@ -1716,16 +1894,6 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new) -{ - update_sit_entry(sbi, new, 1); - if (GET_SEGNO(sbi, old) != NULL_SEGNO) - update_sit_entry(sbi, old, -1); - - locate_dirty_segment(sbi, GET_SEGNO(sbi, old)); - locate_dirty_segment(sbi, GET_SEGNO(sbi, new)); -} - void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) { unsigned int segno = GET_SEGNO(sbi, addr); @@ -1736,14 +1904,14 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) return; /* add it into sit main buffer */ - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); update_sit_entry(sbi, addr, -1); /* add it into dirty seglist */ locate_dirty_segment(sbi, segno); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); } bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) @@ -1756,7 +1924,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) return true; - mutex_lock(&sit_i->sentry_lock); + down_read(&sit_i->sentry_lock); segno = GET_SEGNO(sbi, blkaddr); se = get_seg_entry(sbi, segno); @@ -1765,7 +1933,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) if (f2fs_test_bit(offset, se->ckpt_valid_map)) is_cp = true; - mutex_unlock(&sit_i->sentry_lock); + up_read(&sit_i->sentry_lock); return is_cp; } @@ -1823,12 +1991,8 @@ struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { struct page *page = grab_meta_page(sbi, blk_addr); - void *dst = page_address(page); - if (src) - memcpy(dst, src, PAGE_SIZE); - else - memset(dst, 0, PAGE_SIZE); + memcpy(page_address(page), src, PAGE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } @@ -1927,7 +2091,6 @@ find_other_zone: } secno = left_start; skip_left: - hint = secno; segno = GET_SEG_FROM_SEC(sbi, secno); zoneno = GET_ZONE_FROM_SEC(sbi, secno); @@ -2162,12 +2325,16 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) unsigned int old_segno; int i; + down_write(&SIT_I(sbi)->sentry_lock); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { curseg = CURSEG_I(sbi, i); old_segno = curseg->segno; SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); locate_dirty_segment(sbi, old_segno); } + + up_write(&SIT_I(sbi)->sentry_lock); } static const struct segment_allocation default_salloc_ops = { @@ -2179,14 +2346,14 @@ bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u64 trim_start = cpc->trim_start; bool has_candidate = false; - mutex_lock(&SIT_I(sbi)->sentry_lock); + down_write(&SIT_I(sbi)->sentry_lock); for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { if (add_discard_addrs(sbi, cpc, true)) { has_candidate = true; break; } } - mutex_unlock(&SIT_I(sbi)->sentry_lock); + up_write(&SIT_I(sbi)->sentry_lock); cpc->trim_start = trim_start; return has_candidate; @@ -2196,14 +2363,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; - unsigned int start_segno, end_segno; + unsigned int start_segno, end_segno, cur_segno; + block_t start_block, end_block; struct cp_control cpc; + struct discard_policy dpolicy; + unsigned long long trimmed = 0; int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; - cpc.trimmed = 0; if (end <= MAIN_BLKADDR(sbi)) goto out; @@ -2217,12 +2386,14 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); + cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); /* do checkpoint to issue discard commands safely */ - for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) { - cpc.trim_start = start_segno; + for (cur_segno = start_segno; cur_segno <= end_segno; + cur_segno = cpc.trim_end + 1) { + cpc.trim_start = cur_segno; if (sbi->discard_blks == 0) break; @@ -2230,7 +2401,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) cpc.trim_end = end_segno; else cpc.trim_end = min_t(unsigned int, - rounddown(start_segno + + rounddown(cur_segno + BATCHED_TRIM_SEGMENTS(sbi), sbi->segs_per_sec) - 1, end_segno); @@ -2242,11 +2413,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } - /* It's time to issue all the filed discards */ - mark_discard_range_all(sbi); - f2fs_wait_discard_bios(sbi, false); + + start_block = START_BLOCK(sbi, start_segno); + end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1); + + init_discard_policy(&dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); + trimmed = __wait_discard_cmd_range(sbi, &dpolicy, + start_block, end_block); out: - range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); + range->len = F2FS_BLK_TO_BYTES(trimmed); return err; } @@ -2258,6 +2434,18 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } +int rw_hint_to_seg_type(enum rw_hint hint) +{ + switch (hint) { + case WRITE_LIFE_SHORT: + return CURSEG_HOT_DATA; + case WRITE_LIFE_EXTREME: + return CURSEG_COLD_DATA; + default: + return CURSEG_WARM_DATA; + } +} + static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) @@ -2292,7 +2480,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (is_inode_flag_set(inode, FI_HOT_DATA)) return CURSEG_HOT_DATA; - return CURSEG_WARM_DATA; + return rw_hint_to_seg_type(inode->i_write_hint); } else { if (IS_DNODE(fio->page)) return is_cold_node(fio->page) ? CURSEG_WARM_NODE : @@ -2336,8 +2524,10 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); + down_read(&SM_I(sbi)->curseg_lock); + mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); @@ -2354,15 +2544,26 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + /* + * SIT information should be updated before segment allocation, + * since SSR needs latest valid block information. + */ + update_sit_entry(sbi, *new_blkaddr, 1); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + update_sit_entry(sbi, old_blkaddr, -1); + if (!__has_curseg_space(sbi, type)) sit_i->s_ops->allocate_segment(sbi, type, false); + /* - * SIT information should be updated after segment allocation, - * since we need to keep dirty segments precisely under SSR. + * segment dirty status should be updated after segment allocation, + * so we just need to update status only one time after previous + * segment being closed. */ - refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); @@ -2382,6 +2583,29 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, } mutex_unlock(&curseg->curseg_mutex); + + up_read(&SM_I(sbi)->curseg_lock); +} + +static void update_device_state(struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = fio->sbi; + unsigned int devidx; + + if (!sbi->s_ndevs) + return; + + devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); + + /* update device state for fsync */ + set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); + + /* update device state for checkpoint */ + if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { + spin_lock(&sbi->dev_lock); + f2fs_set_bit(devidx, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) @@ -2398,6 +2622,8 @@ reallocate: if (err == -EAGAIN) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; + } else if (!err) { + update_device_state(fio); } } @@ -2458,12 +2684,26 @@ int rewrite_data_page(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); + if (!err) + update_device_state(fio); f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); return err; } +static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + int i; + + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + if (CURSEG_I(sbi, i)->segno == segno) + break; + } + return i; +} + void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr) @@ -2479,6 +2719,8 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, se = get_seg_entry(sbi, segno); type = se->type; + down_write(&SM_I(sbi)->curseg_lock); + if (!recover_curseg) { /* for recovery flow */ if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { @@ -2488,14 +2730,19 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, type = CURSEG_WARM_DATA; } } else { - if (!IS_CURSEG(sbi, segno)) + if (IS_CURSEG(sbi, segno)) { + /* se->type is volatile as SSR allocation */ + type = __f2fs_get_curseg(sbi, segno); + f2fs_bug_on(sbi, type == NO_CHECK_TYPE); + } else { type = CURSEG_WARM_DATA; + } } curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); old_cursegno = curseg->segno; old_blkoff = curseg->next_blkoff; @@ -2527,8 +2774,9 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, curseg->next_blkoff = old_blkoff; } - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); + up_write(&SM_I(sbi)->curseg_lock); } void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, @@ -2982,7 +3230,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) bool to_journal = true; struct seg_entry *se; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); if (!sit_i->dirty_sentries) goto out; @@ -3076,7 +3324,7 @@ out: cpc->trim_start = trim_start; } - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); set_prefree_as_free_segments(sbi); } @@ -3169,7 +3417,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); sit_i->mounted_time = ktime_get_real_seconds(); - mutex_init(&sit_i->sentry_lock); + init_rwsem(&sit_i->sentry_lock); return 0; } @@ -3410,7 +3658,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) struct sit_info *sit_i = SIT_I(sbi); unsigned int segno; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); sit_i->min_mtime = LLONG_MAX; @@ -3427,7 +3675,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = mtime; } sit_i->max_mtime = get_mtime(sbi); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); } int build_segment_manager(struct f2fs_sb_info *sbi) @@ -3460,11 +3708,14 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; + sm_info->min_ssr_sections = reserved_sections(sbi); sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; INIT_LIST_HEAD(&sm_info->sit_entry_set); + init_rwsem(&sm_info->curseg_lock); + if (!f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e0a6cc23ace3..d1d394cdf61d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -231,7 +231,7 @@ struct sit_info { unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ unsigned int dirty_sentries; /* # of dirty sentries */ unsigned int sents_per_block; /* # of SIT entries per block */ - struct mutex sentry_lock; /* to protect SIT cache */ + struct rw_semaphore sentry_lock; /* to protect SIT cache */ struct seg_entry *sentries; /* SIT segment-level cache */ struct sec_entry *sec_entries; /* SIT section-level cache */ @@ -497,6 +497,33 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } +static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) +{ + unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + + get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int segno, left_blocks; + int i; + + /* check current node segment */ + for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { + segno = CURSEG_I(sbi, i)->segno; + left_blocks = sbi->blocks_per_seg - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; + + if (node_blocks > left_blocks) + return false; + } + + /* check current data segment */ + segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; + left_blocks = sbi->blocks_per_seg - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; + if (dent_blocks > left_blocks) + return false; + return true; +} + static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { @@ -507,6 +534,9 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; + if (free_sections(sbi) + freed == reserved_sections(sbi) + needed && + has_curseg_enough_space(sbi)) + return false; return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + needed); @@ -731,7 +761,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) > sbi->fggc_threshold) return true; return false; @@ -796,8 +826,9 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) goto wake_up; mutex_lock(&dcc->cmd_lock); - for (i = MAX_PLIST_NUM - 1; - i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (i + 1 < dcc->discard_granularity) + break; if (!list_empty(&dcc->pend_list[i])) { wakeup = true; break; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 5c60fc28ec75..0b5664a1a6cc 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -28,7 +28,7 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + long count = NM_I(sbi)->nid_cnt[FREE_NID] - MAX_FREE_NIDS; return count > 0 ? count : 0; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 933c3d529e65..708155d9c2e4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -44,6 +44,8 @@ static struct kmem_cache *f2fs_inode_cachep; char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_PAGE_GET] = "page get", + [FAULT_ALLOC_BIO] = "alloc bio", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", @@ -92,6 +94,7 @@ enum { Opt_disable_ext_identify, Opt_inline_xattr, Opt_noinline_xattr, + Opt_inline_xattr_size, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, @@ -141,6 +144,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, {Opt_noinline_xattr, "noinline_xattr"}, + {Opt_inline_xattr_size, "inline_xattr_size=%u"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, @@ -209,6 +213,12 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "quota options when quota turned on"); return -EINVAL; } + if (f2fs_sb_has_quota_ino(sb)) { + f2fs_msg(sb, KERN_INFO, + "QUOTA feature is enabled, so ignore qf_name"); + return 0; + } + qname = match_strdup(args); if (!qname) { f2fs_msg(sb, KERN_ERR, @@ -287,6 +297,18 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) return -1; } } + + if (f2fs_sb_has_quota_ino(sbi->sb) && sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_INFO, + "QUOTA feature is enabled, so ignore jquota_fmt"); + sbi->s_jquota_fmt = 0; + } + if (f2fs_sb_has_quota_ino(sbi->sb) && sb_rdonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Filesystem with quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return -1; + } return 0; } #endif @@ -383,6 +405,12 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_xattr: clear_opt(sbi, INLINE_XATTR); break; + case Opt_inline_xattr_size: + if (args->from && match_int(args, &arg)) + return -EINVAL; + set_opt(sbi, INLINE_XATTR_SIZE); + sbi->inline_xattr_size = arg; + break; #else case Opt_user_xattr: f2fs_msg(sb, KERN_INFO, @@ -506,10 +534,10 @@ static int parse_options(struct super_block *sb, char *options) #endif break; case Opt_lazytime: - sb->s_flags |= MS_LAZYTIME; + sb->s_flags |= SB_LAZYTIME; break; case Opt_nolazytime: - sb->s_flags &= ~MS_LAZYTIME; + sb->s_flags &= ~SB_LAZYTIME; break; #ifdef CONFIG_QUOTA case Opt_quota: @@ -604,6 +632,24 @@ static int parse_options(struct super_block *sb, char *options) F2FS_IO_SIZE_KB(sbi)); return -EINVAL; } + + if (test_opt(sbi, INLINE_XATTR_SIZE)) { + if (!test_opt(sbi, INLINE_XATTR)) { + f2fs_msg(sb, KERN_ERR, + "inline_xattr_size option should be " + "set with inline_xattr option"); + return -EINVAL; + } + if (!sbi->inline_xattr_size || + sbi->inline_xattr_size >= DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE - + DEF_INLINE_RESERVED_SIZE - + DEF_MIN_INLINE_SIZE) { + f2fs_msg(sb, KERN_ERR, + "inline xattr size is out of range"); + return -EINVAL; + } + } return 0; } @@ -618,13 +664,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); /* Initialize f2fs-specific inode info */ - fi->vfs_inode.i_version = 1; atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); + INIT_LIST_HEAD(&fi->inmem_ilist); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); init_rwsem(&fi->dio_rwsem[READ]); @@ -673,7 +719,6 @@ static int f2fs_drop_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); - fscrypt_put_encryption_info(inode, NULL); spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } @@ -781,6 +826,7 @@ static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; + bool dropped; f2fs_quota_off_umount(sb); @@ -801,9 +847,9 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bios(sbi, true); + dropped = f2fs_wait_discard_bios(sbi); - if (f2fs_discard_en(sbi) && !sbi->discard_blks) { + if (f2fs_discard_en(sbi) && !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -858,6 +904,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) struct f2fs_sb_info *sbi = F2FS_SB(sb); int err = 0; + if (unlikely(f2fs_cp_error(sbi))) + return 0; + trace_f2fs_sync_fs(sb, sync); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) @@ -957,7 +1006,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi) - - sbi->reserved_blocks; + sbi->current_reserved_blocks; avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -1046,6 +1095,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_xattr"); else seq_puts(seq, ",noinline_xattr"); + if (test_opt(sbi, INLINE_XATTR_SIZE)) + seq_printf(seq, ",inline_xattr_size=%u", + sbi->inline_xattr_size); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -1108,6 +1160,7 @@ static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; + sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1115,7 +1168,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); set_opt(sbi, NOHEAP); - sbi->sb->s_flags |= MS_LAZYTIME; + sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_mounted_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); @@ -1136,6 +1189,9 @@ static void default_options(struct f2fs_sb_info *sbi) #endif } +#ifdef CONFIG_QUOTA +static int f2fs_enable_quotas(struct super_block *sb); +#endif static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -1180,7 +1236,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #endif /* recover superblocks we couldn't write due to previous RO mount */ - if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); f2fs_msg(sb, KERN_INFO, "Try to recover all the superblocks, ret: %d", err); @@ -1199,19 +1255,26 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * Previous and new state of filesystem is RO, * so skip checking GC and FLUSH_MERGE conditions. */ - if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) + if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) goto skip; - if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { +#ifdef CONFIG_QUOTA + if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) { err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; } else { /* dquot_resume needs RW */ - sb->s_flags &= ~MS_RDONLY; - dquot_resume(sb, -1); + sb->s_flags &= ~SB_RDONLY; + if (sb_any_quota_suspended(sb)) { + dquot_resume(sb, -1); + } else if (f2fs_sb_has_quota_ino(sb)) { + err = f2fs_enable_quotas(sb); + if (err) + goto restore_opts; + } } - +#endif /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -1225,7 +1288,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if background_gc = off is passed in mount * option. Also sync the filesystem. */ - if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { + if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { stop_gc_thread(sbi); need_restart_gc = true; @@ -1237,7 +1300,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { writeback_inodes_sb(sb, WB_REASON_SYNC); sync_inodes_sb(sb); @@ -1251,7 +1314,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. */ - if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { + if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); destroy_flush_cmd_control(sbi, false); } else { @@ -1266,8 +1329,8 @@ skip: kfree(s_qf_names[i]); #endif /* Update the POSIXACL Flag */ - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); return 0; restore_gc: @@ -1320,8 +1383,13 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); repeat: page = read_mapping_page(mapping, blkidx, NULL); - if (IS_ERR(page)) + if (IS_ERR(page)) { + if (PTR_ERR(page) == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto repeat; + } return PTR_ERR(page); + } lock_page(page); @@ -1364,11 +1432,16 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, while (towrite > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite); - +retry: err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, &page, NULL); - if (unlikely(err)) + if (unlikely(err)) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } break; + } kaddr = kmap_atomic(page); memcpy(kaddr + offset, data, tocopy); @@ -1385,8 +1458,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, } if (len == towrite) - return 0; - inode->i_version++; + return err; inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); return len - towrite; @@ -1408,19 +1480,91 @@ static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) sbi->s_jquota_fmt, type); } -void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) { - int i, ret; + int enabled = 0; + int i, err; + + if (f2fs_sb_has_quota_ino(sbi->sb) && rdonly) { + err = f2fs_enable_quotas(sbi->sb); + if (err) { + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on quota_ino: %d", err); + return 0; + } + return 1; + } for (i = 0; i < MAXQUOTAS; i++) { if (sbi->s_qf_names[i]) { - ret = f2fs_quota_on_mount(sbi, i); - if (ret < 0) - f2fs_msg(sbi->sb, KERN_ERR, - "Cannot turn on journaled " - "quota: error %d", ret); + err = f2fs_quota_on_mount(sbi, i); + if (!err) { + enabled = 1; + continue; + } + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on quotas: %d on %d", err, i); + } + } + return enabled; +} + +static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags) +{ + struct inode *qf_inode; + unsigned long qf_inum; + int err; + + BUG_ON(!f2fs_sb_has_quota_ino(sb)); + + qf_inum = f2fs_qf_ino(sb, type); + if (!qf_inum) + return -EPERM; + + qf_inode = f2fs_iget(sb, qf_inum); + if (IS_ERR(qf_inode)) { + f2fs_msg(sb, KERN_ERR, + "Bad quota inode %u:%lu", type, qf_inum); + return PTR_ERR(qf_inode); + } + + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; + err = dquot_enable(qf_inode, type, format_id, flags); + iput(qf_inode); + return err; +} + +static int f2fs_enable_quotas(struct super_block *sb) +{ + int type, err = 0; + unsigned long qf_inum; + bool quota_mopt[MAXQUOTAS] = { + test_opt(F2FS_SB(sb), USRQUOTA), + test_opt(F2FS_SB(sb), GRPQUOTA), + test_opt(F2FS_SB(sb), PRJQUOTA), + }; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; + for (type = 0; type < MAXQUOTAS; type++) { + qf_inum = f2fs_qf_ino(sb, type); + if (qf_inum) { + err = f2fs_quota_enable(sb, type, QFMT_VFS_V1, + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to enable quota tracking " + "(type=%d, err=%d). Please run " + "fsck to fix.", type, err); + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + return err; + } } } + return 0; } static int f2fs_quota_sync(struct super_block *sb, int type) @@ -1491,7 +1635,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) f2fs_quota_sync(sb, type); err = dquot_quota_off(sb, type); - if (err) + if (err || f2fs_sb_has_quota_ino(sb)) goto out_put; inode_lock(inode); @@ -1594,14 +1738,9 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, - .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; -#else -static const struct fscrypt_operations f2fs_cryptops = { - .is_encrypted = f2fs_encrypted_inode, -}; #endif static struct inode *f2fs_nfs_get_inode(struct super_block *sb, @@ -1656,7 +1795,7 @@ static loff_t max_file_blocks(void) /* * note: previously, result is equal to (DEF_ADDRS_PER_INODE - - * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * DEFAULT_INLINE_XATTR_ADDRS), but now f2fs try to reserve more * space in inode.i_addr, it will be more safe to reassign * result as zero. */ @@ -1965,6 +2104,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (j = HOT; j < NR_TEMP_TYPE; j++) mutex_init(&sbi->wio_mutex[i][j]); spin_lock_init(&sbi->cp_lock); + + sbi->dirty_device = 0; + spin_lock_init(&sbi->dev_lock); } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -2315,18 +2457,23 @@ try_onemore: #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; - sb->s_qcop = &f2fs_quotactl_ops; + if (f2fs_sb_has_quota_ino(sb)) + sb->s_qcop = &dquot_quotactl_sysfile_ops; + else + sb->s_qcop = &f2fs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif sb->s_op = &f2fs_sops; +#ifdef CONFIG_F2FS_FS_ENCRYPTION sb->s_cop = &f2fs_cryptops; +#endif sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; sb->s_time_gran = 1; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ @@ -2411,6 +2558,7 @@ try_onemore: le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; sbi->reserved_blocks = 0; + sbi->current_reserved_blocks = 0; for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); @@ -2485,10 +2633,24 @@ try_onemore: if (err) goto free_root_inode; +#ifdef CONFIG_QUOTA + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) { + err = f2fs_enable_quotas(sb); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", err); + goto free_sysfs; + } + } +#endif /* if there are nt orphan nodes free them */ err = recover_orphan_inodes(sbi); if (err) - goto free_sysfs; + goto free_meta; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -2522,7 +2684,7 @@ try_onemore: err = -EINVAL; f2fs_msg(sb, KERN_ERR, "Need to recover fsync data"); - goto free_sysfs; + goto free_meta; } } skip_recovery: @@ -2556,6 +2718,10 @@ skip_recovery: return 0; free_meta: +#ifdef CONFIG_QUOTA + if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) + f2fs_quota_off_umount(sbi->sb); +#endif f2fs_sync_inode_meta(sbi); /* * Some dirty meta pages can be produced by recover_orphan_inodes() @@ -2564,7 +2730,9 @@ free_meta: * falls into an infinite loop in sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); +#ifdef CONFIG_QUOTA free_sysfs: +#endif f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e2c258f717cd..9835348b6e5d 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -30,7 +30,7 @@ enum { FAULT_INFO_RATE, /* struct f2fs_fault_info */ FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif - RESERVED_BLOCKS, + RESERVED_BLOCKS, /* struct f2fs_sb_info */ }; struct f2fs_attr { @@ -63,6 +63,13 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return NULL; } +static ssize_t dirty_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(dirty_segments(sbi))); +} + static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -100,10 +107,22 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_inode_chksum(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_checksum"); + if (f2fs_sb_has_flexible_inline_xattr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "flexible_inline_xattr"); + if (f2fs_sb_has_quota_ino(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "quota_ino"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } +static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->current_reserved_blocks); +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -143,34 +162,22 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); - if ((unsigned long)sbi->total_valid_block_count + t > - (unsigned long)sbi->user_block_count) { + if (t > (unsigned long)sbi->user_block_count) { spin_unlock(&sbi->stat_lock); return -EINVAL; } *ui = t; + sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->user_block_count - valid_user_blocks(sbi)); spin_unlock(&sbi->stat_lock); return count; } if (!strcmp(a->attr.name, "discard_granularity")) { - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - int i; - if (t == 0 || t > MAX_PLIST_NUM) return -EINVAL; if (t == *ui) return count; - - mutex_lock(&dcc->cmd_lock); - for (i = 0; i < MAX_PLIST_NUM; i++) { - if (i >= t - 1) - dcc->pend_list_tag[i] |= P_ACTIVE; - else - dcc->pend_list_tag[i] &= (~P_ACTIVE); - } - mutex_unlock(&dcc->cmd_lock); - *ui = t; return count; } @@ -222,6 +229,8 @@ enum feat_id { FEAT_EXTRA_ATTR, FEAT_PROJECT_QUOTA, FEAT_INODE_CHECKSUM, + FEAT_FLEXIBLE_INLINE_XATTR, + FEAT_QUOTA_INO, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -234,6 +243,8 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_EXTRA_ATTR: case FEAT_PROJECT_QUOTA: case FEAT_INODE_CHECKSUM: + case FEAT_FLEXIBLE_INLINE_XATTR: + case FEAT_QUOTA_INO: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -279,6 +290,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); @@ -291,8 +303,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif +F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); +F2FS_GENERAL_RO_ATTR(current_reserved_blocks); #ifdef CONFIG_F2FS_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); @@ -304,6 +318,8 @@ F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); +F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); +F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -321,6 +337,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), ATTR_LIST(min_hot_blocks), + ATTR_LIST(min_ssr_sections), ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), @@ -333,9 +350,11 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(inject_rate), ATTR_LIST(inject_type), #endif + ATTR_LIST(dirty_segments), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), ATTR_LIST(reserved_blocks), + ATTR_LIST(current_reserved_blocks), NULL, }; @@ -350,6 +369,8 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(extra_attr), ATTR_LIST(project_quota), ATTR_LIST(inode_checksum), + ATTR_LIST(flexible_inline_xattr), + ATTR_LIST(quota_ino), NULL, }; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 7c65540148f8..ec8961ef8cac 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -217,12 +217,12 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } -static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, - void **last_addr, int index, - size_t len, const char *name) +static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, + void *base_addr, void **last_addr, int index, + size_t len, const char *name) { struct f2fs_xattr_entry *entry; - unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2; + unsigned int inline_size = inline_xattr_size(inode); list_for_each_xattr(entry, base_addr) { if ((void *)entry + sizeof(__u32) > base_addr + inline_size || @@ -241,12 +241,54 @@ static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, return entry; } +static int read_inline_xattr(struct inode *inode, struct page *ipage, + void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int inline_size = inline_xattr_size(inode); + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(inode, ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + return PTR_ERR(page); + + inline_addr = inline_xattr_addr(inode, page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + return 0; +} + +static int read_xattr_block(struct inode *inode, void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int inline_size = inline_xattr_size(inode); + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. */ + xpage = get_node_page(sbi, xnid); + if (IS_ERR(xpage)) + return PTR_ERR(xpage); + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, VALID_XATTR_BLOCK_SIZE); + f2fs_put_page(xpage, 1); + + return 0; +} + static int lookup_all_xattrs(struct inode *inode, struct page *ipage, unsigned int index, unsigned int len, const char *name, struct f2fs_xattr_entry **xe, void **base_addr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); void *cur_addr, *txattr_addr, *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0; @@ -263,23 +305,11 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, /* read from inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - - if (ipage) { - inline_addr = inline_xattr_addr(ipage); - } else { - page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto out; - } - inline_addr = inline_xattr_addr(page); - } - memcpy(txattr_addr, inline_addr, inline_size); - f2fs_put_page(page, 1); + err = read_inline_xattr(inode, ipage, txattr_addr); + if (err) + goto out; - *xe = __find_inline_xattr(txattr_addr, &last_addr, + *xe = __find_inline_xattr(inode, txattr_addr, &last_addr, index, len, name); if (*xe) goto check; @@ -287,19 +317,9 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, /* read from xattr node block */ if (xnid) { - struct page *xpage; - void *xattr_addr; - - /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, xnid); - if (IS_ERR(xpage)) { - err = PTR_ERR(xpage); + err = read_xattr_block(inode, txattr_addr); + if (err) goto out; - } - - xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, size); - f2fs_put_page(xpage, 1); } if (last_addr) @@ -324,7 +344,6 @@ out: static int read_all_xattrs(struct inode *inode, struct page *ipage, void **base_addr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = VALID_XATTR_BLOCK_SIZE; @@ -339,38 +358,16 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, /* read from inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - - if (ipage) { - inline_addr = inline_xattr_addr(ipage); - } else { - page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto fail; - } - inline_addr = inline_xattr_addr(page); - } - memcpy(txattr_addr, inline_addr, inline_size); - f2fs_put_page(page, 1); + err = read_inline_xattr(inode, ipage, txattr_addr); + if (err) + goto fail; } /* read from xattr node block */ if (xnid) { - struct page *xpage; - void *xattr_addr; - - /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, xnid); - if (IS_ERR(xpage)) { - err = PTR_ERR(xpage); + err = read_xattr_block(inode, txattr_addr); + if (err) goto fail; - } - - xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, size); - f2fs_put_page(xpage, 1); } header = XATTR_HDR(txattr_addr); @@ -392,10 +389,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); size_t inline_size = inline_xattr_size(inode); + struct page *in_page = NULL; void *xattr_addr; + void *inline_addr = NULL; struct page *xpage; nid_t new_nid = 0; - int err; + int err = 0; if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) @@ -403,30 +402,30 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* write to inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - if (ipage) { - inline_addr = inline_xattr_addr(ipage); - f2fs_wait_on_page_writeback(ipage, NODE, true); - set_page_dirty(ipage); + inline_addr = inline_xattr_addr(inode, ipage); } else { - page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { + in_page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(in_page)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(page); + return PTR_ERR(in_page); } - inline_addr = inline_xattr_addr(page); - f2fs_wait_on_page_writeback(page, NODE, true); + inline_addr = inline_xattr_addr(inode, in_page); } - memcpy(inline_addr, txattr_addr, inline_size); - f2fs_put_page(page, 1); + f2fs_wait_on_page_writeback(ipage ? ipage : in_page, + NODE, true); /* no need to use xattr node block */ if (hsize <= inline_size) { - err = truncate_xattr_node(inode, ipage); + err = truncate_xattr_node(inode); alloc_nid_failed(sbi, new_nid); - return err; + if (err) { + f2fs_put_page(in_page, 1); + return err; + } + memcpy(inline_addr, txattr_addr, inline_size); + set_page_dirty(ipage ? ipage : in_page); + goto in_page_out; } } @@ -435,7 +434,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(xpage); + goto in_page_out; } f2fs_bug_on(sbi, new_nid); f2fs_wait_on_page_writeback(xpage, NODE, true); @@ -445,17 +444,24 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(xpage); + goto in_page_out; } alloc_nid_done(sbi, new_nid); } - xattr_addr = page_address(xpage); + + if (inline_size) + memcpy(inline_addr, txattr_addr, inline_size); memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); + + if (inline_size) + set_page_dirty(ipage ? ipage : in_page); set_page_dirty(xpage); - f2fs_put_page(xpage, 1); - return 0; + f2fs_put_page(xpage, 1); +in_page_out: + f2fs_put_page(in_page, 1); + return err; } int f2fs_getxattr(struct inode *inode, int index, const char *name, @@ -681,6 +687,10 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; + err = dquot_initialize(inode); + if (err) + return err; + /* this case is only from init_inode_metadata */ if (ipage) return __f2fs_setxattr(inode, index, name, value, diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 81cecbe6d7cf..b833ffeee1e1 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -291,7 +291,6 @@ static int fat_parse_long(struct inode *dir, loff_t *pos, } } parse_long: - slots = 0; ds = (struct msdos_dir_slot *)*de; id = ds->id; if (!(id & 0x40)) diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 48b2336692f9..bac10de678cc 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -392,7 +392,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); set_buffer_uptodate(c_bh); mark_buffer_dirty_inode(c_bh, sbi->fat_inode); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) err = sync_dirty_buffer(c_bh); brelse(c_bh); if (err) @@ -597,7 +597,7 @@ int fat_free_clusters(struct inode *inode, int cluster) } if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) { - if (sb->s_flags & MS_SYNCHRONOUS) { + if (sb->s_flags & SB_SYNCHRONOUS) { err = fat_sync_bhs(bhs, nr_bhs); if (err) goto error; @@ -612,7 +612,7 @@ int fat_free_clusters(struct inode *inode, int cluster) fat_collect_bhs(bhs, &nr_bhs, &fatent); } while (cluster != FAT_ENT_EOF); - if (sb->s_flags & MS_SYNCHRONOUS) { + if (sb->s_flags & SB_SYNCHRONOUS) { err = fat_sync_bhs(bhs, nr_bhs); if (err) goto error; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 30c52394a7ad..20a0a89eaca5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -779,14 +779,14 @@ static void __exit fat_destroy_inodecache(void) static int fat_remount(struct super_block *sb, int *flags, char *data) { - int new_rdonly; + bool new_rdonly; struct msdos_sb_info *sbi = MSDOS_SB(sb); - *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME); sync_filesystem(sb); /* make sure we update state on remount. */ - new_rdonly = *flags & MS_RDONLY; + new_rdonly = *flags & SB_RDONLY; if (new_rdonly != sb_rdonly(sb)) { if (new_rdonly) fat_set_state(sb, 0, 0); @@ -1352,7 +1352,7 @@ out: if (opts->unicode_xlate) opts->utf8 = 0; if (opts->nfs == FAT_NFS_NOSTALE_RO) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; sb->s_export_op = &fat_export_ops_nostale; } @@ -1608,7 +1608,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, return -ENOMEM; sb->s_fs_info = sbi; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; sb->s_magic = MSDOS_SUPER_MAGIC; sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index acc3aa30ee54..f9bdc1e01c98 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -33,7 +33,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) if (opts->errors == FAT_ERRORS_PANIC) panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); else if (opts->errors == FAT_ERRORS_RO && !sb_rdonly(sb)) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; fat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); } } diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 7d6a105d601b..d24d2758a363 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -646,7 +646,7 @@ static void setup(struct super_block *sb) { MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations; sb->s_d_op = &msdos_dentry_operations; - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; } static int msdos_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/fcntl.c b/fs/fcntl.c index 8d78ffd7b399..0522e283a4f4 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -563,6 +563,9 @@ static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __u { struct compat_flock64 fl; + BUILD_BUG_ON(sizeof(kfl->l_start) > sizeof(ufl->l_start)); + BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len)); + memset(&fl, 0, sizeof(struct compat_flock64)); copy_flock_fields(&fl, kfl); if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64))) @@ -632,9 +635,8 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, if (err) break; err = fixup_compat_flock(&flock); - if (err) - return err; - err = put_compat_flock(&flock, compat_ptr(arg)); + if (!err) + err = put_compat_flock(&flock, compat_ptr(arg)); break; case F_GETLK64: case F_OFD_GETLK: @@ -642,12 +644,8 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, if (err) break; err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock); - if (err) - break; - err = fixup_compat_flock(&flock); - if (err) - return err; - err = put_compat_flock64(&flock, compat_ptr(arg)); + if (!err) + err = put_compat_flock64(&flock, compat_ptr(arg)); break; case F_SETLK: case F_SETLKW: @@ -725,7 +723,7 @@ static void send_sigio_to_task(struct task_struct *p, * F_SETSIG can change ->signum lockless in parallel, make * sure we read it once and use the same value throughout. */ - int signum = ACCESS_ONCE(fown->signum); + int signum = READ_ONCE(fown->signum); if (!sigio_perm(p, fown, signum)) return; diff --git a/fs/fhandle.c b/fs/fhandle.c index 474adc8d2a3a..0ace128f5d23 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -213,8 +213,8 @@ out_err: return retval; } -long do_handle_open(int mountdirfd, - struct file_handle __user *ufh, int open_flag) +static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, + int open_flag) { long retval = 0; struct path path; diff --git a/fs/file.c b/fs/file.c index 4eecbf4244a5..3b080834b870 100644 --- a/fs/file.c +++ b/fs/file.c @@ -593,13 +593,16 @@ void __fd_install(struct files_struct *files, unsigned int fd, { struct fdtable *fdt; - might_sleep(); rcu_read_lock_sched(); - while (unlikely(files->resize_in_progress)) { + if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); - wait_event(files->resize_wait, !files->resize_in_progress); - rcu_read_lock_sched(); + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + BUG_ON(fdt->fd[fd] != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + spin_unlock(&files->file_lock); + return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); @@ -632,7 +635,6 @@ int __close_fd(struct files_struct *files, unsigned fd) if (!file) goto out_unlock; rcu_assign_pointer(fdt->fd[fd], NULL); - __clear_close_on_exec(fd, fdt); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); return filp_close(file, files); diff --git a/fs/file_table.c b/fs/file_table.c index 61517f57f8ef..2dc9f38bd195 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -201,11 +201,11 @@ static void __fput(struct file *file) eventpoll_release(file); locks_remove_file(file); + ima_file_free(file); if (unlikely(file->f_flags & FASYNC)) { if (file->f_op->fasync) file->f_op->fasync(-1, file, 0); } - ima_file_free(file); if (file->f_op->release) file->f_op->release(inode, file); security_file_free(file); @@ -312,7 +312,7 @@ void put_filp(struct file *file) void __init files_init(void) { filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 455ce5b77e9b..f989efa051a0 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -116,7 +116,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) static int vxfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -220,7 +220,7 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) int ret = -EINVAL; u32 j; - sbp->s_flags |= MS_RDONLY; + sbp->s_flags |= SB_RDONLY; infp = kzalloc(sizeof(*infp), GFP_KERNEL); if (!infp) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 245c430a2e41..cea4836385b7 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -490,7 +490,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); - if (!(inode->i_sb->s_flags & MS_ACTIVE) || + if (!(inode->i_sb->s_flags & SB_ACTIVE) || inode->i_state & (I_WB_SWITCH | I_FREEING) || inode_to_wb(inode) == isw->new_wb) { spin_unlock(&inode->i_lock); @@ -933,33 +933,36 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, #endif /* CONFIG_CGROUP_WRITEBACK */ -void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - bool range_cyclic, enum wb_reason reason) +/* + * Add in the number of potentially dirty inodes, because each inode + * write can dirty pagecache in the underlying blockdev. + */ +static unsigned long get_nr_dirty_pages(void) { - struct wb_writeback_work *work; + return global_node_page_state(NR_FILE_DIRTY) + + global_node_page_state(NR_UNSTABLE_NFS) + + get_nr_dirty_inodes(); +} +static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) +{ if (!wb_has_dirty_io(wb)) return; /* - * This is WB_SYNC_NONE writeback, so if allocation fails just - * wakeup the thread for old dirty data writeback + * All callers of this function want to start writeback of all + * dirty pages. Places like vmscan can call this at a very + * high frequency, causing pointless allocations of tons of + * work items and keeping the flusher threads busy retrieving + * that work. Ensure that we only allow one of them pending and + * inflight at the time. */ - work = kzalloc(sizeof(*work), - GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); - if (!work) { - trace_writeback_nowork(wb); - wb_wakeup(wb); + if (test_bit(WB_start_all, &wb->state) || + test_and_set_bit(WB_start_all, &wb->state)) return; - } - - work->sync_mode = WB_SYNC_NONE; - work->nr_pages = nr_pages; - work->range_cyclic = range_cyclic; - work->reason = reason; - work->auto_free = 1; - wb_queue_work(wb, work); + wb->start_all_reason = reason; + wb_wakeup(wb); } /** @@ -1814,17 +1817,6 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) return work; } -/* - * Add in the number of potentially dirty inodes, because each inode - * write can dirty pagecache in the underlying blockdev. - */ -static unsigned long get_nr_dirty_pages(void) -{ - return global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS) + - get_nr_dirty_inodes(); -} - static long wb_check_background_flush(struct bdi_writeback *wb) { if (wb_over_bg_thresh(wb)) { @@ -1877,6 +1869,30 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) return 0; } +static long wb_check_start_all(struct bdi_writeback *wb) +{ + long nr_pages; + + if (!test_bit(WB_start_all, &wb->state)) + return 0; + + nr_pages = get_nr_dirty_pages(); + if (nr_pages) { + struct wb_writeback_work work = { + .nr_pages = wb_split_bdi_pages(wb, nr_pages), + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + .reason = wb->start_all_reason, + }; + + nr_pages = wb_writeback(wb, &work); + } + + clear_bit(WB_start_all, &wb->state); + return nr_pages; +} + + /* * Retrieve work items and do the writeback they describe */ @@ -1893,6 +1909,11 @@ static long wb_do_writeback(struct bdi_writeback *wb) } /* + * Check for a flush-everything request + */ + wrote += wb_check_start_all(wb); + + /* * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); @@ -1947,10 +1968,33 @@ void wb_workfn(struct work_struct *work) } /* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. + * Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero, + * write back the whole world. */ -void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) +static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + enum wb_reason reason) +{ + struct bdi_writeback *wb; + + if (!bdi_has_dirty_io(bdi)) + return; + + list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) + wb_start_writeback(wb, reason); +} + +void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + enum wb_reason reason) +{ + rcu_read_lock(); + __wakeup_flusher_threads_bdi(bdi, reason); + rcu_read_unlock(); +} + +/* + * Wakeup the flusher threads to start writeback of all currently dirty pages + */ +void wakeup_flusher_threads(enum wb_reason reason) { struct backing_dev_info *bdi; @@ -1960,20 +2004,9 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) if (blk_needs_flush_plug(current)) blk_schedule_flush_plug(current); - if (!nr_pages) - nr_pages = get_nr_dirty_pages(); - rcu_read_lock(); - list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { - struct bdi_writeback *wb; - - if (!bdi_has_dirty_io(bdi)) - continue; - - list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) - wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), - false, reason); - } + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) + __wakeup_flusher_threads_bdi(bdi, reason); rcu_read_unlock(); } @@ -2343,37 +2376,19 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) EXPORT_SYMBOL(writeback_inodes_sb); /** - * try_to_writeback_inodes_sb_nr - try to start writeback if none underway + * try_to_writeback_inodes_sb - try to start writeback if none underway * @sb: the superblock - * @nr: the number of pages to write - * @reason: the reason of writeback + * @reason: reason why some writeback work was initiated * - * Invoke writeback_inodes_sb_nr if no writeback is currently underway. - * Returns 1 if writeback was started, 0 if not. + * Invoke __writeback_inodes_sb_nr if no writeback is currently underway. */ -bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, - enum wb_reason reason) +void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { if (!down_read_trylock(&sb->s_umount)) - return false; + return; - __writeback_inodes_sb_nr(sb, nr, reason, true); + __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true); up_read(&sb->s_umount); - return true; -} -EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); - -/** - * try_to_writeback_inodes_sb - try to start writeback if none underway - * @sb: the superblock - * @reason: reason why some writeback work was initiated - * - * Implement by try_to_writeback_inodes_sb_nr() - * Returns 1 if writeback was started, 0 if not. - */ -bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) -{ - return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(try_to_writeback_inodes_sb); diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 0d285fd5b44a..a6497cf8ae53 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -79,7 +79,7 @@ void mnt_pin_kill(struct mount *m) while (1) { struct hlist_node *p; rcu_read_lock(); - p = ACCESS_ONCE(m->mnt_pins.first); + p = READ_ONCE(m->mnt_pins.first); if (!p) { rcu_read_unlock(); break; @@ -93,7 +93,7 @@ void group_pin_kill(struct hlist_head *p) while (1) { struct hlist_node *q; rcu_read_lock(); - q = ACCESS_ONCE(p->first); + q = READ_ONCE(p->first); if (!q) { rcu_read_unlock(); break; diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 40d61077bead..ff84258132bb 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -558,7 +558,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) * have completed. */ if (!atomic_dec_and_test(&cookie->n_active)) - wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, + wait_on_atomic_t(&cookie->n_active, atomic_t_wait, TASK_UNINTERRUPTIBLE); /* Make sure any pending writes are cancelled. */ diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 97ec45110957..0ff4b49a0037 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -97,8 +97,6 @@ static inline bool fscache_object_congested(void) return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); } -extern int fscache_wait_atomic_t(atomic_t *); - /* * object.c */ diff --git a/fs/fscache/main.c b/fs/fscache/main.c index b39d487ccfb0..249968dcbf5c 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -195,12 +195,3 @@ static void __exit fscache_exit(void) } module_exit(fscache_exit); - -/* - * wait_on_atomic_t() sleep function for uninterruptible waiting - */ -int fscache_wait_atomic_t(atomic_t *p) -{ - schedule(); - return 0; -} diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 0ad3fd3ad0b4..961029e04027 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -1175,7 +1175,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, return; } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next = 0; do { if (!pagevec_lookup(&pvec, mapping, &next)) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 13c65dd2d37d..17f0d05bfd4c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -33,7 +33,7 @@ static struct fuse_dev *fuse_get_dev(struct file *file) * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. */ - return ACCESS_ONCE(file->private_data); + return READ_ONCE(file->private_data); } static void fuse_request_init(struct fuse_req *req, struct page **pages, @@ -1636,7 +1636,7 @@ out_finish: static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) { - release_pages(req->pages, req->num_pages, false); + release_pages(req->pages, req->num_pages); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 94a745acaef8..624f18bbfd2b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -31,7 +31,7 @@ static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); -static int set_global_limit(const char *val, struct kernel_param *kp); +static int set_global_limit(const char *val, const struct kernel_param *kp); unsigned max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, @@ -130,7 +130,7 @@ static void fuse_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - if (inode->i_sb->s_flags & MS_ACTIVE) { + if (inode->i_sb->s_flags & SB_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); @@ -141,7 +141,7 @@ static void fuse_evict_inode(struct inode *inode) static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if (*flags & MS_MANDLOCK) + if (*flags & SB_MANDLOCK) return -EINVAL; return 0; @@ -823,7 +823,7 @@ static void sanitize_global_limit(unsigned *limit) *limit = (1 << 16) - 1; } -static int set_global_limit(const char *val, struct kernel_param *kp) +static int set_global_limit(const char *val, const struct kernel_param *kp) { int rv; @@ -1056,10 +1056,10 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) int is_bdev = sb->s_bdev != NULL; err = -EINVAL; - if (sb->s_flags & MS_MANDLOCK) + if (sb->s_flags & SB_MANDLOCK) goto err; - sb->s_flags &= ~(MS_NOSEC | SB_I_VERSION); + sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); if (!parse_fuse_opt(data, &d, is_bdev)) goto err; @@ -1109,9 +1109,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_dev_free; /* Handle umasking inside the fuse code */ - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) fc->dont_mask = 1; - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; fc->default_permissions = d.default_permissions; fc->allow_other = d.allow_other; @@ -1273,9 +1273,9 @@ static int __init fuse_fs_init(void) int err; fuse_inode_cachep = kmem_cache_create("fuse_inode", - sizeof(struct fuse_inode), 0, - SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, - fuse_inode_init_once); + sizeof(struct fuse_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT, + fuse_inode_init_once); err = -ENOMEM; if (!fuse_inode_cachep) goto out; diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 90c6a8faaecb..43c827a7cce5 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -4,6 +4,7 @@ config GFS2_FS select FS_POSIX_ACL select CRC32 select QUOTACTL + select FS_IOMAP help A cluster filesystem. diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 9d5eecb123de..776717f1eeea 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -141,6 +141,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) ret = __gfs2_set_acl(inode, acl, type); if (!ret && mode != inode->i_mode) { + inode->i_ctime = current_time(inode); inode->i_mode = mode; mark_inode_dirty(inode); } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 68ed06962537..1daf15a1f00c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -280,22 +280,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, for(i = 0; i < nr_pages; i++) { struct page *page = pvec->pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - ret = 1; - break; - } - *done_index = page->index; lock_page(page); @@ -387,7 +371,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, int range_whole = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -413,8 +397,8 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag); if (nr_pages == 0) break; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 3dd0cceefa43..d5f0d96169c5 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -13,6 +13,7 @@ #include <linux/blkdev.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> +#include <linux/iomap.h> #include "gfs2.h" #include "incore.h" @@ -36,6 +37,8 @@ struct metapath { struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; __u16 mp_list[GFS2_MAX_META_HEIGHT]; + int mp_fheight; /* find_metapath height */ + int mp_aheight; /* actual height (lookup height) */ }; /** @@ -235,9 +238,9 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block, { unsigned int i; + mp->mp_fheight = height; for (i = height; i--;) mp->mp_list[i] = do_div(block, sdp->sd_inptrs); - } static inline unsigned int metapath_branch_start(const struct metapath *mp) @@ -248,7 +251,7 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp) } /** - * metaptr1 - Return the first possible metadata pointer in a metaath buffer + * metaptr1 - Return the first possible metadata pointer in a metapath buffer * @height: The metadata height (0 = dinode) * @mp: The metapath */ @@ -345,10 +348,13 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) for (x = 0; x < end_of_metadata; x++) { ret = lookup_mp_height(ip, mp, x); if (ret) - return ret; + goto out; } - return ip->i_height; + ret = ip->i_height; +out: + mp->mp_aheight = ret; + return ret; } /** @@ -480,10 +486,11 @@ static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt) * @inode: The GFS2 inode * @lblock: The logical starting block of the extent * @bh_map: This is used to return the mapping details - * @mp: The metapath - * @sheight: The starting height (i.e. whats already mapped) - * @height: The height to build to + * @zero_new: True if newly allocated blocks should be zeroed + * @mp: The metapath, with proper height information calculated * @maxlen: The max number of data blocks to alloc + * @dblock: Pointer to return the resulting new block + * @dblks: Pointer to return the number of blocks allocated * * In this routine we may have to alloc: * i) Indirect blocks to grow the metadata tree height @@ -499,63 +506,63 @@ static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt) * Returns: errno on error */ -static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, - struct buffer_head *bh_map, struct metapath *mp, - const unsigned int sheight, - const unsigned int height, - const size_t maxlen) +static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, + unsigned flags, struct metapath *mp) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct super_block *sb = sdp->sd_vfs; struct buffer_head *dibh = mp->mp_bh[0]; - u64 bn, dblock = 0; + u64 bn; unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; unsigned dblks = 0; unsigned ptrs_per_blk; - const unsigned end_of_metadata = height - 1; + const unsigned end_of_metadata = mp->mp_fheight - 1; int ret; - int eob = 0; enum alloc_state state; __be64 *ptr; __be64 zero_bn = 0; + size_t maxlen = iomap->length >> inode->i_blkbits; - BUG_ON(sheight < 1); + BUG_ON(mp->mp_aheight < 1); BUG_ON(dibh == NULL); gfs2_trans_add_meta(ip->i_gl, dibh); - if (height == sheight) { + if (mp->mp_fheight == mp->mp_aheight) { struct buffer_head *bh; + int eob; + /* Bottom indirect block exists, find unalloced extent size */ ptr = metapointer(end_of_metadata, mp); bh = mp->mp_bh[end_of_metadata]; - dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, - &eob); + dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, + maxlen, &eob); BUG_ON(dblks < 1); state = ALLOC_DATA; } else { /* Need to allocate indirect blocks */ - ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; + ptrs_per_blk = mp->mp_fheight > 1 ? sdp->sd_inptrs : + sdp->sd_diptrs; dblks = min(maxlen, (size_t)(ptrs_per_blk - mp->mp_list[end_of_metadata])); - if (height == ip->i_height) { + if (mp->mp_fheight == ip->i_height) { /* Writing into existing tree, extend tree down */ - iblks = height - sheight; + iblks = mp->mp_fheight - mp->mp_aheight; state = ALLOC_GROW_DEPTH; } else { /* Building up tree height */ state = ALLOC_GROW_HEIGHT; - iblks = height - ip->i_height; + iblks = mp->mp_fheight - ip->i_height; branch_start = metapath_branch_start(mp); - iblks += (height - branch_start); + iblks += (mp->mp_fheight - branch_start); } } /* start of the second part of the function (state machine) */ blks = dblks + iblks; - i = sheight; + i = mp->mp_aheight; do { int error; n = blks - alloced; @@ -573,9 +580,10 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, sizeof(struct gfs2_dinode)); zero_bn = *ptr; } - for (; i - 1 < height - ip->i_height && n > 0; i++, n--) + for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0; + i++, n--) gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++); - if (i - 1 == height - ip->i_height) { + if (i - 1 == mp->mp_fheight - ip->i_height) { i--; gfs2_buffer_copy_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header), @@ -587,7 +595,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, sizeof(struct gfs2_meta_header)); *ptr = zero_bn; state = ALLOC_GROW_DEPTH; - for(i = branch_start; i < height; i++) { + for(i = branch_start; i < mp->mp_fheight; i++) { if (mp->mp_bh[i] == NULL) break; brelse(mp->mp_bh[i]); @@ -599,12 +607,12 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, break; /* Branching from existing tree */ case ALLOC_GROW_DEPTH: - if (i > 1 && i < height) + if (i > 1 && i < mp->mp_fheight) gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]); - for (; i < height && n > 0; i++, n--) + for (; i < mp->mp_fheight && n > 0; i++, n--) gfs2_indirect_init(mp, ip->i_gl, i, mp->mp_list[i-1], bn++); - if (i == height) + if (i == mp->mp_fheight) state = ALLOC_DATA; if (n == 0) break; @@ -615,119 +623,269 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]); dblks = n; ptr = metapointer(end_of_metadata, mp); - dblock = bn; + iomap->addr = bn << inode->i_blkbits; + iomap->flags |= IOMAP_F_NEW; while (n-- > 0) *ptr++ = cpu_to_be64(bn++); - if (buffer_zeronew(bh_map)) { - ret = sb_issue_zeroout(sb, dblock, dblks, - GFP_NOFS); + if (flags & IOMAP_ZERO) { + ret = sb_issue_zeroout(sb, iomap->addr >> inode->i_blkbits, + dblks, GFP_NOFS); if (ret) { fs_err(sdp, "Failed to zero data buffers\n"); - clear_buffer_zeronew(bh_map); + flags &= ~IOMAP_ZERO; } } break; } - } while ((state != ALLOC_DATA) || !dblock); + } while (iomap->addr == IOMAP_NULL_ADDR); - ip->i_height = height; + iomap->length = (u64)dblks << inode->i_blkbits; + ip->i_height = mp->mp_fheight; gfs2_add_inode_blocks(&ip->i_inode, alloced); gfs2_dinode_out(ip, mp->mp_bh[0]->b_data); - map_bh(bh_map, inode->i_sb, dblock); - bh_map->b_size = dblks << inode->i_blkbits; - set_buffer_new(bh_map); return 0; } /** - * gfs2_block_map - Map a block from an inode to a disk block + * hole_size - figure out the size of a hole * @inode: The inode - * @lblock: The logical block number - * @bh_map: The bh to be mapped - * @create: True if its ok to alloc blocks to satify the request + * @lblock: The logical starting block number + * @mp: The metapath * - * Sets buffer_mapped() if successful, sets buffer_boundary() if a - * read of metadata will be required before the next block can be - * mapped. Sets buffer_new() if new blocks were allocated. + * Returns: The hole size in bytes * - * Returns: errno */ +static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct metapath mp_eof; + u64 factor = 1; + int hgt; + u64 holesz = 0; + const __be64 *first, *end, *ptr; + const struct buffer_head *bh; + u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits; + int zeroptrs; + bool done = false; + + /* Get another metapath, to the very last byte */ + find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height); + for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) { + bh = mp->mp_bh[hgt]; + if (bh) { + zeroptrs = 0; + first = metapointer(hgt, mp); + end = (const __be64 *)(bh->b_data + bh->b_size); + + for (ptr = first; ptr < end; ptr++) { + if (*ptr) { + done = true; + break; + } else { + zeroptrs++; + } + } + } else { + zeroptrs = sdp->sd_inptrs; + } + if (factor * zeroptrs >= lblock_stop - lblock + 1) { + holesz = lblock_stop - lblock + 1; + break; + } + holesz += factor * zeroptrs; -int gfs2_block_map(struct inode *inode, sector_t lblock, - struct buffer_head *bh_map, int create) + factor *= sdp->sd_inptrs; + if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1])) + (mp->mp_list[hgt - 1])++; + } + return holesz << inode->i_blkbits; +} + +static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + iomap->addr = (ip->i_no_addr << inode->i_blkbits) + + sizeof(struct gfs2_dinode); + iomap->offset = 0; + iomap->length = i_size_read(inode); + iomap->type = IOMAP_MAPPED; + iomap->flags = IOMAP_F_DATA_INLINE; +} + +/** + * gfs2_iomap_begin - Map blocks from an inode to disk blocks + * @inode: The inode + * @pos: Starting position in bytes + * @length: Length to map, in bytes + * @flags: iomap flags + * @iomap: The iomap structure + * + * Returns: errno + */ +int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - unsigned int bsize = sdp->sd_sb.sb_bsize; - const size_t maxlen = bh_map->b_size >> inode->i_blkbits; + struct metapath mp = { .mp_aheight = 1, }; + unsigned int factor = sdp->sd_sb.sb_bsize; const u64 *arr = sdp->sd_heightsize; __be64 *ptr; - u64 size; - struct metapath mp; + sector_t lblock; + sector_t lend; int ret; int eob; unsigned int len; struct buffer_head *bh; u8 height; - BUG_ON(maxlen == 0); + trace_gfs2_iomap_start(ip, pos, length, flags); + if (!length) { + ret = -EINVAL; + goto out; + } - memset(&mp, 0, sizeof(mp)); - bmap_lock(ip, create); - clear_buffer_mapped(bh_map); - clear_buffer_new(bh_map); - clear_buffer_boundary(bh_map); - trace_gfs2_bmap(ip, bh_map, lblock, create, 1); + if ((flags & IOMAP_REPORT) && gfs2_is_stuffed(ip)) { + gfs2_stuffed_iomap(inode, iomap); + if (pos >= iomap->length) + return -ENOENT; + ret = 0; + goto out; + } + + lblock = pos >> inode->i_blkbits; + lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits; + + iomap->offset = lblock << inode->i_blkbits; + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + iomap->length = (u64)(lend - lblock) << inode->i_blkbits; + iomap->flags = IOMAP_F_MERGED; + bmap_lock(ip, 0); + + /* + * Directory data blocks have a struct gfs2_meta_header header, so the + * remaining size is smaller than the filesystem block size. Logical + * block numbers for directories are in units of this remaining size! + */ if (gfs2_is_dir(ip)) { - bsize = sdp->sd_jbsize; + factor = sdp->sd_jbsize; arr = sdp->sd_jheightsize; } ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]); if (ret) - goto out; + goto out_release; height = ip->i_height; - size = (lblock + 1) * bsize; - while (size > arr[height]) + while ((lblock + 1) * factor > arr[height]) height++; find_metapath(sdp, lblock, &mp, height); - ret = 1; if (height > ip->i_height || gfs2_is_stuffed(ip)) goto do_alloc; + ret = lookup_metapath(ip, &mp); if (ret < 0) - goto out; - if (ret != ip->i_height) + goto out_release; + + if (mp.mp_aheight != ip->i_height) goto do_alloc; + ptr = metapointer(ip->i_height - 1, &mp); if (*ptr == 0) goto do_alloc; - map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr)); + + iomap->type = IOMAP_MAPPED; + iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits; + bh = mp.mp_bh[ip->i_height - 1]; - len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob); - bh_map->b_size = (len << inode->i_blkbits); + len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob); if (eob) - set_buffer_boundary(bh_map); + iomap->flags |= IOMAP_F_BOUNDARY; + iomap->length = (u64)len << inode->i_blkbits; + ret = 0; -out: + +out_release: release_metapath(&mp); - trace_gfs2_bmap(ip, bh_map, lblock, create, ret); - bmap_unlock(ip, create); + bmap_unlock(ip, 0); +out: + trace_gfs2_iomap_end(ip, iomap, ret); return ret; do_alloc: - /* All allocations are done here, firstly check create flag */ - if (!create) { - BUG_ON(gfs2_is_stuffed(ip)); + if (!(flags & IOMAP_WRITE)) { + if (pos >= i_size_read(inode)) { + ret = -ENOENT; + goto out_release; + } ret = 0; + iomap->length = hole_size(inode, lblock, &mp); + goto out_release; + } + + ret = gfs2_iomap_alloc(inode, iomap, flags, &mp); + goto out_release; +} + +/** + * gfs2_block_map - Map a block from an inode to a disk block + * @inode: The inode + * @lblock: The logical block number + * @bh_map: The bh to be mapped + * @create: True if its ok to alloc blocks to satify the request + * + * Sets buffer_mapped() if successful, sets buffer_boundary() if a + * read of metadata will be required before the next block can be + * mapped. Sets buffer_new() if new blocks were allocated. + * + * Returns: errno + */ + +int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh_map, int create) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct iomap iomap; + int ret, flags = 0; + + clear_buffer_mapped(bh_map); + clear_buffer_new(bh_map); + clear_buffer_boundary(bh_map); + trace_gfs2_bmap(ip, bh_map, lblock, create, 1); + + if (create) + flags |= IOMAP_WRITE; + if (buffer_zeronew(bh_map)) + flags |= IOMAP_ZERO; + ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits, + bh_map->b_size, flags, &iomap); + if (ret) { + if (!create && ret == -ENOENT) { + /* Return unmapped buffer beyond the end of file. */ + ret = 0; + } goto out; } - /* At this point ret is the tree depth of already allocated blocks */ - ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen); - goto out; + if (iomap.length > bh_map->b_size) { + iomap.length = bh_map->b_size; + iomap.flags &= ~IOMAP_F_BOUNDARY; + } + if (iomap.addr != IOMAP_NULL_ADDR) + map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits); + bh_map->b_size = iomap.length; + if (iomap.flags & IOMAP_F_BOUNDARY) + set_buffer_boundary(bh_map); + if (iomap.flags & IOMAP_F_NEW) + set_buffer_new(bh_map); + +out: + trace_gfs2_bmap(ip, bh_map, lblock, create, ret); + return ret; } /* diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 81ded5e2aaa2..443cc182cf18 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -10,6 +10,8 @@ #ifndef __BMAP_DOT_H__ #define __BMAP_DOT_H__ +#include <linux/iomap.h> + #include "inode.h" struct inode; @@ -47,6 +49,8 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); extern int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); +extern int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap); extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); extern int gfs2_setattr_size(struct inode *inode, u64 size); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 33a0cb5701a3..58705ef8643a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -60,9 +60,7 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence) loff_t error; switch (whence) { - case SEEK_END: /* These reference inode->i_size */ - case SEEK_DATA: - case SEEK_HOLE: + case SEEK_END: error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { @@ -70,8 +68,21 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence) gfs2_glock_dq_uninit(&i_gh); } break; + + case SEEK_DATA: + error = gfs2_seek_data(file, offset); + break; + + case SEEK_HOLE: + error = gfs2_seek_hole(file, offset); + break; + case SEEK_CUR: case SEEK_SET: + /* + * These don't reference inode->i_size and don't depend on the + * block mapping, so we don't need the glock. + */ error = generic_file_llseek(file, offset, whence); break; default: @@ -108,45 +119,22 @@ static int gfs2_readdir(struct file *file, struct dir_context *ctx) } /** - * fsflags_cvt - * @table: A table of 32 u32 flags - * @val: a 32 bit value to convert - * - * This function can be used to convert between fsflags values and - * GFS2's own flags values. + * fsflag_gfs2flag * - * Returns: the converted flags + * The FS_JOURNAL_DATA_FL flag maps to GFS2_DIF_INHERIT_JDATA for directories, + * and to GFS2_DIF_JDATA for non-directories. */ -static u32 fsflags_cvt(const u32 *table, u32 val) -{ - u32 res = 0; - while(val) { - if (val & 1) - res |= *table; - table++; - val >>= 1; - } - return res; -} - -static const u32 fsflags_to_gfs2[32] = { - [3] = GFS2_DIF_SYNC, - [4] = GFS2_DIF_IMMUTABLE, - [5] = GFS2_DIF_APPENDONLY, - [7] = GFS2_DIF_NOATIME, - [12] = GFS2_DIF_EXHASH, - [14] = GFS2_DIF_INHERIT_JDATA, - [17] = GFS2_DIF_TOPDIR, -}; - -static const u32 gfs2_to_fsflags[32] = { - [gfs2fl_Sync] = FS_SYNC_FL, - [gfs2fl_Immutable] = FS_IMMUTABLE_FL, - [gfs2fl_AppendOnly] = FS_APPEND_FL, - [gfs2fl_NoAtime] = FS_NOATIME_FL, - [gfs2fl_ExHash] = FS_INDEX_FL, - [gfs2fl_TopLevel] = FS_TOPDIR_FL, - [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, +static struct { + u32 fsflag; + u32 gfsflag; +} fsflag_gfs2flag[] = { + {FS_SYNC_FL, GFS2_DIF_SYNC}, + {FS_IMMUTABLE_FL, GFS2_DIF_IMMUTABLE}, + {FS_APPEND_FL, GFS2_DIF_APPENDONLY}, + {FS_NOATIME_FL, GFS2_DIF_NOATIME}, + {FS_INDEX_FL, GFS2_DIF_EXHASH}, + {FS_TOPDIR_FL, GFS2_DIF_TOPDIR}, + {FS_JOURNAL_DATA_FL, GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA}, }; static int gfs2_get_flags(struct file *filp, u32 __user *ptr) @@ -154,17 +142,23 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) struct inode *inode = file_inode(filp); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; - int error; - u32 fsflags; + int i, error; + u32 gfsflags, fsflags = 0; gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); error = gfs2_glock_nq(&gh); if (error) goto out_uninit; - fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); - if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) - fsflags |= FS_JOURNAL_DATA_FL; + gfsflags = ip->i_diskflags; + if (S_ISDIR(inode->i_mode)) + gfsflags &= ~GFS2_DIF_JDATA; + else + gfsflags &= ~GFS2_DIF_INHERIT_JDATA; + for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++) + if (gfsflags & fsflag_gfs2flag[i].gfsflag) + fsflags |= fsflag_gfs2flag[i].fsflag; + if (put_user(fsflags, ptr)) error = -EFAULT; @@ -199,7 +193,6 @@ void gfs2_set_inode_flags(struct inode *inode) GFS2_DIF_APPENDONLY| \ GFS2_DIF_NOATIME| \ GFS2_DIF_SYNC| \ - GFS2_DIF_SYSTEM| \ GFS2_DIF_TOPDIR| \ GFS2_DIF_INHERIT_JDATA) @@ -238,10 +231,6 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) if ((new_flags ^ flags) == 0) goto out; - error = -EINVAL; - if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) - goto out; - error = -EPERM; if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) goto out; @@ -256,7 +245,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) goto out; } if ((flags ^ new_flags) & GFS2_DIF_JDATA) { - if (flags & GFS2_DIF_JDATA) + if (new_flags & GFS2_DIF_JDATA) gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); error = filemap_fdatawrite(inode->i_mapping); if (error) @@ -264,6 +253,8 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) error = filemap_fdatawait(inode->i_mapping); if (error) goto out; + if (new_flags & GFS2_DIF_JDATA) + gfs2_ordered_del_inode(ip); } error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) @@ -271,6 +262,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) error = gfs2_meta_inode_buffer(ip, &bh); if (error) goto out_trans_end; + inode->i_ctime = current_time(inode); gfs2_trans_add_meta(ip->i_gl, bh); ip->i_diskflags = new_flags; gfs2_dinode_out(ip, bh->b_data); @@ -289,19 +281,33 @@ out_drop_write: static int gfs2_set_flags(struct file *filp, u32 __user *ptr) { struct inode *inode = file_inode(filp); - u32 fsflags, gfsflags; + u32 fsflags, gfsflags = 0; + u32 mask; + int i; if (get_user(fsflags, ptr)) return -EFAULT; - gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); - if (!S_ISDIR(inode->i_mode)) { - gfsflags &= ~GFS2_DIF_TOPDIR; - if (gfsflags & GFS2_DIF_INHERIT_JDATA) - gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); - return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_SYSTEM); + for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++) { + if (fsflags & fsflag_gfs2flag[i].fsflag) { + fsflags &= ~fsflag_gfs2flag[i].fsflag; + gfsflags |= fsflag_gfs2flag[i].gfsflag; + } + } + if (fsflags || gfsflags & ~GFS2_FLAGS_USER_SET) + return -EINVAL; + + mask = GFS2_FLAGS_USER_SET; + if (S_ISDIR(inode->i_mode)) { + mask &= ~GFS2_DIF_JDATA; + } else { + /* The GFS2_DIF_TOPDIR flag is only valid for directories. */ + if (gfsflags & GFS2_DIF_TOPDIR) + return -EINVAL; + mask &= ~(GFS2_DIF_TOPDIR | GFS2_DIF_INHERIT_JDATA); } - return do_gfs2_set_flags(filp, gfsflags, ~(GFS2_DIF_SYSTEM | GFS2_DIF_JDATA)); + + return do_gfs2_set_flags(filp, gfsflags, mask); } static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 863749e29bf9..4e971b1c7f92 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -18,7 +18,7 @@ #include <linux/posix_acl.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> -#include <linux/fiemap.h> +#include <linux/iomap.h> #include <linux/security.h> #include <linux/uaccess.h> @@ -189,7 +189,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, gfs2_set_iop(inode); - inode->i_atime.tv_sec = 0; + /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ + inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); inode->i_atime.tv_nsec = 0; unlock_new_inode(inode); @@ -1986,6 +1987,7 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat, struct inode *inode = d_inode(path->dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; + u32 gfsflags; int error; gfs2_holder_mark_uninitialized(&gh); @@ -1995,13 +1997,30 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat, return error; } + gfsflags = ip->i_diskflags; + if (gfsflags & GFS2_DIF_APPENDONLY) + stat->attributes |= STATX_ATTR_APPEND; + if (gfsflags & GFS2_DIF_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); + generic_fillattr(inode, stat); + if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); return 0; } +const struct iomap_ops gfs2_iomap_ops = { + .iomap_begin = gfs2_iomap_begin, +}; + static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -2009,41 +2028,59 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct gfs2_holder gh; int ret; - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); - if (ret) - return ret; - - inode_lock(inode); + inode_lock_shared(inode); ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); if (ret) goto out; - if (gfs2_is_stuffed(ip)) { - u64 phys = ip->i_no_addr << inode->i_blkbits; - u64 size = i_size_read(inode); - u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED| - FIEMAP_EXTENT_DATA_INLINE; - phys += sizeof(struct gfs2_dinode); - phys += start; - if (start + len > size) - len = size - start; - if (start < size) - ret = fiemap_fill_next_extent(fieinfo, start, phys, - len, flags); - if (ret == 1) - ret = 0; - } else { - ret = __generic_block_fiemap(inode, fieinfo, start, len, - gfs2_block_map); - } + ret = iomap_fiemap(inode, fieinfo, start, len, &gfs2_iomap_ops); gfs2_glock_dq_uninit(&gh); + out: - inode_unlock(inode); + inode_unlock_shared(inode); return ret; } +loff_t gfs2_seek_data(struct file *file, loff_t offset) +{ + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + loff_t ret; + + inode_lock_shared(inode); + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (!ret) + ret = iomap_seek_data(inode, offset, &gfs2_iomap_ops); + gfs2_glock_dq_uninit(&gh); + inode_unlock_shared(inode); + + if (ret < 0) + return ret; + return vfs_setpos(file, ret, inode->i_sb->s_maxbytes); +} + +loff_t gfs2_seek_hole(struct file *file, loff_t offset) +{ + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + loff_t ret; + + inode_lock_shared(inode); + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (!ret) + ret = iomap_seek_hole(inode, offset, &gfs2_iomap_ops); + gfs2_glock_dq_uninit(&gh); + inode_unlock_shared(inode); + + if (ret < 0) + return ret; + return vfs_setpos(file, ret, inode->i_sb->s_maxbytes); +} + const struct inode_operations gfs2_file_iops = { .permission = gfs2_permission, .setattr = gfs2_setattr, diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index aace8ce34a18..b5b6341a4f5c 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -109,6 +109,8 @@ extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); extern int gfs2_open_common(struct inode *inode, struct file *file); +extern loff_t gfs2_seek_data(struct file *file, loff_t offset); +extern loff_t gfs2_seek_hole(struct file *file, loff_t offset); extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index a3711f543405..ad55eb86a250 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1065,15 +1065,15 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent sdp->sd_args = *args; if (sdp->sd_args.ar_spectator) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; set_bit(SDF_RORECOVERY, &sdp->sd_flags); } if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; if (sdp->sd_args.ar_nobarrier) set_bit(SDF_NOBARRIERS, &sdp->sd_flags); - sb->s_flags |= MS_NOSEC; + sb->s_flags |= SB_NOSEC; sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; sb->s_d_op = &gfs2_dops; @@ -1257,7 +1257,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, struct gfs2_args args; struct gfs2_sbd *sdp; - if (!(flags & MS_RDONLY)) + if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; bdev = blkdev_get_by_path(dev_name, mode, fs_type); @@ -1313,15 +1313,15 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (s->s_root) { error = -EBUSY; - if ((flags ^ s->s_flags) & MS_RDONLY) + if ((flags ^ s->s_flags) & SB_RDONLY) goto error_super; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); + error = fill_super(s, &args, flags & SB_SILENT ? 1 : 0); if (error) goto error_super; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; bdev->bd_super = s; } @@ -1365,7 +1365,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, pr_warn("gfs2 mount does not exist\n"); return ERR_CAST(s); } - if ((flags ^ s->s_flags) & MS_RDONLY) { + if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return ERR_PTR(-EBUSY); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 8e54f2e3a304..d81d46e19726 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -754,14 +754,15 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); struct backing_dev_info *bdi = inode_to_bdi(metamapping->host); int ret = 0; + bool flush_all = (wbc->sync_mode == WB_SYNC_ALL || gfs2_is_jdata(ip)); - if (wbc->sync_mode == WB_SYNC_ALL) + if (flush_all) gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH); if (bdi->wb.dirty_exceeded) gfs2_ail1_flush(sdp, wbc); else filemap_fdatawrite(metamapping); - if (wbc->sync_mode == WB_SYNC_ALL) + if (flush_all) ret = filemap_fdatawait(metamapping); if (ret) mark_inode_dirty_sync(inode); @@ -1255,10 +1256,10 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) return -EINVAL; if (sdp->sd_args.ar_spectator) - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; - if ((sb->s_flags ^ *flags) & MS_RDONLY) { - if (*flags & MS_RDONLY) + if ((sb->s_flags ^ *flags) & SB_RDONLY) { + if (*flags & SB_RDONLY) error = gfs2_make_fs_ro(sdp); else error = gfs2_make_fs_rw(sdp); @@ -1268,9 +1269,9 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) sdp->sd_args = args; if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; else - sb->s_flags &= ~MS_POSIXACL; + sb->s_flags &= ~SB_POSIXACL; if (sdp->sd_args.ar_nobarrier) set_bit(SDF_NOBARRIERS, &sdp->sd_flags); else diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 2f159265693b..f67a709589d3 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -13,6 +13,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/writeback.h> #include <linux/ktime.h> +#include <linux/iomap.h> #include "incore.h" #include "glock.h" #include "rgrp.h" @@ -470,6 +471,70 @@ TRACE_EVENT(gfs2_bmap, __entry->errno) ); +TRACE_EVENT(gfs2_iomap_start, + + TP_PROTO(const struct gfs2_inode *ip, loff_t pos, ssize_t length, + u16 flags), + + TP_ARGS(ip, pos, length, flags), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, inum ) + __field( loff_t, pos ) + __field( ssize_t, length ) + __field( u16, flags ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->inum = ip->i_no_addr; + __entry->pos = pos; + __entry->length = length; + __entry->flags = flags; + ), + + TP_printk("%u,%u bmap %llu iomap start %llu/%lu flags:%08x", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->pos, + (unsigned long)__entry->length, (u16)__entry->flags) +); + +TRACE_EVENT(gfs2_iomap_end, + + TP_PROTO(const struct gfs2_inode *ip, struct iomap *iomap, int ret), + + TP_ARGS(ip, iomap, ret), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, inum ) + __field( loff_t, offset ) + __field( ssize_t, length ) + __field( u16, flags ) + __field( u16, type ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->inum = ip->i_no_addr; + __entry->offset = iomap->offset; + __entry->length = iomap->length; + __entry->flags = iomap->flags; + __entry->type = iomap->type; + __entry->ret = ret; + ), + + TP_printk("%u,%u bmap %llu iomap end %llu/%lu ty:%d flags:%08x rc:%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->offset, + (unsigned long)__entry->length, (u16)__entry->type, + (u16)__entry->flags, __entry->ret) +); + /* Keep track of blocks as they are allocated/freed */ TRACE_EVENT(gfs2_block_alloc, diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index affef3c066e0..ca8b72d0a831 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -117,7 +117,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) kfree(tr); up_read(&sdp->sd_log_flush_lock); - if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) + if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS) gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); if (alloced) sb_end_intwrite(sdp->sd_vfs); @@ -145,7 +145,7 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, * * This is used in two distinct cases: * i) In ordered write mode - * We put the data buffer on a list so that we can ensure that its + * We put the data buffer on a list so that we can ensure that it's * synced to disk at the right time * ii) In journaled data mode * We need to journal the data block in the same way as metadata in diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index ea09e41dbb49..05de20954659 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -231,7 +231,6 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; struct gfs2_holder rg_gh; - struct buffer_head *dibh; __be64 *dataptrs; u64 bn = 0; u64 bstart = 0; @@ -308,13 +307,8 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, ea->ea_num_ptrs = 0; } - error = gfs2_meta_inode_buffer(ip, &dibh); - if (!error) { - ip->i_inode.i_ctime = current_time(&ip->i_inode); - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - } + ip->i_inode.i_ctime = current_time(&ip->i_inode); + __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); gfs2_trans_end(sdp); @@ -616,7 +610,6 @@ static int gfs2_xattr_get(const struct xattr_handler *handler, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; - bool need_unlock = false; int ret; /* During lookup, SELinux calls this function with the glock locked. */ @@ -625,10 +618,11 @@ static int gfs2_xattr_get(const struct xattr_handler *handler, ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); if (ret) return ret; - need_unlock = true; + } else { + gfs2_holder_mark_uninitialized(&gh); } ret = __gfs2_xattr_get(inode, name, buffer, size, handler->flags); - if (need_unlock) + if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); return ret; } @@ -749,7 +743,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, ea_skeleton_call_t skeleton_call, void *private) { struct gfs2_alloc_parms ap = { .target = blks }; - struct buffer_head *dibh; int error; error = gfs2_rindex_update(GFS2_SB(&ip->i_inode)); @@ -774,13 +767,8 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) goto out_end_trans; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (!error) { - ip->i_inode.i_ctime = current_time(&ip->i_inode); - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - } + ip->i_inode.i_ctime = current_time(&ip->i_inode); + __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); out_end_trans: gfs2_trans_end(GFS2_SB(&ip->i_inode)); @@ -891,7 +879,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct ea_set *es) { struct gfs2_ea_request *er = es->es_er; - struct buffer_head *dibh; int error; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0); @@ -908,14 +895,9 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, if (es->es_el) ea_set_remove_stuffed(ip, es->es_el); - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out; ip->i_inode.i_ctime = current_time(&ip->i_inode); - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); -out: + __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + gfs2_trans_end(GFS2_SB(&ip->i_inode)); return error; } @@ -1111,7 +1093,6 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) { struct gfs2_ea_header *ea = el->el_ea; struct gfs2_ea_header *prev = el->el_prev; - struct buffer_head *dibh; int error; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); @@ -1132,13 +1113,8 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) ea->ea_type = GFS2_EATYPE_UNUSED; } - error = gfs2_meta_inode_buffer(ip, &dibh); - if (!error) { - ip->i_inode.i_ctime = current_time(&ip->i_inode); - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - } + ip->i_inode.i_ctime = current_time(&ip->i_inode); + __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); gfs2_trans_end(GFS2_SB(&ip->i_inode)); @@ -1268,11 +1244,20 @@ static int gfs2_xattr_set(const struct xattr_handler *handler, if (ret) return ret; - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (ret) - return ret; + /* May be called from gfs_setattr with the glock locked. */ + + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + return ret; + } else { + if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE)) + return -EIO; + gfs2_holder_mark_uninitialized(&gh); + } ret = __gfs2_xattr_set(inode, name, value, size, flags, handler->flags); - gfs2_glock_dq_uninit(&gh); + if (gfs2_holder_initialized(&gh)) + gfs2_glock_dq_uninit(&gh); return ret; } diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 8aec5e732abf..b63a4df7327b 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -98,13 +98,11 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_bnode *src_node, int src, int len) { - struct hfs_btree *tree; struct page *src_page, *dst_page; hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; - tree = src_node->tree; src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page[0]; @@ -237,7 +235,6 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { - struct super_block *sb; struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; @@ -249,7 +246,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) return NULL; } - sb = tree->inode->i_sb; size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 894994d2c885..460281b1299e 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -204,11 +204,11 @@ int hfs_mdb_get(struct super_block *sb) attrib = mdb->drAtrb; if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) { pr_warn("filesystem is marked locked, mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } if (!sb_rdonly(sb)) { /* Mark the volume uncleanly unmounted in case we crash */ diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 7e0d65e9586c..173876782f73 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -114,18 +114,18 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + *flags |= SB_NODIRATIME; + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (!(*flags & MS_RDONLY)) { + if (!(*flags & SB_RDONLY)) { if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } } return 0; @@ -407,7 +407,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &hfs_super_operations; sb->s_xattr = hfs_xattr_handlers; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; mutex_init(&sbi->bitmap_lock); res = hfs_mdb_get(sb); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index d77015c3f22c..177fae4e6581 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -127,14 +127,12 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_bnode *src_node, int src, int len) { - struct hfs_btree *tree; struct page **src_page, **dst_page; int l; hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; - tree = src_node->tree; src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page + (src >> PAGE_SHIFT); @@ -401,7 +399,6 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { - struct super_block *sb; struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; @@ -414,7 +411,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) return NULL; } - sb = tree->inode->i_sb; size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index e5bb2de2262a..1d458b716957 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -329,9 +329,9 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfsplus_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (!(*flags & MS_RDONLY)) { + if (!(*flags & SB_RDONLY)) { struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; int force = 0; @@ -340,20 +340,20 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } else if (force) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { pr_warn("filesystem is marked journaled, leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } } return 0; @@ -455,16 +455,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("Filesystem is marked locked, mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !sb_rdonly(sb)) { pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } err = -EINVAL; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 8d6b7e35faf9..c83ece7facc5 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -150,7 +150,6 @@ static int hpfs_readdir(struct file *file, struct dir_context *ctx) if (unlikely(ret < 0)) goto out; ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; - file->f_version = inode->i_version; } next_pos = ctx->pos; if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) { diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index 3b834563b1f1..a4ad18afbdec 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -419,7 +419,6 @@ int hpfs_add_dirent(struct inode *i, c = 1; goto ret; } - i->i_version++; c = hpfs_add_to_dnode(i, dno, name, namelen, new_de, 0); ret: return c; @@ -726,7 +725,6 @@ int hpfs_remove_dirent(struct inode *i, dnode_secno dno, struct hpfs_dirent *de, return 2; } } - i->i_version++; for_all_poss(i, hpfs_pos_del, (t = get_pos(dnode, de)) + 1, 1); hpfs_delete_de(i->i_sb, dnode, de); hpfs_mark_4buffers_dirty(qbh); diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index e0e60b148400..7c49f1ef0c85 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -288,7 +288,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, goto bail; } if (((31 + de->namelen + de->down*4 + 3) & ~3) != le16_to_cpu(de->length)) { - if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & MS_RDONLY) goto ok; + if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & SB_RDONLY) goto ok; hpfs_error(s, "namelen does not match dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); goto bail; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 1516fb4e28f4..f2c3ebcd309c 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -78,7 +78,7 @@ void hpfs_error(struct super_block *s, const char *fmt, ...) else { pr_cont("; remounting read-only\n"); mark_dirty(s, 0); - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; } } else if (sb_rdonly(s)) pr_cont("; going on - but anything won't be destroyed because it's read-only\n"); @@ -235,7 +235,6 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb) ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; - ei->vfs_inode.i_version = 1; return &ei->vfs_inode; } @@ -457,7 +456,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) sync_filesystem(s); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; hpfs_lock(s); uid = sbi->sb_uid; gid = sbi->sb_gid; @@ -488,7 +487,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk; sbi->sb_err = errs; sbi->sb_timeshift = timeshift; - if (!(*flags & MS_RDONLY)) mark_dirty(s, 1); + if (!(*flags & SB_RDONLY)) mark_dirty(s, 1); hpfs_unlock(s); return 0; @@ -614,7 +613,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) goto bail4; } - s->s_flags |= MS_NOATIME; + s->s_flags |= SB_NOATIME; /* Fill superblock stuff */ s->s_magic = HPFS_SUPER_MAGIC; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ed113ea17aff..8a85f3f53446 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -407,7 +407,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next = start; while (next < end) { /* @@ -639,11 +639,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* - * page_put due to reference from alloc_huge_page() * unlock_page because locked by add_to_page_cache() + * page_put due to reference from alloc_huge_page() */ - put_page(page); unlock_page(page); + put_page(page); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) @@ -668,7 +668,6 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) return error; if (ia_valid & ATTR_SIZE) { - error = -EINVAL; if (attr->ia_size & ~huge_page_mask(h)) return -EINVAL; error = hugetlb_vmtruncate(inode, attr->ia_size); diff --git a/fs/inode.c b/fs/inode.c index d1e35b53bb23..03102d6ef044 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -416,7 +416,7 @@ void inode_add_lru(struct inode *inode) { if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) && - !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) + !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE) inode_lru_list_add(inode); } @@ -595,7 +595,7 @@ static void dispose_list(struct list_head *head) * @sb: superblock to operate on * * Make sure that no inodes with zero refcount are retained. This is - * called by superblock shutdown after having MS_ACTIVE flag removed, + * called by superblock shutdown after having SB_ACTIVE flag removed, * so any inode reaching zero refcount during or after that call will * be immediately evicted. */ @@ -1492,7 +1492,7 @@ static void iput_final(struct inode *inode) else drop = generic_drop_inode(inode); - if (!drop && (sb->s_flags & MS_ACTIVE)) { + if (!drop && (sb->s_flags & SB_ACTIVE)) { inode_add_lru(inode); spin_unlock(&inode->i_lock); return; @@ -1644,7 +1644,7 @@ int generic_update_time(struct inode *inode, struct timespec *time, int flags) if (flags & S_MTIME) inode->i_mtime = *time; - if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION)) + if (!(inode->i_sb->s_flags & SB_LAZYTIME) || (flags & S_VERSION)) iflags |= I_DIRTY_SYNC; __mark_inode_dirty(inode, iflags); return 0; @@ -1691,7 +1691,7 @@ bool __atime_needs_update(const struct path *path, struct inode *inode, if (IS_NOATIME(inode)) return false; - if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; if (mnt->mnt_flags & MNT_NOATIME) @@ -2090,7 +2090,7 @@ void inode_set_flags(struct inode *inode, unsigned int flags, WARN_ON_ONCE(flags & ~mask); do { - old_flags = ACCESS_ONCE(inode->i_flags); + old_flags = READ_ONCE(inode->i_flags); new_flags = (old_flags & ~mask) | flags; } while (unlikely(cmpxchg(&inode->i_flags, old_flags, new_flags) != old_flags)); diff --git a/fs/internal.h b/fs/internal.h index 48cee21b4f14..df262f41a0ef 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -55,6 +55,7 @@ extern void __init chrdev_init(void); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); +long do_unlinkat(int dfd, struct filename *name); /* * namespace.c diff --git a/fs/iomap.c b/fs/iomap.c index d4801f8dd4fd..47d29ccffaef 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -350,8 +350,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, struct iomap *iomap) { - sector_t sector = iomap->blkno + - (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); + sector_t sector = (iomap->addr + + (pos & PAGE_MASK) - iomap->offset) >> 9; return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector, offset, bytes); @@ -510,11 +510,12 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, flags |= FIEMAP_EXTENT_MERGED; if (iomap->flags & IOMAP_F_SHARED) flags |= FIEMAP_EXTENT_SHARED; + if (iomap->flags & IOMAP_F_DATA_INLINE) + flags |= FIEMAP_EXTENT_DATA_INLINE; return fiemap_fill_next_extent(fi, iomap->offset, - iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, + iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0, iomap->length, flags); - } static loff_t @@ -830,7 +831,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, bio = bio_alloc(GFP_KERNEL, 1); bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = - iomap->blkno + ((pos - iomap->offset) >> 9); + (iomap->addr + pos - iomap->offset) >> 9; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -855,6 +856,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, struct bio *bio; bool need_zeroout = false; int nr_pages, ret; + size_t copied = 0; if ((pos | length | align) & ((1 << blkbits) - 1)) return -EINVAL; @@ -866,7 +868,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, /*FALLTHRU*/ case IOMAP_UNWRITTEN: if (!(dio->flags & IOMAP_DIO_WRITE)) { - iov_iter_zero(length, dio->submit.iter); + length = iov_iter_zero(length, dio->submit.iter); dio->size += length; return length; } @@ -903,13 +905,16 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, } do { - if (dio->error) + size_t n; + if (dio->error) { + iov_iter_revert(dio->submit.iter, copied); return 0; + } bio = bio_alloc(GFP_KERNEL, nr_pages); bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = - iomap->blkno + ((pos - iomap->offset) >> 9); + (iomap->addr + pos - iomap->offset) >> 9; bio->bi_write_hint = dio->iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -917,20 +922,24 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, ret = bio_iov_iter_get_pages(bio, &iter); if (unlikely(ret)) { bio_put(bio); - return ret; + return copied ? copied : ret; } + n = bio->bi_iter.bi_size; if (dio->flags & IOMAP_DIO_WRITE) { bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); - task_io_account_write(bio->bi_iter.bi_size); + task_io_account_write(n); } else { bio_set_op_attrs(bio, REQ_OP_READ, 0); if (dio->flags & IOMAP_DIO_DIRTY) bio_set_pages_dirty(bio); } - dio->size += bio->bi_iter.bi_size; - pos += bio->bi_iter.bi_size; + iov_iter_advance(dio->submit.iter, n); + + dio->size += n; + pos += n; + copied += n; nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); @@ -946,9 +955,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, if (pad) iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); } - - iov_iter_advance(dio->submit.iter, length); - return length; + return copied; } ssize_t @@ -1056,7 +1063,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!(iocb->ki_flags & IOCB_HIPRI) || !dio->submit.last_queue || - !blk_mq_poll(dio->submit.last_queue, + !blk_poll(dio->submit.last_queue, dio->submit.cookie)) io_schedule(); } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 447a24d77b89..bc258a4402f6 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -114,7 +114,7 @@ static void destroy_inodecache(void) static int isofs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if (!(*flags & MS_RDONLY)) + if (!(*flags & SB_RDONLY)) return -EROFS; return 0; } diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 57d4c3e2e94a..055ec6c586f7 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -73,41 +73,41 @@ static inline struct iso_inode_info *ISOFS_I(struct inode *inode) return container_of(inode, struct iso_inode_info, vfs_inode); } -static inline int isonum_711(char *p) +static inline int isonum_711(u8 *p) { - return *(u8 *)p; + return *p; } -static inline int isonum_712(char *p) +static inline int isonum_712(s8 *p) { - return *(s8 *)p; + return *p; } -static inline unsigned int isonum_721(char *p) +static inline unsigned int isonum_721(u8 *p) { return get_unaligned_le16(p); } -static inline unsigned int isonum_722(char *p) +static inline unsigned int isonum_722(u8 *p) { return get_unaligned_be16(p); } -static inline unsigned int isonum_723(char *p) +static inline unsigned int isonum_723(u8 *p) { /* Ignore bigendian datum due to broken mastering programs */ return get_unaligned_le16(p); } -static inline unsigned int isonum_731(char *p) +static inline unsigned int isonum_731(u8 *p) { return get_unaligned_le32(p); } -static inline unsigned int isonum_732(char *p) +static inline unsigned int isonum_732(u8 *p) { return get_unaligned_be32(p); } -static inline unsigned int isonum_733(char *p) +static inline unsigned int isonum_733(u8 *p) { /* Ignore bigendian datum due to broken mastering programs */ return get_unaligned_le32(p); } -extern int iso_date(char *, int); +extern int iso_date(u8 *, int); struct inode; /* To make gcc happy */ diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h index ef03625431bb..1558cf22ef8a 100644 --- a/fs/isofs/rock.h +++ b/fs/isofs/rock.h @@ -7,78 +7,78 @@ */ struct SU_SP_s { - unsigned char magic[2]; - unsigned char skip; + __u8 magic[2]; + __u8 skip; } __attribute__ ((packed)); struct SU_CE_s { - char extent[8]; - char offset[8]; - char size[8]; + __u8 extent[8]; + __u8 offset[8]; + __u8 size[8]; }; struct SU_ER_s { - unsigned char len_id; - unsigned char len_des; - unsigned char len_src; - unsigned char ext_ver; - char data[0]; + __u8 len_id; + __u8 len_des; + __u8 len_src; + __u8 ext_ver; + __u8 data[0]; } __attribute__ ((packed)); struct RR_RR_s { - char flags[1]; + __u8 flags[1]; } __attribute__ ((packed)); struct RR_PX_s { - char mode[8]; - char n_links[8]; - char uid[8]; - char gid[8]; + __u8 mode[8]; + __u8 n_links[8]; + __u8 uid[8]; + __u8 gid[8]; }; struct RR_PN_s { - char dev_high[8]; - char dev_low[8]; + __u8 dev_high[8]; + __u8 dev_low[8]; }; struct SL_component { - unsigned char flags; - unsigned char len; - char text[0]; + __u8 flags; + __u8 len; + __u8 text[0]; } __attribute__ ((packed)); struct RR_SL_s { - unsigned char flags; + __u8 flags; struct SL_component link; } __attribute__ ((packed)); struct RR_NM_s { - unsigned char flags; + __u8 flags; char name[0]; } __attribute__ ((packed)); struct RR_CL_s { - char location[8]; + __u8 location[8]; }; struct RR_PL_s { - char location[8]; + __u8 location[8]; }; struct stamp { - char time[7]; + __u8 time[7]; /* actually 6 unsigned, 1 signed */ } __attribute__ ((packed)); struct RR_TF_s { - char flags; + __u8 flags; struct stamp times[0]; /* Variable number of these beasts */ } __attribute__ ((packed)); /* Linux-specific extension for transparent decompression */ struct RR_ZF_s { - char algorithm[2]; - char parms[2]; - char real_size[8]; + __u8 algorithm[2]; + __u8 parms[2]; + __u8 real_size[8]; }; /* @@ -94,9 +94,9 @@ struct RR_ZF_s { #define TF_LONG_FORM 128 struct rock_ridge { - char signature[2]; - unsigned char len; - unsigned char version; + __u8 signature[2]; + __u8 len; + __u8 version; union { struct SU_SP_s SP; struct SU_CE_s CE; diff --git a/fs/isofs/util.c b/fs/isofs/util.c index 42544bf0e222..e88dba721661 100644 --- a/fs/isofs/util.c +++ b/fs/isofs/util.c @@ -16,7 +16,7 @@ * to GMT. Thus we should always be correct. */ -int iso_date(char * p, int flag) +int iso_date(u8 *p, int flag) { int year, month, day, hour, minute, second, tz; int crtime; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 7d5ef3bf3f3e..67546c7ad473 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -165,11 +165,11 @@ static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) * Helper function used to manage commit timeouts */ -static void commit_timeout(unsigned long __data) +static void commit_timeout(struct timer_list *t) { - struct task_struct * p = (struct task_struct *) __data; + journal_t *journal = from_timer(journal, t, j_commit_timer); - wake_up_process(p); + wake_up_process(journal->j_task); } /* @@ -197,8 +197,7 @@ static int kjournald2(void *arg) * Set up an interval timer which can be used to trigger a commit wakeup * after the commit interval expires */ - setup_timer(&journal->j_commit_timer, commit_timeout, - (unsigned long)current); + timer_setup(&journal->j_commit_timer, commit_timeout, 0); set_freezable(); @@ -738,6 +737,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) return err; } +/* Return 1 when transaction with given tid has already committed. */ +int jbd2_transaction_committed(journal_t *journal, tid_t tid) +{ + int ret = 1; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) + ret = 0; + if (journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid) + ret = 0; + read_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_transaction_committed); + /* * When this function returns the transaction corresponding to tid * will be completed. If the transaction has currently running, start diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index e96c6b05e43e..d8c274d39ddb 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -409,10 +409,10 @@ int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data) mutex_unlock(&c->alloc_sem); } - if (!(*flags & MS_RDONLY)) + if (!(*flags & SB_RDONLY)) jffs2_start_garbage_collect_thread(c); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; return 0; } diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 824e61ede465..c2fbec19c616 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -59,7 +59,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f) } -#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY) +#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & SB_RDONLY) #define SECTOR_ADDR(x) ( (((unsigned long)(x) / c->sector_size) * c->sector_size) ) #ifndef CONFIG_JFFS2_FS_WRITEBUFFER diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 153f1c6eb169..f60dee7faf03 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -301,10 +301,10 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &jffs2_super_operations; sb->s_export_op = &jffs2_export_ops; - sb->s_flags = sb->s_flags | MS_NOATIME; + sb->s_flags = sb->s_flags | SB_NOATIME; sb->s_xattr = jffs2_xattr_handlers; #ifdef CONFIG_JFFS2_FS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif ret = jffs2_do_fill_super(sb, data, silent); return ret; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 1c4b9ad4d7ab..1a3b0cc22ad3 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -663,6 +663,8 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, } else { INCREMENT(mpStat.pagealloc); mp = alloc_metapage(GFP_NOFS); + if (!mp) + goto unlock; mp->page = page; mp->sb = inode->i_sb; mp->flag = 0; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2f14677169c3..90373aebfdca 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -87,7 +87,7 @@ static void jfs_handle_error(struct super_block *sb) else if (sbi->flag & JFS_ERR_REMOUNT_RO) { jfs_err("ERROR: (device %s): remounting filesystem as read-only", sb->s_id); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } /* nothing is done for continue beyond marking the superblock dirty */ @@ -477,7 +477,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) return rc; } - if (sb_rdonly(sb) && !(*flags & MS_RDONLY)) { + if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { /* * Invalidate any previously read metadata. fsck may have * changed the on-disk data since we mounted r/o @@ -488,12 +488,12 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) ret = jfs_mount_rw(sb, 1); /* mark the fs r/w for quota activity */ - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; dquot_resume(sb, -1); return ret; } - if (!sb_rdonly(sb) && (*flags & MS_RDONLY)) { + if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { rc = dquot_suspend(sb, -1); if (rc < 0) return rc; @@ -545,7 +545,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sbi->flag = flag; #ifdef CONFIG_JFS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif if (newLVSize) { @@ -853,7 +853,6 @@ out: } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); - inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); inode_unlock(inode); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 95a7c88baed9..26dd9a50f383 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -335,7 +335,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, deactivate_locked_super(sb); return ERR_PTR(error); } - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; mutex_lock(&kernfs_mutex); list_add(&info->node, &root->supers); diff --git a/fs/libfs.c b/fs/libfs.c index 3aabe553fc45..7ff3cb904acd 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -246,7 +246,7 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, struct inode *root; struct qstr d_name = QSTR_INIT(name, strlen(name)); - s = sget_userns(fs_type, NULL, set_anon_super, MS_KERNMOUNT|MS_NOUSER, + s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER, &init_user_ns, NULL); if (IS_ERR(s)) return ERR_CAST(s); @@ -277,7 +277,7 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, d_instantiate(dentry, root); s->s_root = dentry; s->s_d_op = dops; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; return dget(s->s_root); Enomem: @@ -578,7 +578,7 @@ int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *c spin_lock(&pin_fs_lock); if (unlikely(!*mount)) { spin_unlock(&pin_fs_lock); - mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL); + mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (IS_ERR(mnt)) return PTR_ERR(mnt); spin_lock(&pin_fs_lock); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 0d4e590e0549..826a89184f90 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -578,8 +578,10 @@ static void nlm_complain_hosts(struct net *net) if (ln->nrhosts == 0) return; - printk(KERN_WARNING "lockd: couldn't shutdown host module for net %p!\n", net); - dprintk("lockd: %lu hosts left in net %p:\n", ln->nrhosts, net); + pr_warn("lockd: couldn't shutdown host module for net %x!\n", + net->ns.inum); + dprintk("lockd: %lu hosts left in net %x:\n", ln->nrhosts, + net->ns.inum); } else { if (nrhosts == 0) return; @@ -590,9 +592,9 @@ static void nlm_complain_hosts(struct net *net) for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; - dprintk(" %s (cnt %d use %d exp %ld net %p)\n", + dprintk(" %s (cnt %d use %d exp %ld net %x)\n", host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires, host->net); + host->h_inuse, host->h_expires, host->net->ns.inum); } } @@ -605,7 +607,8 @@ nlm_shutdown_hosts_net(struct net *net) mutex_lock(&nlm_host_mutex); /* First, make all hosts eligible for gc */ - dprintk("lockd: nuking all hosts in net %p...\n", net); + dprintk("lockd: nuking all hosts in net %x...\n", + net ? net->ns.inum : 0); for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; @@ -618,9 +621,8 @@ nlm_shutdown_hosts_net(struct net *net) /* Then, perform a garbage collection pass */ nlm_gc_hosts(net); - mutex_unlock(&nlm_host_mutex); - nlm_complain_hosts(net); + mutex_unlock(&nlm_host_mutex); } /* @@ -646,7 +648,8 @@ nlm_gc_hosts(struct net *net) struct hlist_node *next; struct nlm_host *host; - dprintk("lockd: host garbage collection for net %p\n", net); + dprintk("lockd: host garbage collection for net %x\n", + net ? net->ns.inum : 0); for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; @@ -662,9 +665,10 @@ nlm_gc_hosts(struct net *net) if (atomic_read(&host->h_count) || host->h_inuse || time_before(jiffies, host->h_expires)) { dprintk("nlm_gc_hosts skipping %s " - "(cnt %d use %d exp %ld net %p)\n", + "(cnt %d use %d exp %ld net %x)\n", host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires, host->net); + host->h_inuse, host->h_expires, + host->net->ns.inum); continue; } nlm_destroy_host_locked(host); diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 9fbbd11f9ecb..96cfb2967ac7 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -110,7 +110,8 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, clnt = nsm_create(host->net, host->nodename); if (IS_ERR(clnt)) { dprintk("lockd: failed to create NSM upcall transport, " - "status=%ld, net=%p\n", PTR_ERR(clnt), host->net); + "status=%ld, net=%x\n", PTR_ERR(clnt), + host->net->ns.inum); return PTR_ERR(clnt); } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index b995bdc13976..9c36d614bf89 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -57,6 +57,9 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; +atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0); +DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq); + unsigned int lockd_net_id; /* @@ -259,7 +262,7 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net) if (error < 0) goto err_bind; set_grace_period(net); - dprintk("lockd_up_net: per-net data created; net=%p\n", net); + dprintk("%s: per-net data created; net=%x\n", __func__, net->ns.inum); return 0; err_bind: @@ -274,12 +277,15 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) if (ln->nlmsvc_users) { if (--ln->nlmsvc_users == 0) { nlm_shutdown_hosts_net(net); + cancel_delayed_work_sync(&ln->grace_period_end); + locks_end_grace(&ln->lockd_manager); svc_shutdown_net(serv, net); - dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net); + dprintk("%s: per-net data destroyed; net=%x\n", + __func__, net->ns.inum); } } else { - printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n", - nlmsvc_task, net); + pr_err("%s: no users! task=%p, net=%x\n", + __func__, nlmsvc_task, net->ns.inum); BUG(); } } @@ -290,7 +296,8 @@ static int lockd_inetaddr_event(struct notifier_block *this, struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sockaddr_in sin; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nlm_ntf_refcnt)) goto out; if (nlmsvc_rqst) { @@ -301,6 +308,8 @@ static int lockd_inetaddr_event(struct notifier_block *this, svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, (struct sockaddr *)&sin); } + atomic_dec(&nlm_ntf_refcnt); + wake_up(&nlm_ntf_wq); out: return NOTIFY_DONE; @@ -317,7 +326,8 @@ static int lockd_inet6addr_event(struct notifier_block *this, struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sockaddr_in6 sin6; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nlm_ntf_refcnt)) goto out; if (nlmsvc_rqst) { @@ -329,6 +339,8 @@ static int lockd_inet6addr_event(struct notifier_block *this, svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, (struct sockaddr *)&sin6); } + atomic_dec(&nlm_ntf_refcnt); + wake_up(&nlm_ntf_wq); out: return NOTIFY_DONE; @@ -345,10 +357,12 @@ static void lockd_unregister_notifiers(void) #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif + wait_event(nlm_ntf_wq, atomic_read(&nlm_ntf_refcnt) == 0); } static void lockd_svc_exit_thread(void) { + atomic_dec(&nlm_ntf_refcnt); lockd_unregister_notifiers(); svc_exit_thread(nlmsvc_rqst); } @@ -369,9 +383,11 @@ static int lockd_start_svc(struct svc_serv *serv) printk(KERN_WARNING "lockd_up: svc_rqst allocation failed, error=%d\n", error); + lockd_unregister_notifiers(); goto out_rqst; } + atomic_inc(&nlm_ntf_refcnt); svc_sock_update_bufs(serv); serv->sv_maxconn = nlm_max_connections; @@ -459,13 +475,16 @@ int lockd_up(struct net *net) } error = lockd_up_net(serv, net); - if (error < 0) - goto err_net; + if (error < 0) { + lockd_unregister_notifiers(); + goto err_put; + } error = lockd_start_svc(serv); - if (error < 0) - goto err_start; - + if (error < 0) { + lockd_down_net(serv, net); + goto err_put; + } nlmsvc_users++; /* * Note: svc_serv structures have an initial use count of 1, @@ -476,12 +495,6 @@ err_put: err_create: mutex_unlock(&nlmsvc_mutex); return error; - -err_start: - lockd_down_net(serv, net); -err_net: - lockd_unregister_notifiers(); - goto err_put; } EXPORT_SYMBOL_GPL(lockd_up); @@ -602,7 +615,7 @@ static struct ctl_table nlm_sysctl_root[] = { */ #define param_set_min_max(name, type, which_strtol, min, max) \ -static int param_set_##name(const char *val, struct kernel_param *kp) \ +static int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ char *endp; \ __typeof__(type) num = which_strtol(val, &endp, 0); \ @@ -678,6 +691,17 @@ static int lockd_init_net(struct net *net) static void lockd_exit_net(struct net *net) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + WARN_ONCE(!list_empty(&ln->lockd_manager.list), + "net %x %s: lockd_manager.list is not empty\n", + net->ns.inum, __func__); + WARN_ONCE(!list_empty(&ln->nsm_handles), + "net %x %s: nsm_handles list is not empty\n", + net->ns.inum, __func__); + WARN_ONCE(delayed_work_pending(&ln->grace_period_end), + "net %x %s: grace_period_end was not cancelled\n", + net->ns.inum, __func__); } static struct pernet_operations lockd_net_ops = { diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index a563ddbc19e6..4ec3d6e03e76 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -370,7 +370,7 @@ nlmsvc_mark_resources(struct net *net) { struct nlm_host hint; - dprintk("lockd: nlmsvc_mark_resources for net %p\n", net); + dprintk("lockd: %s for net %x\n", __func__, net ? net->ns.inum : 0); hint.net = net; nlm_traverse_files(&hint, nlmsvc_mark_host, NULL); } diff --git a/fs/locks.c b/fs/locks.c index 1bd71c4d663a..21b4dfa289ee 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -141,7 +141,7 @@ static inline bool is_remote_lock(struct file *filp) { - return likely(!(filp->f_path.dentry->d_sb->s_flags & MS_NOREMOTELOCK)); + return likely(!(filp->f_path.dentry->d_sb->s_flags & SB_NOREMOTELOCK)); } static bool lease_breaking(struct file_lock *fl) diff --git a/fs/mbcache.c b/fs/mbcache.c index d818fd236787..b8b8b9ced9f8 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -269,6 +269,9 @@ static unsigned long mb_cache_count(struct shrinker *shrink, struct mb_cache *cache = container_of(shrink, struct mb_cache, c_shrink); + /* Unlikely, but not impossible */ + if (unlikely(cache->c_entry_count < 0)) + return 0; return cache->c_entry_count; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index b6829d679643..72e308c3e66b 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -125,9 +125,9 @@ static int minix_remount (struct super_block * sb, int * flags, char * data) sync_filesystem(sb); ms = sbi->s_ms; - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { if (ms->s_state & MINIX_VALID_FS || !(sbi->s_mount_state & MINIX_VALID_FS)) return 0; diff --git a/fs/namei.c b/fs/namei.c index ed8b9488a890..9cc91fb7f156 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1129,18 +1129,9 @@ static int follow_automount(struct path *path, struct nameidata *nd, * of the daemon to instantiate them before they can be used. */ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | - LOOKUP_AUTOMOUNT))) { - /* Positive dentry that isn't meant to trigger an - * automount, EISDIR will allow it to be used, - * otherwise there's no mount here "now" so return - * ENOENT. - */ - if (path->dentry->d_inode) - return -EISDIR; - else - return -ENOENT; - } + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + path->dentry->d_inode) + return -EISDIR; if (path->dentry->d_sb->s_user_ns != &init_user_ns) return -EACCES; @@ -1210,7 +1201,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) /* Given that we're not holding a lock here, we retain the value in a * local variable for each dentry as we look at it so that we don't see * the components of that value change under us */ - while (managed = ACCESS_ONCE(path->dentry->d_flags), + while (managed = READ_ONCE(path->dentry->d_flags), managed &= DCACHE_MANAGED_DENTRY, unlikely(managed != 0)) { /* Allow the filesystem to manage the transit without i_mutex @@ -1395,7 +1386,7 @@ int follow_down(struct path *path) unsigned managed; int ret; - while (managed = ACCESS_ONCE(path->dentry->d_flags), + while (managed = READ_ONCE(path->dentry->d_flags), unlikely(managed & DCACHE_MANAGED_DENTRY)) { /* Allow the filesystem to manage the transit without i_mutex * being held. @@ -3459,7 +3450,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, goto out; child = vfs_tmpfile(path.dentry, op->mode, op->open_flag); error = PTR_ERR(child); - if (unlikely(IS_ERR(child))) + if (IS_ERR(child)) goto out2; dput(path.dentry); path.dentry = child; @@ -4010,10 +4001,9 @@ EXPORT_SYMBOL(vfs_unlink); * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ -static long do_unlinkat(int dfd, const char __user *pathname) +long do_unlinkat(int dfd, struct filename *name) { int error; - struct filename *name; struct dentry *dentry; struct path path; struct qstr last; @@ -4022,8 +4012,7 @@ static long do_unlinkat(int dfd, const char __user *pathname) struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: - name = filename_parentat(dfd, getname(pathname), lookup_flags, - &path, &last, &type); + name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (IS_ERR(name)) return PTR_ERR(name); @@ -4065,12 +4054,12 @@ exit2: mnt_drop_write(path.mnt); exit1: path_put(&path); - putname(name); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; inode = NULL; goto retry; } + putname(name); return error; slashes: @@ -4091,12 +4080,12 @@ SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) if (flag & AT_REMOVEDIR) return do_rmdir(dfd, pathname); - return do_unlinkat(dfd, pathname); + return do_unlinkat(dfd, getname(pathname)); } SYSCALL_DEFINE1(unlink, const char __user *, pathname) { - return do_unlinkat(AT_FDCWD, pathname); + return do_unlinkat(AT_FDCWD, getname(pathname)); } int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) diff --git a/fs/namespace.c b/fs/namespace.c index d18deb4c410b..9d1374ab6e06 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -353,7 +353,7 @@ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) + while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) cpu_relax(); /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will @@ -2826,6 +2826,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, SB_DIRSYNC | SB_SILENT | SB_POSIXACL | + SB_LAZYTIME | SB_I_VERSION); if (flags & MS_REMOUNT) diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index b5ec1d980dc9..0c57c5c5d40a 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -120,10 +120,6 @@ static inline int ncp_case_sensitive(const struct inode *i) /* * Note: leave the hash unchanged if the directory * is case-sensitive. - * - * Accessing the parent inode can be racy under RCU pathwalking. - * Use ACCESS_ONCE() to make sure we use _one_ particular inode, - * the callers will handle races. */ static int ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) @@ -148,11 +144,6 @@ ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) return 0; } -/* - * Accessing the parent inode can be racy under RCU pathwalking. - * Use ACCESS_ONCE() to make sure we use _one_ particular inode, - * the callers will handle races. - */ static int ncp_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 6d0f14c86099..41de88cdc053 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -103,7 +103,7 @@ static void destroy_inodecache(void) static int ncp_remount(struct super_block *sb, int *flags, char* data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; return 0; } @@ -547,7 +547,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) else default_bufsize = 1024; - sb->s_flags |= MS_NODIRATIME; /* probably even noatime */ + sb->s_flags |= SB_NODIRATIME; /* probably even noatime */ sb->s_maxbytes = 0xFFFFFFFFU; sb->s_blocksize = 1024; /* Eh... Is this correct? */ sb->s_blocksize_bits = 10; @@ -618,7 +618,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) server->tx.creq = NULL; server->rcv.creq = NULL; - init_timer(&server->timeout_tm); + timer_setup(&server->timeout_tm, ncpdgram_timeout_call, 0); #undef NCP_PACKET_SIZE #define NCP_PACKET_SIZE 131072 error = -ENOMEM; @@ -650,8 +650,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) } else { INIT_WORK(&server->rcv.tq, ncpdgram_rcv_proc); INIT_WORK(&server->timeout_tq, ncpdgram_timeout_proc); - server->timeout_tm.data = (unsigned long)server; - server->timeout_tm.function = ncpdgram_timeout_call; } release_sock(sock->sk); diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index 89031d7e3ae1..f06cde4adf71 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -150,7 +150,7 @@ extern void ncp_tcp_rcv_proc(struct work_struct *work); extern void ncp_tcp_tx_proc(struct work_struct *work); extern void ncpdgram_rcv_proc(struct work_struct *work); extern void ncpdgram_timeout_proc(struct work_struct *work); -extern void ncpdgram_timeout_call(unsigned long server); +extern void ncpdgram_timeout_call(struct timer_list *t); extern void ncp_tcp_data_ready(struct sock* sk); extern void ncp_tcp_write_space(struct sock* sk); extern void ncp_tcp_error_report(struct sock* sk); diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index 7dd7170d6cdf..efb176b1751a 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -117,10 +117,10 @@ void ncp_tcp_write_space(struct sock *sk) schedule_work(&server->tx.tq); } -void ncpdgram_timeout_call(unsigned long v) +void ncpdgram_timeout_call(struct timer_list *t) { - struct ncp_server *server = (void*)v; - + struct ncp_server *server = from_timer(server, t, timeout_tm); + schedule_work(&server->timeout_tq); } diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index b60627bcfc62..ef6729568432 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -67,7 +67,7 @@ out: */ void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq) { - if (atomic_dec_and_test(&dreq->count)) + if (refcount_dec_and_test(&dreq->count)) kfree(dreq); } @@ -87,7 +87,7 @@ static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req) dreq = container_of(req, struct nfs_cache_defer_req, req); dreq->deferred_req.revisit = nfs_dns_cache_revisit; - atomic_inc(&dreq->count); + refcount_inc(&dreq->count); return &dreq->deferred_req; } @@ -99,7 +99,7 @@ struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void) dreq = kzalloc(sizeof(*dreq), GFP_KERNEL); if (dreq) { init_completion(&dreq->completion); - atomic_set(&dreq->count, 1); + refcount_set(&dreq->count, 1); dreq->req.defer = nfs_dns_cache_defer; } return dreq; diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 4e6236a86cf7..220ee409abc4 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -16,7 +16,7 @@ struct nfs_cache_defer_req { struct cache_req req; struct cache_deferred_req deferred_req; struct completion completion; - atomic_t count; + refcount_t count; }; extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index cd9d992feb2e..509dc5adeb8f 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -49,15 +49,15 @@ static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net) if (ret <= 0) goto out_err; nn->nfs_callback_tcpport = ret; - dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", - nn->nfs_callback_tcpport, PF_INET, net); + dprintk("NFS: Callback listener port = %u (af %u, net %x)\n", + nn->nfs_callback_tcpport, PF_INET, net->ns.inum); ret = svc_create_xprt(serv, "tcp", net, PF_INET6, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret > 0) { nn->nfs_callback_tcpport6 = ret; - dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", - nn->nfs_callback_tcpport6, PF_INET6, net); + dprintk("NFS: Callback listener port = %u (af %u, net %x\n", + nn->nfs_callback_tcpport6, PF_INET6, net->ns.inum); } else if (ret != -EAFNOSUPPORT) goto out_err; return 0; @@ -185,7 +185,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc if (--nn->cb_users[minorversion]) return; - dprintk("NFS: destroy per-net callback data; net=%p\n", net); + dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); svc_shutdown_net(serv, net); } @@ -198,7 +198,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, if (nn->cb_users[minorversion]++) return 0; - dprintk("NFS: create per-net callback data; net=%p\n", net); + dprintk("NFS: create per-net callback data; net=%x\n", net->ns.inum); ret = svc_bind(serv, net); if (ret < 0) { @@ -223,7 +223,7 @@ err_socks: err_bind: nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " - "net = %p\n", ret, net); + "net = %x\n", ret, net->ns.inum); return ret; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 19151f6c0e97..2435af56b87e 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -440,7 +440,7 @@ static bool referring_call_exists(struct nfs_client *clp, uint32_t nrclists, struct referring_call_list *rclists) { - bool status = 0; + bool status = false; int i, j; struct nfs4_session *session; struct nfs4_slot_table *tbl; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 22880ef6d8dd..b9129e2befea 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -163,7 +163,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; - atomic_set(&clp->cl_count, 1); + refcount_set(&clp->cl_count, 1); clp->cl_cons_state = NFS_CS_INITING; memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); @@ -269,7 +269,7 @@ void nfs_put_client(struct nfs_client *clp) nn = net_generic(clp->cl_net, nfs_net_id); - if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { + if (refcount_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { list_del(&clp->cl_share_link); nfs_cb_idr_remove_locked(clp); spin_unlock(&nn->nfs_client_lock); @@ -291,12 +291,23 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat const struct sockaddr *sap = data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); +again: list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; + /* If a client is still initializing then we need to wait */ + if (clp->cl_cons_state > NFS_CS_READY) { + refcount_inc(&clp->cl_count); + spin_unlock(&nn->nfs_client_lock); + nfs_wait_client_init_complete(clp); + nfs_put_client(clp); + spin_lock(&nn->nfs_client_lock); + goto again; + } + /* Different NFS versions cannot share the same nfs_client */ if (clp->rpc_ops != data->nfs_mod->rpc_ops) continue; @@ -314,7 +325,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat sap)) continue; - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); return clp; } return NULL; @@ -1006,7 +1017,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, /* Copy data from the source */ server->nfs_client = source->nfs_client; server->destroy = source->destroy; - atomic_inc(&server->nfs_client->cl_count); + refcount_inc(&server->nfs_client->cl_count); nfs_server_copy_userdata(server, source); server->fsid = fattr->fsid; @@ -1166,7 +1177,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v) clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), - atomic_read(&clp->cl_count), + refcount_read(&clp->cl_count), clp->cl_hostname); rcu_read_unlock(); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 606dd3871f66..ade44ca0c66c 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -1041,6 +1041,33 @@ int nfs_delegations_present(struct nfs_client *clp) } /** + * nfs4_refresh_delegation_stateid - Update delegation stateid seqid + * @dst: stateid to refresh + * @inode: inode to check + * + * Returns "true" and updates "dst->seqid" * if inode had a delegation + * that matches our delegation stateid. Otherwise "false" is returned. + */ +bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode) +{ + struct nfs_delegation *delegation; + bool ret = false; + if (!inode) + goto out; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL && + nfs4_stateid_match_other(dst, &delegation->stateid)) { + dst->seqid = delegation->stateid.seqid; + return ret; + } + rcu_read_unlock(); +out: + return ret; +} + +/** * nfs4_copy_delegation_stateid - Copy inode's state ID information * @inode: inode to check * @flags: delegation type requirement diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index ddaf2644cf13..185a09f37a89 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -62,6 +62,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred); +bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5ceaeb1f6fb6..2f3f86726f5b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -118,13 +118,6 @@ nfs_opendir(struct inode *inode, struct file *filp) goto out; } filp->private_data = ctx; - if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { - /* This is a mountpoint, so d_revalidate will never - * have been called, so we need to refresh the - * inode (for close-open consistency) ourselves. - */ - __nfs_revalidate_inode(NFS_SERVER(inode), inode); - } out: put_rpccred(cred); return res; @@ -253,7 +246,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri desc->cache_entry_index = index; return 0; out_eof: - desc->eof = 1; + desc->eof = true; return -EBADCOOKIE; } @@ -307,7 +300,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des if (array->eof_index >= 0) { status = -EBADCOOKIE; if (*desc->dir_cookie == array->last_cookie) - desc->eof = 1; + desc->eof = true; } out: return status; @@ -761,7 +754,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) ent = &array->array[i]; if (!dir_emit(desc->ctx, ent->string.name, ent->string.len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { - desc->eof = 1; + desc->eof = true; break; } desc->ctx->pos++; @@ -773,7 +766,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) ctx->duped = 1; } if (array->eof_index >= 0) - desc->eof = 1; + desc->eof = true; kunmap(desc->page); cache_page_release(desc); @@ -873,7 +866,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res == -EBADCOOKIE) { res = 0; /* This means either end of directory */ - if (*desc->dir_cookie && desc->eof == 0) { + if (*desc->dir_cookie && !desc->eof) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc); if (res == 0) @@ -1081,7 +1074,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) int error; if (flags & LOOKUP_RCU) { - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) return -ECHILD; @@ -1168,7 +1161,7 @@ out_set_verifier: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: if (flags & LOOKUP_RCU) { - if (parent != ACCESS_ONCE(dentry->d_parent)) + if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; } else dput(parent); @@ -1241,8 +1234,7 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - if (nfs_mapping_need_revalidate_inode(inode)) - error = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + error = nfs_lookup_verify_inode(inode, flags); dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", __func__, inode->i_ino, error ? "invalid" : "valid"); return !error; @@ -1264,7 +1256,7 @@ static int nfs_dentry_delete(const struct dentry *dentry) /* Unhash it, so that ->d_iput() would be called */ return 1; } - if (!(dentry->d_sb->s_flags & MS_ACTIVE)) { + if (!(dentry->d_sb->s_flags & SB_ACTIVE)) { /* Unhash it, so that ancestors of killed async unlink * files will be cleaned up during umount */ return 1; @@ -1393,6 +1385,7 @@ static int nfs4_lookup_revalidate(struct dentry *, unsigned int); const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs4_lookup_revalidate, + .d_weak_revalidate = nfs_weak_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, .d_automount = nfs_d_automount, @@ -1582,7 +1575,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) struct inode *dir; if (flags & LOOKUP_RCU) { - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) return -ECHILD; @@ -1596,7 +1589,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) ret = -ECHILD; if (!(flags & LOOKUP_RCU)) dput(parent); - else if (parent != ACCESS_ONCE(dentry->d_parent)) + else if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; goto out; } @@ -2064,7 +2057,7 @@ out: * should mark the directories for revalidation. */ d_move(old_dentry, new_dentry); - nfs_set_verifier(new_dentry, + nfs_set_verifier(old_dentry, nfs_save_change_attribute(new_dir)); } else if (error == -ENOENT) nfs_dentry_handle_enoent(old_dentry); @@ -2369,15 +2362,15 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) } EXPORT_SYMBOL_GPL(nfs_access_add_cache); -#define NFS_MAY_READ (NFS4_ACCESS_READ) -#define NFS_MAY_WRITE (NFS4_ACCESS_MODIFY | \ - NFS4_ACCESS_EXTEND | \ - NFS4_ACCESS_DELETE) -#define NFS_FILE_MAY_WRITE (NFS4_ACCESS_MODIFY | \ - NFS4_ACCESS_EXTEND) +#define NFS_MAY_READ (NFS_ACCESS_READ) +#define NFS_MAY_WRITE (NFS_ACCESS_MODIFY | \ + NFS_ACCESS_EXTEND | \ + NFS_ACCESS_DELETE) +#define NFS_FILE_MAY_WRITE (NFS_ACCESS_MODIFY | \ + NFS_ACCESS_EXTEND) #define NFS_DIR_MAY_WRITE NFS_MAY_WRITE -#define NFS_MAY_LOOKUP (NFS4_ACCESS_LOOKUP) -#define NFS_MAY_EXECUTE (NFS4_ACCESS_EXECUTE) +#define NFS_MAY_LOOKUP (NFS_ACCESS_LOOKUP) +#define NFS_MAY_EXECUTE (NFS_ACCESS_EXECUTE) static int nfs_access_calc_mask(u32 access_result, umode_t umode) { @@ -2425,9 +2418,14 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) if (!may_block) goto out; - /* Be clever: ask server to check for all possible rights */ - cache.mask = NFS_MAY_LOOKUP | NFS_MAY_EXECUTE - | NFS_MAY_WRITE | NFS_MAY_READ; + /* + * Determine which access bits we want to ask for... + */ + cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND; + if (S_ISDIR(inode->i_mode)) + cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP; + else + cache.mask |= NFS_ACCESS_EXECUTE; cache.cred = cred; status = NFS_PROTO(inode)->access(inode, &cache); if (status != 0) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0214dd1e1060..81cca49a8375 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -829,23 +829,9 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; - /* - * VFS doesn't require the open mode to match a flock() lock's type. - * NFS, however, may simulate flock() locking with posix locking which - * requires the open mode to match the lock type. - */ - switch (fl->fl_type) { - case F_UNLCK: + /* We're simulating flock() locks using posix locks on the server */ + if (fl->fl_type == F_UNLCK) return do_unlk(filp, cmd, fl, is_local); - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - return -EBADF; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - } - return do_setlk(filp, cmd, fl, is_local); } EXPORT_SYMBOL_GPL(nfs_flock); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 508126eb49f9..4e54d8b5413a 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -471,10 +471,10 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr) return PNFS_NOT_ATTEMPTED; dprintk("%s USE DS: %s cl_count %d\n", __func__, - ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count)); /* No multipath support. Use first DS */ - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); @@ -515,10 +515,10 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, - offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count)); hdr->pgio_done_cb = filelayout_write_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); @@ -1064,9 +1064,9 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) goto out_err; dprintk("%s ino %lu, how %d cl_count %d\n", __func__, - data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count)); + data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count)); data->commit_done_cb = filelayout_commit_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index b0fa83a60754..c75ad982bcfc 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -187,7 +187,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo, continue; if (!ff_mirror_match_fh(mirror, pos)) continue; - if (atomic_inc_not_zero(&pos->ref)) { + if (refcount_inc_not_zero(&pos->ref)) { spin_unlock(&inode->i_lock); return pos; } @@ -218,7 +218,7 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) mirror = kzalloc(sizeof(*mirror), gfp_flags); if (mirror != NULL) { spin_lock_init(&mirror->lock); - atomic_set(&mirror->ref, 1); + refcount_set(&mirror->ref, 1); INIT_LIST_HEAD(&mirror->mirrors); } return mirror; @@ -242,7 +242,7 @@ static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror) { - if (mirror != NULL && atomic_dec_and_test(&mirror->ref)) + if (mirror != NULL && refcount_dec_and_test(&mirror->ref)) ff_layout_free_mirror(mirror); } @@ -1726,10 +1726,10 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) vers = nfs4_ff_layout_ds_version(lseg, idx); dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, - ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers); + ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); hdr->pgio_done_cb = ff_layout_read_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; fh = nfs4_ff_layout_select_ds_fh(lseg, idx); if (fh) @@ -1785,11 +1785,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, - offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), + offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); hdr->pgio_done_cb = ff_layout_write_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; fh = nfs4_ff_layout_select_ds_fh(lseg, idx); @@ -1863,11 +1863,11 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) vers = nfs4_ff_layout_ds_version(lseg, idx); dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, - data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count), + data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), vers); data->commit_done_cb = ff_layout_commit_done_cb; data->cred = ds_cred; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) @@ -2286,7 +2286,7 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags)) continue; /* mirror refcount put in cleanup_layoutstats */ - if (!atomic_inc_not_zero(&mirror->ref)) + if (!refcount_inc_not_zero(&mirror->ref)) continue; dev = &mirror->mirror_ds->id_node; memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 679cb087ef3f..411798346e48 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -14,6 +14,7 @@ #define FF_FLAGS_NO_IO_THRU_MDS 2 #define FF_FLAGS_NO_READ_IO 4 +#include <linux/refcount.h> #include "../pnfs.h" /* XXX: Let's filter out insanely large mirror count for now to avoid oom @@ -82,7 +83,7 @@ struct nfs4_ff_layout_mirror { nfs4_stateid stateid; struct rpc_cred __rcu *ro_cred; struct rpc_cred __rcu *rw_cred; - atomic_t ref; + refcount_t ref; spinlock_t lock; unsigned long flags; struct nfs4_ff_layoutstat read_stat; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 134d9f560240..b992d2382ffa 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -85,9 +85,9 @@ int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); -int nfs_wait_atomic_killable(atomic_t *p) +int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode) { - return nfs_wait_killable(TASK_KILLABLE); + return nfs_wait_killable(mode); } /** @@ -752,7 +752,7 @@ int nfs_getattr(const struct path *path, struct kstat *stat, * Note that we only have to check the vfsmount flags here: * - NFS always sets S_NOATIME by so checking it would give a * bogus result - * - NFS never sets MS_NOATIME or MS_NODIRATIME so there is + * - NFS never sets SB_NOATIME or SB_NODIRATIME so there is * no point in checking those. */ if ((path->mnt->mnt_flags & MNT_NOATIME) || @@ -783,7 +783,7 @@ EXPORT_SYMBOL_GPL(nfs_getattr); static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) { - atomic_set(&l_ctx->count, 1); + refcount_set(&l_ctx->count, 1); l_ctx->lockowner = current->files; INIT_LIST_HEAD(&l_ctx->list); atomic_set(&l_ctx->io_count, 0); @@ -797,7 +797,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context do { if (pos->lockowner != current->files) continue; - atomic_inc(&pos->count); + refcount_inc(&pos->count); return pos; } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head); return NULL; @@ -836,7 +836,7 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx) struct nfs_open_context *ctx = l_ctx->open_context; struct inode *inode = d_inode(ctx->dentry); - if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) + if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; list_del(&l_ctx->list); spin_unlock(&inode->i_lock); @@ -913,7 +913,7 @@ EXPORT_SYMBOL_GPL(alloc_nfs_open_context); struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { if (ctx != NULL) - atomic_inc(&ctx->lock_context.count); + refcount_inc(&ctx->lock_context.count); return ctx; } EXPORT_SYMBOL_GPL(get_nfs_open_context); @@ -924,11 +924,11 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) struct super_block *sb = ctx->dentry->d_sb; if (!list_empty(&ctx->list)) { - if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + if (!refcount_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) return; list_del(&ctx->list); spin_unlock(&inode->i_lock); - } else if (!atomic_dec_and_test(&ctx->lock_context.count)) + } else if (!refcount_dec_and_test(&ctx->lock_context.count)) return; if (inode != NULL) NFS_PROTO(inode)->close_context(ctx, is_sync); @@ -2084,8 +2084,12 @@ static int nfs_net_init(struct net *net) static void nfs_net_exit(struct net *net) { + struct nfs_net *nn = net_generic(net, nfs_net_id); + nfs_fs_proc_net_exit(net); nfs_cleanup_cb_ident_idr(net); + WARN_ON_ONCE(!list_empty(&nn->nfs_client_list)); + WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); } static struct pernet_operations nfs_net_ops = { diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f9a4a5524bd5..8357ff69962f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -10,7 +10,7 @@ #include <linux/nfs_page.h> #include <linux/wait_bit.h> -#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) +#define NFS_MS_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) extern const struct export_operations nfs_export_ops; @@ -388,7 +388,7 @@ extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); -extern int nfs_wait_atomic_killable(atomic_t *p); +extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode); /* super.c */ extern const struct super_operations nfs_sops; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index bc673fb47fb3..49f848fd1f04 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -188,6 +188,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) { struct nfs3_accessargs arg = { .fh = NFS_FH(inode), + .access = entry->mask, }; struct nfs3_accessres res; struct rpc_message msg = { @@ -196,25 +197,9 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) .rpc_resp = &res, .rpc_cred = entry->cred, }; - int mode = entry->mask; int status = -ENOMEM; dprintk("NFS call access\n"); - - if (mode & MAY_READ) - arg.access |= NFS3_ACCESS_READ; - if (S_ISDIR(inode->i_mode)) { - if (mode & MAY_WRITE) - arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE; - if (mode & MAY_EXEC) - arg.access |= NFS3_ACCESS_LOOKUP; - } else { - if (mode & MAY_WRITE) - arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND; - if (mode & MAY_EXEC) - arg.access |= NFS3_ACCESS_EXECUTE; - } - res.fattr = nfs_alloc_fattr(); if (res.fattr == NULL) goto out; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index dcfcf7fd7438..b374f680830c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -145,7 +145,7 @@ struct nfs4_lock_state { unsigned long ls_flags; struct nfs_seqid_counter ls_seqid; nfs4_stateid ls_stateid; - atomic_t ls_count; + refcount_t ls_count; fl_owner_t ls_owner; }; @@ -162,6 +162,7 @@ enum { NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */ + NFS_STATE_CHANGE_WAIT, /* A state changing operation is outstanding */ }; struct nfs4_state { @@ -185,6 +186,8 @@ struct nfs4_state { unsigned int n_rdwr; /* Number of read/write references */ fmode_t state; /* State on the server (R,W, or RW) */ atomic_t count; + + wait_queue_head_t waitq; }; @@ -458,6 +461,10 @@ extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, const struct nfs_lock_context *, nfs4_stateid *, struct rpc_cred **); +extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst, + struct nfs4_state *state); +extern bool nfs4_copy_open_stateid(nfs4_stateid *dst, + struct nfs4_state *state); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); @@ -465,7 +472,7 @@ extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); extern void nfs_release_seqid(struct nfs_seqid *seqid); extern void nfs_free_seqid(struct nfs_seqid *seqid); -extern int nfs4_setup_sequence(const struct nfs_client *client, +extern int nfs4_setup_sequence(struct nfs_client *client, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task); @@ -475,6 +482,7 @@ extern int nfs4_sequence_done(struct rpc_task *task, extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp); extern const nfs4_stateid zero_stateid; +extern const nfs4_stateid invalid_stateid; /* nfs4super.c */ struct nfs_mount_info; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index e9bea90dc017..65a7e5da508c 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -404,15 +404,19 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, if (error < 0) goto error; - if (!nfs4_has_session(clp)) - nfs_mark_client_ready(clp, NFS_CS_READY); - error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; - if (clp != old) + if (clp != old) { clp->cl_preserve_clid = true; + /* + * Mark the client as having failed initialization so other + * processes walking the nfs_client_list in nfs_match_client() + * won't try to use it. + */ + nfs_mark_client_ready(clp, -EPERM); + } nfs_put_client(clp); clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags); return old; @@ -483,7 +487,7 @@ static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new, * ID and serverowner fields. Wait for CREATE_SESSION * to finish. */ if (pos->cl_cons_state > NFS_CS_READY) { - atomic_inc(&pos->cl_count); + refcount_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); nfs_put_client(*prev); @@ -539,6 +543,9 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + if (pos == new) + goto found; + status = nfs4_match_client(pos, new, &prev, nn); if (status < 0) goto out_unlock; @@ -559,7 +566,8 @@ int nfs40_walk_client_list(struct nfs_client *new, * way that a SETCLIENTID_CONFIRM to pos can succeed is * if new and pos point to the same server: */ - atomic_inc(&pos->cl_count); +found: + refcount_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); nfs_put_client(prev); @@ -572,6 +580,7 @@ int nfs40_walk_client_list(struct nfs_client *new, case 0: nfs4_swap_callback_idents(pos, new); pos->cl_confirm = new->cl_confirm; + nfs_mark_client_ready(pos, NFS_CS_READY); prev = NULL; *result = pos; @@ -715,7 +724,7 @@ int nfs41_walk_client_list(struct nfs_client *new, continue; found: - atomic_inc(&pos->cl_count); + refcount_inc(&pos->cl_count); *result = pos; status = 0; break; @@ -749,7 +758,7 @@ nfs4_find_client_ident(struct net *net, int cb_ident) spin_lock(&nn->nfs_client_lock); clp = idr_find(&nn->cb_ident_idr, cb_ident); if (clp) - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); return clp; } @@ -793,7 +802,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, spin_lock(&nn->nfs_client_lock); list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { - if (nfs4_cb_match_client(addr, clp, minorversion) == false) + if (!nfs4_cb_match_client(addr, clp, minorversion)) continue; if (!nfs4_has_session(clp)) @@ -804,7 +813,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, sid->data, NFS4_MAX_SESSIONID_LEN) != 0) continue; - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); return clp; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f90090e8c959..56fa5a16e097 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -96,6 +96,10 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_open_context *ctx, struct nfs4_label *ilabel, struct nfs4_label *olabel); #ifdef CONFIG_NFS_V4_1 +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, + struct rpc_cred *cred, + struct nfs4_slot *slot, + bool is_privileged); static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, struct rpc_cred *); static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, @@ -254,15 +258,12 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE }; const u32 nfs4_fs_locations_bitmap[3] = { - FATTR4_WORD0_TYPE - | FATTR4_WORD0_CHANGE + FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_FSID | FATTR4_WORD0_FILEID | FATTR4_WORD0_FS_LOCATIONS, - FATTR4_WORD1_MODE - | FATTR4_WORD1_NUMLINKS - | FATTR4_WORD1_OWNER + FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV | FATTR4_WORD1_SPACE_USED @@ -644,13 +645,14 @@ static int nfs40_sequence_done(struct rpc_task *task, #if defined(CONFIG_NFS_V4_1) -static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) +static void nfs41_release_slot(struct nfs4_slot *slot) { struct nfs4_session *session; struct nfs4_slot_table *tbl; - struct nfs4_slot *slot = res->sr_slot; bool send_new_highest_used_slotid = false; + if (!slot) + return; tbl = slot->table; session = tbl->session; @@ -676,13 +678,18 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) send_new_highest_used_slotid = false; out_unlock: spin_unlock(&tbl->slot_tbl_lock); - res->sr_slot = NULL; if (send_new_highest_used_slotid) nfs41_notify_server(session->clp); if (waitqueue_active(&tbl->slot_waitq)) wake_up_all(&tbl->slot_waitq); } +static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) +{ + nfs41_release_slot(res->sr_slot); + res->sr_slot = NULL; +} + static int nfs41_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) { @@ -710,13 +717,6 @@ static int nfs41_sequence_process(struct rpc_task *task, /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: - /* If previous op on slot was interrupted and we reused - * the seq# and got a reply from the cache, then retry - */ - if (task->tk_status == -EREMOTEIO && interrupted) { - ++slot->seq_nr; - goto retry_nowait; - } /* Update the slot's sequence and clientid lease timer */ slot->seq_done = 1; clp = session->clp; @@ -750,16 +750,16 @@ static int nfs41_sequence_process(struct rpc_task *task, * The slot id we used was probably retired. Try again * using a different slot id. */ + if (slot->seq_nr < slot->table->target_highest_slotid) + goto session_recover; goto retry_nowait; case -NFS4ERR_SEQ_MISORDERED: /* * Was the last operation on this sequence interrupted? * If so, retry after bumping the sequence number. */ - if (interrupted) { - ++slot->seq_nr; - goto retry_nowait; - } + if (interrupted) + goto retry_new_seq; /* * Could this slot have been previously retired? * If so, then the server may be expecting seq_nr = 1! @@ -768,10 +768,11 @@ static int nfs41_sequence_process(struct rpc_task *task, slot->seq_nr = 1; goto retry_nowait; } - break; + goto session_recover; case -NFS4ERR_SEQ_FALSE_RETRY: - ++slot->seq_nr; - goto retry_nowait; + if (interrupted) + goto retry_new_seq; + goto session_recover; default: /* Just update the slot sequence no. */ slot->seq_done = 1; @@ -781,6 +782,11 @@ out: dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); out_noaction: return ret; +session_recover: + nfs4_schedule_session_recovery(session, res->sr_status); + goto retry_nowait; +retry_new_seq: + ++slot->seq_nr; retry_nowait: if (rpc_restart_call_prepare(task)) { nfs41_sequence_free_slot(res); @@ -857,6 +863,17 @@ static const struct rpc_call_ops nfs41_call_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; +static void +nfs4_sequence_process_interrupted(struct nfs_client *client, + struct nfs4_slot *slot, struct rpc_cred *cred) +{ + struct rpc_task *task; + + task = _nfs41_proc_sequence(client, cred, slot, true); + if (!IS_ERR(task)) + rpc_put_task_async(task); +} + #else /* !CONFIG_NFS_V4_1 */ static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) @@ -877,9 +894,34 @@ int nfs4_sequence_done(struct rpc_task *task, } EXPORT_SYMBOL_GPL(nfs4_sequence_done); +static void +nfs4_sequence_process_interrupted(struct nfs_client *client, + struct nfs4_slot *slot, struct rpc_cred *cred) +{ + WARN_ON_ONCE(1); + slot->interrupted = 0; +} + #endif /* !CONFIG_NFS_V4_1 */ -int nfs4_setup_sequence(const struct nfs_client *client, +static +void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct nfs4_slot *slot) +{ + if (!slot) + return; + slot->privileged = args->sa_privileged ? 1 : 0; + args->sa_slot = slot; + + res->sr_slot = slot; + res->sr_timestamp = jiffies; + res->sr_status_flags = 0; + res->sr_status = 1; + +} + +int nfs4_setup_sequence(struct nfs_client *client, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task) @@ -897,29 +939,28 @@ int nfs4_setup_sequence(const struct nfs_client *client, task->tk_timeout = 0; } - spin_lock(&tbl->slot_tbl_lock); - /* The state manager will wait until the slot table is empty */ - if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) - goto out_sleep; + for (;;) { + spin_lock(&tbl->slot_tbl_lock); + /* The state manager will wait until the slot table is empty */ + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) + goto out_sleep; + + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + /* Try again in 1/4 second */ + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; + goto out_sleep; + } + spin_unlock(&tbl->slot_tbl_lock); - slot = nfs4_alloc_slot(tbl); - if (IS_ERR(slot)) { - /* Try again in 1/4 second */ - if (slot == ERR_PTR(-ENOMEM)) - task->tk_timeout = HZ >> 2; - goto out_sleep; + if (likely(!slot->interrupted)) + break; + nfs4_sequence_process_interrupted(client, + slot, task->tk_msg.rpc_cred); } - spin_unlock(&tbl->slot_tbl_lock); - - slot->privileged = args->sa_privileged ? 1 : 0; - args->sa_slot = slot; - res->sr_slot = slot; - if (session) { - res->sr_timestamp = jiffies; - res->sr_status_flags = 0; - res->sr_status = 1; - } + nfs4_sequence_attach_slot(args, res, slot); trace_nfs4_setup_sequence(session, args); out_start: @@ -1044,6 +1085,12 @@ struct nfs4_opendata { int rpc_status; }; +struct nfs4_open_createattrs { + struct nfs4_label *label; + struct iattr *sattr; + const __u32 verf[2]; +}; + static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, int err, struct nfs4_exception *exception) { @@ -1113,8 +1160,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, struct nfs4_state_owner *sp, fmode_t fmode, int flags, - const struct iattr *attrs, - struct nfs4_label *label, + const struct nfs4_open_createattrs *c, enum open_claim_type4 claim, gfp_t gfp_mask) { @@ -1122,6 +1168,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, struct inode *dir = d_inode(parent); struct nfs_server *server = NFS_SERVER(dir); struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); + struct nfs4_label *label = (c != NULL) ? c->label : NULL; struct nfs4_opendata *p; p = kzalloc(sizeof(*p), gfp_mask); @@ -1187,15 +1234,11 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, case NFS4_OPEN_CLAIM_DELEG_PREV_FH: p->o_arg.fh = NFS_FH(d_inode(dentry)); } - if (attrs != NULL && attrs->ia_valid != 0) { - __u32 verf[2]; - + if (c != NULL && c->sattr != NULL && c->sattr->ia_valid != 0) { p->o_arg.u.attrs = &p->attrs; - memcpy(&p->attrs, attrs, sizeof(p->attrs)); + memcpy(&p->attrs, c->sattr, sizeof(p->attrs)); - verf[0] = jiffies; - verf[1] = current->pid; - memcpy(p->o_arg.u.verifier.data, verf, + memcpy(p->o_arg.u.verifier.data, c->verf, sizeof(p->o_arg.u.verifier.data)); } p->c_arg.fh = &p->o_res.fh; @@ -1334,6 +1377,25 @@ static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state) } #endif /* CONFIG_NFS_V4_1 */ +static void nfs_state_log_update_open_stateid(struct nfs4_state *state) +{ + if (test_and_clear_bit(NFS_STATE_CHANGE_WAIT, &state->flags)) + wake_up_all(&state->waitq); +} + +static void nfs_state_log_out_of_order_open_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) +{ + u32 state_seqid = be32_to_cpu(state->open_stateid.seqid); + u32 stateid_seqid = be32_to_cpu(stateid->seqid); + + if (stateid_seqid == state_seqid + 1U || + (stateid_seqid == 1U && state_seqid == 0xffffffffU)) + nfs_state_log_update_open_stateid(state); + else + set_bit(NFS_STATE_CHANGE_WAIT, &state->flags); +} + static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) { struct nfs_client *clp = state->owner->so_server->nfs_client; @@ -1349,18 +1411,32 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) nfs4_state_mark_reclaim_nograce(clp, state); } +/* + * Check for whether or not the caller may update the open stateid + * to the value passed in by stateid. + * + * Note: This function relies heavily on the server implementing + * RFC7530 Section 9.1.4.2, and RFC5661 Section 8.2.2 + * correctly. + * i.e. The stateid seqids have to be initialised to 1, and + * are then incremented on every state transition. + */ static bool nfs_need_update_open_stateid(struct nfs4_state *state, - const nfs4_stateid *stateid, nfs4_stateid *freeme) + const nfs4_stateid *stateid) { - if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) - return true; - if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { - nfs4_stateid_copy(freeme, &state->open_stateid); - nfs_test_and_clear_all_open_stateid(state); + if (test_bit(NFS_OPEN_STATE, &state->flags) == 0 || + !nfs4_stateid_match_other(stateid, &state->open_stateid)) { + if (stateid->seqid == cpu_to_be32(1)) + nfs_state_log_update_open_stateid(state); + else + set_bit(NFS_STATE_CHANGE_WAIT, &state->flags); return true; } - if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) + + if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) { + nfs_state_log_out_of_order_open_stateid(state, stateid); return true; + } return false; } @@ -1399,11 +1475,14 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state, if (nfs4_stateid_match_other(stateid, &state->open_stateid) && !nfs4_stateid_is_newer(stateid, &state->open_stateid)) { nfs_resync_open_stateid_locked(state); - return; + goto out; } if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); + trace_nfs4_open_stateid_update(state->inode, stateid, 0); +out: + nfs_state_log_update_open_stateid(state); } static void nfs_clear_open_stateid(struct nfs4_state *state, @@ -1420,29 +1499,60 @@ static void nfs_clear_open_stateid(struct nfs4_state *state, } static void nfs_set_open_stateid_locked(struct nfs4_state *state, - const nfs4_stateid *stateid, fmode_t fmode, - nfs4_stateid *freeme) + const nfs4_stateid *stateid, nfs4_stateid *freeme) { - switch (fmode) { - case FMODE_READ: - set_bit(NFS_O_RDONLY_STATE, &state->flags); + DEFINE_WAIT(wait); + int status = 0; + for (;;) { + + if (!nfs_need_update_open_stateid(state, stateid)) + return; + if (!test_bit(NFS_STATE_CHANGE_WAIT, &state->flags)) break; - case FMODE_WRITE: - set_bit(NFS_O_WRONLY_STATE, &state->flags); + if (status) break; - case FMODE_READ|FMODE_WRITE: - set_bit(NFS_O_RDWR_STATE, &state->flags); + /* Rely on seqids for serialisation with NFSv4.0 */ + if (!nfs4_has_session(NFS_SERVER(state->inode)->nfs_client)) + break; + + prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE); + /* + * Ensure we process the state changes in the same order + * in which the server processed them by delaying the + * update of the stateid until we are in sequence. + */ + write_sequnlock(&state->seqlock); + spin_unlock(&state->owner->so_lock); + rcu_read_unlock(); + trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0); + if (!signal_pending(current)) { + if (schedule_timeout(5*HZ) == 0) + status = -EAGAIN; + else + status = 0; + } else + status = -EINTR; + finish_wait(&state->waitq, &wait); + rcu_read_lock(); + spin_lock(&state->owner->so_lock); + write_seqlock(&state->seqlock); } - if (!nfs_need_update_open_stateid(state, stateid, freeme)) - return; + + if (test_bit(NFS_OPEN_STATE, &state->flags) && + !nfs4_stateid_match_other(stateid, &state->open_stateid)) { + nfs4_stateid_copy(freeme, &state->open_stateid); + nfs_test_and_clear_all_open_stateid(state); + } + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); + trace_nfs4_open_stateid_update(state->inode, stateid, status); + nfs_state_log_update_open_stateid(state); } -static void __update_open_stateid(struct nfs4_state *state, +static void nfs_state_set_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, - const nfs4_stateid *deleg_stateid, fmode_t fmode, nfs4_stateid *freeme) { @@ -1450,17 +1560,34 @@ static void __update_open_stateid(struct nfs4_state *state, * Protect the call to nfs4_state_set_mode_locked and * serialise the stateid update */ - spin_lock(&state->owner->so_lock); write_seqlock(&state->seqlock); - if (deleg_stateid != NULL) { - nfs4_stateid_copy(&state->stateid, deleg_stateid); - set_bit(NFS_DELEGATED_STATE, &state->flags); + nfs_set_open_stateid_locked(state, open_stateid, freeme); + switch (fmode) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); + break; + case FMODE_WRITE: + set_bit(NFS_O_WRONLY_STATE, &state->flags); + break; + case FMODE_READ|FMODE_WRITE: + set_bit(NFS_O_RDWR_STATE, &state->flags); } - if (open_stateid != NULL) - nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme); + set_bit(NFS_OPEN_STATE, &state->flags); + write_sequnlock(&state->seqlock); +} + +static void nfs_state_set_delegation(struct nfs4_state *state, + const nfs4_stateid *deleg_stateid, + fmode_t fmode) +{ + /* + * Protect the call to nfs4_state_set_mode_locked and + * serialise the stateid update + */ + write_seqlock(&state->seqlock); + nfs4_stateid_copy(&state->stateid, deleg_stateid); + set_bit(NFS_DELEGATED_STATE, &state->flags); write_sequnlock(&state->seqlock); - update_open_stateflags(state, fmode); - spin_unlock(&state->owner->so_lock); } static int update_open_stateid(struct nfs4_state *state, @@ -1478,6 +1605,12 @@ static int update_open_stateid(struct nfs4_state *state, fmode &= (FMODE_READ|FMODE_WRITE); rcu_read_lock(); + spin_lock(&state->owner->so_lock); + if (open_stateid != NULL) { + nfs_state_set_open_stateid(state, open_stateid, fmode, &freeme); + ret = 1; + } + deleg_cur = rcu_dereference(nfsi->delegation); if (deleg_cur == NULL) goto no_delegation; @@ -1494,18 +1627,16 @@ static int update_open_stateid(struct nfs4_state *state, goto no_delegation_unlock; nfs_mark_delegation_referenced(deleg_cur); - __update_open_stateid(state, open_stateid, &deleg_cur->stateid, - fmode, &freeme); + nfs_state_set_delegation(state, &deleg_cur->stateid, fmode); ret = 1; no_delegation_unlock: spin_unlock(&deleg_cur->lock); no_delegation: + if (ret) + update_open_stateflags(state, fmode); + spin_unlock(&state->owner->so_lock); rcu_read_unlock(); - if (!ret && open_stateid != NULL) { - __update_open_stateid(state, open_stateid, NULL, fmode, &freeme); - ret = 1; - } if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) nfs4_schedule_state_manager(clp); if (freeme.type != 0) @@ -1761,7 +1892,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context struct nfs4_opendata *opendata; opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, - NULL, NULL, claim, GFP_NOFS); + NULL, claim, GFP_NOFS); if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; @@ -2518,7 +2649,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state) if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { struct rpc_cred *cred = lsp->ls_state->owner->so_cred; - atomic_inc(&lsp->ls_count); + refcount_inc(&lsp->ls_count); spin_unlock(&state->state_lock); nfs4_put_lock_state(prev); @@ -2692,8 +2823,7 @@ out: static int _nfs4_do_open(struct inode *dir, struct nfs_open_context *ctx, int flags, - struct iattr *sattr, - struct nfs4_label *label, + const struct nfs4_open_createattrs *c, int *opened) { struct nfs4_state_owner *sp; @@ -2705,6 +2835,8 @@ static int _nfs4_do_open(struct inode *dir, struct nfs4_threshold **ctx_th = &ctx->mdsthreshold; fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC); enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; + struct iattr *sattr = c->sattr; + struct nfs4_label *label = c->label; struct nfs4_label *olabel = NULL; int status; @@ -2723,8 +2855,8 @@ static int _nfs4_do_open(struct inode *dir, status = -ENOMEM; if (d_really_is_positive(dentry)) claim = NFS4_OPEN_CLAIM_FH; - opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, - label, claim, GFP_KERNEL); + opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, + c, claim, GFP_KERNEL); if (opendata == NULL) goto err_put_state_owner; @@ -2805,10 +2937,18 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_state *res; + struct nfs4_open_createattrs c = { + .label = label, + .sattr = sattr, + .verf = { + [0] = (__u32)jiffies, + [1] = (__u32)current->pid, + }, + }; int status; do { - status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened); + status = _nfs4_do_open(dir, ctx, flags, &c, opened); res = ctx->state; trace_nfs4_open_file(ctx, flags, status); if (status == 0) @@ -3024,18 +3164,20 @@ static void nfs4_close_done(struct rpc_task *task, void *data) calldata->arg.lr_args = NULL; calldata->res.lr_res = NULL; break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_layout_stateid(&calldata->arg.lr_args->stateid, + calldata->inode)) + goto lr_restart; + /* Fallthrough */ case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_UNKNOWN_LAYOUTTYPE: case -NFS4ERR_WRONG_CRED: calldata->arg.lr_args = NULL; calldata->res.lr_res = NULL; - calldata->res.lr_ret = 0; - rpc_restart_call_prepare(task); - return; + goto lr_restart; } } @@ -3051,39 +3193,43 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (calldata->arg.bitmask != NULL) { calldata->arg.bitmask = NULL; calldata->res.fattr = NULL; - task->tk_status = 0; - rpc_restart_call_prepare(task); - goto out_release; + goto out_restart; } break; + case -NFS4ERR_OLD_STATEID: + /* Did we race with OPEN? */ + if (nfs4_refresh_open_stateid(&calldata->arg.stateid, + state)) + goto out_restart; + goto out_release; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: nfs4_free_revoked_stateid(server, &calldata->arg.stateid, task->tk_msg.rpc_cred); - case -NFS4ERR_OLD_STATEID: + /* Fallthrough */ case -NFS4ERR_BAD_STATEID: - if (!nfs4_stateid_match(&calldata->arg.stateid, - &state->open_stateid)) { - rpc_restart_call_prepare(task); - goto out_release; - } - if (calldata->arg.fmode == 0) - break; + break; default: - if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - goto out_release; - } + if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + goto out_restart; } nfs_clear_open_stateid(state, &calldata->arg.stateid, res_stateid, calldata->arg.fmode); out_release: + task->tk_status = 0; nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, &calldata->fattr); dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); + return; +lr_restart: + calldata->res.lr_ret = 0; +out_restart: + task->tk_status = 0; + rpc_restart_call_prepare(task); + goto out_release; } static void nfs4_close_prepare(struct rpc_task *task, void *data) @@ -3103,7 +3249,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); - nfs4_stateid_copy(&calldata->arg.stateid, &state->open_stateid); /* Calculate the change in open mode */ calldata->arg.fmode = 0; if (state->n_rdwr == 0) { @@ -3121,7 +3266,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; if (!nfs4_valid_open_stateid(state) || - test_bit(NFS_OPEN_STATE, &state->flags) == 0) + !nfs4_refresh_open_stateid(&calldata->arg.stateid, state)) call_close = 0; spin_unlock(&state->owner->so_lock); @@ -3215,6 +3360,8 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) calldata->inode = state->inode; calldata->state = state; calldata->arg.fh = NFS_FH(state->inode); + if (!nfs4_copy_open_stateid(&calldata->arg.stateid, state)) + goto out_free_calldata; /* Serialization for the sequence id */ alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask); @@ -3889,6 +4036,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry struct nfs4_accessargs args = { .fh = NFS_FH(inode), .bitmask = server->cache_consistency_bitmask, + .access = entry->mask, }; struct nfs4_accessres res = { .server = server, @@ -3899,26 +4047,8 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry .rpc_resp = &res, .rpc_cred = entry->cred, }; - int mode = entry->mask; int status = 0; - /* - * Determine which access bits we want to ask for... - */ - if (mode & MAY_READ) - args.access |= NFS4_ACCESS_READ; - if (S_ISDIR(inode->i_mode)) { - if (mode & MAY_WRITE) - args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE; - if (mode & MAY_EXEC) - args.access |= NFS4_ACCESS_LOOKUP; - } else { - if (mode & MAY_WRITE) - args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND; - if (mode & MAY_EXEC) - args.access |= NFS4_ACCESS_EXECUTE; - } - res.fattr = nfs_alloc_fattr(); if (res.fattr == NULL) return -ENOMEM; @@ -4843,7 +4973,7 @@ static void nfs4_renew_release(void *calldata) struct nfs4_renewdata *data = calldata; struct nfs_client *clp = data->client; - if (atomic_read(&clp->cl_count) > 1) + if (refcount_read(&clp->cl_count) > 1) nfs4_schedule_state_renewal(clp); nfs_put_client(clp); kfree(data); @@ -4891,7 +5021,7 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, if (renew_flags == 0) return 0; - if (!atomic_inc_not_zero(&clp->cl_count)) + if (!refcount_inc_not_zero(&clp->cl_count)) return -EIO; data = kmalloc(sizeof(*data), GFP_NOFS); if (data == NULL) { @@ -5643,18 +5773,20 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) data->args.lr_args = NULL; data->res.lr_res = NULL; break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_layout_stateid(&data->args.lr_args->stateid, + data->inode)) + goto lr_restart; + /* Fallthrough */ case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_UNKNOWN_LAYOUTTYPE: case -NFS4ERR_WRONG_CRED: data->args.lr_args = NULL; data->res.lr_res = NULL; - data->res.lr_ret = 0; - rpc_restart_call_prepare(task); - return; + goto lr_restart; } } @@ -5668,27 +5800,36 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) nfs4_free_revoked_stateid(data->res.server, data->args.stateid, task->tk_msg.rpc_cred); + /* Fallthrough */ case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: task->tk_status = 0; break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_delegation_stateid(&data->stateid, data->inode)) + goto out_restart; + task->tk_status = 0; + break; case -NFS4ERR_ACCESS: if (data->args.bitmask) { data->args.bitmask = NULL; data->res.fattr = NULL; - task->tk_status = 0; - rpc_restart_call_prepare(task); - return; + goto out_restart; } + /* Fallthrough */ default: if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return; + goto out_restart; } } data->rpc_status = task->tk_status; + return; +lr_restart: + data->res.lr_ret = 0; +out_restart: + task->tk_status = 0; + rpc_restart_call_prepare(task); } static void nfs4_delegreturn_release(void *calldata) @@ -5896,7 +6037,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->arg.seqid = seqid; p->res.seqid = seqid; p->lsp = lsp; - atomic_inc(&lsp->ls_count); + refcount_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); p->l_ctx = nfs_get_lock_context(ctx); @@ -6112,7 +6253,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->res.lock_seqid = p->arg.lock_seqid; p->lsp = lsp; p->server = server; - atomic_inc(&lsp->ls_count); + refcount_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); memcpy(&p->fl, fl, sizeof(p->fl)); return p; @@ -6568,6 +6709,20 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) return -ENOLCK; + /* + * Don't rely on the VFS having checked the file open mode, + * since it won't do this for flock() locks. + */ + switch (request->fl_type) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + } + status = nfs4_set_lock_state(state, request); if (status != 0) return status; @@ -6763,9 +6918,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, struct page *page) { struct nfs_server *server = NFS_SERVER(dir); - u32 bitmask[3] = { - [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, - }; + u32 bitmask[3]; struct nfs4_fs_locations_arg args = { .dir_fh = NFS_FH(dir), .name = name, @@ -6784,12 +6937,15 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, dprintk("%s: start\n", __func__); + bitmask[0] = nfs4_fattr_bitmap[0] | FATTR4_WORD0_FS_LOCATIONS; + bitmask[1] = nfs4_fattr_bitmap[1]; + /* Ask for the fileid of the absent filesystem if mounted_on_fileid * is not supported */ if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) - bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + bitmask[0] &= ~FATTR4_WORD0_FILEID; else - bitmask[0] |= FATTR4_WORD0_FILEID; + bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; @@ -7472,7 +7628,7 @@ nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, struct nfs41_exchange_id_data *calldata; int status; - if (!atomic_inc_not_zero(&clp->cl_count)) + if (!refcount_inc_not_zero(&clp->cl_count)) return ERR_PTR(-EIO); status = -ENOMEM; @@ -8072,7 +8228,7 @@ static void nfs41_sequence_release(void *data) struct nfs4_sequence_data *calldata = data; struct nfs_client *clp = calldata->clp; - if (atomic_read(&clp->cl_count) > 1) + if (refcount_read(&clp->cl_count) > 1) nfs4_schedule_state_renewal(clp); nfs_put_client(clp); kfree(calldata); @@ -8101,7 +8257,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data) trace_nfs4_sequence(clp, task->tk_status); if (task->tk_status < 0) { dprintk("%s ERROR %d\n", __func__, task->tk_status); - if (atomic_read(&clp->cl_count) == 1) + if (refcount_read(&clp->cl_count) == 1) goto out; if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { @@ -8135,6 +8291,7 @@ static const struct rpc_call_ops nfs41_sequence_ops = { static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred, + struct nfs4_slot *slot, bool is_privileged) { struct nfs4_sequence_data *calldata; @@ -8148,15 +8305,18 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, .callback_ops = &nfs41_sequence_ops, .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, }; + struct rpc_task *ret; - if (!atomic_inc_not_zero(&clp->cl_count)) - return ERR_PTR(-EIO); + ret = ERR_PTR(-EIO); + if (!refcount_inc_not_zero(&clp->cl_count)) + goto out_err; + + ret = ERR_PTR(-ENOMEM); calldata = kzalloc(sizeof(*calldata), GFP_NOFS); - if (calldata == NULL) { - nfs_put_client(clp); - return ERR_PTR(-ENOMEM); - } + if (calldata == NULL) + goto out_put_clp; nfs4_init_sequence(&calldata->args, &calldata->res, 0); + nfs4_sequence_attach_slot(&calldata->args, &calldata->res, slot); if (is_privileged) nfs4_set_sequence_privileged(&calldata->args); msg.rpc_argp = &calldata->args; @@ -8164,7 +8324,15 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, calldata->clp = clp; task_setup_data.callback_data = calldata; - return rpc_run_task(&task_setup_data); + ret = rpc_run_task(&task_setup_data); + if (IS_ERR(ret)) + goto out_err; + return ret; +out_put_clp: + nfs_put_client(clp); +out_err: + nfs41_release_slot(slot); + return ret; } static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags) @@ -8174,7 +8342,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) return -EAGAIN; - task = _nfs41_proc_sequence(clp, cred, false); + task = _nfs41_proc_sequence(clp, cred, NULL, false); if (IS_ERR(task)) ret = PTR_ERR(task); else @@ -8188,7 +8356,7 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) struct rpc_task *task; int ret; - task = _nfs41_proc_sequence(clp, cred, true); + task = _nfs41_proc_sequence(clp, cred, NULL, true); if (IS_ERR(task)) { ret = PTR_ERR(task); goto out; @@ -8588,18 +8756,27 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) server = NFS_SERVER(lrp->args.inode); switch (task->tk_status) { + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_layout_stateid(&lrp->args.stateid, + lrp->args.inode)) + goto out_restart; + /* Fallthrough */ default: task->tk_status = 0; + /* Fallthrough */ case 0: break; case -NFS4ERR_DELAY: if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) break; - nfs4_sequence_free_slot(&lrp->res.seq_res); - rpc_restart_call_prepare(task); - return; + goto out_restart; } dprintk("<-- %s\n", __func__); + return; +out_restart: + task->tk_status = 0; + nfs4_sequence_free_slot(&lrp->res.seq_res); + rpc_restart_call_prepare(task); } static void nfs4_layoutreturn_release(void *calldata) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0378e2257ca7..e4f4a09ed9f4 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -69,6 +69,14 @@ const nfs4_stateid zero_stateid = { { .data = { 0 } }, .type = NFS4_SPECIAL_STATEID_TYPE, }; +const nfs4_stateid invalid_stateid = { + { + /* Funky initialiser keeps older gcc versions happy */ + .data = { 0xff, 0xff, 0xff, 0xff, 0 }, + }, + .type = NFS4_INVALID_STATEID_TYPE, +}; + static DEFINE_MUTEX(nfs_clid_init_mutex); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) @@ -645,6 +653,7 @@ nfs4_alloc_open_state(void) INIT_LIST_HEAD(&state->lock_states); spin_lock_init(&state->state_lock); seqlock_init(&state->seqlock); + init_waitqueue_head(&state->waitq); return state; } @@ -825,7 +834,7 @@ __nfs4_find_lock_state(struct nfs4_state *state, ret = pos; } if (ret) - atomic_inc(&ret->ls_count); + refcount_inc(&ret->ls_count); return ret; } @@ -843,7 +852,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f if (lsp == NULL) return NULL; nfs4_init_seqid_counter(&lsp->ls_seqid); - atomic_set(&lsp->ls_count, 1); + refcount_set(&lsp->ls_count, 1); lsp->ls_state = state; lsp->ls_owner = fl_owner; lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); @@ -907,7 +916,7 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (lsp == NULL) return; state = lsp->ls_state; - if (!atomic_dec_and_lock(&lsp->ls_count, &state->state_lock)) + if (!refcount_dec_and_lock(&lsp->ls_count, &state->state_lock)) return; list_del(&lsp->ls_locks); if (list_empty(&state->lock_states)) @@ -927,7 +936,7 @@ static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) struct nfs4_lock_state *lsp = src->fl_u.nfs4_fl.owner; dst->fl_u.nfs4_fl.owner = lsp; - atomic_inc(&lsp->ls_count); + refcount_inc(&lsp->ls_count); } static void nfs4_fl_release_lock(struct file_lock *fl) @@ -985,18 +994,39 @@ out: return ret; } -static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +bool nfs4_refresh_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +{ + bool ret; + int seq; + + do { + ret = false; + seq = read_seqbegin(&state->seqlock); + if (nfs4_state_match_open_stateid_other(state, dst)) { + dst->seqid = state->open_stateid.seqid; + ret = true; + } + } while (read_seqretry(&state->seqlock, seq)); + return ret; +} + +bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) { + bool ret; const nfs4_stateid *src; int seq; do { + ret = false; src = &zero_stateid; seq = read_seqbegin(&state->seqlock); - if (test_bit(NFS_OPEN_STATE, &state->flags)) + if (test_bit(NFS_OPEN_STATE, &state->flags)) { src = &state->open_stateid; + ret = true; + } nfs4_stateid_copy(dst, src); } while (read_seqretry(&state->seqlock, seq)); + return ret; } /* @@ -1177,7 +1207,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; __module_get(THIS_MODULE); - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); /* The rcu_read_lock() is not strictly necessary, as the state * manager is the only thread that ever changes the rpc_xprt @@ -1269,7 +1299,7 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp) might_sleep(); - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, nfs_wait_bit_killable, TASK_KILLABLE); if (res) @@ -1409,6 +1439,11 @@ void nfs_inode_find_state_and_recover(struct inode *inode, found = true; continue; } + if (nfs4_stateid_match_other(&state->open_stateid, stateid) && + nfs4_state_mark_reclaim_nograce(clp, state)) { + found = true; + continue; + } if (nfs_state_lock_state_matches_stateid(state, stateid) && nfs4_state_mark_reclaim_nograce(clp, state)) found = true; @@ -2510,7 +2545,7 @@ static void nfs4_state_manager(struct nfs_client *clp) break; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) break; - } while (atomic_read(&clp->cl_count) > 1); + } while (refcount_read(&clp->cl_count) > 1); return; out_error: if (strlen(section)) diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index e7c6275519b0..a275fba93170 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -202,17 +202,13 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event, TP_ARGS(clp, error), TP_STRUCT__entry( - __string(dstaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR)) + __string(dstaddr, clp->cl_hostname) __field(int, error) ), TP_fast_assign( __entry->error = error; - __assign_str(dstaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR)); + __assign_str(dstaddr, clp->cl_hostname); ), TP_printk( @@ -1066,6 +1062,8 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr); DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update_wait); DECLARE_EVENT_CLASS(nfs4_getattr_event, TP_PROTO( @@ -1133,9 +1131,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __string(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __string(dstaddr, clp ? clp->cl_hostname : "unknown") ), TP_fast_assign( @@ -1148,9 +1144,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown") ), TP_printk( @@ -1192,9 +1186,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __string(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __string(dstaddr, clp ? clp->cl_hostname : "unknown") __field(int, stateid_seq) __field(u32, stateid_hash) ), @@ -1209,9 +1201,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown") __entry->stateid_seq = be32_to_cpu(stateid->seqid); __entry->stateid_hash = diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 14ed9791ec9c..77c6729e57f0 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4385,6 +4385,14 @@ static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *state return decode_stateid(xdr, stateid); } +static int decode_invalid_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + nfs4_stateid dummy; + + nfs4_stateid_copy(stateid, &invalid_stateid); + return decode_stateid(xdr, &dummy); +} + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) { int status; @@ -4393,7 +4401,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_open_stateid(xdr, &res->stateid); + status = decode_invalid_stateid(xdr, &res->stateid); return status; } @@ -6108,6 +6116,8 @@ static int decode_layoutreturn(struct xdr_stream *xdr, res->lrs_present = be32_to_cpup(p); if (res->lrs_present) status = decode_layout_stateid(xdr, &res->stateid); + else + nfs4_stateid_copy(&res->stateid, &invalid_stateid); return status; out_overflow: print_overflow_msg(__func__, xdr); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3bcd669a3152..d602fe9e1ac8 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -251,7 +251,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo) { - atomic_inc(&lo->plh_refcount); + refcount_inc(&lo->plh_refcount); } static struct pnfs_layout_hdr * @@ -296,7 +296,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) pnfs_layoutreturn_before_put_layout_hdr(lo); - if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { if (!list_empty(&lo->plh_segs)) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); @@ -355,6 +355,24 @@ pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg, } /* + * Update the seqid of a layout stateid + */ +bool nfs4_refresh_layout_stateid(nfs4_stateid *dst, struct inode *inode) +{ + struct pnfs_layout_hdr *lo; + bool ret = false; + + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + dst->seqid = lo->plh_stateid.seqid; + ret = true; + } + spin_unlock(&inode->i_lock); + return ret; +} + +/* * Mark a pnfs_layout_hdr and all associated layout segments as invalid * * In order to continue using the pnfs_layout_hdr, a full recovery @@ -395,14 +413,14 @@ pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) { lo->plh_retry_timestamp = jiffies; if (!test_and_set_bit(fail_bit, &lo->plh_flags)) - atomic_inc(&lo->plh_refcount); + refcount_inc(&lo->plh_refcount); } static void pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) { if (test_and_clear_bit(fail_bit, &lo->plh_flags)) - atomic_dec(&lo->plh_refcount); + refcount_dec(&lo->plh_refcount); } static void @@ -450,7 +468,7 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, { INIT_LIST_HEAD(&lseg->pls_list); INIT_LIST_HEAD(&lseg->pls_lc_list); - atomic_set(&lseg->pls_refcount, 1); + refcount_set(&lseg->pls_refcount, 1); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); lseg->pls_layout = lo; lseg->pls_range = *range; @@ -472,7 +490,7 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); list_del_init(&lseg->pls_list); /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ - atomic_dec(&lo->plh_refcount); + refcount_dec(&lo->plh_refcount); if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) return; if (list_empty(&lo->plh_segs) && @@ -507,13 +525,13 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) return; dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, - atomic_read(&lseg->pls_refcount), + refcount_read(&lseg->pls_refcount), test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); lo = lseg->pls_layout; inode = lo->plh_inode; - if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { spin_unlock(&inode->i_lock); return; @@ -551,7 +569,7 @@ pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) { - if (!atomic_dec_and_test(&lseg->pls_refcount)) + if (!refcount_dec_and_test(&lseg->pls_refcount)) return false; pnfs_layout_remove_lseg(lseg->pls_layout, lseg); list_add(&lseg->pls_list, tmp_list); @@ -570,7 +588,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, * outstanding io is finished. */ dprintk("%s: lseg %p ref %d\n", __func__, lseg, - atomic_read(&lseg->pls_refcount)); + refcount_read(&lseg->pls_refcount)); if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) rv = 1; } @@ -1451,7 +1469,7 @@ alloc_init_layout_hdr(struct inode *ino, lo = pnfs_alloc_layout_hdr(ino, gfp_flags); if (!lo) return NULL; - atomic_set(&lo->plh_refcount, 1); + refcount_set(&lo->plh_refcount, 1); INIT_LIST_HEAD(&lo->plh_layouts); INIT_LIST_HEAD(&lo->plh_segs); INIT_LIST_HEAD(&lo->plh_return_segs); @@ -1513,7 +1531,7 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, if ((range->iomode == IOMODE_RW && ls_range->iomode != IOMODE_RW) || (range->iomode != ls_range->iomode && - strict_iomode == true) || + strict_iomode) || !pnfs_lseg_range_intersecting(ls_range, range)) return 0; @@ -1546,7 +1564,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, } dprintk("%s:Return lseg %p ref %d\n", - __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0); + __func__, ret, ret ? refcount_read(&ret->pls_refcount) : 0); return ret; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 87f144f14d1e..8d507c361d98 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -30,6 +30,7 @@ #ifndef FS_NFS_PNFS_H #define FS_NFS_PNFS_H +#include <linux/refcount.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/workqueue.h> @@ -54,7 +55,7 @@ struct nfs4_pnfs_ds { char *ds_remotestr; /* comma sep list of addrs */ struct list_head ds_addrs; struct nfs_client *ds_clp; - atomic_t ds_count; + refcount_t ds_count; unsigned long ds_state; #define NFS4DS_CONNECTING 0 /* ds is establishing connection */ }; @@ -63,7 +64,7 @@ struct pnfs_layout_segment { struct list_head pls_list; struct list_head pls_lc_list; struct pnfs_layout_range pls_range; - atomic_t pls_refcount; + refcount_t pls_refcount; u32 pls_seq; unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; @@ -179,7 +180,7 @@ struct pnfs_layoutdriver_type { }; struct pnfs_layout_hdr { - atomic_t plh_refcount; + refcount_t plh_refcount; atomic_t plh_outstanding; /* number of RPCs out */ struct list_head plh_layouts; /* other client layouts */ struct list_head plh_bulk_destroy; @@ -251,6 +252,7 @@ int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, bool is_recall); int pnfs_destroy_layouts_byclid(struct nfs_client *clp, bool is_recall); +bool nfs4_refresh_layout_stateid(nfs4_stateid *dst, struct inode *inode); void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, @@ -393,7 +395,7 @@ static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { if (lseg) { - atomic_inc(&lseg->pls_refcount); + refcount_inc(&lseg->pls_refcount); smp_mb__after_atomic(); } return lseg; @@ -764,6 +766,11 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) { } +static inline bool nfs4_refresh_layout_stateid(nfs4_stateid *dst, + struct inode *inode) +{ + return false; +} #endif /* CONFIG_NFS_V4_1 */ #if IS_ENABLED(CONFIG_NFS_V4_2) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 60da59be83b6..03aaa60c7768 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -338,7 +338,7 @@ print_ds(struct nfs4_pnfs_ds *ds) " client %p\n" " cl_exchange_flags %x\n", ds->ds_remotestr, - atomic_read(&ds->ds_count), ds->ds_clp, + refcount_read(&ds->ds_count), ds->ds_clp, ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); } @@ -451,7 +451,7 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds) void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds) { - if (atomic_dec_and_lock(&ds->ds_count, + if (refcount_dec_and_lock(&ds->ds_count, &nfs4_ds_cache_lock)) { list_del_init(&ds->ds_node); spin_unlock(&nfs4_ds_cache_lock); @@ -537,7 +537,7 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) INIT_LIST_HEAD(&ds->ds_addrs); list_splice_init(dsaddrs, &ds->ds_addrs); ds->ds_remotestr = remotestr; - atomic_set(&ds->ds_count, 1); + refcount_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); ds->ds_clp = NULL; list_add(&ds->ds_node, &nfs4_data_server_cache); @@ -546,10 +546,10 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) } else { kfree(remotestr); kfree(ds); - atomic_inc(&tmp_ds->ds_count); + refcount_inc(&tmp_ds->ds_count); dprintk("%s data server %s found, inc'ed ds_count to %d\n", __func__, tmp_ds->ds_remotestr, - atomic_read(&tmp_ds->ds_count)); + refcount_read(&tmp_ds->ds_count)); ds = tmp_ds; } spin_unlock(&nfs4_ds_cache_lock); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index c9d24bae3025..29bacdc56f6a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -813,9 +813,9 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) */ seq_printf(m, "\n\topts:\t"); seq_puts(m, sb_rdonly(root->d_sb) ? "ro" : "rw"); - seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); - seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : ""); - seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); + seq_puts(m, root->d_sb->s_flags & SB_SYNCHRONOUS ? ",sync" : ""); + seq_puts(m, root->d_sb->s_flags & SB_NOATIME ? ",noatime" : ""); + seq_puts(m, root->d_sb->s_flags & SB_NODIRATIME ? ",nodiratime" : ""); nfs_show_mount_options(m, nfss, 1); seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); @@ -1332,7 +1332,7 @@ static int nfs_parse_mount_options(char *raw, mnt->options |= NFS_OPTION_MIGRATION; break; case Opt_nomigration: - mnt->options &= NFS_OPTION_MIGRATION; + mnt->options &= ~NFS_OPTION_MIGRATION; break; /* @@ -1456,18 +1456,21 @@ static int nfs_parse_mount_options(char *raw, switch (token) { case Opt_xprt_udp6: protofamily = AF_INET6; + /* fall through */ case Opt_xprt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp6: protofamily = AF_INET6; + /* fall through */ case Opt_xprt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; break; case Opt_xprt_rdma6: protofamily = AF_INET6; + /* fall through */ case Opt_xprt_rdma: /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; @@ -1494,11 +1497,13 @@ static int nfs_parse_mount_options(char *raw, switch (token) { case Opt_xprt_udp6: mountfamily = AF_INET6; + /* fall through */ case Opt_xprt_udp: mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp6: mountfamily = AF_INET6; + /* fall through */ case Opt_xprt_tcp: mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; break; @@ -1988,9 +1993,9 @@ static int nfs23_validate_mount_data(void *options, args->version = NFS_DEFAULT_VERSION; switch (data->version) { case 1: - data->namlen = 0; + data->namlen = 0; /* fall through */ case 2: - data->bsize = 0; + data->bsize = 0; /* fall through */ case 3: if (data->flags & NFS_MOUNT_VER3) goto out_no_v3; @@ -1998,11 +2003,14 @@ static int nfs23_validate_mount_data(void *options, memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); /* Turn off security negotiation */ extra_flags |= NFS_MOUNT_SECFLAVOUR; + /* fall through */ case 4: if (data->flags & NFS_MOUNT_SECFLAVOUR) goto out_no_sec; + /* fall through */ case 5: memset(data->context, 0, sizeof(data->context)); + /* fall through */ case 6: if (data->flags & NFS_MOUNT_VER3) { if (data->root.size > NFS3_FHSIZE || data->root.size == 0) @@ -2288,11 +2296,11 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) /* * noac is a special case. It implies -o sync, but that's not * necessarily reflected in the mtab options. do_remount_sb - * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the + * will clear SB_SYNCHRONOUS if -o sync wasn't specified in the * remount options, so we have to explicitly reset it. */ if (data->flags & NFS_MOUNT_NOAC) - *flags |= MS_SYNCHRONOUS; + *flags |= SB_SYNCHRONOUS; /* compare new mount options with old ones */ error = nfs_compare_remount_data(nfss, data); @@ -2341,7 +2349,7 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) /* The VFS shouldn't apply the umask to mode bits. We will do * so ourselves when necessary. */ - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; sb->s_time_gran = 1; sb->s_export_op = &nfs_export_ops; } @@ -2371,7 +2379,7 @@ static void nfs_clone_super(struct super_block *sb, /* The VFS shouldn't apply the umask to mode bits. We will do * so ourselves when necessary. */ - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; } nfs_initialise_sb(sb); @@ -2592,11 +2600,11 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, /* -o noac implies -o sync */ if (server->flags & NFS_MOUNT_NOAC) - sb_mntdata.mntflags |= MS_SYNCHRONOUS; + sb_mntdata.mntflags |= SB_SYNCHRONOUS; if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL) - if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS) - sb_mntdata.mntflags |= MS_SYNCHRONOUS; + if (mount_info->cloned->sb->s_flags & SB_SYNCHRONOUS) + sb_mntdata.mntflags |= SB_SYNCHRONOUS; /* Get a superblock - note that we may end up sharing one that already exists */ s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata); @@ -2633,7 +2641,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, if (error) goto error_splat_root; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; out: return mntroot; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index babebbccae2a..4a379d7918f2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -487,10 +487,8 @@ try_again: } ret = nfs_page_group_lock(head); - if (ret < 0) { - nfs_unlock_and_release_request(head); - return ERR_PTR(ret); - } + if (ret < 0) + goto release_request; /* lock each request in the page group */ total_bytes = head->wb_bytes; @@ -515,8 +513,7 @@ try_again: if (ret < 0) { nfs_unroll_locks(inode, head, subreq); nfs_release_request(subreq); - nfs_unlock_and_release_request(head); - return ERR_PTR(ret); + goto release_request; } } /* @@ -532,8 +529,8 @@ try_again: nfs_page_group_unlock(head); nfs_unroll_locks(inode, head, subreq); nfs_unlock_and_release_request(subreq); - nfs_unlock_and_release_request(head); - return ERR_PTR(-EIO); + ret = -EIO; + goto release_request; } } @@ -576,6 +573,10 @@ try_again: /* still holds ref on head from nfs_page_find_head_request * and still has lock on head from lock loop */ return head; + +release_request: + nfs_unlock_and_release_request(head); + return ERR_PTR(ret); } static void nfs_write_error_remove_page(struct nfs_page *req) @@ -1889,6 +1890,8 @@ int nfs_commit_inode(struct inode *inode, int how) if (res) error = nfs_generic_commit_list(inode, &head, how, &cinfo); nfs_commit_end(cinfo.mds); + if (res == 0) + return res; if (error < 0) goto out_error; if (!may_wait) diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 420d3a0ab258..5be08f02a76b 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -30,7 +30,11 @@ locks_start_grace(struct net *net, struct lock_manager *lm) struct list_head *grace_list = net_generic(net, grace_net_id); spin_lock(&grace_lock); - list_add(&lm->list, grace_list); + if (list_empty(&lm->list)) + list_add(&lm->list, grace_list); + else + WARN(1, "double list_add attempt detected in net %x %s\n", + net->ns.inum, (net == &init_net) ? "(init_net)" : ""); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_start_grace); @@ -55,14 +59,7 @@ locks_end_grace(struct lock_manager *lm) } EXPORT_SYMBOL_GPL(locks_end_grace); -/** - * locks_in_grace - * - * Lock managers call this function to determine when it is OK for them - * to answer ordinary lock requests, and when they should accept only - * lock reclaims. - */ -int +static bool __state_in_grace(struct net *net, bool open) { struct list_head *grace_list = net_generic(net, grace_net_id); @@ -78,15 +75,22 @@ __state_in_grace(struct net *net, bool open) return false; } -int locks_in_grace(struct net *net) +/** + * locks_in_grace + * + * Lock managers call this function to determine when it is OK for them + * to answer ordinary lock requests, and when they should accept only + * lock reclaims. + */ +bool locks_in_grace(struct net *net) { - return __state_in_grace(net, 0); + return __state_in_grace(net, false); } EXPORT_SYMBOL_GPL(locks_in_grace); -int opens_in_grace(struct net *net) +bool opens_in_grace(struct net *net) { - return __state_in_grace(net, 1); + return __state_in_grace(net, true); } EXPORT_SYMBOL_GPL(opens_in_grace); @@ -104,7 +108,9 @@ grace_exit_net(struct net *net) { struct list_head *grace_list = net_generic(net, grace_net_id); - BUG_ON(!list_empty(grace_list)); + WARN_ONCE(!list_empty(grace_list), + "net %x %s: grace_list is not empty\n", + net->ns.inum, __func__); } static struct pernet_operations grace_net_ops = { diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 697f8ae7792d..fdf2aad73470 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -61,6 +61,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) else gi->gid[i] = rqgi->gid[i]; } + + /* Each thread allocates its own gi, no race */ + groups_sort(gi); } else { gi = get_group_info(rqgi); } diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 3f880ae0966b..70b8bf781fce 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -66,7 +66,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, bex->es = PNFS_BLOCK_READ_DATA; else bex->es = PNFS_BLOCK_READWRITE_DATA; - bex->soff = (iomap.blkno << 9); + bex->soff = iomap.addr; break; case IOMAP_UNWRITTEN: if (seg->iomode & IOMODE_RW) { @@ -79,7 +79,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, } bex->es = PNFS_BLOCK_INVALID_DATA; - bex->soff = (iomap.blkno << 9); + bex->soff = iomap.addr; break; } /*FALLTHRU*/ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 46b48dbbdd32..8ceb25a10ea0 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -232,7 +232,7 @@ static struct cache_head *expkey_alloc(void) return NULL; } -static struct cache_detail svc_expkey_cache_template = { +static const struct cache_detail svc_expkey_cache_template = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, .name = "nfsd.fh", @@ -748,7 +748,7 @@ static struct cache_head *svc_export_alloc(void) return NULL; } -static struct cache_detail svc_export_cache_template = { +static const struct cache_detail svc_export_cache_template = { .owner = THIS_MODULE, .hash_size = EXPORT_HASHMAX, .name = "nfsd.export", @@ -1230,7 +1230,7 @@ nfsd_export_init(struct net *net) int rv; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - dprintk("nfsd: initializing export module (net: %p).\n", net); + dprintk("nfsd: initializing export module (net: %x).\n", net->ns.inum); nn->svc_export_cache = cache_create_net(&svc_export_cache_template, net); if (IS_ERR(nn->svc_export_cache)) @@ -1278,7 +1278,7 @@ nfsd_export_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - dprintk("nfsd: shutting down export module (net: %p).\n", net); + dprintk("nfsd: shutting down export module (net: %x).\n", net->ns.inum); cache_unregister_net(nn->svc_expkey_cache, net); cache_unregister_net(nn->svc_export_cache, net); @@ -1286,5 +1286,5 @@ nfsd_export_shutdown(struct net *net) cache_destroy_net(nn->svc_export_cache, net); svcauth_unix_purge(net); - dprintk("nfsd: export shutdown complete (net: %p).\n", net); + dprintk("nfsd: export shutdown complete (net: %x).\n", net->ns.inum); } diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index 6dfede6d172a..84831253203d 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -12,6 +12,7 @@ #include <linux/nsproxy.h> #include <linux/sunrpc/addr.h> #include <linux/uaccess.h> +#include <linux/kernel.h> #include "state.h" #include "netns.h" @@ -126,8 +127,6 @@ static struct nfsd_fault_inject_op inject_ops[] = { }, }; -#define NUM_INJECT_OPS (sizeof(inject_ops)/sizeof(struct nfsd_fault_inject_op)) - int nfsd_fault_inject_init(void) { unsigned int i; @@ -138,7 +137,7 @@ int nfsd_fault_inject_init(void) if (!debug_dir) goto fail; - for (i = 0; i < NUM_INJECT_OPS; i++) { + for (i = 0; i < ARRAY_SIZE(inject_ops); i++) { op = &inject_ops[i]; if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd)) goto fail; diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 3714231a9d0f..36358d435cb0 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -107,7 +107,7 @@ struct nfsd_net { bool lockd_up; /* Time of server startup */ - struct timeval nfssvc_boot; + struct timespec64 nfssvc_boot; /* * Max number of connections this nfsd container will allow. Defaults @@ -119,6 +119,9 @@ struct nfsd_net { u32 clverifier_counter; struct svc_serv *nfsd_serv; + + wait_queue_head_t ntf_wq; + atomic_t ntf_refcnt; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index f38acd905441..2758480555fa 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -748,8 +748,9 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p) if (resp->status == 0) { *p++ = htonl(resp->count); *p++ = htonl(resp->committed); - *p++ = htonl(nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_usec); + /* unique identifier, y2038 overflow can be ignored */ + *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); + *p++ = htonl(nn->nfssvc_boot.tv_nsec); } return xdr_ressize_check(rqstp, p); } @@ -1119,8 +1120,9 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p) p = encode_wcc_data(rqstp, p, &resp->fh); /* Write verifier */ if (resp->status == 0) { - *p++ = htonl(nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_usec); + /* unique identifier, y2038 overflow can be ignored */ + *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); + *p++ = htonl(nn->nfssvc_boot.tv_nsec); } return xdr_ressize_check(rqstp, p); } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 6b9b6cca469f..a5bb76593ce7 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -178,7 +178,7 @@ static struct ent *idtoname_lookup(struct cache_detail *, struct ent *); static struct ent *idtoname_update(struct cache_detail *, struct ent *, struct ent *); -static struct cache_detail idtoname_cache_template = { +static const struct cache_detail idtoname_cache_template = { .owner = THIS_MODULE, .hash_size = ENT_HASHMAX, .name = "nfs4.idtoname", @@ -341,7 +341,7 @@ static struct ent *nametoid_update(struct cache_detail *, struct ent *, struct ent *); static int nametoid_parse(struct cache_detail *, char *, int); -static struct cache_detail nametoid_cache_template = { +static const struct cache_detail nametoid_cache_template = { .owner = THIS_MODULE, .hash_size = ENT_HASHMAX, .name = "nfs4.nametoid", diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index ea45d954e8d7..7d888369f85a 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -336,7 +336,7 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) trace_layout_recall(&ls->ls_stid.sc_stateid); - atomic_inc(&ls->ls_stid.sc_count); + refcount_inc(&ls->ls_stid.sc_count); nfsd4_run_cb(&ls->ls_recall); out_unlock: @@ -441,7 +441,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) goto done; } - atomic_inc(&ls->ls_stid.sc_count); + refcount_inc(&ls->ls_stid.sc_count); list_add_tail(&new->lo_perstate, &ls->ls_layouts); new = NULL; done: diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8487486ec496..008ea0b627d0 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -485,9 +485,6 @@ static __be32 nfsd4_getfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { - if (!cstate->current_fh.fh_dentry) - return nfserr_nofilehandle; - u->getfh = &cstate->current_fh; return nfs_ok; } @@ -535,9 +532,6 @@ static __be32 nfsd4_savefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { - if (!cstate->current_fh.fh_dentry) - return nfserr_nofilehandle; - fh_dup2(&cstate->save_fh, &cstate->current_fh); if (HAS_STATE_ID(cstate, CURRENT_STATE_ID_FLAG)) { memcpy(&cstate->save_stateid, &cstate->current_stateid, sizeof(stateid_t)); @@ -570,10 +564,11 @@ static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) /* * This is opaque to client, so no need to byte-swap. Use - * __force to keep sparse happy + * __force to keep sparse happy. y2038 time_t overflow is + * irrelevant in this usage. */ verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; - verf[1] = (__force __be32)nn->nfssvc_boot.tv_usec; + verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; memcpy(verifier->data, verf, sizeof(verifier->data)); } @@ -703,10 +698,8 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_link *link = &u->link; - __be32 status = nfserr_nofilehandle; + __be32 status; - if (!cstate->save_fh.fh_dentry) - return status; status = nfsd_link(rqstp, &cstate->current_fh, link->li_name, link->li_namelen, &cstate->save_fh); if (!status) @@ -850,10 +843,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_rename *rename = &u->rename; - __be32 status = nfserr_nofilehandle; + __be32 status; - if (!cstate->save_fh.fh_dentry) - return status; if (opens_in_grace(SVC_NET(rqstp)) && !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) return nfserr_grace; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0c04f81aa63b..b29b5a185a2c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -63,12 +63,16 @@ static const stateid_t zero_stateid = { static const stateid_t currentstateid = { .si_generation = 1, }; +static const stateid_t close_stateid = { + .si_generation = 0xffffffffU, +}; static u64 current_sessionid = 1; #define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) #define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) #define CURRENT_STATEID(stateid) (!memcmp((stateid), ¤tstateid, sizeof(stateid_t))) +#define CLOSE_STATEID(stateid) (!memcmp((stateid), &close_stateid, sizeof(stateid_t))) /* forward declarations */ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner); @@ -83,6 +87,11 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid); */ static DEFINE_SPINLOCK(state_lock); +enum nfsd4_st_mutex_lock_subclass { + OPEN_STATEID_MUTEX = 0, + LOCK_STATEID_MUTEX = 1, +}; + /* * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for * the refcount on the open stateid to drop. @@ -359,7 +368,7 @@ put_nfs4_file(struct nfs4_file *fi) { might_lock(&state_lock); - if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { + if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { hlist_del_rcu(&fi->fi_hash); spin_unlock(&state_lock); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); @@ -568,7 +577,7 @@ alloc_clnt_odstate(struct nfs4_client *clp) co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL); if (co) { co->co_client = clp; - atomic_set(&co->co_odcount, 1); + refcount_set(&co->co_odcount, 1); } return co; } @@ -586,7 +595,7 @@ static inline void get_clnt_odstate(struct nfs4_clnt_odstate *co) { if (co) - atomic_inc(&co->co_odcount); + refcount_inc(&co->co_odcount); } static void @@ -598,7 +607,7 @@ put_clnt_odstate(struct nfs4_clnt_odstate *co) return; fp = co->co_file; - if (atomic_dec_and_lock(&co->co_odcount, &fp->fi_lock)) { + if (refcount_dec_and_lock(&co->co_odcount, &fp->fi_lock)) { list_del(&co->co_perfile); spin_unlock(&fp->fi_lock); @@ -656,7 +665,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *sla stid->sc_stateid.si_opaque.so_id = new_id; stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; /* Will be incremented before return to client: */ - atomic_set(&stid->sc_count, 1); + refcount_set(&stid->sc_count, 1); spin_lock_init(&stid->sc_lock); /* @@ -813,7 +822,7 @@ nfs4_put_stid(struct nfs4_stid *s) might_lock(&clp->cl_lock); - if (!atomic_dec_and_lock(&s->sc_count, &clp->cl_lock)) { + if (!refcount_dec_and_lock(&s->sc_count, &clp->cl_lock)) { wake_up_all(&close_wq); return; } @@ -913,7 +922,7 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) if (status) return status; ++fp->fi_delegees; - atomic_inc(&dp->dl_stid.sc_count); + refcount_inc(&dp->dl_stid.sc_count); dp->dl_stid.sc_type = NFS4_DELEG_STID; list_add(&dp->dl_perfile, &fp->fi_delegations); list_add(&dp->dl_perclnt, &clp->cl_delegations); @@ -1214,7 +1223,7 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, WARN_ON_ONCE(!list_empty(&stp->st_locks)); - if (!atomic_dec_and_test(&s->sc_count)) { + if (!refcount_dec_and_test(&s->sc_count)) { wake_up_all(&close_wq); return; } @@ -1439,8 +1448,10 @@ free_session_slots(struct nfsd4_session *ses) { int i; - for (i = 0; i < ses->se_fchannel.maxreqs; i++) + for (i = 0; i < ses->se_fchannel.maxreqs; i++) { + free_svc_cred(&ses->se_slots[i]->sl_cred); kfree(ses->se_slots[i]); + } } /* @@ -1472,6 +1483,11 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) spin_lock(&nfsd_drc_lock); avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, nfsd_drc_max_mem - nfsd_drc_mem_used); + /* + * Never use more than a third of the remaining memory, + * unless it's the only way to give this client a slot: + */ + avail = clamp_t(int, avail, slotsize, avail/3); num = min_t(int, num, avail / slotsize); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); @@ -2072,7 +2088,7 @@ find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) s = find_stateid_locked(cl, t); if (s != NULL) { if (typemask & s->sc_type) - atomic_inc(&s->sc_count); + refcount_inc(&s->sc_count); else s = NULL; } @@ -2287,14 +2303,18 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) dprintk("--> %s slot %p\n", __func__, slot); + slot->sl_flags |= NFSD4_SLOT_INITIALIZED; slot->sl_opcnt = resp->opcnt; slot->sl_status = resp->cstate.status; + free_svc_cred(&slot->sl_cred); + copy_cred(&slot->sl_cred, &resp->rqstp->rq_cred); - slot->sl_flags |= NFSD4_SLOT_INITIALIZED; - if (nfsd4_not_cached(resp)) { - slot->sl_datalen = 0; + if (!nfsd4_cache_this(resp)) { + slot->sl_flags &= ~NFSD4_SLOT_CACHED; return; } + slot->sl_flags |= NFSD4_SLOT_CACHED; + base = resp->cstate.data_offset; slot->sl_datalen = buf->len - base; if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen)) @@ -2321,8 +2341,16 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, op = &args->ops[resp->opcnt - 1]; nfsd4_encode_operation(resp, op); - /* Return nfserr_retry_uncached_rep in next operation. */ - if (args->opcnt > 1 && !(slot->sl_flags & NFSD4_SLOT_CACHETHIS)) { + if (slot->sl_flags & NFSD4_SLOT_CACHED) + return op->status; + if (args->opcnt == 1) { + /* + * The original operation wasn't a solo sequence--we + * always cache those--so this retry must not match the + * original: + */ + op->status = nfserr_seq_false_retry; + } else { op = &args->ops[resp->opcnt++]; op->status = nfserr_retry_uncached_rep; nfsd4_encode_operation(resp, op); @@ -2986,6 +3014,34 @@ static bool nfsd4_request_too_big(struct svc_rqst *rqstp, return xb->len > session->se_fchannel.maxreq_sz; } +static bool replay_matches_cache(struct svc_rqst *rqstp, + struct nfsd4_sequence *seq, struct nfsd4_slot *slot) +{ + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + + if ((bool)(slot->sl_flags & NFSD4_SLOT_CACHETHIS) != + (bool)seq->cachethis) + return false; + /* + * If there's an error than the reply can have fewer ops than + * the call. But if we cached a reply with *more* ops than the + * call you're sending us now, then this new call is clearly not + * really a replay of the old one: + */ + if (slot->sl_opcnt < argp->opcnt) + return false; + /* This is the only check explicitly called by spec: */ + if (!same_creds(&rqstp->rq_cred, &slot->sl_cred)) + return false; + /* + * There may be more comparisons we could actually do, but the + * spec doesn't require us to catch every case where the calls + * don't match (that would require caching the call as well as + * the reply), so we don't bother. + */ + return true; +} + __be32 nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -3045,6 +3101,9 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_seq_misordered; if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED)) goto out_put_session; + status = nfserr_seq_false_retry; + if (!replay_matches_cache(rqstp, seq, slot)) + goto out_put_session; cstate->slot = slot; cstate->session = session; cstate->clp = clp; @@ -3351,7 +3410,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, { lockdep_assert_held(&state_lock); - atomic_set(&fp->fi_ref, 1); + refcount_set(&fp->fi_ref, 1); spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); @@ -3512,15 +3571,63 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; - if (local->st_stateowner == &oo->oo_owner) { + if (local->st_stateowner != &oo->oo_owner) + continue; + if (local->st_stid.sc_type == NFS4_OPEN_STID) { ret = local; - atomic_inc(&ret->st_stid.sc_count); + refcount_inc(&ret->st_stid.sc_count); break; } } return ret; } +static __be32 +nfsd4_verify_open_stid(struct nfs4_stid *s) +{ + __be32 ret = nfs_ok; + + switch (s->sc_type) { + default: + break; + case NFS4_CLOSED_STID: + case NFS4_CLOSED_DELEG_STID: + ret = nfserr_bad_stateid; + break; + case NFS4_REVOKED_DELEG_STID: + ret = nfserr_deleg_revoked; + } + return ret; +} + +/* Lock the stateid st_mutex, and deal with races with CLOSE */ +static __be32 +nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp) +{ + __be32 ret; + + mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX); + ret = nfsd4_verify_open_stid(&stp->st_stid); + if (ret != nfs_ok) + mutex_unlock(&stp->st_mutex); + return ret; +} + +static struct nfs4_ol_stateid * +nfsd4_find_and_lock_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) +{ + struct nfs4_ol_stateid *stp; + for (;;) { + spin_lock(&fp->fi_lock); + stp = nfsd4_find_existing_open(fp, open); + spin_unlock(&fp->fi_lock); + if (!stp || nfsd4_lock_ol_stateid(stp) == nfs_ok) + break; + nfs4_put_stid(&stp->st_stid); + } + return stp; +} + static struct nfs4_openowner * alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, struct nfsd4_compound_state *cstate) @@ -3563,8 +3670,9 @@ init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) stp = open->op_stp; /* We are moving these outside of the spinlocks to avoid the warnings */ mutex_init(&stp->st_mutex); - mutex_lock(&stp->st_mutex); + mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); +retry: spin_lock(&oo->oo_owner.so_client->cl_lock); spin_lock(&fp->fi_lock); @@ -3573,7 +3681,7 @@ init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) goto out_unlock; open->op_stp = NULL; - atomic_inc(&stp->st_stid.sc_count); + refcount_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_OPEN_STID; INIT_LIST_HEAD(&stp->st_locks); stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner); @@ -3589,7 +3697,11 @@ out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&oo->oo_owner.so_client->cl_lock); if (retstp) { - mutex_lock(&retstp->st_mutex); + /* Handle races with CLOSE */ + if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { + nfs4_put_stid(&retstp->st_stid); + goto retry; + } /* To keep mutex tracking happy */ mutex_unlock(&stp->st_mutex); stp = retstp; @@ -3621,7 +3733,7 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) * there should be no danger of the refcount going back up again at * this point. */ - wait_event(close_wq, atomic_read(&s->st_stid.sc_count) == 2); + wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2); release_all_access(s); if (s->st_stid.sc_file) { @@ -3647,7 +3759,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval) hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { if (fh_match(&fp->fi_fhandle, fh)) { - if (atomic_inc_not_zero(&fp->fi_ref)) + if (refcount_inc_not_zero(&fp->fi_ref)) return fp; } } @@ -3783,7 +3895,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) * lock) we know the server hasn't removed the lease yet, we know * it's safe to take a reference. */ - atomic_inc(&dp->dl_stid.sc_count); + refcount_inc(&dp->dl_stid.sc_count); nfsd4_run_cb(&dp->dl_recall); } @@ -3966,7 +4078,8 @@ static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, statei { struct nfs4_stid *ret; - ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID); + ret = find_stateid_by_type(cl, s, + NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID); if (!ret) return NULL; return delegstateid(ret); @@ -3989,6 +4102,12 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); if (deleg == NULL) goto out; + if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { + nfs4_put_stid(&deleg->dl_stid); + if (cl->cl_minorversion) + status = nfserr_deleg_revoked; + goto out; + } flags = share_access_to_flags(open->op_share_access); status = nfs4_check_delegmode(deleg, flags); if (status) { @@ -4392,6 +4511,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; + bool new_stp = false; /* * Lookup file; if found, lookup stateid and check open request, @@ -4403,9 +4523,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; - spin_lock(&fp->fi_lock); - stp = nfsd4_find_existing_open(fp, open); - spin_unlock(&fp->fi_lock); + stp = nfsd4_find_and_lock_existing_open(fp, open); } else { open->op_file = NULL; status = nfserr_bad_stateid; @@ -4413,35 +4531,31 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf goto out; } + if (!stp) { + stp = init_open_stateid(fp, open); + if (!open->op_stp) + new_stp = true; + } + /* * OPEN the file, or upgrade an existing OPEN. * If truncate fails, the OPEN fails. + * + * stp is already locked. */ - if (stp) { + if (!new_stp) { /* Stateid was found, this is an OPEN upgrade */ - mutex_lock(&stp->st_mutex); status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) { mutex_unlock(&stp->st_mutex); goto out; } } else { - /* stp is returned locked. */ - stp = init_open_stateid(fp, open); - /* See if we lost the race to some other thread */ - if (stp->st_access_bmap != 0) { - status = nfs4_upgrade_open(rqstp, fp, current_fh, - stp, open); - if (status) { - mutex_unlock(&stp->st_mutex); - goto out; - } - goto upgrade_out; - } status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); if (status) { - mutex_unlock(&stp->st_mutex); + stp->st_stid.sc_type = NFS4_CLOSED_STID; release_open_stateid(stp); + mutex_unlock(&stp->st_mutex); goto out; } @@ -4450,7 +4564,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (stp->st_clnt_odstate == open->op_odstate) open->op_odstate = NULL; } -upgrade_out: + nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid); mutex_unlock(&stp->st_mutex); @@ -4677,7 +4791,7 @@ nfs4_laundromat(struct nfsd_net *nn) spin_unlock(&nn->blocked_locks_lock); while (!list_empty(&reaplist)) { - nbl = list_first_entry(&nn->blocked_locks_lru, + nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); posix_unblock_lock(&nbl->nbl_lock); @@ -4798,6 +4912,18 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s return nfserr_old_stateid; } +static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_stid *s, bool has_session) +{ + __be32 ret; + + spin_lock(&s->sc_lock); + ret = nfsd4_verify_open_stid(s); + if (ret == nfs_ok) + ret = check_stateid_generation(in, &s->sc_stateid, has_session); + spin_unlock(&s->sc_lock); + return ret; +} + static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols) { if (ols->st_stateowner->so_is_open_owner && @@ -4811,7 +4937,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) struct nfs4_stid *s; __be32 status = nfserr_bad_stateid; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || + CLOSE_STATEID(stateid)) return status; /* Client debugging aid. */ if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { @@ -4826,7 +4953,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) s = find_stateid_locked(cl, stateid); if (!s) goto out_unlock; - status = check_stateid_generation(stateid, &s->sc_stateid, 1); + status = nfsd4_stid_check_stateid_generation(stateid, s, 1); if (status) goto out_unlock; switch (s->sc_type) { @@ -4858,8 +4985,19 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, struct nfs4_stid **s, struct nfsd_net *nn) { __be32 status; + bool return_revoked = false; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + /* + * only return revoked delegations if explicitly asked. + * otherwise we report revoked or bad_stateid status. + */ + if (typemask & NFS4_REVOKED_DELEG_STID) + return_revoked = true; + else if (typemask & NFS4_DELEG_STID) + typemask |= NFS4_REVOKED_DELEG_STID; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || + CLOSE_STATEID(stateid)) return nfserr_bad_stateid; status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn); if (status == nfserr_stale_clientid) { @@ -4872,6 +5010,12 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, *s = find_stateid_by_type(cstate->clp, stateid, typemask); if (!*s) return nfserr_bad_stateid; + if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { + nfs4_put_stid(*s); + if (cstate->minorversion) + return nfserr_deleg_revoked; + return nfserr_bad_stateid; + } return nfs_ok; } @@ -4971,7 +5115,7 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, &s, nn); if (status) return status; - status = check_stateid_generation(stateid, &s->sc_stateid, + status = nfsd4_stid_check_stateid_generation(stateid, s, nfsd4_has_session(cstate)); if (status) goto out; @@ -5025,7 +5169,9 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) struct nfs4_ol_stateid *stp = openlockstateid(s); __be32 ret; - mutex_lock(&stp->st_mutex); + ret = nfsd4_lock_ol_stateid(stp); + if (ret) + goto out_put_stid; ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) @@ -5036,11 +5182,13 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) lockowner(stp->st_stateowner))) goto out; + stp->st_stid.sc_type = NFS4_CLOSED_STID; release_lock_stateid(stp); ret = nfs_ok; out: mutex_unlock(&stp->st_mutex); +out_put_stid: nfs4_put_stid(s); return ret; } @@ -5060,6 +5208,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, s = find_stateid_locked(cl, stateid); if (!s) goto out_unlock; + spin_lock(&s->sc_lock); switch (s->sc_type) { case NFS4_DELEG_STID: ret = nfserr_locks_held; @@ -5071,11 +5220,13 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = nfserr_locks_held; break; case NFS4_LOCK_STID: - atomic_inc(&s->sc_count); + spin_unlock(&s->sc_lock); + refcount_inc(&s->sc_count); spin_unlock(&cl->cl_lock); ret = nfsd4_free_lock_stateid(stateid, s); goto out; case NFS4_REVOKED_DELEG_STID: + spin_unlock(&s->sc_lock); dp = delegstateid(s); list_del_init(&dp->dl_recall_lru); spin_unlock(&cl->cl_lock); @@ -5084,6 +5235,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; /* Default falls through and returns nfserr_bad_stateid */ } + spin_unlock(&s->sc_lock); out_unlock: spin_unlock(&cl->cl_lock); out: @@ -5106,15 +5258,9 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ status = nfsd4_check_seqid(cstate, sop, seqid); if (status) return status; - if (stp->st_stid.sc_type == NFS4_CLOSED_STID - || stp->st_stid.sc_type == NFS4_REVOKED_DELEG_STID) - /* - * "Closed" stateid's exist *only* to return - * nfserr_replay_me from the previous step, and - * revoked delegations are kept only for free_stateid. - */ - return nfserr_bad_stateid; - mutex_lock(&stp->st_mutex); + status = nfsd4_lock_ol_stateid(stp); + if (status != nfs_ok) + return status; status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status == nfs_ok) status = nfs4_check_fh(current_fh, &stp->st_stid); @@ -5294,7 +5440,6 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) bool unhashed; LIST_HEAD(reaplist); - s->st_stid.sc_type = NFS4_CLOSED_STID; spin_lock(&clp->cl_lock); unhashed = unhash_open_stateid(s, &reaplist); @@ -5334,10 +5479,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_bump_seqid(cstate, status); if (status) goto out; + + stp->st_stid.sc_type = NFS4_CLOSED_STID; nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); - mutex_unlock(&stp->st_mutex); nfsd4_close_open_stateid(stp); + mutex_unlock(&stp->st_mutex); + + /* See RFC5661 sectionm 18.2.4 */ + if (stp->st_stid.sc_client->cl_minorversion) + memcpy(&close->cl_stateid, &close_stateid, + sizeof(close->cl_stateid)); /* put reference from nfs4_preprocess_seqid_op */ nfs4_put_stid(&stp->st_stid); @@ -5363,7 +5515,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; dp = delegstateid(s); - status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); + status = nfsd4_stid_check_stateid_generation(stateid, &dp->dl_stid, nfsd4_has_session(cstate)); if (status) goto put_stateid; @@ -5569,16 +5721,43 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, return ret; } -static void +static struct nfs4_ol_stateid * +find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) +{ + struct nfs4_ol_stateid *lst; + struct nfs4_client *clp = lo->lo_owner.so_client; + + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) { + if (lst->st_stid.sc_type != NFS4_LOCK_STID) + continue; + if (lst->st_stid.sc_file == fp) { + refcount_inc(&lst->st_stid.sc_count); + return lst; + } + } + return NULL; +} + +static struct nfs4_ol_stateid * init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, struct nfs4_file *fp, struct inode *inode, struct nfs4_ol_stateid *open_stp) { struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfs4_ol_stateid *retstp; - lockdep_assert_held(&clp->cl_lock); + mutex_init(&stp->st_mutex); + mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); +retry: + spin_lock(&clp->cl_lock); + spin_lock(&fp->fi_lock); + retstp = find_lock_stateid(lo, fp); + if (retstp) + goto out_unlock; - atomic_inc(&stp->st_stid.sc_count); + refcount_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_LOCK_STID; stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); @@ -5586,29 +5765,22 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; - mutex_init(&stp->st_mutex); list_add(&stp->st_locks, &open_stp->st_locks); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); - spin_lock(&fp->fi_lock); list_add(&stp->st_perfile, &fp->fi_stateids); +out_unlock: spin_unlock(&fp->fi_lock); -} - -static struct nfs4_ol_stateid * -find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) -{ - struct nfs4_ol_stateid *lst; - struct nfs4_client *clp = lo->lo_owner.so_client; - - lockdep_assert_held(&clp->cl_lock); - - list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) { - if (lst->st_stid.sc_file == fp) { - atomic_inc(&lst->st_stid.sc_count); - return lst; + spin_unlock(&clp->cl_lock); + if (retstp) { + if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { + nfs4_put_stid(&retstp->st_stid); + goto retry; } + /* To keep mutex tracking happy */ + mutex_unlock(&stp->st_mutex); + stp = retstp; } - return NULL; + return stp; } static struct nfs4_ol_stateid * @@ -5621,26 +5793,25 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, struct nfs4_openowner *oo = openowner(ost->st_stateowner); struct nfs4_client *clp = oo->oo_owner.so_client; + *new = false; spin_lock(&clp->cl_lock); lst = find_lock_stateid(lo, fi); - if (lst == NULL) { - spin_unlock(&clp->cl_lock); - ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid); - if (ns == NULL) - return NULL; - - spin_lock(&clp->cl_lock); - lst = find_lock_stateid(lo, fi); - if (likely(!lst)) { - lst = openlockstateid(ns); - init_lock_stateid(lst, lo, fi, inode, ost); - ns = NULL; - *new = true; - } - } spin_unlock(&clp->cl_lock); - if (ns) + if (lst != NULL) { + if (nfsd4_lock_ol_stateid(lst) == nfs_ok) + goto out; + nfs4_put_stid(&lst->st_stid); + } + ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid); + if (ns == NULL) + return NULL; + + lst = init_lock_stateid(openlockstateid(ns), lo, fi, inode, ost); + if (lst == openlockstateid(ns)) + *new = true; + else nfs4_put_stid(ns); +out: return lst; } @@ -5677,7 +5848,6 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_lockowner *lo; struct nfs4_ol_stateid *lst; unsigned int strhashval; - bool hashed; lo = find_lockowner_str(cl, &lock->lk_new_owner); if (!lo) { @@ -5693,25 +5863,12 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, goto out; } -retry: lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); if (lst == NULL) { status = nfserr_jukebox; goto out; } - mutex_lock(&lst->st_mutex); - - /* See if it's still hashed to avoid race with FREE_STATEID */ - spin_lock(&cl->cl_lock); - hashed = !list_empty(&lst->st_perfile); - spin_unlock(&cl->cl_lock); - - if (!hashed) { - mutex_unlock(&lst->st_mutex); - nfs4_put_stid(&lst->st_stid); - goto retry; - } status = nfs_ok; *plst = lst; out: @@ -5917,14 +6074,16 @@ out: seqid_mutating_err(ntohl(status))) lock_sop->lo_owner.so_seqid++; - mutex_unlock(&lock_stp->st_mutex); - /* * If this is a new, never-before-used stateid, and we are * returning an error, then just go ahead and release it. */ - if (status && new) + if (status && new) { + lock_stp->st_stid.sc_type = NFS4_CLOSED_STID; release_lock_stateid(lock_stp); + } + + mutex_unlock(&lock_stp->st_mutex); nfs4_put_stid(&lock_stp->st_stid); } @@ -6944,6 +7103,10 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); nn->conf_name_tree = RB_ROOT; nn->unconf_name_tree = RB_ROOT; + nn->boot_time = get_seconds(); + nn->grace_ended = false; + nn->nfsd4_manager.block_opens = true; + INIT_LIST_HEAD(&nn->nfsd4_manager.list); INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); @@ -7001,13 +7164,10 @@ nfs4_state_start_net(struct net *net) ret = nfs4_state_create_net(net); if (ret) return ret; - nn->boot_time = get_seconds(); - nn->grace_ended = false; - nn->nfsd4_manager.block_opens = true; locks_start_grace(net, &nn->nfsd4_manager); nfsd4_client_tracking_init(net); - printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", - nn->nfsd4_grace, net); + printk(KERN_INFO "NFSD: starting %ld-second grace period (net %x)\n", + nn->nfsd4_grace, net->ns.inum); queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); return 0; } @@ -7080,7 +7240,7 @@ nfs4_state_shutdown_net(struct net *net) spin_unlock(&nn->blocked_locks_lock); while (!list_empty(&reaplist)) { - nbl = list_first_entry(&nn->blocked_locks_lru, + nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); posix_unblock_lock(&nbl->nbl_lock); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6493df6b1bd5..d107b4426f7e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1241,6 +1241,9 @@ static __net_init int nfsd_init_net(struct net *net) nn->nfsd4_grace = 90; nn->clverifier_counter = prandom_u32(); nn->clientid_counter = prandom_u32(); + + atomic_set(&nn->ntf_refcnt, 0); + init_waitqueue_head(&nn->ntf_wq); return 0; out_idmap_error: diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index e02bd2783124..89cb484f1cfb 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -335,7 +335,8 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in sin; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nn->ntf_refcnt)) goto out; if (nn->nfsd_serv) { @@ -344,6 +345,8 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); } + atomic_dec(&nn->ntf_refcnt); + wake_up(&nn->ntf_wq); out: return NOTIFY_DONE; @@ -363,7 +366,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in6 sin6; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nn->ntf_refcnt)) goto out; if (nn->nfsd_serv) { @@ -374,7 +378,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } - + atomic_dec(&nn->ntf_refcnt); + wake_up(&nn->ntf_wq); out: return NOTIFY_DONE; } @@ -391,6 +396,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + atomic_dec(&nn->ntf_refcnt); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); @@ -398,6 +404,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } + wait_event(nn->ntf_wq, atomic_read(&nn->ntf_refcnt) == 0); /* * write_ports can create the server without actually starting @@ -447,7 +454,7 @@ void nfsd_reset_versions(void) */ static void set_max_drc(void) { - #define NFSD_DRC_SIZE_SHIFT 10 + #define NFSD_DRC_SIZE_SHIFT 7 nfsd_drc_max_mem = (nr_free_buffer_pages() >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; nfsd_drc_mem_used = 0; @@ -517,7 +524,8 @@ int nfsd_create_serv(struct net *net) register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } - do_gettimeofday(&nn->nfssvc_boot); /* record boot time */ + atomic_inc(&nn->ntf_refcnt); + ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */ return 0; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 005c911b34ac..f3772ea8ba0d 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -36,6 +36,7 @@ #define _NFSD4_STATE_H #include <linux/idr.h> +#include <linux/refcount.h> #include <linux/sunrpc/svc_xprt.h> #include "nfsfh.h" @@ -83,7 +84,7 @@ struct nfsd4_callback_ops { * fields that are of general use to any stateid. */ struct nfs4_stid { - atomic_t sc_count; + refcount_t sc_count; #define NFS4_OPEN_STID 1 #define NFS4_LOCK_STID 2 #define NFS4_DELEG_STID 4 @@ -169,11 +170,13 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) struct nfsd4_slot { u32 sl_seqid; __be32 sl_status; + struct svc_cred sl_cred; u32 sl_datalen; u16 sl_opcnt; #define NFSD4_SLOT_INUSE (1 << 0) #define NFSD4_SLOT_CACHETHIS (1 << 1) #define NFSD4_SLOT_INITIALIZED (1 << 2) +#define NFSD4_SLOT_CACHED (1 << 3) u8 sl_flags; char sl_data[]; }; @@ -465,7 +468,7 @@ struct nfs4_clnt_odstate { struct nfs4_client *co_client; struct nfs4_file *co_file; struct list_head co_perfile; - atomic_t co_odcount; + refcount_t co_odcount; }; /* @@ -481,7 +484,7 @@ struct nfs4_clnt_odstate { * the global state_lock spinlock. */ struct nfs4_file { - atomic_t fi_ref; + refcount_t fi_ref; spinlock_t fi_lock; struct hlist_node fi_hash; /* hash on fi_fhandle */ struct list_head fi_stateids; @@ -634,7 +637,7 @@ struct nfs4_file *find_file(struct knfsd_fh *fh); void put_nfs4_file(struct nfs4_file *fi); static inline void get_nfs4_file(struct nfs4_file *fi) { - atomic_inc(&fi->fi_ref); + refcount_inc(&fi->fi_ref); } struct file *find_any_file(struct nfs4_file *f); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 1e4edbf70052..bc29511b6405 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -649,9 +649,18 @@ static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; } -static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) +/* + * The session reply cache only needs to cache replies that the client + * actually asked us to. But it's almost free for us to cache compounds + * consisting of only a SEQUENCE op, so we may as well cache those too. + * Also, the protocol doesn't give us a convenient response in the case + * of a replay of a solo SEQUENCE op that wasn't cached + * (RETRY_UNCACHED_REP can only be returned in the second op of a + * compound). + */ +static inline bool nfsd4_cache_this(struct nfsd4_compoundres *resp) { - return !(resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS) + return (resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS) || nfsd4_is_solo_sequence(resp); } diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 06ffa135dfa6..16a7a67a11c9 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2156,10 +2156,10 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, level++) INIT_LIST_HEAD(&lists[level]); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); - while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, btcache, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); do { diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 515d13c196da..1a2894aa0194 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -150,7 +150,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry, if (err) return err; - inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO); + inode = nilfs_new_inode(dir, S_IFLNK | 0777); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out; diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 8616c46d33da..68241512d7c1 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -255,10 +255,9 @@ int nilfs_copy_dirty_pages(struct address_space *dmap, pgoff_t index = 0; int err = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: - if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) + if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY)) return 0; for (i = 0; i < pagevec_count(&pvec); i++) { @@ -310,7 +309,7 @@ void nilfs_copy_back_pages(struct address_space *dmap, pgoff_t index = 0; int err; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: n = pagevec_lookup(&pvec, smap, &index); if (!n) @@ -374,10 +373,10 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) unsigned int i; pgoff_t index = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); - while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -519,7 +518,7 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, index = start_blk >> (PAGE_SHIFT - inode->i_blkbits); nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 70ded52dc1dd..9f3ffba41533 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -708,21 +708,17 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, index = start >> PAGE_SHIFT; last = end >> PAGE_SHIFT; } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: if (unlikely(index > last) || - !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - min_t(pgoff_t, last - index, - PAGEVEC_SIZE - 1) + 1)) + !pagevec_lookup_range_tag(&pvec, mapping, &index, last, + PAGECACHE_TAG_DIRTY)) return ndirties; for (i = 0; i < pagevec_count(&pvec); i++) { struct buffer_head *bh, *head; struct page *page = pvec.pages[i]; - if (unlikely(page->index > last)) - break; - lock_page(page); if (!page_has_buffers(page)) create_empty_buffers(page, i_blocksize(inode), 0); @@ -757,10 +753,10 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, unsigned int i; pgoff_t index = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); - while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); do { @@ -1958,8 +1954,6 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, err, ii->vfs_inode.i_ino); return err; } - mark_buffer_dirty(ibh); - nilfs_mdt_mark_dirty(ifile); spin_lock(&nilfs->ns_inode_lock); if (likely(!ii->i_bh)) ii->i_bh = ibh; @@ -1968,6 +1962,10 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, goto retry; } + // Always redirty the buffer to avoid race condition + mark_buffer_dirty(ii->i_bh); + nilfs_mdt_mark_dirty(ifile); + clear_bit(NILFS_I_QUEUED, &ii->i_state); set_bit(NILFS_I_BUSY, &ii->i_state); list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); @@ -1981,7 +1979,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; - int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE); + int during_mount = !(sci->sc_super->s_flags & SB_ACTIVE); int defer_iput = false; spin_lock(&nilfs->ns_inode_lock); @@ -2404,11 +2402,11 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) return err; } -static void nilfs_construction_timeout(unsigned long data) +static void nilfs_construction_timeout(struct timer_list *t) { - struct task_struct *p = (struct task_struct *)data; + struct nilfs_sc_info *sci = from_timer(sci, t, sc_timer); - wake_up_process(p); + wake_up_process(sci->sc_timer_task); } static void @@ -2546,8 +2544,7 @@ static int nilfs_segctor_thread(void *arg) struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int timeout = 0; - sci->sc_timer.data = (unsigned long)current; - sci->sc_timer.function = nilfs_construction_timeout; + sci->sc_timer_task = current; /* start sync. */ sci->sc_task = current; @@ -2678,7 +2675,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, INIT_LIST_HEAD(&sci->sc_gc_inodes); INIT_LIST_HEAD(&sci->sc_iput_queue); INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func); - init_timer(&sci->sc_timer); + timer_setup(&sci->sc_timer, nilfs_construction_timeout, 0); sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 1060949d7dd2..84084a4d9b3e 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -180,6 +180,7 @@ struct nilfs_sc_info { unsigned long sc_watermark; struct timer_list sc_timer; + struct task_struct *sc_timer_task; struct task_struct *sc_task; }; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1541a1e9221a..1341a41e7b43 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -630,22 +630,22 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, } /** - * nilfs_sufile_truncate_range - truncate range of segment array - * @sufile: inode of segment usage file - * @start: start segment number (inclusive) - * @end: end segment number (inclusive) - * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid number of segments specified - * - * %-EBUSY - Dirty or active segments are present in the range - */ + * nilfs_sufile_truncate_range - truncate range of segment array + * @sufile: inode of segment usage file + * @start: start segment number (inclusive) + * @end: end segment number (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid number of segments specified + * + * %-EBUSY - Dirty or active segments are present in the range + */ static int nilfs_sufile_truncate_range(struct inode *sufile, __u64 start, __u64 end) { diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 4fc018dfcfae..3073b646e1ba 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -141,7 +141,7 @@ void __nilfs_error(struct super_block *sb, const char *function, if (nilfs_test_opt(nilfs, ERRORS_RO)) { printk(KERN_CRIT "Remounting filesystem read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } } @@ -160,7 +160,6 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_bh = NULL; ii->i_state = 0; ii->i_cno = 0; - ii->vfs_inode.i_version = 1; nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); return &ii->vfs_inode; } @@ -870,7 +869,7 @@ int nilfs_store_magic_and_option(struct super_block *sb, /* FS independent flags */ #ifdef NILFS_ATIME_DISABLE - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; #endif nilfs_set_default_options(sb, sbp); @@ -1134,7 +1133,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } - sb->s_flags = (sb->s_flags & ~MS_POSIXACL); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL); err = -EINVAL; @@ -1144,12 +1143,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { /* Shutting down log writer */ nilfs_detach_log_writer(sb); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* * Remounting a valid RW partition RDONLY, so set @@ -1179,7 +1178,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; root = NILFS_I(d_inode(sb->s_root))->i_root; err = nilfs_attach_log_writer(sb, root); @@ -1213,7 +1212,7 @@ static int nilfs_parse_snapshot_option(const char *option, const char *msg = NULL; int err; - if (!(sd->flags & MS_RDONLY)) { + if (!(sd->flags & SB_RDONLY)) { msg = "read-only option is not specified"; goto parse_error; } @@ -1287,7 +1286,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, struct dentry *root_dentry; int err, s_new = false; - if (!(flags & MS_RDONLY)) + if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type); @@ -1328,14 +1327,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags, snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev); sb_set_blocksize(s, block_size(sd.bdev)); - err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0); + err = nilfs_fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (err) goto failed_super; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; } else if (!sd.cno) { if (nilfs_tree_is_busy(s->s_root)) { - if ((flags ^ s->s_flags) & MS_RDONLY) { + if ((flags ^ s->s_flags) & SB_RDONLY) { nilfs_msg(s, KERN_ERR, "the device already has a %s mount.", sb_rdonly(s) ? "read-only" : "read/write"); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2dd75bf619ad..1a85317e83f0 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -220,7 +220,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) if (!valid_fs) { nilfs_msg(sb, KERN_WARNING, "mounting unchecked fs"); - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { nilfs_msg(sb, KERN_INFO, "recovery required for readonly filesystem"); nilfs_msg(sb, KERN_INFO, @@ -286,7 +286,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) if (valid_fs) goto skip_recovery; - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { __u64 features; if (nilfs_test_opt(nilfs, NORECOVERY)) { @@ -309,7 +309,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) err = -EROFS; goto failed_unload; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; } else if (nilfs_test_opt(nilfs, NORECOVERY)) { nilfs_msg(sb, KERN_ERR, "recovery cancelled because norecovery option was specified for a read/write mount"); @@ -737,7 +737,7 @@ struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno) } else if (cno > root->cno) { n = n->rb_right; } else { - atomic_inc(&root->count); + refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); return root; } @@ -776,7 +776,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) } else if (cno > root->cno) { p = &(*p)->rb_right; } else { - atomic_inc(&root->count); + refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); kfree(new); return root; @@ -786,7 +786,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) new->cno = cno; new->ifile = NULL; new->nilfs = nilfs; - atomic_set(&new->count, 1); + refcount_set(&new->count, 1); atomic64_set(&new->inodes_count, 0); atomic64_set(&new->blocks_count, 0); @@ -806,7 +806,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) void nilfs_put_root(struct nilfs_root *root) { - if (atomic_dec_and_test(&root->count)) { + if (refcount_dec_and_test(&root->count)) { struct the_nilfs *nilfs = root->nilfs; nilfs_sysfs_delete_snapshot_group(root); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index b305c6f033e7..883d732b0259 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -27,6 +27,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/slab.h> +#include <linux/refcount.h> struct nilfs_sc_info; struct nilfs_sysfs_dev_subgroups; @@ -246,7 +247,7 @@ struct nilfs_root { __u64 cno; struct rb_node rb_node; - atomic_t count; + refcount_t count; struct the_nilfs *nilfs; struct inode *ifile; @@ -299,7 +300,7 @@ void nilfs_swap_super_block(struct the_nilfs *); static inline void nilfs_get_root(struct nilfs_root *root) { - atomic_inc(&root->count); + refcount_inc(&root->count); } static inline int nilfs_valid_fs(struct the_nilfs *nilfs) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index cba328315929..63a1ca4b9dee 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -319,7 +319,11 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); } else { - fsnotify_add_mark_locked(new_fsn_mark, inode, NULL, 0); + error = fsnotify_add_mark_locked(new_fsn_mark, inode, NULL, 0); + if (error) { + mutex_unlock(&dnotify_group->mark_mutex); + goto out_err; + } spin_lock(&new_fsn_mark->lock); fsn_mark = new_fsn_mark; dn_mark = new_dn_mark; @@ -345,6 +349,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) */ if (dn_mark == new_dn_mark) destroy = 1; + error = 0; goto out; } diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index e5f911bd80d2..41355ce74ac0 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -21,6 +21,6 @@ config FANOTIFY_ACCESS_PERMISSIONS decisions concerning filesystem events. This is used by some fanotify listeners which need to scan files before allowing the system access to use those files. This is used by some anti-malware vendors and by some - hierarchical storage managent systems. + hierarchical storage management systems. If unsure, say N. diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 09640b546363..6702a6a0bbb5 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -10,6 +10,7 @@ #include <linux/sched/user.h> #include <linux/types.h> #include <linux/wait.h> +#include <linux/audit.h> #include "fanotify.h" @@ -36,15 +37,13 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) pr_debug("%s: list=%p event=%p\n", __func__, list, event); -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* * Don't merge a permission event with any other event so that we know * the event structure we have created in fanotify_handle_event() is the * one we should check for permission response. */ - if (event->mask & FAN_ALL_PERM_EVENTS) + if (fanotify_is_perm_event(event->mask)) return 0; -#endif list_for_each_entry_reverse(test_event, list, list) { if (should_merge(test_event, event)) { @@ -56,7 +55,6 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) return 0; } -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS static int fanotify_get_response(struct fsnotify_group *group, struct fanotify_perm_event_info *event, struct fsnotify_iter_info *iter_info) @@ -65,21 +63,10 @@ static int fanotify_get_response(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - /* - * fsnotify_prepare_user_wait() fails if we race with mark deletion. - * Just let the operation pass in that case. - */ - if (!fsnotify_prepare_user_wait(iter_info)) { - event->response = FAN_ALLOW; - goto out; - } - wait_event(group->fanotify_data.access_waitq, event->response); - fsnotify_finish_user_wait(iter_info); -out: /* userspace responded, convert to something usable */ - switch (event->response) { + switch (event->response & ~FAN_AUDIT) { case FAN_ALLOW: ret = 0; break; @@ -87,6 +74,11 @@ out: default: ret = -EPERM; } + + /* Check if the response should be audited */ + if (event->response & FAN_AUDIT) + audit_fanotify(event->response & ~FAN_AUDIT); + event->response = 0; pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, @@ -94,7 +86,6 @@ out: return ret; } -#endif static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, @@ -153,8 +144,7 @@ struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, { struct fanotify_event_info *event; -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (mask & FAN_ALL_PERM_EVENTS) { + if (fanotify_is_perm_event(mask)) { struct fanotify_perm_event_info *pevent; pevent = kmem_cache_alloc(fanotify_perm_event_cachep, @@ -165,7 +155,6 @@ struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, pevent->response = 0; goto init; } -#endif event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); if (!event) return NULL; @@ -212,9 +201,19 @@ static int fanotify_handle_event(struct fsnotify_group *group, pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, mask); + if (fanotify_is_perm_event(mask)) { + /* + * fsnotify_prepare_user_wait() fails if we race with mark + * deletion. Just let the operation pass in that case. + */ + if (!fsnotify_prepare_user_wait(iter_info)) + return 0; + } + event = fanotify_alloc_event(inode, mask, data); + ret = -ENOMEM; if (unlikely(!event)) - return -ENOMEM; + goto finish; fsn_event = &event->fse; ret = fsnotify_add_event(group, fsn_event, fanotify_merge); @@ -224,16 +223,16 @@ static int fanotify_handle_event(struct fsnotify_group *group, /* Our event wasn't used in the end. Free it. */ fsnotify_destroy_event(group, fsn_event); - return 0; - } - -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (mask & FAN_ALL_PERM_EVENTS) { + ret = 0; + } else if (fanotify_is_perm_event(mask)) { ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event), iter_info); fsnotify_destroy_event(group, fsn_event); } -#endif +finish: + if (fanotify_is_perm_event(mask)) + fsnotify_finish_user_wait(iter_info); + return ret; } @@ -253,13 +252,11 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) event = FANOTIFY_E(fsn_event); path_put(&event->path); put_pid(event->tgid); -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (fsn_event->mask & FAN_ALL_PERM_EVENTS) { + if (fanotify_is_perm_event(fsn_event->mask)) { kmem_cache_free(fanotify_perm_event_cachep, FANOTIFY_PE(fsn_event)); return; } -#endif kmem_cache_free(fanotify_event_cachep, event); } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 7dacb7d80727..256d9d1ddea9 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -22,7 +22,6 @@ struct fanotify_event_info { struct pid *tgid; }; -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* * Structure for permission fanotify events. It gets allocated and freed in * fanotify_handle_event() since we wait there for user response. When the @@ -41,7 +40,12 @@ FANOTIFY_PE(struct fsnotify_event *fse) { return container_of(fse, struct fanotify_perm_event_info, fae.fse); } -#endif + +static inline bool fanotify_is_perm_event(u32 mask) +{ + return IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS) && + mask & FAN_ALL_PERM_EVENTS; +} static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9752e7270e61..d0d4bc4c4b70 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -143,7 +143,6 @@ static int fill_event_metadata(struct fsnotify_group *group, return ret; } -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS static struct fanotify_perm_event_info *dequeue_event( struct fsnotify_group *group, int fd) { @@ -180,7 +179,7 @@ static int process_access_response(struct fsnotify_group *group, * userspace can send a valid response or we will clean it up after the * timeout */ - switch (response) { + switch (response & ~FAN_AUDIT) { case FAN_ALLOW: case FAN_DENY: break; @@ -191,6 +190,9 @@ static int process_access_response(struct fsnotify_group *group, if (fd < 0) return -EINVAL; + if ((response & FAN_AUDIT) && !group->fanotify_data.audit) + return -EINVAL; + event = dequeue_event(group, fd); if (!event) return -ENOENT; @@ -200,7 +202,6 @@ static int process_access_response(struct fsnotify_group *group, return 0; } -#endif static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fsnotify_event *event, @@ -222,10 +223,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, fanotify_event_metadata.event_len)) goto out_close_fd; -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (event->mask & FAN_ALL_PERM_EVENTS) + if (fanotify_is_perm_event(event->mask)) FANOTIFY_PE(event)->fd = fd; -#endif if (fd != FAN_NOFD) fd_install(fd, f); @@ -310,10 +309,9 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, * Permission events get queued to wait for response. Other * events can be destroyed now. */ - if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) { + if (!fanotify_is_perm_event(kevent->mask)) { fsnotify_destroy_event(group, kevent); } else { -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (ret <= 0) { FANOTIFY_PE(kevent)->response = FAN_DENY; wake_up(&group->fanotify_data.access_waitq); @@ -323,7 +321,6 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, &group->fanotify_data.access_list); spin_unlock(&group->notification_lock); } -#endif } if (ret < 0) break; @@ -339,11 +336,13 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_response response = { .fd = -1, .response = -1 }; struct fsnotify_group *group; int ret; + if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) + return -EINVAL; + group = file->private_data; if (count > sizeof(response)) @@ -359,16 +358,11 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count = ret; return count; -#else - return -EINVAL; -#endif } static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; - -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_perm_event_info *event, *next; struct fsnotify_event *fsn_event; @@ -404,14 +398,14 @@ static int fanotify_release(struct inode *ignored, struct file *file) spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, fsn_event); spin_lock(&group->notification_lock); - } else + } else { FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; + } } spin_unlock(&group->notification_lock); /* Response for all permission events it set, wakeup waiters */ wake_up(&group->fanotify_data.access_waitq); -#endif /* matches the fanotify_init->fsnotify_alloc_group */ fsnotify_destroy_group(group); @@ -722,7 +716,11 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; +#ifdef CONFIG_AUDITSYSCALL + if (flags & ~(FAN_ALL_INIT_FLAGS | FAN_ENABLE_AUDIT)) +#else if (flags & ~FAN_ALL_INIT_FLAGS) +#endif return -EINVAL; if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) @@ -769,10 +767,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (force_o_largefile()) event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); -#endif switch (flags & FAN_ALL_CLASS_BITS) { case FAN_CLASS_NOTIF: group->priority = FS_PRIO_0; @@ -806,6 +802,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; } + if (flags & FAN_ENABLE_AUDIT) { + fd = -EPERM; + if (!capable(CAP_AUDIT_WRITE)) + goto out_destroy_group; + group->fanotify_data.audit = true; + } + fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); if (fd < 0) goto out_destroy_group; @@ -826,6 +829,7 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, struct fsnotify_group *group; struct fd f; struct path path; + u32 valid_mask = FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD; int ret; pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", @@ -856,11 +860,10 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, mask &= ~FAN_ONDIR; } -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD)) -#else - if (mask & ~(FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD)) -#endif + if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) + valid_mask |= FAN_ALL_PERM_EVENTS; + + if (mask & ~valid_mask) return -EINVAL; f = fdget(fanotify_fd); @@ -950,10 +953,10 @@ static int __init fanotify_user_setup(void) { fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event_info, - SLAB_PANIC); -#endif + if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { + fanotify_perm_event_cachep = + KMEM_CACHE(fanotify_perm_event_info, SLAB_PANIC); + } return 0; } diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 517f88c1dbe5..d478629c728b 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -157,6 +157,9 @@ void fanotify_show_fdinfo(struct seq_file *m, struct file *f) if (group->fanotify_data.max_marks == UINT_MAX) flags |= FAN_UNLIMITED_MARKS; + if (group->fanotify_data.audit) + flags |= FAN_ENABLE_AUDIT; + seq_printf(m, "fanotify flags:%x event-flags:%x\n", flags, group->fanotify_data.f_flags); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 0c4583b61717..219b269c737e 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -67,7 +67,7 @@ void fsnotify_unmount_inodes(struct super_block *sb) /* * If i_count is zero, the inode cannot have any watches and - * doing an __iget/iput with MS_ACTIVE clear would actually + * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. */ @@ -243,6 +243,29 @@ static int send_to_group(struct inode *to_tell, file_name, cookie, iter_info); } +static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp) +{ + struct fsnotify_mark_connector *conn; + struct hlist_node *node = NULL; + + conn = srcu_dereference(*connp, &fsnotify_mark_srcu); + if (conn) + node = srcu_dereference(conn->list.first, &fsnotify_mark_srcu); + + return hlist_entry_safe(node, struct fsnotify_mark, obj_list); +} + +static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark) +{ + struct hlist_node *node = NULL; + + if (mark) + node = srcu_dereference(mark->obj_list.next, + &fsnotify_mark_srcu); + + return hlist_entry_safe(node, struct fsnotify_mark, obj_list); +} + /* * This is the main call to fsnotify. The VFS calls into hook specific functions * in linux/fsnotify.h. Those functions then in turn call here. Here will call @@ -252,11 +275,7 @@ static int send_to_group(struct inode *to_tell, int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const unsigned char *file_name, u32 cookie) { - struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; - struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; - struct fsnotify_group *inode_group, *vfsmount_group; - struct fsnotify_mark_connector *inode_conn, *vfsmount_conn; - struct fsnotify_iter_info iter_info; + struct fsnotify_iter_info iter_info = {}; struct mount *mnt; int ret = 0; /* global tests shouldn't care about events on child only the specific event */ @@ -291,26 +310,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, if ((mask & FS_MODIFY) || (test_mask & to_tell->i_fsnotify_mask)) { - inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, - &fsnotify_mark_srcu); - if (inode_conn) - inode_node = srcu_dereference(inode_conn->list.first, - &fsnotify_mark_srcu); + iter_info.inode_mark = + fsnotify_first_mark(&to_tell->i_fsnotify_marks); } if (mnt && ((mask & FS_MODIFY) || (test_mask & mnt->mnt_fsnotify_mask))) { - inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, - &fsnotify_mark_srcu); - if (inode_conn) - inode_node = srcu_dereference(inode_conn->list.first, - &fsnotify_mark_srcu); - vfsmount_conn = srcu_dereference(mnt->mnt_fsnotify_marks, - &fsnotify_mark_srcu); - if (vfsmount_conn) - vfsmount_node = srcu_dereference( - vfsmount_conn->list.first, - &fsnotify_mark_srcu); + iter_info.inode_mark = + fsnotify_first_mark(&to_tell->i_fsnotify_marks); + iter_info.vfsmount_mark = + fsnotify_first_mark(&mnt->mnt_fsnotify_marks); } /* @@ -318,39 +327,19 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, * ignore masks are properly reflected for mount mark notifications. * That's why this traversal is so complicated... */ - while (inode_node || vfsmount_node) { - inode_group = NULL; - inode_mark = NULL; - vfsmount_group = NULL; - vfsmount_mark = NULL; - - if (inode_node) { - inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), - struct fsnotify_mark, obj_list); - inode_group = inode_mark->group; - } - - if (vfsmount_node) { - vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), - struct fsnotify_mark, obj_list); - vfsmount_group = vfsmount_mark->group; - } - - if (inode_group && vfsmount_group) { - int cmp = fsnotify_compare_groups(inode_group, - vfsmount_group); - if (cmp > 0) { - inode_group = NULL; + while (iter_info.inode_mark || iter_info.vfsmount_mark) { + struct fsnotify_mark *inode_mark = iter_info.inode_mark; + struct fsnotify_mark *vfsmount_mark = iter_info.vfsmount_mark; + + if (inode_mark && vfsmount_mark) { + int cmp = fsnotify_compare_groups(inode_mark->group, + vfsmount_mark->group); + if (cmp > 0) inode_mark = NULL; - } else if (cmp < 0) { - vfsmount_group = NULL; + else if (cmp < 0) vfsmount_mark = NULL; - } } - iter_info.inode_mark = inode_mark; - iter_info.vfsmount_mark = vfsmount_mark; - ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask, data, data_is, cookie, file_name, &iter_info); @@ -358,12 +347,12 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; - if (inode_group) - inode_node = srcu_dereference(inode_node->next, - &fsnotify_mark_srcu); - if (vfsmount_group) - vfsmount_node = srcu_dereference(vfsmount_node->next, - &fsnotify_mark_srcu); + if (inode_mark) + iter_info.inode_mark = + fsnotify_next_mark(iter_info.inode_mark); + if (vfsmount_mark) + iter_info.vfsmount_mark = + fsnotify_next_mark(iter_info.vfsmount_mark); } ret = 0; out: diff --git a/fs/notify/group.c b/fs/notify/group.c index 32357534de18..b7a4b6a69efa 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -107,7 +107,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group) */ void fsnotify_get_group(struct fsnotify_group *group) { - atomic_inc(&group->refcnt); + refcount_inc(&group->refcnt); } /* @@ -115,7 +115,7 @@ void fsnotify_get_group(struct fsnotify_group *group) */ void fsnotify_put_group(struct fsnotify_group *group) { - if (atomic_dec_and_test(&group->refcnt)) + if (refcount_dec_and_test(&group->refcnt)) fsnotify_final_destroy_group(group); } @@ -131,7 +131,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) return ERR_PTR(-ENOMEM); /* set to 0 when there a no external references to this group */ - atomic_set(&group->refcnt, 1); + refcount_set(&group->refcnt, 1); atomic_set(&group->num_marks, 0); atomic_set(&group->user_waits, 0); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 7cc7d3fb1862..d3c20e0bb046 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -376,7 +376,7 @@ static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group fsnotify_get_mark(fsn_mark); /* One ref for being in the idr, one ref we just took */ - BUG_ON(atomic_read(&fsn_mark->refcnt) < 2); + BUG_ON(refcount_read(&fsn_mark->refcnt) < 2); } return i_mark; @@ -446,7 +446,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, * One ref for being in the idr * one ref grabbed by inotify_idr_find */ - if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 2)) { + if (unlikely(refcount_read(&i_mark->fsn_mark.refcnt) < 2)) { printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); /* we can't really recover with bad ref cnting.. */ diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 9991f8826734..e9191b416434 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -105,18 +105,8 @@ static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); void fsnotify_get_mark(struct fsnotify_mark *mark) { - WARN_ON_ONCE(!atomic_read(&mark->refcnt)); - atomic_inc(&mark->refcnt); -} - -/* - * Get mark reference when we found the mark via lockless traversal of object - * list. Mark can be already removed from the list by now and on its way to be - * destroyed once SRCU period ends. - */ -static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) -{ - return atomic_inc_not_zero(&mark->refcnt); + WARN_ON_ONCE(!refcount_read(&mark->refcnt)); + refcount_inc(&mark->refcnt); } static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) @@ -211,7 +201,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) /* Catch marks that were actually never attached to object */ if (!mark->connector) { - if (atomic_dec_and_test(&mark->refcnt)) + if (refcount_dec_and_test(&mark->refcnt)) fsnotify_final_mark_destroy(mark); return; } @@ -220,7 +210,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) * We have to be careful so that traversals of obj_list under lock can * safely grab mark reference. */ - if (!atomic_dec_and_lock(&mark->refcnt, &mark->connector->lock)) + if (!refcount_dec_and_lock(&mark->refcnt, &mark->connector->lock)) return; conn = mark->connector; @@ -256,32 +246,60 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) FSNOTIFY_REAPER_DELAY); } -bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) +/* + * Get mark reference when we found the mark via lockless traversal of object + * list. Mark can be already removed from the list by now and on its way to be + * destroyed once SRCU period ends. + * + * Also pin the group so it doesn't disappear under us. + */ +static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) { - struct fsnotify_group *group; - - if (WARN_ON_ONCE(!iter_info->inode_mark && !iter_info->vfsmount_mark)) - return false; - - if (iter_info->inode_mark) - group = iter_info->inode_mark->group; - else - group = iter_info->vfsmount_mark->group; + if (!mark) + return true; + + if (refcount_inc_not_zero(&mark->refcnt)) { + spin_lock(&mark->lock); + if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) { + /* mark is attached, group is still alive then */ + atomic_inc(&mark->group->user_waits); + spin_unlock(&mark->lock); + return true; + } + spin_unlock(&mark->lock); + fsnotify_put_mark(mark); + } + return false; +} - /* - * Since acquisition of mark reference is an atomic op as well, we can - * be sure this inc is seen before any effect of refcount increment. - */ - atomic_inc(&group->user_waits); +/* + * Puts marks and wakes up group destruction if necessary. + * + * Pairs with fsnotify_get_mark_safe() + */ +static void fsnotify_put_mark_wake(struct fsnotify_mark *mark) +{ + if (mark) { + struct fsnotify_group *group = mark->group; - if (iter_info->inode_mark) { - /* This can fail if mark is being removed */ - if (!fsnotify_get_mark_safe(iter_info->inode_mark)) - goto out_wait; + fsnotify_put_mark(mark); + /* + * We abuse notification_waitq on group shutdown for waiting for + * all marks pinned when waiting for userspace. + */ + if (atomic_dec_and_test(&group->user_waits) && group->shutdown) + wake_up(&group->notification_waitq); } - if (iter_info->vfsmount_mark) { - if (!fsnotify_get_mark_safe(iter_info->vfsmount_mark)) - goto out_inode; +} + +bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) +{ + /* This can fail if mark is being removed */ + if (!fsnotify_get_mark_safe(iter_info->inode_mark)) + return false; + if (!fsnotify_get_mark_safe(iter_info->vfsmount_mark)) { + fsnotify_put_mark_wake(iter_info->inode_mark); + return false; } /* @@ -292,34 +310,13 @@ bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx); return true; -out_inode: - if (iter_info->inode_mark) - fsnotify_put_mark(iter_info->inode_mark); -out_wait: - if (atomic_dec_and_test(&group->user_waits) && group->shutdown) - wake_up(&group->notification_waitq); - return false; } void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) { - struct fsnotify_group *group = NULL; - iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); - if (iter_info->inode_mark) { - group = iter_info->inode_mark->group; - fsnotify_put_mark(iter_info->inode_mark); - } - if (iter_info->vfsmount_mark) { - group = iter_info->vfsmount_mark->group; - fsnotify_put_mark(iter_info->vfsmount_mark); - } - /* - * We abuse notification_waitq on group shutdown for waiting for all - * marks pinned when waiting for userspace. - */ - if (atomic_dec_and_test(&group->user_waits) && group->shutdown) - wake_up(&group->notification_waitq); + fsnotify_put_mark_wake(iter_info->inode_mark); + fsnotify_put_mark_wake(iter_info->vfsmount_mark); } /* @@ -338,7 +335,7 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) && - atomic_read(&mark->refcnt) < 1 + + refcount_read(&mark->refcnt) < 1 + !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); spin_lock(&mark->lock); @@ -599,9 +596,11 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct inode *inode, return ret; err: + spin_lock(&mark->lock); mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED); list_del_init(&mark->g_list); + spin_unlock(&mark->lock); atomic_dec(&group->num_marks); fsnotify_put_mark(mark); @@ -738,7 +737,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, { memset(mark, 0, sizeof(*mark)); spin_lock_init(&mark->lock); - atomic_set(&mark->refcnt, 1); + refcount_set(&mark->refcnt, 1); fsnotify_get_group(group); mark->group = group; } diff --git a/fs/nsfs.c b/fs/nsfs.c index ef243e14b6eb..7c6f76d29f56 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -255,5 +255,5 @@ void __init nsfs_init(void) nsfs_mnt = kern_mount(&nsfs); if (IS_ERR(nsfs_mnt)) panic("can't set nsfs up\n"); - nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER; + nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; } diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index cc91856b5e2d..3a2e509c77c5 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1739,7 +1739,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { spin_lock(&mapping->private_lock); if (unlikely(!page_has_buffers(page))) { spin_unlock(&mapping->private_lock); - bh = head = alloc_page_buffers(page, bh_size, 1); + bh = head = alloc_page_buffers(page, bh_size, true); spin_lock(&mapping->private_lock); if (likely(!page_has_buffers(page))) { struct buffer_head *tail; diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index b6f402194f02..ee8392aee9f6 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -507,7 +507,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, if (unlikely(!page_has_buffers(page))) { struct buffer_head *tail; - bh = head = alloc_page_buffers(page, blocksize, 1); + bh = head = alloc_page_buffers(page, blocksize, true); do { set_buffer_uptodate(bh); tail = bh; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 3f70f041dbe9..bb7159f697f2 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -473,7 +473,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) #ifndef NTFS_RW /* For read-only compiled driver, enforce read-only flag. */ - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; #else /* NTFS_RW */ /* * For the read-write compiled driver, if we are remounting read-write, @@ -487,7 +487,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) * When remounting read-only, mark the volume clean if no volume errors * have occurred. */ - if (sb_rdonly(sb) && !(*flags & MS_RDONLY)) { + if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { static const char *es = ". Cannot remount read-write."; /* Remounting read-write. */ @@ -548,7 +548,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) NVolSetErrors(vol); return -EROFS; } - } else if (!sb_rdonly(sb) && (*flags & MS_RDONLY)) { + } else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { /* Remounting read-only. */ if (!NVolErrors(vol)) { if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) @@ -1799,7 +1799,7 @@ static bool load_system_files(ntfs_volume *vol) es3); goto iput_mirr_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", !vol->mftmirr_ino ? es1 : es2, es3); } else @@ -1937,7 +1937,7 @@ get_ctx_vol_failed: es1, es2); goto iput_vol_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -1974,7 +1974,7 @@ get_ctx_vol_failed: } goto iput_logfile_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2019,7 +2019,7 @@ get_ctx_vol_failed: es1, es2); goto iput_root_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2042,7 +2042,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* * Do not set NVolErrors() because ntfs_remount() might manage * to set the dirty flag in which case all would be well. @@ -2055,7 +2055,7 @@ get_ctx_vol_failed: * If (still) a read-write mount, set the NT4 compatibility flag on * newer NTFS version volumes. */ - if (!(sb->s_flags & MS_RDONLY) && (vol->major_ver > 1) && + if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) && ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { static const char *es1 = "Failed to set NT4 compatibility flag"; static const char *es2 = ". Run chkdsk."; @@ -2069,7 +2069,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } #endif @@ -2087,7 +2087,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } #endif /* NTFS_RW */ @@ -2128,7 +2128,7 @@ get_ctx_vol_failed: es1, es2); goto iput_quota_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2150,7 +2150,7 @@ get_ctx_vol_failed: goto iput_quota_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } /* @@ -2171,7 +2171,7 @@ get_ctx_vol_failed: es1, es2); goto iput_usnjrnl_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2194,7 +2194,7 @@ get_ctx_vol_failed: goto iput_usnjrnl_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } #endif /* NTFS_RW */ @@ -2728,7 +2728,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) lockdep_off(); ntfs_debug("Entering."); #ifndef NTFS_RW - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; #endif /* ! NTFS_RW */ /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */ sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index addd7c5f2d3e..ab5105f9767e 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3585,8 +3585,6 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, * The easy case - we can just plop the record right in. */ *left_rec = *split_rec; - - has_empty_extent = 0; } else le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 88a31e9340a0..d1516327b787 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -134,6 +134,19 @@ bail: return err; } +static int ocfs2_lock_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + down_read(&oi->ip_alloc_sem); + ret = ocfs2_get_block(inode, iblock, bh_result, create); + up_read(&oi->ip_alloc_sem); + + return ret; +} + int ocfs2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -2128,7 +2141,7 @@ static void ocfs2_dio_free_write_ctx(struct inode *inode, * called like this: dio->get_blocks(dio->inode, fs_startblk, * fs_count, map_bh, dio->rw == WRITE); */ -static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, +static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2154,12 +2167,9 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, * while file size will be changed. */ if (pos + total_len <= i_size_read(inode)) { - down_read(&oi->ip_alloc_sem); - /* This is the fast path for re-write. */ - ret = ocfs2_get_block(inode, iblock, bh_result, create); - - up_read(&oi->ip_alloc_sem); + /* This is the fast path for re-write. */ + ret = ocfs2_lock_get_block(inode, iblock, bh_result, create); if (buffer_mapped(bh_result) && !buffer_new(bh_result) && ret == 0) @@ -2424,9 +2434,9 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return 0; if (iov_iter_rw(iter) == READ) - get_block = ocfs2_get_block; + get_block = ocfs2_lock_get_block; else - get_block = ocfs2_dio_get_block; + get_block = ocfs2_dio_wr_get_block; return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block, diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index b97bcc6dde7c..b1bb70c8ca4d 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -28,9 +28,6 @@ #include <linux/buffer_head.h> -void ocfs2_end_buffer_io_sync(struct buffer_head *bh, - int uptodate); - int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, struct ocfs2_caching_info *ci); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index d0206042d068..ea8c551bcd7e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2025,7 +2025,7 @@ static struct configfs_item_operations o2hb_region_item_ops = { .release = o2hb_region_release, }; -static struct config_item_type o2hb_region_type = { +static const struct config_item_type o2hb_region_type = { .ct_item_ops = &o2hb_region_item_ops, .ct_attrs = o2hb_region_attrs, .ct_owner = THIS_MODULE, @@ -2310,7 +2310,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { .drop_item = o2hb_heartbeat_group_drop_item, }; -static struct config_item_type o2hb_heartbeat_group_type = { +static const struct config_item_type o2hb_heartbeat_group_type = { .ct_group_ops = &o2hb_heartbeat_group_group_ops, .ct_attrs = o2hb_heartbeat_group_attrs, .ct_owner = THIS_MODULE, diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 3ef5137dc362..a9e67efc0004 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -79,10 +79,8 @@ void o2hb_fill_node_map(unsigned long *map, unsigned bytes); void o2hb_exit(void); int o2hb_init(void); -int o2hb_check_node_heartbeating(u8 node_num); int o2hb_check_node_heartbeating_no_sem(u8 node_num); int o2hb_check_node_heartbeating_from_callback(u8 node_num); -int o2hb_check_local_node_heartbeating(void); void o2hb_stop_all_regions(void); int o2hb_get_all_regions(char *region_uuids, u8 numregions); int o2hb_global_heartbeat_active(void); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index b17d180bdc16..da64c3a20eeb 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -40,6 +40,9 @@ char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { "panic", /* O2NM_FENCE_PANIC */ }; +static inline void o2nm_lock_subsystem(void); +static inline void o2nm_unlock_subsystem(void); + struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { struct o2nm_node *node = NULL; @@ -181,7 +184,10 @@ static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) { /* through the first node_set .parent * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */ - return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); + if (node->nd_item.ci_parent) + return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); + else + return NULL; } enum { @@ -194,7 +200,7 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; unsigned long tmp; char *p = (char *)page; int ret = 0; @@ -214,6 +220,13 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) return -EINVAL; /* XXX */ + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + o2nm_unlock_subsystem(); + return -EINVAL; + } + write_lock(&cluster->cl_nodes_lock); if (cluster->cl_nodes[tmp]) ret = -EEXIST; @@ -226,6 +239,8 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, set_bit(tmp, cluster->cl_nodes_bitmap); } write_unlock(&cluster->cl_nodes_lock); + o2nm_unlock_subsystem(); + if (ret) return ret; @@ -269,7 +284,7 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; int ret, i; struct rb_node **p, *parent; unsigned int octets[4]; @@ -286,6 +301,13 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); } + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + o2nm_unlock_subsystem(); + return -EINVAL; + } + ret = 0; write_lock(&cluster->cl_nodes_lock); if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) @@ -298,6 +320,8 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); } write_unlock(&cluster->cl_nodes_lock); + o2nm_unlock_subsystem(); + if (ret) return ret; @@ -315,7 +339,7 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; unsigned long tmp; char *p = (char *)page; ssize_t ret; @@ -333,17 +357,26 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) return -EINVAL; /* XXX */ + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + ret = -EINVAL; + goto out; + } + /* the only failure case is trying to set a new local node * when a different one is already set */ if (tmp && tmp == cluster->cl_has_local && - cluster->cl_local_node != node->nd_num) - return -EBUSY; + cluster->cl_local_node != node->nd_num) { + ret = -EBUSY; + goto out; + } /* bring up the rx thread if we're setting the new local node. */ if (tmp && !cluster->cl_has_local) { ret = o2net_start_listening(node); if (ret) - return ret; + goto out; } if (!tmp && cluster->cl_has_local && @@ -358,7 +391,11 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, cluster->cl_local_node = node->nd_num; } - return count; + ret = count; + +out: + o2nm_unlock_subsystem(); + return ret; } CONFIGFS_ATTR(o2nm_node_, num); @@ -378,7 +415,7 @@ static struct configfs_item_operations o2nm_node_item_ops = { .release = o2nm_node_release, }; -static struct config_item_type o2nm_node_type = { +static const struct config_item_type o2nm_node_type = { .ct_item_ops = &o2nm_node_item_ops, .ct_attrs = o2nm_node_attrs, .ct_owner = THIS_MODULE, @@ -619,7 +656,7 @@ static struct configfs_group_operations o2nm_node_group_group_ops = { .drop_item = o2nm_node_group_drop_item, }; -static struct config_item_type o2nm_node_group_type = { +static const struct config_item_type o2nm_node_group_type = { .ct_group_ops = &o2nm_node_group_group_ops, .ct_owner = THIS_MODULE, }; @@ -637,7 +674,7 @@ static struct configfs_item_operations o2nm_cluster_item_ops = { .release = o2nm_cluster_release, }; -static struct config_item_type o2nm_cluster_type = { +static const struct config_item_type o2nm_cluster_type = { .ct_item_ops = &o2nm_cluster_item_ops, .ct_attrs = o2nm_cluster_attrs, .ct_owner = THIS_MODULE, @@ -722,7 +759,7 @@ static struct configfs_group_operations o2nm_cluster_group_group_ops = { .drop_item = o2nm_cluster_group_drop_item, }; -static struct config_item_type o2nm_cluster_group_type = { +static const struct config_item_type o2nm_cluster_group_type = { .ct_group_ops = &o2nm_cluster_group_group_ops, .ct_owner = THIS_MODULE, }; @@ -738,6 +775,16 @@ static struct o2nm_cluster_group o2nm_cluster_group = { }, }; +static inline void o2nm_lock_subsystem(void) +{ + mutex_lock(&o2nm_cluster_group.cs_subsys.su_mutex); +} + +static inline void o2nm_unlock_subsystem(void) +{ + mutex_unlock(&o2nm_cluster_group.cs_subsys.su_mutex); +} + int o2nm_depend_item(struct config_item *item) { return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 8d779227370a..bebe59feca58 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -140,7 +140,7 @@ static void o2net_rx_until_empty(struct work_struct *work); static void o2net_shutdown_sc(struct work_struct *work); static void o2net_listen_data_ready(struct sock *sk); static void o2net_sc_send_keep_req(struct work_struct *work); -static void o2net_idle_timer(unsigned long data); +static void o2net_idle_timer(struct timer_list *t); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); @@ -450,8 +450,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); - setup_timer(&sc->sc_idle_timeout, o2net_idle_timer, - (unsigned long)sc); + timer_setup(&sc->sc_idle_timeout, o2net_idle_timer, 0); sclog(sc, "alloced\n"); @@ -1517,9 +1516,9 @@ static void o2net_sc_send_keep_req(struct work_struct *work) /* socket shutdown does a del_timer_sync against this as it tears down. * we can't start this timer until we've got to the point in sc buildup * where shutdown is going to be involved */ -static void o2net_idle_timer(unsigned long data) +static void o2net_idle_timer(struct timer_list *t) { - struct o2net_sock_container *sc = (struct o2net_sock_container *)data; + struct o2net_sock_container *sc = from_timer(sc, t, sc_idle_timeout); struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); #ifdef CONFIG_DEBUG_FS unsigned long msecs = ktime_to_ms(ktime_get()) - diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index a2b19fbdcf46..e1fea149f50b 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -394,7 +394,6 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm) static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) { if (dlm->dlm_worker) { - flush_workqueue(dlm->dlm_worker); destroy_workqueue(dlm->dlm_worker); dlm->dlm_worker = NULL; } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3e04279446e8..9c3e0f13ca87 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2616,7 +2616,9 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, * otherwise the assert_master from the new * master will destroy this. */ - dlm_get_mle_inuse(mle); + if (ret != -EEXIST) + dlm_get_mle_inuse(mle); + spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 74407c6dd592..ec8f75813beb 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2419,6 +2419,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dlm_lockres_put(res); continue; } + dlm_move_lockres_to_recovery_list(dlm, res); } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 9ab9e1892b5f..9c7c18c0e129 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -88,13 +88,13 @@ struct workqueue_struct *user_dlm_worker; */ #define DLMFS_CAPABILITIES "bast stackglue" static int param_set_dlmfs_capabilities(const char *val, - struct kernel_param *kp) + const struct kernel_param *kp) { printk(KERN_ERR "%s: readonly parameter\n", kp->name); return -EINVAL; } static int param_get_dlmfs_capabilities(char *buffer, - struct kernel_param *kp) + const struct kernel_param *kp) { return strlcpy(buffer, DLMFS_CAPABILITIES, strlen(DLMFS_CAPABILITIES) + 1); @@ -670,7 +670,6 @@ static void __exit exit_dlmfs_fs(void) { unregister_filesystem(&dlmfs_fs_type); - flush_workqueue(user_dlm_worker); destroy_workqueue(user_dlm_worker); /* diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6e41fc8fabbe..a1d051055472 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -227,7 +227,7 @@ int ocfs2_should_update_atime(struct inode *inode, return 0; if ((inode->i_flags & S_NOATIME) || - ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) + ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))) return 0; /* @@ -1161,6 +1161,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { + /* + * Here we should wait dio to finish before inode lock + * to avoid a deadlock between ocfs2_setattr() and + * ocfs2_dio_end_io_write() + */ + inode_dio_wait(inode); + status = ocfs2_rw_lock(inode, 1); if (status < 0) { mlog_errno(status); @@ -1200,8 +1207,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (status) goto bail_unlock; - inode_dio_wait(inode); - if (i_size_read(inode) >= attr->ia_size) { if (ocfs2_should_order_data(inode)) { status = ocfs2_begin_ordered_truncate(inode, diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 2cabbcf2f28e..e87279e49ba3 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -129,19 +129,13 @@ static struct kobj_attribute ocfs2_attr_filecheck_set = ocfs2_filecheck_show, ocfs2_filecheck_store); -static int ocfs2_filecheck_sysfs_wait(atomic_t *p) -{ - schedule(); - return 0; -} - static void ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) { struct ocfs2_filecheck_entry *p; if (!atomic_dec_and_test(&entry->fs_count)) - wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait, + wait_on_atomic_t(&entry->fs_count, atomic_t_wait, TASK_UNINTERRUPTIBLE); spin_lock(&entry->fs_fcheck->fc_lock); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 71f22c8fbffd..9f0b95abc09f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1147,12 +1147,9 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT, NULL, ALLOC_NEW_GROUP); - if (status < 0 && status != -ENOSPC) { + if (status < 0 && status != -ENOSPC) mlog_errno(status); - goto bail; - } -bail: return status; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 80733496b22a..80efa5699fb0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -675,9 +675,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) } /* We're going to/from readonly mode. */ - if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) { + if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { /* Disable quota accounting before remounting RO */ - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { ret = ocfs2_susp_quotas(osb, 0); if (ret < 0) goto out; @@ -691,8 +691,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto unlock_osb; } - if (*flags & MS_RDONLY) { - sb->s_flags |= MS_RDONLY; + if (*flags & SB_RDONLY) { + sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; } else { if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { @@ -709,14 +709,14 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) ret = -EINVAL; goto unlock_osb; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; } trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); unlock_osb: spin_unlock(&osb->osb_lock); /* Enable quota accounting after remounting RW */ - if (!ret && !(*flags & MS_RDONLY)) { + if (!ret && !(*flags & SB_RDONLY)) { if (sb_any_quota_suspended(sb)) ret = ocfs2_susp_quotas(osb, 1); else @@ -724,7 +724,7 @@ unlock_osb: if (ret < 0) { /* Return back changes... */ spin_lock(&osb->osb_lock); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; spin_unlock(&osb->osb_lock); goto out; @@ -744,9 +744,9 @@ unlock_osb: if (!ocfs2_is_hard_readonly(osb)) ocfs2_set_journal_params(osb); - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? - MS_POSIXACL : 0); + SB_POSIXACL : 0); } out: return ret; @@ -1057,10 +1057,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = OCFS2_SUPER_MAGIC; - sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | - ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~(SB_POSIXACL | SB_NOSEC)) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? SB_POSIXACL : 0); - /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, + /* Hard readonly mode only if: bdev_read_only, SB_RDONLY, * heartbeat=none */ if (bdev_read_only(sb->s_bdev)) { if (!sb_rdonly(sb)) { @@ -2057,7 +2057,7 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; sb->s_xattr = ocfs2_xattr_handlers; sb->s_time_gran = 1; - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; /* this is needed to support O_LARGEFILE */ cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); @@ -2521,10 +2521,8 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) /* This function assumes that the caller has the main osb resource */ /* ocfs2_initializer_super have already created this workqueue */ - if (osb->ocfs2_wq) { - flush_workqueue(osb->ocfs2_wq); + if (osb->ocfs2_wq) destroy_workqueue(osb->ocfs2_wq); - } ocfs2_free_slot_info(osb); @@ -2570,7 +2568,7 @@ static int ocfs2_handle_error(struct super_block *sb) return rv; pr_crit("OCFS2: File system is now read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ocfs2_set_ro_flag(osb, 0); } diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index b023e4f3d740..d4550c8bbc41 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -26,9 +26,6 @@ #ifndef OCFS2_SUPER_H #define OCFS2_SUPER_H -int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, - int node_num); - __printf(3, 4) int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 5fdf269ba82e..c5898c59d411 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -901,7 +901,7 @@ static int ocfs2_xattr_list_entry(struct super_block *sb, case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS: case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT: - if (!(sb->s_flags & MS_POSIXACL)) + if (!(sb->s_flags & SB_POSIXACL)) return 0; break; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 13215f26e321..2200662a9bf1 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -369,7 +369,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino) static int openprom_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; return 0; } @@ -386,7 +386,7 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent) struct op_inode_info *oi; int ret; - s->s_flags |= MS_NOATIME; + s->s_flags |= SB_NOATIME; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = OPENPROM_SUPER_MAGIC; diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index c2d8233b1e82..480ea059a680 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -155,13 +155,11 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) int orangefs_init_acl(struct inode *inode, struct inode *dir) { - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct posix_acl *default_acl, *acl; umode_t mode = inode->i_mode; + struct iattr iattr; int error = 0; - ClearModeFlag(orangefs_inode); - error = posix_acl_create(dir, &mode, &default_acl, &acl); if (error) return error; @@ -180,9 +178,11 @@ int orangefs_init_acl(struct inode *inode, struct inode *dir) /* If mode of the inode was changed, then do a forcible ->setattr */ if (mode != inode->i_mode) { - SetModeFlag(orangefs_inode); + memset(&iattr, 0, sizeof iattr); inode->i_mode = mode; - orangefs_flush_inode(inode); + iattr.ia_mode = mode; + iattr.ia_valid |= ATTR_MODE; + orangefs_inode_setattr(inode, &iattr); } return error; diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index ded456f17de6..c584ad8d023c 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -162,7 +162,7 @@ static ssize_t orangefs_devreq_read(struct file *file, struct orangefs_kernel_op_s *op, *temp; __s32 proto_ver = ORANGEFS_KERNEL_PROTO_VERSION; static __s32 magic = ORANGEFS_DEVREQ_MAGIC; - struct orangefs_kernel_op_s *cur_op = NULL; + struct orangefs_kernel_op_s *cur_op; unsigned long ret; /* We do not support blocking IO. */ @@ -186,6 +186,7 @@ static ssize_t orangefs_devreq_read(struct file *file, return -EAGAIN; restart: + cur_op = NULL; /* Get next op (if any) from top of list. */ spin_lock(&orangefs_request_list_lock); list_for_each_entry_safe(op, temp, &orangefs_request_list, list) { diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index a8cc588d6224..e2c2699d8016 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -386,7 +386,6 @@ static int orangefs_dir_release(struct inode *inode, struct file *file) { struct orangefs_dir *od = file->private_data; struct orangefs_dir_part *part = od->part; - orangefs_flush_inode(inode); while (part) { struct orangefs_dir_part *next = part->next; vfree(part); diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index e4a8e6a7eb17..0d228cd087e6 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -383,9 +383,15 @@ out: if (type == ORANGEFS_IO_READ) { file_accessed(file); } else { - SetMtimeFlag(orangefs_inode); - inode->i_mtime = current_time(inode); - mark_inode_dirty_sync(inode); + file_update_time(file); + /* + * Must invalidate to ensure write loop doesn't + * prevent kernel from reading updated + * attribute. Size probably changed because of + * the write, and other clients could update + * any other attribute. + */ + orangefs_inode->getattr_time = jiffies - 1; } } @@ -446,7 +452,7 @@ ssize_t orangefs_inode_read(struct inode *inode, static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; - loff_t pos = *(&iocb->ki_pos); + loff_t pos = iocb->ki_pos; ssize_t rc = 0; BUG_ON(iocb->private); @@ -486,9 +492,6 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite } } - if (file->f_pos > i_size_read(file->f_mapping->host)) - orangefs_i_size_write(file->f_mapping->host, file->f_pos); - rc = generic_write_checks(iocb, iter); if (rc <= 0) { @@ -502,7 +505,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite * pos to the end of the file, so we will wait till now to set * pos... */ - pos = *(&iocb->ki_pos); + pos = iocb->ki_pos; rc = do_readv_writev(ORANGEFS_IO_WRITE, file, @@ -615,8 +618,6 @@ static int orangefs_file_release(struct inode *inode, struct file *file) "orangefs_file_release: called on %pD\n", file); - orangefs_flush_inode(inode); - /* * remove all associated inode pages from the page cache and * readahead cache (if any); this forces an expensive refresh of @@ -666,8 +667,6 @@ static int orangefs_fsync(struct file *file, ret); op_release(new_op); - - orangefs_flush_inode(file_inode(file)); return ret; } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 28825a5b6d09..fe1d705ad91f 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -290,6 +290,22 @@ int orangefs_permission(struct inode *inode, int mask) return generic_permission(inode, mask); } +int orangefs_update_time(struct inode *inode, struct timespec *time, int flags) +{ + struct iattr iattr; + gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n", + get_khandle_from_ino(inode)); + generic_update_time(inode, time, flags); + memset(&iattr, 0, sizeof iattr); + if (flags & S_ATIME) + iattr.ia_valid |= ATTR_ATIME; + if (flags & S_CTIME) + iattr.ia_valid |= ATTR_CTIME; + if (flags & S_MTIME) + iattr.ia_valid |= ATTR_MTIME; + return orangefs_inode_setattr(inode, &iattr); +} + /* ORANGEDS2 implementation of VFS inode operations for files */ const struct inode_operations orangefs_file_inode_operations = { .get_acl = orangefs_get_acl, @@ -298,6 +314,7 @@ const struct inode_operations orangefs_file_inode_operations = { .getattr = orangefs_getattr, .listxattr = orangefs_listxattr, .permission = orangefs_permission, + .update_time = orangefs_update_time, }; static int orangefs_init_iops(struct inode *inode) diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 7e9e5d0ea3bc..c98bba2dbc94 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -22,7 +22,9 @@ static int orangefs_create(struct inode *dir, { struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; + struct orangefs_object_kref ref; struct inode *inode; + struct iattr iattr; int ret; gossip_debug(GOSSIP_NAME_DEBUG, "%s: %pd\n", @@ -55,8 +57,10 @@ static int orangefs_create(struct inode *dir, if (ret < 0) goto out; - inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, - &new_op->downcall.resp.create.refn); + ref = new_op->downcall.resp.create.refn; + op_release(new_op); + + inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, &ref); if (IS_ERR(inode)) { gossip_err("%s: Failed to allocate inode for file :%pd:\n", __func__, @@ -82,12 +86,13 @@ static int orangefs_create(struct inode *dir, __func__, dentry); - SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_time(dir); + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME; + orangefs_inode_setattr(dir, &iattr); mark_inode_dirty_sync(dir); ret = 0; out: - op_release(new_op); gossip_debug(GOSSIP_NAME_DEBUG, "%s: %pd: returning %d\n", __func__, @@ -221,6 +226,7 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; + struct iattr iattr; int ret; gossip_debug(GOSSIP_NAME_DEBUG, @@ -253,8 +259,10 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) if (!ret) { drop_nlink(inode); - SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_time(dir); + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME; + orangefs_inode_setattr(dir, &iattr); mark_inode_dirty_sync(dir); } return ret; @@ -266,7 +274,9 @@ static int orangefs_symlink(struct inode *dir, { struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; + struct orangefs_object_kref ref; struct inode *inode; + struct iattr iattr; int mode = 755; int ret; @@ -307,8 +317,10 @@ static int orangefs_symlink(struct inode *dir, goto out; } - inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0, - &new_op->downcall.resp.sym.refn); + ref = new_op->downcall.resp.sym.refn; + op_release(new_op); + + inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0, &ref); if (IS_ERR(inode)) { gossip_err ("*** Failed to allocate orangefs symlink inode\n"); @@ -331,12 +343,13 @@ static int orangefs_symlink(struct inode *dir, get_khandle_from_ino(inode), dentry); - SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_time(dir); + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME; + orangefs_inode_setattr(dir, &iattr); mark_inode_dirty_sync(dir); ret = 0; out: - op_release(new_op); return ret; } @@ -344,7 +357,9 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode { struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; + struct orangefs_object_kref ref; struct inode *inode; + struct iattr iattr; int ret; new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR); @@ -373,8 +388,10 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode goto out; } - inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0, - &new_op->downcall.resp.mkdir.refn); + ref = new_op->downcall.resp.mkdir.refn; + op_release(new_op); + + inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0, &ref); if (IS_ERR(inode)) { gossip_err("*** Failed to allocate orangefs dir inode\n"); ret = PTR_ERR(inode); @@ -400,11 +417,12 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode * NOTE: we have no good way to keep nlink consistent for directories * across clients; keep constant at 1. */ - SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_time(dir); + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME; + orangefs_inode_setattr(dir, &iattr); mark_inode_dirty_sync(dir); out: - op_release(new_op); return ret; } @@ -470,4 +488,5 @@ const struct inode_operations orangefs_dir_inode_operations = { .getattr = orangefs_getattr, .listxattr = orangefs_listxattr, .permission = orangefs_permission, + .update_time = orangefs_update_time, }; diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h index b6001bb28f5a..c7db56a31b92 100644 --- a/fs/orangefs/orangefs-debug.h +++ b/fs/orangefs/orangefs-debug.h @@ -15,8 +15,10 @@ #ifdef __KERNEL__ #include <linux/types.h> +#include <linux/kernel.h> #else #include <stdint.h> +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif #define GOSSIP_NO_DEBUG (__u64)0 @@ -88,6 +90,6 @@ static struct __keyword_mask_s s_kmod_keyword_mask_map[] = { }; static const int num_kmod_keyword_mask_map = (int) - (sizeof(s_kmod_keyword_mask_map) / sizeof(struct __keyword_mask_s)); + (ARRAY_SIZE(s_kmod_keyword_mask_map)); #endif /* __ORANGEFS_DEBUG_H */ diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 004af348fb80..2595453fe737 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -209,37 +209,10 @@ struct orangefs_inode_s { struct inode vfs_inode; sector_t last_failed_block_index_read; - /* - * State of in-memory attributes not yet flushed to disk associated - * with this object - */ - unsigned long pinode_flags; - unsigned long getattr_time; u32 getattr_mask; }; -#define P_ATIME_FLAG 0 -#define P_MTIME_FLAG 1 -#define P_CTIME_FLAG 2 -#define P_MODE_FLAG 3 - -#define ClearAtimeFlag(pinode) clear_bit(P_ATIME_FLAG, &(pinode)->pinode_flags) -#define SetAtimeFlag(pinode) set_bit(P_ATIME_FLAG, &(pinode)->pinode_flags) -#define AtimeFlag(pinode) test_bit(P_ATIME_FLAG, &(pinode)->pinode_flags) - -#define ClearMtimeFlag(pinode) clear_bit(P_MTIME_FLAG, &(pinode)->pinode_flags) -#define SetMtimeFlag(pinode) set_bit(P_MTIME_FLAG, &(pinode)->pinode_flags) -#define MtimeFlag(pinode) test_bit(P_MTIME_FLAG, &(pinode)->pinode_flags) - -#define ClearCtimeFlag(pinode) clear_bit(P_CTIME_FLAG, &(pinode)->pinode_flags) -#define SetCtimeFlag(pinode) set_bit(P_CTIME_FLAG, &(pinode)->pinode_flags) -#define CtimeFlag(pinode) test_bit(P_CTIME_FLAG, &(pinode)->pinode_flags) - -#define ClearModeFlag(pinode) clear_bit(P_MODE_FLAG, &(pinode)->pinode_flags) -#define SetModeFlag(pinode) set_bit(P_MODE_FLAG, &(pinode)->pinode_flags) -#define ModeFlag(pinode) test_bit(P_MODE_FLAG, &(pinode)->pinode_flags) - /* per superblock private orangefs info */ struct orangefs_sb_info_s { struct orangefs_khandle root_khandle; @@ -275,12 +248,6 @@ struct orangefs_kiocb_s { /* orangefs kernel operation type */ struct orangefs_kernel_op_s *op; - /* The user space buffers from/to which I/O is being staged */ - struct iovec *iov; - - /* number of elements in the iovector */ - unsigned long nr_segs; - /* set to indicate the type of the operation */ int rw; @@ -442,6 +409,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, int orangefs_permission(struct inode *inode, int mask); +int orangefs_update_time(struct inode *, struct timespec *, int); + /* * defined in xattr.c */ @@ -484,8 +453,6 @@ bool __is_daemon_in_service(void); */ __s32 fsid_of_op(struct orangefs_kernel_op_s *op); -int orangefs_flush_inode(struct inode *inode); - ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, void *buffer, @@ -566,17 +533,6 @@ do { \ sys_attr.mask = ORANGEFS_ATTR_SYS_ALL_SETABLE; \ } while (0) -static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size) -{ -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) - inode_lock(inode); -#endif - i_size_write(inode, i_size); -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) - inode_unlock(inode); -#endif -} - static inline void orangefs_set_timeout(struct dentry *dentry) { unsigned long time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index f82336496311..97fe93129f38 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -4,6 +4,7 @@ * * See COPYING in top-level directory. */ +#include <linux/kernel.h> #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-dev-proto.h" @@ -437,89 +438,8 @@ int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr) op_release(new_op); - /* - * successful setattr should clear the atime, mtime and - * ctime flags. - */ - if (ret == 0) { - ClearAtimeFlag(orangefs_inode); - ClearMtimeFlag(orangefs_inode); - ClearCtimeFlag(orangefs_inode); - ClearModeFlag(orangefs_inode); + if (ret == 0) orangefs_inode->getattr_time = jiffies - 1; - } - - return ret; -} - -int orangefs_flush_inode(struct inode *inode) -{ - /* - * If it is a dirty inode, this function gets called. - * Gather all the information that needs to be setattr'ed - * Right now, this will only be used for mode, atime, mtime - * and/or ctime. - */ - struct iattr wbattr; - int ret; - int mtime_flag; - int ctime_flag; - int atime_flag; - int mode_flag; - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); - - memset(&wbattr, 0, sizeof(wbattr)); - - /* - * check inode flags up front, and clear them if they are set. This - * will prevent multiple processes from all trying to flush the same - * inode if they call close() simultaneously - */ - mtime_flag = MtimeFlag(orangefs_inode); - ClearMtimeFlag(orangefs_inode); - ctime_flag = CtimeFlag(orangefs_inode); - ClearCtimeFlag(orangefs_inode); - atime_flag = AtimeFlag(orangefs_inode); - ClearAtimeFlag(orangefs_inode); - mode_flag = ModeFlag(orangefs_inode); - ClearModeFlag(orangefs_inode); - - /* -- Lazy atime,mtime and ctime update -- - * Note: all times are dictated by server in the new scheme - * and not by the clients - * - * Also mode updates are being handled now.. - */ - - if (mtime_flag) - wbattr.ia_valid |= ATTR_MTIME; - if (ctime_flag) - wbattr.ia_valid |= ATTR_CTIME; - if (atime_flag) - wbattr.ia_valid |= ATTR_ATIME; - - if (mode_flag) { - wbattr.ia_mode = inode->i_mode; - wbattr.ia_valid |= ATTR_MODE; - } - - gossip_debug(GOSSIP_UTILS_DEBUG, - "*********** orangefs_flush_inode: %pU " - "(ia_valid %d)\n", - get_khandle_from_ino(inode), - wbattr.ia_valid); - if (wbattr.ia_valid == 0) { - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_flush_inode skipping setattr()\n"); - return 0; - } - - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_flush_inode (%pU) writing mode %o\n", - get_khandle_from_ino(inode), - inode->i_mode); - - ret = orangefs_inode_setattr(inode, &wbattr); return ret; } @@ -606,7 +526,7 @@ int orangefs_normalize_to_errno(__s32 error_code) /* Convert ORANGEFS encoded errno values into regular errno values. */ } else if ((-error_code) & ORANGEFS_ERROR_BIT) { i = (-error_code) & ~(ORANGEFS_ERROR_BIT|ORANGEFS_ERROR_CLASS_BITS); - if (i < sizeof(PINT_errno_mapping)/sizeof(*PINT_errno_mapping)) + if (i < ARRAY_SIZE(PINT_errno_mapping)) error_code = -PINT_errno_mapping[i]; else error_code = -EINVAL; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 47ebd9bfd1a1..36f1390b5ed7 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -40,7 +40,7 @@ static int orangefs_show_options(struct seq_file *m, struct dentry *root) { struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(root->d_sb); - if (root->d_sb->s_flags & MS_POSIXACL) + if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); if (orangefs_sb->flags & ORANGEFS_OPT_INTR) seq_puts(m, ",intr"); @@ -60,7 +60,7 @@ static int parse_mount_options(struct super_block *sb, char *options, * Force any potential flags that might be set from the mount * to zero, ie, initialize to unset. */ - sb->s_flags &= ~MS_POSIXACL; + sb->s_flags &= ~SB_POSIXACL; orangefs_sb->flags &= ~ORANGEFS_OPT_INTR; orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK; @@ -73,7 +73,7 @@ static int parse_mount_options(struct super_block *sb, char *options, token = match_token(p, tokens, args); switch (token) { case Opt_acl: - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; break; case Opt_intr: orangefs_sb->flags |= ORANGEFS_OPT_INTR; @@ -99,8 +99,6 @@ static void orangefs_inode_cache_ctor(void *req) inode_init_once(&orangefs_inode->vfs_inode); init_rwsem(&orangefs_inode->xattr_sem); - - orangefs_inode->vfs_inode.i_version = 1; } static struct inode *orangefs_alloc_inode(struct super_block *sb) @@ -119,7 +117,6 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb) orangefs_inode->refn.fs_id = ORANGEFS_FS_ID_NULL; orangefs_inode->last_failed_block_index_read = 0; memset(orangefs_inode->link_target, 0, sizeof(orangefs_inode->link_target)); - orangefs_inode->pinode_flags = 0; gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_alloc_inode: allocated %p\n", @@ -299,21 +296,9 @@ void fsid_key_table_finalize(void) { } -/* Called whenever the VFS dirties the inode in response to atime updates */ -static void orangefs_dirty_inode(struct inode *inode, int flags) -{ - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); - - gossip_debug(GOSSIP_SUPER_DEBUG, - "orangefs_dirty_inode: %pU\n", - get_khandle_from_ino(inode)); - SetAtimeFlag(orangefs_inode); -} - static const struct super_operations orangefs_s_ops = { .alloc_inode = orangefs_alloc_inode, .destroy_inode = orangefs_destroy_inode, - .dirty_inode = orangefs_dirty_inode, .drop_inode = generic_delete_inode, .statfs = orangefs_statfs, .remount_fs = orangefs_remount_fs, @@ -522,7 +507,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, ret = orangefs_fill_sb(sb, &new_op->downcall.resp.fs_mount, data, - flags & MS_SILENT ? 1 : 0); + flags & SB_SILENT ? 1 : 0); if (ret) { d = ERR_PTR(ret); diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c index d856cdf91763..db107fe91ab3 100644 --- a/fs/orangefs/symlink.c +++ b/fs/orangefs/symlink.c @@ -15,4 +15,5 @@ const struct inode_operations orangefs_symlink_inode_operations = { .getattr = orangefs_getattr, .listxattr = orangefs_listxattr, .permission = orangefs_permission, + .update_time = orangefs_update_time, }; diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c index 835c6e148afc..0577d6dba8c8 100644 --- a/fs/orangefs/waitqueue.c +++ b/fs/orangefs/waitqueue.c @@ -29,10 +29,10 @@ static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s */ void purge_waiting_ops(void) { - struct orangefs_kernel_op_s *op; + struct orangefs_kernel_op_s *op, *tmp; spin_lock(&orangefs_request_list_lock); - list_for_each_entry(op, &orangefs_request_list, list) { + list_for_each_entry_safe(op, tmp, &orangefs_request_list, list) { gossip_debug(GOSSIP_WAIT_DEBUG, "pvfs2-client-core: purging op tag %llu %s\n", llu(op->tag), diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index cbfc196e5dc5..5ac415466861 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -24,6 +24,16 @@ config OVERLAY_FS_REDIRECT_DIR an overlay which has redirects on a kernel that doesn't support this feature will have unexpected results. +config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW + bool "Overlayfs: follow redirects even if redirects are turned off" + default y + depends on OVERLAY_FS + help + Disable this to get a possibly more secure configuration, but that + might not be backward compatible with previous kernels. + + For more information, see Documentation/filesystems/overlayfs.txt + config OVERLAY_FS_INDEX bool "Overlayfs: turn on inodes index feature by default" depends on OVERLAY_FS diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index c441f9387a1b..eb3b8d39fb61 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -22,7 +22,6 @@ #include <linux/ratelimit.h> #include <linux/exportfs.h> #include "overlayfs.h" -#include "ovl_entry.h" #define OVL_COPY_UP_CHUNK_SIZE (1 << 20) @@ -486,6 +485,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) { struct inode *udir = c->destdir->d_inode; + struct inode *inode; struct dentry *newdentry = NULL; struct dentry *temp = NULL; int err; @@ -508,7 +508,11 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) if (err) goto out_cleanup; - ovl_inode_update(d_inode(c->dentry), newdentry); + inode = d_inode(c->dentry); + ovl_inode_update(inode, newdentry); + if (S_ISDIR(inode->i_mode)) + ovl_set_flag(OVL_WHITEOUTS, inode); + out: dput(temp); return err; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index cc961a3bd3bd..f9788bc116a8 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -181,6 +181,11 @@ static bool ovl_type_origin(struct dentry *dentry) return OVL_TYPE_ORIGIN(ovl_path_type(dentry)); } +static bool ovl_may_have_whiteouts(struct dentry *dentry) +{ + return ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry)); +} + static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct cattr *attr, struct dentry *hardlink) { @@ -300,7 +305,6 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry) { int err; struct dentry *ret = NULL; - enum ovl_path_type type = ovl_path_type(dentry); LIST_HEAD(list); err = ovl_check_empty_dir(dentry, &list); @@ -313,13 +317,13 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry) * When removing an empty opaque directory, then it makes no sense to * replace it with an exact replica of itself. * - * If no upperdentry then skip clearing whiteouts. + * If upperdentry has whiteouts, clear them. * * Can race with copy-up, since we don't hold the upperdir mutex. * Doesn't matter, since copy-up can't create a non-empty directory * from an empty one. */ - if (OVL_TYPE_UPPER(type) && OVL_TYPE_MERGE(type)) + if (!list_empty(&list)) ret = ovl_clear_empty(dentry, &list); out_free: @@ -698,8 +702,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) struct dentry *opaquedir = NULL; int err; - /* Redirect dir can be !ovl_lower_positive && OVL_TYPE_MERGE */ - if (is_dir && ovl_dentry_get_redirect(dentry)) { + /* Redirect/origin dir can be !ovl_lower_positive && not clean */ + if (is_dir && (ovl_dentry_get_redirect(dentry) || + ovl_may_have_whiteouts(dentry))) { opaquedir = ovl_check_empty_and_clear(dentry); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) @@ -882,7 +887,8 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) spin_unlock(&dentry->d_lock); } else { kfree(redirect); - pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err); + pr_warn_ratelimited("overlayfs: failed to set redirect (%i)\n", + err); /* Fall back to userspace copy-up */ err = -EXDEV; } @@ -946,7 +952,8 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, old_cred = ovl_override_creds(old->d_sb); - if (overwrite && new_is_dir && ovl_type_merge_or_lower(new)) { + if (overwrite && new_is_dir && (ovl_type_merge_or_lower(new) || + ovl_may_have_whiteouts(new))) { opaquedir = ovl_check_empty_and_clear(new); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) { @@ -1069,9 +1076,10 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, drop_nlink(d_inode(new)); } - ovl_dentry_version_inc(old->d_parent, - !overwrite && ovl_type_origin(new)); - ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old)); + ovl_dentry_version_inc(old->d_parent, ovl_type_origin(old) || + (!overwrite && ovl_type_origin(new))); + ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old) || + (d_inode(new) && ovl_type_origin(new))); out_dput: dput(newdentry); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 321511ed8c42..00b6b294272a 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -15,6 +15,14 @@ #include <linux/ratelimit.h> #include "overlayfs.h" + +static dev_t ovl_get_pseudo_dev(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->lowerstack[0].layer->pseudo_dev; +} + int ovl_setattr(struct dentry *dentry, struct iattr *attr) { int err; @@ -66,6 +74,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat, struct path realpath; const struct cred *old_cred; bool is_dir = S_ISDIR(dentry->d_inode->i_mode); + bool samefs = ovl_same_sb(dentry->d_sb); int err; type = ovl_path_real(dentry, &realpath); @@ -75,16 +84,13 @@ int ovl_getattr(const struct path *path, struct kstat *stat, goto out; /* - * When all layers are on the same fs, all real inode number are - * unique, so we use the overlay st_dev, which is friendly to du -x. - * - * We also use st_ino of the copy up origin, if we know it. - * This guaranties constant st_dev/st_ino across copy up. + * For non-dir or same fs, we use st_ino of the copy up origin, if we + * know it. This guaranties constant st_dev/st_ino across copy up. * * If filesystem supports NFS export ops, this also guaranties * persistent st_ino across mount cycle. */ - if (ovl_same_sb(dentry->d_sb)) { + if (!is_dir || samefs) { if (OVL_TYPE_ORIGIN(type)) { struct kstat lowerstat; u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0); @@ -95,7 +101,6 @@ int ovl_getattr(const struct path *path, struct kstat *stat, if (err) goto out; - WARN_ON_ONCE(stat->dev != lowerstat.dev); /* * Lower hardlinks may be broken on copy up to different * upper files, so we cannot use the lower origin st_ino @@ -107,17 +112,36 @@ int ovl_getattr(const struct path *path, struct kstat *stat, if (is_dir || lowerstat.nlink == 1 || ovl_test_flag(OVL_INDEX, d_inode(dentry))) stat->ino = lowerstat.ino; + + if (samefs) + WARN_ON_ONCE(stat->dev != lowerstat.dev); + else + stat->dev = ovl_get_pseudo_dev(dentry); } - stat->dev = dentry->d_sb->s_dev; - } else if (is_dir) { + if (samefs) { + /* + * When all layers are on the same fs, all real inode + * number are unique, so we use the overlay st_dev, + * which is friendly to du -x. + */ + stat->dev = dentry->d_sb->s_dev; + } else if (!OVL_TYPE_UPPER(type)) { + /* + * For non-samefs setup, to make sure that st_dev/st_ino + * pair is unique across the system, we use a unique + * anonymous st_dev for lower layer inode. + */ + stat->dev = ovl_get_pseudo_dev(dentry); + } + } else { /* - * If not all layers are on the same fs the pair {real st_ino; - * overlay st_dev} is not unique, so use the non persistent - * overlay st_ino. - * * Always use the overlay st_dev for directories, so 'find * -xdev' will scan the entire overlay mount and won't cross the * overlay mount boundaries. + * + * If not all layers are on the same fs the pair {real st_ino; + * overlay st_dev} is not unique, so use the non persistent + * overlay st_ino for directories. */ stat->dev = dentry->d_sb->s_dev; stat->ino = dentry->d_inode->i_ino; @@ -409,6 +433,7 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) #ifdef CONFIG_LOCKDEP static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING]; static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING]; + static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING]; int depth = inode->i_sb->s_stack_depth - 1; @@ -419,6 +444,8 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]); else lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]); + + lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]); #endif } @@ -657,6 +684,16 @@ struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry, if (upperdentry && ovl_is_impuredir(upperdentry)) ovl_set_flag(OVL_IMPURE, inode); + /* Check for non-merge dir that may have whiteouts */ + if (S_ISDIR(realinode->i_mode)) { + struct ovl_entry *oe = dentry->d_fsdata; + + if (((upperdentry && lowerdentry) || oe->numlower > 1) || + ovl_check_origin_xattr(upperdentry ?: lowerdentry)) { + ovl_set_flag(OVL_WHITEOUTS, inode); + } + } + if (inode->i_state & I_NEW) unlock_new_inode(inode); out: diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index a12dc10bf726..beb945e1963c 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -15,7 +15,6 @@ #include <linux/mount.h> #include <linux/exportfs.h> #include "overlayfs.h" -#include "ovl_entry.h" struct ovl_lookup_data { struct qstr name; @@ -286,16 +285,15 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, static int ovl_check_origin(struct dentry *upperdentry, - struct path *lowerstack, unsigned int numlower, - struct path **stackp, unsigned int *ctrp) + struct ovl_path *lower, unsigned int numlower, + struct ovl_path **stackp, unsigned int *ctrp) { struct vfsmount *mnt; struct dentry *origin = NULL; int i; - for (i = 0; i < numlower; i++) { - mnt = lowerstack[i].mnt; + mnt = lower[i].layer->mnt; origin = ovl_get_origin(upperdentry, mnt); if (IS_ERR(origin)) return PTR_ERR(origin); @@ -309,12 +307,12 @@ static int ovl_check_origin(struct dentry *upperdentry, BUG_ON(*ctrp); if (!*stackp) - *stackp = kmalloc(sizeof(struct path), GFP_KERNEL); + *stackp = kmalloc(sizeof(struct ovl_path), GFP_KERNEL); if (!*stackp) { dput(origin); return -ENOMEM; } - **stackp = (struct path) { .dentry = origin, .mnt = mnt }; + **stackp = (struct ovl_path){.dentry = origin, .layer = lower[i].layer}; *ctrp = 1; return 0; @@ -350,8 +348,8 @@ static int ovl_verify_origin_fh(struct dentry *dentry, const struct ovl_fh *fh) * * Return 0 on match, -ESTALE on mismatch, < 0 on error. */ -int ovl_verify_origin(struct dentry *dentry, struct vfsmount *mnt, - struct dentry *origin, bool is_upper, bool set) +int ovl_verify_origin(struct dentry *dentry, struct dentry *origin, + bool is_upper, bool set) { struct inode *inode; struct ovl_fh *fh; @@ -384,13 +382,13 @@ fail: * OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path. * Return 0 on match, -ESTALE on mismatch or stale origin, < 0 on error. */ -int ovl_verify_index(struct dentry *index, struct path *lowerstack, +int ovl_verify_index(struct dentry *index, struct ovl_path *lower, unsigned int numlower) { struct ovl_fh *fh = NULL; size_t len; - struct path origin = { }; - struct path *stack = &origin; + struct ovl_path origin = { }; + struct ovl_path *stack = &origin; unsigned int ctr = 0; int err; @@ -429,7 +427,7 @@ int ovl_verify_index(struct dentry *index, struct path *lowerstack, if (err) goto fail; - err = ovl_check_origin(index, lowerstack, numlower, &stack, &ctr); + err = ovl_check_origin(index, lower, numlower, &stack, &ctr); if (!err && !ctr) err = -ESTALE; if (err) @@ -437,7 +435,7 @@ int ovl_verify_index(struct dentry *index, struct path *lowerstack, /* Check if index is orphan and don't warn before cleaning it */ if (d_inode(index)->i_nlink == 1 && - ovl_get_nlink(index, origin.dentry, 0) == 0) + ovl_get_nlink(origin.dentry, index, 0) == 0) err = -ENOENT; dput(origin.dentry); @@ -568,11 +566,24 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path) idx++; } BUG_ON(idx > oe->numlower); - *path = oe->lowerstack[idx - 1]; + path->dentry = oe->lowerstack[idx - 1].dentry; + path->mnt = oe->lowerstack[idx - 1].layer->mnt; return (idx < oe->numlower) ? idx + 1 : -1; } +static int ovl_find_layer(struct ovl_fs *ofs, struct ovl_path *path) +{ + int i; + + for (i = 0; i < ofs->numlower; i++) { + if (ofs->lower_layers[i].mnt == path->layer->mnt) + break; + } + + return i; +} + struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { @@ -581,7 +592,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, struct ovl_fs *ofs = dentry->d_sb->s_fs_info; struct ovl_entry *poe = dentry->d_parent->d_fsdata; struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata; - struct path *stack = NULL; + struct ovl_path *stack = NULL; struct dentry *upperdir, *upperdentry = NULL; struct dentry *index = NULL; unsigned int ctr = 0; @@ -630,7 +641,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, err = ovl_check_origin(upperdentry, roe->lowerstack, roe->numlower, &stack, &ctr); if (err) - goto out; + goto out_put_upper; } if (d.redirect) { @@ -646,17 +657,17 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (!d.stop && poe->numlower) { err = -ENOMEM; - stack = kcalloc(ofs->numlower, sizeof(struct path), + stack = kcalloc(ofs->numlower, sizeof(struct ovl_path), GFP_KERNEL); if (!stack) goto out_put_upper; } for (i = 0; !d.stop && i < poe->numlower; i++) { - struct path lowerpath = poe->lowerstack[i]; + struct ovl_path lower = poe->lowerstack[i]; d.last = i == poe->numlower - 1; - err = ovl_lookup_layer(lowerpath.dentry, &d, &this); + err = ovl_lookup_layer(lower.dentry, &d, &this); if (err) goto out_put; @@ -664,20 +675,34 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, continue; stack[ctr].dentry = this; - stack[ctr].mnt = lowerpath.mnt; + stack[ctr].layer = lower.layer; ctr++; if (d.stop) break; + /* + * Following redirects can have security consequences: it's like + * a symlink into the lower layer without the permission checks. + * This is only a problem if the upper layer is untrusted (e.g + * comes from an USB drive). This can allow a non-readable file + * or directory to become readable. + * + * Only following redirects when redirects are enabled disables + * this attack vector when not necessary. + */ + err = -EPERM; + if (d.redirect && !ofs->config.redirect_follow) { + pr_warn_ratelimited("overlay: refusing to follow redirect for (%pd2)\n", dentry); + goto out_put; + } + if (d.redirect && d.redirect[0] == '/' && poe != roe) { poe = roe; /* Find the current layer on the root dentry */ - for (i = 0; i < poe->numlower; i++) - if (poe->lowerstack[i].mnt == lowerpath.mnt) - break; - if (WARN_ON(i == poe->numlower)) + i = ovl_find_layer(ofs, &lower); + if (WARN_ON(i == ofs->numlower)) break; } } @@ -700,7 +725,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out_put; oe->opaque = upperopaque; - memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); + memcpy(oe->lowerstack, stack, sizeof(struct ovl_path) * ctr); dentry->d_fsdata = oe; if (upperdentry) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index d9a0edd4e57e..b489099ccd49 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/uuid.h> +#include "ovl_entry.h" enum ovl_path_type { __OVL_PATH_UPPER = (1 << 0), @@ -28,7 +29,10 @@ enum ovl_path_type { #define OVL_XATTR_NLINK OVL_XATTR_PREFIX "nlink" enum ovl_flag { + /* Pure upper dir that may contain non pure upper entries */ OVL_IMPURE, + /* Non-merge dir that may contain whiteout entries */ + OVL_WHITEOUTS, OVL_INDEX, }; @@ -176,7 +180,7 @@ static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode) { struct dentry *ret = vfs_tmpfile(dentry, mode, 0); - int err = IS_ERR(ret) ? PTR_ERR(ret) : 0; + int err = PTR_ERR_OR_ZERO(ret); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); return ret; @@ -223,6 +227,7 @@ bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry); void ovl_copy_up_end(struct dentry *dentry); +bool ovl_check_origin_xattr(struct dentry *dentry); bool ovl_check_dir_xattr(struct dentry *dentry, const char *name); int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, const char *name, const void *value, size_t size, @@ -244,9 +249,9 @@ static inline bool ovl_is_impuredir(struct dentry *dentry) /* namei.c */ -int ovl_verify_origin(struct dentry *dentry, struct vfsmount *mnt, - struct dentry *origin, bool is_upper, bool set); -int ovl_verify_index(struct dentry *index, struct path *lowerstack, +int ovl_verify_origin(struct dentry *dentry, struct dentry *origin, + bool is_upper, bool set); +int ovl_verify_index(struct dentry *index, struct ovl_path *lower, unsigned int numlower); int ovl_get_index_name(struct dentry *origin, struct qstr *name); int ovl_path_next(int idx, struct dentry *dentry, struct path *path); @@ -263,7 +268,7 @@ int ovl_check_d_type_supported(struct path *realpath); void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, struct dentry *dentry, int level); int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, - struct path *lowerstack, unsigned int numlower); + struct ovl_path *lower, unsigned int numlower); /* inode.c */ int ovl_set_nlink_upper(struct dentry *dentry); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 36b49bd09264..9d0bc03bf6e4 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -14,14 +14,26 @@ struct ovl_config { char *workdir; bool default_permissions; bool redirect_dir; + bool redirect_follow; + const char *redirect_mode; bool index; }; +struct ovl_layer { + struct vfsmount *mnt; + dev_t pseudo_dev; +}; + +struct ovl_path { + struct ovl_layer *layer; + struct dentry *dentry; +}; + /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; unsigned numlower; - struct vfsmount **lower_mnt; + struct ovl_layer *lower_layers; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; /* workdir is the 'work' directory under workbasedir */ @@ -52,7 +64,7 @@ struct ovl_entry { struct rcu_head rcu; }; unsigned numlower; - struct path lowerstack[]; + struct ovl_path lowerstack[]; }; struct ovl_entry *ovl_alloc_entry(unsigned int numlower); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index c310e3ff7f3f..8c98578d27a1 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -26,6 +26,7 @@ struct ovl_cache_entry { struct list_head l_node; struct rb_node node; struct ovl_cache_entry *next_maybe_whiteout; + bool is_upper; bool is_whiteout; char name[]; }; @@ -158,6 +159,7 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, /* Defer setting d_ino for upper entry to ovl_iterate() */ if (ovl_calc_d_ino(rdd, p)) p->ino = 0; + p->is_upper = rdd->is_upper; p->is_whiteout = false; if (d_type == DT_CHR) { @@ -316,21 +318,37 @@ static inline int ovl_dir_read(struct path *realpath, return err; } +/* + * Can we iterate real dir directly? + * + * Non-merge dir may contain whiteouts from a time it was a merge upper, before + * lower dir was removed under it and possibly before it was rotated from upper + * to lower layer. + */ +static bool ovl_dir_is_real(struct dentry *dir) +{ + return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir)); +} + static void ovl_dir_reset(struct file *file) { struct ovl_dir_file *od = file->private_data; struct ovl_dir_cache *cache = od->cache; struct dentry *dentry = file->f_path.dentry; - enum ovl_path_type type = ovl_path_type(dentry); + bool is_real; if (cache && ovl_dentry_version_get(dentry) != cache->version) { ovl_cache_put(od, dentry); od->cache = NULL; od->cursor = NULL; } - WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type)); - if (od->is_real && OVL_TYPE_MERGE(type)) + is_real = ovl_dir_is_real(dentry); + if (od->is_real != is_real) { + /* is_real can only become false when dir is copied up */ + if (WARN_ON(is_real)) + return; od->is_real = false; + } } static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, @@ -481,7 +499,7 @@ out: return err; fail: - pr_warn_ratelimited("overlay: failed to look up (%s) for ino (%i)\n", + pr_warn_ratelimited("overlayfs: failed to look up (%s) for ino (%i)\n", p->name, err); goto out; } @@ -645,7 +663,10 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) return PTR_ERR(rdt.cache); } - return iterate_dir(od->realfile, &rdt.ctx); + err = iterate_dir(od->realfile, &rdt.ctx); + ctx->pos = rdt.ctx.pos; + + return err; } @@ -816,7 +837,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file) return PTR_ERR(realfile); } od->realfile = realfile; - od->is_real = !OVL_TYPE_MERGE(type); + od->is_real = ovl_dir_is_real(file->f_path.dentry); od->is_upper = OVL_TYPE_UPPER(type); file->private_data = od; @@ -835,7 +856,7 @@ const struct file_operations ovl_dir_operations = { int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) { int err; - struct ovl_cache_entry *p; + struct ovl_cache_entry *p, *n; struct rb_root root = RB_ROOT; err = ovl_dir_read_merged(dentry, list, &root); @@ -844,18 +865,29 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) err = 0; - list_for_each_entry(p, list, l_node) { - if (p->is_whiteout) - continue; + list_for_each_entry_safe(p, n, list, l_node) { + /* + * Select whiteouts in upperdir, they should + * be cleared when deleting this directory. + */ + if (p->is_whiteout) { + if (p->is_upper) + continue; + goto del_entry; + } if (p->name[0] == '.') { if (p->len == 1) - continue; + goto del_entry; if (p->len == 2 && p->name[1] == '.') - continue; + goto del_entry; } err = -ENOTEMPTY; break; + +del_entry: + list_del(&p->l_node); + kfree(p); } return err; @@ -869,7 +901,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) list_for_each_entry(p, list, l_node) { struct dentry *dentry; - if (!p->is_whiteout) + if (WARN_ON(!p->is_whiteout || !p->is_upper)) continue; dentry = lookup_one_len(p->name, upper, p->len); @@ -985,7 +1017,7 @@ void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, } int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, - struct path *lowerstack, unsigned int numlower) + struct ovl_path *lower, unsigned int numlower) { int err; struct dentry *index = NULL; @@ -1020,7 +1052,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, index = NULL; break; } - err = ovl_verify_index(index, lowerstack, numlower); + err = ovl_verify_index(index, lower, numlower); /* Cleanup stale and orphan index entries */ if (err && (err == -ESTALE || err == -ENOENT)) err = ovl_cleanup(dir, index); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index f5738e96a052..76440feb79f6 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -18,7 +18,6 @@ #include <linux/seq_file.h> #include <linux/posix_acl_xattr.h> #include "overlayfs.h" -#include "ovl_entry.h" MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Overlay filesystem"); @@ -34,20 +33,32 @@ module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644); MODULE_PARM_DESC(ovl_redirect_dir_def, "Default to on or off for the redirect_dir feature"); +static bool ovl_redirect_always_follow = + IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW); +module_param_named(redirect_always_follow, ovl_redirect_always_follow, + bool, 0644); +MODULE_PARM_DESC(ovl_redirect_always_follow, + "Follow redirects even if redirect_dir feature is turned off"); + static bool ovl_index_def = IS_ENABLED(CONFIG_OVERLAY_FS_INDEX); module_param_named(index, ovl_index_def, bool, 0644); MODULE_PARM_DESC(ovl_index_def, "Default to on or off for the inodes index feature"); +static void ovl_entry_stack_free(struct ovl_entry *oe) +{ + unsigned int i; + + for (i = 0; i < oe->numlower; i++) + dput(oe->lowerstack[i].dentry); +} + static void ovl_dentry_release(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; if (oe) { - unsigned int i; - - for (i = 0; i < oe->numlower; i++) - dput(oe->lowerstack[i].dentry); + ovl_entry_stack_free(oe); kfree_rcu(oe, rcu); } } @@ -207,46 +218,67 @@ static void ovl_destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, ovl_i_callback); } -static void ovl_put_super(struct super_block *sb) +static void ovl_free_fs(struct ovl_fs *ofs) { - struct ovl_fs *ufs = sb->s_fs_info; unsigned i; - dput(ufs->indexdir); - dput(ufs->workdir); - if (ufs->workdir_locked) - ovl_inuse_unlock(ufs->workbasedir); - dput(ufs->workbasedir); - if (ufs->upper_mnt && ufs->upperdir_locked) - ovl_inuse_unlock(ufs->upper_mnt->mnt_root); - mntput(ufs->upper_mnt); - for (i = 0; i < ufs->numlower; i++) - mntput(ufs->lower_mnt[i]); - kfree(ufs->lower_mnt); - - kfree(ufs->config.lowerdir); - kfree(ufs->config.upperdir); - kfree(ufs->config.workdir); - put_cred(ufs->creator_cred); - kfree(ufs); + dput(ofs->indexdir); + dput(ofs->workdir); + if (ofs->workdir_locked) + ovl_inuse_unlock(ofs->workbasedir); + dput(ofs->workbasedir); + if (ofs->upperdir_locked) + ovl_inuse_unlock(ofs->upper_mnt->mnt_root); + mntput(ofs->upper_mnt); + for (i = 0; i < ofs->numlower; i++) { + mntput(ofs->lower_layers[i].mnt); + free_anon_bdev(ofs->lower_layers[i].pseudo_dev); + } + kfree(ofs->lower_layers); + + kfree(ofs->config.lowerdir); + kfree(ofs->config.upperdir); + kfree(ofs->config.workdir); + kfree(ofs->config.redirect_mode); + if (ofs->creator_cred) + put_cred(ofs->creator_cred); + kfree(ofs); } +static void ovl_put_super(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + ovl_free_fs(ofs); +} + +/* Sync real dirty inodes in upper filesystem (if it exists) */ static int ovl_sync_fs(struct super_block *sb, int wait) { - struct ovl_fs *ufs = sb->s_fs_info; + struct ovl_fs *ofs = sb->s_fs_info; struct super_block *upper_sb; int ret; - if (!ufs->upper_mnt) + if (!ofs->upper_mnt) return 0; - upper_sb = ufs->upper_mnt->mnt_sb; - if (!upper_sb->s_op->sync_fs) + + /* + * If this is a sync(2) call or an emergency sync, all the super blocks + * will be iterated, including upper_sb, so no need to do anything. + * + * If this is a syncfs(2) call, then we do need to call + * sync_filesystem() on upper_sb, but enough if we do it when being + * called with wait == 1. + */ + if (!wait) return 0; - /* real inodes have already been synced by sync_filesystem(ovl_sb) */ + upper_sb = ofs->upper_mnt->mnt_sb; + down_read(&upper_sb->s_umount); - ret = upper_sb->s_op->sync_fs(upper_sb, wait); + ret = sync_filesystem(upper_sb); up_read(&upper_sb->s_umount); + return ret; } @@ -277,9 +309,14 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) } /* Will this overlay be forced to mount/remount ro? */ -static bool ovl_force_readonly(struct ovl_fs *ufs) +static bool ovl_force_readonly(struct ovl_fs *ofs) +{ + return (!ofs->upper_mnt || !ofs->workdir); +} + +static const char *ovl_redirect_mode_def(void) { - return (!ufs->upper_mnt || !ufs->workdir); + return ovl_redirect_dir_def ? "on" : "off"; } /** @@ -291,29 +328,27 @@ static bool ovl_force_readonly(struct ovl_fs *ufs) static int ovl_show_options(struct seq_file *m, struct dentry *dentry) { struct super_block *sb = dentry->d_sb; - struct ovl_fs *ufs = sb->s_fs_info; + struct ovl_fs *ofs = sb->s_fs_info; - seq_show_option(m, "lowerdir", ufs->config.lowerdir); - if (ufs->config.upperdir) { - seq_show_option(m, "upperdir", ufs->config.upperdir); - seq_show_option(m, "workdir", ufs->config.workdir); + seq_show_option(m, "lowerdir", ofs->config.lowerdir); + if (ofs->config.upperdir) { + seq_show_option(m, "upperdir", ofs->config.upperdir); + seq_show_option(m, "workdir", ofs->config.workdir); } - if (ufs->config.default_permissions) + if (ofs->config.default_permissions) seq_puts(m, ",default_permissions"); - if (ufs->config.redirect_dir != ovl_redirect_dir_def) - seq_printf(m, ",redirect_dir=%s", - ufs->config.redirect_dir ? "on" : "off"); - if (ufs->config.index != ovl_index_def) - seq_printf(m, ",index=%s", - ufs->config.index ? "on" : "off"); + if (strcmp(ofs->config.redirect_mode, ovl_redirect_mode_def()) != 0) + seq_printf(m, ",redirect_dir=%s", ofs->config.redirect_mode); + if (ofs->config.index != ovl_index_def) + seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); return 0; } static int ovl_remount(struct super_block *sb, int *flags, char *data) { - struct ovl_fs *ufs = sb->s_fs_info; + struct ovl_fs *ofs = sb->s_fs_info; - if (!(*flags & MS_RDONLY) && ovl_force_readonly(ufs)) + if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs)) return -EROFS; return 0; @@ -335,8 +370,7 @@ enum { OPT_UPPERDIR, OPT_WORKDIR, OPT_DEFAULT_PERMISSIONS, - OPT_REDIRECT_DIR_ON, - OPT_REDIRECT_DIR_OFF, + OPT_REDIRECT_DIR, OPT_INDEX_ON, OPT_INDEX_OFF, OPT_ERR, @@ -347,8 +381,7 @@ static const match_table_t ovl_tokens = { {OPT_UPPERDIR, "upperdir=%s"}, {OPT_WORKDIR, "workdir=%s"}, {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, - {OPT_REDIRECT_DIR_ON, "redirect_dir=on"}, - {OPT_REDIRECT_DIR_OFF, "redirect_dir=off"}, + {OPT_REDIRECT_DIR, "redirect_dir=%s"}, {OPT_INDEX_ON, "index=on"}, {OPT_INDEX_OFF, "index=off"}, {OPT_ERR, NULL} @@ -377,10 +410,37 @@ static char *ovl_next_opt(char **s) return sbegin; } +static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode) +{ + if (strcmp(mode, "on") == 0) { + config->redirect_dir = true; + /* + * Does not make sense to have redirect creation without + * redirect following. + */ + config->redirect_follow = true; + } else if (strcmp(mode, "follow") == 0) { + config->redirect_follow = true; + } else if (strcmp(mode, "off") == 0) { + if (ovl_redirect_always_follow) + config->redirect_follow = true; + } else if (strcmp(mode, "nofollow") != 0) { + pr_err("overlayfs: bad mount option \"redirect_dir=%s\"\n", + mode); + return -EINVAL; + } + + return 0; +} + static int ovl_parse_opt(char *opt, struct ovl_config *config) { char *p; + config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL); + if (!config->redirect_mode) + return -ENOMEM; + while ((p = ovl_next_opt(&opt)) != NULL) { int token; substring_t args[MAX_OPT_ARGS]; @@ -415,12 +475,11 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->default_permissions = true; break; - case OPT_REDIRECT_DIR_ON: - config->redirect_dir = true; - break; - - case OPT_REDIRECT_DIR_OFF: - config->redirect_dir = false; + case OPT_REDIRECT_DIR: + kfree(config->redirect_mode); + config->redirect_mode = match_strdup(&args[0]); + if (!config->redirect_mode) + return -ENOMEM; break; case OPT_INDEX_ON: @@ -445,19 +504,17 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->workdir = NULL; } - return 0; + return ovl_parse_redirect_mode(config, config->redirect_mode); } #define OVL_WORKDIR_NAME "work" #define OVL_INDEXDIR_NAME "index" -static struct dentry *ovl_workdir_create(struct super_block *sb, - struct ovl_fs *ufs, - struct dentry *dentry, +static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, const char *name, bool persist) { - struct inode *dir = dentry->d_inode; - struct vfsmount *mnt = ufs->upper_mnt; + struct inode *dir = ofs->workbasedir->d_inode; + struct vfsmount *mnt = ofs->upper_mnt; struct dentry *work; int err; bool retried = false; @@ -471,7 +528,7 @@ static struct dentry *ovl_workdir_create(struct super_block *sb, locked = true; retry: - work = lookup_one_len(name, dentry, strlen(name)); + work = lookup_one_len(name, ofs->workbasedir, strlen(name)); if (!IS_ERR(work)) { struct iattr attr = { @@ -541,8 +598,7 @@ out_dput: dput(work); out_err: pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n", - ufs->config.workdir, name, -err); - sb->s_flags |= MS_RDONLY; + ofs->config.workdir, name, -err); work = NULL; goto out_unlock; } @@ -585,7 +641,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) return 0; out_put: - path_put(path); + path_put_init(path); out: return err; } @@ -603,7 +659,7 @@ static int ovl_mount_dir(const char *name, struct path *path) if (ovl_dentry_remote(path->dentry)) { pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n", tmp); - path_put(path); + path_put_init(path); err = -EINVAL; } kfree(tmp); @@ -655,7 +711,7 @@ static int ovl_lower_dir(const char *name, struct path *path, return 0; out_put: - path_put(path); + path_put_init(path); out: return err; } @@ -826,129 +882,269 @@ static const struct xattr_handler *ovl_xattr_handlers[] = { NULL }; -static int ovl_fill_super(struct super_block *sb, void *data, int silent) +static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath) { - struct path upperpath = { }; - struct path workpath = { }; - struct dentry *root_dentry; - struct ovl_entry *oe; - struct ovl_fs *ufs; - struct path *stack = NULL; - char *lowertmp; - char *lower; - unsigned int numlower; - unsigned int stacklen = 0; - unsigned int i; - bool remote = false; - struct cred *cred; + struct vfsmount *upper_mnt; int err; - err = -ENOMEM; - ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); - if (!ufs) + err = ovl_mount_dir(ofs->config.upperdir, upperpath); + if (err) goto out; - ufs->config.redirect_dir = ovl_redirect_dir_def; - ufs->config.index = ovl_index_def; - err = ovl_parse_opt((char *) data, &ufs->config); + /* Upper fs should not be r/o */ + if (sb_rdonly(upperpath->mnt->mnt_sb)) { + pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); + err = -EINVAL; + goto out; + } + + err = ovl_check_namelen(upperpath, ofs, ofs->config.upperdir); if (err) - goto out_free_config; + goto out; + + err = -EBUSY; + if (ovl_inuse_trylock(upperpath->dentry)) { + ofs->upperdir_locked = true; + } else if (ofs->config.index) { + pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); + goto out; + } else { + pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); + } + + upper_mnt = clone_private_mount(upperpath); + err = PTR_ERR(upper_mnt); + if (IS_ERR(upper_mnt)) { + pr_err("overlayfs: failed to clone upperpath\n"); + goto out; + } + + /* Don't inherit atime flags */ + upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); + ofs->upper_mnt = upper_mnt; + err = 0; +out: + return err; +} + +static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) +{ + struct dentry *temp; + int err; + + ofs->workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false); + if (!ofs->workdir) + return 0; + + /* + * Upper should support d_type, else whiteouts are visible. Given + * workdir and upper are on same fs, we can do iterate_dir() on + * workdir. This check requires successful creation of workdir in + * previous step. + */ + err = ovl_check_d_type_supported(workpath); + if (err < 0) + return err; + + /* + * We allowed this configuration and don't want to break users over + * kernel upgrade. So warn instead of erroring out. + */ + if (!err) + pr_warn("overlayfs: upper fs needs to support d_type.\n"); + + /* Check if upper/work fs supports O_TMPFILE */ + temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0); + ofs->tmpfile = !IS_ERR(temp); + if (ofs->tmpfile) + dput(temp); + else + pr_warn("overlayfs: upper fs does not support tmpfile.\n"); + + /* + * Check if upper/work fs supports trusted.overlay.* xattr + */ + err = ovl_do_setxattr(ofs->workdir, OVL_XATTR_OPAQUE, "0", 1, 0); + if (err) { + ofs->noxattr = true; + pr_warn("overlayfs: upper fs does not support xattr.\n"); + } else { + vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE); + } + + /* Check if upper/work fs supports file handles */ + if (ofs->config.index && + !ovl_can_decode_fh(ofs->workdir->d_sb)) { + ofs->config.index = false; + pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n"); + } + + return 0; +} + +static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath) +{ + int err; + struct path workpath = { }; + + err = ovl_mount_dir(ofs->config.workdir, &workpath); + if (err) + goto out; err = -EINVAL; - if (!ufs->config.lowerdir) { - if (!silent) - pr_err("overlayfs: missing 'lowerdir'\n"); - goto out_free_config; + if (upperpath->mnt != workpath.mnt) { + pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + goto out; + } + if (!ovl_workdir_ok(workpath.dentry, upperpath->dentry)) { + pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + goto out; } - sb->s_stack_depth = 0; - sb->s_maxbytes = MAX_LFS_FILESIZE; - if (ufs->config.upperdir) { - if (!ufs->config.workdir) { - pr_err("overlayfs: missing 'workdir'\n"); - goto out_free_config; - } + err = -EBUSY; + if (ovl_inuse_trylock(workpath.dentry)) { + ofs->workdir_locked = true; + } else if (ofs->config.index) { + pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n"); + goto out; + } else { + pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); + } - err = ovl_mount_dir(ufs->config.upperdir, &upperpath); - if (err) - goto out_free_config; + ofs->workbasedir = dget(workpath.dentry); + err = ovl_make_workdir(ofs, &workpath); + if (err) + goto out; - /* Upper fs should not be r/o */ - if (sb_rdonly(upperpath.mnt->mnt_sb)) { - pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); - err = -EINVAL; - goto out_put_upperpath; - } + err = 0; +out: + path_put(&workpath); - err = ovl_check_namelen(&upperpath, ufs, ufs->config.upperdir); - if (err) - goto out_put_upperpath; - - err = -EBUSY; - if (ovl_inuse_trylock(upperpath.dentry)) { - ufs->upperdir_locked = true; - } else if (ufs->config.index) { - pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); - goto out_put_upperpath; - } else { - pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); - } + return err; +} + +static int ovl_get_indexdir(struct ovl_fs *ofs, struct ovl_entry *oe, + struct path *upperpath) +{ + int err; + + /* Verify lower root is upper root origin */ + err = ovl_verify_origin(upperpath->dentry, oe->lowerstack[0].dentry, + false, true); + if (err) { + pr_err("overlayfs: failed to verify upper root origin\n"); + goto out; + } - err = ovl_mount_dir(ufs->config.workdir, &workpath); + ofs->indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true); + if (ofs->indexdir) { + /* Verify upper root is index dir origin */ + err = ovl_verify_origin(ofs->indexdir, upperpath->dentry, + true, true); if (err) - goto out_unlock_upperdentry; + pr_err("overlayfs: failed to verify index dir origin\n"); - err = -EINVAL; - if (upperpath.mnt != workpath.mnt) { - pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); - goto out_put_workpath; - } - if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { - pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); - goto out_put_workpath; + /* Cleanup bad/stale/orphan index entries */ + if (!err) + err = ovl_indexdir_cleanup(ofs->indexdir, + ofs->upper_mnt, + oe->lowerstack, + oe->numlower); + } + if (err || !ofs->indexdir) + pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); + +out: + return err; +} + +static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, + unsigned int numlower) +{ + int err; + unsigned int i; + + err = -ENOMEM; + ofs->lower_layers = kcalloc(numlower, sizeof(struct ovl_layer), + GFP_KERNEL); + if (ofs->lower_layers == NULL) + goto out; + for (i = 0; i < numlower; i++) { + struct vfsmount *mnt; + dev_t dev; + + err = get_anon_bdev(&dev); + if (err) { + pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n"); + goto out; } - err = -EBUSY; - if (ovl_inuse_trylock(workpath.dentry)) { - ufs->workdir_locked = true; - } else if (ufs->config.index) { - pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n"); - goto out_put_workpath; - } else { - pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); + mnt = clone_private_mount(&stack[i]); + err = PTR_ERR(mnt); + if (IS_ERR(mnt)) { + pr_err("overlayfs: failed to clone lowerpath\n"); + free_anon_bdev(dev); + goto out; } + /* + * Make lower layers R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. + */ + mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; - ufs->workbasedir = workpath.dentry; - sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth; + ofs->lower_layers[ofs->numlower].mnt = mnt; + ofs->lower_layers[ofs->numlower].pseudo_dev = dev; + ofs->numlower++; + + /* Check if all lower layers are on same sb */ + if (i == 0) + ofs->same_sb = mnt->mnt_sb; + else if (ofs->same_sb != mnt->mnt_sb) + ofs->same_sb = NULL; } + err = 0; +out: + return err; +} + +static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, + struct ovl_fs *ofs) +{ + int err; + char *lowertmp, *lower; + struct path *stack = NULL; + unsigned int stacklen, numlower = 0, i; + bool remote = false; + struct ovl_entry *oe; + err = -ENOMEM; - lowertmp = kstrdup(ufs->config.lowerdir, GFP_KERNEL); + lowertmp = kstrdup(ofs->config.lowerdir, GFP_KERNEL); if (!lowertmp) - goto out_unlock_workdentry; + goto out_err; err = -EINVAL; stacklen = ovl_split_lowerdirs(lowertmp); if (stacklen > OVL_MAX_STACK) { pr_err("overlayfs: too many lower directories, limit is %d\n", OVL_MAX_STACK); - goto out_free_lowertmp; - } else if (!ufs->config.upperdir && stacklen == 1) { + goto out_err; + } else if (!ofs->config.upperdir && stacklen == 1) { pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n"); - goto out_free_lowertmp; + goto out_err; } err = -ENOMEM; stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); if (!stack) - goto out_free_lowertmp; + goto out_err; err = -EINVAL; lower = lowertmp; for (numlower = 0; numlower < stacklen; numlower++) { - err = ovl_lower_dir(lower, &stack[numlower], ufs, + err = ovl_lower_dir(lower, &stack[numlower], ofs, &sb->s_stack_depth, &remote); if (err) - goto out_put_lowerpath; + goto out_err; lower = strchr(lower, '\0') + 1; } @@ -957,190 +1153,143 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth++; if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { pr_err("overlayfs: maximum fs stacking depth exceeded\n"); - goto out_put_lowerpath; + goto out_err; } - if (ufs->config.upperdir) { - ufs->upper_mnt = clone_private_mount(&upperpath); - err = PTR_ERR(ufs->upper_mnt); - if (IS_ERR(ufs->upper_mnt)) { - pr_err("overlayfs: failed to clone upperpath\n"); - goto out_put_lowerpath; - } + err = ovl_get_lower_layers(ofs, stack, numlower); + if (err) + goto out_err; - /* Don't inherit atime flags */ - ufs->upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); + err = -ENOMEM; + oe = ovl_alloc_entry(numlower); + if (!oe) + goto out_err; - sb->s_time_gran = ufs->upper_mnt->mnt_sb->s_time_gran; + for (i = 0; i < numlower; i++) { + oe->lowerstack[i].dentry = dget(stack[i].dentry); + oe->lowerstack[i].layer = &ofs->lower_layers[i]; + } - ufs->workdir = ovl_workdir_create(sb, ufs, workpath.dentry, - OVL_WORKDIR_NAME, false); - /* - * Upper should support d_type, else whiteouts are visible. - * Given workdir and upper are on same fs, we can do - * iterate_dir() on workdir. This check requires successful - * creation of workdir in previous step. - */ - if (ufs->workdir) { - struct dentry *temp; - - err = ovl_check_d_type_supported(&workpath); - if (err < 0) - goto out_put_workdir; - - /* - * We allowed this configuration and don't want to - * break users over kernel upgrade. So warn instead - * of erroring out. - */ - if (!err) - pr_warn("overlayfs: upper fs needs to support d_type.\n"); - - /* Check if upper/work fs supports O_TMPFILE */ - temp = ovl_do_tmpfile(ufs->workdir, S_IFREG | 0); - ufs->tmpfile = !IS_ERR(temp); - if (ufs->tmpfile) - dput(temp); - else - pr_warn("overlayfs: upper fs does not support tmpfile.\n"); - - /* - * Check if upper/work fs supports trusted.overlay.* - * xattr - */ - err = ovl_do_setxattr(ufs->workdir, OVL_XATTR_OPAQUE, - "0", 1, 0); - if (err) { - ufs->noxattr = true; - pr_warn("overlayfs: upper fs does not support xattr.\n"); - } else { - vfs_removexattr(ufs->workdir, OVL_XATTR_OPAQUE); - } + if (remote) + sb->s_d_op = &ovl_reval_dentry_operations; + else + sb->s_d_op = &ovl_dentry_operations; - /* Check if upper/work fs supports file handles */ - if (ufs->config.index && - !ovl_can_decode_fh(ufs->workdir->d_sb)) { - ufs->config.index = false; - pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n"); - } - } - } +out: + for (i = 0; i < numlower; i++) + path_put(&stack[i]); + kfree(stack); + kfree(lowertmp); + + return oe; + +out_err: + oe = ERR_PTR(err); + goto out; +} + +static int ovl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct path upperpath = { }; + struct dentry *root_dentry; + struct ovl_entry *oe; + struct ovl_fs *ofs; + struct cred *cred; + int err; err = -ENOMEM; - ufs->lower_mnt = kcalloc(numlower, sizeof(struct vfsmount *), GFP_KERNEL); - if (ufs->lower_mnt == NULL) - goto out_put_workdir; - for (i = 0; i < numlower; i++) { - struct vfsmount *mnt = clone_private_mount(&stack[i]); + ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ofs) + goto out; - err = PTR_ERR(mnt); - if (IS_ERR(mnt)) { - pr_err("overlayfs: failed to clone lowerpath\n"); - goto out_put_lower_mnt; - } - /* - * Make lower_mnt R/O. That way fchmod/fchown on lower file - * will fail instead of modifying lower fs. - */ - mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; + ofs->creator_cred = cred = prepare_creds(); + if (!cred) + goto out_err; - ufs->lower_mnt[ufs->numlower] = mnt; - ufs->numlower++; + ofs->config.index = ovl_index_def; + err = ovl_parse_opt((char *) data, &ofs->config); + if (err) + goto out_err; - /* Check if all lower layers are on same sb */ - if (i == 0) - ufs->same_sb = mnt->mnt_sb; - else if (ufs->same_sb != mnt->mnt_sb) - ufs->same_sb = NULL; + err = -EINVAL; + if (!ofs->config.lowerdir) { + if (!silent) + pr_err("overlayfs: missing 'lowerdir'\n"); + goto out_err; } - /* If the upper fs is nonexistent, we mark overlayfs r/o too */ - if (!ufs->upper_mnt) - sb->s_flags |= MS_RDONLY; - else if (ufs->upper_mnt->mnt_sb != ufs->same_sb) - ufs->same_sb = NULL; - - if (!(ovl_force_readonly(ufs)) && ufs->config.index) { - /* Verify lower root is upper root origin */ - err = ovl_verify_origin(upperpath.dentry, ufs->lower_mnt[0], - stack[0].dentry, false, true); - if (err) { - pr_err("overlayfs: failed to verify upper root origin\n"); - goto out_put_lower_mnt; + sb->s_stack_depth = 0; + sb->s_maxbytes = MAX_LFS_FILESIZE; + if (ofs->config.upperdir) { + if (!ofs->config.workdir) { + pr_err("overlayfs: missing 'workdir'\n"); + goto out_err; } - ufs->indexdir = ovl_workdir_create(sb, ufs, workpath.dentry, - OVL_INDEXDIR_NAME, true); - if (ufs->indexdir) { - /* Verify upper root is index dir origin */ - err = ovl_verify_origin(ufs->indexdir, ufs->upper_mnt, - upperpath.dentry, true, true); - if (err) - pr_err("overlayfs: failed to verify index dir origin\n"); + err = ovl_get_upper(ofs, &upperpath); + if (err) + goto out_err; - /* Cleanup bad/stale/orphan index entries */ - if (!err) - err = ovl_indexdir_cleanup(ufs->indexdir, - ufs->upper_mnt, - stack, numlower); - } - if (err || !ufs->indexdir) - pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); + err = ovl_get_workdir(ofs, &upperpath); if (err) - goto out_put_indexdir; + goto out_err; + + if (!ofs->workdir) + sb->s_flags |= SB_RDONLY; + + sb->s_stack_depth = ofs->upper_mnt->mnt_sb->s_stack_depth; + sb->s_time_gran = ofs->upper_mnt->mnt_sb->s_time_gran; + } + oe = ovl_get_lowerstack(sb, ofs); + err = PTR_ERR(oe); + if (IS_ERR(oe)) + goto out_err; - /* Show index=off/on in /proc/mounts for any of the reasons above */ - if (!ufs->indexdir) - ufs->config.index = false; + /* If the upper fs is nonexistent, we mark overlayfs r/o too */ + if (!ofs->upper_mnt) + sb->s_flags |= SB_RDONLY; + else if (ofs->upper_mnt->mnt_sb != ofs->same_sb) + ofs->same_sb = NULL; - if (remote) - sb->s_d_op = &ovl_reval_dentry_operations; - else - sb->s_d_op = &ovl_dentry_operations; + if (!(ovl_force_readonly(ofs)) && ofs->config.index) { + err = ovl_get_indexdir(ofs, oe, &upperpath); + if (err) + goto out_free_oe; - err = -ENOMEM; - ufs->creator_cred = cred = prepare_creds(); - if (!cred) - goto out_put_indexdir; + if (!ofs->indexdir) + sb->s_flags |= SB_RDONLY; + } + + /* Show index=off/on in /proc/mounts for any of the reasons above */ + if (!ofs->indexdir) + ofs->config.index = false; /* Never override disk quota limits or use reserved space */ cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); - err = -ENOMEM; - oe = ovl_alloc_entry(numlower); - if (!oe) - goto out_put_cred; - sb->s_magic = OVERLAYFS_SUPER_MAGIC; sb->s_op = &ovl_super_operations; sb->s_xattr = ovl_xattr_handlers; - sb->s_fs_info = ufs; - sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK; + sb->s_fs_info = ofs; + sb->s_flags |= SB_POSIXACL | SB_NOREMOTELOCK; + err = -ENOMEM; root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); if (!root_dentry) goto out_free_oe; mntput(upperpath.mnt); - for (i = 0; i < numlower; i++) - mntput(stack[i].mnt); - mntput(workpath.mnt); - kfree(lowertmp); - if (upperpath.dentry) { oe->has_upper = true; if (ovl_is_impuredir(upperpath.dentry)) ovl_set_flag(OVL_IMPURE, d_inode(root_dentry)); } - for (i = 0; i < numlower; i++) { - oe->lowerstack[i].dentry = stack[i].dentry; - oe->lowerstack[i].mnt = ufs->lower_mnt[i]; - } - kfree(stack); root_dentry->d_fsdata = oe; + /* Root is always merge -> can have whiteouts */ + ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry)); ovl_inode_init(d_inode(root_dentry), upperpath.dentry, ovl_dentry_lower(root_dentry)); @@ -1149,39 +1298,11 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) return 0; out_free_oe: + ovl_entry_stack_free(oe); kfree(oe); -out_put_cred: - put_cred(ufs->creator_cred); -out_put_indexdir: - dput(ufs->indexdir); -out_put_lower_mnt: - for (i = 0; i < ufs->numlower; i++) - mntput(ufs->lower_mnt[i]); - kfree(ufs->lower_mnt); -out_put_workdir: - dput(ufs->workdir); - mntput(ufs->upper_mnt); -out_put_lowerpath: - for (i = 0; i < numlower; i++) - path_put(&stack[i]); - kfree(stack); -out_free_lowertmp: - kfree(lowertmp); -out_unlock_workdentry: - if (ufs->workdir_locked) - ovl_inuse_unlock(workpath.dentry); -out_put_workpath: - path_put(&workpath); -out_unlock_upperdentry: - if (ufs->upperdir_locked) - ovl_inuse_unlock(upperpath.dentry); -out_put_upperpath: +out_err: path_put(&upperpath); -out_free_config: - kfree(ufs->config.lowerdir); - kfree(ufs->config.upperdir); - kfree(ufs->config.workdir); - kfree(ufs); + ovl_free_fs(ofs); out: return err; } diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index b9b239fa5cfd..d6bb1c9f5e7a 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -17,7 +17,6 @@ #include <linux/namei.h> #include <linux/ratelimit.h> #include "overlayfs.h" -#include "ovl_entry.h" int ovl_want_write(struct dentry *dentry) { @@ -125,7 +124,12 @@ void ovl_path_lower(struct dentry *dentry, struct path *path) { struct ovl_entry *oe = dentry->d_fsdata; - *path = oe->numlower ? oe->lowerstack[0] : (struct path) { }; + if (oe->numlower) { + path->mnt = oe->lowerstack[0].layer->mnt; + path->dentry = oe->lowerstack[0].dentry; + } else { + *path = (struct path) { }; + } } enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) @@ -329,6 +333,19 @@ void ovl_copy_up_end(struct dentry *dentry) mutex_unlock(&OVL_I(d_inode(dentry))->lock); } +bool ovl_check_origin_xattr(struct dentry *dentry) +{ + int res; + + res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0); + + /* Zero size value means "copied up but origin unknown" */ + if (res >= 0) + return true; + + return false; +} + bool ovl_check_dir_xattr(struct dentry *dentry, const char *name) { int res; diff --git a/fs/pipe.c b/fs/pipe.c index 349c9d56d4b3..6d98566201ef 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1018,13 +1018,19 @@ const struct file_operations pipefifo_fops = { /* * Currently we rely on the pipe array holding a power-of-2 number - * of pages. + * of pages. Returns 0 on error. */ -static inline unsigned int round_pipe_size(unsigned int size) +unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; + if (size < pipe_min_size) + size = pipe_min_size; + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages == 0) + return 0; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; } @@ -1040,6 +1046,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) long ret = 0; size = round_pipe_size(arg); + if (size == 0) + return -EINVAL; nr_pages = size >> PAGE_SHIFT; if (!nr_pages) @@ -1117,20 +1125,13 @@ out_revert_acct: } /* - * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * This should work even if CONFIG_PROC_FS isn't set, as proc_dopipe_max_size * will return an error. */ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { - int ret; - - ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); - if (ret < 0 || !write) - return ret; - - pipe_max_size = round_pipe_size(pipe_max_size); - return ret; + return proc_dopipe_max_size(table, write, buf, lenp, ppos); } /* diff --git a/fs/proc/Makefile b/fs/proc/Makefile index f7456c4e7d0f..ead487e80510 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -21,6 +21,7 @@ proc-y += loadavg.o proc-y += meminfo.o proc-y += stat.o proc-y += uptime.o +proc-y += util.o proc-y += version.o proc-y += softirqs.o proc-y += namespaces.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 9390032a11e1..d67a72dcb92c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -138,7 +138,7 @@ static const char * const task_state_array[] = { static inline const char *get_task_state(struct task_struct *tsk) { BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); - return task_state_array[__get_task_state(tsk)]; + return task_state_array[task_state_index(tsk)]; } static inline int get_task_umask(struct task_struct *tsk) @@ -366,6 +366,11 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) cpumask_pr_args(&task->cpus_allowed)); } +static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "CoreDumping:\t%d\n", !!mm->core_state); +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -376,6 +381,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, if (mm) { task_mem(m, mm); + task_core_dumping(m, mm); mmput(mm); } task_sig(m, task); @@ -424,8 +430,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * safe because the task has stopped executing permanently. */ if (permitted && (task->flags & PF_DUMPCORE)) { - eip = KSTK_EIP(task); - esp = KSTK_ESP(task); + if (try_get_task_stack(task)) { + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + put_task_stack(task); + } } } @@ -454,7 +463,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, cutime = sig->cutime; cstime = sig->cstime; cgtime = sig->cgtime; - rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); + rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); /* add up live thread stats at the group level */ if (whole) { diff --git a/fs/proc/base.c b/fs/proc/base.c index 9d357b2ea6cb..60316b52d659 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -443,8 +443,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, save_stack_trace_tsk(task, &trace); for (i = 0; i < trace.nr_entries; i++) { - seq_printf(m, "[<%pK>] %pB\n", - (void *)entries[i], (void *)entries[i]); + seq_printf(m, "[<0>] %pB\n", (void *)entries[i]); } unlock_trace(task); } @@ -1682,7 +1681,7 @@ const struct inode_operations proc_pid_link_inode_operations = { /* building an inode */ -void task_dump_owner(struct task_struct *task, mode_t mode, +void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid) { /* Depending on the state of dumpable compute who should own a @@ -2269,7 +2268,7 @@ static int show_timer(struct seq_file *m, void *v) notify = timer->it_sigev_notify; seq_printf(m, "ID: %d\n", timer->it_id); - seq_printf(m, "signal: %d/%p\n", + seq_printf(m, "signal: %d/%px\n", timer->sigq->info.si_signo, timer->sigq->info.si_value.sival_ptr); seq_printf(m, "notify: %s/%s.%d\n", diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index e0f867cd8553..96f1087e372c 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -1,12 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/cpufreq.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +__weak void arch_freq_prepare_all(void) +{ +} + extern const struct seq_operations cpuinfo_op; static int cpuinfo_open(struct inode *inode, struct file *file) { + arch_freq_prepare_all(); return seq_open(file, &cpuinfo_op); } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 225f541f7078..dd0f82622427 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -483,7 +483,7 @@ int proc_fill_super(struct super_block *s, void *data, int silent) /* User space would break if executables or devices appear on proc */ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; - s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; + s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = PROC_SUPER_MAGIC; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index a34195e92b20..4a67188c8d74 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -100,31 +100,10 @@ static inline struct task_struct *get_proc_task(struct inode *inode) return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -void task_dump_owner(struct task_struct *task, mode_t mode, +void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid); -static inline unsigned name_to_int(const struct qstr *qstr) -{ - const char *name = qstr->name; - int len = qstr->len; - unsigned n = 0; - - if (len > 1 && *name == '0') - goto out; - while (len-- > 0) { - unsigned c = *name++ - '0'; - if (c > 9) - goto out; - if (n >= (~0U-9)/10) - goto out; - n *= 10; - n += c; - } - return n; -out: - return ~0U; -} - +unsigned name_to_int(const struct qstr *qstr); /* * Offset of the first process in the /proc root directory.. */ diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 9bc5c58c00ee..a000d7547479 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -24,7 +24,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v) LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, - task_active_pid_ns(current)->last_pid); + idr_get_cursor(&task_active_pid_ns(current)->idr)); return 0; } diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 2da657848cfc..d0cf1c50bb6c 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -15,6 +15,7 @@ #include <linux/tty.h> #include <linux/seq_file.h> #include <linux/bitops.h> +#include "internal.h" /* * The /proc/tty directory inodes... @@ -165,7 +166,7 @@ void proc_tty_unregister_driver(struct tty_driver *driver) if (!ent) return; - remove_proc_entry(driver->driver_name, proc_tty_driver); + remove_proc_entry(ent->name, proc_tty_driver); driver->proc_entry = NULL; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 4e42aba97f2e..ede8e64974be 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -91,7 +91,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, { struct pid_namespace *ns; - if (flags & MS_KERNMOUNT) { + if (flags & SB_KERNMOUNT) { ns = data; data = NULL; } else { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6744bd706ecf..339e4c1c044d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -26,7 +26,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; + unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); @@ -50,8 +50,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); - pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -67,7 +65,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" - "VmPMD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -80,8 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) shmem << (PAGE_SHIFT-10), mm->data_vm << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - ptes >> 10, - pmds >> 10, + mm_pgtables_bytes(mm) >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); } @@ -665,6 +661,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_SYNC)] = "sf", [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", diff --git a/fs/proc/util.c b/fs/proc/util.c new file mode 100644 index 000000000000..b161cfa0f9fa --- /dev/null +++ b/fs/proc/util.c @@ -0,0 +1,23 @@ +#include <linux/dcache.h> + +unsigned name_to_int(const struct qstr *qstr) +{ + const char *name = qstr->name; + int len = qstr->len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + do { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } while (--len > 0); + return n; +out: + return ~0U; +} diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 7626ee11b06c..b786840facd9 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -28,7 +28,7 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) poll_wait(file, &p->ns->poll, wait); - event = ACCESS_ONCE(ns->event); + event = READ_ONCE(ns->event); if (m->poll_event != event) { m->poll_event = event; res |= POLLERR | POLLPRI; @@ -45,10 +45,10 @@ struct proc_fs_info { static int show_sb_opts(struct seq_file *m, struct super_block *sb) { static const struct proc_fs_info fs_info[] = { - { MS_SYNCHRONOUS, ",sync" }, - { MS_DIRSYNC, ",dirsync" }, - { MS_MANDLOCK, ",mand" }, - { MS_LAZYTIME, ",lazytime" }, + { SB_SYNCHRONOUS, ",sync" }, + { SB_DIRSYNC, ",dirsync" }, + { SB_MANDLOCK, ",mand" }, + { SB_LAZYTIME, ",lazytime" }, { 0, NULL } }; const struct proc_fs_info *fs_infop; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 2b21d180157c..691032107f8c 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -61,8 +61,8 @@ MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content " static int pstore_new_entry; -static void pstore_timefunc(unsigned long); -static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0); +static void pstore_timefunc(struct timer_list *); +static DEFINE_TIMER(pstore_timer, pstore_timefunc); static void pstore_dowork(struct work_struct *); static DECLARE_WORK(pstore_work, pstore_dowork); @@ -482,10 +482,7 @@ void pstore_record_init(struct pstore_record *record, record->psi = psinfo; /* Report zeroed timestamp if called before timekeeping has resumed. */ - if (__getnstimeofday(&record->time)) { - record->time.tv_sec = 0; - record->time.tv_nsec = 0; - } + record->time = ns_to_timespec(ktime_get_real_fast_ns()); } /* @@ -654,7 +651,7 @@ static int pstore_write_user_compat(struct pstore_record *record, return -EINVAL; record->buf = memdup_user(buf, record->size); - if (unlikely(IS_ERR(record->buf))) { + if (IS_ERR(record->buf)) { ret = PTR_ERR(record->buf); goto out; } @@ -893,7 +890,7 @@ static void pstore_dowork(struct work_struct *work) pstore_get_records(1); } -static void pstore_timefunc(unsigned long dummy) +static void pstore_timefunc(struct timer_list *unused) { if (pstore_new_entry) { pstore_new_entry = 0; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 3a67cfb142d8..3d46fe302fcb 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -47,7 +47,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data) sync_filesystem(sb); qs = qnx4_sb(sb); qs->Version = QNX4_VERSION; - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -199,7 +199,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) s->s_op = &qnx4_sops; s->s_magic = QNX4_SUPER_MAGIC; - s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + s->s_flags |= SB_RDONLY; /* Yup, read-only yet */ /* Check the superblock signature. Since the qnx4 code is dangerous, we should leave as quickly as possible diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 1192422a1c56..4aeb26bcb4d0 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -56,7 +56,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root) static int qnx6_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -427,7 +427,7 @@ mmi_success: } s->s_op = &qnx6_sops; s->s_magic = QNX6_SUPER_MAGIC; - s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + s->s_flags |= SB_RDONLY; /* Yup, read-only yet */ /* ease the later tree level calculations */ sbi = QNX6_SB(s); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9f78b5015f2e..020c597ef9b6 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -645,8 +645,15 @@ int dquot_writeback_dquots(struct super_block *sb, int type) spin_unlock(&dq_list_lock); dqstats_inc(DQST_LOOKUPS); err = sb->dq_op->write_dquot(dquot); - if (!ret && err) - ret = err; + if (err) { + /* + * Clear dirty bit anyway to avoid infinite + * loop here. + */ + clear_dquot_dirty(dquot); + if (!ret) + ret = err; + } dqput(dquot); spin_lock(&dq_list_lock); } @@ -934,12 +941,13 @@ static int dqinit_needed(struct inode *inode, int type) } /* This routine is guarded by s_umount semaphore */ -static void add_dquot_ref(struct super_block *sb, int type) +static int add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif + int err = 0; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { @@ -959,7 +967,11 @@ static void add_dquot_ref(struct super_block *sb, int type) reserved = 1; #endif iput(old_inode); - __dquot_initialize(inode, type); + err = __dquot_initialize(inode, type); + if (err) { + iput(inode); + goto out; + } /* * We hold a reference to 'inode' so it couldn't have been @@ -974,7 +986,7 @@ static void add_dquot_ref(struct super_block *sb, int type) } spin_unlock(&sb->s_inode_list_lock); iput(old_inode); - +out: #ifdef CONFIG_QUOTA_DEBUG if (reserved) { quota_error(sb, "Writes happened before quota was turned on " @@ -982,6 +994,7 @@ static void add_dquot_ref(struct super_block *sb, int type) "Please run quotacheck(8)"); } #endif + return err; } /* @@ -2139,7 +2152,7 @@ int dquot_file_open(struct inode *inode, struct file *file) error = generic_file_open(inode, file); if (!error && (file->f_mode & FMODE_WRITE)) - dquot_initialize(inode); + error = dquot_initialize(inode); return error; } EXPORT_SYMBOL(dquot_file_open); @@ -2372,10 +2385,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, dqopt->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); - add_dquot_ref(sb, type); - - return 0; + error = add_dquot_ref(sb, type); + if (error) + dquot_disable(sb, type, flags); + return error; out_file_init: dqopt->files[type] = NULL; iput(inode); @@ -2978,7 +2992,8 @@ static int __init dquot_init(void) pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); - register_shrinker(&dqcache_shrinker); + if (register_shrinker(&dqcache_shrinker)) + panic("Cannot register dquot shrinker"); return 0; } diff --git a/fs/read_write.c b/fs/read_write.c index 0046d72efe94..f8547b82dfb3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -635,27 +635,6 @@ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, return ret; } -/* - * Reduce an iovec's length in-place. Return the resulting number of segments - */ -unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) -{ - unsigned long seg = 0; - size_t len = 0; - - while (seg < nr_segs) { - seg++; - if (len + iov->iov_len >= to) { - iov->iov_len = to - len; - break; - } - len += iov->iov_len; - iov++; - } - return seg; -} -EXPORT_SYMBOL(iov_shorten); - static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { diff --git a/fs/readdir.c b/fs/readdir.c index d336db65a33e..1b83b0ad183b 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -37,13 +37,12 @@ int iterate_dir(struct file *file, struct dir_context *ctx) if (res) goto out; - if (shared) { - inode_lock_shared(inode); - } else { + if (shared) + res = down_read_killable(&inode->i_rwsem); + else res = down_write_killable(&inode->i_rwsem); - if (res) - goto out; - } + if (res) + goto out; res = -ENOENT; if (!IS_DEADDIR(inode)) { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 11a48affa882..b13fc024d2ee 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2106,7 +2106,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, journal_end(th); goto out_inserted_sd; } - } else if (inode->i_sb->s_flags & MS_POSIXACL) { + } else if (inode->i_sb->s_flags & SB_POSIXACL) { reiserfs_warning(inode->i_sb, "jdm-13090", "ACLs aren't enabled in the fs, " "but vfs thinks they are!"); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 69ff280bdfe8..70057359fbaf 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1960,7 +1960,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, /* * Cancel flushing of old commits. Note that neither of these works * will be requeued because superblock is being shutdown and doesn't - * have MS_ACTIVE set. + * have SB_ACTIVE set. */ reiserfs_cancel_old_flush(sb); /* wait for all commits to finish */ @@ -4302,7 +4302,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) * Avoid queueing work when sb is being shut down. Transaction * will be flushed on journal shutdown. */ - if (sb->s_flags & MS_ACTIVE) + if (sb->s_flags & SB_ACTIVE) queue_delayed_work(REISERFS_SB(sb)->commit_wq, &journal->j_work, HZ / 10); } @@ -4393,7 +4393,7 @@ void reiserfs_abort_journal(struct super_block *sb, int errno) if (!journal->j_errno) journal->j_errno = errno; - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; set_bit(J_ABORTED, &journal->j_state); #ifdef CONFIG_REISERFS_CHECK diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 64f49cafbc5b..7e288d97adcb 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -390,7 +390,7 @@ void __reiserfs_error(struct super_block *sb, const char *id, return; reiserfs_info(sb, "Remounting filesystem read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; reiserfs_abort_journal(sb, -EIO); } @@ -409,7 +409,7 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id, error_buf); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; reiserfs_abort_journal(sb, errno); } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 5464ec517702..1fc934d24459 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -121,7 +121,7 @@ void reiserfs_schedule_old_flush(struct super_block *s) * Avoid scheduling flush when sb is being shut down. It can race * with journal shutdown and free still queued delayed work. */ - if (sb_rdonly(s) || !(s->s_flags & MS_ACTIVE)) + if (sb_rdonly(s) || !(s->s_flags & SB_ACTIVE)) return; spin_lock(&sbi->old_work_lock); @@ -252,11 +252,11 @@ static int finish_unfinished(struct super_block *s) #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - if (s->s_flags & MS_ACTIVE) { + if (s->s_flags & SB_ACTIVE) { ms_active_set = 0; } else { ms_active_set = 1; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; } /* Turn on quotas so that they are updated correctly */ for (i = 0; i < REISERFS_MAXQUOTAS; i++) { @@ -411,7 +411,7 @@ static int finish_unfinished(struct super_block *s) reiserfs_write_lock(s); if (ms_active_set) /* Restore the flag back */ - s->s_flags &= ~MS_ACTIVE; + s->s_flags &= ~SB_ACTIVE; #endif pathrelse(&path); if (done) @@ -1521,7 +1521,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) goto out_err_unlock; } - if (*mount_flags & MS_RDONLY) { + if (*mount_flags & SB_RDONLY) { reiserfs_write_unlock(s); reiserfs_xattr_init(s, *mount_flags); /* remount read-only */ @@ -1567,7 +1567,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); /* now it is safe to call journal_begin */ - s->s_flags &= ~MS_RDONLY; + s->s_flags &= ~SB_RDONLY; err = journal_begin(&th, s, 10); if (err) goto out_err_unlock; @@ -1575,7 +1575,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) /* Mount a partition which is read-only, read-write */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); - s->s_flags &= ~MS_RDONLY; + s->s_flags &= ~SB_RDONLY; set_sb_umount_state(rs, REISERFS_ERROR_FS); if (!old_format_only(s)) set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); @@ -1590,7 +1590,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) goto out_err_unlock; reiserfs_write_unlock(s); - if (!(*mount_flags & MS_RDONLY)) { + if (!(*mount_flags & SB_RDONLY)) { dquot_resume(s, -1); reiserfs_write_lock(s); finish_unfinished(s); @@ -2055,7 +2055,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) { SWARN(silent, s, "clm-7000", "Detected readonly device, marking FS readonly"); - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; } args.objectid = REISERFS_ROOT_OBJECTID; args.dirid = REISERFS_ROOT_PARENT_OBJECTID; @@ -2591,7 +2591,6 @@ out: return err; if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); - inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); return len - towrite; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 46492fb37a4c..5dbf5324bdda 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -959,7 +959,7 @@ int reiserfs_lookup_privroot(struct super_block *s) /* * We need to take a copy of the mount flags since things like - * MS_RDONLY don't get set until *after* we're called. + * SB_RDONLY don't get set until *after* we're called. * mount_flags != mount_options */ int reiserfs_xattr_init(struct super_block *s, int mount_flags) @@ -971,7 +971,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) if (err) goto error; - if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { + if (d_really_is_negative(privroot) && !(mount_flags & SB_RDONLY)) { inode_lock(d_inode(s->s_root)); err = create_privroot(REISERFS_SB(s)->priv_root); inode_unlock(d_inode(s->s_root)); @@ -999,11 +999,11 @@ error: clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt); } - /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ + /* The super_block SB_POSIXACL must mirror the (no)acl mount option. */ if (reiserfs_posixacl(s)) - s->s_flags |= MS_POSIXACL; + s->s_flags |= SB_POSIXACL; else - s->s_flags &= ~MS_POSIXACL; + s->s_flags &= ~SB_POSIXACL; return err; } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 0186fe6d39f3..8f06fd1f3d69 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -451,7 +451,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int romfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -502,7 +502,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = 0xFFFFFFFF; sb->s_magic = ROMFS_MAGIC; - sb->s_flags |= MS_RDONLY | MS_NOATIME; + sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_op = &romfs_super_ops; #ifdef CONFIG_ROMFS_ON_MTD diff --git a/fs/select.c b/fs/select.c index 063067e606ca..6de493bb42a4 100644 --- a/fs/select.c +++ b/fs/select.c @@ -292,8 +292,7 @@ static int poll_select_copy_remaining(struct timespec64 *end_time, void __user *p, int timeval, int ret) { - struct timespec64 rts64; - struct timespec rts; + struct timespec64 rts; struct timeval rtv; if (!p) @@ -306,23 +305,22 @@ static int poll_select_copy_remaining(struct timespec64 *end_time, if (!end_time->tv_sec && !end_time->tv_nsec) return ret; - ktime_get_ts64(&rts64); - rts64 = timespec64_sub(*end_time, rts64); - if (rts64.tv_sec < 0) - rts64.tv_sec = rts64.tv_nsec = 0; + ktime_get_ts64(&rts); + rts = timespec64_sub(*end_time, rts); + if (rts.tv_sec < 0) + rts.tv_sec = rts.tv_nsec = 0; - rts = timespec64_to_timespec(rts64); if (timeval) { if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); - rtv.tv_sec = rts64.tv_sec; - rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC; + rtv.tv_sec = rts.tv_sec; + rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; - } else if (!copy_to_user(p, &rts, sizeof(rts))) + } else if (!put_timespec64(&rts, p)) return ret; /* @@ -705,17 +703,15 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, const sigset_t __user *sigmask, size_t sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts; - struct timespec64 ts64, end_time, *to = NULL; + struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (get_timespec64(&ts, tsp)) return -EFAULT; - ts64 = timespec_to_timespec64(ts); to = &end_time; - if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec)) + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } @@ -1052,12 +1048,11 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, size_t, sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts; - struct timespec64 end_time, *to = NULL; + struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; @@ -1103,10 +1098,10 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) static -int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, +int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user *p, int timeval, int ret) { - struct timespec ts; + struct timespec64 ts; if (!p) return ret; @@ -1118,8 +1113,8 @@ int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, if (!end_time->tv_sec && !end_time->tv_nsec) return ret; - ktime_get_ts(&ts); - ts = timespec_sub(*end_time, ts); + ktime_get_ts64(&ts); + ts = timespec64_sub(*end_time, ts); if (ts.tv_sec < 0) ts.tv_sec = ts.tv_nsec = 0; @@ -1132,12 +1127,7 @@ int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } else { - struct compat_timespec rts; - - rts.tv_sec = ts.tv_sec; - rts.tv_nsec = ts.tv_nsec; - - if (!copy_to_user(p, &rts, sizeof(rts))) + if (!compat_put_timespec64(&ts, p)) return ret; } /* @@ -1195,7 +1185,7 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, */ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct timespec *end_time) + struct timespec64 *end_time) { fd_set_bits fds; void *bits; @@ -1268,7 +1258,7 @@ COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct compat_timeval __user *, tvp) { - struct timespec end_time, *to = NULL; + struct timespec64 end_time, *to = NULL; struct compat_timeval tv; int ret; @@ -1312,14 +1302,12 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { - compat_sigset_t ss32; sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; + struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (compat_get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; @@ -1330,9 +1318,8 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, if (sigmask) { if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + if (get_compat_sigset(&ksigmask, sigmask)) return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); @@ -1381,14 +1368,12 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct compat_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { - compat_sigset_t ss32; sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; + struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (compat_get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; @@ -1399,9 +1384,8 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, if (sigmask) { if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + if (get_compat_sigset(&ksigmask, sigmask)) return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); diff --git a/fs/signalfd.c b/fs/signalfd.c index 1c667af86da5..5f1ff8756595 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -313,15 +313,13 @@ COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, compat_size_t, sigsetsize, int, flags) { - compat_sigset_t ss32; sigset_t tmp; sigset_t __user *ksigmask; if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + if (get_compat_sigset(&tmp, sigmask)) return -EFAULT; - sigset_from_compat(&tmp, &ss32); ksigmask = compat_alloc_user_space(sizeof(sigset_t)); if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) return -EFAULT; diff --git a/fs/splice.c b/fs/splice.c index f3084cce0ea6..39e2dc01ac12 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -253,7 +253,7 @@ EXPORT_SYMBOL(add_to_pipe); */ int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { - unsigned int buffers = ACCESS_ONCE(pipe->buffers); + unsigned int buffers = READ_ONCE(pipe->buffers); spd->nr_pages_max = buffers; if (buffers <= PIPE_DEF_BUFFERS) diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index cf01e15a7b16..8a73b97217c8 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -195,7 +195,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) (u64) le64_to_cpu(sblk->id_table_start)); sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; sb->s_op = &squashfs_super_ops; err = -ENOMEM; @@ -373,7 +373,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int squashfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } diff --git a/fs/statfs.c b/fs/statfs.c index c25dd9a26cc1..5b2a24f0f263 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -35,11 +35,11 @@ static int flags_by_mnt(int mnt_flags) static int flags_by_sb(int s_flags) { int flags = 0; - if (s_flags & MS_SYNCHRONOUS) + if (s_flags & SB_SYNCHRONOUS) flags |= ST_SYNCHRONOUS; - if (s_flags & MS_MANDLOCK) + if (s_flags & SB_MANDLOCK) flags |= ST_MANDLOCK; - if (s_flags & MS_RDONLY) + if (s_flags & SB_RDONLY) flags |= ST_RDONLY; return flags; } @@ -217,7 +217,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user return error; } -int vfs_ustat(dev_t dev, struct kstatfs *sbuf) +static int vfs_ustat(dev_t dev, struct kstatfs *sbuf) { struct super_block *s = user_get_super(dev); int err; diff --git a/fs/super.c b/fs/super.c index 994db21f59bf..06bd25d90ba5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -155,21 +155,19 @@ static void destroy_super_rcu(struct rcu_head *head) schedule_work(&s->destroy_work); } -/** - * destroy_super - frees a superblock - * @s: superblock to free - * - * Frees a superblock. - */ -static void destroy_super(struct super_block *s) +/* Free a superblock that has never been seen by anyone */ +static void destroy_unused_super(struct super_block *s) { + if (!s) + return; + up_write(&s->s_umount); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); security_sb_free(s); - WARN_ON(!list_empty(&s->s_mounts)); put_user_ns(s->s_user_ns); kfree(s->s_subtype); - call_rcu(&s->rcu, destroy_super_rcu); + /* no delays needed */ + destroy_super_work(&s->destroy_work); } /** @@ -193,6 +191,24 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); + init_rwsem(&s->s_umount); + lockdep_set_class(&s->s_umount, &type->s_umount_key); + /* + * sget() can have s_umount recursion. + * + * When it cannot find a suitable sb, it allocates a new + * one (this one), and tries again to find a suitable old + * one. + * + * In case that succeeds, it will acquire the s_umount + * lock of the old one. Since these are clearly distrinct + * locks, and this object isn't exposed yet, there's no + * risk of deadlocks. + * + * Annotate this by putting this lock in a different + * subclass. + */ + down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; @@ -220,25 +236,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, goto fail; if (list_lru_init_memcg(&s->s_inode_lru)) goto fail; - - init_rwsem(&s->s_umount); - lockdep_set_class(&s->s_umount, &type->s_umount_key); - /* - * sget() can have s_umount recursion. - * - * When it cannot find a suitable sb, it allocates a new - * one (this one), and tries again to find a suitable old - * one. - * - * In case that succeeds, it will acquire the s_umount - * lock of the old one. Since these are clearly distrinct - * locks, and this object isn't exposed yet, there's no - * risk of deadlocks. - * - * Annotate this by putting this lock in a different - * subclass. - */ - down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); @@ -257,7 +254,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, return s; fail: - destroy_super(s); + destroy_unused_super(s); return NULL; } @@ -266,11 +263,17 @@ fail: /* * Drop a superblock's refcount. The caller must hold sb_lock. */ -static void __put_super(struct super_block *sb) +static void __put_super(struct super_block *s) { - if (!--sb->s_count) { - list_del_init(&sb->s_list); - destroy_super(sb); + if (!--s->s_count) { + list_del_init(&s->s_list); + WARN_ON(s->s_dentry_lru.node); + WARN_ON(s->s_inode_lru.node); + WARN_ON(!list_empty(&s->s_mounts)); + security_sb_free(s); + put_user_ns(s->s_user_ns); + kfree(s->s_subtype); + call_rcu(&s->rcu, destroy_super_rcu); } } @@ -485,19 +488,12 @@ retry: continue; if (user_ns != old->s_user_ns) { spin_unlock(&sb_lock); - if (s) { - up_write(&s->s_umount); - destroy_super(s); - } + destroy_unused_super(s); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; - if (s) { - up_write(&s->s_umount); - destroy_super(s); - s = NULL; - } + destroy_unused_super(s); return old; } } @@ -512,8 +508,7 @@ retry: err = set(s, data); if (err) { spin_unlock(&sb_lock); - up_write(&s->s_umount); - destroy_super(s); + destroy_unused_super(s); return ERR_PTR(err); } s->s_type = type; @@ -522,7 +517,11 @@ retry: hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); - register_shrinker(&s->s_shrink); + err = register_shrinker(&s->s_shrink); + if (err) { + deactivate_locked_super(s); + s = ERR_PTR(err); + } return s; } diff --git a/fs/sync.c b/fs/sync.c index 83ac79a960dd..6e0a2cbaf6de 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -109,7 +109,7 @@ SYSCALL_DEFINE0(sync) { int nowait = 0, wait = 1; - wakeup_flusher_threads(0, WB_REASON_SYNC); + wakeup_flusher_threads(WB_REASON_SYNC); iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 20b8f82e115b..fb49510c5dcf 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -30,7 +30,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, void *ns; bool new_sb; - if (!(flags & MS_KERNMOUNT)) { + if (!(flags & SB_KERNMOUNT)) { if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) return ERR_PTR(-EPERM); } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 3c47b7d5d4cf..bec9f79adb25 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -63,7 +63,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data) sync_filesystem(sb); if (sbi->s_forced_ro) - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } diff --git a/fs/sysv/super.c b/fs/sysv/super.c index 0d56e486b392..89765ddfb738 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -333,7 +333,7 @@ static int complete_read_super(struct super_block *sb, int silent, int size) /* set up enough so that it can read an inode */ sb->s_op = &sysv_sops; if (sbi->s_forced_ro) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; if (sbi->s_truncate) sb->s_d_op = &sysv_dentry_operations; root_inode = sysv_iget(sb, SYSV_ROOT_INO); diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 16a5d5c82073..616a688f5d8f 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -88,7 +88,6 @@ const struct fscrypt_operations ubifs_crypt_operations = { .key_prefix = "ubifs:", .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, - .is_encrypted = __ubifs_crypt_is_encrypted, .empty_dir = ubifs_crypt_empty_dir, .max_namelen = ubifs_crypt_max_namelen, }; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index a02aa59d1e24..dfe85069586e 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1406,7 +1406,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, if (flags & S_MTIME) inode->i_mtime = *time; - if (!(inode->i_sb->s_flags & MS_LAZYTIME)) + if (!(inode->i_sb->s_flags & SB_LAZYTIME)) iflags |= I_DIRTY_SYNC; release = ui->dirty; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 3be28900bf37..fe77e9625e84 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -84,7 +84,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) if (!c->ro_error) { c->ro_error = 1; c->no_chk_data_crc = 0; - c->vfs_sb->s_flags |= MS_RDONLY; + c->vfs_sb->s_flags |= SB_RDONLY; ubifs_warn(c, "switched to read-only mode, error %d", err); dump_stack(); } diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index fdc311246807..0164bcc827f8 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -38,7 +38,8 @@ void ubifs_set_inode_flags(struct inode *inode) { unsigned int flags = ubifs_inode(inode)->flags; - inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC); + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC | + S_ENCRYPTED); if (flags & UBIFS_SYNC_FL) inode->i_flags |= S_SYNC; if (flags & UBIFS_APPEND_FL) @@ -47,6 +48,8 @@ void ubifs_set_inode_flags(struct inode *inode) inode->i_flags |= S_IMMUTABLE; if (flags & UBIFS_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; + if (flags & UBIFS_CRYPT_FL) + inode->i_flags |= S_ENCRYPTED; } /* diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 5496b17b959c..0beb285b143d 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -968,7 +968,7 @@ static int parse_standard_option(const char *option) pr_notice("UBIFS: parse %s\n", option); if (!strcmp(option, "sync")) - return MS_SYNCHRONOUS; + return SB_SYNCHRONOUS; return 0; } @@ -1160,8 +1160,8 @@ static int mount_ubifs(struct ubifs_info *c) size_t sz; c->ro_mount = !!sb_rdonly(c->vfs_sb); - /* Suppress error messages while probing if MS_SILENT is set */ - c->probing = !!(c->vfs_sb->s_flags & MS_SILENT); + /* Suppress error messages while probing if SB_SILENT is set */ + c->probing = !!(c->vfs_sb->s_flags & SB_SILENT); err = init_constants_early(c); if (err) @@ -1852,7 +1852,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) return err; } - if (c->ro_mount && !(*flags & MS_RDONLY)) { + if (c->ro_mount && !(*flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/W due to prior errors"); return -EROFS; @@ -1864,7 +1864,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) err = ubifs_remount_rw(c); if (err) return err; - } else if (!c->ro_mount && (*flags & MS_RDONLY)) { + } else if (!c->ro_mount && (*flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/O due to prior errors"); return -EROFS; @@ -2007,12 +2007,6 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) return c; } -#ifndef CONFIG_UBIFS_FS_ENCRYPTION -const struct fscrypt_operations ubifs_crypt_operations = { - .is_encrypted = __ubifs_crypt_is_encrypted, -}; -#endif - static int ubifs_fill_super(struct super_block *sb, void *data, int silent) { struct ubifs_info *c = sb->s_fs_info; @@ -2055,7 +2049,9 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; sb->s_op = &ubifs_super_operations; sb->s_xattr = ubifs_xattr_handlers; +#ifdef CONFIG_UBIFS_FS_ENCRYPTION sb->s_cop = &ubifs_crypt_operations; +#endif mutex_lock(&c->umount_mutex); err = mount_ubifs(c); @@ -2121,7 +2117,7 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, */ ubi = open_ubi(name, UBI_READONLY); if (IS_ERR(ubi)) { - if (!(flags & MS_SILENT)) + if (!(flags & SB_SILENT)) pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", current->pid, name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); @@ -2147,18 +2143,18 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, kfree(c); /* A new mount point for already mounted UBIFS */ dbg_gen("this ubi volume is already mounted"); - if (!!(flags & MS_RDONLY) != c1->ro_mount) { + if (!!(flags & SB_RDONLY) != c1->ro_mount) { err = -EBUSY; goto out_deact; } } else { - err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 1 : 0); if (err) goto out_deact; /* We do not support atime */ - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; #ifndef CONFIG_UBIFS_ATIME_SUPPORT - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; #else ubifs_msg(c, "full atime support is enabled."); #endif diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index cd43651f1731..5ee7af879cc4 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -38,12 +38,11 @@ #include <linux/backing-dev.h> #include <linux/security.h> #include <linux/xattr.h> -#ifdef CONFIG_UBIFS_FS_ENCRYPTION -#include <linux/fscrypt_supp.h> -#else -#include <linux/fscrypt_notsupp.h> -#endif #include <linux/random.h> + +#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_UBIFS_FS_ENCRYPTION) +#include <linux/fscrypt.h> + #include "ubifs-media.h" /* Version of this UBIFS implementation */ @@ -1202,7 +1201,7 @@ struct ubifs_debug_info; * @need_recovery: %1 if the file-system needs recovery * @replaying: %1 during journal replay * @mounting: %1 while mounting - * @probing: %1 while attempting to mount if MS_SILENT mount flag is set + * @probing: %1 while attempting to mount if SB_SILENT mount flag is set * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay @@ -1835,18 +1834,13 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, extern const struct fscrypt_operations ubifs_crypt_operations; -static inline bool __ubifs_crypt_is_encrypted(struct inode *inode) +static inline bool ubifs_crypt_is_encrypted(const struct inode *inode) { - struct ubifs_inode *ui = ubifs_inode(inode); + const struct ubifs_inode *ui = ubifs_inode(inode); return ui->flags & UBIFS_CRYPT_FL; } -static inline bool ubifs_crypt_is_encrypted(const struct inode *inode) -{ - return __ubifs_crypt_is_encrypted((struct inode *)inode); -} - /* Normal UBIFS messages */ __printf(2, 3) void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...); @@ -1856,7 +1850,7 @@ __printf(2, 3) void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...); /* * A conditional variant of 'ubifs_err()' which doesn't output anything - * if probing (ie. MS_SILENT set). + * if probing (ie. SB_SILENT set). */ #define ubifs_errc(c, fmt, ...) \ do { \ diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index c13eae819cbc..5ddc89d564fd 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -170,6 +170,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, err = ubifs_jnl_update(c, host, nm, inode, 0, 1); if (err) goto out_cancel; + ubifs_set_inode_flags(host); mutex_unlock(&host_ui->ui_mutex); ubifs_release_budget(c, &req); diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index e0fd65fe73e8..1b961b1d9699 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -58,7 +58,7 @@ static int __load_block_bitmap(struct super_block *sb, int nr_groups = bitmap->s_nr_groups; if (block_group >= nr_groups) { - udf_debug("block_group (%d) > nr_groups (%d)\n", + udf_debug("block_group (%u) > nr_groups (%d)\n", block_group, nr_groups); } @@ -122,7 +122,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb, partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; if (bloc->logicalBlockNum + count < count || (bloc->logicalBlockNum + count) > partmap->s_partition_len) { - udf_debug("%d < %d || %d + %d > %d\n", + udf_debug("%u < %d || %u + %u > %u\n", bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, partmap->s_partition_len); @@ -151,9 +151,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb, bh = bitmap->s_block_bitmap[bitmap_nr]; for (i = 0; i < count; i++) { if (udf_set_bit(bit + i, bh->b_data)) { - udf_debug("bit %ld already set\n", bit + i); + udf_debug("bit %lu already set\n", bit + i); udf_debug("byte=%2x\n", - ((char *)bh->b_data)[(bit + i) >> 3]); + ((__u8 *)bh->b_data)[(bit + i) >> 3]); } } udf_add_free_space(sb, sbi->s_partition, count); @@ -218,16 +218,18 @@ out: return alloc_count; } -static int udf_bitmap_new_block(struct super_block *sb, +static udf_pblk_t udf_bitmap_new_block(struct super_block *sb, struct udf_bitmap *bitmap, uint16_t partition, uint32_t goal, int *err) { struct udf_sb_info *sbi = UDF_SB(sb); - int newbit, bit = 0, block, block_group, group_start; + int newbit, bit = 0; + udf_pblk_t block; + int block_group, group_start; int end_goal, nr_groups, bitmap_nr, i; struct buffer_head *bh = NULL; char *ptr; - int newblock = 0; + udf_pblk_t newblock = 0; *err = -ENOSPC; mutex_lock(&sbi->s_alloc_mutex); @@ -362,7 +364,7 @@ static void udf_table_free_blocks(struct super_block *sb, partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; if (bloc->logicalBlockNum + count < count || (bloc->logicalBlockNum + count) > partmap->s_partition_len) { - udf_debug("%d < %d || %d + %d > %d\n", + udf_debug("%u < %d || %u + %u > %u\n", bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, partmap->s_partition_len); @@ -515,7 +517,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, while (first_block != eloc.logicalBlockNum && (etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) { - udf_debug("eloc=%d, elen=%d, first_block=%d\n", + udf_debug("eloc=%u, elen=%u, first_block=%u\n", eloc.logicalBlockNum, elen, first_block); ; /* empty loop body */ } @@ -545,13 +547,14 @@ static int udf_table_prealloc_blocks(struct super_block *sb, return alloc_count; } -static int udf_table_new_block(struct super_block *sb, +static udf_pblk_t udf_table_new_block(struct super_block *sb, struct inode *table, uint16_t partition, uint32_t goal, int *err) { struct udf_sb_info *sbi = UDF_SB(sb); uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF; - uint32_t newblock = 0, adsize; + udf_pblk_t newblock = 0; + uint32_t adsize; uint32_t elen, goal_elen = 0; struct kernel_lb_addr eloc, uninitialized_var(goal_eloc); struct extent_position epos, goal_epos; @@ -700,12 +703,12 @@ inline int udf_prealloc_blocks(struct super_block *sb, return allocated; } -inline int udf_new_block(struct super_block *sb, +inline udf_pblk_t udf_new_block(struct super_block *sb, struct inode *inode, uint16_t partition, uint32_t goal, int *err) { struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; - int block; + udf_pblk_t block; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) block = udf_bitmap_new_block(sb, diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 2d0e028067eb..c19dba45aa20 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -43,7 +43,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; struct fileIdentDesc *fi = NULL; struct fileIdentDesc cfi; - int block, iblock; + udf_pblk_t block, iblock; loff_t nf_pos; int flen; unsigned char *fname = NULL, *copy_name = NULL; diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 7aa48bd7cbaf..0a98a2369738 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -26,7 +26,8 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, sector_t *offset) { struct fileIdentDesc *fi; - int i, num, block; + int i, num; + udf_pblk_t block; struct buffer_head *tmp, *bha[16]; struct udf_inode_info *iinfo = UDF_I(dir); @@ -51,7 +52,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, } if (fibh->eoffset == dir->i_sb->s_blocksize) { - int lextoffset = epos->offset; + uint32_t lextoffset = epos->offset; unsigned char blocksize_bits = dir->i_sb->s_blocksize_bits; if (udf_next_aext(dir, epos, eloc, elen, 1) != @@ -110,7 +111,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, memcpy((uint8_t *)cfi, (uint8_t *)fi, sizeof(struct fileIdentDesc)); } else if (fibh->eoffset > dir->i_sb->s_blocksize) { - int lextoffset = epos->offset; + uint32_t lextoffset = epos->offset; if (udf_next_aext(dir, epos, eloc, elen, 1) != (EXT_RECORDED_ALLOCATED >> 30)) @@ -175,7 +176,7 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) if (fi->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FID)) { udf_debug("0x%x != TAG_IDENT_FID\n", le16_to_cpu(fi->descTag.tagIdent)); - udf_debug("offset: %u sizeof: %lu bufsize: %u\n", + udf_debug("offset: %d sizeof: %lu bufsize: %d\n", *offset, (unsigned long)sizeof(struct fileIdentDesc), bufsize); return NULL; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index c1ed18a10ce4..b6e420c1bfeb 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -50,7 +50,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct inode *inode; - int block; + udf_pblk_t block; uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 8dacf4f57414..c23744d5ae5c 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -52,7 +52,7 @@ static int udf_alloc_i_data(struct inode *inode, size_t size); static sector_t inode_getblk(struct inode *, sector_t, int *, int *); static int8_t udf_insert_aext(struct inode *, struct extent_position, struct kernel_lb_addr, uint32_t); -static void udf_split_extents(struct inode *, int *, int, int, +static void udf_split_extents(struct inode *, int *, int, udf_pblk_t, struct kernel_long_ad *, int *); static void udf_prealloc_extents(struct inode *, int, int, struct kernel_long_ad *, int *); @@ -316,10 +316,10 @@ int udf_expand_file_adinicb(struct inode *inode) return err; } -struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, - int *err) +struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, + udf_pblk_t *block, int *err) { - int newblock; + udf_pblk_t newblock; struct buffer_head *dbh = NULL; struct kernel_lb_addr eloc; uint8_t alloctype; @@ -446,7 +446,7 @@ abort: return err; } -static struct buffer_head *udf_getblk(struct inode *inode, long block, +static struct buffer_head *udf_getblk(struct inode *inode, udf_pblk_t block, int create, int *err) { struct buffer_head *bh; @@ -480,7 +480,7 @@ static int udf_do_extend_file(struct inode *inode, int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); struct super_block *sb = inode->i_sb; struct kernel_lb_addr prealloc_loc = {}; - int prealloc_len = 0; + uint32_t prealloc_len = 0; struct udf_inode_info *iinfo; int err; @@ -663,11 +663,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, struct kernel_lb_addr eloc, tmpeloc; int c = 1; loff_t lbcount = 0, b_off = 0; - uint32_t newblocknum, newblock; + udf_pblk_t newblocknum, newblock; sector_t offset = 0; int8_t etype; struct udf_inode_info *iinfo = UDF_I(inode); - int goal = 0, pgoal = iinfo->i_location.logicalBlockNum; + udf_pblk_t goal = 0, pgoal = iinfo->i_location.logicalBlockNum; int lastblock = 0; bool isBeyondEOF; @@ -879,8 +879,8 @@ out_free: } static void udf_split_extents(struct inode *inode, int *c, int offset, - int newblocknum, struct kernel_long_ad *laarr, - int *endnum) + udf_pblk_t newblocknum, + struct kernel_long_ad *laarr, int *endnum) { unsigned long blocksize = inode->i_sb->s_blocksize; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; @@ -1166,7 +1166,7 @@ static void udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr } } -struct buffer_head *udf_bread(struct inode *inode, int block, +struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, int create, int *err) { struct buffer_head *bh = NULL; @@ -1193,7 +1193,7 @@ int udf_setsize(struct inode *inode, loff_t newsize) { int err; struct udf_inode_info *iinfo; - int bsize = i_blocksize(inode); + unsigned int bsize = i_blocksize(inode); if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) @@ -1278,14 +1278,14 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode) reread: if (iloc->partitionReferenceNum >= sbi->s_partitions) { - udf_debug("partition reference: %d > logical volume partitions: %d\n", + udf_debug("partition reference: %u > logical volume partitions: %u\n", iloc->partitionReferenceNum, sbi->s_partitions); return -EIO; } if (iloc->logicalBlockNum >= sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) { - udf_debug("block=%d, partition=%d out of range\n", + udf_debug("block=%u, partition=%u out of range\n", iloc->logicalBlockNum, iloc->partitionReferenceNum); return -EIO; } @@ -1304,13 +1304,13 @@ reread: */ bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident); if (!bh) { - udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino); + udf_err(inode->i_sb, "(ino %lu) failed !bh\n", inode->i_ino); return -EIO; } if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && ident != TAG_IDENT_USE) { - udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n", + udf_err(inode->i_sb, "(ino %lu) failed ident=%u\n", inode->i_ino, ident); goto out; } @@ -1346,7 +1346,7 @@ reread: } brelse(ibh); } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { - udf_err(inode->i_sb, "unsupported strategy type: %d\n", + udf_err(inode->i_sb, "unsupported strategy type: %u\n", le16_to_cpu(fe->icbTag.strategyType)); goto out; } @@ -1547,7 +1547,7 @@ reread: udf_debug("METADATA BITMAP FILE-----\n"); break; default: - udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n", + udf_err(inode->i_sb, "(ino %lu) failed unknown file type=%u\n", inode->i_ino, fe->icbTag.fileType); goto out; } @@ -1852,7 +1852,7 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, return inode; } -int udf_setup_indirect_aext(struct inode *inode, int block, +int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, struct extent_position *epos) { struct super_block *sb = inode->i_sb; @@ -1994,7 +1994,7 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos, if (epos->offset + (2 * adsize) > sb->s_blocksize) { int err; - int new_block; + udf_pblk_t new_block; new_block = udf_new_block(sb, NULL, epos->block.partitionReferenceNum, @@ -2076,7 +2076,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { - int block; + udf_pblk_t block; if (++indirections > UDF_MAX_INDIR_EXTS) { udf_err(inode->i_sb, @@ -2091,7 +2091,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0); epos->bh = udf_tread(inode->i_sb, block); if (!epos->bh) { - udf_debug("reading block %d failed!\n", block); + udf_debug("reading block %u failed!\n", block); return -1; } } @@ -2146,7 +2146,7 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; break; default: - udf_debug("alloc_type = %d unsupported\n", iinfo->i_alloc_type); + udf_debug("alloc_type = %u unsupported\n", iinfo->i_alloc_type); return -1; } @@ -2289,13 +2289,13 @@ int8_t inode_bmap(struct inode *inode, sector_t block, return etype; } -long udf_block_map(struct inode *inode, sector_t block) +udf_pblk_t udf_block_map(struct inode *inode, sector_t block) { struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; struct extent_position epos = {}; - int ret; + udf_pblk_t ret; down_read(&UDF_I(inode)->i_data_sem); diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 3949c4bec3a3..401e64cde1be 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -28,7 +28,7 @@ #include "udf_i.h" #include "udf_sb.h" -struct buffer_head *udf_tgetblk(struct super_block *sb, int block) +struct buffer_head *udf_tgetblk(struct super_block *sb, udf_pblk_t block) { if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV)) return sb_getblk(sb, udf_fixed_to_variable(block)); @@ -36,7 +36,7 @@ struct buffer_head *udf_tgetblk(struct super_block *sb, int block) return sb_getblk(sb, block); } -struct buffer_head *udf_tread(struct super_block *sb, int block) +struct buffer_head *udf_tread(struct super_block *sb, udf_pblk_t block) { if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV)) return sb_bread(sb, udf_fixed_to_variable(block)); @@ -209,7 +209,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, bh = udf_tread(sb, block); if (!bh) { - udf_err(sb, "read failed, block=%u, location=%d\n", + udf_err(sb, "read failed, block=%u, location=%u\n", block, location); return NULL; } @@ -247,7 +247,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, le16_to_cpu(tag_p->descCRCLength))) return bh; - udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block, + udf_debug("Crc failure block %u: crc = %u, crclen = %u\n", block, le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength)); error_out: diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 885198dfd9f8..0458dd47e105 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -164,7 +164,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, { struct fileIdentDesc *fi = NULL; loff_t f_pos; - int block, flen; + udf_pblk_t block; + int flen; unsigned char *fname = NULL, *copy_name = NULL; unsigned char *nameptr; uint8_t lfi; @@ -352,7 +353,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, int nfidlen; uint8_t lfi; uint16_t liu; - int block; + udf_pblk_t block; struct kernel_lb_addr eloc; uint32_t elen = 0; sector_t offset; @@ -749,7 +750,7 @@ static int empty_dir(struct inode *dir) struct udf_fileident_bh fibh; loff_t f_pos; loff_t size = udf_ext0_offset(dir) + dir->i_size; - int block; + udf_pblk_t block; struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; @@ -839,7 +840,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) if (retval) goto end_rmdir; if (inode->i_nlink != 2) - udf_warn(inode->i_sb, "empty directory has nlink != 2 (%d)\n", + udf_warn(inode->i_sb, "empty directory has nlink != 2 (%u)\n", inode->i_nlink); clear_nlink(inode); inode->i_size = 0; @@ -881,7 +882,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) goto end_unlink; if (!inode->i_nlink) { - udf_debug("Deleting nonexistent file (%lu), %d\n", + udf_debug("Deleting nonexistent file (%lu), %u\n", inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } @@ -913,7 +914,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, int eoffset, elen = 0; uint8_t *ea; int err; - int block; + udf_pblk_t block; unsigned char *name = NULL; int namelen; struct udf_inode_info *iinfo; diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 888c364b2fe9..090baff83990 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -32,7 +32,7 @@ uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; if (partition >= sbi->s_partitions) { - udf_debug("block=%d, partition=%d, offset=%d: invalid partition\n", + udf_debug("block=%u, partition=%u, offset=%u: invalid partition\n", block, partition, offset); return 0xFFFFFFFF; } @@ -59,7 +59,7 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, vdata = &map->s_type_specific.s_virtual; if (block > vdata->s_num_entries) { - udf_debug("Trying to access block beyond end of VAT (%d max %d)\n", + udf_debug("Trying to access block beyond end of VAT (%u max %u)\n", block, vdata->s_num_entries); return 0xFFFFFFFF; } @@ -83,7 +83,7 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, bh = sb_bread(sb, loc); if (!bh) { - udf_debug("get_pblock(UDF_VIRTUAL_MAP:%p,%d,%d) VAT: %d[%d]\n", + udf_debug("get_pblock(UDF_VIRTUAL_MAP:%p,%u,%u) VAT: %u[%u]\n", sb, block, partition, loc, index); return 0xFFFFFFFF; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 99cb81d0077f..f73239a9a97d 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -366,7 +366,7 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root) if (sbi->s_dmode != UDF_INVALID_MODE) seq_printf(seq, ",dmode=%ho", sbi->s_dmode); if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) - seq_printf(seq, ",session=%u", sbi->s_session); + seq_printf(seq, ",session=%d", sbi->s_session); if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) seq_printf(seq, ",lastblock=%u", sbi->s_last_block); if (sbi->s_anchor != 0) @@ -650,7 +650,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) sync_filesystem(sb); if (lvidiu) { int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); - if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY)) + if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & SB_RDONLY)) return -EACCES; } @@ -673,10 +673,10 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) sbi->s_dmode = uopt.dmode; write_unlock(&sbi->s_cred_lock); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out_unlock; - if (*flags & MS_RDONLY) + if (*flags & SB_RDONLY) udf_close_lvid(sb); else udf_open_lvid(sb); @@ -703,9 +703,9 @@ static loff_t udf_check_vsd(struct super_block *sb) else sectorsize = sb->s_blocksize; - sector += (sbi->s_session << sb->s_blocksize_bits); + sector += (((loff_t)sbi->s_session) << sb->s_blocksize_bits); - udf_debug("Starting at sector %u (%ld byte sectors)\n", + udf_debug("Starting at sector %u (%lu byte sectors)\n", (unsigned int)(sector >> sb->s_blocksize_bits), sb->s_blocksize); /* Process the sequence (if applicable). The hard limit on the sector @@ -868,7 +868,7 @@ static int udf_find_fileset(struct super_block *sb, if ((fileset->logicalBlockNum != 0xFFFFFFFF || fileset->partitionReferenceNum != 0xFFFF) && bh) { - udf_debug("Fileset at block=%d, partition=%d\n", + udf_debug("Fileset at block=%u, partition=%u\n", fileset->logicalBlockNum, fileset->partitionReferenceNum); @@ -981,14 +981,14 @@ static int udf_load_metadata_files(struct super_block *sb, int partition, mdata->s_phys_partition_ref = type1_index; /* metadata address */ - udf_debug("Metadata file location: block = %d part = %d\n", + udf_debug("Metadata file location: block = %u part = %u\n", mdata->s_meta_file_loc, mdata->s_phys_partition_ref); fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc, mdata->s_phys_partition_ref); if (IS_ERR(fe)) { /* mirror file entry */ - udf_debug("Mirror metadata file location: block = %d part = %d\n", + udf_debug("Mirror metadata file location: block = %u part = %u\n", mdata->s_mirror_file_loc, mdata->s_phys_partition_ref); fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc, @@ -1012,7 +1012,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition, addr.logicalBlockNum = mdata->s_bitmap_file_loc; addr.partitionReferenceNum = mdata->s_phys_partition_ref; - udf_debug("Bitmap file location: block = %d part = %d\n", + udf_debug("Bitmap file location: block = %u part = %u\n", addr.logicalBlockNum, addr.partitionReferenceNum); fe = udf_iget_special(sb, &addr); @@ -1042,7 +1042,7 @@ static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, UDF_SB(sb)->s_serial_number = le16_to_cpu(fset->descTag.tagSerialNum); - udf_debug("Rootdir at block=%d, partition=%d\n", + udf_debug("Rootdir at block=%u, partition=%u\n", root->logicalBlockNum, root->partitionReferenceNum); } @@ -1097,7 +1097,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE)) map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE; - udf_debug("Partition (%d type %x) starts at physical %d, block length %d\n", + udf_debug("Partition (%d type %x) starts at physical %u, block length %u\n", p_index, map->s_partition_type, map->s_partition_root, map->s_partition_len); @@ -1122,7 +1122,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, } map->s_uspace.s_table = inode; map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; - udf_debug("unallocSpaceTable (part %d) @ %ld\n", + udf_debug("unallocSpaceTable (part %d) @ %lu\n", p_index, map->s_uspace.s_table->i_ino); } @@ -1134,7 +1134,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, bitmap->s_extPosition = le32_to_cpu( phd->unallocSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; - udf_debug("unallocSpaceBitmap (part %d) @ %d\n", + udf_debug("unallocSpaceBitmap (part %d) @ %u\n", p_index, bitmap->s_extPosition); } @@ -1157,7 +1157,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, } map->s_fspace.s_table = inode; map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; - udf_debug("freedSpaceTable (part %d) @ %ld\n", + udf_debug("freedSpaceTable (part %d) @ %lu\n", p_index, map->s_fspace.s_table->i_ino); } @@ -1169,7 +1169,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, bitmap->s_extPosition = le32_to_cpu( phd->freedSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; - udf_debug("freedSpaceBitmap (part %d) @ %d\n", + udf_debug("freedSpaceBitmap (part %d) @ %u\n", p_index, bitmap->s_extPosition); } return 0; @@ -1282,7 +1282,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) /* First scan for TYPE1 and SPARABLE partitions */ for (i = 0; i < sbi->s_partitions; i++) { map = &sbi->s_partmaps[i]; - udf_debug("Searching map: (%d == %d)\n", + udf_debug("Searching map: (%u == %u)\n", map->s_partition_num, partitionNumber); if (map->s_partition_num == partitionNumber && (map->s_partition_type == UDF_TYPE1_MAP15 || @@ -1291,7 +1291,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) } if (i >= sbi->s_partitions) { - udf_debug("Partition (%d) not found in partition map\n", + udf_debug("Partition (%u) not found in partition map\n", partitionNumber); ret = 0; goto out_bh; @@ -1483,7 +1483,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, struct metadataPartitionMap *mdm = (struct metadataPartitionMap *) &(lvd->partitionMaps[offset]); - udf_debug("Parsing Logical vol part %d type %d id=%s\n", + udf_debug("Parsing Logical vol part %d type %u id=%s\n", i, type, UDF_ID_METADATA); map->s_partition_type = UDF_METADATA_MAP25; @@ -1505,17 +1505,17 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, udf_debug("Metadata Ident suffix=0x%x\n", le16_to_cpu(*(__le16 *) mdm->partIdent.identSuffix)); - udf_debug("Metadata part num=%d\n", + udf_debug("Metadata part num=%u\n", le16_to_cpu(mdm->partitionNum)); - udf_debug("Metadata part alloc unit size=%d\n", + udf_debug("Metadata part alloc unit size=%u\n", le32_to_cpu(mdm->allocUnitSize)); - udf_debug("Metadata file loc=%d\n", + udf_debug("Metadata file loc=%u\n", le32_to_cpu(mdm->metadataFileLoc)); - udf_debug("Mirror file loc=%d\n", + udf_debug("Mirror file loc=%u\n", le32_to_cpu(mdm->metadataMirrorFileLoc)); - udf_debug("Bitmap file loc=%d\n", + udf_debug("Bitmap file loc=%u\n", le32_to_cpu(mdm->metadataBitmapFileLoc)); - udf_debug("Flags: %d %d\n", + udf_debug("Flags: %d %u\n", mdata->s_flags, mdm->flags); } else { udf_debug("Unknown ident: %s\n", @@ -1525,7 +1525,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, map->s_volumeseqnum = le16_to_cpu(upm2->volSeqNum); map->s_partition_num = le16_to_cpu(upm2->partitionNum); } - udf_debug("Partition (%d:%d) type %d on volume %d\n", + udf_debug("Partition (%d:%u) type %u on volume %u\n", i, map->s_partition_num, type, map->s_volumeseqnum); } @@ -1533,7 +1533,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]); *fileset = lelb_to_cpu(la->extLocation); - udf_debug("FileSet found in LogicalVolDesc at block=%d, partition=%d\n", + udf_debug("FileSet found in LogicalVolDesc at block=%u, partition=%u\n", fileset->logicalBlockNum, fileset->partitionReferenceNum); } @@ -2159,7 +2159,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) ret = udf_load_vrs(sb, &uopt, silent, &fileset); if (ret < 0) { if (!silent && ret != -EACCES) { - pr_notice("Scanning with blocksize %d failed\n", + pr_notice("Scanning with blocksize %u failed\n", uopt.blocksize); } brelse(sbi->s_lvid_bh); @@ -2184,7 +2184,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) goto error_out; } - udf_debug("Lastblock=%d\n", sbi->s_last_block); + udf_debug("Lastblock=%u\n", sbi->s_last_block); if (sbi->s_lvid_bh) { struct logicalVolIntegrityDescImpUse *lvidiu = @@ -2255,7 +2255,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* perhaps it's not extensible enough, but for now ... */ inode = udf_iget(sb, &rootdir); if (IS_ERR(inode)) { - udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", + udf_err(sb, "Error in udf_iget, block=%u, partition=%u\n", rootdir.logicalBlockNum, rootdir.partitionReferenceNum); ret = PTR_ERR(inode); goto error_out; @@ -2389,7 +2389,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, struct buffer_head *bh = NULL; unsigned int accum = 0; int index; - int block = 0, newblock; + udf_pblk_t block = 0, newblock; struct kernel_lb_addr loc; uint32_t bytes; uint8_t *ptr; diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 42b8c57795cb..b647f0bd150c 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -48,7 +48,7 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos, if (elen != nelen) { udf_write_aext(inode, epos, &neloc, nelen, 0); - if (last_block - first_block > 0) { + if (last_block > first_block) { if (etype == (EXT_RECORDED_ALLOCATED >> 30)) mark_inode_dirty(inode); diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index fa206558128d..f5e0fe78979e 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -74,6 +74,8 @@ static inline size_t udf_ext0_offset(struct inode *inode) /* computes tag checksum */ u8 udf_tag_checksum(const struct tag *t); +typedef uint32_t udf_pblk_t; + struct dentry; struct inode; struct task_struct; @@ -145,15 +147,17 @@ static inline struct inode *udf_iget(struct super_block *sb, return __udf_iget(sb, ino, false); } extern int udf_expand_file_adinicb(struct inode *); -extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); -extern struct buffer_head *udf_bread(struct inode *, int, int, int *); +extern struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, + udf_pblk_t *block, int *err); +extern struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, + int create, int *err); extern int udf_setsize(struct inode *, loff_t); extern void udf_evict_inode(struct inode *); extern int udf_write_inode(struct inode *, struct writeback_control *wbc); -extern long udf_block_map(struct inode *, sector_t); +extern udf_pblk_t udf_block_map(struct inode *inode, sector_t block); extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, struct kernel_lb_addr *, uint32_t *, sector_t *); -extern int udf_setup_indirect_aext(struct inode *inode, int block, +extern int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, struct extent_position *epos); extern int __udf_add_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc); @@ -169,8 +173,9 @@ extern int8_t udf_current_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t *, int); /* misc.c */ -extern struct buffer_head *udf_tgetblk(struct super_block *, int); -extern struct buffer_head *udf_tread(struct super_block *, int); +extern struct buffer_head *udf_tgetblk(struct super_block *sb, + udf_pblk_t block); +extern struct buffer_head *udf_tread(struct super_block *sb, udf_pblk_t block); extern struct genericFormat *udf_add_extendedattr(struct inode *, uint32_t, uint32_t, uint8_t); extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t, @@ -229,8 +234,8 @@ extern void udf_free_blocks(struct super_block *, struct inode *, struct kernel_lb_addr *, uint32_t, uint32_t); extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, uint32_t, uint32_t); -extern int udf_new_block(struct super_block *, struct inode *, uint16_t, - uint32_t, int *); +extern udf_pblk_t udf_new_block(struct super_block *sb, struct inode *inode, + uint16_t partition, uint32_t goal, int *err); /* directory.c */ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 695389a4fc23..f897e55f2cd0 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -200,7 +200,7 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len, cmp_id = ocu[0]; if (cmp_id != 8 && cmp_id != 16) { memset(str_o, 0, str_max_len); - pr_err("unknown compression code (%d)\n", cmp_id); + pr_err("unknown compression code (%u)\n", cmp_id); return -EINVAL; } u_ch = cmp_id >> 3; diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index b5cd79065ef9..e727ee07dbe4 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -115,7 +115,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -205,7 +205,7 @@ do_more: ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); if (overflow) { @@ -567,7 +567,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -688,7 +688,7 @@ cg_found: succed: ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 916b4a428933..e1ef0f0a1353 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -112,7 +112,7 @@ void ufs_free_inode (struct inode * inode) ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -146,14 +146,14 @@ static void ufs2_init_inodes_chunk(struct super_block *sb, set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bh); brelse(bh); } fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb); ubh_mark_buffer_dirty(UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); UFSD("EXIT\n"); @@ -284,7 +284,7 @@ cg_found: } ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -330,7 +330,7 @@ cg_found: ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, ts.tv_nsec); mark_buffer_dirty(bh); unlock_buffer(bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bh); brelse(bh); } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 6440003f8ddc..4d497e9c6883 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -282,7 +282,7 @@ void ufs_error (struct super_block * sb, const char * function, usb1->fs_clean = UFS_FSBAD; ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_mark_sb_dirty(sb); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } va_start(args, fmt); vaf.fmt = fmt; @@ -320,7 +320,7 @@ void ufs_panic (struct super_block * sb, const char * function, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; pr_crit("panic (device %s): %s: %pV\n", sb->s_id, function, &vaf); va_end(args); @@ -905,7 +905,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=old is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -921,7 +921,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=nextstep is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -937,7 +937,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=nextstep-cd is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -953,7 +953,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=openstep is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -968,7 +968,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=hp is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; default: @@ -1125,21 +1125,21 @@ magic_found: break; case UFS_FSACTIVE: pr_err("%s(): fs is active\n", __func__); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; break; case UFS_FSBAD: pr_err("%s(): fs is bad\n", __func__); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; break; default: pr_err("%s(): can't grok fs_clean 0x%x\n", __func__, usb1->fs_clean); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; break; } } else { pr_err("%s(): fs needs fsck\n", __func__); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } /* @@ -1328,7 +1328,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) return -EINVAL; } - if ((bool)(*mount_flags & MS_RDONLY) == sb_rdonly(sb)) { + if ((bool)(*mount_flags & SB_RDONLY) == sb_rdonly(sb)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; mutex_unlock(&UFS_SB(sb)->s_lock); return 0; @@ -1337,7 +1337,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) /* * fs was mouted as rw, remounting ro */ - if (*mount_flags & MS_RDONLY) { + if (*mount_flags & SB_RDONLY) { ufs_put_super_internal(sb); usb1->fs_time = cpu_to_fs32(sb, get_seconds()); if ((flags & UFS_ST_MASK) == UFS_ST_SUN @@ -1346,7 +1346,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufs_set_fs_state(sb, usb1, usb3, UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); ubh_mark_buffer_dirty (USPI_UBH(uspi)); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } else { /* * fs was mounted as ro, remounting rw @@ -1370,7 +1370,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) mutex_unlock(&UFS_SB(sb)->s_lock); return -EPERM; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1c713fd5b3e6..41a75f9f23fd 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -381,7 +381,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) * in __get_user_pages if userfaultfd_release waits on the * caller of handle_userfault to release the mmap_sem. */ - if (unlikely(ACCESS_ONCE(ctx->released))) { + if (unlikely(READ_ONCE(ctx->released))) { /* * Don't return VM_FAULT_SIGBUS in this case, so a non * cooperative manager can close the uffd after the @@ -477,7 +477,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) vmf->flags, reason); up_read(&mm->mmap_sem); - if (likely(must_wait && !ACCESS_ONCE(ctx->released) && + if (likely(must_wait && !READ_ONCE(ctx->released) && (return_to_userland ? !signal_pending(current) : !fatal_signal_pending(current)))) { wake_up_poll(&ctx->fd_wqh, POLLIN); @@ -570,11 +570,14 @@ out: static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, struct userfaultfd_wait_queue *ewq) { + struct userfaultfd_ctx *release_new_ctx; + if (WARN_ON_ONCE(current->flags & PF_EXITING)) goto out; ewq->ctx = ctx; init_waitqueue_entry(&ewq->wq, current); + release_new_ctx = NULL; spin_lock(&ctx->event_wqh.lock); /* @@ -586,7 +589,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, set_current_state(TASK_KILLABLE); if (ewq->msg.event == 0) break; - if (ACCESS_ONCE(ctx->released) || + if (READ_ONCE(ctx->released) || fatal_signal_pending(current)) { /* * &ewq->wq may be queued in fork_event, but @@ -601,8 +604,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, new = (struct userfaultfd_ctx *) (unsigned long) ewq->msg.arg.reserved.reserved1; - - userfaultfd_ctx_put(new); + release_new_ctx = new; } break; } @@ -617,6 +619,20 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, __set_current_state(TASK_RUNNING); spin_unlock(&ctx->event_wqh.lock); + if (release_new_ctx) { + struct vm_area_struct *vma; + struct mm_struct *mm = release_new_ctx->mm; + + /* the various vma->vm_userfaultfd_ctx still points to it */ + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + up_write(&mm->mmap_sem); + + userfaultfd_ctx_put(release_new_ctx); + } + /* * ctx may go away after this if the userfault pseudo fd is * already released. @@ -668,7 +684,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->features = octx->features; ctx->released = false; ctx->mm = vma->vm_mm; - atomic_inc(&ctx->mm->mm_count); + mmgrab(ctx->mm); userfaultfd_ctx_get(octx); fctx->orig = octx; @@ -833,7 +849,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - ACCESS_ONCE(ctx->released) = true; + WRITE_ONCE(ctx->released, true); if (!mmget_not_zero(mm)) goto wakeup; diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 1b98cfa342ab..f42fcf1b5465 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -71,6 +71,23 @@ config XFS_RT If unsure, say N. +config XFS_ONLINE_SCRUB + bool "XFS online metadata check support" + default n + depends on XFS_FS + help + If you say Y here you will be able to check metadata on a + mounted XFS filesystem. This feature is intended to reduce + filesystem downtime by supplementing xfs_repair. The key + advantage here is to look for problems proactively so that + they can be dealt with in a controlled manner. + + This feature is considered EXPERIMENTAL. Use with caution! + + See the xfs_scrub man page in section 8 for additional information. + + If unsure, say N. + config XFS_WARN bool "XFS Verbose Warnings" depends on XFS_FS && !XFS_DEBUG diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index a6e955bfead8..7ceb41a9786a 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -49,6 +49,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_dquot_buf.o \ xfs_ialloc.o \ xfs_ialloc_btree.o \ + xfs_iext_tree.o \ xfs_inode_fork.o \ xfs_inode_buf.o \ xfs_log_rlimit.o \ @@ -135,3 +136,31 @@ xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o + +# online scrub/repair +ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) + +# Tracepoints like to blow up, so build that before everything else + +xfs-y += $(addprefix scrub/, \ + trace.o \ + agheader.o \ + alloc.o \ + attr.o \ + bmap.o \ + btree.o \ + common.o \ + dabtree.o \ + dir.o \ + ialloc.o \ + inode.o \ + parent.o \ + refcount.o \ + rmap.o \ + scrub.o \ + symlink.o \ + ) + +xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o +xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o +endif diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 4d85992d75b2..4b87472f35bc 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -104,7 +104,7 @@ kmem_zone_init(int size, char *zone_name) } static inline kmem_zone_t * -kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, +kmem_zone_init_flags(int size, char *zone_name, slab_flags_t flags, void (*construct)(void *)) { return kmem_cache_create(zone_name, size, 0, flags, construct); @@ -119,8 +119,7 @@ kmem_zone_free(kmem_zone_t *zone, void *ptr) static inline void kmem_zone_destroy(kmem_zone_t *zone) { - if (zone) - kmem_cache_destroy(zone); + kmem_cache_destroy(zone); } extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t); diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index df3e600835e8..2291f4224e24 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -27,6 +27,7 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_alloc.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index f965ce832bc0..83ed7715f856 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -31,6 +31,7 @@ #include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_cksum.h" #include "xfs_trace.h" @@ -701,7 +702,7 @@ xfs_alloc_ag_vextent( ASSERT(args->agbno % args->alignment == 0); /* if not file data, insert new block into the reverse map btree */ - if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) { + if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { error = xfs_rmap_alloc(args->tp, args->agbp, args->agno, args->agbno, args->len, &args->oinfo); if (error) @@ -1681,7 +1682,7 @@ xfs_free_ag_extent( bno_cur = cnt_cur = NULL; mp = tp->t_mountp; - if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) { + if (!xfs_rmap_should_skip_owner_update(oinfo)) { error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo); if (error) goto error0; @@ -2931,3 +2932,52 @@ xfs_alloc_query_all( query.fn = fn; return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); } + +/* Find the size of the AG, in blocks. */ +xfs_agblock_t +xfs_ag_block_count( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + ASSERT(agno < mp->m_sb.sb_agcount); + + if (agno < mp->m_sb.sb_agcount - 1) + return mp->m_sb.sb_agblocks; + return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks); +} + +/* + * Verify that an AG block number pointer neither points outside the AG + * nor points at static metadata. + */ +bool +xfs_verify_agbno( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t agbno) +{ + xfs_agblock_t eoag; + + eoag = xfs_ag_block_count(mp, agno); + if (agbno >= eoag) + return false; + if (agbno <= XFS_AGFL_BLOCK(mp)) + return false; + return true; +} + +/* + * Verify that an FS block number pointer neither points outside the + * filesystem nor points at static AG metadata. + */ +bool +xfs_verify_fsbno( + struct xfs_mount *mp, + xfs_fsblock_t fsbno) +{ + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); + + if (agno >= mp->m_sb.sb_agcount) + return false; + return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno)); +} diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index ef26edc2e938..7ba2d129d504 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -232,5 +232,9 @@ int xfs_alloc_query_range(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, void *priv); int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, void *priv); +xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); +bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno); +bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 6249c92671de..a76914db72ef 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -212,6 +212,7 @@ xfs_attr_set( int flags) { struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *leaf_bp = NULL; struct xfs_da_args args; struct xfs_defer_ops dfops; struct xfs_trans_res tres; @@ -327,9 +328,16 @@ xfs_attr_set( * GROT: another possible req'mt for a double-split btree op. */ xfs_defer_init(args.dfops, args.firstblock); - error = xfs_attr_shortform_to_leaf(&args); + error = xfs_attr_shortform_to_leaf(&args, &leaf_bp); if (error) goto out_defer_cancel; + /* + * Prevent the leaf buffer from being unlocked so that a + * concurrent AIL push cannot grab the half-baked leaf + * buffer and run into problems with the write verifier. + */ + xfs_trans_bhold(args.trans, leaf_bp); + xfs_defer_bjoin(args.dfops, leaf_bp); xfs_defer_ijoin(args.dfops, dp); error = xfs_defer_finish(&args.trans, args.dfops); if (error) @@ -337,13 +345,14 @@ xfs_attr_set( /* * Commit the leaf transformation. We'll need another (linked) - * transaction to add the new attribute to the leaf. + * transaction to add the new attribute to the leaf, which + * means that we have to hold & join the leaf buffer here too. */ - error = xfs_trans_roll_inode(&args.trans, dp); if (error) goto out; - + xfs_trans_bjoin(args.trans, leaf_bp); + leaf_bp = NULL; } if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) @@ -374,8 +383,9 @@ xfs_attr_set( out_defer_cancel: xfs_defer_cancel(&dfops); - args.trans = NULL; out: + if (leaf_bp) + xfs_trans_brelse(args.trans, leaf_bp); if (args.trans) xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 5c16db86b38f..601eaa36f1ad 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -397,13 +397,9 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) /* rounded down */ offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; - switch (dp->i_d.di_format) { - case XFS_DINODE_FMT_DEV: + if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) { minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; return (offset >= minforkoff) ? minforkoff : 0; - case XFS_DINODE_FMT_UUID: - minforkoff = roundup(sizeof(uuid_t), 8) >> 3; - return (offset >= minforkoff) ? minforkoff : 0; } /* @@ -739,10 +735,13 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) } /* - * Convert from using the shortform to the leaf. + * Convert from using the shortform to the leaf. On success, return the + * buffer so that we can keep it locked until we're totally done with it. */ int -xfs_attr_shortform_to_leaf(xfs_da_args_t *args) +xfs_attr_shortform_to_leaf( + struct xfs_da_args *args, + struct xfs_buf **leaf_bp) { xfs_inode_t *dp; xfs_attr_shortform_t *sf; @@ -822,7 +821,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } error = 0; - + *leaf_bp = bp; out: kmem_free(tmpbuffer); return error; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index f7dda0c237b0..894124efb421 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -48,7 +48,8 @@ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); -int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, + struct xfs_buf **leaf_bp); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 89263797cf32..1bddbba6b80c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -38,6 +38,7 @@ #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" #include "xfs_rtalloc.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" @@ -112,28 +113,21 @@ xfs_bmap_compute_maxlevels( STATIC int /* error */ xfs_bmbt_lookup_eq( struct xfs_btree_cur *cur, - xfs_fileoff_t off, - xfs_fsblock_t bno, - xfs_filblks_t len, + struct xfs_bmbt_irec *irec, int *stat) /* success/failure */ { - cur->bc_rec.b.br_startoff = off; - cur->bc_rec.b.br_startblock = bno; - cur->bc_rec.b.br_blockcount = len; + cur->bc_rec.b = *irec; return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); } STATIC int /* error */ -xfs_bmbt_lookup_ge( +xfs_bmbt_lookup_first( struct xfs_btree_cur *cur, - xfs_fileoff_t off, - xfs_fsblock_t bno, - xfs_filblks_t len, int *stat) /* success/failure */ { - cur->bc_rec.b.br_startoff = off; - cur->bc_rec.b.br_startblock = bno; - cur->bc_rec.b.br_blockcount = len; + cur->bc_rec.b.br_startoff = 0; + cur->bc_rec.b.br_startblock = 0; + cur->bc_rec.b.br_blockcount = 0; return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); } @@ -160,21 +154,17 @@ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) } /* - * Update the record referred to by cur to the value given - * by [off, bno, len, state]. + * Update the record referred to by cur to the value given by irec * This either works (return 0) or gets an EFSCORRUPTED error. */ STATIC int xfs_bmbt_update( struct xfs_btree_cur *cur, - xfs_fileoff_t off, - xfs_fsblock_t bno, - xfs_filblks_t len, - xfs_exntst_t state) + struct xfs_bmbt_irec *irec) { union xfs_btree_rec rec; - xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state); + xfs_bmbt_disk_set_all(&rec.bmbt, irec); return xfs_btree_update(cur, &rec); } @@ -242,7 +232,6 @@ xfs_bmap_forkoff_reset( { if (whichfork == XFS_ATTR_FORK && ip->i_d.di_format != XFS_DINODE_FMT_DEV && - ip->i_d.di_format != XFS_DINODE_FMT_UUID && ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; @@ -499,31 +488,6 @@ error_norelse: } /* - * Add bmap trace insert entries for all the contents of the extent records. - */ -void -xfs_bmap_trace_exlist( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork, /* data or attr or cow fork */ - unsigned long caller_ip) -{ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - int state = 0; - - if (whichfork == XFS_ATTR_FORK) - state |= BMAP_ATTRFORK; - else if (whichfork == XFS_COW_FORK) - state |= BMAP_COWFORK; - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(cnt == xfs_iext_count(ifp)); - for (idx = 0; idx < cnt; idx++) - trace_xfs_extlist(ip, idx, state, caller_ip); -} - -/* * Validate that the bmbt_irecs being returned from bmapi are valid * given the caller's original parameters. Specifically check the * ranges of the returned irecs to ensure that they only extend beyond @@ -657,8 +621,8 @@ xfs_bmap_btree_to_extents( cbno = be64_to_cpu(*pp); *logflagsp = 0; #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, cbno, 1))) - return error; + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, + xfs_btree_check_lptr(cur, cbno, 1)); #endif error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); @@ -703,14 +667,14 @@ xfs_bmap_extents_to_btree( xfs_bmbt_rec_t *arp; /* child record pointer */ struct xfs_btree_block *block; /* btree root block */ xfs_btree_cur_t *cur; /* bmap btree cursor */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ int error; /* error return value */ - xfs_extnum_t i, cnt; /* extent record index */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_key_t *kp; /* root block key pointer */ xfs_mount_t *mp; /* mount structure */ - xfs_extnum_t nextents; /* number of file extents */ xfs_bmbt_ptr_t *pp; /* root block address pointer */ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec rec; + xfs_extnum_t cnt = 0; mp = ip->i_mount; ASSERT(whichfork != XFS_COW_FORK); @@ -789,15 +753,12 @@ xfs_bmap_extents_to_btree( XFS_BTNUM_BMAP, 0, 0, ip->i_ino, XFS_BTREE_LONG_PTRS); - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - nextents = xfs_iext_count(ifp); - for (cnt = i = 0; i < nextents; i++) { - ep = xfs_iext_get_ext(ifp, i); - if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { - arp->l0 = cpu_to_be64(ep->l0); - arp->l1 = cpu_to_be64(ep->l1); - arp++; cnt++; - } + for_each_xfs_iext(ifp, &icur, &rec) { + if (isnullstartblock(rec.br_startblock)) + continue; + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1 + cnt); + xfs_bmbt_disk_set_all(arp, &rec); + cnt++; } ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); xfs_btree_set_numrecs(ablock, cnt); @@ -845,6 +806,8 @@ xfs_bmap_local_to_extents_empty( xfs_bmap_forkoff_reset(ip, whichfork); ifp->if_flags &= ~XFS_IFINLINE; ifp->if_flags |= XFS_IFEXTENTS; + ifp->if_u1.if_root = NULL; + ifp->if_height = 0; XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); } @@ -868,6 +831,7 @@ xfs_bmap_local_to_extents( xfs_alloc_arg_t args; /* allocation arguments */ xfs_buf_t *bp; /* buffer for extent block */ struct xfs_bmbt_irec rec; + struct xfs_iext_cursor icur; /* * We don't want to deal with the case of keeping inode data inline yet. @@ -885,8 +849,7 @@ xfs_bmap_local_to_extents( flags = 0; error = 0; - ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == - XFS_IFINLINE); + ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS)) == XFS_IFINLINE); memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; @@ -930,15 +893,16 @@ xfs_bmap_local_to_extents( xfs_bmap_local_to_extents_empty(ip, whichfork); flags |= XFS_ILOG_CORE; + ifp->if_u1.if_root = NULL; + ifp->if_height = 0; + rec.br_startoff = 0; rec.br_startblock = args.fsbno; rec.br_blockcount = 1; rec.br_state = XFS_EXT_NORM; - xfs_iext_insert(ip, 0, 1, &rec, 0); + xfs_iext_first(ifp, &icur); + xfs_iext_insert(ip, &icur, &rec, 0); - trace_xfs_bmap_post_update(ip, 0, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, - _THIS_IP_); XFS_IFORK_NEXT_SET(ip, whichfork, 1); ip->i_d.di_nblocks = 1; xfs_trans_mod_dquot_byino(tp, ip, @@ -973,7 +937,8 @@ xfs_bmap_add_attrfork_btree( cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); cur->bc_private.b.dfops = dfops; cur->bc_private.b.firstblock = *firstblock; - if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) + error = xfs_bmbt_lookup_first(cur, &stat); + if (error) goto error0; /* must be at least one entry */ XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0); @@ -1124,9 +1089,6 @@ xfs_bmap_add_attrfork( case XFS_DINODE_FMT_DEV: ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; break; - case XFS_DINODE_FMT_UUID: - ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; - break; case XFS_DINODE_FMT_LOCAL: case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: @@ -1206,32 +1168,35 @@ trans_cancel: */ /* - * Read in the extents to if_extents. - * All inode fields are set up by caller, we just traverse the btree - * and copy the records in. If the file system cannot contain unwritten - * extents, the records are checked for no "state" flags. + * Read in extents from a btree-format inode. */ -int /* error */ -xfs_bmap_read_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ +int +xfs_iread_extents( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork) { - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_buf_t *bp; /* buffer for "block" */ - int error; /* error return value */ - xfs_extnum_t i, j; /* index into the extents list */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - /* REFERENCED */ - xfs_extnum_t room; /* number of entries there's room for */ + struct xfs_mount *mp = ip->i_mount; + int state = xfs_bmap_fork_to_state(whichfork); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_extnum_t nextents = XFS_IFORK_NEXTENTS(ip, whichfork); + struct xfs_btree_block *block = ifp->if_broot; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec new; + xfs_fsblock_t bno; + struct xfs_buf *bp; + xfs_extnum_t i, j; + int level; + __be64 *pp; + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - block = ifp->if_broot; /* * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. */ @@ -1248,21 +1213,23 @@ xfs_bmap_read_extents( error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) - return error; + goto out; block = XFS_BUF_TO_BLOCK(bp); if (level == 0) break; pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); XFS_WANT_CORRUPTED_GOTO(mp, - XFS_FSB_SANITY_CHECK(mp, bno), error0); + XFS_FSB_SANITY_CHECK(mp, bno), out_brelse); xfs_trans_brelse(tp, bp); } + /* * Here with bp and block set to the leftmost leaf node in the tree. */ - room = xfs_iext_count(ifp); i = 0; + xfs_iext_first(ifp, &icur); + /* * Loop over all leaf nodes. Copy information to the extent records. */ @@ -1272,14 +1239,15 @@ xfs_bmap_read_extents( xfs_extnum_t num_recs; num_recs = xfs_btree_get_numrecs(block); - if (unlikely(i + num_recs > room)) { - ASSERT(i + num_recs <= room); + if (unlikely(i + num_recs > nextents)) { + ASSERT(i + num_recs <= nextents); xfs_warn(ip->i_mount, "corrupt dinode %Lu, (btree extents).", (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, ip->i_mount, block); - goto error0; + error = -EFSCORRUPTED; + goto out_brelse; } /* * Read-ahead the next leaf block, if any. @@ -1292,15 +1260,17 @@ xfs_bmap_read_extents( * Copy records into the extent records. */ frp = XFS_BMBT_REC_ADDR(mp, block, 1); - for (j = 0; j < num_recs; j++, i++, frp++) { - xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); - trp->l0 = be64_to_cpu(frp->l0); - trp->l1 = be64_to_cpu(frp->l1); - if (!xfs_bmbt_validate_extent(mp, whichfork, trp)) { + for (j = 0; j < num_recs; j++, frp++, i++) { + xfs_bmbt_disk_get_all(frp, &new); + if (!xfs_bmbt_validate_extent(mp, whichfork, &new)) { XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", XFS_ERRLEVEL_LOW, mp); - goto error0; + error = -EFSCORRUPTED; + goto out_brelse; } + xfs_iext_insert(ip, &icur, &new, state); + trace_xfs_read_extent(ip, &icur, state, _THIS_IP_); + xfs_iext_next(ifp, &icur); } xfs_trans_brelse(tp, bp); bno = nextbno; @@ -1312,71 +1282,74 @@ xfs_bmap_read_extents( error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) - return error; + goto out; block = XFS_BUF_TO_BLOCK(bp); } - if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) - return -EFSCORRUPTED; + + if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) { + error = -EFSCORRUPTED; + goto out; + } ASSERT(i == xfs_iext_count(ifp)); - XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); + + ifp->if_flags |= XFS_IFEXTENTS; return 0; -error0: + +out_brelse: xfs_trans_brelse(tp, bp); - return -EFSCORRUPTED; +out: + xfs_iext_destroy(ifp); + return error; } /* - * Returns the file-relative block number of the first unused block(s) - * in the file with at least "len" logically contiguous blocks free. - * This is the lowest-address hole if the file has holes, else the first block - * past the end of file. - * Return 0 if the file is currently local (in-inode). + * Returns the relative block number of the first unused block(s) in the given + * fork with at least "len" logically contiguous blocks free. This is the + * lowest-address hole if the fork has holes, else the first block past the end + * of fork. Return 0 if the fork is currently local (in-inode). */ int /* error */ xfs_bmap_first_unused( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_extlen_t len, /* size of hole to find */ - xfs_fileoff_t *first_unused, /* unused block */ - int whichfork) /* data or attr fork */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *first_unused, /* unused block */ + int whichfork) /* data or attr fork */ { - int error; /* error return value */ - int idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_fileoff_t lastaddr; /* last block number seen */ - xfs_fileoff_t lowest; /* lowest useful block */ - xfs_fileoff_t max; /* starting useful block */ - xfs_extnum_t nextents; /* number of extent entries */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec got; + struct xfs_iext_cursor icur; + xfs_fileoff_t lastaddr = 0; + xfs_fileoff_t lowest, max; + int error; ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { *first_unused = 0; return 0; } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - lowest = *first_unused; - nextents = xfs_iext_count(ifp); - for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { - struct xfs_bmbt_irec got; - xfs_iext_get_extent(ifp, idx, &got); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + lowest = max = *first_unused; + for_each_xfs_iext(ifp, &icur, &got) { /* * See if the hole before this extent will work. */ if (got.br_startoff >= lowest + len && - got.br_startoff - max >= len) { - *first_unused = max; - return 0; - } + got.br_startoff - max >= len) + break; lastaddr = got.br_startoff + got.br_blockcount; max = XFS_FILEOFF_MAX(lastaddr, lowest); } + *first_unused = max; return 0; } @@ -1396,7 +1369,7 @@ xfs_bmap_last_before( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_bmbt_irec got; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; int error; switch (XFS_IFORK_FORMAT(ip, whichfork)) { @@ -1416,17 +1389,8 @@ xfs_bmap_last_before( return error; } - if (xfs_iext_lookup_extent(ip, ifp, *last_block - 1, &idx, &got)) { - if (got.br_startoff <= *last_block - 1) - return 0; - } - - if (xfs_iext_get_extent(ifp, idx - 1, &got)) { - *last_block = got.br_startoff + got.br_blockcount; - return 0; - } - - *last_block = 0; + if (!xfs_iext_lookup_extent_before(ip, ifp, last_block, &icur, &got)) + *last_block = 0; return 0; } @@ -1439,8 +1403,8 @@ xfs_bmap_last_extent( int *is_empty) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_iext_cursor icur; int error; - int nextents; if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); @@ -1448,14 +1412,11 @@ xfs_bmap_last_extent( return error; } - nextents = xfs_iext_count(ifp); - if (nextents == 0) { + xfs_iext_last(ifp, &icur); + if (!xfs_iext_get_extent(ifp, &icur, rec)) *is_empty = 1; - return 0; - } - - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); - *is_empty = 0; + else + *is_empty = 0; return 0; } @@ -1540,10 +1501,10 @@ xfs_bmap_one_block( xfs_inode_t *ip, /* incore inode */ int whichfork) /* data or attr fork */ { - xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ xfs_ifork_t *ifp; /* inode fork pointer */ int rval; /* return value */ xfs_bmbt_irec_t s; /* internal version of extent */ + struct xfs_iext_cursor icur; #ifndef DEBUG if (whichfork == XFS_DATA_FORK) @@ -1555,8 +1516,8 @@ xfs_bmap_one_block( return 0; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_get_all(ep, &s); + xfs_iext_first(ifp, &icur); + xfs_iext_get_extent(ifp, &icur, &s); rval = s.br_startoff == 0 && s.br_blockcount == 1; if (rval && whichfork == XFS_DATA_FORK) ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); @@ -1576,8 +1537,6 @@ xfs_bmap_add_extent_delay_real( int whichfork) { struct xfs_bmbt_irec *new = &bma->got; - int diff; /* temp value */ - xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ int error; /* error return value */ int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ @@ -1585,14 +1544,14 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = 0;/* state bits, accessed thru macros */ + int state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t da_new; /* new count del alloc blocks used */ xfs_filblks_t da_old; /* old count del alloc blocks used */ xfs_filblks_t temp=0; /* value for da_new calculations */ - xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ struct xfs_mount *mp; xfs_extnum_t *nextents; + struct xfs_bmbt_irec old; mp = bma->ip->i_mount; ifp = XFS_IFORK_PTR(bma->ip, whichfork); @@ -1600,8 +1559,6 @@ xfs_bmap_add_extent_delay_real( nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents : &bma->ip->i_d.di_nextents); - ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); @@ -1612,15 +1569,12 @@ xfs_bmap_add_extent_delay_real( #define RIGHT r[1] #define PREV r[2] - if (whichfork == XFS_COW_FORK) - state |= BMAP_COWFORK; - /* * Set up a bunch of variables to make the tests simpler. */ - ep = xfs_iext_get_ext(ifp, bma->idx); - xfs_bmbt_get_all(ep, &PREV); + xfs_iext_get_extent(ifp, &bma->icur, &PREV); new_endoff = new->br_startoff + new->br_blockcount; + ASSERT(isnullstartblock(PREV.br_startblock)); ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); @@ -1640,10 +1594,8 @@ xfs_bmap_add_extent_delay_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (bma->idx > 0) { + if (xfs_iext_peek_prev_extent(ifp, &bma->icur, &LEFT)) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT); - if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -1660,10 +1612,8 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (bma->idx < xfs_iext_count(ifp) - 1) { + if (xfs_iext_peek_next_extent(ifp, &bma->icur, &RIGHT)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); - if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -1693,22 +1643,19 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. */ - bma->idx--; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), - LEFT.br_blockcount + PREV.br_blockcount + - RIGHT.br_blockcount); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - - xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); + LEFT.br_blockcount += PREV.br_blockcount + RIGHT.br_blockcount; + + xfs_iext_remove(bma->ip, &bma->icur, state); + xfs_iext_remove(bma->ip, &bma->icur, state); + xfs_iext_prev(ifp, &bma->icur); + xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT); (*nextents)--; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, - RIGHT.br_startblock, - RIGHT.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); @@ -1720,11 +1667,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, - LEFT.br_startblock, - LEFT.br_blockcount + - PREV.br_blockcount + - RIGHT.br_blockcount, LEFT.br_state); + error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; } @@ -1735,28 +1678,22 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. */ - bma->idx--; + old = LEFT; + LEFT.br_blockcount += PREV.br_blockcount; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), - LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_iext_remove(bma->ip, &bma->icur, state); + xfs_iext_prev(ifp, &bma->icur); + xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT); - xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, - LEFT.br_startblock, LEFT.br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, - LEFT.br_startblock, - LEFT.br_blockcount + - PREV.br_blockcount, LEFT.br_state); + error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; } @@ -1767,27 +1704,23 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_startblock(ep, new->br_startblock); - xfs_bmbt_set_blockcount(ep, - PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + PREV.br_startblock = new->br_startblock; + PREV.br_blockcount += RIGHT.br_blockcount; + + xfs_iext_next(ifp, &bma->icur); + xfs_iext_remove(bma->ip, &bma->icur, state); + xfs_iext_prev(ifp, &bma->icur); + xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); - xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, - RIGHT.br_startblock, - RIGHT.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, PREV.br_startoff, - new->br_startblock, - PREV.br_blockcount + - RIGHT.br_blockcount, PREV.br_state); + error = xfs_bmbt_update(bma->cur, &PREV); if (error) goto done; } @@ -1799,23 +1732,19 @@ xfs_bmap_add_extent_delay_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_startblock(ep, new->br_startblock); - xfs_bmbt_set_state(ep, new->br_state); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + PREV.br_startblock = new->br_startblock; + PREV.br_state = new->br_state; + xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; @@ -1828,40 +1757,33 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. */ - trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1), - LEFT.br_blockcount + new->br_blockcount); - xfs_bmbt_set_startoff(ep, - PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_); - + old = LEFT; temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), + startblockval(PREV.br_startblock)); + + LEFT.br_blockcount += new->br_blockcount; + + PREV.br_blockcount = temp; + PREV.br_startoff += new->br_blockcount; + PREV.br_startblock = nullstartblock(da_new); + + xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); + xfs_iext_prev(ifp, &bma->icur); + xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, - LEFT.br_startblock, LEFT.br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, - LEFT.br_startblock, - LEFT.br_blockcount + - new->br_blockcount, - LEFT.br_state); + error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; } - da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), - startblockval(PREV.br_startblock)); - xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - - bma->idx--; break; case BMAP_LEFT_FILLING: @@ -1869,23 +1791,16 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_startoff(ep, new_endoff); - temp = PREV.br_blockcount - new->br_blockcount; - xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + xfs_iext_update_extent(bma->ip, state, &bma->icur, new); (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; @@ -1900,12 +1815,18 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + + temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, bma->idx + 1); - xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + + PREV.br_startoff = new_endoff; + PREV.br_blockcount = temp; + PREV.br_startblock = nullstartblock(da_new); + xfs_iext_next(ifp, &bma->icur); + xfs_iext_insert(bma->ip, &bma->icur, &PREV, state); + xfs_iext_prev(ifp, &bma->icur); break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -1913,40 +1834,34 @@ xfs_bmap_add_extent_delay_real( * Filling in the last part of a previous delayed allocation. * The right neighbor is contiguous with the new allocation. */ - temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1), - new->br_startoff, new->br_startblock, - new->br_blockcount + RIGHT.br_blockcount, - RIGHT.br_state); - trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + old = RIGHT; + RIGHT.br_startoff = new->br_startoff; + RIGHT.br_startblock = new->br_startblock; + RIGHT.br_blockcount += new->br_blockcount; + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, - RIGHT.br_startblock, - RIGHT.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, new->br_startoff, - new->br_startblock, - new->br_blockcount + - RIGHT.br_blockcount, - RIGHT.br_state); + error = xfs_bmbt_update(bma->cur, &RIGHT); if (error) goto done; } + temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock)); - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - bma->idx++; + PREV.br_blockcount = temp; + PREV.br_startblock = nullstartblock(da_new); + + xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); + xfs_iext_next(ifp, &bma->icur); + xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT); break; case BMAP_RIGHT_FILLING: @@ -1954,22 +1869,16 @@ xfs_bmap_add_extent_delay_real( * Filling in the last part of a previous delayed allocation. * The right neighbor is not contiguous. */ - temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); + xfs_iext_update_extent(bma->ip, state, &bma->icur, new); (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; @@ -1984,14 +1893,16 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + + temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, bma->idx); - xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - bma->idx++; + PREV.br_startblock = nullstartblock(da_new); + PREV.br_blockcount = temp; + xfs_iext_insert(bma->ip, &bma->icur, &PREV, state); + xfs_iext_next(ifp, &bma->icur); break; case 0: @@ -2015,30 +1926,40 @@ xfs_bmap_add_extent_delay_real( * PREV @ idx LEFT RIGHT * inserted at idx + 1 */ - temp = new->br_startoff - PREV.br_startoff; - temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ + old = PREV; + + /* LEFT is the new middle */ LEFT = *new; + + /* RIGHT is the new right */ RIGHT.br_state = PREV.br_state; - RIGHT.br_startblock = nullstartblock( - (int)xfs_bmap_worst_indlen(bma->ip, temp2)); RIGHT.br_startoff = new_endoff; - RIGHT.br_blockcount = temp2; - /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ - xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); + RIGHT.br_blockcount = + PREV.br_startoff + PREV.br_blockcount - new_endoff; + RIGHT.br_startblock = + nullstartblock(xfs_bmap_worst_indlen(bma->ip, + RIGHT.br_blockcount)); + + /* truncate PREV */ + PREV.br_blockcount = new->br_startoff - PREV.br_startoff; + PREV.br_startblock = + nullstartblock(xfs_bmap_worst_indlen(bma->ip, + PREV.br_blockcount)); + xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); + + xfs_iext_next(ifp, &bma->icur); + xfs_iext_insert(bma->ip, &bma->icur, &RIGHT, state); + xfs_iext_insert(bma->ip, &bma->icur, &LEFT, state); (*nextents)++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; @@ -2053,30 +1974,9 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } - temp = xfs_bmap_worst_indlen(bma->ip, temp); - temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); - diff = (int)(temp + temp2 - - (startblockval(PREV.br_startblock) - - (bma->cur ? - bma->cur->bc_private.b.allocated : 0))); - if (diff > 0) { - error = xfs_mod_fdblocks(bma->ip->i_mount, - -((int64_t)diff), false); - ASSERT(!error); - if (error) - goto done; - } - - ep = xfs_iext_get_ext(ifp, bma->idx); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2), - nullstartblock((int)temp2)); - trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_); - bma->idx++; - da_new = temp + temp2; + da_new = startblockval(PREV.br_startblock) + + startblockval(RIGHT.br_startblock); break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: @@ -2110,19 +2010,17 @@ xfs_bmap_add_extent_delay_real( goto done; } - /* adjust for changes in reserved delayed indirect blocks */ - if (da_old || da_new) { - temp = da_new; - if (bma->cur) - temp += bma->cur->bc_private.b.allocated; - if (temp < da_old) - xfs_mod_fdblocks(bma->ip->i_mount, - (int64_t)(da_old - temp), false); + if (bma->cur) { + da_new += bma->cur->bc_private.b.allocated; + bma->cur->bc_private.b.allocated = 0; } - /* clear out the allocated field, done with it now in any case. */ - if (bma->cur) - bma->cur->bc_private.b.allocated = 0; + /* adjust for changes in reserved delayed indirect blocks */ + if (da_new != da_old) { + ASSERT(state == 0 || da_new < da_old); + error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), + false); + } xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: @@ -2142,7 +2040,7 @@ xfs_bmap_add_extent_unwritten_real( struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ int whichfork, - xfs_extnum_t *idx, /* extent number to update/insert */ + struct xfs_iext_cursor *icur, xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ xfs_fsblock_t *first, /* pointer to firstblock variable */ @@ -2150,28 +2048,22 @@ xfs_bmap_add_extent_unwritten_real( int *logflagsp) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ - xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ int error; /* error return value */ int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_fileoff_t new_endoff; /* end offset of new entry */ - xfs_exntst_t newext; /* new extent state */ - xfs_exntst_t oldext; /* old extent state */ xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = 0;/* state bits, accessed thru macros */ + int state = xfs_bmap_fork_to_state(whichfork); struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec old; *logflagsp = 0; cur = *curp; ifp = XFS_IFORK_PTR(ip, whichfork); - if (whichfork == XFS_COW_FORK) - state |= BMAP_COWFORK; - ASSERT(*idx >= 0); - ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2184,12 +2076,8 @@ xfs_bmap_add_extent_unwritten_real( * Set up a bunch of variables to make the tests simpler. */ error = 0; - ep = xfs_iext_get_ext(ifp, *idx); - xfs_bmbt_get_all(ep, &PREV); - newext = new->br_state; - oldext = (newext == XFS_EXT_UNWRITTEN) ? - XFS_EXT_NORM : XFS_EXT_UNWRITTEN; - ASSERT(PREV.br_state == oldext); + xfs_iext_get_extent(ifp, icur, &PREV); + ASSERT(new->br_state != PREV.br_state); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); @@ -2207,10 +2095,8 @@ xfs_bmap_add_extent_unwritten_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (*idx > 0) { + if (xfs_iext_peek_prev_extent(ifp, icur, &LEFT)) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); - if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -2218,7 +2104,7 @@ xfs_bmap_add_extent_unwritten_real( if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && - LEFT.br_state == newext && + LEFT.br_state == new->br_state && LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) state |= BMAP_LEFT_CONTIG; @@ -2227,9 +2113,8 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < xfs_iext_count(ifp) - 1) { + if (xfs_iext_peek_next_extent(ifp, icur, &RIGHT)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -2237,7 +2122,7 @@ xfs_bmap_add_extent_unwritten_real( if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && - newext == RIGHT.br_state && + new->br_state == RIGHT.br_state && new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != @@ -2258,24 +2143,20 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The left and right neighbors are both contiguous with new. */ - --*idx; - - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), - LEFT.br_blockcount + PREV.br_blockcount + - RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + LEFT.br_blockcount += PREV.br_blockcount + RIGHT.br_blockcount; - xfs_iext_remove(ip, *idx + 1, 2, state); + xfs_iext_remove(ip, icur, state); + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &LEFT); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, - RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) @@ -2290,10 +2171,8 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, - LEFT.br_startblock, - LEFT.br_blockcount + PREV.br_blockcount + - RIGHT.br_blockcount, LEFT.br_state))) + error = xfs_bmbt_update(cur, &LEFT); + if (error) goto done; } break; @@ -2303,23 +2182,19 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - --*idx; + LEFT.br_blockcount += PREV.br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), - LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - xfs_iext_remove(ip, *idx + 1, 1, state); + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &LEFT); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, - PREV.br_startblock, PREV.br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, &PREV, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) @@ -2328,10 +2203,8 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, - LEFT.br_startblock, - LEFT.br_blockcount + PREV.br_blockcount, - LEFT.br_state))) + error = xfs_bmbt_update(cur, &LEFT); + if (error) goto done; } break; @@ -2341,21 +2214,22 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, - PREV.br_blockcount + RIGHT.br_blockcount); - xfs_bmbt_set_state(ep, newext); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + PREV.br_blockcount += RIGHT.br_blockcount; + PREV.br_state = new->br_state; + + xfs_iext_next(ifp, icur); + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &PREV); + XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, - RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) @@ -2364,10 +2238,8 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, new->br_startoff, - new->br_startblock, - new->br_blockcount + RIGHT.br_blockcount, - newext))) + error = xfs_bmbt_update(cur, &PREV); + if (error) goto done; } break; @@ -2378,22 +2250,19 @@ xfs_bmap_add_extent_unwritten_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_state(ep, newext); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + PREV.br_state = new->br_state; + xfs_iext_update_extent(ip, state, icur, &PREV); if (cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, new, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - newext))) + error = xfs_bmbt_update(cur, &PREV); + if (error) goto done; } break; @@ -2403,43 +2272,32 @@ xfs_bmap_add_extent_unwritten_real( * Setting the first part of a previous oldext extent to newext. * The left neighbor is contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), - LEFT.br_blockcount + new->br_blockcount); - xfs_bmbt_set_startoff(ep, - PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); - - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_startblock(ep, - new->br_startblock + new->br_blockcount); - xfs_bmbt_set_blockcount(ep, - PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - --*idx; + LEFT.br_blockcount += new->br_blockcount; + + old = PREV; + PREV.br_startoff += new->br_blockcount; + PREV.br_startblock += new->br_blockcount; + PREV.br_blockcount -= new->br_blockcount; + + xfs_iext_update_extent(ip, state, icur, &PREV); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &LEFT); if (cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, - PREV.br_startblock, PREV.br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, &old, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, - PREV.br_startoff + new->br_blockcount, - PREV.br_startblock + new->br_blockcount, - PREV.br_blockcount - new->br_blockcount, - oldext))) + error = xfs_bmbt_update(cur, &PREV); + if (error) goto done; - if ((error = xfs_btree_decrement(cur, 0, &i))) + error = xfs_btree_decrement(cur, 0, &i); + if (error) goto done; - error = xfs_bmbt_update(cur, LEFT.br_startoff, - LEFT.br_startblock, - LEFT.br_blockcount + new->br_blockcount, - LEFT.br_state); + error = xfs_bmbt_update(cur, &LEFT); if (error) goto done; } @@ -2450,32 +2308,25 @@ xfs_bmap_add_extent_unwritten_real( * Setting the first part of a previous oldext extent to newext. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); - xfs_bmbt_set_startoff(ep, new_endoff); - xfs_bmbt_set_blockcount(ep, - PREV.br_blockcount - new->br_blockcount); - xfs_bmbt_set_startblock(ep, - new->br_startblock + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - xfs_iext_insert(ip, *idx, 1, new, state); + old = PREV; + PREV.br_startoff += new->br_blockcount; + PREV.br_startblock += new->br_blockcount; + PREV.br_blockcount -= new->br_blockcount; + + xfs_iext_update_extent(ip, state, icur, &PREV); + xfs_iext_insert(ip, icur, new, state); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, - PREV.br_startblock, PREV.br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, &old, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, - PREV.br_startoff + new->br_blockcount, - PREV.br_startblock + new->br_blockcount, - PREV.br_blockcount - new->br_blockcount, - oldext))) + error = xfs_bmbt_update(cur, &PREV); + if (error) goto done; cur->bc_rec.b = *new; if ((error = xfs_btree_insert(cur, &i))) @@ -2489,39 +2340,33 @@ xfs_bmap_add_extent_unwritten_real( * Setting the last part of a previous oldext extent to newext. * The right neighbor is contiguous with the new allocation. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, - PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + old = PREV; + PREV.br_blockcount -= new->br_blockcount; - ++*idx; + RIGHT.br_startoff = new->br_startoff; + RIGHT.br_startblock = new->br_startblock; + RIGHT.br_blockcount += new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), - new->br_startoff, new->br_startblock, - new->br_blockcount + RIGHT.br_blockcount, newext); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_update_extent(ip, state, icur, &PREV); + xfs_iext_next(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &RIGHT); if (cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, - PREV.br_startblock, - PREV.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(cur, &old, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, PREV.br_startoff, - PREV.br_startblock, - PREV.br_blockcount - new->br_blockcount, - oldext))) + error = xfs_bmbt_update(cur, &PREV); + if (error) goto done; - if ((error = xfs_btree_increment(cur, 0, &i))) + error = xfs_btree_increment(cur, 0, &i); + if (error) goto done; - if ((error = xfs_bmbt_update(cur, new->br_startoff, - new->br_startblock, - new->br_blockcount + RIGHT.br_blockcount, - newext))) + error = xfs_bmbt_update(cur, &RIGHT); + if (error) goto done; } break; @@ -2531,13 +2376,12 @@ xfs_bmap_add_extent_unwritten_real( * Setting the last part of a previous oldext extent to newext. * The right neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, - PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + old = PREV; + PREV.br_blockcount -= new->br_blockcount; - ++*idx; - xfs_iext_insert(ip, *idx, 1, new, state); + xfs_iext_update_extent(ip, state, icur, &PREV); + xfs_iext_next(ifp, icur); + xfs_iext_insert(ip, icur, new, state); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); @@ -2545,22 +2389,17 @@ xfs_bmap_add_extent_unwritten_real( rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, - PREV.br_startblock, PREV.br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, &old, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, PREV.br_startoff, - PREV.br_startblock, - PREV.br_blockcount - new->br_blockcount, - oldext))) + error = xfs_bmbt_update(cur, &PREV); + if (error) goto done; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, new, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); @@ -2573,20 +2412,20 @@ xfs_bmap_add_extent_unwritten_real( * newext. Contiguity is impossible here. * One extent becomes three extents. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, - new->br_startoff - PREV.br_startoff); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + old = PREV; + PREV.br_blockcount = new->br_startoff - PREV.br_startoff; r[0] = *new; r[1].br_startoff = new_endoff; r[1].br_blockcount = - PREV.br_startoff + PREV.br_blockcount - new_endoff; + old.br_startoff + old.br_blockcount - new_endoff; r[1].br_startblock = new->br_startblock + new->br_blockcount; - r[1].br_state = oldext; + r[1].br_state = PREV.br_state; - ++*idx; - xfs_iext_insert(ip, *idx, 2, &r[0], state); + xfs_iext_update_extent(ip, state, icur, &PREV); + xfs_iext_next(ifp, icur); + xfs_iext_insert(ip, icur, &r[1], state); + xfs_iext_insert(ip, icur, &r[0], state); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 2); @@ -2594,20 +2433,16 @@ xfs_bmap_add_extent_unwritten_real( rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, - PREV.br_startblock, PREV.br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, &old, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); /* new right extent - oldext */ - if ((error = xfs_bmbt_update(cur, r[1].br_startoff, - r[1].br_startblock, r[1].br_blockcount, - r[1].br_state))) + error = xfs_bmbt_update(cur, &r[1]); + if (error) goto done; /* new left extent - oldext */ cur->bc_rec.b = PREV; - cur->bc_rec.b.br_blockcount = - new->br_startoff - PREV.br_startoff; if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); @@ -2616,13 +2451,11 @@ xfs_bmap_add_extent_unwritten_real( * we are about to insert as we can't trust it after * the previous insert. */ - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, new, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); /* new middle extent - newext */ - cur->bc_rec.b.br_state = new->br_state; if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); @@ -2681,7 +2514,7 @@ STATIC void xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ int whichfork, - xfs_extnum_t *idx, /* extent number to update/insert */ + struct xfs_iext_cursor *icur, xfs_bmbt_irec_t *new) /* new data to add to file extents */ { xfs_ifork_t *ifp; /* inode fork pointer */ @@ -2689,22 +2522,17 @@ xfs_bmap_add_extent_hole_delay( xfs_filblks_t newlen=0; /* new indirect size */ xfs_filblks_t oldlen=0; /* old indirect size */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ - int state; /* state bits, accessed thru macros */ - xfs_filblks_t temp=0; /* temp for indirect calculations */ + int state = xfs_bmap_fork_to_state(whichfork); + xfs_filblks_t temp; /* temp for indirect calculations */ ifp = XFS_IFORK_PTR(ip, whichfork); - state = 0; - if (whichfork == XFS_COW_FORK) - state |= BMAP_COWFORK; ASSERT(isnullstartblock(new->br_startblock)); /* * Check and set flags if this segment has a left neighbor */ - if (*idx > 0) { + if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); - if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -2713,10 +2541,8 @@ xfs_bmap_add_extent_hole_delay( * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (*idx < xfs_iext_count(ifp)) { + if (xfs_iext_get_extent(ifp, icur, &right)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); - if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -2748,22 +2574,20 @@ xfs_bmap_add_extent_hole_delay( * on the left and on the right. * Merge all three into a single extent record. */ - --*idx; temp = left.br_blockcount + new->br_blockcount + right.br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock) + startblockval(right.br_startblock); newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), oldlen); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), - nullstartblock((int)newlen)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + left.br_startblock = nullstartblock(newlen); + left.br_blockcount = temp; - xfs_iext_remove(ip, *idx + 1, 1, state); + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); break; case BMAP_LEFT_CONTIG: @@ -2772,18 +2596,17 @@ xfs_bmap_add_extent_hole_delay( * on the left. * Merge the new allocation with the left neighbor. */ - --*idx; temp = left.br_blockcount + new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock); newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), oldlen); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), - nullstartblock((int)newlen)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + left.br_blockcount = temp; + left.br_startblock = nullstartblock(newlen); + + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); break; case BMAP_RIGHT_CONTIG: @@ -2792,16 +2615,15 @@ xfs_bmap_add_extent_hole_delay( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); temp = new->br_blockcount + right.br_blockcount; oldlen = startblockval(new->br_startblock) + startblockval(right.br_startblock); newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), oldlen); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), - new->br_startoff, - nullstartblock((int)newlen), temp, right.br_state); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + right.br_startoff = new->br_startoff; + right.br_startblock = nullstartblock(newlen); + right.br_blockcount = temp; + xfs_iext_update_extent(ip, state, icur, &right); break; case 0: @@ -2811,7 +2633,7 @@ xfs_bmap_add_extent_hole_delay( * Insert a new entry. */ oldlen = newlen = 0; - xfs_iext_insert(ip, *idx, 1, new, state); + xfs_iext_insert(ip, icur, new, state); break; } if (oldlen != newlen) { @@ -2832,7 +2654,7 @@ xfs_bmap_add_extent_hole_real( struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, - xfs_extnum_t *idx, + struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, xfs_fsblock_t *first, @@ -2847,27 +2669,19 @@ xfs_bmap_add_extent_hole_real( xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ - int state; /* state bits, accessed thru macros */ + int state = xfs_bmap_fork_to_state(whichfork); + struct xfs_bmbt_irec old; - ASSERT(*idx >= 0); - ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); - state = 0; - if (whichfork == XFS_ATTR_FORK) - state |= BMAP_ATTRFORK; - if (whichfork == XFS_COW_FORK) - state |= BMAP_COWFORK; - /* * Check and set flags if this segment has a left neighbor. */ - if (*idx > 0) { + if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -2876,9 +2690,8 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (*idx < xfs_iext_count(ifp)) { + if (xfs_iext_get_extent(ifp, icur, &right)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -2915,14 +2728,11 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), - left.br_blockcount + new->br_blockcount + - right.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + left.br_blockcount += new->br_blockcount + right.br_blockcount; - xfs_iext_remove(ip, *idx + 1, 1, state); + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); @@ -2930,9 +2740,7 @@ xfs_bmap_add_extent_hole_real( rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(cur, right.br_startoff, - right.br_startblock, right.br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(cur, &right, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); @@ -2944,12 +2752,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(cur, left.br_startoff, - left.br_startblock, - left.br_blockcount + - new->br_blockcount + - right.br_blockcount, - left.br_state); + error = xfs_bmbt_update(cur, &left); if (error) goto done; } @@ -2961,27 +2764,21 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), - left.br_blockcount + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + old = left; + left.br_blockcount += new->br_blockcount; + + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - error = xfs_bmbt_lookup_eq(cur, left.br_startoff, - left.br_startblock, left.br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(cur, left.br_startoff, - left.br_startblock, - left.br_blockcount + - new->br_blockcount, - left.br_state); + error = xfs_bmbt_update(cur, &left); if (error) goto done; } @@ -2993,29 +2790,22 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), - new->br_startoff, new->br_startblock, - new->br_blockcount + right.br_blockcount, - right.br_state); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + old = right; + + right.br_startoff = new->br_startoff; + right.br_startblock = new->br_startblock; + right.br_blockcount += new->br_blockcount; + xfs_iext_update_extent(ip, state, icur, &right); if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - error = xfs_bmbt_lookup_eq(cur, - right.br_startoff, - right.br_startblock, - right.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(cur, new->br_startoff, - new->br_startblock, - new->br_blockcount + - right.br_blockcount, - right.br_state); + error = xfs_bmbt_update(cur, &right); if (error) goto done; } @@ -3027,21 +2817,17 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - xfs_iext_insert(ip, *idx, 1, new, state); + xfs_iext_insert(ip, icur, new, state); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(cur, - new->br_startoff, - new->br_startblock, - new->br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, new, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - cur->bc_rec.b.br_state = new->br_state; error = xfs_btree_insert(cur, &i); if (error) goto done; @@ -3981,7 +3767,7 @@ xfs_bmapi_read( struct xfs_bmbt_irec got; xfs_fileoff_t obno; xfs_fileoff_t end; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; int error; bool eof = false; int n = 0; @@ -4023,7 +3809,7 @@ xfs_bmapi_read( return error; } - if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) + if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) eof = true; end = bno + len; obno = bno; @@ -4055,7 +3841,7 @@ xfs_bmapi_read( break; /* Else go on to the next record. */ - if (!xfs_iext_get_extent(ifp, ++idx, &got)) + if (!xfs_iext_next_extent(ifp, &icur, &got)) eof = true; } *nmap = n; @@ -4083,7 +3869,7 @@ xfs_bmapi_reserve_delalloc( xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, - xfs_extnum_t *lastx, + struct xfs_iext_cursor *icur, int eof) { struct xfs_mount *mp = ip->i_mount; @@ -4113,7 +3899,7 @@ xfs_bmapi_reserve_delalloc( if (extsz) { struct xfs_bmbt_irec prev; - if (!xfs_iext_get_extent(ifp, *lastx - 1, &prev)) + if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) prev.br_startoff = NULLFILEOFF; error = xfs_bmap_extsize_align(mp, got, &prev, extsz, rt, eof, @@ -4162,7 +3948,7 @@ xfs_bmapi_reserve_delalloc( got->br_blockcount = alen; got->br_state = XFS_EXT_NORM; - xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); + xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); /* * Tag the inode if blocks were preallocated. Note that COW fork @@ -4207,10 +3993,7 @@ xfs_bmapi_allocate( if (bma->wasdel) { bma->length = (xfs_extlen_t)bma->got.br_blockcount; bma->offset = bma->got.br_startoff; - if (bma->idx) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), - &bma->prev); - } + xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev); } else { bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); if (!bma->eof) @@ -4295,7 +4078,7 @@ xfs_bmapi_allocate( error = xfs_bmap_add_extent_delay_real(bma, whichfork); else error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, - whichfork, &bma->idx, &bma->cur, &bma->got, + whichfork, &bma->icur, &bma->cur, &bma->got, bma->firstblock, bma->dfops, &bma->logflags); bma->logflags |= tmp_logflags; @@ -4307,7 +4090,7 @@ xfs_bmapi_allocate( * or xfs_bmap_add_extent_hole_real might have merged it into one of * the neighbouring ones. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + xfs_iext_get_extent(ifp, &bma->icur, &bma->got); ASSERT(bma->got.br_startoff <= bma->offset); ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= @@ -4365,8 +4148,8 @@ xfs_bmapi_convert_unwritten( } error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork, - &bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops, - &tmp_logflags); + &bma->icur, &bma->cur, mval, bma->firstblock, + bma->dfops, &tmp_logflags); /* * Log the inode core unconditionally in the unwritten extent conversion * path because the conversion might not have done so (e.g., if the @@ -4388,7 +4171,7 @@ xfs_bmapi_convert_unwritten( * xfs_bmap_add_extent_unwritten_real might have merged it into one * of the neighbouring ones. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + xfs_iext_get_extent(ifp, &bma->icur, &bma->got); /* * We may have combined previously unwritten space with written space, @@ -4507,9 +4290,9 @@ xfs_bmapi_write( end = bno + len; obno = bno; - if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.idx, &bma.got)) + if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got)) eof = true; - if (!xfs_iext_get_extent(ifp, bma.idx - 1, &bma.prev)) + if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) bma.prev.br_startoff = NULLFILEOFF; bma.tp = tp; bma.ip = ip; @@ -4551,7 +4334,8 @@ xfs_bmapi_write( * First, deal with the hole before the allocated space * that we found, if any. */ - if (need_alloc || wasdelay) { + if ((need_alloc || wasdelay) && + !(flags & XFS_BMAPI_CONVERT_ONLY)) { bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; @@ -4614,7 +4398,7 @@ xfs_bmapi_write( /* Else go on to the next record. */ bma.prev = bma.got; - if (!xfs_iext_get_extent(ifp, ++bma.idx, &bma.got)) + if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got)) eof = true; } *nmap = n; @@ -4687,7 +4471,7 @@ xfs_bmapi_remap( struct xfs_btree_cur *cur = NULL; xfs_fsblock_t firstblock = NULLFSBLOCK; struct xfs_bmbt_irec got; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; int logflags = 0, error; ASSERT(len > 0); @@ -4711,7 +4495,7 @@ xfs_bmapi_remap( return error; } - if (xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) { + if (xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) { /* make sure we only reflink into a hole. */ ASSERT(got.br_startoff > bno); ASSERT(got.br_startoff - bno >= len); @@ -4732,8 +4516,8 @@ xfs_bmapi_remap( got.br_blockcount = len; got.br_state = XFS_EXT_NORM; - error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &idx, &cur, - &got, &firstblock, dfops, &logflags); + error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &icur, + &cur, &got, &firstblock, dfops, &logflags); if (error) goto error0; @@ -4849,7 +4633,7 @@ int xfs_bmap_del_extent_delay( struct xfs_inode *ip, int whichfork, - xfs_extnum_t *idx, + struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del) { @@ -4859,7 +4643,8 @@ xfs_bmap_del_extent_delay( int64_t da_old, da_new, da_diff = 0; xfs_fileoff_t del_endoff, got_endoff; xfs_filblks_t got_indlen, new_indlen, stolen; - int error = 0, state = 0; + int state = xfs_bmap_fork_to_state(whichfork); + int error = 0; bool isrt; XFS_STATS_INC(mp, xs_del_exlist); @@ -4870,8 +4655,6 @@ xfs_bmap_del_extent_delay( da_old = startblockval(got->br_startblock); da_new = 0; - ASSERT(*idx >= 0); - ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); @@ -4895,46 +4678,39 @@ xfs_bmap_del_extent_delay( return error; ip->i_delayed_blks -= del->br_blockcount; - if (whichfork == XFS_COW_FORK) - state |= BMAP_COWFORK; - if (got->br_startoff == del->br_startoff) - state |= BMAP_LEFT_CONTIG; + state |= BMAP_LEFT_FILLING; if (got_endoff == del_endoff) - state |= BMAP_RIGHT_CONTIG; + state |= BMAP_RIGHT_FILLING; - switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { - case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) { + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Matches the whole extent. Delete the entry. */ - xfs_iext_remove(ip, *idx, 1, state); - --*idx; + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); break; - case BMAP_LEFT_CONTIG: + case BMAP_LEFT_FILLING: /* * Deleting the first part of the extent. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_startoff = del_endoff; got->br_blockcount -= del->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, got->br_blockcount), da_old); got->br_startblock = nullstartblock((int)da_new); - xfs_iext_update_extent(ifp, *idx, got); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_update_extent(ip, state, icur, got); break; - case BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING: /* * Deleting the last part of the extent. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_blockcount = got->br_blockcount - del->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, got->br_blockcount), da_old); got->br_startblock = nullstartblock((int)da_new); - xfs_iext_update_extent(ifp, *idx, got); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_update_extent(ip, state, icur, got); break; case 0: /* @@ -4946,8 +4722,6 @@ xfs_bmap_del_extent_delay( * Warn if either of the new indlen reservations is zero as this * can lead to delalloc problems. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - got->br_blockcount = del->br_startoff - got->br_startoff; got_indlen = xfs_bmap_worst_indlen(ip, got->br_blockcount); @@ -4959,15 +4733,14 @@ xfs_bmap_del_extent_delay( del->br_blockcount); got->br_startblock = nullstartblock((int)got_indlen); - xfs_iext_update_extent(ifp, *idx, got); - trace_xfs_bmap_post_update(ip, *idx, 0, _THIS_IP_); new.br_startoff = del_endoff; new.br_state = got->br_state; new.br_startblock = nullstartblock((int)new_indlen); - ++*idx; - xfs_iext_insert(ip, *idx, 1, &new, state); + xfs_iext_update_extent(ip, state, icur, got); + xfs_iext_next(ifp, icur); + xfs_iext_insert(ip, icur, &new, state); da_new = got_indlen + new_indlen - stolen; del->br_blockcount -= stolen; @@ -4986,7 +4759,7 @@ xfs_bmap_del_extent_delay( void xfs_bmap_del_extent_cow( struct xfs_inode *ip, - xfs_extnum_t *idx, + struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del) { @@ -5001,75 +4774,67 @@ xfs_bmap_del_extent_cow( del_endoff = del->br_startoff + del->br_blockcount; got_endoff = got->br_startoff + got->br_blockcount; - ASSERT(*idx >= 0); - ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); ASSERT(!isnullstartblock(got->br_startblock)); if (got->br_startoff == del->br_startoff) - state |= BMAP_LEFT_CONTIG; + state |= BMAP_LEFT_FILLING; if (got_endoff == del_endoff) - state |= BMAP_RIGHT_CONTIG; + state |= BMAP_RIGHT_FILLING; - switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { - case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) { + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Matches the whole extent. Delete the entry. */ - xfs_iext_remove(ip, *idx, 1, state); - --*idx; + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); break; - case BMAP_LEFT_CONTIG: + case BMAP_LEFT_FILLING: /* * Deleting the first part of the extent. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_startoff = del_endoff; got->br_blockcount -= del->br_blockcount; got->br_startblock = del->br_startblock + del->br_blockcount; - xfs_iext_update_extent(ifp, *idx, got); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_update_extent(ip, state, icur, got); break; - case BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING: /* * Deleting the last part of the extent. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_blockcount -= del->br_blockcount; - xfs_iext_update_extent(ifp, *idx, got); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_update_extent(ip, state, icur, got); break; case 0: /* * Deleting the middle of the extent. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_blockcount = del->br_startoff - got->br_startoff; - xfs_iext_update_extent(ifp, *idx, got); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); new.br_startoff = del_endoff; new.br_blockcount = got_endoff - del_endoff; new.br_state = got->br_state; new.br_startblock = del->br_startblock + del->br_blockcount; - ++*idx; - xfs_iext_insert(ip, *idx, 1, &new, state); + xfs_iext_update_extent(ip, state, icur, got); + xfs_iext_next(ifp, icur); + xfs_iext_insert(ip, icur, &new, state); break; } } /* * Called by xfs_bmapi to update file extent records and the btree - * after removing space (or undoing a delayed allocation). + * after removing space. */ STATIC int /* error */ -xfs_bmap_del_extent( +xfs_bmap_del_extent_real( xfs_inode_t *ip, /* incore inode pointer */ xfs_trans_t *tp, /* current transaction pointer */ - xfs_extnum_t *idx, /* extent number to update/delete */ + struct xfs_iext_cursor *icur, struct xfs_defer_ops *dfops, /* list of extents to be freed */ xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ @@ -5077,16 +4842,12 @@ xfs_bmap_del_extent( int whichfork, /* data or attr fork */ int bflags) /* bmapi flags */ { - xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ - xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ xfs_fsblock_t del_endblock=0; /* first block past del */ xfs_fileoff_t del_endoff; /* first offset past del */ - int delay; /* current block is delayed allocated */ int do_fx; /* free extent at end of routine */ - xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ int error; /* error return value */ - int flags; /* inode logging flags */ - xfs_bmbt_irec_t got; /* current extent entry */ + int flags = 0;/* inode logging flags */ + struct xfs_bmbt_irec got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ @@ -5095,103 +4856,81 @@ xfs_bmap_del_extent( xfs_bmbt_irec_t new; /* new record to be inserted */ /* REFERENCED */ uint qfield; /* quota field to update */ - xfs_filblks_t temp; /* for indirect length calculations */ - xfs_filblks_t temp2; /* for indirect length calculations */ - int state = 0; + int state = xfs_bmap_fork_to_state(whichfork); + struct xfs_bmbt_irec old; mp = ip->i_mount; XFS_STATS_INC(mp, xs_del_exlist); - if (whichfork == XFS_ATTR_FORK) - state |= BMAP_ATTRFORK; - else if (whichfork == XFS_COW_FORK) - state |= BMAP_COWFORK; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((*idx >= 0) && (*idx < xfs_iext_count(ifp))); ASSERT(del->br_blockcount > 0); - ep = xfs_iext_get_ext(ifp, *idx); - xfs_bmbt_get_all(ep, &got); + xfs_iext_get_extent(ifp, icur, &got); ASSERT(got.br_startoff <= del->br_startoff); del_endoff = del->br_startoff + del->br_blockcount; got_endoff = got.br_startoff + got.br_blockcount; ASSERT(got_endoff >= del_endoff); - delay = isnullstartblock(got.br_startblock); - ASSERT(isnullstartblock(del->br_startblock) == delay); - flags = 0; + ASSERT(!isnullstartblock(got.br_startblock)); qfield = 0; error = 0; + /* - * If deleting a real allocation, must free up the disk space. + * If it's the case where the directory code is running with no block + * reservation, and the deleted block is in the middle of its extent, + * and the resulting insert of an extent would cause transformation to + * btree format, then reject it. The calling code will then swap blocks + * around instead. We have to do this now, rather than waiting for the + * conversion to btree format, since the transaction will be dirty then. */ - if (!delay) { - flags = XFS_ILOG_CORE; - /* - * Realtime allocation. Free it and record di_nblocks update. - */ - if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - xfs_fsblock_t bno; - xfs_filblks_t len; - - ASSERT(do_mod(del->br_blockcount, - mp->m_sb.sb_rextsize) == 0); - ASSERT(do_mod(del->br_startblock, - mp->m_sb.sb_rextsize) == 0); - bno = del->br_startblock; - len = del->br_blockcount; - do_div(bno, mp->m_sb.sb_rextsize); - do_div(len, mp->m_sb.sb_rextsize); - error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); - if (error) - goto done; - do_fx = 0; - nblks = len * mp->m_sb.sb_rextsize; - qfield = XFS_TRANS_DQ_RTBCOUNT; - } - /* - * Ordinary allocation. - */ - else { - do_fx = 1; - nblks = del->br_blockcount; - qfield = XFS_TRANS_DQ_BCOUNT; - } - /* - * Set up del_endblock and cur for later. - */ - del_endblock = del->br_startblock + del->br_blockcount; - if (cur) { - if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, - got.br_startblock, got.br_blockcount, - &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - } - da_old = da_new = 0; - } else { - da_old = startblockval(got.br_startblock); - da_new = 0; - nblks = 0; + if (tp->t_blk_res == 0 && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) >= + XFS_IFORK_MAXEXT(ip, whichfork) && + del->br_startoff > got.br_startoff && del_endoff < got_endoff) + return -ENOSPC; + + flags = XFS_ILOG_CORE; + if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { + xfs_fsblock_t bno; + xfs_filblks_t len; + + ASSERT(do_mod(del->br_blockcount, mp->m_sb.sb_rextsize) == 0); + ASSERT(do_mod(del->br_startblock, mp->m_sb.sb_rextsize) == 0); + bno = del->br_startblock; + len = del->br_blockcount; + do_div(bno, mp->m_sb.sb_rextsize); + do_div(len, mp->m_sb.sb_rextsize); + error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + if (error) + goto done; do_fx = 0; + nblks = len * mp->m_sb.sb_rextsize; + qfield = XFS_TRANS_DQ_RTBCOUNT; + } else { + do_fx = 1; + nblks = del->br_blockcount; + qfield = XFS_TRANS_DQ_BCOUNT; } - /* - * Set flag value to use in switch statement. - * Left-contig is 2, right-contig is 1. - */ - switch (((got.br_startoff == del->br_startoff) << 1) | - (got_endoff == del_endoff)) { - case 3: + del_endblock = del->br_startblock + del->br_blockcount; + if (cur) { + error = xfs_bmbt_lookup_eq(cur, &got, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + + if (got.br_startoff == del->br_startoff) + state |= BMAP_LEFT_FILLING; + if (got_endoff == del_endoff) + state |= BMAP_RIGHT_FILLING; + + switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) { + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Matches the whole extent. Delete the entry. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx, 1, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); - --*idx; - if (delay) - break; - + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); flags |= XFS_ILOG_CORE; @@ -5203,168 +4942,106 @@ xfs_bmap_del_extent( goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); break; - - case 2: + case BMAP_LEFT_FILLING: /* * Deleting the first part of the extent. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_startoff(ep, del_endoff); - temp = got.br_blockcount - del->br_blockcount; - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - da_new = temp; - break; - } - xfs_bmbt_set_startblock(ep, del_endblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + got.br_startoff = del_endoff; + got.br_startblock = del_endblock; + got.br_blockcount -= del->br_blockcount; + xfs_iext_update_extent(ip, state, icur, &got); if (!cur) { flags |= xfs_ilog_fext(whichfork); break; } - if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, - got.br_blockcount - del->br_blockcount, - got.br_state))) + error = xfs_bmbt_update(cur, &got); + if (error) goto done; break; - - case 1: + case BMAP_RIGHT_FILLING: /* * Deleting the last part of the extent. */ - temp = got.br_blockcount - del->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - da_new = temp; - break; - } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + got.br_blockcount -= del->br_blockcount; + xfs_iext_update_extent(ip, state, icur, &got); if (!cur) { flags |= xfs_ilog_fext(whichfork); break; } - if ((error = xfs_bmbt_update(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount - del->br_blockcount, - got.br_state))) + error = xfs_bmbt_update(cur, &got); + if (error) goto done; break; - case 0: /* * Deleting the middle of the extent. */ - temp = del->br_startoff - got.br_startoff; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); + old = got; + + got.br_blockcount = del->br_startoff - got.br_startoff; + xfs_iext_update_extent(ip, state, icur, &got); + new.br_startoff = del_endoff; - temp2 = got_endoff - del_endoff; - new.br_blockcount = temp2; + new.br_blockcount = got_endoff - del_endoff; new.br_state = got.br_state; - if (!delay) { - new.br_startblock = del_endblock; - flags |= XFS_ILOG_CORE; - if (cur) { - if ((error = xfs_bmbt_update(cur, - got.br_startoff, - got.br_startblock, temp, - got.br_state))) - goto done; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto done; - cur->bc_rec.b = new; - error = xfs_btree_insert(cur, &i); - if (error && error != -ENOSPC) - goto done; + new.br_startblock = del_endblock; + + flags |= XFS_ILOG_CORE; + if (cur) { + error = xfs_bmbt_update(cur, &got); + if (error) + goto done; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto done; + cur->bc_rec.b = new; + error = xfs_btree_insert(cur, &i); + if (error && error != -ENOSPC) + goto done; + /* + * If get no-space back from btree insert, it tried a + * split, and we have a zero block reservation. Fix up + * our state and return the error. + */ + if (error == -ENOSPC) { /* - * If get no-space back from btree insert, - * it tried a split, and we have a zero - * block reservation. - * Fix up our state and return the error. + * Reset the cursor, don't trust it after any + * insert operation. */ - if (error == -ENOSPC) { - /* - * Reset the cursor, don't trust - * it after any insert operation. - */ - if ((error = xfs_bmbt_lookup_eq(cur, - got.br_startoff, - got.br_startblock, - temp, &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(mp, - i == 1, done); - /* - * Update the btree record back - * to the original value. - */ - if ((error = xfs_bmbt_update(cur, - got.br_startoff, - got.br_startblock, - got.br_blockcount, - got.br_state))) - goto done; - /* - * Reset the extent record back - * to the original value. - */ - xfs_bmbt_set_blockcount(ep, - got.br_blockcount); - flags = 0; - error = -ENOSPC; + error = xfs_bmbt_lookup_eq(cur, &got, &i); + if (error) goto done; - } XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - } else - flags |= xfs_ilog_fext(whichfork); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); - } else { - xfs_filblks_t stolen; - ASSERT(whichfork == XFS_DATA_FORK); - - /* - * Distribute the original indlen reservation across the - * two new extents. Steal blocks from the deleted extent - * if necessary. Stealing blocks simply fudges the - * fdblocks accounting in xfs_bunmapi(). - */ - temp = xfs_bmap_worst_indlen(ip, got.br_blockcount); - temp2 = xfs_bmap_worst_indlen(ip, new.br_blockcount); - stolen = xfs_bmap_split_indlen(da_old, &temp, &temp2, - del->br_blockcount); - da_new = temp + temp2 - stolen; - del->br_blockcount -= stolen; - - /* - * Set the reservation for each extent. Warn if either - * is zero as this can lead to delalloc problems. - */ - WARN_ON_ONCE(!temp || !temp2); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - new.br_startblock = nullstartblock((int)temp2); - } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_insert(ip, *idx + 1, 1, &new, state); - ++*idx; + /* + * Update the btree record back + * to the original value. + */ + error = xfs_bmbt_update(cur, &old); + if (error) + goto done; + /* + * Reset the extent record back + * to the original value. + */ + xfs_iext_update_extent(ip, state, icur, &old); + flags = 0; + error = -ENOSPC; + goto done; + } + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } else + flags |= xfs_ilog_fext(whichfork); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + xfs_iext_next(ifp, icur); + xfs_iext_insert(ip, icur, &new, state); break; } /* remove reverse mapping */ - if (!delay) { - error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, del); - if (error) - goto done; - } + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, del); + if (error) + goto done; /* * If we need to, add to list of extents to delete. @@ -5390,13 +5067,6 @@ xfs_bmap_del_extent( if (qfield && !(bflags & XFS_BMAPI_REMAP)) xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); - /* - * Account for change in delayed indirect blocks. - * Nothing to do for disk quota accounting here. - */ - ASSERT(da_old >= da_new); - if (da_old > da_new) - xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false); done: *logflagsp = flags; return error; @@ -5412,7 +5082,7 @@ int /* error */ __xfs_bunmapi( xfs_trans_t *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting offset to unmap */ + xfs_fileoff_t start, /* first file offset deleted */ xfs_filblks_t *rlen, /* i/o: amount remaining */ int flags, /* misc flags */ xfs_extnum_t nexts, /* number of extents max */ @@ -5427,11 +5097,9 @@ __xfs_bunmapi( xfs_bmbt_irec_t got; /* current extent record */ xfs_ifork_t *ifp; /* inode fork pointer */ int isrt; /* freeing in rt area */ - xfs_extnum_t lastx; /* last extent index used */ int logflags; /* transaction logging flags */ xfs_extlen_t mod; /* rt extent offset */ xfs_mount_t *mp; /* mount structure */ - xfs_fileoff_t start; /* first file offset deleted */ int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ @@ -5439,8 +5107,11 @@ __xfs_bunmapi( xfs_filblks_t len = *rlen; /* length to unmap in file */ xfs_fileoff_t max_len; xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; + xfs_fileoff_t end; + struct xfs_iext_cursor icur; + bool done = false; - trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); + trace_xfs_bunmap(ip, start, len, flags, _RET_IP_); whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); @@ -5465,7 +5136,7 @@ __xfs_bunmapi( * blowing out the transaction with a mix of EFIs and reflink * adjustments. */ - if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) + if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); else max_len = len; @@ -5479,18 +5150,13 @@ __xfs_bunmapi( } XFS_STATS_INC(mp, xs_blk_unmap); isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); - start = bno; - bno = start + len - 1; + end = start + len; - /* - * Check to see if the given block number is past the end of the - * file, back up to the last block if so... - */ - if (!xfs_iext_lookup_extent(ip, ifp, bno, &lastx, &got)) { - ASSERT(lastx > 0); - xfs_iext_get_extent(ifp, --lastx, &got); - bno = got.br_startoff + got.br_blockcount - 1; + if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) { + *rlen = 0; + return 0; } + end--; logflags = 0; if (ifp->if_flags & XFS_IFBROOT) { @@ -5513,24 +5179,24 @@ __xfs_bunmapi( } extno = 0; - while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && + while (end != (xfs_fileoff_t)-1 && end >= start && (nexts == 0 || extno < nexts) && max_len > 0) { /* - * Is the found extent after a hole in which bno lives? + * Is the found extent after a hole in which end lives? * Just back up to the previous extent, if so. */ - if (got.br_startoff > bno) { - if (--lastx < 0) - break; - xfs_iext_get_extent(ifp, lastx, &got); + if (got.br_startoff > end && + !xfs_iext_prev_extent(ifp, &icur, &got)) { + done = true; + break; } /* * Is the last block of this extent before the range * we're supposed to delete? If so, we're done. */ - bno = XFS_FILEOFF_MIN(bno, + end = XFS_FILEOFF_MIN(end, got.br_startoff + got.br_blockcount - 1); - if (bno < start) + if (end < start) break; /* * Then deal with the (possibly delayed) allocated space @@ -5555,8 +5221,8 @@ __xfs_bunmapi( if (!wasdel) del.br_startblock += start - got.br_startoff; } - if (del.br_startoff + del.br_blockcount > bno + 1) - del.br_blockcount = bno + 1 - del.br_startoff; + if (del.br_startoff + del.br_blockcount > end + 1) + del.br_blockcount = end + 1 - del.br_startoff; /* How much can we safely unmap? */ if (max_len < del.br_blockcount) { @@ -5582,13 +5248,13 @@ __xfs_bunmapi( * This piece is unwritten, or we're not * using unwritten extents. Skip over it. */ - ASSERT(bno >= mod); - bno -= mod > del.br_blockcount ? + ASSERT(end >= mod); + end -= mod > del.br_blockcount ? del.br_blockcount : mod; - if (bno < got.br_startoff) { - if (--lastx >= 0) - xfs_bmbt_get_all(xfs_iext_get_ext( - ifp, lastx), &got); + if (end < got.br_startoff && + !xfs_iext_prev_extent(ifp, &icur, &got)) { + done = true; + break; } continue; } @@ -5609,7 +5275,7 @@ __xfs_bunmapi( } del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, - whichfork, &lastx, &cur, &del, + whichfork, &icur, &cur, &del, firstblock, dfops, &logflags); if (error) goto error0; @@ -5634,10 +5300,13 @@ __xfs_bunmapi( * Can't make it unwritten. There isn't * a full extent here so just skip it. */ - ASSERT(bno >= del.br_blockcount); - bno -= del.br_blockcount; - if (got.br_startoff > bno && --lastx >= 0) - xfs_iext_get_extent(ifp, lastx, &got); + ASSERT(end >= del.br_blockcount); + end -= del.br_blockcount; + if (got.br_startoff > end && + !xfs_iext_prev_extent(ifp, &icur, &got)) { + done = true; + break; + } continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { struct xfs_bmbt_irec prev; @@ -5648,8 +5317,8 @@ __xfs_bunmapi( * Unwrite the killed part of that one and * try again. */ - ASSERT(lastx > 0); - xfs_iext_get_extent(ifp, lastx - 1, &prev); + if (!xfs_iext_prev_extent(ifp, &icur, &prev)) + ASSERT(0); ASSERT(prev.br_state == XFS_EXT_NORM); ASSERT(!isnullstartblock(prev.br_startblock)); ASSERT(del.br_startblock == @@ -5661,9 +5330,8 @@ __xfs_bunmapi( prev.br_startoff = start; } prev.br_state = XFS_EXT_UNWRITTEN; - lastx--; error = xfs_bmap_add_extent_unwritten_real(tp, - ip, whichfork, &lastx, &cur, + ip, whichfork, &icur, &cur, &prev, firstblock, dfops, &logflags); if (error) @@ -5673,7 +5341,7 @@ __xfs_bunmapi( ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, - ip, whichfork, &lastx, &cur, + ip, whichfork, &icur, &cur, &del, firstblock, dfops, &logflags); if (error) @@ -5682,85 +5350,39 @@ __xfs_bunmapi( } } - /* - * If it's the case where the directory code is running - * with no block reservation, and the deleted block is in - * the middle of its extent, and the resulting insert - * of an extent would cause transformation to btree format, - * then reject it. The calling code will then swap - * blocks around instead. - * We have to do this now, rather than waiting for the - * conversion to btree format, since the transaction - * will be dirty. - */ - if (!wasdel && tp->t_blk_res == 0 && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ - XFS_IFORK_MAXEXT(ip, whichfork) && - del.br_startoff > got.br_startoff && - del.br_startoff + del.br_blockcount < - got.br_startoff + got.br_blockcount) { - error = -ENOSPC; - goto error0; + if (wasdel) { + error = xfs_bmap_del_extent_delay(ip, whichfork, &icur, + &got, &del); + } else { + error = xfs_bmap_del_extent_real(ip, tp, &icur, dfops, + cur, &del, &tmp_logflags, whichfork, + flags); + logflags |= tmp_logflags; } - /* - * Unreserve quota and update realtime free space, if - * appropriate. If delayed allocation, update the inode delalloc - * counter now and wait to update the sb counters as - * xfs_bmap_del_extent() might need to borrow some blocks. - */ - if (wasdel) { - ASSERT(startblockval(del.br_startblock) > 0); - if (isrt) { - xfs_filblks_t rtexts; - - rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); - do_div(rtexts, mp->m_sb.sb_rextsize); - xfs_mod_frextents(mp, (int64_t)rtexts); - (void)xfs_trans_reserve_quota_nblks(NULL, - ip, -((long)del.br_blockcount), 0, - XFS_QMOPT_RES_RTBLKS); - } else { - (void)xfs_trans_reserve_quota_nblks(NULL, - ip, -((long)del.br_blockcount), 0, - XFS_QMOPT_RES_REGBLKS); - } - ip->i_delayed_blks -= del.br_blockcount; - if (cur) - cur->bc_private.b.flags |= - XFS_BTCUR_BPRV_WASDEL; - } else if (cur) - cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; - - error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del, - &tmp_logflags, whichfork, flags); - logflags |= tmp_logflags; if (error) goto error0; - if (!isrt && wasdel) - xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false); - max_len -= del.br_blockcount; - bno = del.br_startoff - 1; + end = del.br_startoff - 1; nodelete: /* * If not done go on to the next (previous) record. */ - if (bno != (xfs_fileoff_t)-1 && bno >= start) { - if (lastx >= 0) { - xfs_iext_get_extent(ifp, lastx, &got); - if (got.br_startoff > bno && --lastx >= 0) - xfs_iext_get_extent(ifp, lastx, &got); + if (end != (xfs_fileoff_t)-1 && end >= start) { + if (!xfs_iext_get_extent(ifp, &icur, &got) || + (got.br_startoff > end && + !xfs_iext_prev_extent(ifp, &icur, &got))) { + done = true; + break; } extno++; } } - if (bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0) + if (done || end == (xfs_fileoff_t)-1 || end < start) *rlen = 0; else - *rlen = bno - start + 1; + *rlen = end - start + 1; /* * Convert to a btree if necessary. @@ -5878,14 +5500,13 @@ xfs_bmse_merge( struct xfs_inode *ip, int whichfork, xfs_fileoff_t shift, /* shift fsb */ - int current_ext, /* idx of gotp */ + struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, /* extent to shift */ struct xfs_bmbt_irec *left, /* preceding extent */ struct xfs_btree_cur *cur, int *logflags, /* output */ struct xfs_defer_ops *dfops) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_bmbt_irec new; xfs_filblks_t blockcount; int error, i; @@ -5913,8 +5534,7 @@ xfs_bmse_merge( } /* lookup and remove the extent to merge */ - error = xfs_bmbt_lookup_eq(cur, got->br_startoff, got->br_startblock, - got->br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, got, &i); if (error) return error; XFS_WANT_CORRUPTED_RETURN(mp, i == 1); @@ -5925,20 +5545,20 @@ xfs_bmse_merge( XFS_WANT_CORRUPTED_RETURN(mp, i == 1); /* lookup and update size of the previous extent */ - error = xfs_bmbt_lookup_eq(cur, left->br_startoff, left->br_startblock, - left->br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, left, &i); if (error) return error; XFS_WANT_CORRUPTED_RETURN(mp, i == 1); - error = xfs_bmbt_update(cur, new.br_startoff, new.br_startblock, - new.br_blockcount, new.br_state); + error = xfs_bmbt_update(cur, &new); if (error) return error; done: - xfs_iext_update_extent(ifp, current_ext - 1, &new); - xfs_iext_remove(ip, current_ext, 1, 0); + xfs_iext_remove(ip, icur, 0); + xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur); + xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur, + &new); /* update reverse mapping. rmap functions merge the rmaps for us */ error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got); @@ -5949,183 +5569,83 @@ done: return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new); } -/* - * Shift a single extent. - */ -STATIC int -xfs_bmse_shift_one( - struct xfs_inode *ip, - int whichfork, - xfs_fileoff_t offset_shift_fsb, - int *current_ext, - struct xfs_bmbt_irec *got, - struct xfs_btree_cur *cur, - int *logflags, - enum shift_direction direction, - struct xfs_defer_ops *dfops) +static int +xfs_bmap_shift_update_extent( + struct xfs_inode *ip, + int whichfork, + struct xfs_iext_cursor *icur, + struct xfs_bmbt_irec *got, + struct xfs_btree_cur *cur, + int *logflags, + struct xfs_defer_ops *dfops, + xfs_fileoff_t startoff) { - struct xfs_ifork *ifp; - struct xfs_mount *mp; - xfs_fileoff_t startoff; - struct xfs_bmbt_irec adj_irec, new; - int error; - int i; - int total_extents; - - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - total_extents = xfs_iext_count(ifp); - - /* delalloc extents should be prevented by caller */ - XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got->br_startblock)); - - if (direction == SHIFT_LEFT) { - startoff = got->br_startoff - offset_shift_fsb; - - /* - * Check for merge if we've got an extent to the left, - * otherwise make sure there's enough room at the start - * of the file for the shift. - */ - if (!*current_ext) { - if (got->br_startoff < offset_shift_fsb) - return -EINVAL; - goto update_current_ext; - } - - /* - * grab the left extent and check for a large enough hole. - */ - xfs_iext_get_extent(ifp, *current_ext - 1, &adj_irec); - if (startoff < adj_irec.br_startoff + adj_irec.br_blockcount) - return -EINVAL; - - /* check whether to merge the extent or shift it down */ - if (xfs_bmse_can_merge(&adj_irec, got, offset_shift_fsb)) { - return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, - *current_ext, got, &adj_irec, - cur, logflags, dfops); - } - } else { - startoff = got->br_startoff + offset_shift_fsb; - /* nothing to move if this is the last extent */ - if (*current_ext >= (total_extents - 1)) - goto update_current_ext; - - /* - * If this is not the last extent in the file, make sure there - * is enough room between current extent and next extent for - * accommodating the shift. - */ - xfs_iext_get_extent(ifp, *current_ext + 1, &adj_irec); - if (startoff + got->br_blockcount > adj_irec.br_startoff) - return -EINVAL; - - /* - * Unlike a left shift (which involves a hole punch), - * a right shift does not modify extent neighbors - * in any way. We should never find mergeable extents - * in this scenario. Check anyways and warn if we - * encounter two extents that could be one. - */ - if (xfs_bmse_can_merge(got, &adj_irec, offset_shift_fsb)) - WARN_ON_ONCE(1); - } + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec prev = *got; + int error, i; - /* - * Increment the extent index for the next iteration, update the start - * offset of the in-core extent and update the btree if applicable. - */ -update_current_ext: *logflags |= XFS_ILOG_CORE; - new = *got; - new.br_startoff = startoff; + got->br_startoff = startoff; if (cur) { - error = xfs_bmbt_lookup_eq(cur, got->br_startoff, - got->br_startblock, got->br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, &prev, &i); if (error) return error; XFS_WANT_CORRUPTED_RETURN(mp, i == 1); - error = xfs_bmbt_update(cur, new.br_startoff, - new.br_startblock, new.br_blockcount, - new.br_state); + error = xfs_bmbt_update(cur, got); if (error) return error; } else { *logflags |= XFS_ILOG_DEXT; } - xfs_iext_update_extent(ifp, *current_ext, &new); - - if (direction == SHIFT_LEFT) - (*current_ext)++; - else - (*current_ext)--; + xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur, + got); /* update reverse mapping */ - error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got); + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, &prev); if (error) return error; - return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new); + return xfs_rmap_map_extent(mp, dfops, ip, whichfork, got); } -/* - * Shift extent records to the left/right to cover/create a hole. - * - * The maximum number of extents to be shifted in a single operation is - * @num_exts. @stop_fsb specifies the file offset at which to stop shift and the - * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb - * is the length by which each extent is shifted. If there is no hole to shift - * the extents into, this will be considered invalid operation and we abort - * immediately. - */ int -xfs_bmap_shift_extents( +xfs_bmap_collapse_extents( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, - int *done, + bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, - struct xfs_defer_ops *dfops, - enum shift_direction direction, - int num_exts) + struct xfs_defer_ops *dfops) { - struct xfs_btree_cur *cur = NULL; - struct xfs_bmbt_irec got; - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp; - xfs_extnum_t nexts = 0; - xfs_extnum_t current_ext; - xfs_extnum_t total_extents; - xfs_extnum_t stop_extent; - int error = 0; - int whichfork = XFS_DATA_FORK; - int logflags = 0; + int whichfork = XFS_DATA_FORK; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_cur *cur = NULL; + struct xfs_bmbt_irec got, prev; + struct xfs_iext_cursor icur; + xfs_fileoff_t new_startoff; + int error = 0; + int logflags = 0; if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmap_shift_extents", - XFS_ERRLEVEL_LOW, mp); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); - ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { - /* Read in all the extents */ error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; @@ -6138,107 +5658,167 @@ xfs_bmap_shift_extents( cur->bc_private.b.flags = 0; } - /* - * There may be delalloc extents in the data fork before the range we - * are collapsing out, so we cannot use the count of real extents here. - * Instead we have to calculate it from the incore fork. - */ - total_extents = xfs_iext_count(ifp); - if (total_extents == 0) { - *done = 1; + if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) { + *done = true; goto del_cursor; } + XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock), + del_cursor); - /* - * In case of first right shift, we need to initialize next_fsb - */ - if (*next_fsb == NULLFSBLOCK) { - ASSERT(direction == SHIFT_RIGHT); - - current_ext = total_extents - 1; - xfs_iext_get_extent(ifp, current_ext, &got); - if (stop_fsb > got.br_startoff) { - *done = 1; + new_startoff = got.br_startoff - offset_shift_fsb; + if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) { + if (new_startoff < prev.br_startoff + prev.br_blockcount) { + error = -EINVAL; goto del_cursor; } - *next_fsb = got.br_startoff; + + if (xfs_bmse_can_merge(&prev, &got, offset_shift_fsb)) { + error = xfs_bmse_merge(ip, whichfork, offset_shift_fsb, + &icur, &got, &prev, cur, &logflags, + dfops); + if (error) + goto del_cursor; + goto done; + } } else { - /* - * Look up the extent index for the fsb where we start shifting. We can - * henceforth iterate with current_ext as extent list changes are locked - * out via ilock. - * - * If next_fsb lies in a hole beyond which there are no extents we are - * done. - */ - if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, ¤t_ext, - &got)) { - *done = 1; + if (got.br_startoff < offset_shift_fsb) { + error = -EINVAL; goto del_cursor; } } - /* Lookup the extent index at which we have to stop */ - if (direction == SHIFT_RIGHT) { - struct xfs_bmbt_irec s; + error = xfs_bmap_shift_update_extent(ip, whichfork, &icur, &got, cur, + &logflags, dfops, new_startoff); + if (error) + goto del_cursor; + +done: + if (!xfs_iext_next_extent(ifp, &icur, &got)) { + *done = true; + goto del_cursor; + } + + *next_fsb = got.br_startoff; +del_cursor: + if (cur) + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + return error; +} + +int +xfs_bmap_insert_extents( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t *next_fsb, + xfs_fileoff_t offset_shift_fsb, + bool *done, + xfs_fileoff_t stop_fsb, + xfs_fsblock_t *firstblock, + struct xfs_defer_ops *dfops) +{ + int whichfork = XFS_DATA_FORK; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_cur *cur = NULL; + struct xfs_bmbt_irec got, next; + struct xfs_iext_cursor icur; + xfs_fileoff_t new_startoff; + int error = 0; + int logflags = 0; + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT))) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.dfops = dfops; + cur->bc_private.b.flags = 0; + } - xfs_iext_lookup_extent(ip, ifp, stop_fsb, &stop_extent, &s); - /* Make stop_extent exclusive of shift range */ - stop_extent--; - if (current_ext <= stop_extent) { - error = -EIO; + if (*next_fsb == NULLFSBLOCK) { + xfs_iext_last(ifp, &icur); + if (!xfs_iext_get_extent(ifp, &icur, &got) || + stop_fsb > got.br_startoff) { + *done = true; goto del_cursor; } } else { - stop_extent = total_extents; - if (current_ext >= stop_extent) { - error = -EIO; + if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) { + *done = true; goto del_cursor; } } + XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock), + del_cursor); - while (nexts++ < num_exts) { - error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, - ¤t_ext, &got, cur, &logflags, - direction, dfops); - if (error) + if (stop_fsb >= got.br_startoff + got.br_blockcount) { + error = -EIO; + goto del_cursor; + } + + new_startoff = got.br_startoff + offset_shift_fsb; + if (xfs_iext_peek_next_extent(ifp, &icur, &next)) { + if (new_startoff + got.br_blockcount > next.br_startoff) { + error = -EINVAL; goto del_cursor; - /* - * If there was an extent merge during the shift, the extent - * count can change. Update the total and grade the next record. - */ - if (direction == SHIFT_LEFT) { - total_extents = xfs_iext_count(ifp); - stop_extent = total_extents; } - if (current_ext == stop_extent) { - *done = 1; - *next_fsb = NULLFSBLOCK; - break; - } - xfs_iext_get_extent(ifp, current_ext, &got); + /* + * Unlike a left shift (which involves a hole punch), a right + * shift does not modify extent neighbors in any way. We should + * never find mergeable extents in this scenario. Check anyways + * and warn if we encounter two extents that could be one. + */ + if (xfs_bmse_can_merge(&got, &next, offset_shift_fsb)) + WARN_ON_ONCE(1); } - if (!*done) - *next_fsb = got.br_startoff; + error = xfs_bmap_shift_update_extent(ip, whichfork, &icur, &got, cur, + &logflags, dfops, new_startoff); + if (error) + goto del_cursor; + + if (!xfs_iext_prev_extent(ifp, &icur, &got) || + stop_fsb >= got.br_startoff + got.br_blockcount) { + *done = true; + goto del_cursor; + } + *next_fsb = got.br_startoff; del_cursor: if (cur) xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); - if (logflags) xfs_trans_log_inode(tp, ip, logflags); - return error; } /* - * Splits an extent into two extents at split_fsb block such that it is - * the first block of the current_ext. @current_ext is a target extent - * to be split. @split_fsb is a block where the extents is split. - * If split_fsb lies in a hole or the first block of extents, just return 0. + * Splits an extent into two extents at split_fsb block such that it is the + * first block of the current_ext. @ext is a target extent to be split. + * @split_fsb is a block where the extents is split. If split_fsb lies in a + * hole or the first block of extents, just return 0. */ STATIC int xfs_bmap_split_extent_at( @@ -6255,7 +5835,7 @@ xfs_bmap_split_extent_at( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; xfs_fsblock_t gotblkcnt; /* new block count for got */ - xfs_extnum_t current_ext; + struct xfs_iext_cursor icur; int error = 0; int logflags = 0; int i = 0; @@ -6283,7 +5863,7 @@ xfs_bmap_split_extent_at( /* * If there are not extents, or split_fsb lies in a hole we are done. */ - if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, ¤t_ext, &got) || + if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, &icur, &got) || got.br_startoff >= split_fsb) return 0; @@ -6298,44 +5878,35 @@ xfs_bmap_split_extent_at( cur->bc_private.b.firstblock = *firstfsb; cur->bc_private.b.dfops = dfops; cur->bc_private.b.flags = 0; - error = xfs_bmbt_lookup_eq(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto del_cursor; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); } got.br_blockcount = gotblkcnt; - xfs_iext_update_extent(ifp, current_ext, &got); + xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), &icur, + &got); logflags = XFS_ILOG_CORE; if (cur) { - error = xfs_bmbt_update(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount, - got.br_state); + error = xfs_bmbt_update(cur, &got); if (error) goto del_cursor; } else logflags |= XFS_ILOG_DEXT; /* Add new extent */ - current_ext++; - xfs_iext_insert(ip, current_ext, 1, &new, 0); + xfs_iext_next(ifp, &icur); + xfs_iext_insert(ip, &icur, &new, 0); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur) { - error = xfs_bmbt_lookup_eq(cur, new.br_startoff, - new.br_startblock, new.br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(cur, &new, &i); if (error) goto del_cursor; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor); - cur->bc_rec.b.br_state = new.br_state; - error = xfs_btree_insert(cur, &i); if (error) goto del_cursor; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 502e0d8fb4ff..e36d75799cd5 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -43,7 +43,7 @@ struct xfs_bmalloca { xfs_fsblock_t blkno; /* starting block of new extent */ struct xfs_btree_cur *cur; /* btree cursor */ - xfs_extnum_t idx; /* current extent index */ + struct xfs_iext_cursor icur; /* incore extent cursor */ int nallocs;/* number of extents alloc'd */ int logflags;/* flags for transaction logging */ @@ -113,6 +113,9 @@ struct xfs_extent_free_item /* Only convert delalloc space, don't allocate entirely new extents */ #define XFS_BMAPI_DELALLOC 0x400 +/* Only convert unwritten extents, don't allocate new blocks */ +#define XFS_BMAPI_CONVERT_ONLY 0x800 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -124,7 +127,8 @@ struct xfs_extent_free_item { XFS_BMAPI_ZERO, "ZERO" }, \ { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ - { XFS_BMAPI_DELALLOC, "DELALLOC" } + { XFS_BMAPI_DELALLOC, "DELALLOC" }, \ + { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" } static inline int xfs_bmapi_aflag(int w) @@ -183,29 +187,6 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) !isnullstartblock(irec->br_startblock); } -/* - * This macro is used to determine how many extents will be shifted - * in one write transaction. We could require two splits, - * an extent move on the first and an extent merge on the second, - * So it is proper that one extent is shifted inside write transaction - * at a time. - */ -#define XFS_BMAP_MAX_SHIFT_EXTENTS 1 - -enum shift_direction { - SHIFT_LEFT = 0, - SHIFT_RIGHT, -}; - -#ifdef DEBUG -void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, - int whichfork, unsigned long caller_ip); -#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ - xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) -#else -#define XFS_BMAP_TRACE_EXLIST(ip,c,w) -#endif - void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); @@ -222,8 +203,6 @@ int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, int whichfork); int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); -int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, - int whichfork); int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); @@ -241,20 +220,25 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extnum_t nexts, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops, int *done); int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, - xfs_extnum_t *idx, struct xfs_bmbt_irec *got, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *del); +void xfs_bmap_del_extent_cow(struct xfs_inode *ip, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); -void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx, - struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); uint xfs_default_attroffset(struct xfs_inode *ip); -int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, +int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, + bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, + struct xfs_defer_ops *dfops); +int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, - int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, - struct xfs_defer_ops *dfops, enum shift_direction direction, - int num_exts); + bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, + struct xfs_defer_ops *dfops); int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof); + struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, + int eof); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, @@ -278,4 +262,16 @@ int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); +static inline int xfs_bmap_fork_to_state(int whichfork) +{ + switch (whichfork) { + case XFS_ATTR_FORK: + return BMAP_ATTRFORK; + case XFS_COW_FORK: + return BMAP_COWFORK; + default: + return 0; + } +} + #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index a6331ffa51e3..c10aecaaae44 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -38,22 +38,6 @@ #include "xfs_rmap.h" /* - * Determine the extent state. - */ -/* ARGSUSED */ -STATIC xfs_exntst_t -xfs_extent_state( - xfs_filblks_t blks, - int extent_flag) -{ - if (extent_flag) { - ASSERT(blks != 0); /* saved for DMIG */ - return XFS_EXT_UNWRITTEN; - } - return XFS_EXT_NORM; -} - -/* * Convert on-disk form of btree root to in-memory form. */ void @@ -87,84 +71,21 @@ xfs_bmdr_to_bmbt( memcpy(tpp, fpp, sizeof(*fpp) * dmxr); } -/* - * Convert a compressed bmap extent record to an uncompressed form. - * This code must be in sync with the routines xfs_bmbt_get_startoff, - * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. - */ -STATIC void -__xfs_bmbt_get_all( - uint64_t l0, - uint64_t l1, - xfs_bmbt_irec_t *s) -{ - int ext_flag; - xfs_exntst_t st; - - ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN)); - s->br_startoff = ((xfs_fileoff_t)l0 & - xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; - s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) | - (((xfs_fsblock_t)l1) >> 21); - s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21)); - /* This is xfs_extent_state() in-line */ - if (ext_flag) { - ASSERT(s->br_blockcount != 0); /* saved for DMIG */ - st = XFS_EXT_UNWRITTEN; - } else - st = XFS_EXT_NORM; - s->br_state = st; -} - void -xfs_bmbt_get_all( - xfs_bmbt_rec_host_t *r, - xfs_bmbt_irec_t *s) -{ - __xfs_bmbt_get_all(r->l0, r->l1, s); -} - -/* - * Extract the blockcount field from an in memory bmap extent record. - */ -xfs_filblks_t -xfs_bmbt_get_blockcount( - xfs_bmbt_rec_host_t *r) -{ - return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21)); -} - -/* - * Extract the startblock field from an in memory bmap extent record. - */ -xfs_fsblock_t -xfs_bmbt_get_startblock( - xfs_bmbt_rec_host_t *r) -{ - return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) | - (((xfs_fsblock_t)r->l1) >> 21); -} - -/* - * Extract the startoff field from an in memory bmap extent record. - */ -xfs_fileoff_t -xfs_bmbt_get_startoff( - xfs_bmbt_rec_host_t *r) -{ - return ((xfs_fileoff_t)r->l0 & - xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; -} - -xfs_exntst_t -xfs_bmbt_get_state( - xfs_bmbt_rec_host_t *r) -{ - int ext_flag; - - ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN)); - return xfs_extent_state(xfs_bmbt_get_blockcount(r), - ext_flag); +xfs_bmbt_disk_get_all( + struct xfs_bmbt_rec *rec, + struct xfs_bmbt_irec *irec) +{ + uint64_t l0 = get_unaligned_be64(&rec->l0); + uint64_t l1 = get_unaligned_be64(&rec->l1); + + irec->br_startoff = (l0 & xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; + irec->br_startblock = ((l0 & xfs_mask64lo(9)) << 43) | (l1 >> 21); + irec->br_blockcount = l1 & xfs_mask64lo(21); + if (l0 >> (64 - BMBT_EXNTFLAG_BITLEN)) + irec->br_state = XFS_EXT_UNWRITTEN; + else + irec->br_state = XFS_EXT_NORM; } /* @@ -188,142 +109,29 @@ xfs_bmbt_disk_get_startoff( xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; } - -/* - * Set all the fields in a bmap extent record from the arguments. - */ -void -xfs_bmbt_set_allf( - xfs_bmbt_rec_host_t *r, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t blockcount, - xfs_exntst_t state) -{ - int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; - - ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); - ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); - ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); - - ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); - - r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)startoff << 9) | - ((xfs_bmbt_rec_base_t)startblock >> 43); - r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | - ((xfs_bmbt_rec_base_t)blockcount & - (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); -} - /* * Set all the fields in a bmap extent record from the uncompressed form. */ void -xfs_bmbt_set_all( - xfs_bmbt_rec_host_t *r, - xfs_bmbt_irec_t *s) -{ - xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock, - s->br_blockcount, s->br_state); -} - - -/* - * Set all the fields in a disk format bmap extent record from the arguments. - */ -void -xfs_bmbt_disk_set_allf( - xfs_bmbt_rec_t *r, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t blockcount, - xfs_exntst_t state) -{ - int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; - - ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); - ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); - ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); - ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); - - r->l0 = cpu_to_be64( - ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)startoff << 9) | - ((xfs_bmbt_rec_base_t)startblock >> 43)); - r->l1 = cpu_to_be64( - ((xfs_bmbt_rec_base_t)startblock << 21) | - ((xfs_bmbt_rec_base_t)blockcount & - (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); -} - -/* - * Set all the fields in a bmap extent record from the uncompressed form. - */ -STATIC void xfs_bmbt_disk_set_all( - xfs_bmbt_rec_t *r, - xfs_bmbt_irec_t *s) -{ - xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock, - s->br_blockcount, s->br_state); -} - -/* - * Set the blockcount field in a bmap extent record. - */ -void -xfs_bmbt_set_blockcount( - xfs_bmbt_rec_host_t *r, - xfs_filblks_t v) + struct xfs_bmbt_rec *r, + struct xfs_bmbt_irec *s) { - ASSERT((v & xfs_mask64hi(43)) == 0); - r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) | - (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21)); -} - -/* - * Set the startblock field in a bmap extent record. - */ -void -xfs_bmbt_set_startblock( - xfs_bmbt_rec_host_t *r, - xfs_fsblock_t v) -{ - ASSERT((v & xfs_mask64hi(12)) == 0); - r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) | - (xfs_bmbt_rec_base_t)(v >> 43); - r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) | - (xfs_bmbt_rec_base_t)(v << 21); -} + int extent_flag = (s->br_state != XFS_EXT_NORM); -/* - * Set the startoff field in a bmap extent record. - */ -void -xfs_bmbt_set_startoff( - xfs_bmbt_rec_host_t *r, - xfs_fileoff_t v) -{ - ASSERT((v & xfs_mask64hi(9)) == 0); - r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) | - ((xfs_bmbt_rec_base_t)v << 9) | - (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9)); -} + ASSERT(s->br_state == XFS_EXT_NORM || s->br_state == XFS_EXT_UNWRITTEN); + ASSERT(!(s->br_startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN))); + ASSERT(!(s->br_blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN))); + ASSERT(!(s->br_startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN))); -/* - * Set the extent state field in a bmap extent record. - */ -void -xfs_bmbt_set_state( - xfs_bmbt_rec_host_t *r, - xfs_exntst_t v) -{ - ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN); - if (v == XFS_EXT_NORM) - r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN); - else - r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN); + put_unaligned_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | + ((xfs_bmbt_rec_base_t)s->br_startblock >> 43), &r->l0); + put_unaligned_be64( + ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | + ((xfs_bmbt_rec_base_t)s->br_blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21)), &r->l1); } /* diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 9da5a8d4f184..135b8c56d23e 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -98,25 +98,11 @@ struct xfs_trans; */ extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int, struct xfs_btree_block *, int); -extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); -extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); -extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r); -extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); -extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); +void xfs_bmbt_disk_set_all(struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s); extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); - -extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); -extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o, - xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); -extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v); -extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v); -extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v); -extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v); - -extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, - xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); +extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, xfs_bmdr_block_t *, int); @@ -136,9 +122,9 @@ extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, * Check that the extent does not contain an invalid unwritten extent flag. */ static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork, - struct xfs_bmbt_rec_host *ep) + struct xfs_bmbt_irec *irec) { - if (ep->l0 >> (64 - BMBT_EXNTFLAG_BITLEN) == 0) + if (irec->br_state == XFS_EXT_NORM) return true; if (whichfork == XFS_DATA_FORK && xfs_sb_version_hasextflgbit(&mp->m_sb)) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5bfb88261c7e..5f33adf8eecb 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -29,6 +29,7 @@ #include "xfs_inode_item.h" #include "xfs_buf_item.h" #include "xfs_btree.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" @@ -63,44 +64,63 @@ xfs_btree_magic( return magic; } -STATIC int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_lblock( - struct xfs_btree_cur *cur, /* btree cursor */ - struct xfs_btree_block *block, /* btree long form block pointer */ - int level, /* level of the btree block */ - struct xfs_buf *bp) /* buffer for block, if any */ +/* + * Check a long btree block header. Return the address of the failing check, + * or NULL if everything is ok. + */ +xfs_failaddr_t +__xfs_btree_check_lblock( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *bp) { - int lblock_ok = 1; /* block passes checks */ - struct xfs_mount *mp; /* file system mount point */ + struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; - int crc; - - mp = cur->bc_mp; - crc = xfs_sb_version_hascrc(&mp->m_sb); + int crc = xfs_sb_version_hascrc(&mp->m_sb); if (crc) { - lblock_ok = lblock_ok && - uuid_equal(&block->bb_u.l.bb_uuid, - &mp->m_sb.sb_meta_uuid) && - block->bb_u.l.bb_blkno == cpu_to_be64( - bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (block->bb_u.l.bb_blkno != + cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL)) + return __this_address; + if (block->bb_u.l.bb_pad != cpu_to_be32(0)) + return __this_address; } - lblock_ok = lblock_ok && - be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) && - be16_to_cpu(block->bb_level) == level && - be16_to_cpu(block->bb_numrecs) <= - cur->bc_ops->get_maxrecs(cur, level) && - block->bb_u.l.bb_leftsib && - (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) || - XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_leftsib))) && - block->bb_u.l.bb_rightsib && - (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) || - XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_rightsib))); - - if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, + if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum)) + return __this_address; + if (be16_to_cpu(block->bb_level) != level) + return __this_address; + if (be16_to_cpu(block->bb_numrecs) > + cur->bc_ops->get_maxrecs(cur, level)) + return __this_address; + if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && + !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib), + level + 1)) + return __this_address; + if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && + !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib), + level + 1)) + return __this_address; + + return NULL; +} + +/* Check a long btree block header. */ +static int +xfs_btree_check_lblock( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_failaddr_t fa; + + fa = __xfs_btree_check_lblock(cur, block, level, bp); + if (unlikely(XFS_TEST_ERROR(fa != NULL, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); @@ -110,48 +130,61 @@ xfs_btree_check_lblock( return 0; } -STATIC int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_sblock( - struct xfs_btree_cur *cur, /* btree cursor */ - struct xfs_btree_block *block, /* btree short form block pointer */ - int level, /* level of the btree block */ - struct xfs_buf *bp) /* buffer containing block */ +/* + * Check a short btree block header. Return the address of the failing check, + * or NULL if everything is ok. + */ +xfs_failaddr_t +__xfs_btree_check_sblock( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *bp) { - struct xfs_mount *mp; /* file system mount point */ - struct xfs_buf *agbp; /* buffer for ag. freespace struct */ - struct xfs_agf *agf; /* ag. freespace structure */ - xfs_agblock_t agflen; /* native ag. freespace length */ - int sblock_ok = 1; /* block passes checks */ + struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; - int crc; - - mp = cur->bc_mp; - crc = xfs_sb_version_hascrc(&mp->m_sb); - agbp = cur->bc_private.a.agbp; - agf = XFS_BUF_TO_AGF(agbp); - agflen = be32_to_cpu(agf->agf_length); + int crc = xfs_sb_version_hascrc(&mp->m_sb); if (crc) { - sblock_ok = sblock_ok && - uuid_equal(&block->bb_u.s.bb_uuid, - &mp->m_sb.sb_meta_uuid) && - block->bb_u.s.bb_blkno == cpu_to_be64( - bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (block->bb_u.s.bb_blkno != + cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL)) + return __this_address; } - sblock_ok = sblock_ok && - be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) && - be16_to_cpu(block->bb_level) == level && - be16_to_cpu(block->bb_numrecs) <= - cur->bc_ops->get_maxrecs(cur, level) && - (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) && - block->bb_u.s.bb_leftsib && - (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && - block->bb_u.s.bb_rightsib; - - if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp, + if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum)) + return __this_address; + if (be16_to_cpu(block->bb_level) != level) + return __this_address; + if (be16_to_cpu(block->bb_numrecs) > + cur->bc_ops->get_maxrecs(cur, level)) + return __this_address; + if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && + !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib), + level + 1)) + return __this_address; + if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && + !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib), + level + 1)) + return __this_address; + + return NULL; +} + +/* Check a short btree block header. */ +STATIC int +xfs_btree_check_sblock( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_failaddr_t fa; + + fa = __xfs_btree_check_sblock(cur, block, level, bp); + if (unlikely(XFS_TEST_ERROR(fa != NULL, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); @@ -177,59 +210,53 @@ xfs_btree_check_block( return xfs_btree_check_sblock(cur, block, level, bp); } -/* - * Check that (long) pointer is ok. - */ -int /* error (0 or EFSCORRUPTED) */ +/* Check that this long pointer is valid and points within the fs. */ +bool xfs_btree_check_lptr( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_fsblock_t bno, /* btree block disk address */ - int level) /* btree block level */ + struct xfs_btree_cur *cur, + xfs_fsblock_t fsbno, + int level) { - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, - level > 0 && - bno != NULLFSBLOCK && - XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); - return 0; + if (level <= 0) + return false; + return xfs_verify_fsbno(cur->bc_mp, fsbno); } -#ifdef DEBUG -/* - * Check that (short) pointer is ok. - */ -STATIC int /* error (0 or EFSCORRUPTED) */ +/* Check that this short pointer is valid and points within the AG. */ +bool xfs_btree_check_sptr( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agblock_t bno, /* btree block disk address */ - int level) /* btree block level */ + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + int level) { - xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; - - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, - level > 0 && - bno != NULLAGBLOCK && - bno != 0 && - bno < agblocks); - return 0; + if (level <= 0) + return false; + return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno); } +#ifdef DEBUG /* - * Check that block ptr is ok. + * Check that a given (indexed) btree pointer at a certain level of a + * btree is valid and doesn't point past where it should. */ -STATIC int /* error (0 or EFSCORRUPTED) */ +static int xfs_btree_check_ptr( - struct xfs_btree_cur *cur, /* btree cursor */ - union xfs_btree_ptr *ptr, /* btree block disk address */ - int index, /* offset from ptr to check */ - int level) /* btree block level */ + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int index, + int level) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - return xfs_btree_check_lptr(cur, - be64_to_cpu((&ptr->l)[index]), level); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, + xfs_btree_check_lptr(cur, + be64_to_cpu((&ptr->l)[index]), level)); } else { - return xfs_btree_check_sptr(cur, - be32_to_cpu((&ptr->s)[index]), level); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, + xfs_btree_check_sptr(cur, + be32_to_cpu((&ptr->s)[index]), level)); } + + return 0; } #endif @@ -1027,7 +1054,7 @@ xfs_btree_setbuf( } } -STATIC int +bool xfs_btree_ptr_is_null( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) @@ -1052,7 +1079,7 @@ xfs_btree_set_ptr_null( /* * Get/set/init sibling pointers */ -STATIC void +void xfs_btree_get_sibling( struct xfs_btree_cur *cur, struct xfs_btree_block *block, @@ -2001,7 +2028,7 @@ error0: } /* Find the high key storage area from a regular key. */ -STATIC union xfs_btree_key * +union xfs_btree_key * xfs_btree_high_key_from_key( struct xfs_btree_cur *cur, union xfs_btree_key *key) @@ -2075,7 +2102,7 @@ xfs_btree_get_node_keys( } /* Derive the keys for any btree block. */ -STATIC void +void xfs_btree_get_keys( struct xfs_btree_cur *cur, struct xfs_btree_block *block, @@ -4914,3 +4941,15 @@ xfs_btree_count_blocks( return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, blocks); } + +/* Compare two btree pointers. */ +int64_t +xfs_btree_diff_two_ptrs( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *a, + const union xfs_btree_ptr *b) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); + return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index f2a88c3b1159..b57501c6f71d 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -255,6 +255,14 @@ typedef struct xfs_btree_cur */ #define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) +/* + * Internal long and short btree block checks. They return NULL if the + * block is ok or the address of the failed check otherwise. + */ +xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, int level, struct xfs_buf *bp); +xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, int level, struct xfs_buf *bp); /* * Check that block header is ok. @@ -269,10 +277,19 @@ xfs_btree_check_block( /* * Check that (long) pointer is ok. */ -int /* error (0 or EFSCORRUPTED) */ +bool /* error (0 or EFSCORRUPTED) */ xfs_btree_check_lptr( struct xfs_btree_cur *cur, /* btree cursor */ - xfs_fsblock_t ptr, /* btree block disk address */ + xfs_fsblock_t fsbno, /* btree block disk address */ + int level); /* btree block level */ + +/* + * Check that (short) pointer is ok. + */ +bool /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t agbno, /* btree block disk address */ int level); /* btree block level */ /* @@ -517,5 +534,16 @@ int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level, union xfs_btree_ptr *pp, struct xfs_btree_block **blkp); struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur, int level, struct xfs_buf **bpp); +bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); +int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *a, + const union xfs_btree_ptr *b); +void xfs_btree_get_sibling(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_ptr *ptr, int lr); +void xfs_btree_get_keys(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, union xfs_btree_key *key); +union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, + union xfs_btree_key *key); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 6d4335815c3f..651611530d2f 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -1466,6 +1466,7 @@ xfs_da3_node_lookup_int( int max; int error; int retval; + unsigned int expected_level = 0; struct xfs_inode *dp = state->args->dp; args = state->args; @@ -1474,7 +1475,7 @@ xfs_da3_node_lookup_int( * Descend thru the B-tree searching each level for the right * node to use, until the right hashval is found. */ - blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0; + blkno = args->geo->leafblk; for (blk = &state->path.blk[0], state->path.active = 1; state->path.active <= XFS_DA_NODE_MAXDEPTH; blk++, state->path.active++) { @@ -1517,6 +1518,18 @@ xfs_da3_node_lookup_int( dp->d_ops->node_hdr_from_disk(&nodehdr, node); btree = dp->d_ops->node_tree_p(node); + /* Tree taller than we can handle; bail out! */ + if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) + return -EFSCORRUPTED; + + /* Check the level from the root. */ + if (blkno == args->geo->leafblk) + expected_level = nodehdr.level - 1; + else if (expected_level != nodehdr.level) + return -EFSCORRUPTED; + else + expected_level--; + max = nodehdr.count; blk->hashval = be32_to_cpu(btree[max - 1].hashval); @@ -1562,8 +1575,15 @@ xfs_da3_node_lookup_int( blk->index = probe; blkno = be32_to_cpu(btree[probe].before); } + + /* We can't point back to the root. */ + if (blkno == args->geo->leafblk) + return -EFSCORRUPTED; } + if (expected_level != 0) + return -EFSCORRUPTED; + /* * A leaf block that ends in the hashval that we are interested in * (final hashval == search hashval) means that the next block may diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 072ebfe1d6ae..087fea02c389 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -249,6 +249,10 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE); + /* Hold the (previously bjoin'd) buffer locked across the roll. */ + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) + xfs_trans_dirty_buf(*tp, dop->dop_bufs[i]); + trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); /* Roll the transaction. */ @@ -264,6 +268,12 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0); + /* Rejoin the buffers and dirty them so the log moves forward. */ + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) { + xfs_trans_bjoin(*tp, dop->dop_bufs[i]); + xfs_trans_bhold(*tp, dop->dop_bufs[i]); + } + return error; } @@ -295,6 +305,31 @@ xfs_defer_ijoin( } } + ASSERT(0); + return -EFSCORRUPTED; +} + +/* + * Add this buffer to the deferred op. Each joined buffer is relogged + * each time we roll the transaction. + */ +int +xfs_defer_bjoin( + struct xfs_defer_ops *dop, + struct xfs_buf *bp) +{ + int i; + + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS; i++) { + if (dop->dop_bufs[i] == bp) + return 0; + else if (dop->dop_bufs[i] == NULL) { + dop->dop_bufs[i] = bp; + return 0; + } + } + + ASSERT(0); return -EFSCORRUPTED; } @@ -493,9 +528,7 @@ xfs_defer_init( struct xfs_defer_ops *dop, xfs_fsblock_t *fbp) { - dop->dop_committed = false; - dop->dop_low = false; - memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes)); + memset(dop, 0, sizeof(struct xfs_defer_ops)); *fbp = NULLFSBLOCK; INIT_LIST_HEAD(&dop->dop_intake); INIT_LIST_HEAD(&dop->dop_pending); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index d4f046dd44bd..045beacdd37d 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -59,6 +59,7 @@ enum xfs_defer_ops_type { }; #define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ +#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */ struct xfs_defer_ops { bool dop_committed; /* did any trans commit? */ @@ -66,8 +67,9 @@ struct xfs_defer_ops { struct list_head dop_intake; /* unlogged pending work */ struct list_head dop_pending; /* logged pending work */ - /* relog these inodes with each roll */ + /* relog these with each roll */ struct xfs_inode *dop_inodes[XFS_DEFER_OPS_NR_INODES]; + struct xfs_buf *dop_bufs[XFS_DEFER_OPS_NR_BUFS]; }; void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type, @@ -77,6 +79,7 @@ void xfs_defer_cancel(struct xfs_defer_ops *dop); void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp); bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop); int xfs_defer_ijoin(struct xfs_defer_ops *dop, struct xfs_inode *ip); +int xfs_defer_bjoin(struct xfs_defer_ops *dop, struct xfs_buf *bp); /* Description of a deferred type. */ struct xfs_defer_op_type { diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index ccf9783fd3f0..e10778c102ea 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -30,6 +30,8 @@ #include "xfs_bmap.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" +#include "xfs_ialloc.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -38,7 +40,9 @@ struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; /* * Convert inode mode to directory entry filetype */ -unsigned char xfs_mode_to_ftype(int mode) +unsigned char +xfs_mode_to_ftype( + int mode) { switch (mode & S_IFMT) { case S_IFREG: @@ -202,22 +206,8 @@ xfs_dir_ino_validate( xfs_mount_t *mp, xfs_ino_t ino) { - xfs_agblock_t agblkno; - xfs_agino_t agino; - xfs_agnumber_t agno; - int ino_ok; - int ioff; - - agno = XFS_INO_TO_AGNO(mp, ino); - agblkno = XFS_INO_TO_AGBNO(mp, ino); - ioff = XFS_INO_TO_OFFSET(mp, ino); - agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff); - ino_ok = - agno < mp->m_sb.sb_agcount && - agblkno < mp->m_sb.sb_agblocks && - agblkno != 0 && - ioff < (1 << mp->m_sb.sb_inopblog) && - XFS_AGINO_TO_INO(mp, agno, agino) == ino; + bool ino_ok = xfs_verify_dir_ino(mp, ino); + if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE))) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 21c8f8bf94d5..1a8f2cf977ca 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -324,4 +324,21 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) sizeof(struct xfs_dir2_leaf_tail)); } +/* + * The Linux API doesn't pass down the total size of the buffer + * we read into down to the filesystem. With the filldir concept + * it's not needed for correct information, but the XFS dir2 leaf + * code wants an estimate of the buffer size to calculate it's + * readahead window and size the buffers used for mapping to + * physical blocks. + * + * Try to give it an estimate that's good enough, maybe at some + * point we can change the ->readdir prototype to include the + * buffer size. For now we use the current glibc buffer size. + * musl libc hardcodes 2k and dietlibc uses PAGE_SIZE. + */ +#define XFS_READDIR_BUFSIZE (32768) + +unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype); + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h new file mode 100644 index 000000000000..bc1789d95152 --- /dev/null +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (C) 2017 Oracle. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_ERRORTAG_H_ +#define __XFS_ERRORTAG_H_ + +/* + * error injection tags - the labels can be anything you want + * but each tag should have its own unique number + */ + +#define XFS_ERRTAG_NOERROR 0 +#define XFS_ERRTAG_IFLUSH_1 1 +#define XFS_ERRTAG_IFLUSH_2 2 +#define XFS_ERRTAG_IFLUSH_3 3 +#define XFS_ERRTAG_IFLUSH_4 4 +#define XFS_ERRTAG_IFLUSH_5 5 +#define XFS_ERRTAG_IFLUSH_6 6 +#define XFS_ERRTAG_DA_READ_BUF 7 +#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8 +#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9 +#define XFS_ERRTAG_ALLOC_READ_AGF 10 +#define XFS_ERRTAG_IALLOC_READ_AGI 11 +#define XFS_ERRTAG_ITOBP_INOTOBP 12 +#define XFS_ERRTAG_IUNLINK 13 +#define XFS_ERRTAG_IUNLINK_REMOVE 14 +#define XFS_ERRTAG_DIR_INO_VALIDATE 15 +#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16 +#define XFS_ERRTAG_IODONE_IOERR 17 +#define XFS_ERRTAG_STRATREAD_IOERR 18 +#define XFS_ERRTAG_STRATCMPL_IOERR 19 +#define XFS_ERRTAG_DIOWRITE_IOERR 20 +#define XFS_ERRTAG_BMAPIFORMAT 21 +#define XFS_ERRTAG_FREE_EXTENT 22 +#define XFS_ERRTAG_RMAP_FINISH_ONE 23 +#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24 +#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 +#define XFS_ERRTAG_BMAP_FINISH_ONE 26 +#define XFS_ERRTAG_AG_RESV_CRITICAL 27 +/* + * DEBUG mode instrumentation to test and/or trigger delayed allocation + * block killing in the event of failed writes. When enabled, all + * buffered writes are silenty dropped and handled as if they failed. + * All delalloc blocks in the range of the write (including pre-existing + * delalloc blocks!) are tossed as part of the write failure error + * handling sequence. + */ +#define XFS_ERRTAG_DROP_WRITES 28 +#define XFS_ERRTAG_LOG_BAD_CRC 29 +#define XFS_ERRTAG_LOG_ITEM_PIN 30 +#define XFS_ERRTAG_BUF_LRU_REF 31 +#define XFS_ERRTAG_MAX 32 + +/* + * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. + */ +#define XFS_RANDOM_DEFAULT 100 +#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT +#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4) +#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT +#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT +#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT +#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT +#define XFS_RANDOM_FREE_EXTENT 1 +#define XFS_RANDOM_RMAP_FINISH_ONE 1 +#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1 +#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 +#define XFS_RANDOM_BMAP_FINISH_ONE 1 +#define XFS_RANDOM_AG_RESV_CRITICAL 4 +#define XFS_RANDOM_DROP_WRITES 1 +#define XFS_RANDOM_LOG_BAD_CRC 1 +#define XFS_RANDOM_LOG_ITEM_PIN 1 +#define XFS_RANDOM_BUF_LRU_REF 2 + +#endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 23229f0c5b15..1acb584fc5f7 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -315,6 +315,11 @@ static inline bool xfs_sb_good_version(struct xfs_sb *sbp) return false; } +static inline bool xfs_sb_version_hasrealtime(struct xfs_sb *sbp) +{ + return sbp->sb_rblocks > 0; +} + /* * Detect a mismatched features2 field. Older kernels read/wrote * this into the wrong slot, so to be safe we keep them in sync. @@ -500,12 +505,12 @@ xfs_sb_has_incompat_log_feature( /* * V5 superblock specific feature checks */ -static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp) +static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } -static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp) +static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } @@ -518,7 +523,7 @@ static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); } -static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasfinobt(xfs_sb_t *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); @@ -941,7 +946,7 @@ typedef enum xfs_dinode_fmt { XFS_DINODE_FMT_LOCAL, /* bulk data */ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ - XFS_DINODE_FMT_UUID /* uuid_t */ + XFS_DINODE_FMT_UUID /* added long ago, but never used */ } xfs_dinode_fmt_t; /* @@ -1142,7 +1147,7 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) * Dquot and dquot block format definitions */ #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ -#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */ +#define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */ /* * This is the main portion of the on-disk representation of quota @@ -1548,10 +1553,6 @@ typedef struct xfs_bmbt_rec { typedef uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; -typedef struct xfs_bmbt_rec_host { - uint64_t l0, l1; -} xfs_bmbt_rec_host_t; - /* * Values and macros for delayed-allocation startblock fields. */ @@ -1577,24 +1578,6 @@ static inline xfs_filblks_t startblockval(xfs_fsblock_t x) } /* - * Possible extent states. - */ -typedef enum { - XFS_EXT_NORM, XFS_EXT_UNWRITTEN, -} xfs_exntst_t; - -/* - * Incore version of above. - */ -typedef struct xfs_bmbt_irec -{ - xfs_fileoff_t br_startoff; /* starting file offset */ - xfs_fsblock_t br_startblock; /* starting block number */ - xfs_filblks_t br_blockcount; /* number of blocks */ - xfs_exntst_t br_state; /* extent state */ -} xfs_bmbt_irec_t; - -/* * Key structure for non-leaf levels of the tree. */ typedef struct xfs_bmbt_key { diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 8c61f21535d4..b90924104596 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -468,6 +468,82 @@ typedef struct xfs_swapext #define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ #define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ +/* metadata scrubbing */ +struct xfs_scrub_metadata { + __u32 sm_type; /* What to check? */ + __u32 sm_flags; /* flags; see below. */ + __u64 sm_ino; /* inode number. */ + __u32 sm_gen; /* inode generation. */ + __u32 sm_agno; /* ag number. */ + __u64 sm_reserved[5]; /* pad to 64 bytes */ +}; + +/* + * Metadata types and flags for scrub operation. + */ + +/* Scrub subcommands. */ +#define XFS_SCRUB_TYPE_PROBE 0 /* presence test ioctl */ +#define XFS_SCRUB_TYPE_SB 1 /* superblock */ +#define XFS_SCRUB_TYPE_AGF 2 /* AG free header */ +#define XFS_SCRUB_TYPE_AGFL 3 /* AG free list */ +#define XFS_SCRUB_TYPE_AGI 4 /* AG inode header */ +#define XFS_SCRUB_TYPE_BNOBT 5 /* freesp by block btree */ +#define XFS_SCRUB_TYPE_CNTBT 6 /* freesp by length btree */ +#define XFS_SCRUB_TYPE_INOBT 7 /* inode btree */ +#define XFS_SCRUB_TYPE_FINOBT 8 /* free inode btree */ +#define XFS_SCRUB_TYPE_RMAPBT 9 /* reverse mapping btree */ +#define XFS_SCRUB_TYPE_REFCNTBT 10 /* reference count btree */ +#define XFS_SCRUB_TYPE_INODE 11 /* inode record */ +#define XFS_SCRUB_TYPE_BMBTD 12 /* data fork block mapping */ +#define XFS_SCRUB_TYPE_BMBTA 13 /* attr fork block mapping */ +#define XFS_SCRUB_TYPE_BMBTC 14 /* CoW fork block mapping */ +#define XFS_SCRUB_TYPE_DIR 15 /* directory */ +#define XFS_SCRUB_TYPE_XATTR 16 /* extended attribute */ +#define XFS_SCRUB_TYPE_SYMLINK 17 /* symbolic link */ +#define XFS_SCRUB_TYPE_PARENT 18 /* parent pointers */ +#define XFS_SCRUB_TYPE_RTBITMAP 19 /* realtime bitmap */ +#define XFS_SCRUB_TYPE_RTSUM 20 /* realtime summary */ +#define XFS_SCRUB_TYPE_UQUOTA 21 /* user quotas */ +#define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */ +#define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */ + +/* Number of scrub subcommands. */ +#define XFS_SCRUB_TYPE_NR 24 + +/* i: Repair this metadata. */ +#define XFS_SCRUB_IFLAG_REPAIR (1 << 0) + +/* o: Metadata object needs repair. */ +#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1) + +/* + * o: Metadata object could be optimized. It's not corrupt, but + * we could improve on it somehow. + */ +#define XFS_SCRUB_OFLAG_PREEN (1 << 2) + +/* o: Cross-referencing failed. */ +#define XFS_SCRUB_OFLAG_XFAIL (1 << 3) + +/* o: Metadata object disagrees with cross-referenced metadata. */ +#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4) + +/* o: Scan was not complete. */ +#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5) + +/* o: Metadata object looked funny but isn't corrupt. */ +#define XFS_SCRUB_OFLAG_WARNING (1 << 6) + +#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR) +#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \ + XFS_SCRUB_OFLAG_PREEN | \ + XFS_SCRUB_OFLAG_XFAIL | \ + XFS_SCRUB_OFLAG_XCORRUPT | \ + XFS_SCRUB_OFLAG_INCOMPLETE | \ + XFS_SCRUB_OFLAG_WARNING) +#define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT) + /* * ioctl limits */ @@ -511,6 +587,7 @@ typedef struct xfs_swapext #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) #define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks) /* XFS_IOC_GETFSMAP ------ hoisted 59 */ +#define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata) /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index dfd643909f85..3b57ef0f2f76 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -31,6 +31,7 @@ #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_rtalloc.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_cksum.h" @@ -919,8 +920,7 @@ STATIC xfs_agnumber_t xfs_ialloc_ag_select( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent directory inode number */ - umode_t mode, /* bits set to indicate file type */ - int okalloc) /* ok to allocate more space */ + umode_t mode) /* bits set to indicate file type */ { xfs_agnumber_t agcount; /* number of ag's in the filesystem */ xfs_agnumber_t agno; /* current ag number */ @@ -977,9 +977,6 @@ xfs_ialloc_ag_select( return agno; } - if (!okalloc) - goto nextag; - if (!pag->pagf_init) { error = xfs_alloc_pagf_init(mp, tp, agno, flags); if (error) @@ -1679,7 +1676,6 @@ xfs_dialloc( struct xfs_trans *tp, xfs_ino_t parent, umode_t mode, - int okalloc, struct xfs_buf **IO_agbp, xfs_ino_t *inop) { @@ -1691,6 +1687,7 @@ xfs_dialloc( int noroom = 0; xfs_agnumber_t start_agno; struct xfs_perag *pag; + int okalloc = 1; if (*IO_agbp) { /* @@ -1706,7 +1703,7 @@ xfs_dialloc( * We do not have an agbp, so select an initial allocation * group for inode allocation. */ - start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + start_agno = xfs_ialloc_ag_select(tp, parent, mode); if (start_agno == NULLAGNUMBER) { *inop = NULLFSINO; return 0; @@ -2664,3 +2661,93 @@ xfs_ialloc_pagi_init( xfs_trans_brelse(tp, bp); return 0; } + +/* Calculate the first and last possible inode number in an AG. */ +void +xfs_ialloc_agino_range( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t *first, + xfs_agino_t *last) +{ + xfs_agblock_t bno; + xfs_agblock_t eoag; + + eoag = xfs_ag_block_count(mp, agno); + + /* + * Calculate the first inode, which will be in the first + * cluster-aligned block after the AGFL. + */ + bno = round_up(XFS_AGFL_BLOCK(mp) + 1, + xfs_ialloc_cluster_alignment(mp)); + *first = XFS_OFFBNO_TO_AGINO(mp, bno, 0); + + /* + * Calculate the last inode, which will be at the end of the + * last (aligned) cluster that can be allocated in the AG. + */ + bno = round_down(eoag, xfs_ialloc_cluster_alignment(mp)); + *last = XFS_OFFBNO_TO_AGINO(mp, bno, 0) - 1; +} + +/* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata. + */ +bool +xfs_verify_agino( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino) +{ + xfs_agino_t first; + xfs_agino_t last; + + xfs_ialloc_agino_range(mp, agno, &first, &last); + return agino >= first && agino <= last; +} + +/* + * Verify that an FS inode number pointer neither points outside the + * filesystem nor points at static AG metadata. + */ +bool +xfs_verify_ino( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + + if (agno >= mp->m_sb.sb_agcount) + return false; + if (XFS_AGINO_TO_INO(mp, agno, agino) != ino) + return false; + return xfs_verify_agino(mp, agno, agino); +} + +/* Is this an internal inode number? */ +bool +xfs_internal_inum( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + return ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || + (xfs_sb_version_hasquota(&mp->m_sb) && + xfs_is_quota_inode(&mp->m_sb, ino)); +} + +/* + * Verify that a directory entry's inode number doesn't point at an internal + * inode, empty space, or static AG metadata. + */ +bool +xfs_verify_dir_ino( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + if (xfs_internal_inum(mp, ino)) + return false; + return xfs_verify_ino(mp, ino); +} diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index b32cfb5aeb5b..66a8de0b1caa 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -81,7 +81,6 @@ xfs_dialloc( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ umode_t mode, /* mode bits for new inode */ - int okalloc, /* ok to allocate more space */ struct xfs_buf **agbp, /* buf for a.g. inode header */ xfs_ino_t *inop); /* inode number allocated */ @@ -173,5 +172,12 @@ void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); +void xfs_ialloc_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t *first, xfs_agino_t *last); +bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t agino); +bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); +bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); +bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c new file mode 100644 index 000000000000..b0f31791c7e6 --- /dev/null +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -0,0 +1,1043 @@ +/* + * Copyright (c) 2017 Christoph Hellwig. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/cache.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trace.h" + +/* + * In-core extent record layout: + * + * +-------+----------------------------+ + * | 00:53 | all 54 bits of startoff | + * | 54:63 | low 10 bits of startblock | + * +-------+----------------------------+ + * | 00:20 | all 21 bits of length | + * | 21 | unwritten extent bit | + * | 22:63 | high 42 bits of startblock | + * +-------+----------------------------+ + */ +#define XFS_IEXT_STARTOFF_MASK xfs_mask64lo(BMBT_STARTOFF_BITLEN) +#define XFS_IEXT_LENGTH_MASK xfs_mask64lo(BMBT_BLOCKCOUNT_BITLEN) +#define XFS_IEXT_STARTBLOCK_MASK xfs_mask64lo(BMBT_STARTBLOCK_BITLEN) + +struct xfs_iext_rec { + uint64_t lo; + uint64_t hi; +}; + +/* + * Given that the length can't be a zero, only an empty hi value indicates an + * unused record. + */ +static bool xfs_iext_rec_is_empty(struct xfs_iext_rec *rec) +{ + return rec->hi == 0; +} + +static inline void xfs_iext_rec_clear(struct xfs_iext_rec *rec) +{ + rec->lo = 0; + rec->hi = 0; +} + +static void +xfs_iext_set( + struct xfs_iext_rec *rec, + struct xfs_bmbt_irec *irec) +{ + ASSERT((irec->br_startoff & ~XFS_IEXT_STARTOFF_MASK) == 0); + ASSERT((irec->br_blockcount & ~XFS_IEXT_LENGTH_MASK) == 0); + ASSERT((irec->br_startblock & ~XFS_IEXT_STARTBLOCK_MASK) == 0); + + rec->lo = irec->br_startoff & XFS_IEXT_STARTOFF_MASK; + rec->hi = irec->br_blockcount & XFS_IEXT_LENGTH_MASK; + + rec->lo |= (irec->br_startblock << 54); + rec->hi |= ((irec->br_startblock & ~xfs_mask64lo(10)) << (22 - 10)); + + if (irec->br_state == XFS_EXT_UNWRITTEN) + rec->hi |= (1 << 21); +} + +static void +xfs_iext_get( + struct xfs_bmbt_irec *irec, + struct xfs_iext_rec *rec) +{ + irec->br_startoff = rec->lo & XFS_IEXT_STARTOFF_MASK; + irec->br_blockcount = rec->hi & XFS_IEXT_LENGTH_MASK; + + irec->br_startblock = rec->lo >> 54; + irec->br_startblock |= (rec->hi & xfs_mask64hi(42)) >> (22 - 10); + + if (rec->hi & (1 << 21)) + irec->br_state = XFS_EXT_UNWRITTEN; + else + irec->br_state = XFS_EXT_NORM; +} + +enum { + NODE_SIZE = 256, + KEYS_PER_NODE = NODE_SIZE / (sizeof(uint64_t) + sizeof(void *)), + RECS_PER_LEAF = (NODE_SIZE - (2 * sizeof(struct xfs_iext_leaf *))) / + sizeof(struct xfs_iext_rec), +}; + +/* + * In-core extent btree block layout: + * + * There are two types of blocks in the btree: leaf and inner (non-leaf) blocks. + * + * The leaf blocks are made up by %KEYS_PER_NODE extent records, which each + * contain the startoffset, blockcount, startblock and unwritten extent flag. + * See above for the exact format, followed by pointers to the previous and next + * leaf blocks (if there are any). + * + * The inner (non-leaf) blocks first contain KEYS_PER_NODE lookup keys, followed + * by an equal number of pointers to the btree blocks at the next lower level. + * + * +-------+-------+-------+-------+-------+----------+----------+ + * Leaf: | rec 1 | rec 2 | rec 3 | rec 4 | rec N | prev-ptr | next-ptr | + * +-------+-------+-------+-------+-------+----------+----------+ + * + * +-------+-------+-------+-------+-------+-------+------+-------+ + * Inner: | key 1 | key 2 | key 3 | key N | ptr 1 | ptr 2 | ptr3 | ptr N | + * +-------+-------+-------+-------+-------+-------+------+-------+ + */ +struct xfs_iext_node { + uint64_t keys[KEYS_PER_NODE]; +#define XFS_IEXT_KEY_INVALID (1ULL << 63) + void *ptrs[KEYS_PER_NODE]; +}; + +struct xfs_iext_leaf { + struct xfs_iext_rec recs[RECS_PER_LEAF]; + struct xfs_iext_leaf *prev; + struct xfs_iext_leaf *next; +}; + +inline xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp) +{ + return ifp->if_bytes / sizeof(struct xfs_iext_rec); +} + +static inline int xfs_iext_max_recs(struct xfs_ifork *ifp) +{ + if (ifp->if_height == 1) + return xfs_iext_count(ifp); + return RECS_PER_LEAF; +} + +static inline struct xfs_iext_rec *cur_rec(struct xfs_iext_cursor *cur) +{ + return &cur->leaf->recs[cur->pos]; +} + +static inline bool xfs_iext_valid(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur) +{ + if (!cur->leaf) + return false; + if (cur->pos < 0 || cur->pos >= xfs_iext_max_recs(ifp)) + return false; + if (xfs_iext_rec_is_empty(cur_rec(cur))) + return false; + return true; +} + +static void * +xfs_iext_find_first_leaf( + struct xfs_ifork *ifp) +{ + struct xfs_iext_node *node = ifp->if_u1.if_root; + int height; + + if (!ifp->if_height) + return NULL; + + for (height = ifp->if_height; height > 1; height--) { + node = node->ptrs[0]; + ASSERT(node); + } + + return node; +} + +static void * +xfs_iext_find_last_leaf( + struct xfs_ifork *ifp) +{ + struct xfs_iext_node *node = ifp->if_u1.if_root; + int height, i; + + if (!ifp->if_height) + return NULL; + + for (height = ifp->if_height; height > 1; height--) { + for (i = 1; i < KEYS_PER_NODE; i++) + if (!node->ptrs[i]) + break; + node = node->ptrs[i - 1]; + ASSERT(node); + } + + return node; +} + +void +xfs_iext_first( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur) +{ + cur->pos = 0; + cur->leaf = xfs_iext_find_first_leaf(ifp); +} + +void +xfs_iext_last( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur) +{ + int i; + + cur->leaf = xfs_iext_find_last_leaf(ifp); + if (!cur->leaf) { + cur->pos = 0; + return; + } + + for (i = 1; i < xfs_iext_max_recs(ifp); i++) { + if (xfs_iext_rec_is_empty(&cur->leaf->recs[i])) + break; + } + cur->pos = i - 1; +} + +void +xfs_iext_next( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur) +{ + if (!cur->leaf) { + ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF); + xfs_iext_first(ifp, cur); + return; + } + + ASSERT(cur->pos >= 0); + ASSERT(cur->pos < xfs_iext_max_recs(ifp)); + + cur->pos++; + if (ifp->if_height > 1 && !xfs_iext_valid(ifp, cur) && + cur->leaf->next) { + cur->leaf = cur->leaf->next; + cur->pos = 0; + } +} + +void +xfs_iext_prev( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur) +{ + if (!cur->leaf) { + ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF); + xfs_iext_last(ifp, cur); + return; + } + + ASSERT(cur->pos >= 0); + ASSERT(cur->pos <= RECS_PER_LEAF); + +recurse: + do { + cur->pos--; + if (xfs_iext_valid(ifp, cur)) + return; + } while (cur->pos > 0); + + if (ifp->if_height > 1 && cur->leaf->prev) { + cur->leaf = cur->leaf->prev; + cur->pos = RECS_PER_LEAF; + goto recurse; + } +} + +static inline int +xfs_iext_key_cmp( + struct xfs_iext_node *node, + int n, + xfs_fileoff_t offset) +{ + if (node->keys[n] > offset) + return 1; + if (node->keys[n] < offset) + return -1; + return 0; +} + +static inline int +xfs_iext_rec_cmp( + struct xfs_iext_rec *rec, + xfs_fileoff_t offset) +{ + uint64_t rec_offset = rec->lo & XFS_IEXT_STARTOFF_MASK; + uint32_t rec_len = rec->hi & XFS_IEXT_LENGTH_MASK; + + if (rec_offset > offset) + return 1; + if (rec_offset + rec_len <= offset) + return -1; + return 0; +} + +static void * +xfs_iext_find_level( + struct xfs_ifork *ifp, + xfs_fileoff_t offset, + int level) +{ + struct xfs_iext_node *node = ifp->if_u1.if_root; + int height, i; + + if (!ifp->if_height) + return NULL; + + for (height = ifp->if_height; height > level; height--) { + for (i = 1; i < KEYS_PER_NODE; i++) + if (xfs_iext_key_cmp(node, i, offset) > 0) + break; + + node = node->ptrs[i - 1]; + if (!node) + break; + } + + return node; +} + +static int +xfs_iext_node_pos( + struct xfs_iext_node *node, + xfs_fileoff_t offset) +{ + int i; + + for (i = 1; i < KEYS_PER_NODE; i++) { + if (xfs_iext_key_cmp(node, i, offset) > 0) + break; + } + + return i - 1; +} + +static int +xfs_iext_node_insert_pos( + struct xfs_iext_node *node, + xfs_fileoff_t offset) +{ + int i; + + for (i = 0; i < KEYS_PER_NODE; i++) { + if (xfs_iext_key_cmp(node, i, offset) > 0) + return i; + } + + return KEYS_PER_NODE; +} + +static int +xfs_iext_node_nr_entries( + struct xfs_iext_node *node, + int start) +{ + int i; + + for (i = start; i < KEYS_PER_NODE; i++) { + if (node->keys[i] == XFS_IEXT_KEY_INVALID) + break; + } + + return i; +} + +static int +xfs_iext_leaf_nr_entries( + struct xfs_ifork *ifp, + struct xfs_iext_leaf *leaf, + int start) +{ + int i; + + for (i = start; i < xfs_iext_max_recs(ifp); i++) { + if (xfs_iext_rec_is_empty(&leaf->recs[i])) + break; + } + + return i; +} + +static inline uint64_t +xfs_iext_leaf_key( + struct xfs_iext_leaf *leaf, + int n) +{ + return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK; +} + +static void +xfs_iext_grow( + struct xfs_ifork *ifp) +{ + struct xfs_iext_node *node = kmem_zalloc(NODE_SIZE, KM_NOFS); + int i; + + if (ifp->if_height == 1) { + struct xfs_iext_leaf *prev = ifp->if_u1.if_root; + + node->keys[0] = xfs_iext_leaf_key(prev, 0); + node->ptrs[0] = prev; + } else { + struct xfs_iext_node *prev = ifp->if_u1.if_root; + + ASSERT(ifp->if_height > 1); + + node->keys[0] = prev->keys[0]; + node->ptrs[0] = prev; + } + + for (i = 1; i < KEYS_PER_NODE; i++) + node->keys[i] = XFS_IEXT_KEY_INVALID; + + ifp->if_u1.if_root = node; + ifp->if_height++; +} + +static void +xfs_iext_update_node( + struct xfs_ifork *ifp, + xfs_fileoff_t old_offset, + xfs_fileoff_t new_offset, + int level, + void *ptr) +{ + struct xfs_iext_node *node = ifp->if_u1.if_root; + int height, i; + + for (height = ifp->if_height; height > level; height--) { + for (i = 0; i < KEYS_PER_NODE; i++) { + if (i > 0 && xfs_iext_key_cmp(node, i, old_offset) > 0) + break; + if (node->keys[i] == old_offset) + node->keys[i] = new_offset; + } + node = node->ptrs[i - 1]; + ASSERT(node); + } + + ASSERT(node == ptr); +} + +static struct xfs_iext_node * +xfs_iext_split_node( + struct xfs_iext_node **nodep, + int *pos, + int *nr_entries) +{ + struct xfs_iext_node *node = *nodep; + struct xfs_iext_node *new = kmem_zalloc(NODE_SIZE, KM_NOFS); + const int nr_move = KEYS_PER_NODE / 2; + int nr_keep = nr_move + (KEYS_PER_NODE & 1); + int i = 0; + + /* for sequential append operations just spill over into the new node */ + if (*pos == KEYS_PER_NODE) { + *nodep = new; + *pos = 0; + *nr_entries = 0; + goto done; + } + + + for (i = 0; i < nr_move; i++) { + new->keys[i] = node->keys[nr_keep + i]; + new->ptrs[i] = node->ptrs[nr_keep + i]; + + node->keys[nr_keep + i] = XFS_IEXT_KEY_INVALID; + node->ptrs[nr_keep + i] = NULL; + } + + if (*pos >= nr_keep) { + *nodep = new; + *pos -= nr_keep; + *nr_entries = nr_move; + } else { + *nr_entries = nr_keep; + } +done: + for (; i < KEYS_PER_NODE; i++) + new->keys[i] = XFS_IEXT_KEY_INVALID; + return new; +} + +static void +xfs_iext_insert_node( + struct xfs_ifork *ifp, + uint64_t offset, + void *ptr, + int level) +{ + struct xfs_iext_node *node, *new; + int i, pos, nr_entries; + +again: + if (ifp->if_height < level) + xfs_iext_grow(ifp); + + new = NULL; + node = xfs_iext_find_level(ifp, offset, level); + pos = xfs_iext_node_insert_pos(node, offset); + nr_entries = xfs_iext_node_nr_entries(node, pos); + + ASSERT(pos >= nr_entries || xfs_iext_key_cmp(node, pos, offset) != 0); + ASSERT(nr_entries <= KEYS_PER_NODE); + + if (nr_entries == KEYS_PER_NODE) + new = xfs_iext_split_node(&node, &pos, &nr_entries); + + /* + * Update the pointers in higher levels if the first entry changes + * in an existing node. + */ + if (node != new && pos == 0 && nr_entries > 0) + xfs_iext_update_node(ifp, node->keys[0], offset, level, node); + + for (i = nr_entries; i > pos; i--) { + node->keys[i] = node->keys[i - 1]; + node->ptrs[i] = node->ptrs[i - 1]; + } + node->keys[pos] = offset; + node->ptrs[pos] = ptr; + + if (new) { + offset = new->keys[0]; + ptr = new; + level++; + goto again; + } +} + +static struct xfs_iext_leaf * +xfs_iext_split_leaf( + struct xfs_iext_cursor *cur, + int *nr_entries) +{ + struct xfs_iext_leaf *leaf = cur->leaf; + struct xfs_iext_leaf *new = kmem_zalloc(NODE_SIZE, KM_NOFS); + const int nr_move = RECS_PER_LEAF / 2; + int nr_keep = nr_move + (RECS_PER_LEAF & 1); + int i; + + /* for sequential append operations just spill over into the new node */ + if (cur->pos == RECS_PER_LEAF) { + cur->leaf = new; + cur->pos = 0; + *nr_entries = 0; + goto done; + } + + for (i = 0; i < nr_move; i++) { + new->recs[i] = leaf->recs[nr_keep + i]; + xfs_iext_rec_clear(&leaf->recs[nr_keep + i]); + } + + if (cur->pos >= nr_keep) { + cur->leaf = new; + cur->pos -= nr_keep; + *nr_entries = nr_move; + } else { + *nr_entries = nr_keep; + } +done: + if (leaf->next) + leaf->next->prev = new; + new->next = leaf->next; + new->prev = leaf; + leaf->next = new; + return new; +} + +static void +xfs_iext_alloc_root( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur) +{ + ASSERT(ifp->if_bytes == 0); + + ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS); + ifp->if_height = 1; + + /* now that we have a node step into it */ + cur->leaf = ifp->if_u1.if_root; + cur->pos = 0; +} + +static void +xfs_iext_realloc_root( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur) +{ + size_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec); + void *new; + + /* account for the prev/next pointers */ + if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF) + new_size = NODE_SIZE; + + new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS); + memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes); + ifp->if_u1.if_root = new; + cur->leaf = new; +} + +void +xfs_iext_insert( + struct xfs_inode *ip, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *irec, + int state) +{ + struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); + xfs_fileoff_t offset = irec->br_startoff; + struct xfs_iext_leaf *new = NULL; + int nr_entries, i; + + if (ifp->if_height == 0) + xfs_iext_alloc_root(ifp, cur); + else if (ifp->if_height == 1) + xfs_iext_realloc_root(ifp, cur); + + nr_entries = xfs_iext_leaf_nr_entries(ifp, cur->leaf, cur->pos); + ASSERT(nr_entries <= RECS_PER_LEAF); + ASSERT(cur->pos >= nr_entries || + xfs_iext_rec_cmp(cur_rec(cur), irec->br_startoff) != 0); + + if (nr_entries == RECS_PER_LEAF) + new = xfs_iext_split_leaf(cur, &nr_entries); + + /* + * Update the pointers in higher levels if the first entry changes + * in an existing node. + */ + if (cur->leaf != new && cur->pos == 0 && nr_entries > 0) { + xfs_iext_update_node(ifp, xfs_iext_leaf_key(cur->leaf, 0), + offset, 1, cur->leaf); + } + + for (i = nr_entries; i > cur->pos; i--) + cur->leaf->recs[i] = cur->leaf->recs[i - 1]; + xfs_iext_set(cur_rec(cur), irec); + ifp->if_bytes += sizeof(struct xfs_iext_rec); + + trace_xfs_iext_insert(ip, cur, state, _RET_IP_); + + if (new) + xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2); +} + +static struct xfs_iext_node * +xfs_iext_rebalance_node( + struct xfs_iext_node *parent, + int *pos, + struct xfs_iext_node *node, + int nr_entries) +{ + /* + * If the neighbouring nodes are completely full, or have different + * parents, we might never be able to merge our node, and will only + * delete it once the number of entries hits zero. + */ + if (nr_entries == 0) + return node; + + if (*pos > 0) { + struct xfs_iext_node *prev = parent->ptrs[*pos - 1]; + int nr_prev = xfs_iext_node_nr_entries(prev, 0), i; + + if (nr_prev + nr_entries <= KEYS_PER_NODE) { + for (i = 0; i < nr_entries; i++) { + prev->keys[nr_prev + i] = node->keys[i]; + prev->ptrs[nr_prev + i] = node->ptrs[i]; + } + return node; + } + } + + if (*pos + 1 < xfs_iext_node_nr_entries(parent, *pos)) { + struct xfs_iext_node *next = parent->ptrs[*pos + 1]; + int nr_next = xfs_iext_node_nr_entries(next, 0), i; + + if (nr_entries + nr_next <= KEYS_PER_NODE) { + /* + * Merge the next node into this node so that we don't + * have to do an additional update of the keys in the + * higher levels. + */ + for (i = 0; i < nr_next; i++) { + node->keys[nr_entries + i] = next->keys[i]; + node->ptrs[nr_entries + i] = next->ptrs[i]; + } + + ++*pos; + return next; + } + } + + return NULL; +} + +static void +xfs_iext_remove_node( + struct xfs_ifork *ifp, + xfs_fileoff_t offset, + void *victim) +{ + struct xfs_iext_node *node, *parent; + int level = 2, pos, nr_entries, i; + + ASSERT(level <= ifp->if_height); + node = xfs_iext_find_level(ifp, offset, level); + pos = xfs_iext_node_pos(node, offset); +again: + ASSERT(node->ptrs[pos]); + ASSERT(node->ptrs[pos] == victim); + kmem_free(victim); + + nr_entries = xfs_iext_node_nr_entries(node, pos) - 1; + offset = node->keys[0]; + for (i = pos; i < nr_entries; i++) { + node->keys[i] = node->keys[i + 1]; + node->ptrs[i] = node->ptrs[i + 1]; + } + node->keys[nr_entries] = XFS_IEXT_KEY_INVALID; + node->ptrs[nr_entries] = NULL; + + if (pos == 0 && nr_entries > 0) { + xfs_iext_update_node(ifp, offset, node->keys[0], level, node); + offset = node->keys[0]; + } + + if (nr_entries >= KEYS_PER_NODE / 2) + return; + + if (level < ifp->if_height) { + /* + * If we aren't at the root yet try to find a neighbour node to + * merge with (or delete the node if it is empty), and then + * recurse up to the next level. + */ + level++; + parent = xfs_iext_find_level(ifp, offset, level); + pos = xfs_iext_node_pos(parent, offset); + + ASSERT(pos != KEYS_PER_NODE); + ASSERT(parent->ptrs[pos] == node); + + node = xfs_iext_rebalance_node(parent, &pos, node, nr_entries); + if (node) { + victim = node; + node = parent; + goto again; + } + } else if (nr_entries == 1) { + /* + * If we are at the root and only one entry is left we can just + * free this node and update the root pointer. + */ + ASSERT(node == ifp->if_u1.if_root); + ifp->if_u1.if_root = node->ptrs[0]; + ifp->if_height--; + kmem_free(node); + } +} + +static void +xfs_iext_rebalance_leaf( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, + struct xfs_iext_leaf *leaf, + xfs_fileoff_t offset, + int nr_entries) +{ + /* + * If the neighbouring nodes are completely full we might never be able + * to merge our node, and will only delete it once the number of + * entries hits zero. + */ + if (nr_entries == 0) + goto remove_node; + + if (leaf->prev) { + int nr_prev = xfs_iext_leaf_nr_entries(ifp, leaf->prev, 0), i; + + if (nr_prev + nr_entries <= RECS_PER_LEAF) { + for (i = 0; i < nr_entries; i++) + leaf->prev->recs[nr_prev + i] = leaf->recs[i]; + + if (cur->leaf == leaf) { + cur->leaf = leaf->prev; + cur->pos += nr_prev; + } + goto remove_node; + } + } + + if (leaf->next) { + int nr_next = xfs_iext_leaf_nr_entries(ifp, leaf->next, 0), i; + + if (nr_entries + nr_next <= RECS_PER_LEAF) { + /* + * Merge the next node into this node so that we don't + * have to do an additional update of the keys in the + * higher levels. + */ + for (i = 0; i < nr_next; i++) { + leaf->recs[nr_entries + i] = + leaf->next->recs[i]; + } + + if (cur->leaf == leaf->next) { + cur->leaf = leaf; + cur->pos += nr_entries; + } + + offset = xfs_iext_leaf_key(leaf->next, 0); + leaf = leaf->next; + goto remove_node; + } + } + + return; +remove_node: + if (leaf->prev) + leaf->prev->next = leaf->next; + if (leaf->next) + leaf->next->prev = leaf->prev; + xfs_iext_remove_node(ifp, offset, leaf); +} + +static void +xfs_iext_free_last_leaf( + struct xfs_ifork *ifp) +{ + ifp->if_height--; + kmem_free(ifp->if_u1.if_root); + ifp->if_u1.if_root = NULL; +} + +void +xfs_iext_remove( + struct xfs_inode *ip, + struct xfs_iext_cursor *cur, + int state) +{ + struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); + struct xfs_iext_leaf *leaf = cur->leaf; + xfs_fileoff_t offset = xfs_iext_leaf_key(leaf, 0); + int i, nr_entries; + + trace_xfs_iext_remove(ip, cur, state, _RET_IP_); + + ASSERT(ifp->if_height > 0); + ASSERT(ifp->if_u1.if_root != NULL); + ASSERT(xfs_iext_valid(ifp, cur)); + + nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1; + for (i = cur->pos; i < nr_entries; i++) + leaf->recs[i] = leaf->recs[i + 1]; + xfs_iext_rec_clear(&leaf->recs[nr_entries]); + ifp->if_bytes -= sizeof(struct xfs_iext_rec); + + if (cur->pos == 0 && nr_entries > 0) { + xfs_iext_update_node(ifp, offset, xfs_iext_leaf_key(leaf, 0), 1, + leaf); + offset = xfs_iext_leaf_key(leaf, 0); + } else if (cur->pos == nr_entries) { + if (ifp->if_height > 1 && leaf->next) + cur->leaf = leaf->next; + else + cur->leaf = NULL; + cur->pos = 0; + } + + if (nr_entries >= RECS_PER_LEAF / 2) + return; + + if (ifp->if_height > 1) + xfs_iext_rebalance_leaf(ifp, cur, leaf, offset, nr_entries); + else if (nr_entries == 0) + xfs_iext_free_last_leaf(ifp); +} + +/* + * Lookup the extent covering bno. + * + * If there is an extent covering bno return the extent index, and store the + * expanded extent structure in *gotp, and the extent cursor in *cur. + * If there is no extent covering bno, but there is an extent after it (e.g. + * it lies in a hole) return that extent in *gotp and its cursor in *cur + * instead. + * If bno is beyond the last extent return false, and return an invalid + * cursor value. + */ +bool +xfs_iext_lookup_extent( + struct xfs_inode *ip, + struct xfs_ifork *ifp, + xfs_fileoff_t offset, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *gotp) +{ + XFS_STATS_INC(ip->i_mount, xs_look_exlist); + + cur->leaf = xfs_iext_find_level(ifp, offset, 1); + if (!cur->leaf) { + cur->pos = 0; + return false; + } + + for (cur->pos = 0; cur->pos < xfs_iext_max_recs(ifp); cur->pos++) { + struct xfs_iext_rec *rec = cur_rec(cur); + + if (xfs_iext_rec_is_empty(rec)) + break; + if (xfs_iext_rec_cmp(rec, offset) >= 0) + goto found; + } + + /* Try looking in the next node for an entry > offset */ + if (ifp->if_height == 1 || !cur->leaf->next) + return false; + cur->leaf = cur->leaf->next; + cur->pos = 0; + if (!xfs_iext_valid(ifp, cur)) + return false; +found: + xfs_iext_get(gotp, cur_rec(cur)); + return true; +} + +/* + * Returns the last extent before end, and if this extent doesn't cover + * end, update end to the end of the extent. + */ +bool +xfs_iext_lookup_extent_before( + struct xfs_inode *ip, + struct xfs_ifork *ifp, + xfs_fileoff_t *end, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *gotp) +{ + /* could be optimized to not even look up the next on a match.. */ + if (xfs_iext_lookup_extent(ip, ifp, *end - 1, cur, gotp) && + gotp->br_startoff <= *end - 1) + return true; + if (!xfs_iext_prev_extent(ifp, cur, gotp)) + return false; + *end = gotp->br_startoff + gotp->br_blockcount; + return true; +} + +void +xfs_iext_update_extent( + struct xfs_inode *ip, + int state, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *new) +{ + struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); + + if (cur->pos == 0) { + struct xfs_bmbt_irec old; + + xfs_iext_get(&old, cur_rec(cur)); + if (new->br_startoff != old.br_startoff) { + xfs_iext_update_node(ifp, old.br_startoff, + new->br_startoff, 1, cur->leaf); + } + } + + trace_xfs_bmap_pre_update(ip, cur, state, _RET_IP_); + xfs_iext_set(cur_rec(cur), new); + trace_xfs_bmap_post_update(ip, cur, state, _RET_IP_); +} + +/* + * Return true if the cursor points at an extent and return the extent structure + * in gotp. Else return false. + */ +bool +xfs_iext_get_extent( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *gotp) +{ + if (!xfs_iext_valid(ifp, cur)) + return false; + xfs_iext_get(gotp, cur_rec(cur)); + return true; +} + +/* + * This is a recursive function, because of that we need to be extremely + * careful with stack usage. + */ +static void +xfs_iext_destroy_node( + struct xfs_iext_node *node, + int level) +{ + int i; + + if (level > 1) { + for (i = 0; i < KEYS_PER_NODE; i++) { + if (node->keys[i] == XFS_IEXT_KEY_INVALID) + break; + xfs_iext_destroy_node(node->ptrs[i], level - 1); + } + } + + kmem_free(node); +} + +void +xfs_iext_destroy( + struct xfs_ifork *ifp) +{ + xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height); + + ifp->if_bytes = 0; + ifp->if_height = 0; + ifp->if_u1.if_root = NULL; +} diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 378f8fbc91a7..6b7989038d75 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -24,6 +24,7 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_inode.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_cksum.h" #include "xfs_icache.h" diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 31840ca24018..c79a1616b79d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -43,20 +43,21 @@ STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); /* - * Move inode type and inode format specific information from the - * on-disk inode to the in-core inode. For fifos, devs, and sockets - * this means set if_rdev to the proper value. For files, directories, - * and symlinks this means to bring in the in-line data or extent - * pointers. For a file in B-tree format, only the root is immediately - * brought in-core. The rest will be in-lined in if_extents when it - * is first referenced (see xfs_iread_extents()). + * Copy inode type and data and attr format specific information from the + * on-disk inode to the in-core inode and fork structures. For fifos, devices, + * and sockets this means set i_rdev to the proper value. For files, + * directories, and symlinks this means to bring in the in-line data or extent + * pointers as well as the attribute fork. For a fork in B-tree format, only + * the root is immediately brought in-core. The rest will be read in later when + * first referenced (see xfs_iread_extents()). */ int xfs_iformat_fork( - xfs_inode_t *ip, - xfs_dinode_t *dip) + struct xfs_inode *ip, + struct xfs_dinode *dip) { - xfs_attr_shortform_t *atp; + struct inode *inode = VFS_I(ip); + struct xfs_attr_shortform *atp; int size; int error = 0; xfs_fsize_t di_size; @@ -95,8 +96,7 @@ xfs_iformat_fork( return -EFSCORRUPTED; } - if (unlikely(xfs_is_reflink_inode(ip) && - (VFS_I(ip)->i_mode & S_IFMT) != S_IFREG)) { + if (unlikely(xfs_is_reflink_inode(ip) && !S_ISREG(inode->i_mode))) { xfs_warn(ip->i_mount, "corrupt dinode %llu, wrong file type for reflink.", ip->i_ino); @@ -115,7 +115,7 @@ xfs_iformat_fork( return -EFSCORRUPTED; } - switch (VFS_I(ip)->i_mode & S_IFMT) { + switch (inode->i_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: case S_IFBLK: @@ -126,7 +126,7 @@ xfs_iformat_fork( return -EFSCORRUPTED; } ip->i_d.di_size = 0; - ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); + inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip)); break; case S_IFREG: @@ -184,8 +184,7 @@ xfs_iformat_fork( return error; /* Check inline dir contents. */ - if (S_ISDIR(VFS_I(ip)->i_mode) && - dip->di_format == XFS_DINODE_FMT_LOCAL) { + if (S_ISDIR(inode->i_mode) && dip->di_format == XFS_DINODE_FMT_LOCAL) { error = xfs_dir2_sf_verify(ip); if (error) { xfs_idestroy_fork(ip, XFS_DATA_FORK); @@ -265,19 +264,14 @@ xfs_init_local_fork( if (zero_terminate) mem_size++; - if (size == 0) - ifp->if_u1.if_data = NULL; - else if (mem_size <= sizeof(ifp->if_u2.if_inline_data)) - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - else { + if (size) { real_size = roundup(mem_size, 4); ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); - } - - if (size) { memcpy(ifp->if_u1.if_data, data, size); if (zero_terminate) ifp->if_u1.if_data[size] = '\0'; + } else { + ifp->if_u1.if_data = NULL; } ifp->if_bytes = size; @@ -288,13 +282,6 @@ xfs_init_local_fork( /* * The file is in-lined in the on-disk inode. - * If it fits into if_inline_data, then copy - * it there, otherwise allocate a buffer for it - * and copy the data there. Either way, set - * if_data to point at the data. - * If we allocate a buffer for the data, make - * sure that its size is a multiple of 4 and - * record the real size in i_real_bytes. */ STATIC int xfs_iformat_local( @@ -324,9 +311,7 @@ xfs_iformat_local( /* * The file consists of a set of extents all of which fit into the on-disk - * inode. If there are few enough extents to fit into the if_inline_ext, then - * copy them there. Otherwise allocate a buffer for them and copy them into it. - * Either way, set if_extents to point at the extents. + * inode. */ STATIC int xfs_iformat_extents( @@ -336,9 +321,12 @@ xfs_iformat_extents( { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int state = xfs_bmap_fork_to_state(whichfork); int nex = XFS_DFORK_NEXTENTS(dip, whichfork); int size = nex * sizeof(xfs_bmbt_rec_t); + struct xfs_iext_cursor icur; struct xfs_bmbt_rec *dp; + struct xfs_bmbt_irec new; int i; /* @@ -354,27 +342,25 @@ xfs_iformat_extents( } ifp->if_real_bytes = 0; - if (nex == 0) - ifp->if_u1.if_extents = NULL; - else if (nex <= XFS_INLINE_EXTS) - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - else - xfs_iext_add(ifp, 0, nex); - - ifp->if_bytes = size; + ifp->if_bytes = 0; + ifp->if_u1.if_root = NULL; + ifp->if_height = 0; if (size) { dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); + + xfs_iext_first(ifp, &icur); for (i = 0; i < nex; i++, dp++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - ep->l0 = get_unaligned_be64(&dp->l0); - ep->l1 = get_unaligned_be64(&dp->l1); - if (!xfs_bmbt_validate_extent(mp, whichfork, ep)) { + xfs_bmbt_disk_get_all(dp, &new); + if (!xfs_bmbt_validate_extent(mp, whichfork, &new)) { XFS_ERROR_REPORT("xfs_iformat_extents(2)", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } + + xfs_iext_insert(ip, &icur, &new, state); + trace_xfs_read_extent(ip, &icur, state, _THIS_IP_); + xfs_iext_next(ifp, &icur); } - XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); } ifp->if_flags |= XFS_IFEXTENTS; return 0; @@ -440,47 +426,14 @@ xfs_iformat_btree( ifp->if_flags &= ~XFS_IFEXTENTS; ifp->if_flags |= XFS_IFBROOT; + ifp->if_real_bytes = 0; + ifp->if_bytes = 0; + ifp->if_u1.if_root = NULL; + ifp->if_height = 0; return 0; } /* - * Read in extents from a btree-format inode. - * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. - */ -int -xfs_iread_extents( - xfs_trans_t *tp, - xfs_inode_t *ip, - int whichfork) -{ - int error; - xfs_ifork_t *ifp; - xfs_extnum_t nextents; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { - XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, - ip->i_mount); - return -EFSCORRUPTED; - } - nextents = XFS_IFORK_NEXTENTS(ip, whichfork); - ifp = XFS_IFORK_PTR(ip, whichfork); - - /* - * We know that the size is valid (it's checked in iformat_btree) - */ - ifp->if_bytes = ifp->if_real_bytes = 0; - xfs_iext_add(ifp, 0, nextents); - error = xfs_bmap_read_extents(tp, ip, whichfork); - if (error) { - xfs_iext_destroy(ifp); - return error; - } - ifp->if_flags |= XFS_IFEXTENTS; - return 0; -} -/* * Reallocate the space for if_broot based on the number of records * being added or deleted as indicated in rec_diff. Move the records * and pointers in if_broot to fit the new size. When shrinking this @@ -644,26 +597,9 @@ xfs_idata_realloc( ASSERT(new_size >= 0); if (new_size == 0) { - if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - kmem_free(ifp->if_u1.if_data); - } + kmem_free(ifp->if_u1.if_data); ifp->if_u1.if_data = NULL; real_size = 0; - } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { - /* - * If the valid extents/data can fit in if_inline_ext/data, - * copy them from the malloc'd vector and free it. - */ - if (ifp->if_u1.if_data == NULL) { - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - ASSERT(ifp->if_real_bytes != 0); - memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, - new_size); - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - } - real_size = 0; } else { /* * Stuck with malloc/realloc. @@ -677,7 +613,7 @@ xfs_idata_realloc( ASSERT(ifp->if_real_bytes == 0); ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); - } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + } else { /* * Only do the realloc if the underlying size * is really changing. @@ -688,12 +624,6 @@ xfs_idata_realloc( real_size, KM_SLEEP | KM_NOFS); } - } else { - ASSERT(ifp->if_real_bytes == 0); - ifp->if_u1.if_data = kmem_alloc(real_size, - KM_SLEEP | KM_NOFS); - memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, - ifp->if_bytes); } } ifp->if_real_bytes = real_size; @@ -721,23 +651,18 @@ xfs_idestroy_fork( * so check and free it up if we do. */ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && - (ifp->if_u1.if_data != NULL)) { + if (ifp->if_u1.if_data != NULL) { ASSERT(ifp->if_real_bytes != 0); kmem_free(ifp->if_u1.if_data); ifp->if_u1.if_data = NULL; ifp->if_real_bytes = 0; } - } else if ((ifp->if_flags & XFS_IFEXTENTS) && - ((ifp->if_flags & XFS_IFEXTIREC) || - ((ifp->if_u1.if_extents != NULL) && - (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { - ASSERT(ifp->if_real_bytes != 0); + } else if ((ifp->if_flags & XFS_IFEXTENTS) && ifp->if_height) { xfs_iext_destroy(ifp); } - ASSERT(ifp->if_u1.if_extents == NULL || - ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); + ASSERT(ifp->if_real_bytes == 0); + if (whichfork == XFS_ATTR_FORK) { kmem_zone_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; @@ -747,19 +672,9 @@ xfs_idestroy_fork( } } -/* Count number of incore extents based on if_bytes */ -xfs_extnum_t -xfs_iext_count(struct xfs_ifork *ifp) -{ - return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); -} - /* * Convert in-core extents to on-disk form * - * For either the data or attr fork in extent format, we need to endian convert - * the in-core extent as we place them into the on-disk inode. - * * In the case of the data fork, the in-core and on-disk fork sizes can be * different due to delayed allocation extents. We only copy on-disk extents * here, so callers must always use the physical fork size to determine the @@ -768,53 +683,32 @@ xfs_iext_count(struct xfs_ifork *ifp) */ int xfs_iextents_copy( - xfs_inode_t *ip, - xfs_bmbt_rec_t *dp, + struct xfs_inode *ip, + struct xfs_bmbt_rec *dp, int whichfork) { - int copied; - int i; - xfs_ifork_t *ifp; - int nrecs; - xfs_fsblock_t start_block; + int state = xfs_bmap_fork_to_state(whichfork); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec rec; + int copied = 0; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); ASSERT(ifp->if_bytes > 0); - nrecs = xfs_iext_count(ifp); - XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); - ASSERT(nrecs > 0); - - /* - * There are some delayed allocation extents in the - * inode, so copy the extents one at a time and skip - * the delayed ones. There must be at least one - * non-delayed extent. - */ - copied = 0; - for (i = 0; i < nrecs; i++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - - ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, ep)); - - start_block = xfs_bmbt_get_startblock(ep); - if (isnullstartblock(start_block)) { - /* - * It's a delayed allocation extent, so skip it. - */ + for_each_xfs_iext(ifp, &icur, &rec) { + if (isnullstartblock(rec.br_startblock)) continue; - } - - /* Translate to on disk format */ - put_unaligned_be64(ep->l0, &dp->l0); - put_unaligned_be64(ep->l1, &dp->l1); + ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, &rec)); + xfs_bmbt_disk_set_all(dp, &rec); + trace_xfs_write_extent(ip, &icur, state, _RET_IP_); + copied += sizeof(struct xfs_bmbt_rec); dp++; - copied++; } - ASSERT(copied != 0); - return (copied * (uint)sizeof(xfs_bmbt_rec_t)); + ASSERT(copied > 0); + ASSERT(copied <= ifp->if_bytes); + return copied; } /* @@ -872,7 +766,6 @@ xfs_iflush_fork( !(iip->ili_fields & extflag[whichfork])); if ((iip->ili_fields & extflag[whichfork]) && (ifp->if_bytes > 0)) { - ASSERT(xfs_iext_get_ext(ifp, 0)); ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, whichfork); @@ -894,16 +787,8 @@ xfs_iflush_fork( case XFS_DINODE_FMT_DEV: if (iip->ili_fields & XFS_ILOG_DEV) { ASSERT(whichfork == XFS_DATA_FORK); - xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); - } - break; - - case XFS_DINODE_FMT_UUID: - if (iip->ili_fields & XFS_ILOG_UUID) { - ASSERT(whichfork == XFS_DATA_FORK); - memcpy(XFS_DFORK_DPTR(dip), - &ip->i_df.if_u2.if_uuid, - sizeof(uuid_t)); + xfs_dinode_put_rdev(dip, + linux_to_xfs_dev_t(VFS_I(ip)->i_rdev)); } break; @@ -913,33 +798,6 @@ xfs_iflush_fork( } } -/* - * Return a pointer to the extent record at file index idx. - */ -xfs_bmbt_rec_host_t * -xfs_iext_get_ext( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx) /* index of target extent */ -{ - ASSERT(idx >= 0); - ASSERT(idx < xfs_iext_count(ifp)); - - if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { - return ifp->if_u1.if_ext_irec->er_extbuf; - } else if (ifp->if_flags & XFS_IFEXTIREC) { - xfs_ext_irec_t *erp; /* irec pointer */ - int erp_idx = 0; /* irec index */ - xfs_extnum_t page_idx = idx; /* ext index in target list */ - - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); - return &erp->er_extbuf[page_idx]; - } else if (ifp->if_bytes) { - return &ifp->if_u1.if_extents[idx]; - } else { - return NULL; - } -} - /* Convert bmap state flags to an inode fork. */ struct xfs_ifork * xfs_iext_state_to_fork( @@ -954,1011 +812,6 @@ xfs_iext_state_to_fork( } /* - * Insert new item(s) into the extent records for incore inode - * fork 'ifp'. 'count' new items are inserted at index 'idx'. - */ -void -xfs_iext_insert( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* starting index of new items */ - xfs_extnum_t count, /* number of inserted items */ - xfs_bmbt_irec_t *new, /* items to insert */ - int state) /* type of extent conversion */ -{ - xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state); - xfs_extnum_t i; /* extent record index */ - - trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); - - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - xfs_iext_add(ifp, idx, count); - for (i = idx; i < idx + count; i++, new++) - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); -} - -/* - * This is called when the amount of space required for incore file - * extents needs to be increased. The ext_diff parameter stores the - * number of new extents being added and the idx parameter contains - * the extent index where the new extents will be added. If the new - * extents are being appended, then we just need to (re)allocate and - * initialize the space. Otherwise, if the new extents are being - * inserted into the middle of the existing entries, a bit more work - * is required to make room for the new extents to be inserted. The - * caller is responsible for filling in the new extent entries upon - * return. - */ -void -xfs_iext_add( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin adding exts */ - int ext_diff) /* number of extents to add */ -{ - int byte_diff; /* new bytes being added */ - int new_size; /* size of extents after adding */ - xfs_extnum_t nextents; /* number of extents in file */ - - nextents = xfs_iext_count(ifp); - ASSERT((idx >= 0) && (idx <= nextents)); - byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); - new_size = ifp->if_bytes + byte_diff; - /* - * If the new number of extents (nextents + ext_diff) - * fits inside the inode, then continue to use the inline - * extent buffer. - */ - if (nextents + ext_diff <= XFS_INLINE_EXTS) { - if (idx < nextents) { - memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], - &ifp->if_u2.if_inline_ext[idx], - (nextents - idx) * sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); - } - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - ifp->if_real_bytes = 0; - } - /* - * Otherwise use a linear (direct) extent list. - * If the extents are currently inside the inode, - * xfs_iext_realloc_direct will switch us from - * inline to direct extent allocation mode. - */ - else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { - xfs_iext_realloc_direct(ifp, new_size); - if (idx < nextents) { - memmove(&ifp->if_u1.if_extents[idx + ext_diff], - &ifp->if_u1.if_extents[idx], - (nextents - idx) * sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); - } - } - /* Indirection array */ - else { - xfs_ext_irec_t *erp; - int erp_idx = 0; - int page_idx = idx; - - ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); - if (ifp->if_flags & XFS_IFEXTIREC) { - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); - } else { - xfs_iext_irec_init(ifp); - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = ifp->if_u1.if_ext_irec; - } - /* Extents fit in target extent page */ - if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { - if (page_idx < erp->er_extcount) { - memmove(&erp->er_extbuf[page_idx + ext_diff], - &erp->er_extbuf[page_idx], - (erp->er_extcount - page_idx) * - sizeof(xfs_bmbt_rec_t)); - memset(&erp->er_extbuf[page_idx], 0, byte_diff); - } - erp->er_extcount += ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - } - /* Insert a new extent page */ - else if (erp) { - xfs_iext_add_indirect_multi(ifp, - erp_idx, page_idx, ext_diff); - } - /* - * If extent(s) are being appended to the last page in - * the indirection array and the new extent(s) don't fit - * in the page, then erp is NULL and erp_idx is set to - * the next index needed in the indirection array. - */ - else { - uint count = ext_diff; - - while (count) { - erp = xfs_iext_irec_new(ifp, erp_idx); - erp->er_extcount = min(count, XFS_LINEAR_EXTS); - count -= erp->er_extcount; - if (count) - erp_idx++; - } - } - } - ifp->if_bytes = new_size; -} - -/* - * This is called when incore extents are being added to the indirection - * array and the new extents do not fit in the target extent list. The - * erp_idx parameter contains the irec index for the target extent list - * in the indirection array, and the idx parameter contains the extent - * index within the list. The number of extents being added is stored - * in the count parameter. - * - * |-------| |-------| - * | | | | idx - number of extents before idx - * | idx | | count | - * | | | | count - number of extents being inserted at idx - * |-------| |-------| - * | count | | nex2 | nex2 - number of extents after idx + count - * |-------| |-------| - */ -void -xfs_iext_add_indirect_multi( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx, /* target extent irec index */ - xfs_extnum_t idx, /* index within target list */ - int count) /* new extents being added */ -{ - int byte_diff; /* new bytes being added */ - xfs_ext_irec_t *erp; /* pointer to irec entry */ - xfs_extnum_t ext_diff; /* number of extents to add */ - xfs_extnum_t ext_cnt; /* new extents still needed */ - xfs_extnum_t nex2; /* extents after idx + count */ - xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ - int nlists; /* number of irec's (lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nex2 = erp->er_extcount - idx; - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* - * Save second part of target extent list - * (all extents past */ - if (nex2) { - byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); - nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); - memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); - erp->er_extcount -= nex2; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); - memset(&erp->er_extbuf[idx], 0, byte_diff); - } - - /* - * Add the new extents to the end of the target - * list, then allocate new irec record(s) and - * extent buffer(s) as needed to store the rest - * of the new extents. - */ - ext_cnt = count; - ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); - if (ext_diff) { - erp->er_extcount += ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - ext_cnt -= ext_diff; - } - while (ext_cnt) { - erp_idx++; - erp = xfs_iext_irec_new(ifp, erp_idx); - ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); - erp->er_extcount = ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - ext_cnt -= ext_diff; - } - - /* Add nex2 extents back to indirection array */ - if (nex2) { - xfs_extnum_t ext_avail; - int i; - - byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); - ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; - i = 0; - /* - * If nex2 extents fit in the current page, append - * nex2_ep after the new extents. - */ - if (nex2 <= ext_avail) { - i = erp->er_extcount; - } - /* - * Otherwise, check if space is available in the - * next page. - */ - else if ((erp_idx < nlists - 1) && - (nex2 <= (ext_avail = XFS_LINEAR_EXTS - - ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { - erp_idx++; - erp++; - /* Create a hole for nex2 extents */ - memmove(&erp->er_extbuf[nex2], erp->er_extbuf, - erp->er_extcount * sizeof(xfs_bmbt_rec_t)); - } - /* - * Final choice, create a new extent page for - * nex2 extents. - */ - else { - erp_idx++; - erp = xfs_iext_irec_new(ifp, erp_idx); - } - memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); - kmem_free(nex2_ep); - erp->er_extcount += nex2; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); - } -} - -/* - * This is called when the amount of space required for incore file - * extents needs to be decreased. The ext_diff parameter stores the - * number of extents to be removed and the idx parameter contains - * the extent index where the extents will be removed from. - * - * If the amount of space needed has decreased below the linear - * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous - * extent array. Otherwise, use kmem_realloc() to adjust the - * size to what is needed. - */ -void -xfs_iext_remove( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff, /* number of extents to remove */ - int state) /* type of extent conversion */ -{ - xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state); - xfs_extnum_t nextents; /* number of extents in file */ - int new_size; /* size of extents after removal */ - - trace_xfs_iext_remove(ip, idx, state, _RET_IP_); - - ASSERT(ext_diff > 0); - nextents = xfs_iext_count(ifp); - new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); - - if (new_size == 0) { - xfs_iext_destroy(ifp); - } else if (ifp->if_flags & XFS_IFEXTIREC) { - xfs_iext_remove_indirect(ifp, idx, ext_diff); - } else if (ifp->if_real_bytes) { - xfs_iext_remove_direct(ifp, idx, ext_diff); - } else { - xfs_iext_remove_inline(ifp, idx, ext_diff); - } - ifp->if_bytes = new_size; -} - -/* - * This removes ext_diff extents from the inline buffer, beginning - * at extent index idx. - */ -void -xfs_iext_remove_inline( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff) /* number of extents to remove */ -{ - int nextents; /* number of extents in file */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - ASSERT(idx < XFS_INLINE_EXTS); - nextents = xfs_iext_count(ifp); - ASSERT(((nextents - ext_diff) > 0) && - (nextents - ext_diff) < XFS_INLINE_EXTS); - - if (idx + ext_diff < nextents) { - memmove(&ifp->if_u2.if_inline_ext[idx], - &ifp->if_u2.if_inline_ext[idx + ext_diff], - (nextents - (idx + ext_diff)) * - sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], - 0, ext_diff * sizeof(xfs_bmbt_rec_t)); - } else { - memset(&ifp->if_u2.if_inline_ext[idx], 0, - ext_diff * sizeof(xfs_bmbt_rec_t)); - } -} - -/* - * This removes ext_diff extents from a linear (direct) extent list, - * beginning at extent index idx. If the extents are being removed - * from the end of the list (ie. truncate) then we just need to re- - * allocate the list to remove the extra space. Otherwise, if the - * extents are being removed from the middle of the existing extent - * entries, then we first need to move the extent records beginning - * at idx + ext_diff up in the list to overwrite the records being - * removed, then remove the extra space via kmem_realloc. - */ -void -xfs_iext_remove_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff) /* number of extents to remove */ -{ - xfs_extnum_t nextents; /* number of extents in file */ - int new_size; /* size of extents after removal */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - new_size = ifp->if_bytes - - (ext_diff * sizeof(xfs_bmbt_rec_t)); - nextents = xfs_iext_count(ifp); - - if (new_size == 0) { - xfs_iext_destroy(ifp); - return; - } - /* Move extents up in the list (if needed) */ - if (idx + ext_diff < nextents) { - memmove(&ifp->if_u1.if_extents[idx], - &ifp->if_u1.if_extents[idx + ext_diff], - (nextents - (idx + ext_diff)) * - sizeof(xfs_bmbt_rec_t)); - } - memset(&ifp->if_u1.if_extents[nextents - ext_diff], - 0, ext_diff * sizeof(xfs_bmbt_rec_t)); - /* - * Reallocate the direct extent list. If the extents - * will fit inside the inode then xfs_iext_realloc_direct - * will switch from direct to inline extent allocation - * mode for us. - */ - xfs_iext_realloc_direct(ifp, new_size); - ifp->if_bytes = new_size; -} - -/* - * This is called when incore extents are being removed from the - * indirection array and the extents being removed span multiple extent - * buffers. The idx parameter contains the file extent index where we - * want to begin removing extents, and the count parameter contains - * how many extents need to be removed. - * - * |-------| |-------| - * | nex1 | | | nex1 - number of extents before idx - * |-------| | count | - * | | | | count - number of extents being removed at idx - * | count | |-------| - * | | | nex2 | nex2 - number of extents after idx + count - * |-------| |-------| - */ -void -xfs_iext_remove_indirect( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing extents */ - int count) /* number of extents to remove */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int erp_idx = 0; /* indirection array index */ - xfs_extnum_t ext_cnt; /* extents left to remove */ - xfs_extnum_t ext_diff; /* extents to remove in current list */ - xfs_extnum_t nex1; /* number of extents before idx */ - xfs_extnum_t nex2; /* extents after idx + count */ - int page_idx = idx; /* index in target extent list */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); - ASSERT(erp != NULL); - nex1 = page_idx; - ext_cnt = count; - while (ext_cnt) { - nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); - ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); - /* - * Check for deletion of entire list; - * xfs_iext_irec_remove() updates extent offsets. - */ - if (ext_diff == erp->er_extcount) { - xfs_iext_irec_remove(ifp, erp_idx); - ext_cnt -= ext_diff; - nex1 = 0; - if (ext_cnt) { - ASSERT(erp_idx < ifp->if_real_bytes / - XFS_IEXT_BUFSZ); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nex1 = 0; - continue; - } else { - break; - } - } - /* Move extents up (if needed) */ - if (nex2) { - memmove(&erp->er_extbuf[nex1], - &erp->er_extbuf[nex1 + ext_diff], - nex2 * sizeof(xfs_bmbt_rec_t)); - } - /* Zero out rest of page */ - memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - - ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); - /* Update remaining counters */ - erp->er_extcount -= ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); - ext_cnt -= ext_diff; - nex1 = 0; - erp_idx++; - erp++; - } - ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); - xfs_iext_irec_compact(ifp); -} - -/* - * Create, destroy, or resize a linear (direct) block of extents. - */ -void -xfs_iext_realloc_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* new size of extents after adding */ -{ - int rnew_size; /* real new size of extents */ - - rnew_size = new_size; - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || - ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && - (new_size != ifp->if_real_bytes))); - - /* Free extent records */ - if (new_size == 0) { - xfs_iext_destroy(ifp); - } - /* Resize direct extent list and zero any new bytes */ - else if (ifp->if_real_bytes) { - /* Check if extents will fit inside the inode */ - if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { - xfs_iext_direct_to_inline(ifp, new_size / - (uint)sizeof(xfs_bmbt_rec_t)); - ifp->if_bytes = new_size; - return; - } - if (!is_power_of_2(new_size)){ - rnew_size = roundup_pow_of_two(new_size); - } - if (rnew_size != ifp->if_real_bytes) { - ifp->if_u1.if_extents = - kmem_realloc(ifp->if_u1.if_extents, - rnew_size, KM_NOFS); - } - if (rnew_size > ifp->if_real_bytes) { - memset(&ifp->if_u1.if_extents[ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t)], 0, - rnew_size - ifp->if_real_bytes); - } - } - /* Switch from the inline extent buffer to a direct extent list */ - else { - if (!is_power_of_2(new_size)) { - rnew_size = roundup_pow_of_two(new_size); - } - xfs_iext_inline_to_direct(ifp, rnew_size); - } - ifp->if_real_bytes = rnew_size; - ifp->if_bytes = new_size; -} - -/* - * Switch from linear (direct) extent records to inline buffer. - */ -void -xfs_iext_direct_to_inline( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t nextents) /* number of extents in file */ -{ - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(nextents <= XFS_INLINE_EXTS); - /* - * The inline buffer was zeroed when we switched - * from inline to direct extent allocation mode, - * so we don't need to clear it here. - */ - memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, - nextents * sizeof(xfs_bmbt_rec_t)); - kmem_free(ifp->if_u1.if_extents); - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - ifp->if_real_bytes = 0; -} - -/* - * Switch from inline buffer to linear (direct) extent records. - * new_size should already be rounded up to the next power of 2 - * by the caller (when appropriate), so use new_size as it is. - * However, since new_size may be rounded up, we can't update - * if_bytes here. It is the caller's responsibility to update - * if_bytes upon return. - */ -void -xfs_iext_inline_to_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* number of extents in file */ -{ - ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); - memset(ifp->if_u1.if_extents, 0, new_size); - if (ifp->if_bytes) { - memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, - ifp->if_bytes); - memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * - sizeof(xfs_bmbt_rec_t)); - } - ifp->if_real_bytes = new_size; -} - -/* - * Resize an extent indirection array to new_size bytes. - */ -STATIC void -xfs_iext_realloc_indirect( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* new indirection array size */ -{ - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - ASSERT(ifp->if_real_bytes); - ASSERT((new_size >= 0) && - (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) * - sizeof(xfs_ext_irec_t)))); - if (new_size == 0) { - xfs_iext_destroy(ifp); - } else { - ifp->if_u1.if_ext_irec = - kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS); - } -} - -/* - * Switch from indirection array to linear (direct) extent allocations. - */ -STATIC void -xfs_iext_indirect_to_direct( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - xfs_extnum_t nextents; /* number of extents in file */ - int size; /* size of file extents */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nextents = xfs_iext_count(ifp); - ASSERT(nextents <= XFS_LINEAR_EXTS); - size = nextents * sizeof(xfs_bmbt_rec_t); - - xfs_iext_irec_compact_pages(ifp); - ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); - - ep = ifp->if_u1.if_ext_irec->er_extbuf; - kmem_free(ifp->if_u1.if_ext_irec); - ifp->if_flags &= ~XFS_IFEXTIREC; - ifp->if_u1.if_extents = ep; - ifp->if_bytes = size; - if (nextents < XFS_LINEAR_EXTS) { - xfs_iext_realloc_direct(ifp, size); - } -} - -/* - * Remove all records from the indirection array. - */ -STATIC void -xfs_iext_irec_remove_all( - struct xfs_ifork *ifp) -{ - int nlists; - int i; - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (i = 0; i < nlists; i++) - kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf); - kmem_free(ifp->if_u1.if_ext_irec); - ifp->if_flags &= ~XFS_IFEXTIREC; -} - -/* - * Free incore file extents. - */ -void -xfs_iext_destroy( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - if (ifp->if_flags & XFS_IFEXTIREC) { - xfs_iext_irec_remove_all(ifp); - } else if (ifp->if_real_bytes) { - kmem_free(ifp->if_u1.if_extents); - } else if (ifp->if_bytes) { - memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * - sizeof(xfs_bmbt_rec_t)); - } - ifp->if_u1.if_extents = NULL; - ifp->if_real_bytes = 0; - ifp->if_bytes = 0; -} - -/* - * Return a pointer to the extent record for file system block bno. - */ -xfs_bmbt_rec_host_t * /* pointer to found extent record */ -xfs_iext_bno_to_ext( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number to search for */ - xfs_extnum_t *idxp) /* index of target extent */ -{ - xfs_bmbt_rec_host_t *base; /* pointer to first extent */ - xfs_filblks_t blockcount = 0; /* number of blocks in extent */ - xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ - xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ - int high; /* upper boundary in search */ - xfs_extnum_t idx = 0; /* index of target extent */ - int low; /* lower boundary in search */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_fileoff_t startoff = 0; /* start offset of extent */ - - nextents = xfs_iext_count(ifp); - if (nextents == 0) { - *idxp = 0; - return NULL; - } - low = 0; - if (ifp->if_flags & XFS_IFEXTIREC) { - /* Find target extent list */ - int erp_idx = 0; - erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); - base = erp->er_extbuf; - high = erp->er_extcount - 1; - } else { - base = ifp->if_u1.if_extents; - high = nextents - 1; - } - /* Binary search extent records */ - while (low <= high) { - idx = (low + high) >> 1; - ep = base + idx; - startoff = xfs_bmbt_get_startoff(ep); - blockcount = xfs_bmbt_get_blockcount(ep); - if (bno < startoff) { - high = idx - 1; - } else if (bno >= startoff + blockcount) { - low = idx + 1; - } else { - /* Convert back to file-based extent index */ - if (ifp->if_flags & XFS_IFEXTIREC) { - idx += erp->er_extoff; - } - *idxp = idx; - return ep; - } - } - /* Convert back to file-based extent index */ - if (ifp->if_flags & XFS_IFEXTIREC) { - idx += erp->er_extoff; - } - if (bno >= startoff + blockcount) { - if (++idx == nextents) { - ep = NULL; - } else { - ep = xfs_iext_get_ext(ifp, idx); - } - } - *idxp = idx; - return ep; -} - -/* - * Return a pointer to the indirection array entry containing the - * extent record for filesystem block bno. Store the index of the - * target irec in *erp_idxp. - */ -xfs_ext_irec_t * /* pointer to found extent record */ -xfs_iext_bno_to_irec( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number to search for */ - int *erp_idxp) /* irec index of target ext list */ -{ - xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ - xfs_ext_irec_t *erp_next; /* next indirection array entry */ - int erp_idx; /* indirection array index */ - int nlists; /* number of extent irec's (lists) */ - int high; /* binary search upper limit */ - int low; /* binary search lower limit */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp_idx = 0; - low = 0; - high = nlists - 1; - while (low <= high) { - erp_idx = (low + high) >> 1; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL; - if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { - high = erp_idx - 1; - } else if (erp_next && bno >= - xfs_bmbt_get_startoff(erp_next->er_extbuf)) { - low = erp_idx + 1; - } else { - break; - } - } - *erp_idxp = erp_idx; - return erp; -} - -/* - * Return a pointer to the indirection array entry containing the - * extent record at file extent index *idxp. Store the index of the - * target irec in *erp_idxp and store the page index of the target - * extent record in *idxp. - */ -xfs_ext_irec_t * -xfs_iext_idx_to_irec( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t *idxp, /* extent index (file -> page) */ - int *erp_idxp, /* pointer to target irec */ - int realloc) /* new bytes were just added */ -{ - xfs_ext_irec_t *prev; /* pointer to previous irec */ - xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ - int erp_idx; /* indirection array index */ - int nlists; /* number of irec's (ex lists) */ - int high; /* binary search upper limit */ - int low; /* binary search lower limit */ - xfs_extnum_t page_idx = *idxp; /* extent index in target list */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - ASSERT(page_idx >= 0); - ASSERT(page_idx <= xfs_iext_count(ifp)); - ASSERT(page_idx < xfs_iext_count(ifp) || realloc); - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp_idx = 0; - low = 0; - high = nlists - 1; - - /* Binary search extent irec's */ - while (low <= high) { - erp_idx = (low + high) >> 1; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - prev = erp_idx > 0 ? erp - 1 : NULL; - if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && - realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { - high = erp_idx - 1; - } else if (page_idx > erp->er_extoff + erp->er_extcount || - (page_idx == erp->er_extoff + erp->er_extcount && - !realloc)) { - low = erp_idx + 1; - } else if (page_idx == erp->er_extoff + erp->er_extcount && - erp->er_extcount == XFS_LINEAR_EXTS) { - ASSERT(realloc); - page_idx = 0; - erp_idx++; - erp = erp_idx < nlists ? erp + 1 : NULL; - break; - } else { - page_idx -= erp->er_extoff; - break; - } - } - *idxp = page_idx; - *erp_idxp = erp_idx; - return erp; -} - -/* - * Allocate and initialize an indirection array once the space needed - * for incore extents increases above XFS_IEXT_BUFSZ. - */ -void -xfs_iext_irec_init( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - xfs_extnum_t nextents; /* number of extents in file */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - nextents = xfs_iext_count(ifp); - ASSERT(nextents <= XFS_LINEAR_EXTS); - - erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); - - if (nextents == 0) { - ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); - } else if (!ifp->if_real_bytes) { - xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); - } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { - xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); - } - erp->er_extbuf = ifp->if_u1.if_extents; - erp->er_extcount = nextents; - erp->er_extoff = 0; - - ifp->if_flags |= XFS_IFEXTIREC; - ifp->if_real_bytes = XFS_IEXT_BUFSZ; - ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); - ifp->if_u1.if_ext_irec = erp; - - return; -} - -/* - * Allocate and initialize a new entry in the indirection array. - */ -xfs_ext_irec_t * -xfs_iext_irec_new( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx) /* index for new irec */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* Resize indirection array */ - xfs_iext_realloc_indirect(ifp, ++nlists * - sizeof(xfs_ext_irec_t)); - /* - * Move records down in the array so the - * new page can use erp_idx. - */ - erp = ifp->if_u1.if_ext_irec; - for (i = nlists - 1; i > erp_idx; i--) { - memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); - } - ASSERT(i == erp_idx); - - /* Initialize new extent record */ - erp = ifp->if_u1.if_ext_irec; - erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); - ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; - memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); - erp[erp_idx].er_extcount = 0; - erp[erp_idx].er_extoff = erp_idx > 0 ? - erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; - return (&erp[erp_idx]); -} - -/* - * Remove a record from the indirection array. - */ -void -xfs_iext_irec_remove( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx) /* irec index to remove */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - if (erp->er_extbuf) { - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, - -erp->er_extcount); - kmem_free(erp->er_extbuf); - } - /* Compact extent records */ - erp = ifp->if_u1.if_ext_irec; - for (i = erp_idx; i < nlists - 1; i++) { - memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); - } - /* - * Manually free the last extent record from the indirection - * array. A call to xfs_iext_realloc_indirect() with a size - * of zero would result in a call to xfs_iext_destroy() which - * would in turn call this function again, creating a nasty - * infinite loop. - */ - if (--nlists) { - xfs_iext_realloc_indirect(ifp, - nlists * sizeof(xfs_ext_irec_t)); - } else { - kmem_free(ifp->if_u1.if_ext_irec); - } - ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; -} - -/* - * This is called to clean up large amounts of unused memory allocated - * by the indirection array. Before compacting anything though, verify - * that the indirection array is still needed and switch back to the - * linear extent list (or even the inline buffer) if possible. The - * compaction policy is as follows: - * - * Full Compaction: Extents fit into a single page (or inline buffer) - * Partial Compaction: Extents occupy less than 50% of allocated space - * No Compaction: Extents occupy at least 50% of allocated space - */ -void -xfs_iext_irec_compact( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_extnum_t nextents; /* number of extents in file */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - nextents = xfs_iext_count(ifp); - - if (nextents == 0) { - xfs_iext_destroy(ifp); - } else if (nextents <= XFS_INLINE_EXTS) { - xfs_iext_indirect_to_direct(ifp); - xfs_iext_direct_to_inline(ifp, nextents); - } else if (nextents <= XFS_LINEAR_EXTS) { - xfs_iext_indirect_to_direct(ifp); - } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { - xfs_iext_irec_compact_pages(ifp); - } -} - -/* - * Combine extents from neighboring extent pages. - */ -void -xfs_iext_irec_compact_pages( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ - int erp_idx = 0; /* indirection array index */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - while (erp_idx < nlists - 1) { - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - erp_next = erp + 1; - if (erp_next->er_extcount <= - (XFS_LINEAR_EXTS - erp->er_extcount)) { - memcpy(&erp->er_extbuf[erp->er_extcount], - erp_next->er_extbuf, erp_next->er_extcount * - sizeof(xfs_bmbt_rec_t)); - erp->er_extcount += erp_next->er_extcount; - /* - * Free page before removing extent record - * so er_extoffs don't get modified in - * xfs_iext_irec_remove. - */ - kmem_free(erp_next->er_extbuf); - erp_next->er_extbuf = NULL; - xfs_iext_irec_remove(ifp, erp_idx + 1); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - } else { - erp_idx++; - } - } -} - -/* - * This is called to update the er_extoff field in the indirection - * array when extents have been added or removed from one of the - * extent lists. erp_idx contains the irec index to begin updating - * at and ext_diff contains the number of extents that were added - * or removed. - */ -void -xfs_iext_irec_update_extoffs( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx, /* irec index to update */ - int ext_diff) /* number of new extents */ -{ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (i = erp_idx; i < nlists; i++) { - ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; - } -} - -/* * Initialize an inode's copy-on-write fork. */ void @@ -1974,61 +827,3 @@ xfs_ifork_init_cow( ip->i_cformat = XFS_DINODE_FMT_EXTENTS; ip->i_cnextents = 0; } - -/* - * Lookup the extent covering bno. - * - * If there is an extent covering bno return the extent index, and store the - * expanded extent structure in *gotp, and the extent index in *idx. - * If there is no extent covering bno, but there is an extent after it (e.g. - * it lies in a hole) return that extent in *gotp and its index in *idx - * instead. - * If bno is beyond the last extent return false, and return the index after - * the last valid index in *idxp. - */ -bool -xfs_iext_lookup_extent( - struct xfs_inode *ip, - struct xfs_ifork *ifp, - xfs_fileoff_t bno, - xfs_extnum_t *idxp, - struct xfs_bmbt_irec *gotp) -{ - struct xfs_bmbt_rec_host *ep; - - XFS_STATS_INC(ip->i_mount, xs_look_exlist); - - ep = xfs_iext_bno_to_ext(ifp, bno, idxp); - if (!ep) - return false; - xfs_bmbt_get_all(ep, gotp); - return true; -} - -/* - * Return true if there is an extent at index idx, and return the expanded - * extent structure at idx in that case. Else return false. - */ -bool -xfs_iext_get_extent( - struct xfs_ifork *ifp, - xfs_extnum_t idx, - struct xfs_bmbt_irec *gotp) -{ - if (idx < 0 || idx >= xfs_iext_count(ifp)) - return false; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp); - return true; -} - -void -xfs_iext_update_extent( - struct xfs_ifork *ifp, - xfs_extnum_t idx, - struct xfs_bmbt_irec *gotp) -{ - ASSERT(idx >= 0); - ASSERT(idx < xfs_iext_count(ifp)); - - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, idx), gotp); -} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 11af705219f6..b9f0098e33b8 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -22,56 +22,19 @@ struct xfs_inode_log_item; struct xfs_dinode; /* - * The following xfs_ext_irec_t struct introduces a second (top) level - * to the in-core extent allocation scheme. These structs are allocated - * in a contiguous block, creating an indirection array where each entry - * (irec) contains a pointer to a buffer of in-core extent records which - * it manages. Each extent buffer is 4k in size, since 4k is the system - * page size on Linux i386 and systems with larger page sizes don't seem - * to gain much, if anything, by using their native page size as the - * extent buffer size. Also, using 4k extent buffers everywhere provides - * a consistent interface for CXFS across different platforms. - * - * There is currently no limit on the number of irec's (extent lists) - * allowed, so heavily fragmented files may require an indirection array - * which spans multiple system pages of memory. The number of extents - * which would require this amount of contiguous memory is very large - * and should not cause problems in the foreseeable future. However, - * if the memory needed for the contiguous array ever becomes a problem, - * it is possible that a third level of indirection may be required. - */ -typedef struct xfs_ext_irec { - xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */ - xfs_extnum_t er_extoff; /* extent offset in file */ - xfs_extnum_t er_extcount; /* number of extents in page/block */ -} xfs_ext_irec_t; - -/* * File incore extent information, present for each of data & attr forks. */ -#define XFS_IEXT_BUFSZ 4096 -#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t)) -#define XFS_INLINE_EXTS 2 -#define XFS_INLINE_DATA 32 typedef struct xfs_ifork { int if_bytes; /* bytes in if_u1 */ int if_real_bytes; /* bytes allocated in if_u1 */ struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ + int if_height; /* height of the extent tree */ union { - xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ - xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ + void *if_root; /* extent tree root */ char *if_data; /* inline file data */ } if_u1; - union { - xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS]; - /* very small file extents */ - char if_inline_data[XFS_INLINE_DATA]; - /* very small file data */ - xfs_dev_t if_rdev; /* dev number if special */ - uuid_t if_uuid; /* mount point value */ - } if_u2; } xfs_ifork_t; /* @@ -80,7 +43,6 @@ typedef struct xfs_ifork { #define XFS_IFINLINE 0x01 /* Inline data is read in */ #define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ #define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ -#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */ /* * Fork handling. @@ -150,45 +112,75 @@ int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, int); void xfs_init_local_fork(struct xfs_inode *, int, const void *, int); -struct xfs_bmbt_rec_host * - xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); -xfs_extnum_t xfs_iext_count(struct xfs_ifork *); -void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t, - struct xfs_bmbt_irec *, int); -void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int); -void xfs_iext_add_indirect_multi(struct xfs_ifork *, int, - xfs_extnum_t, int); -void xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int); -void xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int); -void xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int); -void xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int); -void xfs_iext_realloc_direct(struct xfs_ifork *, int); -void xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t); -void xfs_iext_inline_to_direct(struct xfs_ifork *, int); +xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp); +void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *, int); +void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *, + int); void xfs_iext_destroy(struct xfs_ifork *); -struct xfs_bmbt_rec_host * - xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *); -struct xfs_ext_irec * - xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *); -struct xfs_ext_irec * - xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *, - int); -void xfs_iext_irec_init(struct xfs_ifork *); -struct xfs_ext_irec * - xfs_iext_irec_new(struct xfs_ifork *, int); -void xfs_iext_irec_remove(struct xfs_ifork *, int); -void xfs_iext_irec_compact(struct xfs_ifork *); -void xfs_iext_irec_compact_pages(struct xfs_ifork *); -void xfs_iext_irec_compact_full(struct xfs_ifork *); -void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); bool xfs_iext_lookup_extent(struct xfs_inode *ip, struct xfs_ifork *ifp, xfs_fileoff_t bno, - xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp); -bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp); -void xfs_iext_update_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, +bool xfs_iext_lookup_extent_before(struct xfs_inode *ip, + struct xfs_ifork *ifp, xfs_fileoff_t *end, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp); +bool xfs_iext_get_extent(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *gotp); +void xfs_iext_update_extent(struct xfs_inode *ip, int state, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *gotp); + +void xfs_iext_first(struct xfs_ifork *, struct xfs_iext_cursor *); +void xfs_iext_last(struct xfs_ifork *, struct xfs_iext_cursor *); +void xfs_iext_next(struct xfs_ifork *, struct xfs_iext_cursor *); +void xfs_iext_prev(struct xfs_ifork *, struct xfs_iext_cursor *); + +static inline bool xfs_iext_next_extent(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp) +{ + xfs_iext_next(ifp, cur); + return xfs_iext_get_extent(ifp, cur, gotp); +} + +static inline bool xfs_iext_prev_extent(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp) +{ + xfs_iext_prev(ifp, cur); + return xfs_iext_get_extent(ifp, cur, gotp); +} + +/* + * Return the extent after cur in gotp without updating the cursor. + */ +static inline bool xfs_iext_peek_next_extent(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp) +{ + struct xfs_iext_cursor ncur = *cur; + + xfs_iext_next(ifp, &ncur); + return xfs_iext_get_extent(ifp, &ncur, gotp); +} + +/* + * Return the extent before cur in gotp without updating the cursor. + */ +static inline bool xfs_iext_peek_prev_extent(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp) +{ + struct xfs_iext_cursor ncur = *cur; + + xfs_iext_prev(ifp, &ncur); + return xfs_iext_get_extent(ifp, &ncur, gotp); +} + +#define for_each_xfs_iext(ifp, ext, got) \ + for (xfs_iext_first((ifp), (ext)); \ + xfs_iext_get_extent((ifp), (ext), (got)); \ + xfs_iext_next((ifp), (ext))) extern struct kmem_zone *xfs_ifork_zone; diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 71de185735e0..349d9f8edb89 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -264,7 +264,7 @@ typedef struct xfs_trans_header { * (if any) is indicated in the ilf_dsize field. Changes to this structure * must be added on to the end. */ -typedef struct xfs_inode_log_format { +struct xfs_inode_log_format { uint16_t ilf_type; /* inode log item type */ uint16_t ilf_size; /* size of this item */ uint32_t ilf_fields; /* flags for fields logged */ @@ -274,12 +274,12 @@ typedef struct xfs_inode_log_format { uint64_t ilf_ino; /* inode number */ union { uint32_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ + uint8_t __pad[16]; /* unused */ } ilf_u; int64_t ilf_blkno; /* blkno of inode buffer */ int32_t ilf_len; /* len of inode buffer */ int32_t ilf_boffset; /* off of inode in buffer */ -} xfs_inode_log_format_t; +}; /* * Old 32 bit systems will log in this format without the 64 bit @@ -295,7 +295,7 @@ struct xfs_inode_log_format_32 { uint64_t ilf_ino; /* inode number */ union { uint32_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ + uint8_t __pad[16]; /* unused */ } ilf_u; int64_t ilf_blkno; /* blkno of inode buffer */ int32_t ilf_len; /* len of inode buffer */ @@ -311,7 +311,7 @@ struct xfs_inode_log_format_32 { #define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */ #define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */ #define XFS_ILOG_DEV 0x010 /* log the dev field */ -#define XFS_ILOG_UUID 0x020 /* log the uuid field */ +#define XFS_ILOG_UUID 0x020 /* added long ago, but never used */ #define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ #define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ #define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ @@ -329,9 +329,9 @@ struct xfs_inode_log_format_32 { #define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ - XFS_ILOG_UUID | XFS_ILOG_ADATA | \ - XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \ - XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) + XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT | XFS_ILOG_DOWNER | \ + XFS_ILOG_AOWNER) #define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT) @@ -341,10 +341,10 @@ struct xfs_inode_log_format_32 { #define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \ XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ - XFS_ILOG_DEV | XFS_ILOG_UUID | \ - XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \ - XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) + XFS_ILOG_DEV | XFS_ILOG_ADATA | \ + XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \ + XFS_ILOG_TIMESTAMP | XFS_ILOG_DOWNER | \ + XFS_ILOG_AOWNER) static inline int xfs_ilog_fbroot(int w) { diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 9d5406b4f663..c40d26763075 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -30,6 +30,7 @@ #include "xfs_bmap.h" #include "xfs_refcount_btree.h" #include "xfs_alloc.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" @@ -1487,27 +1488,12 @@ __xfs_refcount_cow_alloc( xfs_extlen_t aglen, struct xfs_defer_ops *dfops) { - int error; - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, agbno, aglen); /* Add refcount btree reservation */ - error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + return xfs_refcount_adjust_cow(rcur, agbno, aglen, XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops); - if (error) - return error; - - /* Add rmap entry */ - if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { - error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops, - rcur->bc_private.a.agno, - agbno, aglen, XFS_RMAP_OWN_COW); - if (error) - return error; - } - - return error; } /* @@ -1520,27 +1506,12 @@ __xfs_refcount_cow_free( xfs_extlen_t aglen, struct xfs_defer_ops *dfops) { - int error; - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, agbno, aglen); /* Remove refcount btree reservation */ - error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + return xfs_refcount_adjust_cow(rcur, agbno, aglen, XFS_REFCOUNT_ADJUST_COW_FREE, dfops); - if (error) - return error; - - /* Remove rmap entry */ - if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { - error = xfs_rmap_free_extent(rcur->bc_mp, dfops, - rcur->bc_private.a.agno, - agbno, aglen, XFS_RMAP_OWN_COW); - if (error) - return error; - } - - return error; } /* Record a CoW staging extent in the refcount btree. */ @@ -1551,11 +1522,19 @@ xfs_refcount_alloc_cow_extent( xfs_fsblock_t fsb, xfs_extlen_t len) { + int error; + if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; - return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, + error = __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, fsb, len); + if (error) + return error; + + /* Add rmap entry */ + return xfs_rmap_alloc_extent(mp, dfops, XFS_FSB_TO_AGNO(mp, fsb), + XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. */ @@ -1566,9 +1545,17 @@ xfs_refcount_free_cow_extent( xfs_fsblock_t fsb, xfs_extlen_t len) { + int error; + if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; + /* Remove rmap entry */ + error = xfs_rmap_free_extent(mp, dfops, XFS_FSB_TO_AGNO(mp, fsb), + XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); + if (error) + return error; + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW, fsb, len); } diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 55c88a732690..50db920ceeeb 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -34,6 +34,7 @@ #include "xfs_rmap_btree.h" #include "xfs_trans_space.h" #include "xfs_trace.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_extent_busy.h" #include "xfs_bmap.h" @@ -367,6 +368,51 @@ xfs_rmap_lookup_le_range( } /* + * Perform all the relevant owner checks for a removal op. If we're doing an + * unknown-owner removal then we have no owner information to check. + */ +static int +xfs_rmap_free_check_owner( + struct xfs_mount *mp, + uint64_t ltoff, + struct xfs_rmap_irec *rec, + xfs_fsblock_t bno, + xfs_filblks_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + int error = 0; + + if (owner == XFS_RMAP_OWN_UNKNOWN) + return 0; + + /* Make sure the unwritten flag matches. */ + XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == + (rec->rm_flags & XFS_RMAP_UNWRITTEN), out); + + /* Make sure the owner matches what we expect to find in the tree. */ + XFS_WANT_CORRUPTED_GOTO(mp, owner == rec->rm_owner, out); + + /* Check the offset, if necessary. */ + if (XFS_RMAP_NON_INODE_OWNER(owner)) + goto out; + + if (flags & XFS_RMAP_BMBT_BLOCK) { + XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_flags & XFS_RMAP_BMBT_BLOCK, + out); + } else { + XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_offset <= offset, out); + XFS_WANT_CORRUPTED_GOTO(mp, + ltoff + rec->rm_blockcount >= offset + len, + out); + } + +out: + return error; +} + +/* * Find the extent in the rmap btree and remove it. * * The record we find should always be an exact match for the extent that we're @@ -443,33 +489,40 @@ xfs_rmap_unmap( goto out_done; } - /* Make sure the unwritten flag matches. */ - XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == - (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); + /* + * If we're doing an unknown-owner removal for EFI recovery, we expect + * to find the full range in the rmapbt or nothing at all. If we + * don't find any rmaps overlapping either end of the range, we're + * done. Hopefully this means that the EFI creator already queued + * (and finished) a RUI to remove the rmap. + */ + if (owner == XFS_RMAP_OWN_UNKNOWN && + ltrec.rm_startblock + ltrec.rm_blockcount <= bno) { + struct xfs_rmap_irec rtrec; + + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto out_error; + if (i == 0) + goto out_done; + error = xfs_rmap_get_rec(cur, &rtrec, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (rtrec.rm_startblock >= bno + len) + goto out_done; + } /* Make sure the extent we found covers the entire freeing range. */ XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && - ltrec.rm_startblock + ltrec.rm_blockcount >= - bno + len, out_error); + ltrec.rm_startblock + ltrec.rm_blockcount >= + bno + len, out_error); - /* Make sure the owner matches what we expect to find in the tree. */ - XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner || - XFS_RMAP_NON_INODE_OWNER(owner), out_error); - - /* Check the offset, if necessary. */ - if (!XFS_RMAP_NON_INODE_OWNER(owner)) { - if (flags & XFS_RMAP_BMBT_BLOCK) { - XFS_WANT_CORRUPTED_GOTO(mp, - ltrec.rm_flags & XFS_RMAP_BMBT_BLOCK, - out_error); - } else { - XFS_WANT_CORRUPTED_GOTO(mp, - ltrec.rm_offset <= offset, out_error); - XFS_WANT_CORRUPTED_GOTO(mp, - ltoff + ltrec.rm_blockcount >= offset + len, - out_error); - } - } + /* Check owner information. */ + error = xfs_rmap_free_check_owner(mp, ltoff, <rec, bno, len, owner, + offset, flags); + if (error) + goto out_error; if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ @@ -663,6 +716,7 @@ xfs_rmap_map( flags |= XFS_RMAP_UNWRITTEN; trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, unwritten, oinfo); + ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); /* * For the initial lookup, look for an exact match or the left-adjacent diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 466ede637080..0fcd5b1ba729 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -61,7 +61,21 @@ static inline void xfs_rmap_skip_owner_update( struct xfs_owner_info *oi) { - oi->oi_owner = XFS_RMAP_OWN_UNKNOWN; + xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_NULL); +} + +static inline bool +xfs_rmap_should_skip_owner_update( + struct xfs_owner_info *oi) +{ + return oi->oi_owner == XFS_RMAP_OWN_NULL; +} + +static inline void +xfs_rmap_any_owner_update( + struct xfs_owner_info *oi) +{ + xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_UNKNOWN); } /* Reverse mapping functions. */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5d4e43ef4eea..3fb29a5ea915 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -672,7 +672,6 @@ xfs_rtmodify_range( /* * Compute a mask of relevant bits. */ - bit = 0; mask = ((xfs_rtword_t)1 << lastbit) - 1; /* * Set/clear the active bits. @@ -1086,3 +1085,15 @@ xfs_rtalloc_query_all( return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); } + +/* + * Verify that an realtime block number pointer doesn't point off the + * end of the realtime device. + */ +bool +xfs_verify_rtbno( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return rtbno < mp->m_sb.sb_rblocks; +} diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 0220159bd463..3c560695c546 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -48,6 +48,12 @@ typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ typedef int64_t xfs_sfiloff_t; /* signed block number in a file */ /* + * New verifiers will return the instruction address of the failing check. + * NULL means everything is ok. + */ +typedef void * xfs_failaddr_t; + +/* * Null values for the types. */ #define NULLFSBLOCK ((xfs_fsblock_t)-1) @@ -136,5 +142,21 @@ typedef uint32_t xfs_dqid_t; #define XFS_NBWORD (1 << XFS_NBWORDLOG) #define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) +struct xfs_iext_cursor { + struct xfs_iext_leaf *leaf; + int pos; +}; + +typedef enum { + XFS_EXT_NORM, XFS_EXT_UNWRITTEN, +} xfs_exntst_t; + +typedef struct xfs_bmbt_irec +{ + xfs_fileoff_t br_startoff; /* starting file offset */ + xfs_fsblock_t br_startblock; /* starting block number */ + xfs_filblks_t br_blockcount; /* number of blocks */ + xfs_exntst_t br_state; /* extent state */ +} xfs_bmbt_irec_t; #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c new file mode 100644 index 000000000000..2a9b4f9e93c6 --- /dev/null +++ b/fs/xfs/scrub/agheader.c @@ -0,0 +1,658 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* + * Set up scrub to check all the static metadata in each AG. + * This means the SB, AGF, AGI, and AGFL headers. + */ +int +xfs_scrub_setup_ag_header( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = sc->mp; + + if (sc->sm->sm_agno >= mp->m_sb.sb_agcount || + sc->sm->sm_ino || sc->sm->sm_gen) + return -EINVAL; + return xfs_scrub_setup_fs(sc, ip); +} + +/* Walk all the blocks in the AGFL. */ +int +xfs_scrub_walk_agfl( + struct xfs_scrub_context *sc, + int (*fn)(struct xfs_scrub_context *, + xfs_agblock_t bno, void *), + void *priv) +{ + struct xfs_agf *agf; + __be32 *agfl_bno; + struct xfs_mount *mp = sc->mp; + unsigned int flfirst; + unsigned int fllast; + int i; + int error; + + agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, sc->sa.agfl_bp); + flfirst = be32_to_cpu(agf->agf_flfirst); + fllast = be32_to_cpu(agf->agf_fllast); + + /* Nothing to walk in an empty AGFL. */ + if (agf->agf_flcount == cpu_to_be32(0)) + return 0; + + /* first to last is a consecutive list. */ + if (fllast >= flfirst) { + for (i = flfirst; i <= fllast; i++) { + error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); + if (error) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return error; + } + + return 0; + } + + /* first to the end */ + for (i = flfirst; i < XFS_AGFL_SIZE(mp); i++) { + error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); + if (error) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return error; + } + + /* the start to last. */ + for (i = 0; i <= fllast; i++) { + error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); + if (error) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return error; + } + + return 0; +} + +/* Superblock */ + +/* + * Scrub the filesystem superblock. + * + * Note: We do /not/ attempt to check AG 0's superblock. Mount is + * responsible for validating all the geometry information in sb 0, so + * if the filesystem is capable of initiating online scrub, then clearly + * sb 0 is ok and we can use its information to check everything else. + */ +int +xfs_scrub_superblock( + struct xfs_scrub_context *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + struct xfs_dsb *sb; + xfs_agnumber_t agno; + uint32_t v2_ok; + __be32 features_mask; + int error; + __be16 vernum_mask; + + agno = sc->sm->sm_agno; + if (agno == 0) + return 0; + + error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); + if (!xfs_scrub_process_error(sc, agno, XFS_SB_BLOCK(mp), &error)) + return error; + + sb = XFS_BUF_TO_SBP(bp); + + /* + * Verify the geometries match. Fields that are permanently + * set by mkfs are checked; fields that can be updated later + * (and are not propagated to backup superblocks) are preen + * checked. + */ + if (sb->sb_blocksize != cpu_to_be32(mp->m_sb.sb_blocksize)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_dblocks != cpu_to_be64(mp->m_sb.sb_dblocks)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_rblocks != cpu_to_be64(mp->m_sb.sb_rblocks)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_rextents != cpu_to_be64(mp->m_sb.sb_rextents)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (!uuid_equal(&sb->sb_uuid, &mp->m_sb.sb_uuid)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_logstart != cpu_to_be64(mp->m_sb.sb_logstart)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_rootino != cpu_to_be64(mp->m_sb.sb_rootino)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_rextsize != cpu_to_be32(mp->m_sb.sb_rextsize)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_agblocks != cpu_to_be32(mp->m_sb.sb_agblocks)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_agcount != cpu_to_be32(mp->m_sb.sb_agcount)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_rbmblocks != cpu_to_be32(mp->m_sb.sb_rbmblocks)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_logblocks != cpu_to_be32(mp->m_sb.sb_logblocks)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Check sb_versionnum bits that are set at mkfs time. */ + vernum_mask = cpu_to_be16(~XFS_SB_VERSION_OKBITS | + XFS_SB_VERSION_NUMBITS | + XFS_SB_VERSION_ALIGNBIT | + XFS_SB_VERSION_DALIGNBIT | + XFS_SB_VERSION_SHAREDBIT | + XFS_SB_VERSION_LOGV2BIT | + XFS_SB_VERSION_SECTORBIT | + XFS_SB_VERSION_EXTFLGBIT | + XFS_SB_VERSION_DIRV2BIT); + if ((sb->sb_versionnum & vernum_mask) != + (cpu_to_be16(mp->m_sb.sb_versionnum) & vernum_mask)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Check sb_versionnum bits that can be set after mkfs time. */ + vernum_mask = cpu_to_be16(XFS_SB_VERSION_ATTRBIT | + XFS_SB_VERSION_NLINKBIT | + XFS_SB_VERSION_QUOTABIT); + if ((sb->sb_versionnum & vernum_mask) != + (cpu_to_be16(mp->m_sb.sb_versionnum) & vernum_mask)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_sectsize != cpu_to_be16(mp->m_sb.sb_sectsize)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_inodesize != cpu_to_be16(mp->m_sb.sb_inodesize)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_inopblock != cpu_to_be16(mp->m_sb.sb_inopblock)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (memcmp(sb->sb_fname, mp->m_sb.sb_fname, sizeof(sb->sb_fname))) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_blocklog != mp->m_sb.sb_blocklog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_sectlog != mp->m_sb.sb_sectlog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_inodelog != mp->m_sb.sb_inodelog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_inopblog != mp->m_sb.sb_inopblog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_agblklog != mp->m_sb.sb_agblklog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_rextslog != mp->m_sb.sb_rextslog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_imax_pct != mp->m_sb.sb_imax_pct) + xfs_scrub_block_set_preen(sc, bp); + + /* + * Skip the summary counters since we track them in memory anyway. + * sb_icount, sb_ifree, sb_fdblocks, sb_frexents + */ + + if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino)) + xfs_scrub_block_set_preen(sc, bp); + + /* + * Skip the quota flags since repair will force quotacheck. + * sb_qflags + */ + + if (sb->sb_flags != mp->m_sb.sb_flags) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_shared_vn != mp->m_sb.sb_shared_vn) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_inoalignmt != cpu_to_be32(mp->m_sb.sb_inoalignmt)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_unit != cpu_to_be32(mp->m_sb.sb_unit)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_width != cpu_to_be32(mp->m_sb.sb_width)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_dirblklog != mp->m_sb.sb_dirblklog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_logsectlog != mp->m_sb.sb_logsectlog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_logsectsize != cpu_to_be16(mp->m_sb.sb_logsectsize)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_logsunit != cpu_to_be32(mp->m_sb.sb_logsunit)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Do we see any invalid bits in sb_features2? */ + if (!xfs_sb_version_hasmorebits(&mp->m_sb)) { + if (sb->sb_features2 != 0) + xfs_scrub_block_set_corrupt(sc, bp); + } else { + v2_ok = XFS_SB_VERSION2_OKBITS; + if (XFS_SB_VERSION_NUM(&mp->m_sb) >= XFS_SB_VERSION_5) + v2_ok |= XFS_SB_VERSION2_CRCBIT; + + if (!!(sb->sb_features2 & cpu_to_be32(~v2_ok))) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_features2 != sb->sb_bad_features2) + xfs_scrub_block_set_preen(sc, bp); + } + + /* Check sb_features2 flags that are set at mkfs time. */ + features_mask = cpu_to_be32(XFS_SB_VERSION2_LAZYSBCOUNTBIT | + XFS_SB_VERSION2_PROJID32BIT | + XFS_SB_VERSION2_CRCBIT | + XFS_SB_VERSION2_FTYPE); + if ((sb->sb_features2 & features_mask) != + (cpu_to_be32(mp->m_sb.sb_features2) & features_mask)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Check sb_features2 flags that can be set after mkfs time. */ + features_mask = cpu_to_be32(XFS_SB_VERSION2_ATTR2BIT); + if ((sb->sb_features2 & features_mask) != + (cpu_to_be32(mp->m_sb.sb_features2) & features_mask)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (!xfs_sb_version_hascrc(&mp->m_sb)) { + /* all v5 fields must be zero */ + if (memchr_inv(&sb->sb_features_compat, 0, + sizeof(struct xfs_dsb) - + offsetof(struct xfs_dsb, sb_features_compat))) + xfs_scrub_block_set_corrupt(sc, bp); + } else { + /* Check compat flags; all are set at mkfs time. */ + features_mask = cpu_to_be32(XFS_SB_FEAT_COMPAT_UNKNOWN); + if ((sb->sb_features_compat & features_mask) != + (cpu_to_be32(mp->m_sb.sb_features_compat) & features_mask)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Check ro compat flags; all are set at mkfs time. */ + features_mask = cpu_to_be32(XFS_SB_FEAT_RO_COMPAT_UNKNOWN | + XFS_SB_FEAT_RO_COMPAT_FINOBT | + XFS_SB_FEAT_RO_COMPAT_RMAPBT | + XFS_SB_FEAT_RO_COMPAT_REFLINK); + if ((sb->sb_features_ro_compat & features_mask) != + (cpu_to_be32(mp->m_sb.sb_features_ro_compat) & + features_mask)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Check incompat flags; all are set at mkfs time. */ + features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_UNKNOWN | + XFS_SB_FEAT_INCOMPAT_FTYPE | + XFS_SB_FEAT_INCOMPAT_SPINODES | + XFS_SB_FEAT_INCOMPAT_META_UUID); + if ((sb->sb_features_incompat & features_mask) != + (cpu_to_be32(mp->m_sb.sb_features_incompat) & + features_mask)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Check log incompat flags; all are set at mkfs time. */ + features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN); + if ((sb->sb_features_log_incompat & features_mask) != + (cpu_to_be32(mp->m_sb.sb_features_log_incompat) & + features_mask)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Don't care about sb_crc */ + + if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino)) + xfs_scrub_block_set_preen(sc, bp); + + /* Don't care about sb_lsn */ + } + + if (xfs_sb_version_hasmetauuid(&mp->m_sb)) { + /* The metadata UUID must be the same for all supers */ + if (!uuid_equal(&sb->sb_meta_uuid, &mp->m_sb.sb_meta_uuid)) + xfs_scrub_block_set_corrupt(sc, bp); + } + + /* Everything else must be zero. */ + if (memchr_inv(sb + 1, 0, + BBTOB(bp->b_length) - sizeof(struct xfs_dsb))) + xfs_scrub_block_set_corrupt(sc, bp); + + return error; +} + +/* AGF */ + +/* Scrub the AGF. */ +int +xfs_scrub_agf( + struct xfs_scrub_context *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_agf *agf; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_agblock_t eoag; + xfs_agblock_t agfl_first; + xfs_agblock_t agfl_last; + xfs_agblock_t agfl_count; + xfs_agblock_t fl_count; + int level; + int error = 0; + + agno = sc->sa.agno = sc->sm->sm_agno; + error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp, + &sc->sa.agf_bp, &sc->sa.agfl_bp); + if (!xfs_scrub_process_error(sc, agno, XFS_AGF_BLOCK(sc->mp), &error)) + goto out; + + agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + + /* Check the AG length */ + eoag = be32_to_cpu(agf->agf_length); + if (eoag != xfs_ag_block_count(mp, agno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + + /* Check the AGF btree roots and levels */ + agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]); + if (!xfs_verify_agbno(mp, agno, agbno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + + agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]); + if (!xfs_verify_agbno(mp, agno, agbno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + + level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + + level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]); + if (!xfs_verify_agbno(mp, agno, agbno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + + level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + } + + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + agbno = be32_to_cpu(agf->agf_refcount_root); + if (!xfs_verify_agbno(mp, agno, agbno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + + level = be32_to_cpu(agf->agf_refcount_level); + if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + } + + /* Check the AGFL counters */ + agfl_first = be32_to_cpu(agf->agf_flfirst); + agfl_last = be32_to_cpu(agf->agf_fllast); + agfl_count = be32_to_cpu(agf->agf_flcount); + if (agfl_last > agfl_first) + fl_count = agfl_last - agfl_first + 1; + else + fl_count = XFS_AGFL_SIZE(mp) - agfl_first + agfl_last + 1; + if (agfl_count != 0 && fl_count != agfl_count) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + +out: + return error; +} + +/* AGFL */ + +struct xfs_scrub_agfl_info { + unsigned int sz_entries; + unsigned int nr_entries; + xfs_agblock_t *entries; +}; + +/* Scrub an AGFL block. */ +STATIC int +xfs_scrub_agfl_block( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + void *priv) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_scrub_agfl_info *sai = priv; + xfs_agnumber_t agno = sc->sa.agno; + + if (xfs_verify_agbno(mp, agno, agbno) && + sai->nr_entries < sai->sz_entries) + sai->entries[sai->nr_entries++] = agbno; + else + xfs_scrub_block_set_corrupt(sc, sc->sa.agfl_bp); + + return 0; +} + +static int +xfs_scrub_agblock_cmp( + const void *pa, + const void *pb) +{ + const xfs_agblock_t *a = pa; + const xfs_agblock_t *b = pb; + + return (int)*a - (int)*b; +} + +/* Scrub the AGFL. */ +int +xfs_scrub_agfl( + struct xfs_scrub_context *sc) +{ + struct xfs_scrub_agfl_info sai = { 0 }; + struct xfs_agf *agf; + xfs_agnumber_t agno; + unsigned int agflcount; + unsigned int i; + int error; + + agno = sc->sa.agno = sc->sm->sm_agno; + error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp, + &sc->sa.agf_bp, &sc->sa.agfl_bp); + if (!xfs_scrub_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) + goto out; + if (!sc->sa.agf_bp) + return -EFSCORRUPTED; + + /* Allocate buffer to ensure uniqueness of AGFL entries. */ + agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + agflcount = be32_to_cpu(agf->agf_flcount); + if (agflcount > XFS_AGFL_SIZE(sc->mp)) { + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + goto out; + } + sai.sz_entries = agflcount; + sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS); + if (!sai.entries) { + error = -ENOMEM; + goto out; + } + + /* Check the blocks in the AGFL. */ + error = xfs_scrub_walk_agfl(sc, xfs_scrub_agfl_block, &sai); + if (error) + goto out_free; + + if (agflcount != sai.nr_entries) { + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + goto out_free; + } + + /* Sort entries, check for duplicates. */ + sort(sai.entries, sai.nr_entries, sizeof(sai.entries[0]), + xfs_scrub_agblock_cmp, NULL); + for (i = 1; i < sai.nr_entries; i++) { + if (sai.entries[i] == sai.entries[i - 1]) { + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + break; + } + } + +out_free: + kmem_free(sai.entries); +out: + return error; +} + +/* AGI */ + +/* Scrub the AGI. */ +int +xfs_scrub_agi( + struct xfs_scrub_context *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_agi *agi; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_agblock_t eoag; + xfs_agino_t agino; + xfs_agino_t first_agino; + xfs_agino_t last_agino; + xfs_agino_t icount; + int i; + int level; + int error = 0; + + agno = sc->sa.agno = sc->sm->sm_agno; + error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp, + &sc->sa.agf_bp, &sc->sa.agfl_bp); + if (!xfs_scrub_process_error(sc, agno, XFS_AGI_BLOCK(sc->mp), &error)) + goto out; + + agi = XFS_BUF_TO_AGI(sc->sa.agi_bp); + + /* Check the AG length */ + eoag = be32_to_cpu(agi->agi_length); + if (eoag != xfs_ag_block_count(mp, agno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + + /* Check btree roots and levels */ + agbno = be32_to_cpu(agi->agi_root); + if (!xfs_verify_agbno(mp, agno, agbno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + + level = be32_to_cpu(agi->agi_level); + if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + agbno = be32_to_cpu(agi->agi_free_root); + if (!xfs_verify_agbno(mp, agno, agbno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + + level = be32_to_cpu(agi->agi_free_level); + if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + } + + /* Check inode counters */ + xfs_ialloc_agino_range(mp, agno, &first_agino, &last_agino); + icount = be32_to_cpu(agi->agi_count); + if (icount > last_agino - first_agino + 1 || + icount < be32_to_cpu(agi->agi_freecount)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + + /* Check inode pointers */ + agino = be32_to_cpu(agi->agi_newino); + if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + + agino = be32_to_cpu(agi->agi_dirino); + if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + + /* Check unlinked inode buckets */ + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + agino = be32_to_cpu(agi->agi_unlinked[i]); + if (agino == NULLAGINO) + continue; + if (!xfs_verify_agino(mp, agno, agino)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + } + + if (agi->agi_pad32 != cpu_to_be32(0)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + +out: + return error; +} diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c new file mode 100644 index 000000000000..059663e13414 --- /dev/null +++ b/fs/xfs/scrub/alloc.c @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* + * Set us up to scrub free space btrees. + */ +int +xfs_scrub_setup_ag_allocbt( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_setup_ag_btree(sc, ip, false); +} + +/* Free space btree scrubber. */ + +/* Scrub a bnobt/cntbt record. */ +STATIC int +xfs_scrub_allocbt_rec( + struct xfs_scrub_btree *bs, + union xfs_btree_rec *rec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agblock_t bno; + xfs_extlen_t len; + int error = 0; + + bno = be32_to_cpu(rec->alloc.ar_startblock); + len = be32_to_cpu(rec->alloc.ar_blockcount); + + if (bno + len <= bno || + !xfs_verify_agbno(mp, agno, bno) || + !xfs_verify_agbno(mp, agno, bno + len - 1)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + return error; +} + +/* Scrub the freespace btrees for some AG. */ +STATIC int +xfs_scrub_allocbt( + struct xfs_scrub_context *sc, + xfs_btnum_t which) +{ + struct xfs_owner_info oinfo; + struct xfs_btree_cur *cur; + + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); + cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur; + return xfs_scrub_btree(sc, cur, xfs_scrub_allocbt_rec, &oinfo, NULL); +} + +int +xfs_scrub_bnobt( + struct xfs_scrub_context *sc) +{ + return xfs_scrub_allocbt(sc, XFS_BTNUM_BNO); +} + +int +xfs_scrub_cntbt( + struct xfs_scrub_context *sc) +{ + return xfs_scrub_allocbt(sc, XFS_BTNUM_CNT); +} diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c new file mode 100644 index 000000000000..4ed80474f545 --- /dev/null +++ b/fs/xfs/scrub/attr.c @@ -0,0 +1,471 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/dabtree.h" +#include "scrub/trace.h" + +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> + +/* Set us up to scrub an inode's extended attributes. */ +int +xfs_scrub_setup_xattr( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + size_t sz; + + /* + * Allocate the buffer without the inode lock held. We need enough + * space to read every xattr value in the file or enough space to + * hold three copies of the xattr free space bitmap. (Not both at + * the same time.) + */ + sz = max_t(size_t, XATTR_SIZE_MAX, 3 * sizeof(long) * + BITS_TO_LONGS(sc->mp->m_attr_geo->blksize)); + sc->buf = kmem_zalloc_large(sz, KM_SLEEP); + if (!sc->buf) + return -ENOMEM; + + return xfs_scrub_setup_inode_contents(sc, ip, 0); +} + +/* Extended Attributes */ + +struct xfs_scrub_xattr { + struct xfs_attr_list_context context; + struct xfs_scrub_context *sc; +}; + +/* + * Check that an extended attribute key can be looked up by hash. + * + * We use the XFS attribute list iterator (i.e. xfs_attr_list_int_ilocked) + * to call this function for every attribute key in an inode. Once + * we're here, we load the attribute value to see if any errors happen, + * or if we get more or less data than we expected. + */ +static void +xfs_scrub_xattr_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen) +{ + struct xfs_scrub_xattr *sx; + struct xfs_da_args args = { NULL }; + int error = 0; + + sx = container_of(context, struct xfs_scrub_xattr, context); + + if (flags & XFS_ATTR_INCOMPLETE) { + /* Incomplete attr key, just mark the inode for preening. */ + xfs_scrub_ino_set_preen(sx->sc, context->dp->i_ino, NULL); + return; + } + + args.flags = ATTR_KERNOTIME; + if (flags & XFS_ATTR_ROOT) + args.flags |= ATTR_ROOT; + else if (flags & XFS_ATTR_SECURE) + args.flags |= ATTR_SECURE; + args.geo = context->dp->i_mount->m_attr_geo; + args.whichfork = XFS_ATTR_FORK; + args.dp = context->dp; + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.trans = context->tp; + args.value = sx->sc->buf; + args.valuelen = XATTR_SIZE_MAX; + + error = xfs_attr_get_ilocked(context->dp, &args); + if (error == -EEXIST) + error = 0; + if (!xfs_scrub_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, + &error)) + goto fail_xref; + if (args.valuelen != valuelen) + xfs_scrub_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, + args.blkno); + +fail_xref: + return; +} + +/* + * Mark a range [start, start+len) in this map. Returns true if the + * region was free, and false if there's a conflict or a problem. + * + * Within a char, the lowest bit of the char represents the byte with + * the smallest address + */ +STATIC bool +xfs_scrub_xattr_set_map( + struct xfs_scrub_context *sc, + unsigned long *map, + unsigned int start, + unsigned int len) +{ + unsigned int mapsize = sc->mp->m_attr_geo->blksize; + bool ret = true; + + if (start >= mapsize) + return false; + if (start + len > mapsize) { + len = mapsize - start; + ret = false; + } + + if (find_next_bit(map, mapsize, start) < start + len) + ret = false; + bitmap_set(map, start, len); + + return ret; +} + +/* + * Check the leaf freemap from the usage bitmap. Returns false if the + * attr freemap has problems or points to used space. + */ +STATIC bool +xfs_scrub_xattr_check_freemap( + struct xfs_scrub_context *sc, + unsigned long *map, + struct xfs_attr3_icleaf_hdr *leafhdr) +{ + unsigned long *freemap; + unsigned long *dstmap; + unsigned int mapsize = sc->mp->m_attr_geo->blksize; + int i; + + /* Construct bitmap of freemap contents. */ + freemap = (unsigned long *)sc->buf + BITS_TO_LONGS(mapsize); + bitmap_zero(freemap, mapsize); + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + if (!xfs_scrub_xattr_set_map(sc, freemap, + leafhdr->freemap[i].base, + leafhdr->freemap[i].size)) + return false; + } + + /* Look for bits that are set in freemap and are marked in use. */ + dstmap = freemap + BITS_TO_LONGS(mapsize); + return bitmap_and(dstmap, freemap, map, mapsize) == 0; +} + +/* + * Check this leaf entry's relations to everything else. + * Returns the number of bytes used for the name/value data. + */ +STATIC void +xfs_scrub_xattr_entry( + struct xfs_scrub_da_btree *ds, + int level, + char *buf_end, + struct xfs_attr_leafblock *leaf, + struct xfs_attr3_icleaf_hdr *leafhdr, + unsigned long *usedmap, + struct xfs_attr_leaf_entry *ent, + int idx, + unsigned int *usedbytes, + __u32 *last_hashval) +{ + struct xfs_mount *mp = ds->state->mp; + char *name_end; + struct xfs_attr_leaf_name_local *lentry; + struct xfs_attr_leaf_name_remote *rentry; + unsigned int nameidx; + unsigned int namesize; + + if (ent->pad2 != 0) + xfs_scrub_da_set_corrupt(ds, level); + + /* Hash values in order? */ + if (be32_to_cpu(ent->hashval) < *last_hashval) + xfs_scrub_da_set_corrupt(ds, level); + *last_hashval = be32_to_cpu(ent->hashval); + + nameidx = be16_to_cpu(ent->nameidx); + if (nameidx < leafhdr->firstused || + nameidx >= mp->m_attr_geo->blksize) { + xfs_scrub_da_set_corrupt(ds, level); + return; + } + + /* Check the name information. */ + if (ent->flags & XFS_ATTR_LOCAL) { + lentry = xfs_attr3_leaf_name_local(leaf, idx); + namesize = xfs_attr_leaf_entsize_local(lentry->namelen, + be16_to_cpu(lentry->valuelen)); + name_end = (char *)lentry + namesize; + if (lentry->namelen == 0) + xfs_scrub_da_set_corrupt(ds, level); + } else { + rentry = xfs_attr3_leaf_name_remote(leaf, idx); + namesize = xfs_attr_leaf_entsize_remote(rentry->namelen); + name_end = (char *)rentry + namesize; + if (rentry->namelen == 0 || rentry->valueblk == 0) + xfs_scrub_da_set_corrupt(ds, level); + } + if (name_end > buf_end) + xfs_scrub_da_set_corrupt(ds, level); + + if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, nameidx, namesize)) + xfs_scrub_da_set_corrupt(ds, level); + if (!(ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + *usedbytes += namesize; +} + +/* Scrub an attribute leaf. */ +STATIC int +xfs_scrub_xattr_block( + struct xfs_scrub_da_btree *ds, + int level) +{ + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_mount *mp = ds->state->mp; + struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; + struct xfs_buf *bp = blk->bp; + xfs_dablk_t *last_checked = ds->private; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr_leaf_entry *ent; + struct xfs_attr_leaf_entry *entries; + unsigned long *usedmap = ds->sc->buf; + char *buf_end; + size_t off; + __u32 last_hashval = 0; + unsigned int usedbytes = 0; + unsigned int hdrsize; + int i; + + if (*last_checked == blk->blkno) + return 0; + *last_checked = blk->blkno; + bitmap_zero(usedmap, mp->m_attr_geo->blksize); + + /* Check all the padding. */ + if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb)) { + struct xfs_attr3_leafblock *leaf = bp->b_addr; + + if (leaf->hdr.pad1 != 0 || leaf->hdr.pad2 != 0 || + leaf->hdr.info.hdr.pad != 0) + xfs_scrub_da_set_corrupt(ds, level); + } else { + if (leaf->hdr.pad1 != 0 || leaf->hdr.info.pad != 0) + xfs_scrub_da_set_corrupt(ds, level); + } + + /* Check the leaf header */ + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); + hdrsize = xfs_attr3_leaf_hdr_size(leaf); + + if (leafhdr.usedbytes > mp->m_attr_geo->blksize) + xfs_scrub_da_set_corrupt(ds, level); + if (leafhdr.firstused > mp->m_attr_geo->blksize) + xfs_scrub_da_set_corrupt(ds, level); + if (leafhdr.firstused < hdrsize) + xfs_scrub_da_set_corrupt(ds, level); + if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, 0, hdrsize)) + xfs_scrub_da_set_corrupt(ds, level); + + if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + entries = xfs_attr3_leaf_entryp(leaf); + if ((char *)&entries[leafhdr.count] > (char *)leaf + leafhdr.firstused) + xfs_scrub_da_set_corrupt(ds, level); + + buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize; + for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) { + /* Mark the leaf entry itself. */ + off = (char *)ent - (char *)leaf; + if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, off, + sizeof(xfs_attr_leaf_entry_t))) { + xfs_scrub_da_set_corrupt(ds, level); + goto out; + } + + /* Check the entry and nameval. */ + xfs_scrub_xattr_entry(ds, level, buf_end, leaf, &leafhdr, + usedmap, ent, i, &usedbytes, &last_hashval); + + if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + } + + if (!xfs_scrub_xattr_check_freemap(ds->sc, usedmap, &leafhdr)) + xfs_scrub_da_set_corrupt(ds, level); + + if (leafhdr.usedbytes != usedbytes) + xfs_scrub_da_set_corrupt(ds, level); + +out: + return 0; +} + +/* Scrub a attribute btree record. */ +STATIC int +xfs_scrub_xattr_rec( + struct xfs_scrub_da_btree *ds, + int level, + void *rec) +{ + struct xfs_mount *mp = ds->state->mp; + struct xfs_attr_leaf_entry *ent = rec; + struct xfs_da_state_blk *blk; + struct xfs_attr_leaf_name_local *lentry; + struct xfs_attr_leaf_name_remote *rentry; + struct xfs_buf *bp; + xfs_dahash_t calc_hash; + xfs_dahash_t hash; + int nameidx; + int hdrsize; + unsigned int badflags; + int error; + + blk = &ds->state->path.blk[level]; + + /* Check the whole block, if necessary. */ + error = xfs_scrub_xattr_block(ds, level); + if (error) + goto out; + if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Check the hash of the entry. */ + error = xfs_scrub_da_btree_hash(ds, level, &ent->hashval); + if (error) + goto out; + + /* Find the attr entry's location. */ + bp = blk->bp; + hdrsize = xfs_attr3_leaf_hdr_size(bp->b_addr); + nameidx = be16_to_cpu(ent->nameidx); + if (nameidx < hdrsize || nameidx >= mp->m_attr_geo->blksize) { + xfs_scrub_da_set_corrupt(ds, level); + goto out; + } + + /* Retrieve the entry and check it. */ + hash = be32_to_cpu(ent->hashval); + badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE | + XFS_ATTR_INCOMPLETE); + if ((ent->flags & badflags) != 0) + xfs_scrub_da_set_corrupt(ds, level); + if (ent->flags & XFS_ATTR_LOCAL) { + lentry = (struct xfs_attr_leaf_name_local *) + (((char *)bp->b_addr) + nameidx); + if (lentry->namelen <= 0) { + xfs_scrub_da_set_corrupt(ds, level); + goto out; + } + calc_hash = xfs_da_hashname(lentry->nameval, lentry->namelen); + } else { + rentry = (struct xfs_attr_leaf_name_remote *) + (((char *)bp->b_addr) + nameidx); + if (rentry->namelen <= 0) { + xfs_scrub_da_set_corrupt(ds, level); + goto out; + } + calc_hash = xfs_da_hashname(rentry->name, rentry->namelen); + } + if (calc_hash != hash) + xfs_scrub_da_set_corrupt(ds, level); + +out: + return error; +} + +/* Scrub the extended attribute metadata. */ +int +xfs_scrub_xattr( + struct xfs_scrub_context *sc) +{ + struct xfs_scrub_xattr sx; + struct attrlist_cursor_kern cursor = { 0 }; + xfs_dablk_t last_checked = -1U; + int error = 0; + + if (!xfs_inode_hasattr(sc->ip)) + return -ENOENT; + + memset(&sx, 0, sizeof(sx)); + /* Check attribute tree structure */ + error = xfs_scrub_da_btree(sc, XFS_ATTR_FORK, xfs_scrub_xattr_rec, + &last_checked); + if (error) + goto out; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Check that every attr key can also be looked up by hash. */ + sx.context.dp = sc->ip; + sx.context.cursor = &cursor; + sx.context.resynch = 1; + sx.context.put_listent = xfs_scrub_xattr_listent; + sx.context.tp = sc->tp; + sx.context.flags = ATTR_INCOMPLETE; + sx.sc = sc; + + /* + * Look up every xattr in this file by name. + * + * Use the backend implementation of xfs_attr_list to call + * xfs_scrub_xattr_listent on every attribute key in this inode. + * In other words, we use the same iterator/callback mechanism + * that listattr uses to scrub extended attributes, though in our + * _listent function, we check the value of the attribute. + * + * The VFS only locks i_rwsem when modifying attrs, so keep all + * three locks held because that's the only way to ensure we're + * the only thread poking into the da btree. We traverse the da + * btree while holding a leaf buffer locked for the xattr name + * iteration, which doesn't really follow the usual buffer + * locking order. + */ + error = xfs_attr_list_int_ilocked(&sx.context); + if (!xfs_scrub_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) + goto out; +out: + return error; +} diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c new file mode 100644 index 000000000000..42fec0bcd9e1 --- /dev/null +++ b/fs/xfs/scrub/bmap.c @@ -0,0 +1,363 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_rmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* Set us up with an inode's bmap. */ +int +xfs_scrub_setup_inode_bmap( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = sc->mp; + int error; + + error = xfs_scrub_get_inode(sc, ip); + if (error) + goto out; + + sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + + /* + * We don't want any ephemeral data fork updates sitting around + * while we inspect block mappings, so wait for directio to finish + * and flush dirty data if we have delalloc reservations. + */ + if (S_ISREG(VFS_I(sc->ip)->i_mode) && + sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) { + inode_dio_wait(VFS_I(sc->ip)); + error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping); + if (error) + goto out; + } + + /* Got the inode, lock it and we're ready to go. */ + error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + if (error) + goto out; + sc->ilock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + +out: + /* scrub teardown will unlock and release the inode */ + return error; +} + +/* + * Inode fork block mapping (BMBT) scrubber. + * More complex than the others because we have to scrub + * all the extents regardless of whether or not the fork + * is in btree format. + */ + +struct xfs_scrub_bmap_info { + struct xfs_scrub_context *sc; + xfs_fileoff_t lastoff; + bool is_rt; + bool is_shared; + int whichfork; +}; + +/* Scrub a single extent record. */ +STATIC int +xfs_scrub_bmap_extent( + struct xfs_inode *ip, + struct xfs_btree_cur *cur, + struct xfs_scrub_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = info->sc->mp; + struct xfs_buf *bp = NULL; + int error = 0; + + if (cur) + xfs_btree_get_block(cur, 0, &bp); + + /* + * Check for out-of-order extents. This record could have come + * from the incore list, for which there is no ordering check. + */ + if (irec->br_startoff < info->lastoff) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* There should never be a "hole" extent in either extent list. */ + if (irec->br_startblock == HOLESTARTBLOCK) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* + * Check for delalloc extents. We never iterate the ones in the + * in-core extent scan, and we should never see these in the bmbt. + */ + if (isnullstartblock(irec->br_startblock)) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Make sure the extent points to a valid place. */ + if (irec->br_startblock + irec->br_blockcount <= irec->br_startblock) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (info->is_rt && + (!xfs_verify_rtbno(mp, irec->br_startblock) || + !xfs_verify_rtbno(mp, irec->br_startblock + + irec->br_blockcount - 1))) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (!info->is_rt && + (!xfs_verify_fsbno(mp, irec->br_startblock) || + !xfs_verify_fsbno(mp, irec->br_startblock + + irec->br_blockcount - 1))) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* We don't allow unwritten extents on attr forks. */ + if (irec->br_state == XFS_EXT_UNWRITTEN && + info->whichfork == XFS_ATTR_FORK) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + info->lastoff = irec->br_startoff + irec->br_blockcount; + return error; +} + +/* Scrub a bmbt record. */ +STATIC int +xfs_scrub_bmapbt_rec( + struct xfs_scrub_btree *bs, + union xfs_btree_rec *rec) +{ + struct xfs_bmbt_irec irec; + struct xfs_scrub_bmap_info *info = bs->private; + struct xfs_inode *ip = bs->cur->bc_private.b.ip; + struct xfs_buf *bp = NULL; + struct xfs_btree_block *block; + uint64_t owner; + int i; + + /* + * Check the owners of the btree blocks up to the level below + * the root since the verifiers don't do that. + */ + if (xfs_sb_version_hascrc(&bs->cur->bc_mp->m_sb) && + bs->cur->bc_ptrs[0] == 1) { + for (i = 0; i < bs->cur->bc_nlevels - 1; i++) { + block = xfs_btree_get_block(bs->cur, i, &bp); + owner = be64_to_cpu(block->bb_u.l.bb_owner); + if (owner != ip->i_ino) + xfs_scrub_fblock_set_corrupt(bs->sc, + info->whichfork, 0); + } + } + + /* Set up the in-core record and scrub it. */ + xfs_bmbt_disk_get_all(&rec->bmbt, &irec); + return xfs_scrub_bmap_extent(ip, bs->cur, info, &irec); +} + +/* Scan the btree records. */ +STATIC int +xfs_scrub_bmap_btree( + struct xfs_scrub_context *sc, + int whichfork, + struct xfs_scrub_bmap_info *info) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct xfs_btree_cur *cur; + int error; + + cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork); + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); + error = xfs_scrub_btree(sc, cur, xfs_scrub_bmapbt_rec, &oinfo, info); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : + XFS_BTREE_NOERROR); + return error; +} + +/* + * Scrub an inode fork's block mappings. + * + * First we scan every record in every btree block, if applicable. + * Then we unconditionally scan the incore extent cache. + */ +STATIC int +xfs_scrub_bmap( + struct xfs_scrub_context *sc, + int whichfork) +{ + struct xfs_bmbt_irec irec; + struct xfs_scrub_bmap_info info = { NULL }; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct xfs_ifork *ifp; + xfs_fileoff_t endoff; + struct xfs_iext_cursor icur; + bool found; + int error = 0; + + ifp = XFS_IFORK_PTR(ip, whichfork); + + info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip); + info.whichfork = whichfork; + info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip); + info.sc = sc; + + switch (whichfork) { + case XFS_COW_FORK: + /* Non-existent CoW forks are ignorable. */ + if (!ifp) + goto out; + /* No CoW forks on non-reflink inodes/filesystems. */ + if (!xfs_is_reflink_inode(ip)) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + goto out; + } + break; + case XFS_ATTR_FORK: + if (!ifp) + goto out; + if (!xfs_sb_version_hasattr(&mp->m_sb) && + !xfs_sb_version_hasattr2(&mp->m_sb)) + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + break; + default: + ASSERT(whichfork == XFS_DATA_FORK); + break; + } + + /* Check the fork values */ + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_UUID: + case XFS_DINODE_FMT_DEV: + case XFS_DINODE_FMT_LOCAL: + /* No mappings to check. */ + goto out; + case XFS_DINODE_FMT_EXTENTS: + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + xfs_scrub_fblock_set_corrupt(sc, whichfork, 0); + goto out; + } + break; + case XFS_DINODE_FMT_BTREE: + if (whichfork == XFS_COW_FORK) { + xfs_scrub_fblock_set_corrupt(sc, whichfork, 0); + goto out; + } + + error = xfs_scrub_bmap_btree(sc, whichfork, &info); + if (error) + goto out; + break; + default: + xfs_scrub_fblock_set_corrupt(sc, whichfork, 0); + goto out; + } + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Now try to scrub the in-memory extent list. */ + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(sc->tp, ip, whichfork); + if (!xfs_scrub_fblock_process_error(sc, whichfork, 0, &error)) + goto out; + } + + /* Find the offset of the last extent in the mapping. */ + error = xfs_bmap_last_offset(ip, &endoff, whichfork); + if (!xfs_scrub_fblock_process_error(sc, whichfork, 0, &error)) + goto out; + + /* Scrub extent records. */ + info.lastoff = 0; + ifp = XFS_IFORK_PTR(ip, whichfork); + for (found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &irec); + found != 0; + found = xfs_iext_next_extent(ifp, &icur, &irec)) { + if (xfs_scrub_should_terminate(sc, &error)) + break; + if (isnullstartblock(irec.br_startblock)) + continue; + if (irec.br_startoff >= endoff) { + xfs_scrub_fblock_set_corrupt(sc, whichfork, + irec.br_startoff); + goto out; + } + error = xfs_scrub_bmap_extent(ip, NULL, &info, &irec); + if (error) + goto out; + } + +out: + return error; +} + +/* Scrub an inode's data fork. */ +int +xfs_scrub_bmap_data( + struct xfs_scrub_context *sc) +{ + return xfs_scrub_bmap(sc, XFS_DATA_FORK); +} + +/* Scrub an inode's attr fork. */ +int +xfs_scrub_bmap_attr( + struct xfs_scrub_context *sc) +{ + return xfs_scrub_bmap(sc, XFS_ATTR_FORK); +} + +/* Scrub an inode's CoW fork. */ +int +xfs_scrub_bmap_cow( + struct xfs_scrub_context *sc) +{ + if (!xfs_is_reflink_inode(sc->ip)) + return -ENOENT; + + return xfs_scrub_bmap(sc, XFS_COW_FORK); +} diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c new file mode 100644 index 000000000000..df0766132ace --- /dev/null +++ b/fs/xfs/scrub/btree.c @@ -0,0 +1,516 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* btree scrubbing */ + +/* + * Check for btree operation errors. See the section about handling + * operational errors in common.c. + */ +bool +xfs_scrub_btree_process_error( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + int level, + int *error) +{ + if (*error == 0) + return true; + + switch (*error) { + case -EDEADLOCK: + /* Used to restart an op with deadlock avoidance. */ + trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error); + break; + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Note the badness but don't abort. */ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + *error = 0; + /* fall through */ + default: + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + trace_xfs_scrub_ifork_btree_op_error(sc, cur, level, + *error, __return_address); + else + trace_xfs_scrub_btree_op_error(sc, cur, level, + *error, __return_address); + break; + } + return false; +} + +/* Record btree block corruption. */ +void +xfs_scrub_btree_set_corrupt( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + int level) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + trace_xfs_scrub_ifork_btree_error(sc, cur, level, + __return_address); + else + trace_xfs_scrub_btree_error(sc, cur, level, + __return_address); +} + +/* + * Make sure this record is in order and doesn't stray outside of the parent + * keys. + */ +STATIC void +xfs_scrub_btree_rec( + struct xfs_scrub_btree *bs) +{ + struct xfs_btree_cur *cur = bs->cur; + union xfs_btree_rec *rec; + union xfs_btree_key key; + union xfs_btree_key hkey; + union xfs_btree_key *keyp; + struct xfs_btree_block *block; + struct xfs_btree_block *keyblock; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cur, 0, &bp); + rec = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block); + + trace_xfs_scrub_btree_rec(bs->sc, cur, 0); + + /* If this isn't the first record, are they in order? */ + if (!bs->firstrec && !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec)) + xfs_scrub_btree_set_corrupt(bs->sc, cur, 0); + bs->firstrec = false; + memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len); + + if (cur->bc_nlevels == 1) + return; + + /* Is this at least as large as the parent low key? */ + cur->bc_ops->init_key_from_rec(&key, rec); + keyblock = xfs_btree_get_block(cur, 1, &bp); + keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[1], keyblock); + if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0) + xfs_scrub_btree_set_corrupt(bs->sc, cur, 1); + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return; + + /* Is this no larger than the parent high key? */ + cur->bc_ops->init_high_key_from_rec(&hkey, rec); + keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[1], keyblock); + if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0) + xfs_scrub_btree_set_corrupt(bs->sc, cur, 1); +} + +/* + * Make sure this key is in order and doesn't stray outside of the parent + * keys. + */ +STATIC void +xfs_scrub_btree_key( + struct xfs_scrub_btree *bs, + int level) +{ + struct xfs_btree_cur *cur = bs->cur; + union xfs_btree_key *key; + union xfs_btree_key *keyp; + struct xfs_btree_block *block; + struct xfs_btree_block *keyblock; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cur, level, &bp); + key = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block); + + trace_xfs_scrub_btree_key(bs->sc, cur, level); + + /* If this isn't the first key, are they in order? */ + if (!bs->firstkey[level] && + !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level], key)) + xfs_scrub_btree_set_corrupt(bs->sc, cur, level); + bs->firstkey[level] = false; + memcpy(&bs->lastkey[level], key, cur->bc_ops->key_len); + + if (level + 1 >= cur->bc_nlevels) + return; + + /* Is this at least as large as the parent low key? */ + keyblock = xfs_btree_get_block(cur, level + 1, &bp); + keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1], keyblock); + if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0) + xfs_scrub_btree_set_corrupt(bs->sc, cur, level); + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return; + + /* Is this no larger than the parent high key? */ + key = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block); + keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1], keyblock); + if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0) + xfs_scrub_btree_set_corrupt(bs->sc, cur, level); +} + +/* + * Check a btree pointer. Returns true if it's ok to use this pointer. + * Callers do not need to set the corrupt flag. + */ +static bool +xfs_scrub_btree_ptr_ok( + struct xfs_scrub_btree *bs, + int level, + union xfs_btree_ptr *ptr) +{ + bool res; + + /* A btree rooted in an inode has no block pointer to the root. */ + if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == bs->cur->bc_nlevels) + return true; + + /* Otherwise, check the pointers. */ + if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS) + res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level); + else + res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level); + if (!res) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level); + + return res; +} + +/* Check that a btree block's sibling matches what we expect it. */ +STATIC int +xfs_scrub_btree_block_check_sibling( + struct xfs_scrub_btree *bs, + int level, + int direction, + union xfs_btree_ptr *sibling) +{ + struct xfs_btree_cur *cur = bs->cur; + struct xfs_btree_block *pblock; + struct xfs_buf *pbp; + struct xfs_btree_cur *ncur = NULL; + union xfs_btree_ptr *pp; + int success; + int error; + + error = xfs_btree_dup_cursor(cur, &ncur); + if (!xfs_scrub_btree_process_error(bs->sc, cur, level + 1, &error) || + !ncur) + return error; + + /* + * If the pointer is null, we shouldn't be able to move the upper + * level pointer anywhere. + */ + if (xfs_btree_ptr_is_null(cur, sibling)) { + if (direction > 0) + error = xfs_btree_increment(ncur, level + 1, &success); + else + error = xfs_btree_decrement(ncur, level + 1, &success); + if (error == 0 && success) + xfs_scrub_btree_set_corrupt(bs->sc, cur, level); + error = 0; + goto out; + } + + /* Increment upper level pointer. */ + if (direction > 0) + error = xfs_btree_increment(ncur, level + 1, &success); + else + error = xfs_btree_decrement(ncur, level + 1, &success); + if (!xfs_scrub_btree_process_error(bs->sc, cur, level + 1, &error)) + goto out; + if (!success) { + xfs_scrub_btree_set_corrupt(bs->sc, cur, level + 1); + goto out; + } + + /* Compare upper level pointer to sibling pointer. */ + pblock = xfs_btree_get_block(ncur, level + 1, &pbp); + pp = xfs_btree_ptr_addr(ncur, ncur->bc_ptrs[level + 1], pblock); + if (!xfs_scrub_btree_ptr_ok(bs, level + 1, pp)) + goto out; + + if (xfs_btree_diff_two_ptrs(cur, pp, sibling)) + xfs_scrub_btree_set_corrupt(bs->sc, cur, level); +out: + xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR); + return error; +} + +/* Check the siblings of a btree block. */ +STATIC int +xfs_scrub_btree_block_check_siblings( + struct xfs_scrub_btree *bs, + struct xfs_btree_block *block) +{ + struct xfs_btree_cur *cur = bs->cur; + union xfs_btree_ptr leftsib; + union xfs_btree_ptr rightsib; + int level; + int error = 0; + + xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB); + xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB); + level = xfs_btree_get_level(block); + + /* Root block should never have siblings. */ + if (level == cur->bc_nlevels - 1) { + if (!xfs_btree_ptr_is_null(cur, &leftsib) || + !xfs_btree_ptr_is_null(cur, &rightsib)) + xfs_scrub_btree_set_corrupt(bs->sc, cur, level); + goto out; + } + + /* + * Does the left & right sibling pointers match the adjacent + * parent level pointers? + * (These function absorbs error codes for us.) + */ + error = xfs_scrub_btree_block_check_sibling(bs, level, -1, &leftsib); + if (error) + return error; + error = xfs_scrub_btree_block_check_sibling(bs, level, 1, &rightsib); + if (error) + return error; +out: + return error; +} + +/* + * Grab and scrub a btree block given a btree pointer. Returns block + * and buffer pointers (if applicable) if they're ok to use. + */ +STATIC int +xfs_scrub_btree_get_block( + struct xfs_scrub_btree *bs, + int level, + union xfs_btree_ptr *pp, + struct xfs_btree_block **pblock, + struct xfs_buf **pbp) +{ + void *failed_at; + int error; + + *pblock = NULL; + *pbp = NULL; + + error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock); + if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, level, &error) || + !*pblock) + return error; + + xfs_btree_get_block(bs->cur, level, pbp); + if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS) + failed_at = __xfs_btree_check_lblock(bs->cur, *pblock, + level, *pbp); + else + failed_at = __xfs_btree_check_sblock(bs->cur, *pblock, + level, *pbp); + if (failed_at) { + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level); + return 0; + } + + /* + * Check the block's siblings; this function absorbs error codes + * for us. + */ + return xfs_scrub_btree_block_check_siblings(bs, *pblock); +} + +/* + * Check that the low and high keys of this block match the keys stored + * in the parent block. + */ +STATIC void +xfs_scrub_btree_block_keys( + struct xfs_scrub_btree *bs, + int level, + struct xfs_btree_block *block) +{ + union xfs_btree_key block_keys; + struct xfs_btree_cur *cur = bs->cur; + union xfs_btree_key *high_bk; + union xfs_btree_key *parent_keys; + union xfs_btree_key *high_pk; + struct xfs_btree_block *parent_block; + struct xfs_buf *bp; + + if (level >= cur->bc_nlevels - 1) + return; + + /* Calculate the keys for this block. */ + xfs_btree_get_keys(cur, block, &block_keys); + + /* Obtain the parent's copy of the keys for this block. */ + parent_block = xfs_btree_get_block(cur, level + 1, &bp); + parent_keys = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1], + parent_block); + + if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0) + xfs_scrub_btree_set_corrupt(bs->sc, cur, 1); + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return; + + /* Get high keys */ + high_bk = xfs_btree_high_key_from_key(cur, &block_keys); + high_pk = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1], + parent_block); + + if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0) + xfs_scrub_btree_set_corrupt(bs->sc, cur, 1); +} + +/* + * Visit all nodes and leaves of a btree. Check that all pointers and + * records are in order, that the keys reflect the records, and use a callback + * so that the caller can verify individual records. + */ +int +xfs_scrub_btree( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + xfs_scrub_btree_rec_fn scrub_fn, + struct xfs_owner_info *oinfo, + void *private) +{ + struct xfs_scrub_btree bs = { NULL }; + union xfs_btree_ptr ptr; + union xfs_btree_ptr *pp; + union xfs_btree_rec *recp; + struct xfs_btree_block *block; + int level; + struct xfs_buf *bp; + int i; + int error = 0; + + /* Initialize scrub state */ + bs.cur = cur; + bs.scrub_rec = scrub_fn; + bs.oinfo = oinfo; + bs.firstrec = true; + bs.private = private; + bs.sc = sc; + for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) + bs.firstkey[i] = true; + INIT_LIST_HEAD(&bs.to_check); + + /* Don't try to check a tree with a height we can't handle. */ + if (cur->bc_nlevels > XFS_BTREE_MAXLEVELS) { + xfs_scrub_btree_set_corrupt(sc, cur, 0); + goto out; + } + + /* + * Load the root of the btree. The helper function absorbs + * error codes for us. + */ + level = cur->bc_nlevels - 1; + cur->bc_ops->init_ptr_from_cur(cur, &ptr); + if (!xfs_scrub_btree_ptr_ok(&bs, cur->bc_nlevels, &ptr)) + goto out; + error = xfs_scrub_btree_get_block(&bs, level, &ptr, &block, &bp); + if (error || !block) + goto out; + + cur->bc_ptrs[level] = 1; + + while (level < cur->bc_nlevels) { + block = xfs_btree_get_block(cur, level, &bp); + + if (level == 0) { + /* End of leaf, pop back towards the root. */ + if (cur->bc_ptrs[level] > + be16_to_cpu(block->bb_numrecs)) { + xfs_scrub_btree_block_keys(&bs, level, block); + if (level < cur->bc_nlevels - 1) + cur->bc_ptrs[level + 1]++; + level++; + continue; + } + + /* Records in order for scrub? */ + xfs_scrub_btree_rec(&bs); + + /* Call out to the record checker. */ + recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block); + error = bs.scrub_rec(&bs, recp); + if (error) + break; + if (xfs_scrub_should_terminate(sc, &error) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + break; + + cur->bc_ptrs[level]++; + continue; + } + + /* End of node, pop back towards the root. */ + if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) { + xfs_scrub_btree_block_keys(&bs, level, block); + if (level < cur->bc_nlevels - 1) + cur->bc_ptrs[level + 1]++; + level++; + continue; + } + + /* Keys in order for scrub? */ + xfs_scrub_btree_key(&bs, level); + + /* Drill another level deeper. */ + pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block); + if (!xfs_scrub_btree_ptr_ok(&bs, level, pp)) { + cur->bc_ptrs[level]++; + continue; + } + level--; + error = xfs_scrub_btree_get_block(&bs, level, pp, &block, &bp); + if (error || !block) + goto out; + + cur->bc_ptrs[level] = 1; + } + +out: + return error; +} diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h new file mode 100644 index 000000000000..4de825a626d1 --- /dev/null +++ b/fs/xfs/scrub/btree.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_SCRUB_BTREE_H__ +#define __XFS_SCRUB_BTREE_H__ + +/* btree scrub */ + +/* Check for btree operation errors. */ +bool xfs_scrub_btree_process_error(struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, int level, int *error); + +/* Check for btree corruption. */ +void xfs_scrub_btree_set_corrupt(struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, int level); + +struct xfs_scrub_btree; +typedef int (*xfs_scrub_btree_rec_fn)( + struct xfs_scrub_btree *bs, + union xfs_btree_rec *rec); + +struct xfs_scrub_btree { + /* caller-provided scrub state */ + struct xfs_scrub_context *sc; + struct xfs_btree_cur *cur; + xfs_scrub_btree_rec_fn scrub_rec; + struct xfs_owner_info *oinfo; + void *private; + + /* internal scrub state */ + union xfs_btree_rec lastrec; + bool firstrec; + union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS]; + bool firstkey[XFS_BTREE_MAXLEVELS]; + struct list_head to_check; +}; +int xfs_scrub_btree(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, + xfs_scrub_btree_rec_fn scrub_fn, + struct xfs_owner_info *oinfo, void *private); + +#endif /* __XFS_SCRUB_BTREE_H__ */ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c new file mode 100644 index 000000000000..ac95fe911d96 --- /dev/null +++ b/fs/xfs/scrub/common.c @@ -0,0 +1,574 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_itable.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/btree.h" + +/* Common code for the metadata scrubbers. */ + +/* + * Handling operational errors. + * + * The *_process_error() family of functions are used to process error return + * codes from functions called as part of a scrub operation. + * + * If there's no error, we return true to tell the caller that it's ok + * to move on to the next check in its list. + * + * For non-verifier errors (e.g. ENOMEM) we return false to tell the + * caller that something bad happened, and we preserve *error so that + * the caller can return the *error up the stack to userspace. + * + * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting + * OFLAG_CORRUPT in sm_flags and the *error is cleared. In other words, + * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT, + * not via return codes. We return false to tell the caller that + * something bad happened. Since the error has been cleared, the caller + * will (presumably) return that zero and scrubbing will move on to + * whatever's next. + * + * ftrace can be used to record the precise metadata location and the + * approximate code location of the failed operation. + */ + +/* Check for operational errors. */ +bool +xfs_scrub_process_error( + struct xfs_scrub_context *sc, + xfs_agnumber_t agno, + xfs_agblock_t bno, + int *error) +{ + switch (*error) { + case 0: + return true; + case -EDEADLOCK: + /* Used to restart an op with deadlock avoidance. */ + trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error); + break; + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Note the badness but don't abort. */ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + *error = 0; + /* fall through */ + default: + trace_xfs_scrub_op_error(sc, agno, bno, *error, + __return_address); + break; + } + return false; +} + +/* Check for operational errors for a file offset. */ +bool +xfs_scrub_fblock_process_error( + struct xfs_scrub_context *sc, + int whichfork, + xfs_fileoff_t offset, + int *error) +{ + switch (*error) { + case 0: + return true; + case -EDEADLOCK: + /* Used to restart an op with deadlock avoidance. */ + trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error); + break; + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Note the badness but don't abort. */ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + *error = 0; + /* fall through */ + default: + trace_xfs_scrub_file_op_error(sc, whichfork, offset, *error, + __return_address); + break; + } + return false; +} + +/* + * Handling scrub corruption/optimization/warning checks. + * + * The *_set_{corrupt,preen,warning}() family of functions are used to + * record the presence of metadata that is incorrect (corrupt), could be + * optimized somehow (preen), or should be flagged for administrative + * review but is not incorrect (warn). + * + * ftrace can be used to record the precise metadata location and + * approximate code location of the failed check. + */ + +/* Record a block which could be optimized. */ +void +xfs_scrub_block_set_preen( + struct xfs_scrub_context *sc, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; + trace_xfs_scrub_block_preen(sc, bp->b_bn, __return_address); +} + +/* + * Record an inode which could be optimized. The trace data will + * include the block given by bp if bp is given; otherwise it will use + * the block location of the inode record itself. + */ +void +xfs_scrub_ino_set_preen( + struct xfs_scrub_context *sc, + xfs_ino_t ino, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; + trace_xfs_scrub_ino_preen(sc, ino, bp ? bp->b_bn : 0, + __return_address); +} + +/* Record a corrupt block. */ +void +xfs_scrub_block_set_corrupt( + struct xfs_scrub_context *sc, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xfs_scrub_block_error(sc, bp->b_bn, __return_address); +} + +/* + * Record a corrupt inode. The trace data will include the block given + * by bp if bp is given; otherwise it will use the block location of the + * inode record itself. + */ +void +xfs_scrub_ino_set_corrupt( + struct xfs_scrub_context *sc, + xfs_ino_t ino, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xfs_scrub_ino_error(sc, ino, bp ? bp->b_bn : 0, __return_address); +} + +/* Record corruption in a block indexed by a file fork. */ +void +xfs_scrub_fblock_set_corrupt( + struct xfs_scrub_context *sc, + int whichfork, + xfs_fileoff_t offset) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xfs_scrub_fblock_error(sc, whichfork, offset, __return_address); +} + +/* + * Warn about inodes that need administrative review but is not + * incorrect. + */ +void +xfs_scrub_ino_set_warning( + struct xfs_scrub_context *sc, + xfs_ino_t ino, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING; + trace_xfs_scrub_ino_warning(sc, ino, bp ? bp->b_bn : 0, + __return_address); +} + +/* Warn about a block indexed by a file fork that needs review. */ +void +xfs_scrub_fblock_set_warning( + struct xfs_scrub_context *sc, + int whichfork, + xfs_fileoff_t offset) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING; + trace_xfs_scrub_fblock_warning(sc, whichfork, offset, __return_address); +} + +/* Signal an incomplete scrub. */ +void +xfs_scrub_set_incomplete( + struct xfs_scrub_context *sc) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE; + trace_xfs_scrub_incomplete(sc, __return_address); +} + +/* + * AG scrubbing + * + * These helpers facilitate locking an allocation group's header + * buffers, setting up cursors for all btrees that are present, and + * cleaning everything up once we're through. + */ + +/* Decide if we want to return an AG header read failure. */ +static inline bool +want_ag_read_header_failure( + struct xfs_scrub_context *sc, + unsigned int type) +{ + /* Return all AG header read failures when scanning btrees. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF && + sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL && + sc->sm->sm_type != XFS_SCRUB_TYPE_AGI) + return true; + /* + * If we're scanning a given type of AG header, we only want to + * see read failures from that specific header. We'd like the + * other headers to cross-check them, but this isn't required. + */ + if (sc->sm->sm_type == type) + return true; + return false; +} + +/* + * Grab all the headers for an AG. + * + * The headers should be released by xfs_scrub_ag_free, but as a fail + * safe we attach all the buffers we grab to the scrub transaction so + * they'll all be freed when we cancel it. + */ +int +xfs_scrub_ag_read_headers( + struct xfs_scrub_context *sc, + xfs_agnumber_t agno, + struct xfs_buf **agi, + struct xfs_buf **agf, + struct xfs_buf **agfl) +{ + struct xfs_mount *mp = sc->mp; + int error; + + error = xfs_ialloc_read_agi(mp, sc->tp, agno, agi); + if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) + goto out; + + error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, agf); + if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) + goto out; + + error = xfs_alloc_read_agfl(mp, sc->tp, agno, agfl); + if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) + goto out; + +out: + return error; +} + +/* Release all the AG btree cursors. */ +void +xfs_scrub_ag_btcur_free( + struct xfs_scrub_ag *sa) +{ + if (sa->refc_cur) + xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR); + if (sa->rmap_cur) + xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR); + if (sa->fino_cur) + xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR); + if (sa->ino_cur) + xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR); + if (sa->cnt_cur) + xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR); + if (sa->bno_cur) + xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR); + + sa->refc_cur = NULL; + sa->rmap_cur = NULL; + sa->fino_cur = NULL; + sa->ino_cur = NULL; + sa->bno_cur = NULL; + sa->cnt_cur = NULL; +} + +/* Initialize all the btree cursors for an AG. */ +int +xfs_scrub_ag_btcur_init( + struct xfs_scrub_context *sc, + struct xfs_scrub_ag *sa) +{ + struct xfs_mount *mp = sc->mp; + xfs_agnumber_t agno = sa->agno; + + if (sa->agf_bp) { + /* Set up a bnobt cursor for cross-referencing. */ + sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, + agno, XFS_BTNUM_BNO); + if (!sa->bno_cur) + goto err; + + /* Set up a cntbt cursor for cross-referencing. */ + sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, + agno, XFS_BTNUM_CNT); + if (!sa->cnt_cur) + goto err; + } + + /* Set up a inobt cursor for cross-referencing. */ + if (sa->agi_bp) { + sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, + agno, XFS_BTNUM_INO); + if (!sa->ino_cur) + goto err; + } + + /* Set up a finobt cursor for cross-referencing. */ + if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb)) { + sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, + agno, XFS_BTNUM_FINO); + if (!sa->fino_cur) + goto err; + } + + /* Set up a rmapbt cursor for cross-referencing. */ + if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb)) { + sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, + agno); + if (!sa->rmap_cur) + goto err; + } + + /* Set up a refcountbt cursor for cross-referencing. */ + if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb)) { + sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, + sa->agf_bp, agno, NULL); + if (!sa->refc_cur) + goto err; + } + + return 0; +err: + return -ENOMEM; +} + +/* Release the AG header context and btree cursors. */ +void +xfs_scrub_ag_free( + struct xfs_scrub_context *sc, + struct xfs_scrub_ag *sa) +{ + xfs_scrub_ag_btcur_free(sa); + if (sa->agfl_bp) { + xfs_trans_brelse(sc->tp, sa->agfl_bp); + sa->agfl_bp = NULL; + } + if (sa->agf_bp) { + xfs_trans_brelse(sc->tp, sa->agf_bp); + sa->agf_bp = NULL; + } + if (sa->agi_bp) { + xfs_trans_brelse(sc->tp, sa->agi_bp); + sa->agi_bp = NULL; + } + sa->agno = NULLAGNUMBER; +} + +/* + * For scrub, grab the AGI and the AGF headers, in that order. Locking + * order requires us to get the AGI before the AGF. We use the + * transaction to avoid deadlocking on crosslinked metadata buffers; + * either the caller passes one in (bmap scrub) or we have to create a + * transaction ourselves. + */ +int +xfs_scrub_ag_init( + struct xfs_scrub_context *sc, + xfs_agnumber_t agno, + struct xfs_scrub_ag *sa) +{ + int error; + + sa->agno = agno; + error = xfs_scrub_ag_read_headers(sc, agno, &sa->agi_bp, + &sa->agf_bp, &sa->agfl_bp); + if (error) + return error; + + return xfs_scrub_ag_btcur_init(sc, sa); +} + +/* Per-scrubber setup functions */ + +/* Set us up with a transaction and an empty context. */ +int +xfs_scrub_setup_fs( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp); +} + +/* Set us up with AG headers and btree cursors. */ +int +xfs_scrub_setup_ag_btree( + struct xfs_scrub_context *sc, + struct xfs_inode *ip, + bool force_log) +{ + struct xfs_mount *mp = sc->mp; + int error; + + /* + * If the caller asks us to checkpont the log, do so. This + * expensive operation should be performed infrequently and only + * as a last resort. Any caller that sets force_log should + * document why they need to do so. + */ + if (force_log) { + error = xfs_scrub_checkpoint_log(mp); + if (error) + return error; + } + + error = xfs_scrub_setup_ag_header(sc, ip); + if (error) + return error; + + return xfs_scrub_ag_init(sc, sc->sm->sm_agno, &sc->sa); +} + +/* Push everything out of the log onto disk. */ +int +xfs_scrub_checkpoint_log( + struct xfs_mount *mp) +{ + int error; + + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + if (error) + return error; + xfs_ail_push_all_sync(mp->m_ail); + return 0; +} + +/* + * Given an inode and the scrub control structure, grab either the + * inode referenced in the control structure or the inode passed in. + * The inode is not locked. + */ +int +xfs_scrub_get_inode( + struct xfs_scrub_context *sc, + struct xfs_inode *ip_in) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = NULL; + int error; + + /* + * If userspace passed us an AG number or a generation number + * without an inode number, they haven't got a clue so bail out + * immediately. + */ + if (sc->sm->sm_agno || (sc->sm->sm_gen && !sc->sm->sm_ino)) + return -EINVAL; + + /* We want to scan the inode we already had opened. */ + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { + sc->ip = ip_in; + return 0; + } + + /* Look up the inode, see if the generation number matches. */ + if (xfs_internal_inum(mp, sc->sm->sm_ino)) + return -ENOENT; + error = xfs_iget(mp, NULL, sc->sm->sm_ino, + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip); + if (error == -ENOENT || error == -EINVAL) { + /* inode doesn't exist... */ + return -ENOENT; + } else if (error) { + trace_xfs_scrub_op_error(sc, + XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), + XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), + error, __return_address); + return error; + } + if (VFS_I(ip)->i_generation != sc->sm->sm_gen) { + iput(VFS_I(ip)); + return -ENOENT; + } + + sc->ip = ip; + return 0; +} + +/* Set us up to scrub a file's contents. */ +int +xfs_scrub_setup_inode_contents( + struct xfs_scrub_context *sc, + struct xfs_inode *ip, + unsigned int resblks) +{ + struct xfs_mount *mp = sc->mp; + int error; + + error = xfs_scrub_get_inode(sc, ip); + if (error) + return error; + + /* Got the inode, lock it and we're ready to go. */ + sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + if (error) + goto out; + sc->ilock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + +out: + /* scrub teardown will unlock and release the inode for us */ + return error; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h new file mode 100644 index 000000000000..5c043855570e --- /dev/null +++ b/fs/xfs/scrub/common.h @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_SCRUB_COMMON_H__ +#define __XFS_SCRUB_COMMON_H__ + +/* + * We /could/ terminate a scrub/repair operation early. If we're not + * in a good place to continue (fatal signal, etc.) then bail out. + * Note that we're careful not to make any judgements about *error. + */ +static inline bool +xfs_scrub_should_terminate( + struct xfs_scrub_context *sc, + int *error) +{ + if (fatal_signal_pending(current)) { + if (*error == 0) + *error = -EAGAIN; + return true; + } + return false; +} + +/* + * Grab an empty transaction so that we can re-grab locked buffers if + * one of our btrees turns out to be cyclic. + */ +static inline int +xfs_scrub_trans_alloc( + struct xfs_scrub_metadata *sm, + struct xfs_mount *mp, + struct xfs_trans **tpp) +{ + return xfs_trans_alloc_empty(mp, tpp); +} + +bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno, + xfs_agblock_t bno, int *error); +bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork, + xfs_fileoff_t offset, int *error); + +void xfs_scrub_block_set_preen(struct xfs_scrub_context *sc, + struct xfs_buf *bp); +void xfs_scrub_ino_set_preen(struct xfs_scrub_context *sc, xfs_ino_t ino, + struct xfs_buf *bp); + +void xfs_scrub_block_set_corrupt(struct xfs_scrub_context *sc, + struct xfs_buf *bp); +void xfs_scrub_ino_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino, + struct xfs_buf *bp); +void xfs_scrub_fblock_set_corrupt(struct xfs_scrub_context *sc, int whichfork, + xfs_fileoff_t offset); + +void xfs_scrub_ino_set_warning(struct xfs_scrub_context *sc, xfs_ino_t ino, + struct xfs_buf *bp); +void xfs_scrub_fblock_set_warning(struct xfs_scrub_context *sc, int whichfork, + xfs_fileoff_t offset); + +void xfs_scrub_set_incomplete(struct xfs_scrub_context *sc); +int xfs_scrub_checkpoint_log(struct xfs_mount *mp); + +/* Setup functions */ +int xfs_scrub_setup_fs(struct xfs_scrub_context *sc, struct xfs_inode *ip); +int xfs_scrub_setup_ag_header(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_ag_allocbt(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_ag_iallocbt(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_ag_rmapbt(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_ag_refcountbt(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_inode(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_inode_bmap(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_inode_bmap_data(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_directory(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_xattr(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_symlink(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_parent(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +#ifdef CONFIG_XFS_RT +int xfs_scrub_setup_rt(struct xfs_scrub_context *sc, struct xfs_inode *ip); +#else +static inline int +xfs_scrub_setup_rt(struct xfs_scrub_context *sc, struct xfs_inode *ip) +{ + return -ENOENT; +} +#endif +#ifdef CONFIG_XFS_QUOTA +int xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip); +#else +static inline int +xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip) +{ + return -ENOENT; +} +#endif + +void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa); +int xfs_scrub_ag_init(struct xfs_scrub_context *sc, xfs_agnumber_t agno, + struct xfs_scrub_ag *sa); +int xfs_scrub_ag_read_headers(struct xfs_scrub_context *sc, xfs_agnumber_t agno, + struct xfs_buf **agi, struct xfs_buf **agf, + struct xfs_buf **agfl); +void xfs_scrub_ag_btcur_free(struct xfs_scrub_ag *sa); +int xfs_scrub_ag_btcur_init(struct xfs_scrub_context *sc, + struct xfs_scrub_ag *sa); +int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc, + int (*fn)(struct xfs_scrub_context *, xfs_agblock_t bno, + void *), + void *priv); + +int xfs_scrub_setup_ag_btree(struct xfs_scrub_context *sc, + struct xfs_inode *ip, bool force_log); +int xfs_scrub_get_inode(struct xfs_scrub_context *sc, struct xfs_inode *ip_in); +int xfs_scrub_setup_inode_contents(struct xfs_scrub_context *sc, + struct xfs_inode *ip, unsigned int resblks); + +#endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c new file mode 100644 index 000000000000..d94edd93cba8 --- /dev/null +++ b/fs/xfs/scrub/dabtree.c @@ -0,0 +1,591 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr_leaf.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/dabtree.h" + +/* Directory/Attribute Btree */ + +/* + * Check for da btree operation errors. See the section about handling + * operational errors in common.c. + */ +bool +xfs_scrub_da_process_error( + struct xfs_scrub_da_btree *ds, + int level, + int *error) +{ + struct xfs_scrub_context *sc = ds->sc; + + if (*error == 0) + return true; + + switch (*error) { + case -EDEADLOCK: + /* Used to restart an op with deadlock avoidance. */ + trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error); + break; + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Note the badness but don't abort. */ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + *error = 0; + /* fall through */ + default: + trace_xfs_scrub_file_op_error(sc, ds->dargs.whichfork, + xfs_dir2_da_to_db(ds->dargs.geo, + ds->state->path.blk[level].blkno), + *error, __return_address); + break; + } + return false; +} + +/* + * Check for da btree corruption. See the section about handling + * operational errors in common.c. + */ +void +xfs_scrub_da_set_corrupt( + struct xfs_scrub_da_btree *ds, + int level) +{ + struct xfs_scrub_context *sc = ds->sc; + + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + + trace_xfs_scrub_fblock_error(sc, ds->dargs.whichfork, + xfs_dir2_da_to_db(ds->dargs.geo, + ds->state->path.blk[level].blkno), + __return_address); +} + +/* Find an entry at a certain level in a da btree. */ +STATIC void * +xfs_scrub_da_btree_entry( + struct xfs_scrub_da_btree *ds, + int level, + int rec) +{ + char *ents; + struct xfs_da_state_blk *blk; + void *baddr; + + /* Dispatch the entry finding function. */ + blk = &ds->state->path.blk[level]; + baddr = blk->bp->b_addr; + switch (blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + ents = (char *)xfs_attr3_leaf_entryp(baddr); + return ents + (rec * sizeof(struct xfs_attr_leaf_entry)); + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr); + return ents + (rec * sizeof(struct xfs_dir2_leaf_entry)); + case XFS_DIR2_LEAF1_MAGIC: + case XFS_DIR3_LEAF1_MAGIC: + ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr); + return ents + (rec * sizeof(struct xfs_dir2_leaf_entry)); + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + ents = (char *)ds->dargs.dp->d_ops->node_tree_p(baddr); + return ents + (rec * sizeof(struct xfs_da_node_entry)); + } + + return NULL; +} + +/* Scrub a da btree hash (key). */ +int +xfs_scrub_da_btree_hash( + struct xfs_scrub_da_btree *ds, + int level, + __be32 *hashp) +{ + struct xfs_da_state_blk *blks; + struct xfs_da_node_entry *entry; + xfs_dahash_t hash; + xfs_dahash_t parent_hash; + + /* Is this hash in order? */ + hash = be32_to_cpu(*hashp); + if (hash < ds->hashes[level]) + xfs_scrub_da_set_corrupt(ds, level); + ds->hashes[level] = hash; + + if (level == 0) + return 0; + + /* Is this hash no larger than the parent hash? */ + blks = ds->state->path.blk; + entry = xfs_scrub_da_btree_entry(ds, level - 1, blks[level - 1].index); + parent_hash = be32_to_cpu(entry->hashval); + if (parent_hash < hash) + xfs_scrub_da_set_corrupt(ds, level); + + return 0; +} + +/* + * Check a da btree pointer. Returns true if it's ok to use this + * pointer. + */ +STATIC bool +xfs_scrub_da_btree_ptr_ok( + struct xfs_scrub_da_btree *ds, + int level, + xfs_dablk_t blkno) +{ + if (blkno < ds->lowest || (ds->highest != 0 && blkno >= ds->highest)) { + xfs_scrub_da_set_corrupt(ds, level); + return false; + } + + return true; +} + +/* + * The da btree scrubber can handle leaf1 blocks as a degenerate + * form of leafn blocks. Since the regular da code doesn't handle + * leaf1, we must multiplex the verifiers. + */ +static void +xfs_scrub_da_btree_read_verify( + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DIR2_LEAF1_MAGIC: + case XFS_DIR3_LEAF1_MAGIC: + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + bp->b_ops->verify_read(bp); + return; + default: + /* + * xfs_da3_node_buf_ops already know how to handle + * DA*_NODE, ATTR*_LEAF, and DIR*_LEAFN blocks. + */ + bp->b_ops = &xfs_da3_node_buf_ops; + bp->b_ops->verify_read(bp); + return; + } +} +static void +xfs_scrub_da_btree_write_verify( + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DIR2_LEAF1_MAGIC: + case XFS_DIR3_LEAF1_MAGIC: + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + bp->b_ops->verify_write(bp); + return; + default: + /* + * xfs_da3_node_buf_ops already know how to handle + * DA*_NODE, ATTR*_LEAF, and DIR*_LEAFN blocks. + */ + bp->b_ops = &xfs_da3_node_buf_ops; + bp->b_ops->verify_write(bp); + return; + } +} + +static const struct xfs_buf_ops xfs_scrub_da_btree_buf_ops = { + .name = "xfs_scrub_da_btree", + .verify_read = xfs_scrub_da_btree_read_verify, + .verify_write = xfs_scrub_da_btree_write_verify, +}; + +/* Check a block's sibling. */ +STATIC int +xfs_scrub_da_btree_block_check_sibling( + struct xfs_scrub_da_btree *ds, + int level, + int direction, + xfs_dablk_t sibling) +{ + int retval; + int error; + + memcpy(&ds->state->altpath, &ds->state->path, + sizeof(ds->state->altpath)); + + /* + * If the pointer is null, we shouldn't be able to move the upper + * level pointer anywhere. + */ + if (sibling == 0) { + error = xfs_da3_path_shift(ds->state, &ds->state->altpath, + direction, false, &retval); + if (error == 0 && retval == 0) + xfs_scrub_da_set_corrupt(ds, level); + error = 0; + goto out; + } + + /* Move the alternate cursor one block in the direction given. */ + error = xfs_da3_path_shift(ds->state, &ds->state->altpath, + direction, false, &retval); + if (!xfs_scrub_da_process_error(ds, level, &error)) + return error; + if (retval) { + xfs_scrub_da_set_corrupt(ds, level); + return error; + } + + /* Compare upper level pointer to sibling pointer. */ + if (ds->state->altpath.blk[level].blkno != sibling) + xfs_scrub_da_set_corrupt(ds, level); + xfs_trans_brelse(ds->dargs.trans, ds->state->altpath.blk[level].bp); +out: + return error; +} + +/* Check a block's sibling pointers. */ +STATIC int +xfs_scrub_da_btree_block_check_siblings( + struct xfs_scrub_da_btree *ds, + int level, + struct xfs_da_blkinfo *hdr) +{ + xfs_dablk_t forw; + xfs_dablk_t back; + int error = 0; + + forw = be32_to_cpu(hdr->forw); + back = be32_to_cpu(hdr->back); + + /* Top level blocks should not have sibling pointers. */ + if (level == 0) { + if (forw != 0 || back != 0) + xfs_scrub_da_set_corrupt(ds, level); + return 0; + } + + /* + * Check back (left) and forw (right) pointers. These functions + * absorb error codes for us. + */ + error = xfs_scrub_da_btree_block_check_sibling(ds, level, 0, back); + if (error) + goto out; + error = xfs_scrub_da_btree_block_check_sibling(ds, level, 1, forw); + +out: + memset(&ds->state->altpath, 0, sizeof(ds->state->altpath)); + return error; +} + +/* Load a dir/attribute block from a btree. */ +STATIC int +xfs_scrub_da_btree_block( + struct xfs_scrub_da_btree *ds, + int level, + xfs_dablk_t blkno) +{ + struct xfs_da_state_blk *blk; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_blkinfo *hdr3; + struct xfs_da_args *dargs = &ds->dargs; + struct xfs_inode *ip = ds->dargs.dp; + xfs_ino_t owner; + int *pmaxrecs; + struct xfs_da3_icnode_hdr nodehdr; + int error = 0; + + blk = &ds->state->path.blk[level]; + ds->state->path.active = level + 1; + + /* Release old block. */ + if (blk->bp) { + xfs_trans_brelse(dargs->trans, blk->bp); + blk->bp = NULL; + } + + /* Check the pointer. */ + blk->blkno = blkno; + if (!xfs_scrub_da_btree_ptr_ok(ds, level, blkno)) + goto out_nobuf; + + /* Read the buffer. */ + error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno, -2, + &blk->bp, dargs->whichfork, + &xfs_scrub_da_btree_buf_ops); + if (!xfs_scrub_da_process_error(ds, level, &error)) + goto out_nobuf; + + /* + * We didn't find a dir btree root block, which means that + * there's no LEAF1/LEAFN tree (at least not where it's supposed + * to be), so jump out now. + */ + if (ds->dargs.whichfork == XFS_DATA_FORK && level == 0 && + blk->bp == NULL) + goto out_nobuf; + + /* It's /not/ ok for attr trees not to have a da btree. */ + if (blk->bp == NULL) { + xfs_scrub_da_set_corrupt(ds, level); + goto out_nobuf; + } + + hdr3 = blk->bp->b_addr; + blk->magic = be16_to_cpu(hdr3->hdr.magic); + pmaxrecs = &ds->maxrecs[level]; + + /* We only started zeroing the header on v5 filesystems. */ + if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb) && hdr3->hdr.pad) + xfs_scrub_da_set_corrupt(ds, level); + + /* Check the owner. */ + if (xfs_sb_version_hascrc(&ip->i_mount->m_sb)) { + owner = be64_to_cpu(hdr3->owner); + if (owner != ip->i_ino) + xfs_scrub_da_set_corrupt(ds, level); + } + + /* Check the siblings. */ + error = xfs_scrub_da_btree_block_check_siblings(ds, level, &hdr3->hdr); + if (error) + goto out; + + /* Interpret the buffer. */ + switch (blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + xfs_trans_buf_set_type(dargs->trans, blk->bp, + XFS_BLFT_ATTR_LEAF_BUF); + blk->magic = XFS_ATTR_LEAF_MAGIC; + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, pmaxrecs); + if (ds->tree_level != 0) + xfs_scrub_da_set_corrupt(ds, level); + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + xfs_trans_buf_set_type(dargs->trans, blk->bp, + XFS_BLFT_DIR_LEAFN_BUF); + blk->magic = XFS_DIR2_LEAFN_MAGIC; + blk->hashval = xfs_dir2_leaf_lasthash(ip, blk->bp, pmaxrecs); + if (ds->tree_level != 0) + xfs_scrub_da_set_corrupt(ds, level); + break; + case XFS_DIR2_LEAF1_MAGIC: + case XFS_DIR3_LEAF1_MAGIC: + xfs_trans_buf_set_type(dargs->trans, blk->bp, + XFS_BLFT_DIR_LEAF1_BUF); + blk->magic = XFS_DIR2_LEAF1_MAGIC; + blk->hashval = xfs_dir2_leaf_lasthash(ip, blk->bp, pmaxrecs); + if (ds->tree_level != 0) + xfs_scrub_da_set_corrupt(ds, level); + break; + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + xfs_trans_buf_set_type(dargs->trans, blk->bp, + XFS_BLFT_DA_NODE_BUF); + blk->magic = XFS_DA_NODE_MAGIC; + node = blk->bp->b_addr; + ip->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = ip->d_ops->node_tree_p(node); + *pmaxrecs = nodehdr.count; + blk->hashval = be32_to_cpu(btree[*pmaxrecs - 1].hashval); + if (level == 0) { + if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) { + xfs_scrub_da_set_corrupt(ds, level); + goto out_freebp; + } + ds->tree_level = nodehdr.level; + } else { + if (ds->tree_level != nodehdr.level) { + xfs_scrub_da_set_corrupt(ds, level); + goto out_freebp; + } + } + + /* XXX: Check hdr3.pad32 once we know how to fix it. */ + break; + default: + xfs_scrub_da_set_corrupt(ds, level); + goto out_freebp; + } + +out: + return error; +out_freebp: + xfs_trans_brelse(dargs->trans, blk->bp); + blk->bp = NULL; +out_nobuf: + blk->blkno = 0; + return error; +} + +/* Visit all nodes and leaves of a da btree. */ +int +xfs_scrub_da_btree( + struct xfs_scrub_context *sc, + int whichfork, + xfs_scrub_da_btree_rec_fn scrub_fn, + void *private) +{ + struct xfs_scrub_da_btree ds = {}; + struct xfs_mount *mp = sc->mp; + struct xfs_da_state_blk *blks; + struct xfs_da_node_entry *key; + void *rec; + xfs_dablk_t blkno; + int level; + int error; + + /* Skip short format data structures; no btree to scan. */ + if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE) + return 0; + + /* Set up initial da state. */ + ds.dargs.dp = sc->ip; + ds.dargs.whichfork = whichfork; + ds.dargs.trans = sc->tp; + ds.dargs.op_flags = XFS_DA_OP_OKNOENT; + ds.state = xfs_da_state_alloc(); + ds.state->args = &ds.dargs; + ds.state->mp = mp; + ds.sc = sc; + ds.private = private; + if (whichfork == XFS_ATTR_FORK) { + ds.dargs.geo = mp->m_attr_geo; + ds.lowest = 0; + ds.highest = 0; + } else { + ds.dargs.geo = mp->m_dir_geo; + ds.lowest = ds.dargs.geo->leafblk; + ds.highest = ds.dargs.geo->freeblk; + } + blkno = ds.lowest; + level = 0; + + /* Find the root of the da tree, if present. */ + blks = ds.state->path.blk; + error = xfs_scrub_da_btree_block(&ds, level, blkno); + if (error) + goto out_state; + /* + * We didn't find a block at ds.lowest, which means that there's + * no LEAF1/LEAFN tree (at least not where it's supposed to be), + * so jump out now. + */ + if (blks[level].bp == NULL) + goto out_state; + + blks[level].index = 0; + while (level >= 0 && level < XFS_DA_NODE_MAXDEPTH) { + /* Handle leaf block. */ + if (blks[level].magic != XFS_DA_NODE_MAGIC) { + /* End of leaf, pop back towards the root. */ + if (blks[level].index >= ds.maxrecs[level]) { + if (level > 0) + blks[level - 1].index++; + ds.tree_level++; + level--; + continue; + } + + /* Dispatch record scrubbing. */ + rec = xfs_scrub_da_btree_entry(&ds, level, + blks[level].index); + error = scrub_fn(&ds, level, rec); + if (error) + break; + if (xfs_scrub_should_terminate(sc, &error) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + break; + + blks[level].index++; + continue; + } + + + /* End of node, pop back towards the root. */ + if (blks[level].index >= ds.maxrecs[level]) { + if (level > 0) + blks[level - 1].index++; + ds.tree_level++; + level--; + continue; + } + + /* Hashes in order for scrub? */ + key = xfs_scrub_da_btree_entry(&ds, level, blks[level].index); + error = xfs_scrub_da_btree_hash(&ds, level, &key->hashval); + if (error) + goto out; + + /* Drill another level deeper. */ + blkno = be32_to_cpu(key->before); + level++; + ds.tree_level--; + error = xfs_scrub_da_btree_block(&ds, level, blkno); + if (error) + goto out; + if (blks[level].bp == NULL) + goto out; + + blks[level].index = 0; + } + +out: + /* Release all the buffers we're tracking. */ + for (level = 0; level < XFS_DA_NODE_MAXDEPTH; level++) { + if (blks[level].bp == NULL) + continue; + xfs_trans_brelse(sc->tp, blks[level].bp); + blks[level].bp = NULL; + } + +out_state: + xfs_da_state_free(ds.state); + return error; +} diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h new file mode 100644 index 000000000000..d31468d68cef --- /dev/null +++ b/fs/xfs/scrub/dabtree.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_SCRUB_DABTREE_H__ +#define __XFS_SCRUB_DABTREE_H__ + +/* dir/attr btree */ + +struct xfs_scrub_da_btree { + struct xfs_da_args dargs; + xfs_dahash_t hashes[XFS_DA_NODE_MAXDEPTH]; + int maxrecs[XFS_DA_NODE_MAXDEPTH]; + struct xfs_da_state *state; + struct xfs_scrub_context *sc; + void *private; + + /* + * Lowest and highest directory block address in which we expect + * to find dir/attr btree node blocks. For a directory this + * (presumably) means between LEAF_OFFSET and FREE_OFFSET; for + * attributes there is no limit. + */ + xfs_dablk_t lowest; + xfs_dablk_t highest; + + int tree_level; +}; + +typedef int (*xfs_scrub_da_btree_rec_fn)(struct xfs_scrub_da_btree *ds, + int level, void *rec); + +/* Check for da btree operation errors. */ +bool xfs_scrub_da_process_error(struct xfs_scrub_da_btree *ds, int level, int *error); + +/* Check for da btree corruption. */ +void xfs_scrub_da_set_corrupt(struct xfs_scrub_da_btree *ds, int level); + +int xfs_scrub_da_btree_hash(struct xfs_scrub_da_btree *ds, int level, + __be32 *hashp); +int xfs_scrub_da_btree(struct xfs_scrub_context *sc, int whichfork, + xfs_scrub_da_btree_rec_fn scrub_fn, void *private); + +#endif /* __XFS_SCRUB_DABTREE_H__ */ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c new file mode 100644 index 000000000000..69e1efdd4019 --- /dev/null +++ b/fs/xfs/scrub/dir.c @@ -0,0 +1,816 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_itable.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ialloc.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/dabtree.h" + +/* Set us up to scrub directories. */ +int +xfs_scrub_setup_directory( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_setup_inode_contents(sc, ip, 0); +} + +/* Directories */ + +/* Scrub a directory entry. */ + +struct xfs_scrub_dir_ctx { + /* VFS fill-directory iterator */ + struct dir_context dir_iter; + + struct xfs_scrub_context *sc; +}; + +/* Check that an inode's mode matches a given DT_ type. */ +STATIC int +xfs_scrub_dir_check_ftype( + struct xfs_scrub_dir_ctx *sdc, + xfs_fileoff_t offset, + xfs_ino_t inum, + int dtype) +{ + struct xfs_mount *mp = sdc->sc->mp; + struct xfs_inode *ip; + int ino_dtype; + int error = 0; + + if (!xfs_sb_version_hasftype(&mp->m_sb)) { + if (dtype != DT_UNKNOWN && dtype != DT_DIR) + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, + offset); + goto out; + } + + /* + * Grab the inode pointed to by the dirent. We release the + * inode before we cancel the scrub transaction. Since we're + * don't know a priori that releasing the inode won't trigger + * eofblocks cleanup (which allocates what would be a nested + * transaction), we can't use DONTCACHE here because DONTCACHE + * inodes can trigger immediate inactive cleanup of the inode. + */ + error = xfs_iget(mp, sdc->sc->tp, inum, 0, 0, &ip); + if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, + &error)) + goto out; + + /* Convert mode to the DT_* values that dir_emit uses. */ + ino_dtype = xfs_dir3_get_dtype(mp, + xfs_mode_to_ftype(VFS_I(ip)->i_mode)); + if (ino_dtype != dtype) + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); + iput(VFS_I(ip)); +out: + return error; +} + +/* + * Scrub a single directory entry. + * + * We use the VFS directory iterator (i.e. readdir) to call this + * function for every directory entry in a directory. Once we're here, + * we check the inode number to make sure it's sane, then we check that + * we can look up this filename. Finally, we check the ftype. + */ +STATIC int +xfs_scrub_dir_actor( + struct dir_context *dir_iter, + const char *name, + int namelen, + loff_t pos, + u64 ino, + unsigned type) +{ + struct xfs_mount *mp; + struct xfs_inode *ip; + struct xfs_scrub_dir_ctx *sdc; + struct xfs_name xname; + xfs_ino_t lookup_ino; + xfs_dablk_t offset; + int error = 0; + + sdc = container_of(dir_iter, struct xfs_scrub_dir_ctx, dir_iter); + ip = sdc->sc->ip; + mp = ip->i_mount; + offset = xfs_dir2_db_to_da(mp->m_dir_geo, + xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos)); + + /* Does this inode number make sense? */ + if (!xfs_verify_dir_ino(mp, ino)) { + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); + goto out; + } + + if (!strncmp(".", name, namelen)) { + /* If this is "." then check that the inum matches the dir. */ + if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR) + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, + offset); + if (ino != ip->i_ino) + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, + offset); + } else if (!strncmp("..", name, namelen)) { + /* + * If this is ".." in the root inode, check that the inum + * matches this dir. + */ + if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR) + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, + offset); + if (ip->i_ino == mp->m_sb.sb_rootino && ino != ip->i_ino) + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, + offset); + } + + /* Verify that we can look up this name by hash. */ + xname.name = name; + xname.len = namelen; + xname.type = XFS_DIR3_FT_UNKNOWN; + + error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL); + if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, + &error)) + goto fail_xref; + if (lookup_ino != ino) { + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); + goto out; + } + + /* Verify the file type. This function absorbs error codes. */ + error = xfs_scrub_dir_check_ftype(sdc, offset, lookup_ino, type); + if (error) + goto out; +out: + return error; +fail_xref: + return error; +} + +/* Scrub a directory btree record. */ +STATIC int +xfs_scrub_dir_rec( + struct xfs_scrub_da_btree *ds, + int level, + void *rec) +{ + struct xfs_mount *mp = ds->state->mp; + struct xfs_dir2_leaf_entry *ent = rec; + struct xfs_inode *dp = ds->dargs.dp; + struct xfs_dir2_data_entry *dent; + struct xfs_buf *bp; + xfs_ino_t ino; + xfs_dablk_t rec_bno; + xfs_dir2_db_t db; + xfs_dir2_data_aoff_t off; + xfs_dir2_dataptr_t ptr; + xfs_dahash_t calc_hash; + xfs_dahash_t hash; + unsigned int tag; + int error; + + /* Check the hash of the entry. */ + error = xfs_scrub_da_btree_hash(ds, level, &ent->hashval); + if (error) + goto out; + + /* Valid hash pointer? */ + ptr = be32_to_cpu(ent->address); + if (ptr == 0) + return 0; + + /* Find the directory entry's location. */ + db = xfs_dir2_dataptr_to_db(mp->m_dir_geo, ptr); + off = xfs_dir2_dataptr_to_off(mp->m_dir_geo, ptr); + rec_bno = xfs_dir2_db_to_da(mp->m_dir_geo, db); + + if (rec_bno >= mp->m_dir_geo->leafblk) { + xfs_scrub_da_set_corrupt(ds, level); + goto out; + } + error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, -2, &bp); + if (!xfs_scrub_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno, + &error)) + goto out; + if (!bp) { + xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + goto out; + } + + /* Retrieve the entry, sanity check it, and compare hashes. */ + dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off); + ino = be64_to_cpu(dent->inumber); + hash = be32_to_cpu(ent->hashval); + tag = be16_to_cpup(dp->d_ops->data_entry_tag_p(dent)); + if (!xfs_verify_dir_ino(mp, ino) || tag != off) + xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + if (dent->namelen == 0) { + xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + goto out_relse; + } + calc_hash = xfs_da_hashname(dent->name, dent->namelen); + if (calc_hash != hash) + xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + +out_relse: + xfs_trans_brelse(ds->dargs.trans, bp); +out: + return error; +} + +/* + * Is this unused entry either in the bestfree or smaller than all of + * them? We've already checked that the bestfrees are sorted longest to + * shortest, and that there aren't any bogus entries. + */ +STATIC void +xfs_scrub_directory_check_free_entry( + struct xfs_scrub_context *sc, + xfs_dablk_t lblk, + struct xfs_dir2_data_free *bf, + struct xfs_dir2_data_unused *dup) +{ + struct xfs_dir2_data_free *dfp; + unsigned int dup_length; + + dup_length = be16_to_cpu(dup->length); + + /* Unused entry is shorter than any of the bestfrees */ + if (dup_length < be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length)) + return; + + for (dfp = &bf[XFS_DIR2_DATA_FD_COUNT - 1]; dfp >= bf; dfp--) + if (dup_length == be16_to_cpu(dfp->length)) + return; + + /* Unused entry should be in the bestfrees but wasn't found. */ + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); +} + +/* Check free space info in a directory data block. */ +STATIC int +xfs_scrub_directory_data_bestfree( + struct xfs_scrub_context *sc, + xfs_dablk_t lblk, + bool is_block) +{ + struct xfs_dir2_data_unused *dup; + struct xfs_dir2_data_free *dfp; + struct xfs_buf *bp; + struct xfs_dir2_data_free *bf; + struct xfs_mount *mp = sc->mp; + const struct xfs_dir_ops *d_ops; + char *ptr; + char *endptr; + u16 tag; + unsigned int nr_bestfrees = 0; + unsigned int nr_frees = 0; + unsigned int smallest_bestfree; + int newlen; + int offset; + int error; + + d_ops = sc->ip->d_ops; + + if (is_block) { + /* dir block format */ + if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + error = xfs_dir3_block_read(sc->tp, sc->ip, &bp); + } else { + /* dir data format */ + error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, -1, &bp); + } + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) + goto out; + + /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */ + + /* Do the bestfrees correspond to actual free space? */ + bf = d_ops->data_bestfree_p(bp->b_addr); + smallest_bestfree = UINT_MAX; + for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { + offset = be16_to_cpu(dfp->offset); + if (offset == 0) + continue; + if (offset >= mp->m_dir_geo->blksize) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } + dup = (struct xfs_dir2_data_unused *)(bp->b_addr + offset); + tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)); + + /* bestfree doesn't match the entry it points at? */ + if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG) || + be16_to_cpu(dup->length) != be16_to_cpu(dfp->length) || + tag != ((char *)dup - (char *)bp->b_addr)) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } + + /* bestfree records should be ordered largest to smallest */ + if (smallest_bestfree < be16_to_cpu(dfp->length)) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } + + smallest_bestfree = be16_to_cpu(dfp->length); + nr_bestfrees++; + } + + /* Make sure the bestfrees are actually the best free spaces. */ + ptr = (char *)d_ops->data_entry_p(bp->b_addr); + if (is_block) { + struct xfs_dir2_block_tail *btp; + + btp = xfs_dir2_block_tail_p(mp->m_dir_geo, bp->b_addr); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + } else + endptr = (char *)bp->b_addr + BBTOB(bp->b_length); + + /* Iterate the entries, stopping when we hit or go past the end. */ + while (ptr < endptr) { + dup = (struct xfs_dir2_data_unused *)ptr; + /* Skip real entries */ + if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG)) { + struct xfs_dir2_data_entry *dep; + + dep = (struct xfs_dir2_data_entry *)ptr; + newlen = d_ops->data_entsize(dep->namelen); + if (newlen <= 0) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, + lblk); + goto out_buf; + } + ptr += newlen; + continue; + } + + /* Spot check this free entry */ + tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)); + if (tag != ((char *)dup - (char *)bp->b_addr)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + + /* + * Either this entry is a bestfree or it's smaller than + * any of the bestfrees. + */ + xfs_scrub_directory_check_free_entry(sc, lblk, bf, dup); + + /* Move on. */ + newlen = be16_to_cpu(dup->length); + if (newlen <= 0) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } + ptr += newlen; + if (ptr <= endptr) + nr_frees++; + } + + /* We're required to fill all the space. */ + if (ptr != endptr) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + + /* Did we see at least as many free slots as there are bestfrees? */ + if (nr_frees < nr_bestfrees) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); +out_buf: + xfs_trans_brelse(sc->tp, bp); +out: + return error; +} + +/* + * Does the free space length in the free space index block ($len) match + * the longest length in the directory data block's bestfree array? + * Assume that we've already checked that the data block's bestfree + * array is in order. + */ +STATIC void +xfs_scrub_directory_check_freesp( + struct xfs_scrub_context *sc, + xfs_dablk_t lblk, + struct xfs_buf *dbp, + unsigned int len) +{ + struct xfs_dir2_data_free *dfp; + + dfp = sc->ip->d_ops->data_bestfree_p(dbp->b_addr); + + if (len != be16_to_cpu(dfp->length)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + + if (len > 0 && be16_to_cpu(dfp->offset) == 0) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); +} + +/* Check free space info in a directory leaf1 block. */ +STATIC int +xfs_scrub_directory_leaf1_bestfree( + struct xfs_scrub_context *sc, + struct xfs_da_args *args, + xfs_dablk_t lblk) +{ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir2_leaf_tail *ltp; + struct xfs_dir2_leaf *leaf; + struct xfs_buf *dbp; + struct xfs_buf *bp; + const struct xfs_dir_ops *d_ops = sc->ip->d_ops; + struct xfs_da_geometry *geo = sc->mp->m_dir_geo; + __be16 *bestp; + __u16 best; + __u32 hash; + __u32 lasthash = 0; + __u32 bestcount; + unsigned int stale = 0; + int i; + int error; + + /* Read the free space block. */ + error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, -1, &bp); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) + goto out; + + leaf = bp->b_addr; + d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = d_ops->leaf_ents_p(leaf); + ltp = xfs_dir2_leaf_tail_p(geo, leaf); + bestcount = be32_to_cpu(ltp->bestcount); + bestp = xfs_dir2_leaf_bests_p(ltp); + + if (xfs_sb_version_hascrc(&sc->mp->m_sb)) { + struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; + + if (hdr3->pad != cpu_to_be32(0)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + } + + /* + * There should be as many bestfree slots as there are dir data + * blocks that can fit under i_size. + */ + if (bestcount != xfs_dir2_byte_to_db(geo, sc->ip->i_d.di_size)) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + + /* Is the leaf count even remotely sane? */ + if (leafhdr.count > d_ops->leaf_max_ents(geo)) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + + /* Leaves and bests don't overlap in leaf format. */ + if ((char *)&ents[leafhdr.count] > (char *)bestp) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + + /* Check hash value order, count stale entries. */ + for (i = 0; i < leafhdr.count; i++) { + hash = be32_to_cpu(ents[i].hashval); + if (i > 0 && lasthash > hash) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + lasthash = hash; + if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + stale++; + } + if (leafhdr.stale != stale) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + + /* Check all the bestfree entries. */ + for (i = 0; i < bestcount; i++, bestp++) { + best = be16_to_cpu(*bestp); + if (best == NULLDATAOFF) + continue; + error = xfs_dir3_data_read(sc->tp, sc->ip, + i * args->geo->fsbcount, -1, &dbp); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, + &error)) + continue; + xfs_scrub_directory_check_freesp(sc, lblk, dbp, best); + xfs_trans_brelse(sc->tp, dbp); + } +out: + return error; +} + +/* Check free space info in a directory freespace block. */ +STATIC int +xfs_scrub_directory_free_bestfree( + struct xfs_scrub_context *sc, + struct xfs_da_args *args, + xfs_dablk_t lblk) +{ + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_buf *dbp; + struct xfs_buf *bp; + __be16 *bestp; + __u16 best; + unsigned int stale = 0; + int i; + int error; + + /* Read the free space block */ + error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) + goto out; + + if (xfs_sb_version_hascrc(&sc->mp->m_sb)) { + struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; + + if (hdr3->pad != cpu_to_be32(0)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + } + + /* Check all the entries. */ + sc->ip->d_ops->free_hdr_from_disk(&freehdr, bp->b_addr); + bestp = sc->ip->d_ops->free_bests_p(bp->b_addr); + for (i = 0; i < freehdr.nvalid; i++, bestp++) { + best = be16_to_cpu(*bestp); + if (best == NULLDATAOFF) { + stale++; + continue; + } + error = xfs_dir3_data_read(sc->tp, sc->ip, + (freehdr.firstdb + i) * args->geo->fsbcount, + -1, &dbp); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, + &error)) + continue; + xfs_scrub_directory_check_freesp(sc, lblk, dbp, best); + xfs_trans_brelse(sc->tp, dbp); + } + + if (freehdr.nused + stale != freehdr.nvalid) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); +out: + return error; +} + +/* Check free space information in directories. */ +STATIC int +xfs_scrub_directory_blocks( + struct xfs_scrub_context *sc) +{ + struct xfs_bmbt_irec got; + struct xfs_da_args args; + struct xfs_ifork *ifp; + struct xfs_mount *mp = sc->mp; + xfs_fileoff_t leaf_lblk; + xfs_fileoff_t free_lblk; + xfs_fileoff_t lblk; + struct xfs_iext_cursor icur; + xfs_dablk_t dabno; + bool found; + int is_block = 0; + int error; + + /* Ignore local format directories. */ + if (sc->ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && + sc->ip->i_d.di_format != XFS_DINODE_FMT_BTREE) + return 0; + + ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); + lblk = XFS_B_TO_FSB(mp, XFS_DIR2_DATA_OFFSET); + leaf_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_LEAF_OFFSET); + free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); + + /* Is this a block dir? */ + args.dp = sc->ip; + args.geo = mp->m_dir_geo; + args.trans = sc->tp; + error = xfs_dir2_isblock(&args, &is_block); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) + goto out; + + /* Iterate all the data extents in the directory... */ + found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); + while (found) { + /* Block directories only have a single block at offset 0. */ + if (is_block && + (got.br_startoff > 0 || + got.br_blockcount != args.geo->fsbcount)) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, + got.br_startoff); + break; + } + + /* No more data blocks... */ + if (got.br_startoff >= leaf_lblk) + break; + + /* + * Check each data block's bestfree data. + * + * Iterate all the fsbcount-aligned block offsets in + * this directory. The directory block reading code is + * smart enough to do its own bmap lookups to handle + * discontiguous directory blocks. When we're done + * with the extent record, re-query the bmap at the + * next fsbcount-aligned offset to avoid redundant + * block checks. + */ + for (lblk = roundup((xfs_dablk_t)got.br_startoff, + args.geo->fsbcount); + lblk < got.br_startoff + got.br_blockcount; + lblk += args.geo->fsbcount) { + error = xfs_scrub_directory_data_bestfree(sc, lblk, + is_block); + if (error) + goto out; + } + dabno = got.br_startoff + got.br_blockcount; + lblk = roundup(dabno, args.geo->fsbcount); + found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); + } + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Look for a leaf1 block, which has free info. */ + if (xfs_iext_lookup_extent(sc->ip, ifp, leaf_lblk, &icur, &got) && + got.br_startoff == leaf_lblk && + got.br_blockcount == args.geo->fsbcount && + !xfs_iext_next_extent(ifp, &icur, &got)) { + if (is_block) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + error = xfs_scrub_directory_leaf1_bestfree(sc, &args, + leaf_lblk); + if (error) + goto out; + } + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Scan for free blocks */ + lblk = free_lblk; + found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); + while (found) { + /* + * Dirs can't have blocks mapped above 2^32. + * Single-block dirs shouldn't even be here. + */ + lblk = got.br_startoff; + if (lblk & ~0xFFFFFFFFULL) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + if (is_block) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + + /* + * Check each dir free block's bestfree data. + * + * Iterate all the fsbcount-aligned block offsets in + * this directory. The directory block reading code is + * smart enough to do its own bmap lookups to handle + * discontiguous directory blocks. When we're done + * with the extent record, re-query the bmap at the + * next fsbcount-aligned offset to avoid redundant + * block checks. + */ + for (lblk = roundup((xfs_dablk_t)got.br_startoff, + args.geo->fsbcount); + lblk < got.br_startoff + got.br_blockcount; + lblk += args.geo->fsbcount) { + error = xfs_scrub_directory_free_bestfree(sc, &args, + lblk); + if (error) + goto out; + } + dabno = got.br_startoff + got.br_blockcount; + lblk = roundup(dabno, args.geo->fsbcount); + found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); + } +out: + return error; +} + +/* Scrub a whole directory. */ +int +xfs_scrub_directory( + struct xfs_scrub_context *sc) +{ + struct xfs_scrub_dir_ctx sdc = { + .dir_iter.actor = xfs_scrub_dir_actor, + .dir_iter.pos = 0, + .sc = sc, + }; + size_t bufsize; + loff_t oldpos; + int error = 0; + + if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) + return -ENOENT; + + /* Plausible size? */ + if (sc->ip->i_d.di_size < xfs_dir2_sf_hdr_size(0)) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + goto out; + } + + /* Check directory tree structure */ + error = xfs_scrub_da_btree(sc, XFS_DATA_FORK, xfs_scrub_dir_rec, NULL); + if (error) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return error; + + /* Check the freespace. */ + error = xfs_scrub_directory_blocks(sc); + if (error) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return error; + + /* + * Check that every dirent we see can also be looked up by hash. + * Userspace usually asks for a 32k buffer, so we will too. + */ + bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, + sc->ip->i_d.di_size); + + /* + * Look up every name in this directory by hash. + * + * Use the xfs_readdir function to call xfs_scrub_dir_actor on + * every directory entry in this directory. In _actor, we check + * the name, inode number, and ftype (if applicable) of the + * entry. xfs_readdir uses the VFS filldir functions to provide + * iteration context. + * + * The VFS grabs a read or write lock via i_rwsem before it reads + * or writes to a directory. If we've gotten this far we've + * already obtained IOLOCK_EXCL, which (since 4.10) is the same as + * getting a write lock on i_rwsem. Therefore, it is safe for us + * to drop the ILOCK here in order to reuse the _readdir and + * _dir_lookup routines, which do their own ILOCK locking. + */ + oldpos = 0; + sc->ilock_flags &= ~XFS_ILOCK_EXCL; + xfs_iunlock(sc->ip, XFS_ILOCK_EXCL); + while (true) { + error = xfs_readdir(sc->tp, sc->ip, &sdc.dir_iter, bufsize); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, + &error)) + goto out; + if (oldpos == sdc.dir_iter.pos) + break; + oldpos = sdc.dir_iter.pos; + } + +out: + return error; +} diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c new file mode 100644 index 000000000000..496d6f2fbb9e --- /dev/null +++ b/fs/xfs/scrub/ialloc.c @@ -0,0 +1,337 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* + * Set us up to scrub inode btrees. + * If we detect a discrepancy between the inobt and the inode, + * try again after forcing logged inode cores out to disk. + */ +int +xfs_scrub_setup_ag_iallocbt( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_setup_ag_btree(sc, ip, sc->try_harder); +} + +/* Inode btree scrubber. */ + +/* Is this chunk worth checking? */ +STATIC bool +xfs_scrub_iallocbt_chunk( + struct xfs_scrub_btree *bs, + struct xfs_inobt_rec_incore *irec, + xfs_agino_t agino, + xfs_extlen_t len) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agblock_t bno; + + bno = XFS_AGINO_TO_AGBNO(mp, agino); + if (bno + len <= bno || + !xfs_verify_agbno(mp, agno, bno) || + !xfs_verify_agbno(mp, agno, bno + len - 1)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + return true; +} + +/* Count the number of free inodes. */ +static unsigned int +xfs_scrub_iallocbt_freecount( + xfs_inofree_t freemask) +{ + BUILD_BUG_ON(sizeof(freemask) != sizeof(__u64)); + return hweight64(freemask); +} + +/* Check a particular inode with ir_free. */ +STATIC int +xfs_scrub_iallocbt_check_cluster_freemask( + struct xfs_scrub_btree *bs, + xfs_ino_t fsino, + xfs_agino_t chunkino, + xfs_agino_t clusterino, + struct xfs_inobt_rec_incore *irec, + struct xfs_buf *bp) +{ + struct xfs_dinode *dip; + struct xfs_mount *mp = bs->cur->bc_mp; + bool inode_is_free = false; + bool freemask_ok; + bool inuse; + int error = 0; + + if (xfs_scrub_should_terminate(bs->sc, &error)) + return error; + + dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || + (dip->di_version >= 3 && + be64_to_cpu(dip->di_ino) != fsino + clusterino)) { + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + goto out; + } + + if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino)) + inode_is_free = true; + error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, + fsino + clusterino, &inuse); + if (error == -ENODATA) { + /* Not cached, just read the disk buffer */ + freemask_ok = inode_is_free ^ !!(dip->di_mode); + if (!bs->sc->try_harder && !freemask_ok) + return -EDEADLOCK; + } else if (error < 0) { + /* + * Inode is only half assembled, or there was an IO error, + * or the verifier failed, so don't bother trying to check. + * The inode scrubber can deal with this. + */ + goto out; + } else { + /* Inode is all there. */ + freemask_ok = inode_is_free ^ inuse; + } + if (!freemask_ok) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); +out: + return 0; +} + +/* Make sure the free mask is consistent with what the inodes think. */ +STATIC int +xfs_scrub_iallocbt_check_freemask( + struct xfs_scrub_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + struct xfs_owner_info oinfo; + struct xfs_imap imap; + struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_dinode *dip; + struct xfs_buf *bp; + xfs_ino_t fsino; + xfs_agino_t nr_inodes; + xfs_agino_t agino; + xfs_agino_t chunkino; + xfs_agino_t clusterino; + xfs_agblock_t agbno; + int blks_per_cluster; + uint16_t holemask; + uint16_t ir_holemask; + int error = 0; + + /* Make sure the freemask matches the inode records. */ + blks_per_cluster = xfs_icluster_size_fsb(mp); + nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); + + for (agino = irec->ir_startino; + agino < irec->ir_startino + XFS_INODES_PER_CHUNK; + agino += blks_per_cluster * mp->m_sb.sb_inopblock) { + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); + chunkino = agino - irec->ir_startino; + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + + /* Compute the holemask mask for this cluster. */ + for (clusterino = 0, holemask = 0; clusterino < nr_inodes; + clusterino += XFS_INODES_PER_HOLEMASK_BIT) + holemask |= XFS_INOBT_MASK((chunkino + clusterino) / + XFS_INODES_PER_HOLEMASK_BIT); + + /* The whole cluster must be a hole or not a hole. */ + ir_holemask = (irec->ir_holemask & holemask); + if (ir_holemask != holemask && ir_holemask != 0) { + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + continue; + } + + /* If any part of this is a hole, skip it. */ + if (ir_holemask) + continue; + + /* Grab the inode cluster buffer. */ + imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno, + agbno); + imap.im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); + imap.im_boffset = 0; + + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, + &dip, &bp, 0, 0); + if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, 0, &error)) + continue; + + /* Which inodes are free? */ + for (clusterino = 0; clusterino < nr_inodes; clusterino++) { + error = xfs_scrub_iallocbt_check_cluster_freemask(bs, + fsino, chunkino, clusterino, irec, bp); + if (error) { + xfs_trans_brelse(bs->cur->bc_tp, bp); + return error; + } + } + + xfs_trans_brelse(bs->cur->bc_tp, bp); + } + + return error; +} + +/* Scrub an inobt/finobt record. */ +STATIC int +xfs_scrub_iallocbt_rec( + struct xfs_scrub_btree *bs, + union xfs_btree_rec *rec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_inobt_rec_incore irec; + uint64_t holes; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agino_t agino; + xfs_agblock_t agbno; + xfs_extlen_t len; + int holecount; + int i; + int error = 0; + unsigned int real_freecount; + uint16_t holemask; + + xfs_inobt_btrec_to_irec(mp, rec, &irec); + + if (irec.ir_count > XFS_INODES_PER_CHUNK || + irec.ir_freecount > XFS_INODES_PER_CHUNK) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + real_freecount = irec.ir_freecount + + (XFS_INODES_PER_CHUNK - irec.ir_count); + if (real_freecount != xfs_scrub_iallocbt_freecount(irec.ir_free)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + agino = irec.ir_startino; + /* Record has to be properly aligned within the AG. */ + if (!xfs_verify_agino(mp, agno, agino) || + !xfs_verify_agino(mp, agno, agino + XFS_INODES_PER_CHUNK - 1)) { + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + goto out; + } + + /* Make sure this record is aligned to cluster and inoalignmnt size. */ + agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino); + if ((agbno & (xfs_ialloc_cluster_alignment(mp) - 1)) || + (agbno & (xfs_icluster_size_fsb(mp) - 1))) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + /* Handle non-sparse inodes */ + if (!xfs_inobt_issparse(irec.ir_holemask)) { + len = XFS_B_TO_FSB(mp, + XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize); + if (irec.ir_count != XFS_INODES_PER_CHUNK) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len)) + goto out; + goto check_freemask; + } + + /* Check each chunk of a sparse inode cluster. */ + holemask = irec.ir_holemask; + holecount = 0; + len = XFS_B_TO_FSB(mp, + XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize); + holes = ~xfs_inobt_irec_to_allocmask(&irec); + if ((holes & irec.ir_free) != holes || + irec.ir_freecount > irec.ir_count) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) { + if (holemask & 1) + holecount += XFS_INODES_PER_HOLEMASK_BIT; + else if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len)) + break; + holemask >>= 1; + agino += XFS_INODES_PER_HOLEMASK_BIT; + } + + if (holecount > XFS_INODES_PER_CHUNK || + holecount + irec.ir_count != XFS_INODES_PER_CHUNK) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + +check_freemask: + error = xfs_scrub_iallocbt_check_freemask(bs, &irec); + if (error) + goto out; + +out: + return error; +} + +/* Scrub the inode btrees for some AG. */ +STATIC int +xfs_scrub_iallocbt( + struct xfs_scrub_context *sc, + xfs_btnum_t which) +{ + struct xfs_btree_cur *cur; + struct xfs_owner_info oinfo; + + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); + cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur; + return xfs_scrub_btree(sc, cur, xfs_scrub_iallocbt_rec, &oinfo, NULL); +} + +int +xfs_scrub_inobt( + struct xfs_scrub_context *sc) +{ + return xfs_scrub_iallocbt(sc, XFS_BTNUM_INO); +} + +int +xfs_scrub_finobt( + struct xfs_scrub_context *sc) +{ + return xfs_scrub_iallocbt(sc, XFS_BTNUM_FINO); +} diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c new file mode 100644 index 000000000000..f120fb20452f --- /dev/null +++ b/fs/xfs/scrub/inode.c @@ -0,0 +1,623 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_inode_buf.h" +#include "xfs_inode_fork.h" +#include "xfs_ialloc.h" +#include "xfs_da_format.h" +#include "xfs_reflink.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* + * Grab total control of the inode metadata. It doesn't matter here if + * the file data is still changing; exclusive access to the metadata is + * the goal. + */ +int +xfs_scrub_setup_inode( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = sc->mp; + int error; + + /* + * Try to get the inode. If the verifiers fail, we try again + * in raw mode. + */ + error = xfs_scrub_get_inode(sc, ip); + switch (error) { + case 0: + break; + case -EFSCORRUPTED: + case -EFSBADCRC: + return 0; + default: + return error; + } + + /* Got the inode, lock it and we're ready to go. */ + sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + if (error) + goto out; + sc->ilock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + +out: + /* scrub teardown will unlock and release the inode for us */ + return error; +} + +/* Inode core */ + +/* + * Validate di_extsize hint. + * + * The rules are documented at xfs_ioctl_setattr_check_extsize(). + * These functions must be kept in sync with each other. + */ +STATIC void +xfs_scrub_inode_extsize( + struct xfs_scrub_context *sc, + struct xfs_buf *bp, + struct xfs_dinode *dip, + xfs_ino_t ino, + uint16_t mode, + uint16_t flags) +{ + struct xfs_mount *mp = sc->mp; + bool rt_flag; + bool hint_flag; + bool inherit_flag; + uint32_t extsize; + uint32_t extsize_bytes; + uint32_t blocksize_bytes; + + rt_flag = (flags & XFS_DIFLAG_REALTIME); + hint_flag = (flags & XFS_DIFLAG_EXTSIZE); + inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT); + extsize = be32_to_cpu(dip->di_extsize); + extsize_bytes = XFS_FSB_TO_B(sc->mp, extsize); + + if (rt_flag) + blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; + else + blocksize_bytes = mp->m_sb.sb_blocksize; + + if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode))) + goto bad; + + if (hint_flag && !S_ISREG(mode)) + goto bad; + + if (inherit_flag && !S_ISDIR(mode)) + goto bad; + + if ((hint_flag || inherit_flag) && extsize == 0) + goto bad; + + if (!(hint_flag || inherit_flag) && extsize != 0) + goto bad; + + if (extsize_bytes % blocksize_bytes) + goto bad; + + if (extsize > MAXEXTLEN) + goto bad; + + if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2) + goto bad; + + return; +bad: + xfs_scrub_ino_set_corrupt(sc, ino, bp); +} + +/* + * Validate di_cowextsize hint. + * + * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). + * These functions must be kept in sync with each other. + */ +STATIC void +xfs_scrub_inode_cowextsize( + struct xfs_scrub_context *sc, + struct xfs_buf *bp, + struct xfs_dinode *dip, + xfs_ino_t ino, + uint16_t mode, + uint16_t flags, + uint64_t flags2) +{ + struct xfs_mount *mp = sc->mp; + bool rt_flag; + bool hint_flag; + uint32_t extsize; + uint32_t extsize_bytes; + + rt_flag = (flags & XFS_DIFLAG_REALTIME); + hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); + extsize = be32_to_cpu(dip->di_cowextsize); + extsize_bytes = XFS_FSB_TO_B(sc->mp, extsize); + + if (hint_flag && !xfs_sb_version_hasreflink(&mp->m_sb)) + goto bad; + + if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode))) + goto bad; + + if (hint_flag && extsize == 0) + goto bad; + + if (!hint_flag && extsize != 0) + goto bad; + + if (hint_flag && rt_flag) + goto bad; + + if (extsize_bytes % mp->m_sb.sb_blocksize) + goto bad; + + if (extsize > MAXEXTLEN) + goto bad; + + if (extsize > mp->m_sb.sb_agblocks / 2) + goto bad; + + return; +bad: + xfs_scrub_ino_set_corrupt(sc, ino, bp); +} + +/* Make sure the di_flags make sense for the inode. */ +STATIC void +xfs_scrub_inode_flags( + struct xfs_scrub_context *sc, + struct xfs_buf *bp, + struct xfs_dinode *dip, + xfs_ino_t ino, + uint16_t mode, + uint16_t flags) +{ + struct xfs_mount *mp = sc->mp; + + if (flags & ~XFS_DIFLAG_ANY) + goto bad; + + /* rt flags require rt device */ + if ((flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT)) && + !mp->m_rtdev_targp) + goto bad; + + /* new rt bitmap flag only valid for rbmino */ + if ((flags & XFS_DIFLAG_NEWRTBM) && ino != mp->m_sb.sb_rbmino) + goto bad; + + /* directory-only flags */ + if ((flags & (XFS_DIFLAG_RTINHERIT | + XFS_DIFLAG_EXTSZINHERIT | + XFS_DIFLAG_PROJINHERIT | + XFS_DIFLAG_NOSYMLINKS)) && + !S_ISDIR(mode)) + goto bad; + + /* file-only flags */ + if ((flags & (XFS_DIFLAG_REALTIME | FS_XFLAG_EXTSIZE)) && + !S_ISREG(mode)) + goto bad; + + /* filestreams and rt make no sense */ + if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME)) + goto bad; + + return; +bad: + xfs_scrub_ino_set_corrupt(sc, ino, bp); +} + +/* Make sure the di_flags2 make sense for the inode. */ +STATIC void +xfs_scrub_inode_flags2( + struct xfs_scrub_context *sc, + struct xfs_buf *bp, + struct xfs_dinode *dip, + xfs_ino_t ino, + uint16_t mode, + uint16_t flags, + uint64_t flags2) +{ + struct xfs_mount *mp = sc->mp; + + if (flags2 & ~XFS_DIFLAG2_ANY) + goto bad; + + /* reflink flag requires reflink feature */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && + !xfs_sb_version_hasreflink(&mp->m_sb)) + goto bad; + + /* cowextsize flag is checked w.r.t. mode separately */ + + /* file/dir-only flags */ + if ((flags2 & XFS_DIFLAG2_DAX) && !(S_ISREG(mode) || S_ISDIR(mode))) + goto bad; + + /* file-only flags */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode)) + goto bad; + + /* realtime and reflink make no sense, currently */ + if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK)) + goto bad; + + /* dax and reflink make no sense, currently */ + if ((flags2 & XFS_DIFLAG2_DAX) && (flags2 & XFS_DIFLAG2_REFLINK)) + goto bad; + + return; +bad: + xfs_scrub_ino_set_corrupt(sc, ino, bp); +} + +/* Scrub all the ondisk inode fields. */ +STATIC void +xfs_scrub_dinode( + struct xfs_scrub_context *sc, + struct xfs_buf *bp, + struct xfs_dinode *dip, + xfs_ino_t ino) +{ + struct xfs_mount *mp = sc->mp; + size_t fork_recs; + unsigned long long isize; + uint64_t flags2; + uint32_t nextents; + uint16_t flags; + uint16_t mode; + + flags = be16_to_cpu(dip->di_flags); + if (dip->di_version >= 3) + flags2 = be64_to_cpu(dip->di_flags2); + else + flags2 = 0; + + /* di_mode */ + mode = be16_to_cpu(dip->di_mode); + switch (mode & S_IFMT) { + case S_IFLNK: + case S_IFREG: + case S_IFDIR: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + /* mode is recognized */ + break; + default: + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + } + + /* v1/v2 fields */ + switch (dip->di_version) { + case 1: + /* + * We autoconvert v1 inodes into v2 inodes on writeout, + * so just mark this inode for preening. + */ + xfs_scrub_ino_set_preen(sc, ino, bp); + break; + case 2: + case 3: + if (dip->di_onlink != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + if (dip->di_mode == 0 && sc->ip) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + if (dip->di_projid_hi != 0 && + !xfs_sb_version_hasprojid32bit(&mp->m_sb)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + default: + xfs_scrub_ino_set_corrupt(sc, ino, bp); + return; + } + + /* + * di_uid/di_gid -- -1 isn't invalid, but there's no way that + * userspace could have created that. + */ + if (dip->di_uid == cpu_to_be32(-1U) || + dip->di_gid == cpu_to_be32(-1U)) + xfs_scrub_ino_set_warning(sc, ino, bp); + + /* di_format */ + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + if (!S_ISCHR(mode) && !S_ISBLK(mode) && + !S_ISFIFO(mode) && !S_ISSOCK(mode)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_LOCAL: + if (!S_ISDIR(mode) && !S_ISLNK(mode)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_EXTENTS: + if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_BTREE: + if (!S_ISREG(mode) && !S_ISDIR(mode)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_UUID: + default: + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + } + + /* + * di_size. xfs_dinode_verify checks for things that screw up + * the VFS such as the upper bit being set and zero-length + * symlinks/directories, but we can do more here. + */ + isize = be64_to_cpu(dip->di_size); + if (isize & (1ULL << 63)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* Devices, fifos, and sockets must have zero size */ + if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* Directories can't be larger than the data section size (32G) */ + if (S_ISDIR(mode) && (isize == 0 || isize >= XFS_DIR2_SPACE_SIZE)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* Symlinks can't be larger than SYMLINK_MAXLEN */ + if (S_ISLNK(mode) && (isize == 0 || isize >= XFS_SYMLINK_MAXLEN)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* + * Warn if the running kernel can't handle the kinds of offsets + * needed to deal with the file size. In other words, if the + * pagecache can't cache all the blocks in this file due to + * overly large offsets, flag the inode for admin review. + */ + if (isize >= mp->m_super->s_maxbytes) + xfs_scrub_ino_set_warning(sc, ino, bp); + + /* di_nblocks */ + if (flags2 & XFS_DIFLAG2_REFLINK) { + ; /* nblocks can exceed dblocks */ + } else if (flags & XFS_DIFLAG_REALTIME) { + /* + * nblocks is the sum of data extents (in the rtdev), + * attr extents (in the datadev), and both forks' bmbt + * blocks (in the datadev). This clumsy check is the + * best we can do without cross-referencing with the + * inode forks. + */ + if (be64_to_cpu(dip->di_nblocks) >= + mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + } else { + if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + } + + xfs_scrub_inode_flags(sc, bp, dip, ino, mode, flags); + + xfs_scrub_inode_extsize(sc, bp, dip, ino, mode, flags); + + /* di_nextents */ + nextents = be32_to_cpu(dip->di_nextents); + fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); + switch (dip->di_format) { + case XFS_DINODE_FMT_EXTENTS: + if (nextents > fork_recs) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_BTREE: + if (nextents <= fork_recs) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + default: + if (nextents != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + } + + /* di_forkoff */ + if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + if (dip->di_anextents != 0 && dip->di_forkoff == 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* di_aformat */ + if (dip->di_aformat != XFS_DINODE_FMT_LOCAL && + dip->di_aformat != XFS_DINODE_FMT_EXTENTS && + dip->di_aformat != XFS_DINODE_FMT_BTREE) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* di_anextents */ + nextents = be16_to_cpu(dip->di_anextents); + fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); + switch (dip->di_aformat) { + case XFS_DINODE_FMT_EXTENTS: + if (nextents > fork_recs) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_BTREE: + if (nextents <= fork_recs) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + default: + if (nextents != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + } + + if (dip->di_version >= 3) { + xfs_scrub_inode_flags2(sc, bp, dip, ino, mode, flags, flags2); + xfs_scrub_inode_cowextsize(sc, bp, dip, ino, mode, flags, + flags2); + } +} + +/* Map and read a raw inode. */ +STATIC int +xfs_scrub_inode_map_raw( + struct xfs_scrub_context *sc, + xfs_ino_t ino, + struct xfs_buf **bpp, + struct xfs_dinode **dipp) +{ + struct xfs_imap imap; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp = NULL; + struct xfs_dinode *dip; + int error; + + error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED); + if (error == -EINVAL) { + /* + * Inode could have gotten deleted out from under us; + * just forget about it. + */ + error = -ENOENT; + goto out; + } + if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino), + XFS_INO_TO_AGBNO(mp, ino), &error)) + goto out; + + error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, + imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp, + NULL); + if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino), + XFS_INO_TO_AGBNO(mp, ino), &error)) + goto out; + + /* + * Is this really an inode? We disabled verifiers in the above + * xfs_trans_read_buf call because the inode buffer verifier + * fails on /any/ inode record in the inode cluster with a bad + * magic or version number, not just the one that we're + * checking. Therefore, grab the buffer unconditionally, attach + * the inode verifiers by hand, and run the inode verifier only + * on the one inode we want. + */ + bp->b_ops = &xfs_inode_buf_ops; + dip = xfs_buf_offset(bp, imap.im_boffset); + if (!xfs_dinode_verify(mp, ino, dip) || + !xfs_dinode_good_version(mp, dip->di_version)) { + xfs_scrub_ino_set_corrupt(sc, ino, bp); + goto out_buf; + } + + /* ...and is it the one we asked for? */ + if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) { + error = -ENOENT; + goto out_buf; + } + + *dipp = dip; + *bpp = bp; +out: + return error; +out_buf: + xfs_trans_brelse(sc->tp, bp); + return error; +} + +/* Scrub an inode. */ +int +xfs_scrub_inode( + struct xfs_scrub_context *sc) +{ + struct xfs_dinode di; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp = NULL; + struct xfs_dinode *dip; + xfs_ino_t ino; + + bool has_shared; + int error = 0; + + /* Did we get the in-core inode, or are we doing this manually? */ + if (sc->ip) { + ino = sc->ip->i_ino; + xfs_inode_to_disk(sc->ip, &di, 0); + dip = &di; + } else { + /* Map & read inode. */ + ino = sc->sm->sm_ino; + error = xfs_scrub_inode_map_raw(sc, ino, &bp, &dip); + if (error || !bp) + goto out; + } + + xfs_scrub_dinode(sc, bp, dip, ino); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Now let's do the things that require a live inode. */ + if (!sc->ip) + goto out; + + /* + * Does this inode have the reflink flag set but no shared extents? + * Set the preening flag if this is the case. + */ + if (xfs_is_reflink_inode(sc->ip)) { + error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, + &has_shared); + if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino), + XFS_INO_TO_AGBNO(mp, ino), &error)) + goto out; + if (!has_shared) + xfs_scrub_ino_set_preen(sc, ino, bp); + } + +out: + if (bp) + xfs_trans_brelse(sc->tp, bp); + return error; +} diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c new file mode 100644 index 000000000000..63a25334fc83 --- /dev/null +++ b/fs/xfs/scrub/parent.c @@ -0,0 +1,317 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ialloc.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* Set us up to scrub parents. */ +int +xfs_scrub_setup_parent( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_setup_inode_contents(sc, ip, 0); +} + +/* Parent pointers */ + +/* Look for an entry in a parent pointing to this inode. */ + +struct xfs_scrub_parent_ctx { + struct dir_context dc; + xfs_ino_t ino; + xfs_nlink_t nlink; +}; + +/* Look for a single entry in a directory pointing to an inode. */ +STATIC int +xfs_scrub_parent_actor( + struct dir_context *dc, + const char *name, + int namelen, + loff_t pos, + u64 ino, + unsigned type) +{ + struct xfs_scrub_parent_ctx *spc; + + spc = container_of(dc, struct xfs_scrub_parent_ctx, dc); + if (spc->ino == ino) + spc->nlink++; + return 0; +} + +/* Count the number of dentries in the parent dir that point to this inode. */ +STATIC int +xfs_scrub_parent_count_parent_dentries( + struct xfs_scrub_context *sc, + struct xfs_inode *parent, + xfs_nlink_t *nlink) +{ + struct xfs_scrub_parent_ctx spc = { + .dc.actor = xfs_scrub_parent_actor, + .dc.pos = 0, + .ino = sc->ip->i_ino, + .nlink = 0, + }; + size_t bufsize; + loff_t oldpos; + uint lock_mode; + int error = 0; + + /* + * If there are any blocks, read-ahead block 0 as we're almost + * certain to have the next operation be a read there. This is + * how we guarantee that the parent's extent map has been loaded, + * if there is one. + */ + lock_mode = xfs_ilock_data_map_shared(parent); + if (parent->i_d.di_nextents > 0) + error = xfs_dir3_data_readahead(parent, 0, -1); + xfs_iunlock(parent, lock_mode); + if (error) + return error; + + /* + * Iterate the parent dir to confirm that there is + * exactly one entry pointing back to the inode being + * scanned. + */ + bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, + parent->i_d.di_size); + oldpos = 0; + while (true) { + error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize); + if (error) + goto out; + if (oldpos == spc.dc.pos) + break; + oldpos = spc.dc.pos; + } + *nlink = spc.nlink; +out: + return error; +} + +/* + * Given the inode number of the alleged parent of the inode being + * scrubbed, try to validate that the parent has exactly one directory + * entry pointing back to the inode being scrubbed. + */ +STATIC int +xfs_scrub_parent_validate( + struct xfs_scrub_context *sc, + xfs_ino_t dnum, + bool *try_again) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_inode *dp = NULL; + xfs_nlink_t expected_nlink; + xfs_nlink_t nlink; + int error = 0; + + *try_again = false; + + /* '..' must not point to ourselves. */ + if (sc->ip->i_ino == dnum) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out; + } + + /* + * If we're an unlinked directory, the parent /won't/ have a link + * to us. Otherwise, it should have one link. + */ + expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1; + + /* + * Grab this parent inode. We release the inode before we + * cancel the scrub transaction. Since we're don't know a + * priori that releasing the inode won't trigger eofblocks + * cleanup (which allocates what would be a nested transaction) + * if the parent pointer erroneously points to a file, we + * can't use DONTCACHE here because DONTCACHE inodes can trigger + * immediate inactive cleanup of the inode. + */ + error = xfs_iget(mp, sc->tp, dnum, 0, 0, &dp); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out; + if (dp == sc->ip) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out_rele; + } + + /* + * We prefer to keep the inode locked while we lock and search + * its alleged parent for a forward reference. If we can grab + * the iolock, validate the pointers and we're done. We must + * use nowait here to avoid an ABBA deadlock on the parent and + * the child inodes. + */ + if (xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) { + error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, + &error)) + goto out_unlock; + if (nlink != expected_nlink) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out_unlock; + } + + /* + * The game changes if we get here. We failed to lock the parent, + * so we're going to try to verify both pointers while only holding + * one lock so as to avoid deadlocking with something that's actually + * trying to traverse down the directory tree. + */ + xfs_iunlock(sc->ip, sc->ilock_flags); + sc->ilock_flags = 0; + xfs_ilock(dp, XFS_IOLOCK_SHARED); + + /* Go looking for our dentry. */ + error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out_unlock; + + /* Drop the parent lock, relock this inode. */ + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + sc->ilock_flags = XFS_IOLOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + + /* + * If we're an unlinked directory, the parent /won't/ have a link + * to us. Otherwise, it should have one link. We have to re-set + * it here because we dropped the lock on sc->ip. + */ + expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1; + + /* Look up '..' to see if the inode changed. */ + error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out_rele; + + /* Drat, parent changed. Try again! */ + if (dnum != dp->i_ino) { + iput(VFS_I(dp)); + *try_again = true; + return 0; + } + iput(VFS_I(dp)); + + /* + * '..' didn't change, so check that there was only one entry + * for us in the parent. + */ + if (nlink != expected_nlink) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return error; + +out_unlock: + xfs_iunlock(dp, XFS_IOLOCK_SHARED); +out_rele: + iput(VFS_I(dp)); +out: + return error; +} + +/* Scrub a parent pointer. */ +int +xfs_scrub_parent( + struct xfs_scrub_context *sc) +{ + struct xfs_mount *mp = sc->mp; + xfs_ino_t dnum; + bool try_again; + int tries = 0; + int error = 0; + + /* + * If we're a directory, check that the '..' link points up to + * a directory that has one entry pointing to us. + */ + if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) + return -ENOENT; + + /* We're not a special inode, are we? */ + if (!xfs_verify_dir_ino(mp, sc->ip->i_ino)) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out; + } + + /* + * The VFS grabs a read or write lock via i_rwsem before it reads + * or writes to a directory. If we've gotten this far we've + * already obtained IOLOCK_EXCL, which (since 4.10) is the same as + * getting a write lock on i_rwsem. Therefore, it is safe for us + * to drop the ILOCK here in order to do directory lookups. + */ + sc->ilock_flags &= ~(XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL); + xfs_iunlock(sc->ip, XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL); + + /* Look up '..' */ + error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out; + if (!xfs_verify_dir_ino(mp, dnum)) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out; + } + + /* Is this the root dir? Then '..' must point to itself. */ + if (sc->ip == mp->m_rootip) { + if (sc->ip->i_ino != mp->m_sb.sb_rootino || + sc->ip->i_ino != dnum) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out; + } + + do { + error = xfs_scrub_parent_validate(sc, dnum, &try_again); + if (error) + goto out; + } while (try_again && ++tries < 20); + + /* + * We gave it our best shot but failed, so mark this scrub + * incomplete. Userspace can decide if it wants to try again. + */ + if (try_again && tries == 20) + xfs_scrub_set_incomplete(sc); +out: + return error; +} diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c new file mode 100644 index 000000000000..3d9037eceaf1 --- /dev/null +++ b/fs/xfs/scrub/quota.c @@ -0,0 +1,304 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_dquot.h" +#include "xfs_dquot_item.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* Convert a scrub type code to a DQ flag, or return 0 if error. */ +static inline uint +xfs_scrub_quota_to_dqtype( + struct xfs_scrub_context *sc) +{ + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_UQUOTA: + return XFS_DQ_USER; + case XFS_SCRUB_TYPE_GQUOTA: + return XFS_DQ_GROUP; + case XFS_SCRUB_TYPE_PQUOTA: + return XFS_DQ_PROJ; + default: + return 0; + } +} + +/* Set us up to scrub a quota. */ +int +xfs_scrub_setup_quota( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + uint dqtype; + + /* + * If userspace gave us an AG number or inode data, they don't + * know what they're doing. Get out. + */ + if (sc->sm->sm_agno || sc->sm->sm_ino || sc->sm->sm_gen) + return -EINVAL; + + dqtype = xfs_scrub_quota_to_dqtype(sc); + if (dqtype == 0) + return -EINVAL; + if (!xfs_this_quota_on(sc->mp, dqtype)) + return -ENOENT; + return 0; +} + +/* Quotas. */ + +/* Scrub the fields in an individual quota item. */ +STATIC void +xfs_scrub_quota_item( + struct xfs_scrub_context *sc, + uint dqtype, + struct xfs_dquot *dq, + xfs_dqid_t id) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_disk_dquot *d = &dq->q_core; + struct xfs_quotainfo *qi = mp->m_quotainfo; + xfs_fileoff_t offset; + unsigned long long bsoft; + unsigned long long isoft; + unsigned long long rsoft; + unsigned long long bhard; + unsigned long long ihard; + unsigned long long rhard; + unsigned long long bcount; + unsigned long long icount; + unsigned long long rcount; + xfs_ino_t fs_icount; + + offset = id / qi->qi_dqperchunk; + + /* + * We fed $id and DQNEXT into the xfs_qm_dqget call, which means + * that the actual dquot we got must either have the same id or + * the next higher id. + */ + if (id > be32_to_cpu(d->d_id)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + /* Did we get the dquot type we wanted? */ + if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + if (d->d_pad0 != cpu_to_be32(0) || d->d_pad != cpu_to_be16(0)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + /* Check the limits. */ + bhard = be64_to_cpu(d->d_blk_hardlimit); + ihard = be64_to_cpu(d->d_ino_hardlimit); + rhard = be64_to_cpu(d->d_rtb_hardlimit); + + bsoft = be64_to_cpu(d->d_blk_softlimit); + isoft = be64_to_cpu(d->d_ino_softlimit); + rsoft = be64_to_cpu(d->d_rtb_softlimit); + + /* + * Warn if the hard limits are larger than the fs. + * Administrators can do this, though in production this seems + * suspect, which is why we flag it for review. + * + * Complain about corruption if the soft limit is greater than + * the hard limit. + */ + if (bhard > mp->m_sb.sb_dblocks) + xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (bsoft > bhard) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + if (ihard > mp->m_maxicount) + xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (isoft > ihard) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + if (rhard > mp->m_sb.sb_rblocks) + xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (rsoft > rhard) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + /* Check the resource counts. */ + bcount = be64_to_cpu(d->d_bcount); + icount = be64_to_cpu(d->d_icount); + rcount = be64_to_cpu(d->d_rtbcount); + fs_icount = percpu_counter_sum(&mp->m_icount); + + /* + * Check that usage doesn't exceed physical limits. However, on + * a reflink filesystem we're allowed to exceed physical space + * if there are no quota limits. + */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (mp->m_sb.sb_dblocks < bcount) + xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, + offset); + } else { + if (mp->m_sb.sb_dblocks < bcount) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, + offset); + } + if (icount > fs_icount || rcount > mp->m_sb.sb_rblocks) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + /* + * We can violate the hard limits if the admin suddenly sets a + * lower limit than the actual usage. However, we flag it for + * admin review. + */ + if (id != 0 && bhard != 0 && bcount > bhard) + xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (id != 0 && ihard != 0 && icount > ihard) + xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (id != 0 && rhard != 0 && rcount > rhard) + xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); +} + +/* Scrub all of a quota type's items. */ +int +xfs_scrub_quota( + struct xfs_scrub_context *sc) +{ + struct xfs_bmbt_irec irec = { 0 }; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip; + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_dquot *dq; + xfs_fileoff_t max_dqid_off; + xfs_fileoff_t off = 0; + xfs_dqid_t id = 0; + uint dqtype; + int nimaps; + int error = 0; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return -ENOENT; + + mutex_lock(&qi->qi_quotaofflock); + dqtype = xfs_scrub_quota_to_dqtype(sc); + if (!xfs_this_quota_on(sc->mp, dqtype)) { + error = -ENOENT; + goto out_unlock_quota; + } + + /* Attach to the quota inode and set sc->ip so that reporting works. */ + ip = xfs_quota_inode(sc->mp, dqtype); + sc->ip = ip; + + /* Look for problem extents. */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + goto out_unlock_inode; + } + max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; + while (1) { + if (xfs_scrub_should_terminate(sc, &error)) + break; + + off = irec.br_startoff + irec.br_blockcount; + nimaps = 1; + error = xfs_bmapi_read(ip, off, -1, &irec, &nimaps, + XFS_BMAPI_ENTIRE); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, off, + &error)) + goto out_unlock_inode; + if (!nimaps) + break; + if (irec.br_startblock == HOLESTARTBLOCK) + continue; + + /* Check the extent record doesn't point to crap. */ + if (irec.br_startblock + irec.br_blockcount <= + irec.br_startblock) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, + irec.br_startoff); + if (!xfs_verify_fsbno(mp, irec.br_startblock) || + !xfs_verify_fsbno(mp, irec.br_startblock + + irec.br_blockcount - 1)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, + irec.br_startoff); + + /* + * Unwritten extents or blocks mapped above the highest + * quota id shouldn't happen. + */ + if (isnullstartblock(irec.br_startblock) || + irec.br_startoff > max_dqid_off || + irec.br_startoff + irec.br_blockcount > max_dqid_off + 1) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Check all the quota items. */ + while (id < ((xfs_dqid_t)-1ULL)) { + if (xfs_scrub_should_terminate(sc, &error)) + break; + + error = xfs_qm_dqget(mp, NULL, id, dqtype, XFS_QMOPT_DQNEXT, + &dq); + if (error == -ENOENT) + break; + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, + id * qi->qi_dqperchunk, &error)) + break; + + xfs_scrub_quota_item(sc, dqtype, dq, id); + + id = be32_to_cpu(dq->q_core.d_id) + 1; + xfs_qm_dqput(dq); + if (!id) + break; + } + +out: + /* We set sc->ip earlier, so make sure we clear it now. */ + sc->ip = NULL; +out_unlock_quota: + mutex_unlock(&qi->qi_quotaofflock); + return error; + +out_unlock_inode: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out; +} diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c new file mode 100644 index 000000000000..2f88a8d44bd0 --- /dev/null +++ b/fs/xfs/scrub/refcount.c @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* + * Set us up to scrub reference count btrees. + */ +int +xfs_scrub_setup_ag_refcountbt( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_setup_ag_btree(sc, ip, false); +} + +/* Reference count btree scrubber. */ + +/* Scrub a refcountbt record. */ +STATIC int +xfs_scrub_refcountbt_rec( + struct xfs_scrub_btree *bs, + union xfs_btree_rec *rec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agblock_t bno; + xfs_extlen_t len; + xfs_nlink_t refcount; + bool has_cowflag; + int error = 0; + + bno = be32_to_cpu(rec->refc.rc_startblock); + len = be32_to_cpu(rec->refc.rc_blockcount); + refcount = be32_to_cpu(rec->refc.rc_refcount); + + /* Only CoW records can have refcount == 1. */ + has_cowflag = (bno & XFS_REFC_COW_START); + if ((refcount == 1 && !has_cowflag) || (refcount != 1 && has_cowflag)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + /* Check the extent. */ + bno &= ~XFS_REFC_COW_START; + if (bno + len <= bno || + !xfs_verify_agbno(mp, agno, bno) || + !xfs_verify_agbno(mp, agno, bno + len - 1)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (refcount == 0) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + return error; +} + +/* Scrub the refcount btree for some AG. */ +int +xfs_scrub_refcountbt( + struct xfs_scrub_context *sc) +{ + struct xfs_owner_info oinfo; + + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC); + return xfs_scrub_btree(sc, sc->sa.refc_cur, xfs_scrub_refcountbt_rec, + &oinfo, NULL); +} diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c new file mode 100644 index 000000000000..97846c424690 --- /dev/null +++ b/fs/xfs/scrub/rmap.c @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* + * Set us up to scrub reverse mapping btrees. + */ +int +xfs_scrub_setup_ag_rmapbt( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_setup_ag_btree(sc, ip, false); +} + +/* Reverse-mapping scrubber. */ + +/* Scrub an rmapbt record. */ +STATIC int +xfs_scrub_rmapbt_rec( + struct xfs_scrub_btree *bs, + union xfs_btree_rec *rec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_rmap_irec irec; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + bool non_inode; + bool is_unwritten; + bool is_bmbt; + bool is_attr; + int error; + + error = xfs_rmap_btrec_to_irec(rec, &irec); + if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, 0, &error)) + goto out; + + /* Check extent. */ + if (irec.rm_startblock + irec.rm_blockcount <= irec.rm_startblock) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (irec.rm_owner == XFS_RMAP_OWN_FS) { + /* + * xfs_verify_agbno returns false for static fs metadata. + * Since that only exists at the start of the AG, validate + * that by hand. + */ + if (irec.rm_startblock != 0 || + irec.rm_blockcount != XFS_AGFL_BLOCK(mp) + 1) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + } else { + /* + * Otherwise we must point somewhere past the static metadata + * but before the end of the FS. Run the regular check. + */ + if (!xfs_verify_agbno(mp, agno, irec.rm_startblock) || + !xfs_verify_agbno(mp, agno, irec.rm_startblock + + irec.rm_blockcount - 1)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + } + + /* Check flags. */ + non_inode = XFS_RMAP_NON_INODE_OWNER(irec.rm_owner); + is_bmbt = irec.rm_flags & XFS_RMAP_BMBT_BLOCK; + is_attr = irec.rm_flags & XFS_RMAP_ATTR_FORK; + is_unwritten = irec.rm_flags & XFS_RMAP_UNWRITTEN; + + if (is_bmbt && irec.rm_offset != 0) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (non_inode && irec.rm_offset != 0) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (is_unwritten && (is_bmbt || non_inode || is_attr)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (non_inode && (is_bmbt || is_unwritten || is_attr)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (!non_inode) { + if (!xfs_verify_ino(mp, irec.rm_owner)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + } else { + /* Non-inode owner within the magic values? */ + if (irec.rm_owner <= XFS_RMAP_OWN_MIN || + irec.rm_owner > XFS_RMAP_OWN_FS) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + } +out: + return error; +} + +/* Scrub the rmap btree for some AG. */ +int +xfs_scrub_rmapbt( + struct xfs_scrub_context *sc) +{ + struct xfs_owner_info oinfo; + + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); + return xfs_scrub_btree(sc, sc->sa.rmap_cur, xfs_scrub_rmapbt_rec, + &oinfo, NULL); +} diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c new file mode 100644 index 000000000000..c6fedb698008 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap.c @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* Set us up with the realtime metadata locked. */ +int +xfs_scrub_setup_rt( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = sc->mp; + int error = 0; + + /* + * If userspace gave us an AG number or inode data, they don't + * know what they're doing. Get out. + */ + if (sc->sm->sm_agno || sc->sm->sm_ino || sc->sm->sm_gen) + return -EINVAL; + + error = xfs_scrub_setup_fs(sc, ip); + if (error) + return error; + + sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP; + sc->ip = mp->m_rbmip; + xfs_ilock(sc->ip, sc->ilock_flags); + + return 0; +} + +/* Realtime bitmap. */ + +/* Scrub a free extent record from the realtime bitmap. */ +STATIC int +xfs_scrub_rtbitmap_rec( + struct xfs_trans *tp, + struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_scrub_context *sc = priv; + + if (rec->ar_startblock + rec->ar_blockcount <= rec->ar_startblock || + !xfs_verify_rtbno(sc->mp, rec->ar_startblock) || + !xfs_verify_rtbno(sc->mp, rec->ar_startblock + + rec->ar_blockcount - 1)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; +} + +/* Scrub the realtime bitmap. */ +int +xfs_scrub_rtbitmap( + struct xfs_scrub_context *sc) +{ + int error; + + error = xfs_rtalloc_query_all(sc->tp, xfs_scrub_rtbitmap_rec, sc); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out; + +out: + return error; +} + +/* Scrub the realtime summary. */ +int +xfs_scrub_rtsummary( + struct xfs_scrub_context *sc) +{ + /* XXX: implement this some day */ + return -ENOENT; +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c new file mode 100644 index 000000000000..ab3aef2ae823 --- /dev/null +++ b/fs/xfs/scrub/scrub.c @@ -0,0 +1,391 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_itable.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/btree.h" + +/* + * Online Scrub and Repair + * + * Traditionally, XFS (the kernel driver) did not know how to check or + * repair on-disk data structures. That task was left to the xfs_check + * and xfs_repair tools, both of which require taking the filesystem + * offline for a thorough but time consuming examination. Online + * scrub & repair, on the other hand, enables us to check the metadata + * for obvious errors while carefully stepping around the filesystem's + * ongoing operations, locking rules, etc. + * + * Given that most XFS metadata consist of records stored in a btree, + * most of the checking functions iterate the btree blocks themselves + * looking for irregularities. When a record block is encountered, each + * record can be checked for obviously bad values. Record values can + * also be cross-referenced against other btrees to look for potential + * misunderstandings between pieces of metadata. + * + * It is expected that the checkers responsible for per-AG metadata + * structures will lock the AG headers (AGI, AGF, AGFL), iterate the + * metadata structure, and perform any relevant cross-referencing before + * unlocking the AG and returning the results to userspace. These + * scrubbers must not keep an AG locked for too long to avoid tying up + * the block and inode allocators. + * + * Block maps and b-trees rooted in an inode present a special challenge + * because they can involve extents from any AG. The general scrubber + * structure of lock -> check -> xref -> unlock still holds, but AG + * locking order rules /must/ be obeyed to avoid deadlocks. The + * ordering rule, of course, is that we must lock in increasing AG + * order. Helper functions are provided to track which AG headers we've + * already locked. If we detect an imminent locking order violation, we + * can signal a potential deadlock, in which case the scrubber can jump + * out to the top level, lock all the AGs in order, and retry the scrub. + * + * For file data (directories, extended attributes, symlinks) scrub, we + * can simply lock the inode and walk the data. For btree data + * (directories and attributes) we follow the same btree-scrubbing + * strategy outlined previously to check the records. + * + * We use a bit of trickery with transactions to avoid buffer deadlocks + * if there is a cycle in the metadata. The basic problem is that + * travelling down a btree involves locking the current buffer at each + * tree level. If a pointer should somehow point back to a buffer that + * we've already examined, we will deadlock due to the second buffer + * locking attempt. Note however that grabbing a buffer in transaction + * context links the locked buffer to the transaction. If we try to + * re-grab the buffer in the context of the same transaction, we avoid + * the second lock attempt and continue. Between the verifier and the + * scrubber, something will notice that something is amiss and report + * the corruption. Therefore, each scrubber will allocate an empty + * transaction, attach buffers to it, and cancel the transaction at the + * end of the scrub run. Cancelling a non-dirty transaction simply + * unlocks the buffers. + * + * There are four pieces of data that scrub can communicate to + * userspace. The first is the error code (errno), which can be used to + * communicate operational errors in performing the scrub. There are + * also three flags that can be set in the scrub context. If the data + * structure itself is corrupt, the CORRUPT flag will be set. If + * the metadata is correct but otherwise suboptimal, the PREEN flag + * will be set. + */ + +/* + * Scrub probe -- userspace uses this to probe if we're willing to scrub + * or repair a given mountpoint. This will be used by xfs_scrub to + * probe the kernel's abilities to scrub (and repair) the metadata. We + * do this by validating the ioctl inputs from userspace, preparing the + * filesystem for a scrub (or a repair) operation, and immediately + * returning to userspace. Userspace can use the returned errno and + * structure state to decide (in broad terms) if scrub/repair are + * supported by the running kernel. + */ +static int +xfs_scrub_probe( + struct xfs_scrub_context *sc) +{ + int error = 0; + + if (sc->sm->sm_ino || sc->sm->sm_agno) + return -EINVAL; + if (xfs_scrub_should_terminate(sc, &error)) + return error; + + return 0; +} + +/* Scrub setup and teardown */ + +/* Free all the resources and finish the transactions. */ +STATIC int +xfs_scrub_teardown( + struct xfs_scrub_context *sc, + struct xfs_inode *ip_in, + int error) +{ + xfs_scrub_ag_free(sc, &sc->sa); + if (sc->tp) { + xfs_trans_cancel(sc->tp); + sc->tp = NULL; + } + if (sc->ip) { + xfs_iunlock(sc->ip, sc->ilock_flags); + if (sc->ip != ip_in && + !xfs_internal_inum(sc->mp, sc->ip->i_ino)) + iput(VFS_I(sc->ip)); + sc->ip = NULL; + } + if (sc->buf) { + kmem_free(sc->buf); + sc->buf = NULL; + } + return error; +} + +/* Scrubbing dispatch. */ + +static const struct xfs_scrub_meta_ops meta_scrub_ops[] = { + { /* ioctl presence test */ + .setup = xfs_scrub_setup_fs, + .scrub = xfs_scrub_probe, + }, + { /* superblock */ + .setup = xfs_scrub_setup_ag_header, + .scrub = xfs_scrub_superblock, + }, + { /* agf */ + .setup = xfs_scrub_setup_ag_header, + .scrub = xfs_scrub_agf, + }, + { /* agfl */ + .setup = xfs_scrub_setup_ag_header, + .scrub = xfs_scrub_agfl, + }, + { /* agi */ + .setup = xfs_scrub_setup_ag_header, + .scrub = xfs_scrub_agi, + }, + { /* bnobt */ + .setup = xfs_scrub_setup_ag_allocbt, + .scrub = xfs_scrub_bnobt, + }, + { /* cntbt */ + .setup = xfs_scrub_setup_ag_allocbt, + .scrub = xfs_scrub_cntbt, + }, + { /* inobt */ + .setup = xfs_scrub_setup_ag_iallocbt, + .scrub = xfs_scrub_inobt, + }, + { /* finobt */ + .setup = xfs_scrub_setup_ag_iallocbt, + .scrub = xfs_scrub_finobt, + .has = xfs_sb_version_hasfinobt, + }, + { /* rmapbt */ + .setup = xfs_scrub_setup_ag_rmapbt, + .scrub = xfs_scrub_rmapbt, + .has = xfs_sb_version_hasrmapbt, + }, + { /* refcountbt */ + .setup = xfs_scrub_setup_ag_refcountbt, + .scrub = xfs_scrub_refcountbt, + .has = xfs_sb_version_hasreflink, + }, + { /* inode record */ + .setup = xfs_scrub_setup_inode, + .scrub = xfs_scrub_inode, + }, + { /* inode data fork */ + .setup = xfs_scrub_setup_inode_bmap, + .scrub = xfs_scrub_bmap_data, + }, + { /* inode attr fork */ + .setup = xfs_scrub_setup_inode_bmap, + .scrub = xfs_scrub_bmap_attr, + }, + { /* inode CoW fork */ + .setup = xfs_scrub_setup_inode_bmap, + .scrub = xfs_scrub_bmap_cow, + }, + { /* directory */ + .setup = xfs_scrub_setup_directory, + .scrub = xfs_scrub_directory, + }, + { /* extended attributes */ + .setup = xfs_scrub_setup_xattr, + .scrub = xfs_scrub_xattr, + }, + { /* symbolic link */ + .setup = xfs_scrub_setup_symlink, + .scrub = xfs_scrub_symlink, + }, + { /* parent pointers */ + .setup = xfs_scrub_setup_parent, + .scrub = xfs_scrub_parent, + }, + { /* realtime bitmap */ + .setup = xfs_scrub_setup_rt, + .scrub = xfs_scrub_rtbitmap, + .has = xfs_sb_version_hasrealtime, + }, + { /* realtime summary */ + .setup = xfs_scrub_setup_rt, + .scrub = xfs_scrub_rtsummary, + .has = xfs_sb_version_hasrealtime, + }, + { /* user quota */ + .setup = xfs_scrub_setup_quota, + .scrub = xfs_scrub_quota, + }, + { /* group quota */ + .setup = xfs_scrub_setup_quota, + .scrub = xfs_scrub_quota, + }, + { /* project quota */ + .setup = xfs_scrub_setup_quota, + .scrub = xfs_scrub_quota, + }, +}; + +/* This isn't a stable feature, warn once per day. */ +static inline void +xfs_scrub_experimental_warning( + struct xfs_mount *mp) +{ + static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT( + "xfs_scrub_warning", 86400 * HZ, 1); + ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE); + + if (__ratelimit(&scrub_warning)) + xfs_alert(mp, +"EXPERIMENTAL online scrub feature in use. Use at your own risk!"); +} + +/* Dispatch metadata scrubbing. */ +int +xfs_scrub_metadata( + struct xfs_inode *ip, + struct xfs_scrub_metadata *sm) +{ + struct xfs_scrub_context sc; + struct xfs_mount *mp = ip->i_mount; + const struct xfs_scrub_meta_ops *ops; + bool try_harder = false; + int error = 0; + + trace_xfs_scrub_start(ip, sm, error); + + /* Forbidden if we are shut down or mounted norecovery. */ + error = -ESHUTDOWN; + if (XFS_FORCED_SHUTDOWN(mp)) + goto out; + error = -ENOTRECOVERABLE; + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + goto out; + + /* Check our inputs. */ + error = -EINVAL; + sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN) + goto out; + if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved))) + goto out; + + /* Do we know about this type of metadata? */ + error = -ENOENT; + if (sm->sm_type >= XFS_SCRUB_TYPE_NR) + goto out; + ops = &meta_scrub_ops[sm->sm_type]; + if (ops->scrub == NULL) + goto out; + + /* + * We won't scrub any filesystem that doesn't have the ability + * to record unwritten extents. The option was made default in + * 2003, removed from mkfs in 2007, and cannot be disabled in + * v5, so if we find a filesystem without this flag it's either + * really old or totally unsupported. Avoid it either way. + * We also don't support v1-v3 filesystems, which aren't + * mountable. + */ + error = -EOPNOTSUPP; + if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) + goto out; + + /* Does this fs even support this type of metadata? */ + error = -ENOENT; + if (ops->has && !ops->has(&mp->m_sb)) + goto out; + + /* We don't know how to repair anything yet. */ + error = -EOPNOTSUPP; + if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + goto out; + + xfs_scrub_experimental_warning(mp); + +retry_op: + /* Set up for the operation. */ + memset(&sc, 0, sizeof(sc)); + sc.mp = ip->i_mount; + sc.sm = sm; + sc.ops = ops; + sc.try_harder = try_harder; + sc.sa.agno = NULLAGNUMBER; + error = sc.ops->setup(&sc, ip); + if (error) + goto out_teardown; + + /* Scrub for errors. */ + error = sc.ops->scrub(&sc); + if (!try_harder && error == -EDEADLOCK) { + /* + * Scrubbers return -EDEADLOCK to mean 'try harder'. + * Tear down everything we hold, then set up again with + * preparation for worst-case scenarios. + */ + error = xfs_scrub_teardown(&sc, ip, 0); + if (error) + goto out; + try_harder = true; + goto retry_op; + } else if (error) + goto out_teardown; + + if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT)) + xfs_alert_ratelimited(mp, "Corruption detected during scrub."); + +out_teardown: + error = xfs_scrub_teardown(&sc, ip, error); +out: + trace_xfs_scrub_done(ip, sm, error); + if (error == -EFSCORRUPTED || error == -EFSBADCRC) { + sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + error = 0; + } + return error; +} diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h new file mode 100644 index 000000000000..e9ec041cf713 --- /dev/null +++ b/fs/xfs/scrub/scrub.h @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_SCRUB_SCRUB_H__ +#define __XFS_SCRUB_SCRUB_H__ + +struct xfs_scrub_context; + +struct xfs_scrub_meta_ops { + /* Acquire whatever resources are needed for the operation. */ + int (*setup)(struct xfs_scrub_context *, + struct xfs_inode *); + + /* Examine metadata for errors. */ + int (*scrub)(struct xfs_scrub_context *); + + /* Decide if we even have this piece of metadata. */ + bool (*has)(struct xfs_sb *); +}; + +/* Buffer pointers and btree cursors for an entire AG. */ +struct xfs_scrub_ag { + xfs_agnumber_t agno; + + /* AG btree roots */ + struct xfs_buf *agf_bp; + struct xfs_buf *agfl_bp; + struct xfs_buf *agi_bp; + + /* AG btrees */ + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + struct xfs_btree_cur *ino_cur; + struct xfs_btree_cur *fino_cur; + struct xfs_btree_cur *rmap_cur; + struct xfs_btree_cur *refc_cur; +}; + +struct xfs_scrub_context { + /* General scrub state. */ + struct xfs_mount *mp; + struct xfs_scrub_metadata *sm; + const struct xfs_scrub_meta_ops *ops; + struct xfs_trans *tp; + struct xfs_inode *ip; + void *buf; + uint ilock_flags; + bool try_harder; + + /* State tracking for single-AG operations. */ + struct xfs_scrub_ag sa; +}; + +/* Metadata scrubbers */ +int xfs_scrub_tester(struct xfs_scrub_context *sc); +int xfs_scrub_superblock(struct xfs_scrub_context *sc); +int xfs_scrub_agf(struct xfs_scrub_context *sc); +int xfs_scrub_agfl(struct xfs_scrub_context *sc); +int xfs_scrub_agi(struct xfs_scrub_context *sc); +int xfs_scrub_bnobt(struct xfs_scrub_context *sc); +int xfs_scrub_cntbt(struct xfs_scrub_context *sc); +int xfs_scrub_inobt(struct xfs_scrub_context *sc); +int xfs_scrub_finobt(struct xfs_scrub_context *sc); +int xfs_scrub_rmapbt(struct xfs_scrub_context *sc); +int xfs_scrub_refcountbt(struct xfs_scrub_context *sc); +int xfs_scrub_inode(struct xfs_scrub_context *sc); +int xfs_scrub_bmap_data(struct xfs_scrub_context *sc); +int xfs_scrub_bmap_attr(struct xfs_scrub_context *sc); +int xfs_scrub_bmap_cow(struct xfs_scrub_context *sc); +int xfs_scrub_directory(struct xfs_scrub_context *sc); +int xfs_scrub_xattr(struct xfs_scrub_context *sc); +int xfs_scrub_symlink(struct xfs_scrub_context *sc); +int xfs_scrub_parent(struct xfs_scrub_context *sc); +#ifdef CONFIG_XFS_RT +int xfs_scrub_rtbitmap(struct xfs_scrub_context *sc); +int xfs_scrub_rtsummary(struct xfs_scrub_context *sc); +#else +static inline int +xfs_scrub_rtbitmap(struct xfs_scrub_context *sc) +{ + return -ENOENT; +} +static inline int +xfs_scrub_rtsummary(struct xfs_scrub_context *sc) +{ + return -ENOENT; +} +#endif +#ifdef CONFIG_XFS_QUOTA +int xfs_scrub_quota(struct xfs_scrub_context *sc); +#else +static inline int +xfs_scrub_quota(struct xfs_scrub_context *sc) +{ + return -ENOENT; +} +#endif + +#endif /* __XFS_SCRUB_SCRUB_H__ */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c new file mode 100644 index 000000000000..3aa3d60f7c16 --- /dev/null +++ b/fs/xfs/scrub/symlink.c @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_symlink.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* Set us up to scrub a symbolic link. */ +int +xfs_scrub_setup_symlink( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + /* Allocate the buffer without the inode lock held. */ + sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP); + if (!sc->buf) + return -ENOMEM; + + return xfs_scrub_setup_inode_contents(sc, ip, 0); +} + +/* Symbolic links. */ + +int +xfs_scrub_symlink( + struct xfs_scrub_context *sc) +{ + struct xfs_inode *ip = sc->ip; + struct xfs_ifork *ifp; + loff_t len; + int error = 0; + + if (!S_ISLNK(VFS_I(ip)->i_mode)) + return -ENOENT; + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + len = ip->i_d.di_size; + + /* Plausible size? */ + if (len > XFS_SYMLINK_MAXLEN || len <= 0) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out; + } + + /* Inline symlink? */ + if (ifp->if_flags & XFS_IFINLINE) { + if (len > XFS_IFORK_DSIZE(ip) || + len > strnlen(ifp->if_u1.if_data, XFS_IFORK_DSIZE(ip))) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out; + } + + /* Remote symlink; must read the contents. */ + error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out; + if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); +out: + return error; +} diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c new file mode 100644 index 000000000000..86daed0e3a45 --- /dev/null +++ b/fs/xfs/scrub/trace.c @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" + +/* Figure out which block the btree cursor was pointing to. */ +static inline xfs_fsblock_t +xfs_scrub_btree_cur_fsbno( + struct xfs_btree_cur *cur, + int level) +{ + if (level < cur->bc_nlevels && cur->bc_bufs[level]) + return XFS_DADDR_TO_FSB(cur->bc_mp, cur->bc_bufs[level]->b_bn); + else if (level == cur->bc_nlevels - 1 && + cur->bc_flags & XFS_BTREE_LONG_PTRS) + return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_private.b.ip->i_ino); + else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) + return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, 0); + return NULLFSBLOCK; +} + +/* + * We include this last to have the helpers above available for the trace + * event implementations. + */ +#define CREATE_TRACE_POINTS +#include "scrub/trace.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h new file mode 100644 index 000000000000..c4ebfb5c1ee8 --- /dev/null +++ b/fs/xfs/scrub/trace.h @@ -0,0 +1,499 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM xfs_scrub + +#if !defined(_TRACE_XFS_SCRUB_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_XFS_SCRUB_TRACE_H + +#include <linux/tracepoint.h> +#include "xfs_bit.h" + +DECLARE_EVENT_CLASS(xfs_scrub_class, + TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, + int error), + TP_ARGS(ip, sm, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, inum) + __field(unsigned int, gen) + __field(unsigned int, flags) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->type = sm->sm_type; + __entry->agno = sm->sm_agno; + __entry->inum = sm->sm_ino; + __entry->gen = sm->sm_gen; + __entry->flags = sm->sm_flags; + __entry->error = error; + ), + TP_printk("dev %d:%d ino %llu type %u agno %u inum %llu gen %u flags 0x%x error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->type, + __entry->agno, + __entry->inum, + __entry->gen, + __entry->flags, + __entry->error) +) +#define DEFINE_SCRUB_EVENT(name) \ +DEFINE_EVENT(xfs_scrub_class, name, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, \ + int error), \ + TP_ARGS(ip, sm, error)) + +DEFINE_SCRUB_EVENT(xfs_scrub_start); +DEFINE_SCRUB_EVENT(xfs_scrub_done); +DEFINE_SCRUB_EVENT(xfs_scrub_deadlock_retry); + +TRACE_EVENT(xfs_scrub_op_error, + TP_PROTO(struct xfs_scrub_context *sc, xfs_agnumber_t agno, + xfs_agblock_t bno, int error, void *ret_ip), + TP_ARGS(sc, agno, bno, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, error) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->agno = agno; + __entry->bno = bno; + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %u agno %u agbno %u error %d ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->agno, + __entry->bno, + __entry->error, + __entry->ret_ip) +); + +TRACE_EVENT(xfs_scrub_file_op_error, + TP_PROTO(struct xfs_scrub_context *sc, int whichfork, + xfs_fileoff_t offset, int error, void *ret_ip), + TP_ARGS(sc, whichfork, offset, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(unsigned int, type) + __field(xfs_fileoff_t, offset) + __field(int, error) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->ip->i_mount->m_super->s_dev; + __entry->ino = sc->ip->i_ino; + __entry->whichfork = whichfork; + __entry->type = sc->sm->sm_type; + __entry->offset = offset; + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino %llu fork %d type %u offset %llu error %d ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->whichfork, + __entry->type, + __entry->offset, + __entry->error, + __entry->ret_ip) +); + +DECLARE_EVENT_CLASS(xfs_scrub_block_error_class, + TP_PROTO(struct xfs_scrub_context *sc, xfs_daddr_t daddr, void *ret_ip), + TP_ARGS(sc, daddr, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno; + xfs_agnumber_t agno; + xfs_agblock_t bno; + + fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr); + agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); + bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->agno = agno; + __entry->bno = bno; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %u agno %u agbno %u ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->agno, + __entry->bno, + __entry->ret_ip) +) + +#define DEFINE_SCRUB_BLOCK_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_scrub_block_error_class, name, \ + TP_PROTO(struct xfs_scrub_context *sc, xfs_daddr_t daddr, \ + void *ret_ip), \ + TP_ARGS(sc, daddr, ret_ip)) + +DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_error); +DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_preen); + +DECLARE_EVENT_CLASS(xfs_scrub_ino_error_class, + TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, xfs_daddr_t daddr, + void *ret_ip), + TP_ARGS(sc, ino, daddr, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno; + xfs_agnumber_t agno; + xfs_agblock_t bno; + + if (daddr) { + fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr); + agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); + bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + } else { + agno = XFS_INO_TO_AGNO(sc->mp, ino); + bno = XFS_AGINO_TO_AGBNO(sc->mp, + XFS_INO_TO_AGINO(sc->mp, ino)); + } + + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = ino; + __entry->type = sc->sm->sm_type; + __entry->agno = agno; + __entry->bno = bno; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino %llu type %u agno %u agbno %u ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->type, + __entry->agno, + __entry->bno, + __entry->ret_ip) +) + +#define DEFINE_SCRUB_INO_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_scrub_ino_error_class, name, \ + TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, \ + xfs_daddr_t daddr, void *ret_ip), \ + TP_ARGS(sc, ino, daddr, ret_ip)) + +DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_error); +DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_preen); +DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_warning); + +DECLARE_EVENT_CLASS(xfs_scrub_fblock_error_class, + TP_PROTO(struct xfs_scrub_context *sc, int whichfork, + xfs_fileoff_t offset, void *ret_ip), + TP_ARGS(sc, whichfork, offset, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(unsigned int, type) + __field(xfs_fileoff_t, offset) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->ip->i_mount->m_super->s_dev; + __entry->ino = sc->ip->i_ino; + __entry->whichfork = whichfork; + __entry->type = sc->sm->sm_type; + __entry->offset = offset; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino %llu fork %d type %u offset %llu ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->whichfork, + __entry->type, + __entry->offset, + __entry->ret_ip) +); + +#define DEFINE_SCRUB_FBLOCK_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_scrub_fblock_error_class, name, \ + TP_PROTO(struct xfs_scrub_context *sc, int whichfork, \ + xfs_fileoff_t offset, void *ret_ip), \ + TP_ARGS(sc, whichfork, offset, ret_ip)) + +DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xfs_scrub_fblock_error); +DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xfs_scrub_fblock_warning); + +TRACE_EVENT(xfs_scrub_incomplete, + TP_PROTO(struct xfs_scrub_context *sc, void *ret_ip), + TP_ARGS(sc, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %u ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->ret_ip) +); + +TRACE_EVENT(xfs_scrub_btree_op_error, + TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, + int level, int error, void *ret_ip), + TP_ARGS(sc, cur, level, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(xfs_btnum_t, btnum) + __field(int, level) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, ptr); + __field(int, error) + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level); + + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->ptr = cur->bc_ptrs[level]; + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->btnum, + __entry->level, + __entry->ptr, + __entry->agno, + __entry->bno, + __entry->error, + __entry->ret_ip) +); + +TRACE_EVENT(xfs_scrub_ifork_btree_op_error, + TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, + int level, int error, void *ret_ip), + TP_ARGS(sc, cur, level, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(unsigned int, type) + __field(xfs_btnum_t, btnum) + __field(int, level) + __field(int, ptr) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, error) + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level); + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->ip->i_ino; + __entry->whichfork = cur->bc_private.b.whichfork; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->ptr = cur->bc_ptrs[level]; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino %llu fork %d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->whichfork, + __entry->type, + __entry->btnum, + __entry->level, + __entry->ptr, + __entry->agno, + __entry->bno, + __entry->error, + __entry->ret_ip) +); + +TRACE_EVENT(xfs_scrub_btree_error, + TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, + int level, void *ret_ip), + TP_ARGS(sc, cur, level, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(xfs_btnum_t, btnum) + __field(int, level) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, ptr); + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level); + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->ptr = cur->bc_ptrs[level]; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->btnum, + __entry->level, + __entry->ptr, + __entry->agno, + __entry->bno, + __entry->ret_ip) +); + +TRACE_EVENT(xfs_scrub_ifork_btree_error, + TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, + int level, void *ret_ip), + TP_ARGS(sc, cur, level, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(unsigned int, type) + __field(xfs_btnum_t, btnum) + __field(int, level) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, ptr); + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level); + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->ip->i_ino; + __entry->whichfork = cur->bc_private.b.whichfork; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->ptr = cur->bc_ptrs[level]; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino %llu fork %d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->whichfork, + __entry->type, + __entry->btnum, + __entry->level, + __entry->ptr, + __entry->agno, + __entry->bno, + __entry->ret_ip) +); + +DECLARE_EVENT_CLASS(xfs_scrub_sbtree_class, + TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, + int level), + TP_ARGS(sc, cur, level), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(xfs_btnum_t, btnum) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, level) + __field(int, nlevels) + __field(int, ptr) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level); + + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->level = level; + __entry->nlevels = cur->bc_nlevels; + __entry->ptr = cur->bc_ptrs[level]; + ), + TP_printk("dev %d:%d type %u btnum %d agno %u agbno %u level %d nlevels %d ptr %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->btnum, + __entry->agno, + __entry->bno, + __entry->level, + __entry->nlevels, + __entry->ptr) +) +#define DEFINE_SCRUB_SBTREE_EVENT(name) \ +DEFINE_EVENT(xfs_scrub_sbtree_class, name, \ + TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, \ + int level), \ + TP_ARGS(sc, cur, level)) + +DEFINE_SCRUB_SBTREE_EVENT(xfs_scrub_btree_rec); +DEFINE_SCRUB_SBTREE_EVENT(xfs_scrub_btree_key); + +#endif /* _TRACE_XFS_SCRUB_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE scrub/trace +#include <trace/define_trace.h> diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h new file mode 100644 index 000000000000..e00e0eadac6a --- /dev/null +++ b/fs/xfs/scrub/xfs_scrub.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_SCRUB_H__ +#define __XFS_SCRUB_H__ + +#ifndef CONFIG_XFS_ONLINE_SCRUB +# define xfs_scrub_metadata(ip, sm) (-ENOTTY) +#else +int xfs_scrub_metadata(struct xfs_inode *ip, struct xfs_scrub_metadata *sm); +#endif /* CONFIG_XFS_ONLINE_SCRUB */ + +#endif /* __XFS_SCRUB_H__ */ diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index 80cd0fd86783..5ff7f228d616 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -19,7 +19,6 @@ #define __XFS_H__ #ifdef CONFIG_XFS_DEBUG -#define STATIC #define DEBUG 1 #define XFS_BUF_LOCK_TRACKING 1 #endif diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a3eeaba156c5..4fc526a27a94 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -399,7 +399,7 @@ xfs_map_blocks( (ip->i_df.if_flags & XFS_IFEXTENTS)); ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset + count > mp->m_super->s_maxbytes) + if (offset > mp->m_super->s_maxbytes - count) count = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -896,13 +896,13 @@ xfs_writepage_map( struct writeback_control *wbc, struct inode *inode, struct page *page, - loff_t offset, - uint64_t end_offset) + uint64_t end_offset) { LIST_HEAD(submit_list); struct xfs_ioend *ioend, *next; struct buffer_head *bh, *head; ssize_t len = i_blocksize(inode); + uint64_t offset; int error = 0; int count = 0; int uptodate = 1; @@ -1146,7 +1146,7 @@ xfs_do_writepage( end_offset = offset; } - return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset); + return xfs_writepage_map(wpc, wbc, inode, page, end_offset); redirty: redirty_page_for_writepage(wbc, page); @@ -1265,7 +1265,7 @@ xfs_map_trim_size( if (mapping_size > size) mapping_size = size; if (offset < i_size_read(inode) && - offset + mapping_size >= i_size_read(inode)) { + (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) { /* limit mapping to block that spans EOF */ mapping_size = roundup_64(i_size_read(inode) - offset, i_blocksize(inode)); @@ -1312,7 +1312,7 @@ xfs_get_blocks( lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset + size > mp->m_super->s_maxbytes) + if (offset > mp->m_super->s_maxbytes - size) size = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index 5d5a5e277f35..d07bf27451c9 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -48,6 +48,8 @@ struct xfs_attr_list_context; #define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ +#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ + #define XFS_ATTR_FLAGS \ { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ { ATTR_ROOT, "ROOT" }, \ @@ -56,7 +58,8 @@ struct xfs_attr_list_context; { ATTR_CREATE, "CREATE" }, \ { ATTR_REPLACE, "REPLACE" }, \ { ATTR_KERNOTIME, "KERNOTIME" }, \ - { ATTR_KERNOVAL, "KERNOVAL" } + { ATTR_KERNOVAL, "KERNOVAL" }, \ + { ATTR_INCOMPLETE, "INCOMPLETE" } /* * The maximum size (into the kernel or returned from the kernel) of an diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index e3a950ed35a8..52818ea2eb50 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -251,47 +251,44 @@ xfs_attr3_node_inactive( * traversal of the tree so we may deal with many blocks * before we come back to this one. */ - error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp, - XFS_ATTR_FORK); + error = xfs_da3_node_read(*trans, dp, child_fsb, -1, &child_bp, + XFS_ATTR_FORK); if (error) return error; - if (child_bp) { - /* save for re-read later */ - child_blkno = XFS_BUF_ADDR(child_bp); - /* - * Invalidate the subtree, however we have to. - */ - info = child_bp->b_addr; - switch (info->magic) { - case cpu_to_be16(XFS_DA_NODE_MAGIC): - case cpu_to_be16(XFS_DA3_NODE_MAGIC): - error = xfs_attr3_node_inactive(trans, dp, - child_bp, level + 1); - break; - case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): - case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): - error = xfs_attr3_leaf_inactive(trans, dp, - child_bp); - break; - default: - error = -EIO; - xfs_trans_brelse(*trans, child_bp); - break; - } - if (error) - return error; + /* save for re-read later */ + child_blkno = XFS_BUF_ADDR(child_bp); - /* - * Remove the subsidiary block from the cache - * and from the log. - */ - error = xfs_da_get_buf(*trans, dp, 0, child_blkno, - &child_bp, XFS_ATTR_FORK); - if (error) - return error; - xfs_trans_binval(*trans, child_bp); + /* + * Invalidate the subtree, however we have to. + */ + info = child_bp->b_addr; + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, child_bp, + level + 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, child_bp); + break; + default: + error = -EIO; + xfs_trans_brelse(*trans, child_bp); + break; } + if (error) + return error; + + /* + * Remove the subsidiary block from the cache and from the log. + */ + error = xfs_da_get_buf(*trans, dp, 0, child_blkno, &child_bp, + XFS_ATTR_FORK); + if (error) + return error; + xfs_trans_binval(*trans, child_bp); /* * If we're not done, re-read the parent to get the next diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 7740c8a5e736..3e59a348ea71 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -204,19 +204,103 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) return 0; } +/* + * We didn't find the block & hash mentioned in the cursor state, so + * walk down the attr btree looking for the hash. + */ STATIC int -xfs_attr_node_list(xfs_attr_list_context_t *context) +xfs_attr_node_list_lookup( + struct xfs_attr_list_context *context, + struct attrlist_cursor_kern *cursor, + struct xfs_buf **pbp) { - attrlist_cursor_kern_t *cursor; - xfs_attr_leafblock_t *leaf; - xfs_da_intnode_t *node; - struct xfs_attr3_icleaf_hdr leafhdr; - struct xfs_da3_icnode_hdr nodehdr; - struct xfs_da_node_entry *btree; - int error, i; - struct xfs_buf *bp; - struct xfs_inode *dp = context->dp; - struct xfs_mount *mp = dp->i_mount; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_inode *dp = context->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_trans *tp = context->tp; + struct xfs_buf *bp; + int i; + int error = 0; + unsigned int expected_level = 0; + uint16_t magic; + + ASSERT(*pbp == NULL); + cursor->blkno = 0; + for (;;) { + error = xfs_da3_node_read(tp, dp, cursor->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return error; + node = bp->b_addr; + magic = be16_to_cpu(node->hdr.info.magic); + if (magic == XFS_ATTR_LEAF_MAGIC || + magic == XFS_ATTR3_LEAF_MAGIC) + break; + if (magic != XFS_DA_NODE_MAGIC && + magic != XFS_DA3_NODE_MAGIC) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + node); + goto out_corruptbuf; + } + + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + + /* Tree taller than we can handle; bail out! */ + if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) + goto out_corruptbuf; + + /* Check the level from the root node. */ + if (cursor->blkno == 0) + expected_level = nodehdr.level - 1; + else if (expected_level != nodehdr.level) + goto out_corruptbuf; + else + expected_level--; + + btree = dp->d_ops->node_tree_p(node); + for (i = 0; i < nodehdr.count; btree++, i++) { + if (cursor->hashval <= be32_to_cpu(btree->hashval)) { + cursor->blkno = be32_to_cpu(btree->before); + trace_xfs_attr_list_node_descend(context, + btree); + break; + } + } + xfs_trans_brelse(tp, bp); + + if (i == nodehdr.count) + return 0; + + /* We can't point back to the root. */ + if (cursor->blkno == 0) + return -EFSCORRUPTED; + } + + if (expected_level != 0) + goto out_corruptbuf; + + *pbp = bp; + return 0; + +out_corruptbuf: + xfs_trans_brelse(tp, bp); + return -EFSCORRUPTED; +} + +STATIC int +xfs_attr_node_list( + struct xfs_attr_list_context *context) +{ + struct xfs_attr3_icleaf_hdr leafhdr; + struct attrlist_cursor_kern *cursor; + struct xfs_attr_leafblock *leaf; + struct xfs_da_intnode *node; + struct xfs_buf *bp; + struct xfs_inode *dp = context->dp; + struct xfs_mount *mp = dp->i_mount; + int error; trace_xfs_attr_node_list(context); @@ -277,47 +361,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) * Note that start of node block is same as start of leaf block. */ if (bp == NULL) { - cursor->blkno = 0; - for (;;) { - uint16_t magic; - - error = xfs_da3_node_read(context->tp, dp, - cursor->blkno, -1, &bp, - XFS_ATTR_FORK); - if (error) - return error; - node = bp->b_addr; - magic = be16_to_cpu(node->hdr.info.magic); - if (magic == XFS_ATTR_LEAF_MAGIC || - magic == XFS_ATTR3_LEAF_MAGIC) - break; - if (magic != XFS_DA_NODE_MAGIC && - magic != XFS_DA3_NODE_MAGIC) { - XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount, - node); - xfs_trans_brelse(context->tp, bp); - return -EFSCORRUPTED; - } - - dp->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = dp->d_ops->node_tree_p(node); - for (i = 0; i < nodehdr.count; btree++, i++) { - if (cursor->hashval - <= be32_to_cpu(btree->hashval)) { - cursor->blkno = be32_to_cpu(btree->before); - trace_xfs_attr_list_node_descend(context, - btree); - break; - } - } - if (i == nodehdr.count) { - xfs_trans_brelse(context->tp, bp); - return 0; - } - xfs_trans_brelse(context->tp, bp); - } + error = xfs_attr_node_list_lookup(context, cursor, &bp); + if (error || !bp) + return error; } ASSERT(bp != NULL); @@ -407,7 +453,8 @@ xfs_attr3_leaf_list_int( cursor->offset = 0; } - if (entry->flags & XFS_ATTR_INCOMPLETE) + if ((entry->flags & XFS_ATTR_INCOMPLETE) && + !(context->flags & ATTR_INCOMPLETE)) continue; /* skip incomplete entries */ if (entry->flags & XFS_ATTR_LOCAL) { @@ -499,8 +546,8 @@ xfs_attr_list_int( #define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ (((struct attrlist_ent *) 0)->a_name - (char *) 0) #define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ - ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ - & ~(sizeof(u_int32_t)-1)) + ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(uint32_t)-1) \ + & ~(sizeof(uint32_t)-1)) /* * Format an attribute and copy it out to the user's buffer. @@ -583,6 +630,10 @@ xfs_attr_list( (cursor->hashval || cursor->blkno || cursor->offset)) return -EINVAL; + /* Only internal consumers can retrieve incomplete attrs. */ + if (flags & ATTR_INCOMPLETE) + return -EINVAL; + /* * Check for a properly aligned buffer. */ diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index dd136f7275e4..e5fb008d75e8 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -389,7 +389,8 @@ xfs_bud_init( int xfs_bui_recover( struct xfs_mount *mp, - struct xfs_bui_log_item *buip) + struct xfs_bui_log_item *buip, + struct xfs_defer_ops *dfops) { int error = 0; unsigned int bui_type; @@ -404,9 +405,7 @@ xfs_bui_recover( xfs_exntst_t state; struct xfs_trans *tp; struct xfs_inode *ip = NULL; - struct xfs_defer_ops dfops; struct xfs_bmbt_irec irec; - xfs_fsblock_t firstfsb; ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); @@ -464,7 +463,6 @@ xfs_bui_recover( if (VFS_I(ip)->i_nlink == 0) xfs_iflags_set(ip, XFS_IRECOVERY); - xfs_defer_init(&dfops, &firstfsb); /* Process deferred bmap item. */ state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? @@ -479,16 +477,16 @@ xfs_bui_recover( break; default: error = -EFSCORRUPTED; - goto err_dfops; + goto err_inode; } xfs_trans_ijoin(tp, ip, 0); count = bmap->me_len; - error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, + error = xfs_trans_log_finish_bmap_update(tp, budp, dfops, type, ip, whichfork, bmap->me_startoff, bmap->me_startblock, &count, state); if (error) - goto err_dfops; + goto err_inode; if (count > 0) { ASSERT(type == XFS_BMAP_UNMAP); @@ -496,16 +494,11 @@ xfs_bui_recover( irec.br_blockcount = count; irec.br_startoff = bmap->me_startoff; irec.br_state = state; - error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); + error = xfs_bmap_unmap_extent(tp->t_mountp, dfops, ip, &irec); if (error) - goto err_dfops; + goto err_inode; } - /* Finish transaction, free inodes. */ - error = xfs_defer_finish(&tp, &dfops); - if (error) - goto err_dfops; - set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -513,8 +506,6 @@ xfs_bui_recover( return error; -err_dfops: - xfs_defer_cancel(&dfops); err_inode: xfs_trans_cancel(tp); if (ip) { diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index c867daae4a3c..24b354a2c836 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -93,6 +93,7 @@ struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *, struct xfs_bui_log_item *); void xfs_bui_item_free(struct xfs_bui_log_item *); void xfs_bui_release(struct xfs_bui_log_item *); -int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip); +int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip, + struct xfs_defer_ops *dfops); #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 6503cfa44262..6d37ab43195f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -229,15 +229,17 @@ xfs_bmap_count_leaves( struct xfs_ifork *ifp, xfs_filblks_t *count) { + struct xfs_iext_cursor icur; struct xfs_bmbt_irec got; - xfs_extnum_t numrecs = 0, i = 0; + xfs_extnum_t numrecs = 0; - while (xfs_iext_get_extent(ifp, i++, &got)) { + for_each_xfs_iext(ifp, &icur, &got) { if (!isnullstartblock(got.br_startblock)) { *count += got.br_blockcount; numrecs++; } } + return numrecs; } @@ -405,125 +407,103 @@ xfs_bmap_count_blocks( return 0; } -/* - * returns 1 for success, 0 if we failed to map the extent. - */ -STATIC int -xfs_getbmapx_fix_eof_hole( - xfs_inode_t *ip, /* xfs incore inode pointer */ - int whichfork, - struct getbmapx *out, /* output structure */ - int prealloced, /* this is a file with - * preallocated data space */ - int64_t end, /* last block requested */ - xfs_fsblock_t startblock, - bool moretocome) +static int +xfs_getbmap_report_one( + struct xfs_inode *ip, + struct getbmapx *bmv, + struct kgetbmap *out, + int64_t bmv_end, + struct xfs_bmbt_irec *got) { - int64_t fixlen; - xfs_mount_t *mp; /* file system mount point */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last extent pointer */ - xfs_fileoff_t fileblock; - - if (startblock == HOLESTARTBLOCK) { - mp = ip->i_mount; - out->bmv_block = -1; - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); - fixlen -= out->bmv_offset; - if (prealloced && out->bmv_offset + out->bmv_length == end) { - /* Came to hole at EOF. Trim it. */ - if (fixlen <= 0) - return 0; - out->bmv_length = fixlen; - } + struct kgetbmap *p = out + bmv->bmv_entries; + bool shared = false, trimmed = false; + int error; + + error = xfs_reflink_trim_around_shared(ip, got, &shared, &trimmed); + if (error) + return error; + + if (isnullstartblock(got->br_startblock) || + got->br_startblock == DELAYSTARTBLOCK) { + /* + * Delalloc extents that start beyond EOF can occur due to + * speculative EOF allocation when the delalloc extent is larger + * than the largest freespace extent at conversion time. These + * extents cannot be converted by data writeback, so can exist + * here even if we are not supposed to be finding delalloc + * extents. + */ + if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip))) + ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0); + + p->bmv_oflags |= BMV_OF_DELALLOC; + p->bmv_block = -2; } else { - if (startblock == DELAYSTARTBLOCK) - out->bmv_block = -2; - else - out->bmv_block = xfs_fsb_to_db(ip, startblock); - fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!moretocome && - xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && - (lastx == xfs_iext_count(ifp) - 1)) - out->bmv_oflags |= BMV_OF_LAST; + p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock); } - return 1; + if (got->br_state == XFS_EXT_UNWRITTEN && + (bmv->bmv_iflags & BMV_IF_PREALLOC)) + p->bmv_oflags |= BMV_OF_PREALLOC; + + if (shared) + p->bmv_oflags |= BMV_OF_SHARED; + + p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff); + p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount); + + bmv->bmv_offset = p->bmv_offset + p->bmv_length; + bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset); + bmv->bmv_entries++; + return 0; } -/* Adjust the reported bmap around shared/unshared extent transitions. */ -STATIC int -xfs_getbmap_adjust_shared( - struct xfs_inode *ip, - int whichfork, - struct xfs_bmbt_irec *map, - struct getbmapx *out, - struct xfs_bmbt_irec *next_map) +static void +xfs_getbmap_report_hole( + struct xfs_inode *ip, + struct getbmapx *bmv, + struct kgetbmap *out, + int64_t bmv_end, + xfs_fileoff_t bno, + xfs_fileoff_t end) { - struct xfs_mount *mp = ip->i_mount; - xfs_agnumber_t agno; - xfs_agblock_t agbno; - xfs_agblock_t ebno; - xfs_extlen_t elen; - xfs_extlen_t nlen; - int error; + struct kgetbmap *p = out + bmv->bmv_entries; - next_map->br_startblock = NULLFSBLOCK; - next_map->br_startoff = NULLFILEOFF; - next_map->br_blockcount = 0; + if (bmv->bmv_iflags & BMV_IF_NO_HOLES) + return; - /* Only written data blocks can be shared. */ - if (!xfs_is_reflink_inode(ip) || - whichfork != XFS_DATA_FORK || - !xfs_bmap_is_real_extent(map)) - return 0; + p->bmv_block = -1; + p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno); + p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno); - agno = XFS_FSB_TO_AGNO(mp, map->br_startblock); - agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock); - error = xfs_reflink_find_shared(mp, NULL, agno, agbno, - map->br_blockcount, &ebno, &elen, true); - if (error) - return error; + bmv->bmv_offset = p->bmv_offset + p->bmv_length; + bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset); + bmv->bmv_entries++; +} - if (ebno == NULLAGBLOCK) { - /* No shared blocks at all. */ - return 0; - } else if (agbno == ebno) { - /* - * Shared extent at (agbno, elen). Shrink the reported - * extent length and prepare to move the start of map[i] - * to agbno+elen, with the aim of (re)formatting the new - * map[i] the next time through the inner loop. - */ - out->bmv_length = XFS_FSB_TO_BB(mp, elen); - out->bmv_oflags |= BMV_OF_SHARED; - if (elen != map->br_blockcount) { - *next_map = *map; - next_map->br_startblock += elen; - next_map->br_startoff += elen; - next_map->br_blockcount -= elen; - } - map->br_blockcount -= elen; - } else { - /* - * There's an unshared extent (agbno, ebno - agbno) - * followed by shared extent at (ebno, elen). Shrink - * the reported extent length to cover only the unshared - * extent and prepare to move up the start of map[i] to - * ebno, with the aim of (re)formatting the new map[i] - * the next time through the inner loop. - */ - *next_map = *map; - nlen = ebno - agbno; - out->bmv_length = XFS_FSB_TO_BB(mp, nlen); - next_map->br_startblock += nlen; - next_map->br_startoff += nlen; - next_map->br_blockcount -= nlen; - map->br_blockcount -= nlen; - } +static inline bool +xfs_getbmap_full( + struct getbmapx *bmv) +{ + return bmv->bmv_length == 0 || bmv->bmv_entries >= bmv->bmv_count - 1; +} - return 0; +static bool +xfs_getbmap_next_rec( + struct xfs_bmbt_irec *rec, + xfs_fileoff_t total_end) +{ + xfs_fileoff_t end = rec->br_startoff + rec->br_blockcount; + + if (end == total_end) + return false; + + rec->br_startoff += rec->br_blockcount; + if (!isnullstartblock(rec->br_startblock) && + rec->br_startblock != DELAYSTARTBLOCK) + rec->br_startblock += rec->br_blockcount; + rec->br_blockcount = total_end - end; + return true; } /* @@ -535,33 +515,22 @@ xfs_getbmap_adjust_shared( */ int /* error code */ xfs_getbmap( - xfs_inode_t *ip, + struct xfs_inode *ip, struct getbmapx *bmv, /* user bmap structure */ - xfs_bmap_format_t formatter, /* format to user */ - void *arg) /* formatter arg */ + struct kgetbmap *out) { - int64_t bmvend; /* last block requested */ - int error = 0; /* return value */ - int64_t fixlen; /* length for -1 case */ - int i; /* extent number */ - int lock; /* lock state */ - xfs_bmbt_irec_t *map; /* buffer for user's data */ - xfs_mount_t *mp; /* file system mount point */ - int nex; /* # of user extents can do */ - int subnex; /* # of bmapi's can do */ - int nmap; /* number of map entries */ - struct getbmapx *out; /* output structure */ - int whichfork; /* data or attr fork */ - int prealloced; /* this is a file with - * preallocated data space */ - int iflags; /* interface flags */ - int bmapi_flags; /* flags for xfs_bmapi */ - int cur_ext = 0; - struct xfs_bmbt_irec inject_map; - - mp = ip->i_mount; - iflags = bmv->bmv_iflags; - + struct xfs_mount *mp = ip->i_mount; + int iflags = bmv->bmv_iflags; + int whichfork, lock, error = 0; + int64_t bmv_end, max_len; + xfs_fileoff_t bno, first_bno; + struct xfs_ifork *ifp; + struct xfs_bmbt_irec got, rec; + xfs_filblks_t len; + struct xfs_iext_cursor icur; + + if (bmv->bmv_iflags & ~BMV_IF_VALID) + return -EINVAL; #ifndef DEBUG /* Only allow CoW fork queries if we're debugging. */ if (iflags & BMV_IF_COWFORK) @@ -570,89 +539,42 @@ xfs_getbmap( if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK)) return -EINVAL; + if (bmv->bmv_length < -1) + return -EINVAL; + bmv->bmv_entries = 0; + if (bmv->bmv_length == 0) + return 0; + if (iflags & BMV_IF_ATTRFORK) whichfork = XFS_ATTR_FORK; else if (iflags & BMV_IF_COWFORK) whichfork = XFS_COW_FORK; else whichfork = XFS_DATA_FORK; + ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_ilock(ip, XFS_IOLOCK_SHARED); switch (whichfork) { case XFS_ATTR_FORK: - if (XFS_IFORK_Q(ip)) { - if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && - ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) - return -EINVAL; - } else if (unlikely( - ip->i_d.di_aformat != 0 && - ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { - XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, - ip->i_mount); - return -EFSCORRUPTED; - } + if (!XFS_IFORK_Q(ip)) + goto out_unlock_iolock; - prealloced = 0; - fixlen = 1LL << 32; + max_len = 1LL << 32; + lock = xfs_ilock_attr_map_shared(ip); break; case XFS_COW_FORK: - if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS) - return -EINVAL; + /* No CoW fork? Just return */ + if (!ifp) + goto out_unlock_iolock; - if (xfs_get_cowextsz_hint(ip)) { - prealloced = 1; - fixlen = mp->m_super->s_maxbytes; - } else { - prealloced = 0; - fixlen = XFS_ISIZE(ip); - } - break; - default: - /* Local format data forks report no extents. */ - if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { - bmv->bmv_entries = 0; - return 0; - } - if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE) - return -EINVAL; + if (xfs_get_cowextsz_hint(ip)) + max_len = mp->m_super->s_maxbytes; + else + max_len = XFS_ISIZE(ip); - if (xfs_get_extsz_hint(ip) || - ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ - prealloced = 1; - fixlen = mp->m_super->s_maxbytes; - } else { - prealloced = 0; - fixlen = XFS_ISIZE(ip); - } + lock = XFS_ILOCK_SHARED; + xfs_ilock(ip, lock); break; - } - - if (bmv->bmv_length == -1) { - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); - bmv->bmv_length = - max_t(int64_t, fixlen - bmv->bmv_offset, 0); - } else if (bmv->bmv_length == 0) { - bmv->bmv_entries = 0; - return 0; - } else if (bmv->bmv_length < 0) { - return -EINVAL; - } - - nex = bmv->bmv_count - 1; - if (nex <= 0) - return -EINVAL; - bmvend = bmv->bmv_offset + bmv->bmv_length; - - - if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) - return -ENOMEM; - out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0); - if (!out) - return -ENOMEM; - - xfs_ilock(ip, XFS_IOLOCK_SHARED); - switch (whichfork) { case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) && (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { @@ -670,154 +592,105 @@ xfs_getbmap( */ } + if (xfs_get_extsz_hint(ip) || + (ip->i_d.di_flags & + (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))) + max_len = mp->m_super->s_maxbytes; + else + max_len = XFS_ISIZE(ip); + lock = xfs_ilock_data_map_shared(ip); break; - case XFS_COW_FORK: - lock = XFS_ILOCK_SHARED; - xfs_ilock(ip, lock); - break; - case XFS_ATTR_FORK: - lock = xfs_ilock_attr_map_shared(ip); - break; } - /* - * Don't let nex be bigger than the number of extents - * we can have assuming alternating holes and real extents. - */ - if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) - nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; - - bmapi_flags = xfs_bmapi_aflag(whichfork); - if (!(iflags & BMV_IF_PREALLOC)) - bmapi_flags |= XFS_BMAPI_IGSTATE; - - /* - * Allocate enough space to handle "subnex" maps at a time. - */ - error = -ENOMEM; - subnex = 16; - map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); - if (!map) + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + case XFS_DINODE_FMT_LOCAL: + /* Local format inode forks report no extents. */ goto out_unlock_ilock; + default: + error = -EINVAL; + goto out_unlock_ilock; + } - bmv->bmv_entries = 0; - - if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 && - (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) { - error = 0; - goto out_free_map; + if (bmv->bmv_length == -1) { + max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len)); + bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset); } - do { - nmap = (nex> subnex) ? subnex : nex; - error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), - XFS_BB_TO_FSB(mp, bmv->bmv_length), - map, &nmap, bmapi_flags); - if (error) - goto out_free_map; - ASSERT(nmap <= subnex); - - for (i = 0; i < nmap && bmv->bmv_length && - cur_ext < bmv->bmv_count - 1; i++) { - out[cur_ext].bmv_oflags = 0; - if (map[i].br_state == XFS_EXT_UNWRITTEN) - out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; - else if (map[i].br_startblock == DELAYSTARTBLOCK) - out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC; - out[cur_ext].bmv_offset = - XFS_FSB_TO_BB(mp, map[i].br_startoff); - out[cur_ext].bmv_length = - XFS_FSB_TO_BB(mp, map[i].br_blockcount); - out[cur_ext].bmv_unused1 = 0; - out[cur_ext].bmv_unused2 = 0; + bmv_end = bmv->bmv_offset + bmv->bmv_length; - /* - * delayed allocation extents that start beyond EOF can - * occur due to speculative EOF allocation when the - * delalloc extent is larger than the largest freespace - * extent at conversion time. These extents cannot be - * converted by data writeback, so can exist here even - * if we are not supposed to be finding delalloc - * extents. - */ - if (map[i].br_startblock == DELAYSTARTBLOCK && - map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) - ASSERT((iflags & BMV_IF_DELALLOC) != 0); - - if (map[i].br_startblock == HOLESTARTBLOCK && - whichfork == XFS_ATTR_FORK) { - /* came to the end of attribute fork */ - out[cur_ext].bmv_oflags |= BMV_OF_LAST; - goto out_free_map; - } + first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset); + len = XFS_BB_TO_FSB(mp, bmv->bmv_length); - /* Is this a shared block? */ - error = xfs_getbmap_adjust_shared(ip, whichfork, - &map[i], &out[cur_ext], &inject_map); - if (error) - goto out_free_map; + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, whichfork); + if (error) + goto out_unlock_ilock; + } - if (!xfs_getbmapx_fix_eof_hole(ip, whichfork, - &out[cur_ext], prealloced, bmvend, - map[i].br_startblock, - inject_map.br_startblock != NULLFSBLOCK)) - goto out_free_map; + if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) { + /* + * Report a whole-file hole if the delalloc flag is set to + * stay compatible with the old implementation. + */ + if (iflags & BMV_IF_DELALLOC) + xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno, + XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); + goto out_unlock_ilock; + } - bmv->bmv_offset = - out[cur_ext].bmv_offset + - out[cur_ext].bmv_length; - bmv->bmv_length = - max_t(int64_t, 0, bmvend - bmv->bmv_offset); + while (!xfs_getbmap_full(bmv)) { + xfs_trim_extent(&got, first_bno, len); - /* - * In case we don't want to return the hole, - * don't increase cur_ext so that we can reuse - * it in the next loop. - */ - if ((iflags & BMV_IF_NO_HOLES) && - map[i].br_startblock == HOLESTARTBLOCK) { - memset(&out[cur_ext], 0, sizeof(out[cur_ext])); - continue; - } + /* + * Report an entry for a hole if this extent doesn't directly + * follow the previous one. + */ + if (got.br_startoff > bno) { + xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno, + got.br_startoff); + if (xfs_getbmap_full(bmv)) + break; + } - /* - * In order to report shared extents accurately, - * we report each distinct shared/unshared part - * of a single bmbt record using multiple bmap - * extents. To make that happen, we iterate the - * same map array item multiple times, each - * time trimming out the subextent that we just - * reported. - * - * Because of this, we must check the out array - * index (cur_ext) directly against bmv_count-1 - * to avoid overflows. - */ - if (inject_map.br_startblock != NULLFSBLOCK) { - map[i] = inject_map; - i--; + /* + * In order to report shared extents accurately, we report each + * distinct shared / unshared part of a single bmbt record with + * an individual getbmapx record. + */ + bno = got.br_startoff + got.br_blockcount; + rec = got; + do { + error = xfs_getbmap_report_one(ip, bmv, out, bmv_end, + &rec); + if (error || xfs_getbmap_full(bmv)) + goto out_unlock_ilock; + } while (xfs_getbmap_next_rec(&rec, bno)); + + if (!xfs_iext_next_extent(ifp, &icur, &got)) { + xfs_fileoff_t end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + + out[bmv->bmv_entries - 1].bmv_oflags |= BMV_OF_LAST; + + if (whichfork != XFS_ATTR_FORK && bno < end && + !xfs_getbmap_full(bmv)) { + xfs_getbmap_report_hole(ip, bmv, out, bmv_end, + bno, end); } - bmv->bmv_entries++; - cur_ext++; + break; } - } while (nmap && bmv->bmv_length && cur_ext < bmv->bmv_count - 1); - out_free_map: - kmem_free(map); - out_unlock_ilock: - xfs_iunlock(ip, lock); - out_unlock_iolock: - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - - for (i = 0; i < cur_ext; i++) { - /* format results & advance arg */ - error = formatter(&arg, &out[i]); - if (error) + if (bno >= first_bno + len) break; } - kmem_free(out); +out_unlock_ilock: + xfs_iunlock(ip, lock); +out_unlock_iolock: + xfs_iunlock(ip, XFS_IOLOCK_SHARED); return error; } @@ -1389,53 +1262,12 @@ out: } -/* - * @next_fsb will keep track of the extent currently undergoing shift. - * @stop_fsb will keep track of the extent at which we have to stop. - * If we are shifting left, we will start with block (offset + len) and - * shift each extent till last extent. - * If we are shifting right, we will start with last extent inside file space - * and continue until we reach the block corresponding to offset. - */ static int -xfs_shift_file_space( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t len, - enum shift_direction direction) +xfs_prepare_shift( + struct xfs_inode *ip, + loff_t offset) { - int done = 0; - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; int error; - struct xfs_defer_ops dfops; - xfs_fsblock_t first_block; - xfs_fileoff_t stop_fsb; - xfs_fileoff_t next_fsb; - xfs_fileoff_t shift_fsb; - uint resblks; - - ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); - - if (direction == SHIFT_LEFT) { - /* - * Reserve blocks to cover potential extent merges after left - * shift operations. - */ - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - next_fsb = XFS_B_TO_FSB(mp, offset + len); - stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); - } else { - /* - * If right shift, delegate the work of initialization of - * next_fsb to xfs_bmap_shift_extent as it has ilock held. - */ - resblks = 0; - next_fsb = NULLFSBLOCK; - stop_fsb = XFS_B_TO_FSB(mp, offset); - } - - shift_fsb = XFS_B_TO_FSB(mp, len); /* * Trim eofblocks to avoid shifting uninitialized post-eof preallocation @@ -1451,8 +1283,7 @@ xfs_shift_file_space( * Writeback and invalidate cache for the remainder of the file as we're * about to shift down every extent from offset to EOF. */ - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - offset, -1); + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1); if (error) return error; error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, @@ -1472,16 +1303,50 @@ xfs_shift_file_space( return error; } - /* - * The extent shifting code works on extent granularity. So, if - * stop_fsb is not the starting block of extent, we need to split - * the extent at stop_fsb. - */ - if (direction == SHIFT_RIGHT) { - error = xfs_bmap_split_extent(ip, stop_fsb); - if (error) - return error; - } + return 0; +} + +/* + * xfs_collapse_file_space() + * This routine frees disk space and shift extent for the given file. + * The first thing we do is to free data blocks in the specified range + * by calling xfs_free_file_space(). It would also sync dirty data + * and invalidate page cache over the region on which collapse range + * is working. And Shift extent records to the left to cover a hole. + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_collapse_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + struct xfs_defer_ops dfops; + xfs_fsblock_t first_block; + xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); + xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len); + xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); + uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + bool done = false; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); + + trace_xfs_collapse_file_space(ip); + + error = xfs_free_file_space(ip, offset, len); + if (error) + return error; + + error = xfs_prepare_shift(ip, offset); + if (error) + return error; while (!error && !done) { error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, @@ -1495,25 +1360,17 @@ xfs_shift_file_space( XFS_QMOPT_RES_REGBLKS); if (error) goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_defer_init(&dfops, &first_block); - - /* - * We are using the write transaction in which max 2 bmbt - * updates are allowed - */ - error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb, - &done, stop_fsb, &first_block, &dfops, - direction, XFS_BMAP_MAX_SHIFT_EXTENTS); + error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb, + &done, stop_fsb, &first_block, &dfops); if (error) goto out_bmap_cancel; error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp); } @@ -1527,36 +1384,6 @@ out_trans_cancel: } /* - * xfs_collapse_file_space() - * This routine frees disk space and shift extent for the given file. - * The first thing we do is to free data blocks in the specified range - * by calling xfs_free_file_space(). It would also sync dirty data - * and invalidate page cache over the region on which collapse range - * is working. And Shift extent records to the left to cover a hole. - * RETURNS: - * 0 on success - * errno on error - * - */ -int -xfs_collapse_file_space( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t len) -{ - int error; - - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - trace_xfs_collapse_file_space(ip); - - error = xfs_free_file_space(ip, offset, len); - if (error) - return error; - - return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT); -} - -/* * xfs_insert_file_space() * This routine create hole space by shifting extents for the given file. * The first thing we do is to sync dirty data and invalidate page cache @@ -1574,10 +1401,60 @@ xfs_insert_file_space( loff_t offset, loff_t len) { + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + struct xfs_defer_ops dfops; + xfs_fsblock_t first_block; + xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, offset); + xfs_fileoff_t next_fsb = NULLFSBLOCK; + xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); + bool done = false; + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); + trace_xfs_insert_file_space(ip); - return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT); + error = xfs_prepare_shift(ip, offset); + if (error) + return error; + + /* + * The extent shifting code works on extent granularity. So, if stop_fsb + * is not the starting block of extent, we need to split the extent at + * stop_fsb. + */ + error = xfs_bmap_split_extent(ip, stop_fsb); + if (error) + return error; + + while (!error && !done) { + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, + &tp); + if (error) + break; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_defer_init(&dfops, &first_block); + error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb, + &done, stop_fsb, &first_block, &dfops); + if (error) + goto out_bmap_cancel; + + error = xfs_defer_finish(&tp, &dfops); + if (error) + goto out_bmap_cancel; + error = xfs_trans_commit(tp); + } + + return error; + +out_bmap_cancel: + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + return error; } /* @@ -1832,7 +1709,6 @@ xfs_swap_extent_forks( xfs_filblks_t aforkblks = 0; xfs_filblks_t taforkblks = 0; xfs_extnum_t junk; - xfs_extnum_t nextents; uint64_t tmp; int error; @@ -1907,13 +1783,6 @@ xfs_swap_extent_forks( switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* - * If the extents fit in the inode, fix the pointer. Otherwise - * it's already NULL or pointing to the extent. - */ - nextents = xfs_iext_count(&ip->i_df); - if (nextents <= XFS_INLINE_EXTS) - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: @@ -1925,13 +1794,6 @@ xfs_swap_extent_forks( switch (tip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* - * If the extents fit in the inode, fix the pointer. Otherwise - * it's already NULL or pointing to the extent. - */ - nextents = xfs_iext_count(&tip->i_df); - if (nextents <= XFS_INLINE_EXTS) - tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; (*target_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 7d330b3c77c3..4d4ae48bd4f6 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -47,10 +47,14 @@ int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); -/* bmap to userspace formatter - copy to user & advance pointer */ -typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *); +struct kgetbmap { + __s64 bmv_offset; /* file offset of segment in blocks */ + __s64 bmv_block; /* starting block (64-bit daddr_t) */ + __s64 bmv_length; /* length of segment, blocks */ + __s32 bmv_oflags; /* output flags */ +}; int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, - xfs_bmap_format_t formatter, void *arg); + struct kgetbmap *out); /* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 2f97c12ca75e..4c6e86d861fd 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -42,6 +42,8 @@ #include "xfs_mount.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_errortag.h" +#include "xfs_error.h" static kmem_zone_t *xfs_buf_zone; @@ -1813,22 +1815,27 @@ xfs_alloc_buftarg( btp->bt_daxdev = dax_dev; if (xfs_setsize_buftarg_early(btp, bdev)) - goto error; + goto error_free; if (list_lru_init(&btp->bt_lru)) - goto error; + goto error_free; if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) - goto error; + goto error_lru; btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; - register_shrinker(&btp->bt_shrinker); + if (register_shrinker(&btp->bt_shrinker)) + goto error_pcpu; return btp; -error: +error_pcpu: + percpu_counter_destroy(&btp->bt_io_count); +error_lru: + list_lru_destroy(&btp->bt_lru); +error_free: kmem_free(btp); return NULL; } @@ -2129,3 +2136,17 @@ xfs_buf_terminate(void) { kmem_zone_destroy(xfs_buf_zone); } + +void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) +{ + /* + * Set the lru reference count to 0 based on the error injection tag. + * This allows userspace to disrupt buffer caching for debug/testing + * purposes. + */ + if (XFS_TEST_ERROR(false, bp->b_target->bt_mount, + XFS_ERRTAG_BUF_LRU_REF)) + lru_ref = 0; + + atomic_set(&bp->b_lru_ref, lru_ref); +} diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index bf71507ddb16..f873bb786824 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -352,10 +352,7 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn) #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno)) -static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) -{ - atomic_set(&bp->b_lru_ref, lru_ref); -} +void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref); static inline int xfs_buf_ispinned(struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index ba2638d37031..0c58918bc0ad 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -41,7 +41,7 @@ static unsigned char xfs_dir3_filetype_table[] = { DT_FIFO, DT_SOCK, DT_LNK, DT_WHT, }; -static unsigned char +unsigned char xfs_dir3_get_dtype( struct xfs_mount *mp, uint8_t filetype) @@ -266,7 +266,7 @@ xfs_dir2_leaf_readbuf( xfs_dablk_t next_ra; xfs_dablk_t map_off; xfs_dablk_t last_da; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; int ra_want; int error = 0; @@ -283,7 +283,7 @@ xfs_dir2_leaf_readbuf( */ last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET); map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off)); - if (!xfs_iext_lookup_extent(dp, ifp, map_off, &idx, &map)) + if (!xfs_iext_lookup_extent(dp, ifp, map_off, &icur, &map)) goto out; if (map.br_startoff >= last_da) goto out; @@ -311,7 +311,7 @@ xfs_dir2_leaf_readbuf( if (next_ra >= last_da) goto out_no_ra; if (map.br_blockcount < geo->fsbcount && - !xfs_iext_get_extent(ifp, ++idx, &map)) + !xfs_iext_next_extent(ifp, &icur, &map)) goto out_no_ra; if (map.br_startoff >= last_da) goto out_no_ra; @@ -334,7 +334,7 @@ xfs_dir2_leaf_readbuf( ra_want -= geo->fsbcount; next_ra += geo->fsbcount; } - if (!xfs_iext_get_extent(ifp, ++idx, &map)) { + if (!xfs_iext_next_extent(ifp, &icur, &map)) { *ra_blk = last_da; break; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index cd82429d8df7..f248708c10ff 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -53,13 +53,6 @@ * otherwise by the lowest id first, see xfs_dqlock2. */ -#ifdef DEBUG -xfs_buftarg_t *xfs_dqerror_target; -int xfs_do_dqerror; -int xfs_dqreq_num; -int xfs_dqerror_mod = 33; -#endif - struct kmem_zone *xfs_qm_dqtrxzone; static struct kmem_zone *xfs_qm_dqzone; @@ -703,7 +696,7 @@ xfs_dq_get_next_id( xfs_dqid_t next_id = *id + 1; /* simple advance */ uint lock_flags; struct xfs_bmbt_irec got; - xfs_extnum_t idx; + struct xfs_iext_cursor cur; xfs_fsblock_t start; int error = 0; @@ -727,7 +720,7 @@ xfs_dq_get_next_id( return error; } - if (xfs_iext_lookup_extent(quotip, "ip->i_df, start, &idx, &got)) { + if (xfs_iext_lookup_extent(quotip, "ip->i_df, start, &cur, &got)) { /* contiguous chunk, bump startoff for the id calculation */ if (got.br_startoff < start) got.br_startoff = start; @@ -770,15 +763,6 @@ xfs_qm_dqget( return -ESRCH; } -#ifdef DEBUG - if (xfs_do_dqerror) { - if ((xfs_dqerror_target == mp->m_ddev_targp) && - (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { - xfs_debug(mp, "Returning error in dqget"); - return -EIO; - } - } - ASSERT(type == XFS_DQ_USER || type == XFS_DQ_PROJ || type == XFS_DQ_GROUP); @@ -786,7 +770,6 @@ xfs_qm_dqget( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(xfs_inode_dquot(ip, type) == NULL); } -#endif restart: mutex_lock(&qi->qi_tree_lock); @@ -987,14 +970,22 @@ xfs_qm_dqflush_done( * holding the lock before removing the dquot from the AIL. */ if ((lip->li_flags & XFS_LI_IN_AIL) && - lip->li_lsn == qip->qli_flush_lsn) { + ((lip->li_lsn == qip->qli_flush_lsn) || + (lip->li_flags & XFS_LI_FAILED))) { /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->xa_lock); - if (lip->li_lsn == qip->qli_flush_lsn) + if (lip->li_lsn == qip->qli_flush_lsn) { xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); - else + } else { + /* + * Clear the failed state since we are about to drop the + * flush lock + */ + if (lip->li_flags & XFS_LI_FAILED) + xfs_clear_li_failed(lip); spin_unlock(&ailp->xa_lock); + } } /* diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 2c7a1629e064..664dea105e76 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -137,6 +137,26 @@ xfs_qm_dqunpin_wait( wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } +/* + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer + * have been failed during writeback + * + * this informs the AIL that the dquot is already flush locked on the next push, + * and acquires a hold on the buffer to ensure that it isn't reclaimed before + * dirty data makes it to disk. + */ +STATIC void +xfs_dquot_item_error( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + struct xfs_dquot *dqp; + + dqp = DQUOT_ITEM(lip)->qli_dquot; + ASSERT(!completion_done(&dqp->q_flush)); + xfs_set_li_failed(lip, bp); +} + STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, @@ -144,13 +164,28 @@ xfs_qm_dquot_logitem_push( __acquires(&lip->li_ailp->xa_lock) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp = lip->li_buf; uint rval = XFS_ITEM_SUCCESS; int error; if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; + /* + * The buffer containing this item failed to be written back + * previously. Resubmit the buffer for IO + */ + if (lip->li_flags & XFS_LI_FAILED) { + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) + rval = XFS_ITEM_FLUSHING; + + xfs_buf_unlock(bp); + return rval; + } + if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; @@ -242,7 +277,8 @@ static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_unlock = xfs_qm_dquot_logitem_unlock, .iop_committed = xfs_qm_dquot_logitem_committed, .iop_push = xfs_qm_dquot_logitem_push, - .iop_committing = xfs_qm_dquot_logitem_committing + .iop_committing = xfs_qm_dquot_logitem_committing, + .iop_error = xfs_dquot_item_error }; /* diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index eaf86f55b7f2..4c9f35d983b2 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -21,6 +21,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_sysfs.h" @@ -58,6 +59,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_DROP_WRITES, XFS_RANDOM_LOG_BAD_CRC, XFS_RANDOM_LOG_ITEM_PIN, + XFS_RANDOM_BUF_LRU_REF, }; struct xfs_errortag_attr { @@ -163,6 +165,7 @@ XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); +XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -196,10 +199,11 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(drop_writes), XFS_ERRORTAG_ATTR_LIST(log_bad_crc), XFS_ERRORTAG_ATTR_LIST(log_item_pin), + XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), NULL, }; -struct kobj_type xfs_errortag_ktype = { +static struct kobj_type xfs_errortag_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_errortag_sysfs_ops, .default_attrs = xfs_errortag_attrs, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 7c4bef3bddb7..ea816c1bf8db 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -63,87 +63,6 @@ extern void xfs_verifier_error(struct xfs_buf *bp); } \ } -/* - * error injection tags - the labels can be anything you want - * but each tag should have its own unique number - */ - -#define XFS_ERRTAG_NOERROR 0 -#define XFS_ERRTAG_IFLUSH_1 1 -#define XFS_ERRTAG_IFLUSH_2 2 -#define XFS_ERRTAG_IFLUSH_3 3 -#define XFS_ERRTAG_IFLUSH_4 4 -#define XFS_ERRTAG_IFLUSH_5 5 -#define XFS_ERRTAG_IFLUSH_6 6 -#define XFS_ERRTAG_DA_READ_BUF 7 -#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8 -#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9 -#define XFS_ERRTAG_ALLOC_READ_AGF 10 -#define XFS_ERRTAG_IALLOC_READ_AGI 11 -#define XFS_ERRTAG_ITOBP_INOTOBP 12 -#define XFS_ERRTAG_IUNLINK 13 -#define XFS_ERRTAG_IUNLINK_REMOVE 14 -#define XFS_ERRTAG_DIR_INO_VALIDATE 15 -#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16 -#define XFS_ERRTAG_IODONE_IOERR 17 -#define XFS_ERRTAG_STRATREAD_IOERR 18 -#define XFS_ERRTAG_STRATCMPL_IOERR 19 -#define XFS_ERRTAG_DIOWRITE_IOERR 20 -#define XFS_ERRTAG_BMAPIFORMAT 21 -#define XFS_ERRTAG_FREE_EXTENT 22 -#define XFS_ERRTAG_RMAP_FINISH_ONE 23 -#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24 -#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 -#define XFS_ERRTAG_BMAP_FINISH_ONE 26 -#define XFS_ERRTAG_AG_RESV_CRITICAL 27 -/* - * DEBUG mode instrumentation to test and/or trigger delayed allocation - * block killing in the event of failed writes. When enabled, all - * buffered writes are silenty dropped and handled as if they failed. - * All delalloc blocks in the range of the write (including pre-existing - * delalloc blocks!) are tossed as part of the write failure error - * handling sequence. - */ -#define XFS_ERRTAG_DROP_WRITES 28 -#define XFS_ERRTAG_LOG_BAD_CRC 29 -#define XFS_ERRTAG_LOG_ITEM_PIN 30 -#define XFS_ERRTAG_MAX 31 - -/* - * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. - */ -#define XFS_RANDOM_DEFAULT 100 -#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4) -#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT -#define XFS_RANDOM_FREE_EXTENT 1 -#define XFS_RANDOM_RMAP_FINISH_ONE 1 -#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1 -#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 -#define XFS_RANDOM_BMAP_FINISH_ONE 1 -#define XFS_RANDOM_AG_RESV_CRITICAL 4 -#define XFS_RANDOM_DROP_WRITES 1 -#define XFS_RANDOM_LOG_BAD_CRC 1 -#define XFS_RANDOM_LOG_ITEM_PIN 1 - #ifdef DEBUG extern int xfs_errortag_init(struct xfs_mount *mp); extern void xfs_errortag_del(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 44f8c5451210..64da90655e95 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -538,7 +538,7 @@ xfs_efi_recover( return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - xfs_rmap_skip_owner_update(&oinfo); + xfs_rmap_any_owner_update(&oinfo); for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &efip->efi_format.efi_extents[i]; error = xfs_trans_free_extent(tp, efdp, extp->ext_start, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 6526ef0e2a23..8601275cc5e6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -44,6 +44,7 @@ #include <linux/falloc.h> #include <linux/pagevec.h> #include <linux/backing-dev.h> +#include <linux/mman.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -984,7 +985,7 @@ xfs_file_readdir( * point we can change the ->readdir prototype to include the * buffer size. For now we use the current glibc buffer size. */ - bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); + bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size); return xfs_readdir(NULL, ip, ctx, bufsize); } @@ -1045,7 +1046,11 @@ __xfs_filemap_fault( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); + pfn_t pfn; + + ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops); + if (ret & VM_FAULT_NEEDDSYNC) + ret = dax_finish_sync_fault(vmf, pe_size, pfn); } else { if (write_fault) ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); @@ -1090,37 +1095,16 @@ xfs_filemap_page_mkwrite( } /* - * pfn_mkwrite was originally inteneded to ensure we capture time stamp - * updates on write faults. In reality, it's need to serialise against - * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED - * to ensure we serialise the fault barrier in place. + * pfn_mkwrite was originally intended to ensure we capture time stamp updates + * on write faults. In reality, it needs to serialise against truncate and + * prepare memory for writing so handle is as standard write fault. */ static int xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { - struct inode *inode = file_inode(vmf->vma->vm_file); - struct xfs_inode *ip = XFS_I(inode); - int ret = VM_FAULT_NOPAGE; - loff_t size; - - trace_xfs_filemap_pfn_mkwrite(ip); - - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - - /* check if the faulting page hasn't raced with truncate */ - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else if (IS_DAX(inode)) - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); - sb_end_pagefault(inode->i_sb); - return ret; - + return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } static const struct vm_operations_struct xfs_file_vm_ops = { @@ -1136,6 +1120,13 @@ xfs_file_mmap( struct file *filp, struct vm_area_struct *vma) { + /* + * We don't support synchronous mappings for non-DAX files. At least + * until someone comes with a sensible use case. + */ + if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC)) + return -EOPNOTSUPP; + file_accessed(filp); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(file_inode(filp))) @@ -1154,6 +1145,7 @@ const struct file_operations xfs_file_operations = { .compat_ioctl = xfs_file_compat_ioctl, #endif .mmap = xfs_file_mmap, + .mmap_supported_flags = MAP_SYNC, .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 8f22fc579dbb..60a2e128cb6a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -571,6 +571,11 @@ xfs_growfs_data_private( * this doesn't actually exist in the rmap btree. */ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); + error = xfs_rmap_free(tp, bp, agno, + be32_to_cpu(agf->agf_length) - new, + new, &oinfo); + if (error) + goto error0; error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, be32_to_cpu(agf->agf_length) - new), diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 34227115a5d6..3861d61fb265 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -610,7 +610,7 @@ again: } else { rcu_read_unlock(); if (flags & XFS_IGET_INCORE) { - error = -ENOENT; + error = -ENODATA; goto out_error_or_again; } XFS_STATS_INC(mp, xs_ig_missed); @@ -870,7 +870,7 @@ xfs_eofblocks_worker( * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). * (We'll just piggyback on the post-EOF prealloc space workqueue.) */ -STATIC void +void xfs_queue_cowblocks( struct xfs_mount *mp) { @@ -1536,8 +1536,23 @@ xfs_inode_free_quota_eofblocks( return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); } +static inline unsigned long +xfs_iflag_for_tag( + int tag) +{ + switch (tag) { + case XFS_ICI_EOFBLOCKS_TAG: + return XFS_IEOFBLOCKS; + case XFS_ICI_COWBLOCKS_TAG: + return XFS_ICOWBLOCKS; + default: + ASSERT(0); + return 0; + } +} + static void -__xfs_inode_set_eofblocks_tag( +__xfs_inode_set_blocks_tag( xfs_inode_t *ip, void (*execute)(struct xfs_mount *mp), void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -1552,10 +1567,10 @@ __xfs_inode_set_eofblocks_tag( * Don't bother locking the AG and looking up in the radix trees * if we already know that we have the tag set. */ - if (ip->i_flags & XFS_IEOFBLOCKS) + if (ip->i_flags & xfs_iflag_for_tag(tag)) return; spin_lock(&ip->i_flags_lock); - ip->i_flags |= XFS_IEOFBLOCKS; + ip->i_flags |= xfs_iflag_for_tag(tag); spin_unlock(&ip->i_flags_lock); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); @@ -1587,13 +1602,13 @@ xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_eofblocks_tag(ip); - return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks, + return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks, trace_xfs_perag_set_eofblocks, XFS_ICI_EOFBLOCKS_TAG); } static void -__xfs_inode_clear_eofblocks_tag( +__xfs_inode_clear_blocks_tag( xfs_inode_t *ip, void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, int error, unsigned long caller_ip), @@ -1603,7 +1618,7 @@ __xfs_inode_clear_eofblocks_tag( struct xfs_perag *pag; spin_lock(&ip->i_flags_lock); - ip->i_flags &= ~XFS_IEOFBLOCKS; + ip->i_flags &= ~xfs_iflag_for_tag(tag); spin_unlock(&ip->i_flags_lock); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); @@ -1630,7 +1645,7 @@ xfs_inode_clear_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_eofblocks_tag(ip); - return __xfs_inode_clear_eofblocks_tag(ip, + return __xfs_inode_clear_blocks_tag(ip, trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); } @@ -1724,7 +1739,7 @@ xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_cowblocks_tag(ip); - return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks, + return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks, trace_xfs_perag_set_cowblocks, XFS_ICI_COWBLOCKS_TAG); } @@ -1734,6 +1749,6 @@ xfs_inode_clear_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_cowblocks_tag(ip); - return __xfs_inode_clear_eofblocks_tag(ip, + return __xfs_inode_clear_blocks_tag(ip, trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index bff4d85e5498..d4a77588eca1 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -81,6 +81,7 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); void xfs_cowblocks_worker(struct work_struct *); +void xfs_queue_cowblocks(struct xfs_mount *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4ec5b7f45401..6f95bdb408ce 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -39,6 +39,7 @@ #include "xfs_ialloc.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_filestream.h" @@ -384,14 +385,6 @@ xfs_isilocked( } #endif -#ifdef DEBUG -int xfs_locked_n; -int xfs_small_retries; -int xfs_middle_retries; -int xfs_lots_retries; -int xfs_lock_delays; -#endif - /* * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined @@ -544,24 +537,11 @@ again: if ((attempts % 5) == 0) { delay(1); /* Don't just spin the CPU */ -#ifdef DEBUG - xfs_lock_delays++; -#endif } i = 0; try_lock = 0; goto again; } - -#ifdef DEBUG - if (attempts) { - if (attempts < 5) xfs_small_retries++; - else if (attempts < 100) xfs_middle_retries++; - else xfs_lots_retries++; - } else { - xfs_locked_n++; - } -#endif } /* @@ -767,9 +747,8 @@ xfs_ialloc( xfs_inode_t *pip, umode_t mode, xfs_nlink_t nlink, - xfs_dev_t rdev, + dev_t rdev, prid_t prid, - int okalloc, xfs_buf_t **ialloc_context, xfs_inode_t **ipp) { @@ -785,7 +764,7 @@ xfs_ialloc( * Call the space management code to pick * the on-disk inode to be allocated. */ - error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, + error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, ialloc_context, &ino); if (error) return error; @@ -819,6 +798,7 @@ xfs_ialloc( set_nlink(inode, nlink); ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); + inode->i_rdev = rdev; xfs_set_projid(ip, prid); if (pip && XFS_INHERIT_GID(pip)) { @@ -867,7 +847,6 @@ xfs_ialloc( case S_IFBLK: case S_IFSOCK: ip->i_d.di_format = XFS_DINODE_FMT_DEV; - ip->i_df.if_u2.if_rdev = rdev; ip->i_df.if_flags = 0; flags |= XFS_ILOG_DEV; break; @@ -933,7 +912,7 @@ xfs_ialloc( ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_flags = XFS_IFEXTENTS; ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0; - ip->i_df.if_u1.if_extents = NULL; + ip->i_df.if_u1.if_root = NULL; break; default: ASSERT(0); @@ -975,9 +954,8 @@ xfs_dir_ialloc( the inode. */ umode_t mode, xfs_nlink_t nlink, - xfs_dev_t rdev, + dev_t rdev, prid_t prid, /* project id */ - int okalloc, /* ok to allocate new space */ xfs_inode_t **ipp, /* pointer to inode; it will be locked. */ int *committed) @@ -1008,8 +986,8 @@ xfs_dir_ialloc( * transaction commit so that no other process can steal * the inode(s) that we've just allocated. */ - code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, - &ialloc_context, &ip); + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context, + &ip); /* * Return an error if we were unable to allocate a new inode. @@ -1081,7 +1059,7 @@ xfs_dir_ialloc( * this call should always succeed. */ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, - okalloc, &ialloc_context, &ip); + &ialloc_context, &ip); /* * If we get an error at this point, return to the caller @@ -1147,7 +1125,7 @@ xfs_create( xfs_inode_t *dp, struct xfs_name *name, umode_t mode, - xfs_dev_t rdev, + dev_t rdev, xfs_inode_t **ipp) { int is_dir = S_ISDIR(mode); @@ -1183,7 +1161,6 @@ xfs_create( return error; if (is_dir) { - rdev = 0; resblks = XFS_MKDIR_SPACE_RES(mp, name->len); tres = &M_RES(mp)->tr_mkdir; } else { @@ -1203,11 +1180,6 @@ xfs_create( xfs_flush_inodes(mp); error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); } - if (error == -ENOSPC) { - /* No space at all so try a "no-allocation" reservation */ - resblks = 0; - error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); - } if (error) goto out_release_inode; @@ -1224,19 +1196,13 @@ xfs_create( if (error) goto out_trans_cancel; - if (!resblks) { - error = xfs_dir_canenter(tp, dp, name); - if (error) - goto out_trans_cancel; - } - /* * A newly created regular or special file just has one directory * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, - prid, resblks > 0, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip, + NULL); if (error) goto out_trans_cancel; @@ -1361,11 +1327,6 @@ xfs_create_tmpfile( tres = &M_RES(mp)->tr_create_tmpfile; error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); - if (error == -ENOSPC) { - /* No space at all so try a "no-allocation" reservation */ - resblks = 0; - error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); - } if (error) goto out_release_inode; @@ -1374,8 +1335,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, - prid, resblks > 0, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip, NULL); if (error) goto out_trans_cancel; @@ -1527,6 +1487,24 @@ xfs_link( return error; } +/* Clear the reflink flag and the cowblocks tag if possible. */ +static void +xfs_itruncate_clear_reflink_flags( + struct xfs_inode *ip) +{ + struct xfs_ifork *dfork; + struct xfs_ifork *cfork; + + if (!xfs_is_reflink_inode(ip)) + return; + dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK); + if (dfork->if_bytes == 0 && cfork->if_bytes == 0) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + if (cfork->if_bytes == 0) + xfs_inode_clear_cowblocks_tag(ip); +} + /* * Free up the underlying blocks past new_size. The new size must be smaller * than the current size. This routine can be used both for the attribute and @@ -1623,15 +1601,7 @@ xfs_itruncate_extents( if (error) goto out; - /* - * Clear the reflink flag if there are no data fork blocks and - * there are no extents staged in the cow fork. - */ - if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) { - if (ip->i_d.di_nblocks == 0) - ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; - xfs_inode_clear_cowblocks_tag(ip); - } + xfs_itruncate_clear_reflink_flags(ip); /* * Always re-log the inode so that our permanent transaction can keep @@ -2378,6 +2348,7 @@ retry: */ if (ip->i_ino != inum + i) { xfs_iunlock(ip, XFS_ILOCK_EXCL); + rcu_read_unlock(); continue; } } @@ -2421,6 +2392,24 @@ retry: } /* + * Free any local-format buffers sitting around before we reset to + * extents format. + */ +static inline void +xfs_ifree_local_data( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return; + + ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); +} + +/* * This is called to return an inode to the inode free list. * The inode should already be truncated to 0 length and have * no pages associated with it. This routine also assumes that @@ -2457,6 +2446,9 @@ xfs_ifree( if (error) return error; + xfs_ifree_local_data(ip, XFS_DATA_FORK); + xfs_ifree_local_data(ip, XFS_ATTR_FORK); + VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0ee453de239a..d383e392ec9d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -232,6 +232,7 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) * log recovery to replay a bmap operation on the inode. */ #define XFS_IRECOVERY (1 << 11) +#define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */ /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during @@ -391,7 +392,7 @@ void xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct xfs_inode *dp, struct xfs_name *name, - umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); + umode_t mode, dev_t rdev, struct xfs_inode **ipp); int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, @@ -428,7 +429,7 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, - xfs_nlink_t, xfs_dev_t, prid_t, int, + xfs_nlink_t, dev_t, prid_t, struct xfs_inode **, int *); /* from xfs_file.c */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 9bbc2d7cc8cb..6ee5c3bf19ad 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -72,7 +72,6 @@ xfs_inode_item_data_fork_size( break; case XFS_DINODE_FMT_DEV: - case XFS_DINODE_FMT_UUID: break; default: ASSERT(0); @@ -156,15 +155,13 @@ xfs_inode_item_format_data_fork( switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: iip->ili_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEV | XFS_ILOG_UUID); + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DEXT) && ip->i_d.di_nextents > 0 && ip->i_df.if_bytes > 0) { struct xfs_bmbt_rec *p; - ASSERT(ip->i_df.if_u1.if_extents != NULL); ASSERT(xfs_iext_count(&ip->i_df) > 0); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); @@ -181,8 +178,7 @@ xfs_inode_item_format_data_fork( break; case XFS_DINODE_FMT_BTREE: iip->ili_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | - XFS_ILOG_DEV | XFS_ILOG_UUID); + ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { @@ -200,8 +196,7 @@ xfs_inode_item_format_data_fork( break; case XFS_DINODE_FMT_LOCAL: iip->ili_fields &= - ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | - XFS_ILOG_DEV | XFS_ILOG_UUID); + ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { /* @@ -224,17 +219,9 @@ xfs_inode_item_format_data_fork( break; case XFS_DINODE_FMT_DEV: iip->ili_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEXT | XFS_ILOG_UUID); + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT); if (iip->ili_fields & XFS_ILOG_DEV) - ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev; - break; - case XFS_DINODE_FMT_UUID: - iip->ili_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEXT | XFS_ILOG_DEV); - if (iip->ili_fields & XFS_ILOG_UUID) - ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid; + ilf->ilf_u.ilfu_rdev = sysv_encode_dev(VFS_I(ip)->i_rdev); break; default: ASSERT(0); @@ -264,7 +251,6 @@ xfs_inode_item_format_attr_fork( ASSERT(xfs_iext_count(ip->i_afp) == ip->i_d.di_anextents); - ASSERT(ip->i_afp->if_u1.if_extents != NULL); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); @@ -441,7 +427,7 @@ xfs_inode_item_format( ilf->ilf_dsize = 0; ilf->ilf_asize = 0; ilf->ilf_pad = 0; - uuid_copy(&ilf->ilf_u.ilfu_uuid, &uuid_null); + memset(&ilf->ilf_u, 0, sizeof(ilf->ilf_u)); xlog_finish_iovec(lv, vecp, sizeof(*ilf)); @@ -892,8 +878,7 @@ xfs_inode_item_format_convert( in_f->ilf_asize = in_f32->ilf_asize; in_f->ilf_dsize = in_f32->ilf_dsize; in_f->ilf_ino = in_f32->ilf_ino; - /* copy biggest field of ilf_u */ - uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid); + memcpy(&in_f->ilf_u, &in_f32->ilf_u, sizeof(in_f->ilf_u)); in_f->ilf_blkno = in_f32->ilf_blkno; in_f->ilf_len = in_f32->ilf_len; in_f->ilf_boffset = in_f32->ilf_boffset; diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 4c7722e325b3..b72373a33cd9 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -48,7 +48,7 @@ extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); extern void xfs_iflush_abort(struct xfs_inode *, bool); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, - xfs_inode_log_format_t *); + struct xfs_inode_log_format *); extern struct kmem_zone *xfs_ili_zone; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index aa75389be8cf..20dc65fef6a4 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -44,6 +44,7 @@ #include "xfs_btree.h" #include <linux/fsmap.h> #include "xfs_fsmap.h" +#include "scrub/xfs_scrub.h" #include <linux/capability.h> #include <linux/cred.h> @@ -310,8 +311,8 @@ xfs_readlink_by_handle( int xfs_set_dmattrs( xfs_inode_t *ip, - u_int evmask, - u_int16_t state) + uint evmask, + uint16_t state) { xfs_mount_t *mp = ip->i_mount; xfs_trans_t *tp; @@ -1201,6 +1202,8 @@ out_unlock: * 8. for non-realtime files, the extent size hint must be limited * to half the AG size to avoid alignment extending the extent beyond the * limits of the AG. + * + * Please keep this function in sync with xfs_scrub_inode_extsize. */ static int xfs_ioctl_setattr_check_extsize( @@ -1257,6 +1260,8 @@ xfs_ioctl_setattr_check_extsize( * 5. Extent size must be a multiple of the appropriate block size. * 6. The extent size hint must be limited to half the AG size to avoid * alignment extending the extent beyond the limits of the AG. + * + * Please keep this function in sync with xfs_scrub_inode_cowextsize. */ static int xfs_ioctl_setattr_check_cowextsize( @@ -1540,17 +1545,26 @@ out_drop_write: return error; } -STATIC int -xfs_getbmap_format(void **ap, struct getbmapx *bmv) +static bool +xfs_getbmap_format( + struct kgetbmap *p, + struct getbmapx __user *u, + size_t recsize) { - struct getbmap __user *base = (struct getbmap __user *)*ap; - - /* copy only getbmap portion (not getbmapx) */ - if (copy_to_user(base, bmv, sizeof(struct getbmap))) - return -EFAULT; - - *ap += sizeof(struct getbmap); - return 0; + if (put_user(p->bmv_offset, &u->bmv_offset) || + put_user(p->bmv_block, &u->bmv_block) || + put_user(p->bmv_length, &u->bmv_length) || + put_user(0, &u->bmv_count) || + put_user(0, &u->bmv_entries)) + return false; + if (recsize < sizeof(struct getbmapx)) + return true; + if (put_user(0, &u->bmv_iflags) || + put_user(p->bmv_oflags, &u->bmv_oflags) || + put_user(0, &u->bmv_unused1) || + put_user(0, &u->bmv_unused2)) + return false; + return true; } STATIC int @@ -1560,68 +1574,57 @@ xfs_ioc_getbmap( void __user *arg) { struct getbmapx bmx = { 0 }; - int error; - - /* struct getbmap is a strict subset of struct getbmapx. */ - if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags))) - return -EFAULT; + struct kgetbmap *buf; + size_t recsize; + int error, i; - if (bmx.bmv_count < 2) + switch (cmd) { + case XFS_IOC_GETBMAPA: + bmx.bmv_iflags = BMV_IF_ATTRFORK; + /*FALLTHRU*/ + case XFS_IOC_GETBMAP: + if (file->f_mode & FMODE_NOCMTIME) + bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; + /* struct getbmap is a strict subset of struct getbmapx. */ + recsize = sizeof(struct getbmap); + break; + case XFS_IOC_GETBMAPX: + recsize = sizeof(struct getbmapx); + break; + default: return -EINVAL; + } - bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); - if (file->f_mode & FMODE_NOCMTIME) - bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; - - error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format, - (__force struct getbmap *)arg+1); - if (error) - return error; - - /* copy back header - only size of getbmap */ - if (copy_to_user(arg, &bmx, sizeof(struct getbmap))) - return -EFAULT; - return 0; -} - -STATIC int -xfs_getbmapx_format(void **ap, struct getbmapx *bmv) -{ - struct getbmapx __user *base = (struct getbmapx __user *)*ap; - - if (copy_to_user(base, bmv, sizeof(struct getbmapx))) - return -EFAULT; - - *ap += sizeof(struct getbmapx); - return 0; -} - -STATIC int -xfs_ioc_getbmapx( - struct xfs_inode *ip, - void __user *arg) -{ - struct getbmapx bmx; - int error; - - if (copy_from_user(&bmx, arg, sizeof(bmx))) + if (copy_from_user(&bmx, arg, recsize)) return -EFAULT; if (bmx.bmv_count < 2) return -EINVAL; + if (bmx.bmv_count > ULONG_MAX / recsize) + return -ENOMEM; - if (bmx.bmv_iflags & (~BMV_IF_VALID)) - return -EINVAL; + buf = kmem_zalloc_large(bmx.bmv_count * sizeof(*buf), 0); + if (!buf) + return -ENOMEM; - error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format, - (__force struct getbmapx *)arg+1); + error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, buf); if (error) - return error; + goto out_free_buf; - /* copy back header */ - if (copy_to_user(arg, &bmx, sizeof(struct getbmapx))) - return -EFAULT; + error = -EFAULT; + if (copy_to_user(arg, &bmx, recsize)) + goto out_free_buf; + arg += recsize; + + for (i = 0; i < bmx.bmv_entries; i++) { + if (!xfs_getbmap_format(buf + i, arg, recsize)) + goto out_free_buf; + arg += recsize; + } + error = 0; +out_free_buf: + kmem_free(buf); return 0; } @@ -1703,6 +1706,30 @@ xfs_ioc_getfsmap( return 0; } +STATIC int +xfs_ioc_scrub_metadata( + struct xfs_inode *ip, + void __user *arg) +{ + struct xfs_scrub_metadata scrub; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&scrub, arg, sizeof(scrub))) + return -EFAULT; + + error = xfs_scrub_metadata(ip, &scrub); + if (error) + return error; + + if (copy_to_user(arg, &scrub, sizeof(scrub))) + return -EFAULT; + + return 0; +} + int xfs_ioc_swapext( xfs_swapext_t *sxp) @@ -1878,14 +1905,15 @@ xfs_file_ioctl( case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: - return xfs_ioc_getbmap(filp, cmd, arg); - case XFS_IOC_GETBMAPX: - return xfs_ioc_getbmapx(ip, arg); + return xfs_ioc_getbmap(filp, cmd, arg); case FS_IOC_GETFSMAP: return xfs_ioc_getfsmap(ip, arg); + case XFS_IOC_SCRUB_METADATA: + return xfs_ioc_scrub_metadata(ip, arg); + case XFS_IOC_FD_TO_HANDLE: case XFS_IOC_PATH_TO_HANDLE: case XFS_IOC_PATH_TO_FSHANDLE: { diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index e86c3ea137d2..8de879f0c7d5 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -86,7 +86,7 @@ xfs_file_compat_ioctl( extern int xfs_set_dmattrs( struct xfs_inode *ip, - u_int evmask, - u_int16_t state); + uint evmask, + uint16_t state); #endif diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index fa0bc4d46065..35c79e246fde 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -556,6 +556,7 @@ xfs_file_compat_ioctl( case XFS_IOC_ERROR_INJECTION: case XFS_IOC_ERROR_CLEARALL: case FS_IOC_GETFSMAP: + case XFS_IOC_SCRUB_METADATA: return xfs_file_ioctl(filp, cmd, p); #ifndef BROKEN_X86_ALIGNMENT /* These are handled fine if no alignment issues */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f179bdf1644d..66e1edbfb2b2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -30,9 +30,11 @@ #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_trans_space.h" +#include "xfs_inode_item.h" #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -54,13 +56,13 @@ xfs_bmbt_to_iomap( struct xfs_mount *mp = ip->i_mount; if (imap->br_startblock == HOLESTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; + iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; } else if (imap->br_startblock == DELAYSTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; + iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_DELALLOC; } else { - iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); + iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock)); if (imap->br_state == XFS_EXT_UNWRITTEN) iomap->type = IOMAP_UNWRITTEN; else @@ -389,7 +391,7 @@ xfs_iomap_prealloc_size( struct xfs_inode *ip, loff_t offset, loff_t count, - xfs_extnum_t idx) + struct xfs_iext_cursor *icur) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); @@ -414,7 +416,7 @@ xfs_iomap_prealloc_size( */ if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) || XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || - !xfs_iext_get_extent(ifp, idx - 1, &prev) || + !xfs_iext_peek_prev_extent(ifp, icur, &prev) || prev.br_startoff + prev.br_blockcount < offset_fsb) return mp->m_writeio_blocks; @@ -532,7 +534,7 @@ xfs_file_iomap_begin_delay( xfs_fileoff_t end_fsb; int error = 0, eof = 0; struct xfs_bmbt_irec got; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; xfs_fsblock_t prealloc_blocks = 0; ASSERT(!XFS_IS_REALTIME_INODE(ip)); @@ -557,7 +559,7 @@ xfs_file_iomap_begin_delay( goto out_unlock; } - eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); + eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got); if (!eof && got.br_startoff <= offset_fsb) { if (xfs_is_reflink_inode(ip)) { bool shared; @@ -591,7 +593,8 @@ xfs_file_iomap_begin_delay( end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); if (eof) { - prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, idx); + prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, + &icur); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; @@ -613,7 +616,8 @@ xfs_file_iomap_begin_delay( retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, prealloc_blocks, &got, &idx, eof); + end_fsb - offset_fsb, prealloc_blocks, &got, &icur, + eof); switch (error) { case 0: break; @@ -1002,7 +1006,7 @@ xfs_file_iomap_begin( } ASSERT(offset <= mp->m_super->s_maxbytes); - if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) + if (offset > mp->m_super->s_maxbytes - length) length = mp->m_super->s_maxbytes - offset; offset_fsb = XFS_B_TO_FSBT(mp, offset); end_fsb = XFS_B_TO_FSB(mp, offset + length); @@ -1086,6 +1090,10 @@ xfs_file_iomap_begin( trace_xfs_iomap_found(ip, offset, length, 0, &imap); } + if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields + & ~XFS_ILOG_TIMESTAMP)) + iomap->flags |= IOMAP_F_DIRTY; + xfs_bmbt_to_iomap(ip, iomap, &imap); if (shared) @@ -1205,7 +1213,7 @@ xfs_xattr_iomap_begin( ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, - &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK); + &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: xfs_iunlock(ip, lockmode); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 17081c77ef86..56475fcd76f2 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -160,7 +160,6 @@ xfs_generic_create( if (S_ISCHR(mode) || S_ISBLK(mode)) { if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) return -EINVAL; - rdev = sysv_encode_dev(rdev); } else { rdev = 0; } @@ -535,8 +534,7 @@ xfs_vn_getattr( case S_IFBLK: case S_IFCHR: stat->blksize = BLKDEV_IOSIZE; - stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, - sysv_minor(ip->i_df.if_u2.if_rdev)); + stat->rdev = inode->i_rdev; break; default: if (XFS_IS_REALTIME_INODE(ip)) { @@ -886,22 +884,6 @@ xfs_setattr_size( return error; /* - * We are going to log the inode size change in this transaction so - * any previous writes that are beyond the on disk EOF and the new - * EOF that have not been written out need to be written here. If we - * do not write the data out, we expose ourselves to the null files - * problem. Note that this includes any block zeroing we did above; - * otherwise those blocks may not be zeroed after a crash. - */ - if (did_zeroing || - (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - ip->i_d.di_size, newsize); - if (error) - return error; - } - - /* * We've already locked out new page faults, so now we can safely remove * pages from the page cache knowing they won't get refaulted until we * drop the XFS_MMAP_EXCL lock after the extent manipulations are @@ -917,9 +899,29 @@ xfs_setattr_size( * user visible changes). There's not much we can do about this, except * to hope that the caller sees ENOMEM and retries the truncate * operation. + * + * And we update in-core i_size and truncate page cache beyond newsize + * before writeback the [di_size, newsize] range, so we're guaranteed + * not to write stale data past the new EOF on truncate down. */ truncate_setsize(inode, newsize); + /* + * We are going to log the inode size change in this transaction so + * any previous writes that are beyond the on disk EOF and the new + * EOF that have not been written out need to be written here. If we + * do not write the data out, we expose ourselves to the null files + * problem. Note that this includes any block zeroing we did above; + * otherwise those blocks may not be zeroed after a crash. + */ + if (did_zeroing || + (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ip->i_d.di_size, newsize - 1); + if (error) + return error; + } + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) return error; @@ -1231,18 +1233,6 @@ xfs_setup_inode( inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); - switch (inode->i_mode & S_IFMT) { - case S_IFBLK: - case S_IFCHR: - inode->i_rdev = - MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, - sysv_minor(ip->i_df.if_u2.if_rdev)); - break; - default: - inode->i_rdev = 0; - break; - } - i_size_write(inode, ip->i_d.di_size); xfs_diflags_to_iflags(inode, ip); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c393a2f6d8c3..d58310514423 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -31,16 +31,6 @@ #include "xfs_trace.h" #include "xfs_icache.h" -int -xfs_internal_inum( - xfs_mount_t *mp, - xfs_ino_t ino) -{ - return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || - (xfs_sb_version_hasquota(&mp->m_sb) && - xfs_is_quota_inode(&mp->m_sb, ino))); -} - /* * Return stat information for one inode. * Return 0 if ok, else errno. @@ -119,12 +109,11 @@ xfs_bulkstat_one_int( switch (dic->di_format) { case XFS_DINODE_FMT_DEV: - buf->bs_rdev = ip->i_df.if_u2.if_rdev; + buf->bs_rdev = sysv_encode_dev(inode->i_rdev); buf->bs_blksize = BLKDEV_IOSIZE; buf->bs_blocks = 0; break; case XFS_DINODE_FMT_LOCAL: - case XFS_DINODE_FMT_UUID: buf->bs_rdev = 0; buf->bs_blksize = mp->m_sb.sb_blocksize; buf->bs_blocks = 0; diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 17e86e0541af..6ea8b3912fa4 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -96,6 +96,4 @@ xfs_inumbers( void __user *buffer, /* buffer with inode info */ inumbers_fmt_pf formatter); -int xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); - #endif /* __XFS_ITABLE_H__ */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index dcd1292664b3..99562ec0de56 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -142,6 +142,13 @@ typedef __u32 xfs_nlink_t; #define SYNCHRONIZE() barrier() #define __return_address __builtin_return_address(0) +/* + * Return the address of a label. Use barrier() so that the optimizer + * won't reorder code to refactor the error jumpouts into a single + * return, which throws off the reported address. + */ +#define __this_address ({ __label__ __here; __here: barrier(); &&__here; }) + #define XFS_PROJID_DEFAULT 0 #define MIN(a,b) (min(a,b)) @@ -197,6 +204,16 @@ static inline kgid_t xfs_gid_to_kgid(uint32_t gid) return make_kgid(&init_user_ns, gid); } +static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev) +{ + return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev)); +} + +static inline xfs_dev_t linux_to_xfs_dev_t(dev_t dev) +{ + return sysv_encode_dev(dev); +} + /* * Various platform dependent calls that don't fit anywhere else */ @@ -243,10 +260,6 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) #define ASSERT(expr) \ (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) -#ifndef STATIC -# define STATIC noinline -#endif - #else /* !DEBUG */ #ifdef XFS_WARN @@ -254,21 +267,15 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) #define ASSERT(expr) \ (likely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) -#ifndef STATIC -# define STATIC static noinline -#endif - #else /* !DEBUG && !XFS_WARN */ #define ASSERT(expr) ((void)0) -#ifndef STATIC -# define STATIC static noinline -#endif - #endif /* XFS_WARN */ #endif /* DEBUG */ +#define STATIC static noinline + #ifdef CONFIG_XFS_RT /* diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index dc95a49d62e7..a503af96d780 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -22,6 +22,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" @@ -608,6 +609,7 @@ xfs_log_mount( xfs_daddr_t blk_offset, int num_bblks) { + bool fatal = xfs_sb_version_hascrc(&mp->m_sb); int error = 0; int min_logfsbs; @@ -659,9 +661,20 @@ xfs_log_mount( XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), XFS_MAX_LOG_BYTES); error = -EINVAL; + } else if (mp->m_sb.sb_logsunit > 1 && + mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) { + xfs_warn(mp, + "log stripe unit %u bytes must be a multiple of block size", + mp->m_sb.sb_logsunit); + error = -EINVAL; + fatal = true; } if (error) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + /* + * Log check errors are always fatal on v5; or whenever bad + * metadata leads to a crash. + */ + if (fatal) { xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); ASSERT(0); goto out_free_log; @@ -744,6 +757,7 @@ xfs_log_mount_finish( { int error = 0; bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); + bool recovered = mp->m_log->l_flags & XLOG_RECOVERY_NEEDED; if (mp->m_flags & XFS_MOUNT_NORECOVERY) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); @@ -767,19 +781,34 @@ xfs_log_mount_finish( * something to an unlinked inode, the irele won't cause * premature truncation and freeing of the inode, which results * in log recovery failure. We have to evict the unreferenced - * lru inodes after clearing MS_ACTIVE because we don't + * lru inodes after clearing SB_ACTIVE because we don't * otherwise clean up the lru if there's a subsequent failure in * xfs_mountfs, which leads to us leaking the inodes if nothing * else (e.g. quotacheck) references the inodes before the * mount failure occurs. */ - mp->m_super->s_flags |= MS_ACTIVE; + mp->m_super->s_flags |= SB_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); - mp->m_super->s_flags &= ~MS_ACTIVE; + mp->m_super->s_flags &= ~SB_ACTIVE; evict_inodes(mp->m_super); + /* + * Drain the buffer LRU after log recovery. This is required for v4 + * filesystems to avoid leaving around buffers with NULL verifier ops, + * but we do it unconditionally to make sure we're always in a clean + * cache state after mount. + * + * Don't push in the error case because the AIL may have pending intents + * that aren't removed until recovery is cancelled. + */ + if (!error && recovered) { + xfs_log_force(mp, XFS_LOG_SYNC); + xfs_ail_push_all_sync(mp->m_ail); + } + xfs_wait_buftarg(mp->m_ddev_targp); + if (readonly) mp->m_flags |= XFS_MOUNT_RDONLY; @@ -3734,7 +3763,7 @@ xlog_ticket_alloc( * one of the iclogs. This uses backup pointers stored in a different * part of the log in case we trash the log structure. */ -void +STATIC void xlog_verify_dest_ptr( struct xlog *log, void *ptr) diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 51bf7b827387..129975970d99 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -592,9 +592,9 @@ xlog_valid_lsn( * a transiently forward state. Instead, we can see the LSN in a * transiently behind state if we happen to race with a cycle wrap. */ - cur_cycle = ACCESS_ONCE(log->l_curr_cycle); + cur_cycle = READ_ONCE(log->l_curr_cycle); smp_rmb(); - cur_block = ACCESS_ONCE(log->l_curr_block); + cur_block = READ_ONCE(log->l_curr_block); if ((CYCLE_LSN(lsn) > cur_cycle) || (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index ee34899396b2..28d1abfe835e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -24,6 +24,7 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" @@ -85,17 +86,21 @@ struct xfs_buf_cancel { */ /* - * Verify the given count of basic blocks is valid number of blocks - * to specify for an operation involving the given XFS log buffer. - * Returns nonzero if the count is valid, 0 otherwise. + * Verify the log-relative block number and length in basic blocks are valid for + * an operation involving the given XFS log buffer. Returns true if the fields + * are valid, false otherwise. */ - -static inline int -xlog_buf_bbcount_valid( +static inline bool +xlog_verify_bp( struct xlog *log, + xfs_daddr_t blk_no, int bbcount) { - return bbcount > 0 && bbcount <= log->l_logBBsize; + if (blk_no < 0 || blk_no >= log->l_logBBsize) + return false; + if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize) + return false; + return true; } /* @@ -110,7 +115,11 @@ xlog_get_bp( { struct xfs_buf *bp; - if (!xlog_buf_bbcount_valid(log, nbblks)) { + /* + * Pass log block 0 since we don't have an addr yet, buffer will be + * verified on read. + */ + if (!xlog_verify_bp(log, 0, nbblks)) { xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", nbblks); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); @@ -180,9 +189,10 @@ xlog_bread_noalign( { int error; - if (!xlog_buf_bbcount_valid(log, nbblks)) { - xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", - nbblks); + if (!xlog_verify_bp(log, blk_no, nbblks)) { + xfs_warn(log->l_mp, + "Invalid log block/length (0x%llx, 0x%x) for buffer", + blk_no, nbblks); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return -EFSCORRUPTED; } @@ -265,9 +275,10 @@ xlog_bwrite( { int error; - if (!xlog_buf_bbcount_valid(log, nbblks)) { - xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", - nbblks); + if (!xlog_verify_bp(log, blk_no, nbblks)) { + xfs_warn(log->l_mp, + "Invalid log block/length (0x%llx, 0x%x) for buffer", + blk_no, nbblks); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return -EFSCORRUPTED; } @@ -753,7 +764,7 @@ xlog_find_head( * in the in-core log. The following number can be made tighter if * we actually look at the block size of the filesystem. */ - num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); + num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log)); if (head_blk >= num_scan_bblks) { /* * We are guaranteed that the entire check can be performed @@ -2975,7 +2986,7 @@ xlog_recover_inode_pass2( struct xlog_recover_item *item, xfs_lsn_t current_lsn) { - xfs_inode_log_format_t *in_f; + struct xfs_inode_log_format *in_f; xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; xfs_dinode_t *dip; @@ -2989,10 +3000,10 @@ xlog_recover_inode_pass2( uint isize; int need_free = 0; - if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { in_f = item->ri_buf[0].i_addr; } else { - in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP); + in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP); need_free = 1; error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); if (error) @@ -3163,16 +3174,8 @@ xlog_recover_inode_pass2( } fields = in_f->ilf_fields; - switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { - case XFS_ILOG_DEV: + if (fields & XFS_ILOG_DEV) xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); - break; - case XFS_ILOG_UUID: - memcpy(XFS_DFORK_DPTR(dip), - &in_f->ilf_u.ilfu_uuid, - sizeof(uuid_t)); - break; - } if (in_f->ilf_size == 2) goto out_owner_change; @@ -4297,7 +4300,7 @@ xlog_recover_add_to_trans( char *dp, int len) { - xfs_inode_log_format_t *in_f; /* any will do */ + struct xfs_inode_log_format *in_f; /* any will do */ xlog_recover_item_t *item; char *ptr; @@ -4331,7 +4334,7 @@ xlog_recover_add_to_trans( ptr = kmem_alloc(len, KM_SLEEP); memcpy(ptr, dp, len); - in_f = (xfs_inode_log_format_t *)ptr; + in_f = (struct xfs_inode_log_format *)ptr; /* take the tail entry */ item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); @@ -4714,7 +4717,8 @@ STATIC int xlog_recover_process_cui( struct xfs_mount *mp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct xfs_defer_ops *dfops) { struct xfs_cui_log_item *cuip; int error; @@ -4727,7 +4731,7 @@ xlog_recover_process_cui( return 0; spin_unlock(&ailp->xa_lock); - error = xfs_cui_recover(mp, cuip); + error = xfs_cui_recover(mp, cuip, dfops); spin_lock(&ailp->xa_lock); return error; @@ -4754,7 +4758,8 @@ STATIC int xlog_recover_process_bui( struct xfs_mount *mp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct xfs_defer_ops *dfops) { struct xfs_bui_log_item *buip; int error; @@ -4767,7 +4772,7 @@ xlog_recover_process_bui( return 0; spin_unlock(&ailp->xa_lock); - error = xfs_bui_recover(mp, buip); + error = xfs_bui_recover(mp, buip, dfops); spin_lock(&ailp->xa_lock); return error; @@ -4803,6 +4808,46 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip) } } +/* Take all the collected deferred ops and finish them in order. */ +static int +xlog_finish_defer_ops( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops) +{ + struct xfs_trans *tp; + int64_t freeblks; + uint resblks; + int error; + + /* + * We're finishing the defer_ops that accumulated as a result of + * recovering unfinished intent items during log recovery. We + * reserve an itruncate transaction because it is the largest + * permanent transaction type. Since we're the only user of the fs + * right now, take 93% (15/16) of the available free blocks. Use + * weird math to avoid a 64-bit division. + */ + freeblks = percpu_counter_sum(&mp->m_fdblocks); + if (freeblks <= 0) + return -ENOSPC; + resblks = min_t(int64_t, UINT_MAX, freeblks); + resblks = (resblks * 15) >> 4; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, + 0, XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + error = xfs_defer_finish(&tp, dfops); + if (error) + goto out_cancel; + + return xfs_trans_commit(tp); + +out_cancel: + xfs_trans_cancel(tp); + return error; +} + /* * When this is called, all of the log intent items which did not have * corresponding log done items should be in the AIL. What we do now @@ -4823,10 +4868,12 @@ STATIC int xlog_recover_process_intents( struct xlog *log) { - struct xfs_log_item *lip; - int error = 0; + struct xfs_defer_ops dfops; struct xfs_ail_cursor cur; + struct xfs_log_item *lip; struct xfs_ail *ailp; + xfs_fsblock_t firstfsb; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; #endif @@ -4837,6 +4884,7 @@ xlog_recover_process_intents( #if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif + xfs_defer_init(&dfops, &firstfsb); while (lip != NULL) { /* * We're done when we see something other than an intent. @@ -4857,6 +4905,12 @@ xlog_recover_process_intents( */ ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); + /* + * NOTE: If your intent processing routine can create more + * deferred ops, you /must/ attach them to the dfops in this + * routine or else those subsequent intents will get + * replayed in the wrong order! + */ switch (lip->li_type) { case XFS_LI_EFI: error = xlog_recover_process_efi(log->l_mp, ailp, lip); @@ -4865,10 +4919,12 @@ xlog_recover_process_intents( error = xlog_recover_process_rui(log->l_mp, ailp, lip); break; case XFS_LI_CUI: - error = xlog_recover_process_cui(log->l_mp, ailp, lip); + error = xlog_recover_process_cui(log->l_mp, ailp, lip, + &dfops); break; case XFS_LI_BUI: - error = xlog_recover_process_bui(log->l_mp, ailp, lip); + error = xlog_recover_process_bui(log->l_mp, ailp, lip, + &dfops); break; } if (error) @@ -4878,6 +4934,11 @@ xlog_recover_process_intents( out: xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); + if (error) + xfs_defer_cancel(&dfops); + else + error = xlog_finish_defer_ops(log->l_mp, &dfops); + return error; } @@ -5823,7 +5884,7 @@ xlog_recover_cancel( * Read all of the agf and agi counters and check that they * are consistent with the superblock counters. */ -void +STATIC void xlog_recover_check_summary( struct xlog *log) { diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index e9727d0a541a..c879b517cc94 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1022,10 +1022,21 @@ xfs_mountfs( xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); - cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp, SYNC_WAIT); /* Clean out dquots that might be in memory after quotacheck. */ xfs_qm_unmount(mp); + /* + * Cancel all delayed reclaim work and reclaim the inodes directly. + * We have to do this /after/ rtunmount and qm_unmount because those + * two will have scheduled delayed reclaim for the rt/quota inodes. + * + * This is slightly different from the unmountfs call sequence + * because we could be tearing down a partially set up mount. In + * particular, if log_mount_finish fails we bail out without calling + * qm_unmount_quotas and therefore rely on qm_unmount to release the + * quota inodes. + */ + cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_reclaim_inodes(mp, SYNC_WAIT); out_log_dealloc: mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 010a13a201aa..b897b11afb2c 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -48,7 +48,7 @@ STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); - +STATIC void xfs_qm_destroy_quotainos(xfs_quotainfo_t *qi); STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); /* * We use the batch lookup interface to iterate over the dquots as it @@ -695,9 +695,17 @@ xfs_qm_init_quotainfo( qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; qinf->qi_shrinker.seeks = DEFAULT_SEEKS; qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; - register_shrinker(&qinf->qi_shrinker); + + error = register_shrinker(&qinf->qi_shrinker); + if (error) + goto out_free_inos; + return 0; +out_free_inos: + mutex_destroy(&qinf->qi_quotaofflock); + mutex_destroy(&qinf->qi_tree_lock); + xfs_qm_destroy_quotainos(qinf); out_free_lru: list_lru_destroy(&qinf->qi_lru); out_free_qinf: @@ -706,7 +714,6 @@ out_free_qinf: return error; } - /* * Gets called when unmounting a filesystem or when all quotas get * turned off. @@ -723,19 +730,8 @@ xfs_qm_destroy_quotainfo( unregister_shrinker(&qi->qi_shrinker); list_lru_destroy(&qi->qi_lru); - - if (qi->qi_uquotaip) { - IRELE(qi->qi_uquotaip); - qi->qi_uquotaip = NULL; /* paranoia */ - } - if (qi->qi_gquotaip) { - IRELE(qi->qi_gquotaip); - qi->qi_gquotaip = NULL; - } - if (qi->qi_pquotaip) { - IRELE(qi->qi_pquotaip); - qi->qi_pquotaip = NULL; - } + xfs_qm_destroy_quotainos(qi); + mutex_destroy(&qi->qi_tree_lock); mutex_destroy(&qi->qi_quotaofflock); kmem_free(qi); mp->m_quotainfo = NULL; @@ -793,8 +789,8 @@ xfs_qm_qino_alloc( return error; if (need_alloc) { - error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, - &committed); + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip, + &committed); if (error) { xfs_trans_cancel(tp); return error; @@ -1600,6 +1596,24 @@ error_rele: } STATIC void +xfs_qm_destroy_quotainos( + xfs_quotainfo_t *qi) +{ + if (qi->qi_uquotaip) { + IRELE(qi->qi_uquotaip); + qi->qi_uquotaip = NULL; /* paranoia */ + } + if (qi->qi_gquotaip) { + IRELE(qi->qi_gquotaip); + qi->qi_gquotaip = NULL; + } + if (qi->qi_pquotaip) { + IRELE(qi->qi_pquotaip); + qi->qi_pquotaip = NULL; + } +} + +STATIC void xfs_qm_dqfree_one( struct xfs_dquot *dqp) { diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 8f2e2fac4255..3a55d6fc271b 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -393,7 +393,8 @@ xfs_cud_init( int xfs_cui_recover( struct xfs_mount *mp, - struct xfs_cui_log_item *cuip) + struct xfs_cui_log_item *cuip, + struct xfs_defer_ops *dfops) { int i; int error = 0; @@ -405,11 +406,9 @@ xfs_cui_recover( struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; enum xfs_refcount_intent_type type; - xfs_fsblock_t firstfsb; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; struct xfs_bmbt_irec irec; - struct xfs_defer_ops dfops; bool requeue_only = false; ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); @@ -465,7 +464,6 @@ xfs_cui_recover( return error; cudp = xfs_trans_get_cud(tp, cuip); - xfs_defer_init(&dfops, &firstfsb); for (i = 0; i < cuip->cui_format.cui_nextents; i++) { refc = &cuip->cui_format.cui_extents[i]; refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; @@ -485,7 +483,7 @@ xfs_cui_recover( new_len = refc->pe_len; } else error = xfs_trans_log_finish_refcount_update(tp, cudp, - &dfops, type, refc->pe_startblock, refc->pe_len, + dfops, type, refc->pe_startblock, refc->pe_len, &new_fsb, &new_len, &rcur); if (error) goto abort_error; @@ -497,21 +495,21 @@ xfs_cui_recover( switch (type) { case XFS_REFCOUNT_INCREASE: error = xfs_refcount_increase_extent( - tp->t_mountp, &dfops, &irec); + tp->t_mountp, dfops, &irec); break; case XFS_REFCOUNT_DECREASE: error = xfs_refcount_decrease_extent( - tp->t_mountp, &dfops, &irec); + tp->t_mountp, dfops, &irec); break; case XFS_REFCOUNT_ALLOC_COW: error = xfs_refcount_alloc_cow_extent( - tp->t_mountp, &dfops, + tp->t_mountp, dfops, irec.br_startblock, irec.br_blockcount); break; case XFS_REFCOUNT_FREE_COW: error = xfs_refcount_free_cow_extent( - tp->t_mountp, &dfops, + tp->t_mountp, dfops, irec.br_startblock, irec.br_blockcount); break; @@ -525,17 +523,12 @@ xfs_cui_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - error = xfs_defer_finish(&tp, &dfops); - if (error) - goto abort_defer; set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); error = xfs_trans_commit(tp); return error; abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); -abort_defer: - xfs_defer_cancel(&dfops); xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index 5b74dddfa64b..0e5327349a13 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -96,6 +96,7 @@ struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *, struct xfs_cui_log_item *); void xfs_cui_item_free(struct xfs_cui_log_item *); void xfs_cui_release(struct xfs_cui_log_item *); -int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip); +int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip, + struct xfs_defer_ops *dfops); #endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 37e603bf1591..47aea2e82c26 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -49,8 +49,6 @@ #include "xfs_alloc.h" #include "xfs_quota_defs.h" #include "xfs_quota.h" -#include "xfs_btree.h" -#include "xfs_bmap_btree.h" #include "xfs_reflink.h" #include "xfs_iomap.h" #include "xfs_rmap_btree.h" @@ -273,7 +271,7 @@ xfs_reflink_reserve_cow( struct xfs_bmbt_irec got; int error = 0; bool eof = false, trimmed; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; /* * Search the COW fork extent list first. This serves two purposes: @@ -284,7 +282,7 @@ xfs_reflink_reserve_cow( * tree. */ - if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got)) + if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got)) eof = true; if (!eof && got.br_startoff <= imap->br_startoff) { trace_xfs_reflink_cow_found(ip, imap); @@ -312,7 +310,7 @@ xfs_reflink_reserve_cow( return error; error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - imap->br_blockcount, 0, &got, &idx, eof); + imap->br_blockcount, 0, &got, &icur, eof); if (error == -ENOSPC || error == -EDQUOT) trace_xfs_reflink_cow_enospc(ip, imap); if (error) @@ -353,29 +351,22 @@ xfs_reflink_convert_cow( xfs_off_t offset, xfs_off_t count) { - struct xfs_bmbt_irec got; - struct xfs_defer_ops dfops; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); - xfs_extnum_t idx; - bool found; - int error = 0; - - xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_filblks_t count_fsb = end_fsb - offset_fsb; + struct xfs_bmbt_irec imap; + struct xfs_defer_ops dfops; + xfs_fsblock_t first_block = NULLFSBLOCK; + int nimaps = 1, error = 0; - /* Convert all the extents to real from unwritten. */ - for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); - found && got.br_startoff < end_fsb; - found = xfs_iext_get_extent(ifp, ++idx, &got)) { - error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb, - end_fsb - offset_fsb, &dfops); - if (error) - break; - } + ASSERT(count != 0); - /* Finish up. */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb, + XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT | + XFS_BMAPI_CONVERT_ONLY, &first_block, 0, &imap, &nimaps, + &dfops); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -399,7 +390,7 @@ xfs_reflink_allocate_cow( bool trimmed; xfs_filblks_t resaligned; xfs_extlen_t resblks = 0; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; retry: ASSERT(xfs_is_reflink_inode(ip)); @@ -409,7 +400,7 @@ retry: * Even if the extent is not shared we might have a preallocation for * it in the COW fork. If so use it. */ - if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &idx, &got) && + if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) && got.br_startoff <= offset_fsb) { *shared = true; @@ -463,6 +454,8 @@ retry: if (error) goto out_bmap_cancel; + xfs_inode_set_cowblocks_tag(ip); + /* Finish up. */ error = xfs_defer_finish(&tp, &dfops); if (error) @@ -496,13 +489,14 @@ xfs_reflink_find_cow_mapping( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); xfs_fileoff_t offset_fsb; struct xfs_bmbt_irec got; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); - ASSERT(xfs_is_reflink_inode(ip)); + if (!xfs_is_reflink_inode(ip)) + return false; offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); - if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got)) return false; if (got.br_startoff > offset_fsb) return false; @@ -524,18 +518,18 @@ xfs_reflink_trim_irec_to_next_cow( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; if (!xfs_is_reflink_inode(ip)) return; /* Find the extent in the CoW fork. */ - if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got)) return; /* This is the extent before; try sliding up one. */ if (got.br_startoff < offset_fsb) { - if (!xfs_iext_get_extent(ifp, idx + 1, &got)) + if (!xfs_iext_next_extent(ifp, &icur, &got)) return; } @@ -562,24 +556,32 @@ xfs_reflink_cancel_cow_blocks( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, del; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; int error = 0; if (!xfs_is_reflink_inode(ip)) return 0; - if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) + if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) return 0; - while (got.br_startoff < end_fsb) { + /* Walk backwards until we're out of the I/O range... */ + while (got.br_startoff + got.br_blockcount > offset_fsb) { del = got; xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); + + /* Extent delete may have bumped ext forward */ + if (!del.br_blockcount) { + xfs_iext_prev(ifp, &icur); + goto next_extent; + } + trace_xfs_reflink_cancel_cow(ip, &del); if (isnullstartblock(del.br_startblock)) { error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, - &idx, &got, &del); + &icur, &got, &del); if (error) break; } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { @@ -610,10 +612,13 @@ xfs_reflink_cancel_cow_blocks( } /* Remove the mapping from the CoW fork. */ - xfs_bmap_del_extent_cow(ip, &idx, &got, &del); + xfs_bmap_del_extent_cow(ip, &icur, &got, &del); + } else { + /* Didn't do anything, push cursor back. */ + xfs_iext_prev(ifp, &icur); } - - if (!xfs_iext_get_extent(ifp, ++idx, &got)) +next_extent: + if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } @@ -698,7 +703,7 @@ xfs_reflink_end_cow( int error; unsigned int resblks; xfs_filblks_t rlen; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; trace_xfs_reflink_end_cow(ip, offset, count); @@ -726,34 +731,29 @@ xfs_reflink_end_cow( (unsigned int)(end_fsb - offset_fsb), XFS_DATA_FORK); error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, - resblks, 0, 0, &tp); + resblks, 0, XFS_TRANS_RESERVE, &tp); if (error) goto out; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - /* If there is a hole at end_fsb - 1 go to the previous extent */ - if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) || - got.br_startoff > end_fsb) { - /* - * In case of racing, overlapping AIO writes no COW extents - * might be left by the time I/O completes for the loser of - * the race. In that case we are done. - */ - if (idx <= 0) - goto out_cancel; - xfs_iext_get_extent(ifp, --idx, &got); - } + /* + * In case of racing, overlapping AIO writes no COW extents might be + * left by the time I/O completes for the loser of the race. In that + * case we are done. + */ + if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) + goto out_cancel; /* Walk backwards until we're out of the I/O range... */ while (got.br_startoff + got.br_blockcount > offset_fsb) { del = got; xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); - /* Extent delete may have bumped idx forward */ + /* Extent delete may have bumped ext forward */ if (!del.br_blockcount) { - idx--; + xfs_iext_prev(ifp, &icur); goto next_extent; } @@ -765,7 +765,7 @@ xfs_reflink_end_cow( * allocated but have not yet been involved in a write. */ if (got.br_state == XFS_EXT_UNWRITTEN) { - idx--; + xfs_iext_prev(ifp, &icur); goto next_extent; } @@ -796,14 +796,14 @@ xfs_reflink_end_cow( goto out_defer; /* Remove the mapping from the CoW fork. */ - xfs_bmap_del_extent_cow(ip, &idx, &got, &del); + xfs_bmap_del_extent_cow(ip, &icur, &got, &del); xfs_defer_ijoin(&dfops, ip); error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; next_extent: - if (!xfs_iext_get_extent(ifp, idx, &got)) + if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } @@ -1297,6 +1297,17 @@ xfs_reflink_remap_range( trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + /* + * Clear out post-eof preallocations because we don't have page cache + * backing the delayed allocations and they'll never get freed on + * their own. + */ + if (xfs_can_free_eofblocks(dest, true)) { + ret = xfs_free_eofblocks(dest); + if (ret) + goto out_unlock; + } + /* Set flags and remap blocks. */ ret = xfs_reflink_set_inode_flag(src, dest); if (ret) @@ -1433,7 +1444,7 @@ xfs_reflink_inode_has_shared_extents( xfs_extlen_t aglen; xfs_agblock_t rbno; xfs_extlen_t rlen; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; bool found; int error; @@ -1445,7 +1456,7 @@ xfs_reflink_inode_has_shared_extents( } *has_shared = false; - found = xfs_iext_lookup_extent(ip, ifp, 0, &idx, &got); + found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got); while (found) { if (isnullstartblock(got.br_startblock) || got.br_state != XFS_EXT_NORM) @@ -1464,7 +1475,7 @@ xfs_reflink_inode_has_shared_extents( return 0; } next: - found = xfs_iext_get_extent(ifp, ++idx, &got); + found = xfs_iext_next_extent(ifp, &icur, &got); } return 0; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 79defa722bf1..3f30f846d7f2 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -138,6 +138,7 @@ int xfs_rtalloc_query_range(struct xfs_trans *tp, int xfs_rtalloc_query_all(struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv); +bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); #else # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) @@ -146,6 +147,7 @@ int xfs_rtalloc_query_all(struct xfs_trans *tp, # define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) # define xfs_rtalloc_query_all(t,f,p) (ENOSYS) # define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) +# define xfs_verify_rtbno(m, r) (false) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f663022353c0..1dacccc367f8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -212,9 +212,9 @@ xfs_parseargs( */ if (sb_rdonly(sb)) mp->m_flags |= XFS_MOUNT_RDONLY; - if (sb->s_flags & MS_DIRSYNC) + if (sb->s_flags & SB_DIRSYNC) mp->m_flags |= XFS_MOUNT_DIRSYNC; - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) mp->m_flags |= XFS_MOUNT_WSYNC; /* @@ -1312,7 +1312,7 @@ xfs_fs_remount( } /* ro -> rw */ - if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & SB_RDONLY)) { if (mp->m_flags & XFS_MOUNT_NORECOVERY) { xfs_warn(mp, "ro->rw transition prohibited on norecovery mount"); @@ -1360,6 +1360,7 @@ xfs_fs_remount( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } + xfs_queue_cowblocks(mp); /* Create the per-AG metadata reservation pool .*/ error = xfs_fs_reserve_ag_blocks(mp); @@ -1368,7 +1369,15 @@ xfs_fs_remount( } /* rw -> ro */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) { + /* Get rid of any leftover CoW reservations... */ + cancel_delayed_work_sync(&mp->m_cowblocks_work); + error = xfs_icache_free_cowblocks(mp, NULL); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + /* Free the per-AG metadata reservation pool. */ error = xfs_fs_unreserve_ag_blocks(mp); if (error) { diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 5f2f32408011..fcc5dfc70aa0 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -30,7 +30,7 @@ extern void xfs_qm_exit(void); #ifdef CONFIG_XFS_POSIX_ACL # define XFS_ACL_STRING "ACLs, " -# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL) +# define set_posix_acl_flag(sb) ((sb)->s_flags |= SB_POSIXACL) #else # define XFS_ACL_STRING # define set_posix_acl_flag(sb) do { } while (0) diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 68d3ca2c4968..2e9e793a8f9d 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -232,11 +232,6 @@ xfs_symlink( resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp); - if (error == -ENOSPC && fs_blocks == 0) { - resblks = 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0, - &tp); - } if (error) goto out_release_inode; @@ -260,14 +255,6 @@ xfs_symlink( goto out_trans_cancel; /* - * Check for ability to enter directory entry, if no space reserved. - */ - if (!resblks) { - error = xfs_dir_canenter(tp, dp, link_name); - if (error) - goto out_trans_cancel; - } - /* * Initialize the bmap freelist prior to calling either * bmapi or the directory create code. */ @@ -277,7 +264,7 @@ xfs_symlink( * Allocate an inode for the symlink. */ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, resblks > 0, &ip, NULL); + prid, &ip, NULL); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 5d95fe348294..35f3546b6af5 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -24,7 +24,6 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_da_format.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_da_btree.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bb5514688d47..d718a10c2271 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -218,53 +218,15 @@ TRACE_EVENT(xfs_attr_list_node_descend, __entry->bt_before) ); -TRACE_EVENT(xfs_iext_insert, - TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, - struct xfs_bmbt_irec *r, int state, unsigned long caller_ip), - TP_ARGS(ip, idx, r, state, caller_ip), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_extnum_t, idx) - __field(xfs_fileoff_t, startoff) - __field(xfs_fsblock_t, startblock) - __field(xfs_filblks_t, blockcount) - __field(xfs_exntst_t, state) - __field(int, bmap_state) - __field(unsigned long, caller_ip) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->idx = idx; - __entry->startoff = r->br_startoff; - __entry->startblock = r->br_startblock; - __entry->blockcount = r->br_blockcount; - __entry->state = r->br_state; - __entry->bmap_state = state; - __entry->caller_ip = caller_ip; - ), - TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %lld count %lld flag %d caller %ps", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), - (long)__entry->idx, - __entry->startoff, - (int64_t)__entry->startblock, - __entry->blockcount, - __entry->state, - (char *)__entry->caller_ip) -); - DECLARE_EVENT_CLASS(xfs_bmap_class, - TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, + TP_PROTO(struct xfs_inode *ip, struct xfs_iext_cursor *cur, int state, unsigned long caller_ip), - TP_ARGS(ip, idx, state, caller_ip), + TP_ARGS(ip, cur, state, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(xfs_extnum_t, idx) + __field(void *, leaf); + __field(int, pos); __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) @@ -277,10 +239,11 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, struct xfs_bmbt_irec r; ifp = xfs_iext_state_to_fork(ip, state); - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); + xfs_iext_get_extent(ifp, cur, &r); __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->idx = idx; + __entry->leaf = cur->leaf; + __entry->pos = cur->pos; __entry->startoff = r.br_startoff; __entry->startblock = r.br_startblock; __entry->blockcount = r.br_blockcount; @@ -288,12 +251,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __entry->bmap_state = state; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + TP_printk("dev %d:%d ino 0x%llx state %s cur 0x%p/%d " "offset %lld block %lld count %lld flag %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), - (long)__entry->idx, + __entry->leaf, + __entry->pos, __entry->startoff, (int64_t)__entry->startblock, __entry->blockcount, @@ -303,13 +267,15 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, #define DEFINE_BMAP_EVENT(name) \ DEFINE_EVENT(xfs_bmap_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_iext_cursor *cur, int state, \ unsigned long caller_ip), \ - TP_ARGS(ip, idx, state, caller_ip)) + TP_ARGS(ip, cur, state, caller_ip)) +DEFINE_BMAP_EVENT(xfs_iext_insert); DEFINE_BMAP_EVENT(xfs_iext_remove); DEFINE_BMAP_EVENT(xfs_bmap_pre_update); DEFINE_BMAP_EVENT(xfs_bmap_post_update); -DEFINE_BMAP_EVENT(xfs_extlist); +DEFINE_BMAP_EVENT(xfs_read_extent); +DEFINE_BMAP_EVENT(xfs_write_extent); DECLARE_EVENT_CLASS(xfs_buf_class, TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), @@ -688,8 +654,6 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); -DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); - TRACE_EVENT(xfs_filemap_fault, TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, bool write_fault), diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 354368a906e5..cef89f7127d3 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -25,6 +25,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_trace.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_log.h" @@ -514,11 +515,26 @@ xfsaild( current->flags |= PF_MEMALLOC; set_freezable(); - while (!kthread_should_stop()) { + while (1) { if (tout && tout <= 20) - __set_current_state(TASK_KILLABLE); + set_current_state(TASK_KILLABLE); else - __set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); + + /* + * Check kthread_should_stop() after we set the task state + * to guarantee that we either see the stop bit and exit or + * the task state is reset to runnable such that it's not + * scheduled out indefinitely and detects the stop bit at + * next iteration. + * + * A memory barrier is included in above task state set to + * serialize again kthread_stop(). + */ + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + break; + } spin_lock(&ailp->xa_lock); |