diff options
Diffstat (limited to 'fs')
124 files changed, 4425 insertions, 2377 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index a9ea73d6dcf3..2b7a032c37bc 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -90,7 +90,7 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, &v9fs_cache_session_index_def, - v9ses); + v9ses, true); p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n", v9ses, v9ses->fscache); } @@ -204,7 +204,7 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) v9ses = v9fs_inode2v9ses(inode); v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, - v9inode); + v9inode, true); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9inode->fscache); @@ -271,7 +271,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode) v9ses = v9fs_inode2v9ses(inode); v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, - v9inode); + v9inode, true); p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n", inode, old, v9inode->fscache); diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 3c090b7555ea..ca0a3cf93791 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -179,7 +179,7 @@ struct afs_cell *afs_cell_create(const char *name, unsigned namesz, /* put it up for caching (this never returns an error) */ cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, &afs_cell_cache_index_def, - cell); + cell, true); #endif /* add to the cell lists */ diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 789bc253b5f6..ce25d755b7aa 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -259,7 +259,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, #ifdef CONFIG_AFS_FSCACHE vnode->cache = fscache_acquire_cookie(vnode->volume->cache, &afs_vnode_cache_index_def, - vnode); + vnode, true); #endif ret = afs_inode_map_status(vnode, key); diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 57bcb1596530..b6df2e83809f 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -308,7 +308,8 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl, /* see if we have an in-cache copy (will set vl->valid if there is) */ #ifdef CONFIG_AFS_FSCACHE vl->cache = fscache_acquire_cookie(vl->cell->cache, - &afs_vlocation_cache_index_def, vl); + &afs_vlocation_cache_index_def, vl, + true); #endif if (vl->valid) { diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 401eeb21869f..2b607257820c 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -131,7 +131,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) #ifdef CONFIG_AFS_FSCACHE volume->cache = fscache_acquire_cookie(vlocation->cache, &afs_volume_cache_index_def, - volume); + volume, true); #endif afs_get_vlocation(vlocation); volume->vlocation = vlocation; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b0ef7b07b1b3..51e3afa78354 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6437,6 +6437,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, if (btrfs_extent_readonly(root, disk_bytenr)) goto out; + btrfs_release_path(path); /* * look for other files referencing this extent, if we diff --git a/fs/buffer.c b/fs/buffer.c index 4d7433534f5c..6024877335ca 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1005,9 +1005,19 @@ grow_dev_page(struct block_device *bdev, sector_t block, struct buffer_head *bh; sector_t end_block; int ret = 0; /* Will call free_more_memory() */ + gfp_t gfp_mask; - page = find_or_create_page(inode->i_mapping, index, - (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); + gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; + gfp_mask |= __GFP_MOVABLE; + /* + * XXX: __getblk_slow() can not really deal with failure and + * will endlessly loop on improvised global reclaim. Prefer + * looping in the allocator rather than here, at least that + * code knows what it's doing. + */ + gfp_mask |= __GFP_NOFAIL; + + page = find_or_create_page(inode->i_mapping, index, gfp_mask); if (!page) return ret; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 43eb5592cdea..00baf1419989 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -270,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object) #endif /* delete retired objects */ - if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) && + if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) && _object != cache->cache.fsdef ) { _debug("- retire object OBJ%x", object->fscache.debug_id); diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 6bfe65e0b038..7db2e6ca4b8f 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -68,7 +68,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) { fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, &ceph_fscache_fsid_object_def, - fsc); + fsc, true); if (fsc->fscache == NULL) { pr_err("Unable to resgister fsid: %p fscache cookie", fsc); @@ -204,7 +204,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, ci->fscache = fscache_acquire_cookie(fsc->fscache, &ceph_fscache_inode_object_def, - ci); + ci, true); done: mutex_unlock(&inode->i_mutex); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index a16b4e58bcc6..77fc5e181077 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -120,14 +120,16 @@ cifs_read_super(struct super_block *sb) { struct inode *inode; struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; int rc = 0; cifs_sb = CIFS_SB(sb); + tcon = cifs_sb_master_tcon(cifs_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) sb->s_flags |= MS_POSIXACL; - if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES) + if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files) sb->s_maxbytes = MAX_LFS_FILESIZE; else sb->s_maxbytes = MAX_NON_LFS; @@ -147,7 +149,7 @@ cifs_read_super(struct super_block *sb) goto out_no_root; } - if (cifs_sb_master_tcon(cifs_sb)->nocase) + if (tcon->nocase) sb->s_d_op = &cifs_ci_dentry_ops; else sb->s_d_op = &cifs_dentry_ops; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 52b6f6c26bfc..26b1c1dc93f6 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -278,6 +278,8 @@ struct smb_version_operations { /* set attributes */ int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *, const unsigned int); + int (*set_compression)(const unsigned int, struct cifs_tcon *, + struct cifsFileInfo *); /* check if we can send an echo or nor */ bool (*can_echo)(struct TCP_Server_Info *); /* send echo request */ @@ -620,11 +622,34 @@ set_credits(struct TCP_Server_Info *server, const int val) } static inline __u64 -get_next_mid(struct TCP_Server_Info *server) +get_next_mid64(struct TCP_Server_Info *server) { return server->ops->get_next_mid(server); } +static inline __le16 +get_next_mid(struct TCP_Server_Info *server) +{ + __u16 mid = get_next_mid64(server); + /* + * The value in the SMB header should be little endian for easy + * on-the-wire decoding. + */ + return cpu_to_le16(mid); +} + +static inline __u16 +get_mid(const struct smb_hdr *smb) +{ + return le16_to_cpu(smb->Mid); +} + +static inline bool +compare_mid(__u16 mid, const struct smb_hdr *smb) +{ + return mid == le16_to_cpu(smb->Mid); +} + /* * When the server supports very large reads and writes via POSIX extensions, * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not @@ -828,6 +853,8 @@ struct cifs_tcon { __u32 maximal_access; __u32 vol_serial_number; __le64 vol_create_time; + __u32 ss_flags; /* sector size flags */ + __u32 perf_sector_size; /* best sector size for perf */ #endif /* CONFIG_CIFS_SMB2 */ #ifdef CONFIG_CIFS_FSCACHE u64 resource_id; /* server resource id */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index a630475e421c..9e5ee34de986 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -428,7 +428,7 @@ struct smb_hdr { __u16 Tid; __le16 Pid; __u16 Uid; - __u16 Mid; + __le16 Mid; __u8 WordCount; } __attribute__((packed)); @@ -1352,6 +1352,35 @@ typedef struct smb_com_transaction_ioctl_req { __u8 Data[1]; } __attribute__((packed)) TRANSACT_IOCTL_REQ; +typedef struct smb_com_transaction_compr_ioctl_req { + struct smb_hdr hdr; /* wct = 23 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* four setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand; /* 2 = IOCTL/FSCTL */ + __le32 FunctionCode; + __u16 Fid; + __u8 IsFsctl; /* 1 = File System Control 0 = device control (IOCTL) */ + __u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */ + __le16 ByteCount; + __u8 Pad[3]; + __le16 compression_state; /* See below for valid flags */ +} __attribute__((packed)) TRANSACT_COMPR_IOCTL_REQ; + +/* compression state flags */ +#define COMPRESSION_FORMAT_NONE 0x0000 +#define COMPRESSION_FORMAT_DEFAULT 0x0001 +#define COMPRESSION_FORMAT_LZNT1 0x0002 + typedef struct smb_com_transaction_ioctl_rsp { struct smb_hdr hdr; /* wct = 19 */ __u8 Reserved[3]; @@ -1491,15 +1520,30 @@ struct file_notify_information { __u8 FileName[0]; } __attribute__((packed)); -struct reparse_data { - __u32 ReparseTag; - __u16 ReparseDataLength; +/* For IO_REPARSE_TAG_SYMLINK */ +struct reparse_symlink_data { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __le16 SubstituteNameOffset; + __le16 SubstituteNameLength; + __le16 PrintNameOffset; + __le16 PrintNameLength; + __le32 Flags; + char PathBuffer[0]; +} __attribute__((packed)); + +/* For IO_REPARSE_TAG_NFS */ +#define NFS_SPECFILE_LNK 0x00000000014B4E4C +#define NFS_SPECFILE_CHR 0x0000000000524843 +#define NFS_SPECFILE_BLK 0x00000000004B4C42 +#define NFS_SPECFILE_FIFO 0x000000004F464946 +#define NFS_SPECFILE_SOCK 0x000000004B434F53 +struct reparse_posix_data { + __le32 ReparseTag; + __le16 ReparseDataLength; __u16 Reserved; - __u16 SubstituteNameOffset; - __u16 SubstituteNameLength; - __u16 PrintNameOffset; - __u16 PrintNameLength; - __u32 Flags; + __le64 InodeType; /* LNK, FIFO, CHR etc. */ char PathBuffer[0]; } __attribute__((packed)); @@ -2200,6 +2244,9 @@ typedef struct { __le32 DeviceCharacteristics; } __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO; /* device info level 0x104 */ +/* minimum includes first three fields, and empty FS Name */ +#define MIN_FS_ATTR_INFO_SIZE 12 + typedef struct { __le32 Attributes; __le32 MaxPathNameComponentLength; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index b5ec2a268f56..aa3397620342 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -360,6 +360,8 @@ extern int CIFSSMBUnixQuerySymLink(const unsigned int xid, extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, char **symlinkinfo, const struct nls_table *nls_codepage); +extern int CIFSSMB_set_compression(const unsigned int xid, + struct cifs_tcon *tcon, __u16 fid); extern int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const int disposition, const int access_flags, const int omode, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4baf35949b51..93b29474714a 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3088,7 +3088,8 @@ CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, bool is_unicode; unsigned int sub_len; char *sub_start; - struct reparse_data *reparse_buf; + struct reparse_symlink_data *reparse_buf; + struct reparse_posix_data *posix_buf; __u32 data_offset, data_count; char *end_of_smb; @@ -3137,20 +3138,47 @@ CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, goto qreparse_out; } end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; - reparse_buf = (struct reparse_data *) + reparse_buf = (struct reparse_symlink_data *) ((char *)&pSMBr->hdr.Protocol + data_offset); if ((char *)reparse_buf >= end_of_smb) { rc = -EIO; goto qreparse_out; } - if ((reparse_buf->PathBuffer + reparse_buf->PrintNameOffset + - reparse_buf->PrintNameLength) > end_of_smb) { + if (reparse_buf->ReparseTag == cpu_to_le32(IO_REPARSE_TAG_NFS)) { + cifs_dbg(FYI, "NFS style reparse tag\n"); + posix_buf = (struct reparse_posix_data *)reparse_buf; + + if (posix_buf->InodeType != cpu_to_le64(NFS_SPECFILE_LNK)) { + cifs_dbg(FYI, "unsupported file type 0x%llx\n", + le64_to_cpu(posix_buf->InodeType)); + rc = -EOPNOTSUPP; + goto qreparse_out; + } + is_unicode = true; + sub_len = le16_to_cpu(reparse_buf->ReparseDataLength); + if (posix_buf->PathBuffer + sub_len > end_of_smb) { + cifs_dbg(FYI, "reparse buf beyond SMB\n"); + rc = -EIO; + goto qreparse_out; + } + *symlinkinfo = cifs_strndup_from_utf16(posix_buf->PathBuffer, + sub_len, is_unicode, nls_codepage); + goto qreparse_out; + } else if (reparse_buf->ReparseTag != + cpu_to_le32(IO_REPARSE_TAG_SYMLINK)) { + rc = -EOPNOTSUPP; + goto qreparse_out; + } + + /* Reparse tag is NTFS symlink */ + sub_start = le16_to_cpu(reparse_buf->SubstituteNameOffset) + + reparse_buf->PathBuffer; + sub_len = le16_to_cpu(reparse_buf->SubstituteNameLength); + if (sub_start + sub_len > end_of_smb) { cifs_dbg(FYI, "reparse buf beyond SMB\n"); rc = -EIO; goto qreparse_out; } - sub_start = reparse_buf->SubstituteNameOffset + reparse_buf->PathBuffer; - sub_len = reparse_buf->SubstituteNameLength; if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) is_unicode = true; else @@ -3171,6 +3199,60 @@ qreparse_out: return rc; } +int +CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + __u16 fid) +{ + int rc = 0; + int bytes_returned; + struct smb_com_transaction_compr_ioctl_req *pSMB; + struct smb_com_transaction_ioctl_rsp *pSMBr; + + cifs_dbg(FYI, "Set compression for %u\n", fid); + rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); + + pSMB->TotalParameterCount = 0; + pSMB->TotalDataCount = __constant_cpu_to_le32(2); + pSMB->MaxParameterCount = 0; + pSMB->MaxDataCount = 0; + pSMB->MaxSetupCount = 4; + pSMB->Reserved = 0; + pSMB->ParameterOffset = 0; + pSMB->DataCount = __constant_cpu_to_le32(2); + pSMB->DataOffset = + cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req, + compression_state) - 4); /* 84 */ + pSMB->SetupCount = 4; + pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL); + pSMB->ParameterCount = 0; + pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION); + pSMB->IsFsctl = 1; /* FSCTL */ + pSMB->IsRootFlag = 0; + pSMB->Fid = fid; /* file handle always le */ + /* 3 byte pad, followed by 2 byte compress state */ + pSMB->ByteCount = __constant_cpu_to_le16(5); + inc_rfc1001_len(pSMB, 5); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "Send error in SetCompression = %d\n", rc); + + cifs_buf_release(pSMB); + + /* + * Note: On -EAGAIN error only caller can retry on handle based calls + * since file handle passed in no longer valid. + */ + return rc; +} + + #ifdef CONFIG_CIFS_POSIX /*Convert an Access Control Entry from wire format to local POSIX xattr format*/ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a279ffc0bc29..62a55147400a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2242,6 +2242,8 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (ses->status == CifsExiting) + continue; if (!match_session(ses, vol)) continue; ++ses->ses_count; @@ -2255,24 +2257,37 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) static void cifs_put_smb_ses(struct cifs_ses *ses) { - unsigned int xid; + unsigned int rc, xid; struct TCP_Server_Info *server = ses->server; cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); + spin_lock(&cifs_tcp_ses_lock); + if (ses->status == CifsExiting) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } if (--ses->ses_count > 0) { spin_unlock(&cifs_tcp_ses_lock); return; } - - list_del_init(&ses->smb_ses_list); + if (ses->status == CifsGood) + ses->status = CifsExiting; spin_unlock(&cifs_tcp_ses_lock); - if (ses->status == CifsGood && server->ops->logoff) { + if (ses->status == CifsExiting && server->ops->logoff) { xid = get_xid(); - server->ops->logoff(xid, ses); + rc = server->ops->logoff(xid, ses); + if (rc) + cifs_dbg(VFS, "%s: Session Logoff failure rc=%d\n", + __func__, rc); _free_xid(xid); } + + spin_lock(&cifs_tcp_ses_lock); + list_del_init(&ses->smb_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + sesInfoFree(ses); cifs_put_tcp_session(server); } diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index b3258f35e88a..8d4b7bc8ae91 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -27,7 +27,7 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) { server->fscache = fscache_acquire_cookie(cifs_fscache_netfs.primary_index, - &cifs_fscache_server_index_def, server); + &cifs_fscache_server_index_def, server, true); cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, server, server->fscache); } @@ -46,7 +46,7 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) tcon->fscache = fscache_acquire_cookie(server->fscache, - &cifs_fscache_super_index_def, tcon); + &cifs_fscache_super_index_def, tcon, true); cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, server->fscache, tcon->fscache); } @@ -69,7 +69,7 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { cifsi->fscache = fscache_acquire_cookie(tcon->fscache, - &cifs_fscache_inode_object_def, cifsi); + &cifs_fscache_inode_object_def, cifsi, true); cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n", __func__, tcon->fscache, cifsi->fscache); } @@ -119,7 +119,7 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode) cifsi->fscache = fscache_acquire_cookie( cifs_sb_master_tcon(cifs_sb)->fscache, &cifs_fscache_inode_object_def, - cifsi); + cifsi, true); cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n", __func__, cifsi->fscache, old); } diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 3e0845585853..ba54bf6ab116 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -3,7 +3,7 @@ * * vfs operations that deal with io control * - * Copyright (C) International Business Machines Corp., 2005,2007 + * Copyright (C) International Business Machines Corp., 2005,2013 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -34,13 +34,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) int rc = -ENOTTY; /* strange error - but the precedent */ unsigned int xid; struct cifs_sb_info *cifs_sb; -#ifdef CONFIG_CIFS_POSIX struct cifsFileInfo *pSMBFile = filep->private_data; struct cifs_tcon *tcon; __u64 ExtAttrBits = 0; - __u64 ExtAttrMask = 0; __u64 caps; -#endif /* CONFIG_CIFS_POSIX */ xid = get_xid(); @@ -49,13 +46,14 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) cifs_sb = CIFS_SB(inode->i_sb); switch (command) { -#ifdef CONFIG_CIFS_POSIX case FS_IOC_GETFLAGS: if (pSMBFile == NULL) break; tcon = tlink_tcon(pSMBFile->tlink); caps = le64_to_cpu(tcon->fsUnixInfo.Capability); +#ifdef CONFIG_CIFS_POSIX if (CIFS_UNIX_EXTATTR_CAP & caps) { + __u64 ExtAttrMask = 0; rc = CIFSGetExtAttr(xid, tcon, pSMBFile->fid.netfid, &ExtAttrBits, &ExtAttrMask); @@ -63,29 +61,50 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = put_user(ExtAttrBits & FS_FL_USER_VISIBLE, (int __user *)arg); + if (rc != EOPNOTSUPP) + break; + } +#endif /* CONFIG_CIFS_POSIX */ + rc = 0; + if (CIFS_I(inode)->cifsAttrs & ATTR_COMPRESSED) { + /* add in the compressed bit */ + ExtAttrBits = FS_COMPR_FL; + rc = put_user(ExtAttrBits & FS_FL_USER_VISIBLE, + (int __user *)arg); } break; - case FS_IOC_SETFLAGS: if (pSMBFile == NULL) break; tcon = tlink_tcon(pSMBFile->tlink); caps = le64_to_cpu(tcon->fsUnixInfo.Capability); - if (CIFS_UNIX_EXTATTR_CAP & caps) { - if (get_user(ExtAttrBits, (int __user *)arg)) { - rc = -EFAULT; - break; - } - /* - * rc = CIFSGetExtAttr(xid, tcon, - * pSMBFile->fid.netfid, - * extAttrBits, - * &ExtAttrMask); - */ + + if (get_user(ExtAttrBits, (int __user *)arg)) { + rc = -EFAULT; + break; + } + + /* + * if (CIFS_UNIX_EXTATTR_CAP & caps) + * rc = CIFSSetExtAttr(xid, tcon, + * pSMBFile->fid.netfid, + * extAttrBits, + * &ExtAttrMask); + * if (rc != EOPNOTSUPP) + * break; + */ + + /* Currently only flag we can set is compressed flag */ + if ((ExtAttrBits & FS_COMPR_FL) == 0) + break; + + /* Try to set compress flag */ + if (tcon->ses->server->ops->set_compression) { + rc = tcon->ses->server->ops->set_compression( + xid, tcon, pSMBFile); + cifs_dbg(FYI, "set compress flag rc %d\n", rc); } - cifs_dbg(FYI, "set flags not implemented yet\n"); break; -#endif /* CONFIG_CIFS_POSIX */ default: cifs_dbg(FYI, "unsupported ioctl\n"); break; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 138a011633fe..2f9f3790679d 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -278,7 +278,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , } static int -check_smb_hdr(struct smb_hdr *smb, __u16 mid) +check_smb_hdr(struct smb_hdr *smb) { /* does it have the right SMB "signature" ? */ if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) { @@ -287,13 +287,6 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) return 1; } - /* Make sure that message ids match */ - if (mid != smb->Mid) { - cifs_dbg(VFS, "Mids do not match. received=%u expected=%u\n", - smb->Mid, mid); - return 1; - } - /* if it's a response then accept */ if (smb->Flags & SMBFLG_RESPONSE) return 0; @@ -302,7 +295,8 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) if (smb->Command == SMB_COM_LOCKING_ANDX) return 0; - cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", smb->Mid); + cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", + get_mid(smb)); return 1; } @@ -310,7 +304,6 @@ int checkSMB(char *buf, unsigned int total_read) { struct smb_hdr *smb = (struct smb_hdr *)buf; - __u16 mid = smb->Mid; __u32 rfclen = be32_to_cpu(smb->smb_buf_length); __u32 clc_len; /* calculated length */ cifs_dbg(FYI, "checkSMB Length: 0x%x, smb_buf_length: 0x%x\n", @@ -348,7 +341,7 @@ checkSMB(char *buf, unsigned int total_read) } /* otherwise, there is enough to get to the BCC */ - if (check_smb_hdr(smb, mid)) + if (check_smb_hdr(smb)) return -EIO; clc_len = smbCalcSize(smb); @@ -359,6 +352,7 @@ checkSMB(char *buf, unsigned int total_read) } if (4 + rfclen != clc_len) { + __u16 mid = get_mid(smb); /* check if bcc wrapped around for large read responses */ if ((rfclen > 64 * 1024) && (rfclen > clc_len)) { /* check if lengths match mod 64K */ @@ -366,11 +360,11 @@ checkSMB(char *buf, unsigned int total_read) return 0; /* bcc wrapped */ } cifs_dbg(FYI, "Calculated size %u vs length %u mismatch for mid=%u\n", - clc_len, 4 + rfclen, smb->Mid); + clc_len, 4 + rfclen, mid); if (4 + rfclen < clc_len) { cifs_dbg(VFS, "RFC1001 size %u smaller than SMB for mid=%u\n", - rfclen, smb->Mid); + rfclen, mid); return -EIO; } else if (rfclen > clc_len + 512) { /* @@ -383,7 +377,7 @@ checkSMB(char *buf, unsigned int total_read) * data to 512 bytes. */ cifs_dbg(VFS, "RFC1001 size %u more than 512 bytes larger than SMB for mid=%u\n", - rfclen, smb->Mid); + rfclen, mid); return -EIO; } } diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index af847e1cf1c1..651a5279607b 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -780,7 +780,9 @@ static const struct { ERRDOS, ERRnoaccess, 0xc0000290}, { ERRDOS, ERRbadfunc, 0xc000029c}, { ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, { - ERRDOS, ERRinvlevel, 0x007c0001}, }; + ERRDOS, ERRinvlevel, 0x007c0001}, { + 0, 0, 0 } +}; /***************************************************************************** Print an error message from the status code diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 352358de1d7e..e87387dbf39f 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -500,9 +500,9 @@ select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) return NTLMv2; if (global_secflags & CIFSSEC_MAY_NTLM) return NTLM; - /* Fallthrough */ default: - return Unspecified; + /* Fallthrough to attempt LANMAN authentication next */ + break; } case CIFS_NEGFLAVOR_LANMAN: switch (requested) { diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 8233b174de3d..384cffe42850 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -67,7 +67,7 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf, mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n", - in_buf->Mid, rc); + get_mid(in_buf), rc); return rc; } @@ -101,7 +101,7 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer) spin_lock(&GlobalMid_Lock); list_for_each_entry(mid, &server->pending_mid_q, qhead) { - if (mid->mid == buf->Mid && + if (compare_mid(mid->mid, buf) && mid->mid_state == MID_REQUEST_SUBMITTED && le16_to_cpu(mid->command) == buf->Command) { spin_unlock(&GlobalMid_Lock); @@ -807,6 +807,13 @@ out: } static int +cifs_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile) +{ + return CIFSSMB_set_compression(xid, tcon, cfile->fid.netfid); +} + +static int cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, struct cifs_fid *fid, __u16 search_flags, @@ -956,6 +963,7 @@ struct smb_version_operations smb1_operations = { .set_path_size = CIFSSMBSetEOF, .set_file_size = CIFSSMBSetFileSize, .set_file_info = smb_set_file_info, + .set_compression = cifs_set_compression, .echo = CIFSSMBEcho, .mkdir = CIFSSMBMkDir, .mkdir_setinfo = cifs_mkdir_setinfo, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 861b33214144..c571be8cb76e 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -209,6 +209,94 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) return rsize; } +#ifdef CONFIG_CIFS_STATS2 +static int +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) +{ + int rc; + unsigned int ret_data_len = 0; + struct network_interface_info_ioctl_rsp *out_buf; + + rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, + FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, + NULL /* no data input */, 0 /* no data input */, + (char **)&out_buf, &ret_data_len); + + if ((rc == 0) && (ret_data_len > 0)) { + /* Dump info on first interface */ + cifs_dbg(FYI, "Adapter Capability 0x%x\t", + le32_to_cpu(out_buf->Capability)); + cifs_dbg(FYI, "Link Speed %lld\n", + le64_to_cpu(out_buf->LinkSpeed)); + } else + cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc); + + return rc; +} +#endif /* STATS2 */ + +static void +smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) +{ + int rc; + __le16 srch_path = 0; /* Null - open root of share */ + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL); + if (rc) + return; + +#ifdef CONFIG_CIFS_STATS2 + SMB3_request_interfaces(xid, tcon); +#endif /* STATS2 */ + + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_ATTRIBUTE_INFORMATION); + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_DEVICE_INFORMATION); + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */ + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + return; +} + +static void +smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) +{ + int rc; + __le16 srch_path = 0; /* Null - open root of share */ + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL); + if (rc) + return; + + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_ATTRIBUTE_INFORMATION); + SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, + FS_DEVICE_INFORMATION); + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + return; +} + static int smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path) @@ -304,7 +392,19 @@ smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon) seq_puts(m, " ASYMMETRIC,"); if (tcon->capabilities == 0) seq_puts(m, " None"); + if (tcon->ss_flags & SSINFO_FLAGS_ALIGNED_DEVICE) + seq_puts(m, " Aligned,"); + if (tcon->ss_flags & SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE) + seq_puts(m, " Partition Aligned,"); + if (tcon->ss_flags & SSINFO_FLAGS_NO_SEEK_PENALTY) + seq_puts(m, " SSD,"); + if (tcon->ss_flags & SSINFO_FLAGS_TRIM_ENABLED) + seq_puts(m, " TRIM-support,"); + seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags); + if (tcon->perf_sector_size) + seq_printf(m, "\tOptimal sector size: 0x%x", + tcon->perf_sector_size); } static void @@ -446,6 +546,14 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, } static int +smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile) +{ + return SMB2_set_compression(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid); +} + +static int smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, struct cifs_fid *fid, __u16 search_flags, @@ -865,6 +973,7 @@ struct smb_version_operations smb20_operations = { .logoff = SMB2_logoff, .tree_connect = SMB2_tcon, .tree_disconnect = SMB2_tdis, + .qfs_tcon = smb2_qfs_tcon, .is_path_accessible = smb2_is_path_accessible, .can_echo = smb2_can_echo, .echo = SMB2_echo, @@ -874,6 +983,7 @@ struct smb_version_operations smb20_operations = { .set_path_size = smb2_set_path_size, .set_file_size = smb2_set_file_size, .set_file_info = smb2_set_file_info, + .set_compression = smb2_set_compression, .mkdir = smb2_mkdir, .mkdir_setinfo = smb2_mkdir_setinfo, .rmdir = smb2_rmdir, @@ -936,6 +1046,7 @@ struct smb_version_operations smb21_operations = { .logoff = SMB2_logoff, .tree_connect = SMB2_tcon, .tree_disconnect = SMB2_tdis, + .qfs_tcon = smb2_qfs_tcon, .is_path_accessible = smb2_is_path_accessible, .can_echo = smb2_can_echo, .echo = SMB2_echo, @@ -945,6 +1056,7 @@ struct smb_version_operations smb21_operations = { .set_path_size = smb2_set_path_size, .set_file_size = smb2_set_file_size, .set_file_info = smb2_set_file_info, + .set_compression = smb2_set_compression, .mkdir = smb2_mkdir, .mkdir_setinfo = smb2_mkdir_setinfo, .rmdir = smb2_rmdir, @@ -1008,6 +1120,7 @@ struct smb_version_operations smb30_operations = { .logoff = SMB2_logoff, .tree_connect = SMB2_tcon, .tree_disconnect = SMB2_tdis, + .qfs_tcon = smb3_qfs_tcon, .is_path_accessible = smb2_is_path_accessible, .can_echo = smb2_can_echo, .echo = SMB2_echo, @@ -1017,6 +1130,7 @@ struct smb_version_operations smb30_operations = { .set_path_size = smb2_set_path_size, .set_file_size = smb2_set_file_size, .set_file_info = smb2_set_file_info, + .set_compression = smb2_set_compression, .mkdir = smb2_mkdir, .mkdir_setinfo = smb2_mkdir_setinfo, .rmdir = smb2_rmdir, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index eba0efde66d7..8ab05b0d6778 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -687,6 +687,10 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) else return -EIO; + /* no need to send SMB logoff if uid already closed due to reconnect */ + if (ses->need_reconnect) + goto smb2_session_already_dead; + rc = small_smb2_init(SMB2_LOGOFF, NULL, (void **) &req); if (rc) return rc; @@ -701,6 +705,8 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) * No tcon so can't do * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); */ + +smb2_session_already_dead: return rc; } @@ -1131,6 +1137,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, cifs_dbg(FYI, "SMB2 IOCTL\n"); + *out_data = NULL; /* zero out returned data len, in case of error */ if (plen) *plen = 0; @@ -1176,11 +1183,23 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, req->Flags = 0; iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov[0].iov_len = get_rfc1002_length(req) + 4; - if (indatalen) - inc_rfc1001_len(req, indatalen); + /* + * If no input data, the size of ioctl struct in + * protocol spec still includes a 1 byte data buffer, + * but if input data passed to ioctl, we do not + * want to double count this, so we do not send + * the dummy one byte of data in iovec[0] if sending + * input data (in iovec[1]). We also must add 4 bytes + * in first iovec to allow for rfc1002 length field. + */ + + if (indatalen) { + iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + inc_rfc1001_len(req, indatalen - 1); + } else + iov[0].iov_len = get_rfc1002_length(req) + 4; + rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base; @@ -1228,6 +1247,33 @@ ioctl_exit: return rc; } +/* + * Individual callers to ioctl worker function follow + */ + +int +SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid) +{ + int rc; + char *res_key = NULL; + struct compress_ioctl fsctl_input; + char *ret_data = NULL; + + fsctl_input.CompressionState = + __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); + + rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, + FSCTL_SET_COMPRESSION, true /* is_fsctl */, + (char *)&fsctl_input /* data input */, + 2 /* in data len */, &ret_data /* out data */, NULL); + + cifs_dbg(FYI, "set compression rc %d\n", rc); + kfree(res_key); + + return rc; +} + int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) @@ -2293,7 +2339,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); - goto qinf_exit; + goto qfsinf_exit; } rsp = (struct smb2_query_info_rsp *)iov.iov_base; @@ -2305,7 +2351,70 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, if (!rc) copy_fs_info_to_kstatfs(info, fsdata); -qinf_exit: +qfsinf_exit: + free_rsp_buf(resp_buftype, iov.iov_base); + return rc; +} + +int +SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, int level) +{ + struct smb2_query_info_rsp *rsp = NULL; + struct kvec iov; + int rc = 0; + int resp_buftype, max_len, min_len; + struct cifs_ses *ses = tcon->ses; + unsigned int rsp_len, offset; + + if (level == FS_DEVICE_INFORMATION) { + max_len = sizeof(FILE_SYSTEM_DEVICE_INFO); + min_len = sizeof(FILE_SYSTEM_DEVICE_INFO); + } else if (level == FS_ATTRIBUTE_INFORMATION) { + max_len = sizeof(FILE_SYSTEM_ATTRIBUTE_INFO); + min_len = MIN_FS_ATTR_INFO_SIZE; + } else if (level == FS_SECTOR_SIZE_INFORMATION) { + max_len = sizeof(struct smb3_fs_ss_info); + min_len = sizeof(struct smb3_fs_ss_info); + } else { + cifs_dbg(FYI, "Invalid qfsinfo level %d\n", level); + return -EINVAL; + } + + rc = build_qfs_info_req(&iov, tcon, level, max_len, + persistent_fid, volatile_fid); + if (rc) + return rc; + + rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0); + if (rc) { + cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); + goto qfsattr_exit; + } + rsp = (struct smb2_query_info_rsp *)iov.iov_base; + + rsp_len = le32_to_cpu(rsp->OutputBufferLength); + offset = le16_to_cpu(rsp->OutputBufferOffset); + rc = validate_buf(offset, rsp_len, &rsp->hdr, min_len); + if (rc) + goto qfsattr_exit; + + if (level == FS_ATTRIBUTE_INFORMATION) + memcpy(&tcon->fsAttrInfo, 4 /* RFC1001 len */ + offset + + (char *)&rsp->hdr, min_t(unsigned int, + rsp_len, max_len)); + else if (level == FS_DEVICE_INFORMATION) + memcpy(&tcon->fsDevInfo, 4 /* RFC1001 len */ + offset + + (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO)); + else if (level == FS_SECTOR_SIZE_INFORMATION) { + struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *) + (4 /* RFC1001 len */ + offset + (char *)&rsp->hdr); + tcon->ss_flags = le32_to_cpu(ss_info->Flags); + tcon->perf_sector_size = + le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf); + } + +qfsattr_exit: free_rsp_buf(resp_buftype, iov.iov_base); return rc; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index b83d0118a757..6183b1b7550f 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -569,6 +569,10 @@ struct network_interface_info_ioctl_rsp { #define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */ +struct compress_ioctl { + __le16 CompressionState; /* See cifspdu.h for possible flag values */ +} __packed; + struct smb2_ioctl_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 57 */ @@ -584,7 +588,7 @@ struct smb2_ioctl_req { __le32 MaxOutputResponse; __le32 Flags; __u32 Reserved2; - char Buffer[0]; + __u8 Buffer[0]; } __packed; struct smb2_ioctl_rsp { @@ -870,14 +874,16 @@ struct smb2_lease_ack { /* File System Information Classes */ #define FS_VOLUME_INFORMATION 1 /* Query */ -#define FS_LABEL_INFORMATION 2 /* Set */ +#define FS_LABEL_INFORMATION 2 /* Local only */ #define FS_SIZE_INFORMATION 3 /* Query */ #define FS_DEVICE_INFORMATION 4 /* Query */ #define FS_ATTRIBUTE_INFORMATION 5 /* Query */ #define FS_CONTROL_INFORMATION 6 /* Query, Set */ #define FS_FULL_SIZE_INFORMATION 7 /* Query */ #define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */ -#define FS_DRIVER_PATH_INFORMATION 9 /* Query */ +#define FS_DRIVER_PATH_INFORMATION 9 /* Local only */ +#define FS_VOLUME_FLAGS_INFORMATION 10 /* Local only */ +#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */ struct smb2_fs_full_size_info { __le64 TotalAllocationUnits; @@ -887,6 +893,22 @@ struct smb2_fs_full_size_info { __le32 BytesPerSector; } __packed; +#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001 +#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002 +#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004 +#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008 + +/* sector size info struct */ +struct smb3_fs_ss_info { + __le32 LogicalBytesPerSector; + __le32 PhysicalBytesPerSectorForAtomicity; + __le32 PhysicalBytesPerSectorForPerf; + __le32 FileSystemEffectivePhysicalBytesPerSectorForAtomicity; + __le32 Flags; + __le32 ByteOffsetForSectorAlignment; + __le32 ByteOffsetForPartitionAlignment; +} __packed; + /* partial list of QUERY INFO levels */ #define FILE_DIRECTORY_INFORMATION 1 #define FILE_FULL_DIRECTORY_INFORMATION 2 diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index e3fb4801ee96..313813e4c19b 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -142,12 +142,16 @@ extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, FILE_BASIC_INFO *buf); +extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid); extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, const u64 persistent_fid, const u64 volatile_fid, const __u8 oplock_level); extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct kstatfs *FSData); +extern int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_file_id, u64 volatile_file_id, int lvl); extern int SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon, const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid, const __u64 length, const __u64 offset, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 340abca3aa52..59c748ce872f 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -466,7 +466,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) static inline void smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr) { - hdr->MessageId = get_next_mid(server); + hdr->MessageId = get_next_mid64(server); } static struct mid_q_entry * @@ -516,13 +516,19 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf, return -EAGAIN; } - if (ses->status != CifsGood) { - /* check if SMB2 session is bad because we are setting it up */ + if (ses->status == CifsNew) { if ((buf->Command != SMB2_SESSION_SETUP) && (buf->Command != SMB2_NEGOTIATE)) return -EAGAIN; /* else ok - we are setting up session */ } + + if (ses->status == CifsExiting) { + if (buf->Command != SMB2_LOGOFF) + return -EAGAIN; + /* else ok - we are shutting down the session */ + } + *mid = smb2_mid_entry_alloc(buf, ses->server); if (*mid == NULL) return -ENOMEM; diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index d952ee48f4dc..a4b2391fe66e 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -97,9 +97,23 @@ #define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */ #define FSCTL_SRV_READ_HASH 0x001441BB /* BB add struct */ +/* See FSCC 2.1.2.5 */ #define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003 #define IO_REPARSE_TAG_HSM 0xC0000004 #define IO_REPARSE_TAG_SIS 0x80000007 +#define IO_REPARSE_TAG_HSM2 0x80000006 +#define IO_REPARSE_TAG_DRIVER_EXTENDER 0x80000005 +/* Used by the DFS filter. See MS-DFSC */ +#define IO_REPARSE_TAG_DFS 0x8000000A +/* Used by the DFS filter See MS-DFSC */ +#define IO_REPARSE_TAG_DFSR 0x80000012 +#define IO_REPARSE_TAG_FILTER_MANAGER 0x8000000B +/* See section MS-FSCC 2.1.2.4 */ +#define IO_REPARSE_TAG_SYMLINK 0xA000000C +#define IO_REPARSE_TAG_DEDUP 0x80000013 +#define IO_REPARSE_APPXSTREAM 0xC0000014 +/* NFS symlinks, Win 8/SMB3 and later */ +#define IO_REPARSE_TAG_NFS 0x80000014 /* fsctl flags */ /* If Flags is set to this value, the request is an FSCTL not ioctl request */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 6fdcb1b4a106..b37570952846 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -58,7 +58,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) return temp; else { memset(temp, 0, sizeof(struct mid_q_entry)); - temp->mid = smb_buffer->Mid; /* always LE */ + temp->mid = get_mid(smb_buffer); temp->pid = current->pid; temp->command = cpu_to_le16(smb_buffer->Command); cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command); @@ -410,8 +410,13 @@ static int wait_for_free_request(struct TCP_Server_Info *server, const int timeout, const int optype) { - return wait_for_free_credits(server, timeout, - server->ops->get_credits_field(server, optype)); + int *val; + + val = server->ops->get_credits_field(server, optype); + /* Since an echo is already inflight, no need to wait to send another */ + if (*val <= 0 && optype == CIFS_ECHO_OP) + return -EAGAIN; + return wait_for_free_credits(server, timeout, val); } static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, @@ -426,13 +431,20 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, return -EAGAIN; } - if (ses->status != CifsGood) { - /* check if SMB session is bad because we are setting it up */ + if (ses->status == CifsNew) { if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && (in_buf->Command != SMB_COM_NEGOTIATE)) return -EAGAIN; /* else ok - we are setting up session */ } + + if (ses->status == CifsExiting) { + /* check if SMB session is bad because we are setting it up */ + if (in_buf->Command != SMB_COM_LOGOFF_ANDX) + return -EAGAIN; + /* else ok - we are shutting down session */ + } + *ppmidQ = AllocMidQEntry(in_buf, ses->server); if (*ppmidQ == NULL) return -ENOMEM; diff --git a/fs/dcache.c b/fs/dcache.c index 41000305d716..ae6ebb88ceff 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -542,7 +542,7 @@ EXPORT_SYMBOL(d_drop); * If ref is non-zero, then decrement the refcount too. * Returns dentry requiring refcount drop, or NULL if we're done. */ -static inline struct dentry * +static struct dentry * dentry_kill(struct dentry *dentry, int unlock_on_failure) __releases(dentry->d_lock) { @@ -630,7 +630,8 @@ repeat: goto kill_it; } - dentry->d_flags |= DCACHE_REFERENCED; + if (!(dentry->d_flags & DCACHE_REFERENCED)) + dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); dentry->d_lockref.count--; @@ -1331,14 +1332,6 @@ rename_retry: * list is non-empty and continue searching. */ -/** - * have_submounts - check for mounts over a dentry - * @parent: dentry to check. - * - * Return true if the parent or its subdirectories contain - * a mount point - */ - static enum d_walk_ret check_mount(void *data, struct dentry *dentry) { int *ret = data; @@ -1349,6 +1342,13 @@ static enum d_walk_ret check_mount(void *data, struct dentry *dentry) return D_WALK_CONTINUE; } +/** + * have_submounts - check for mounts over a dentry + * @parent: dentry to check. + * + * Return true if the parent or its subdirectories contain + * a mount point + */ int have_submounts(struct dentry *parent) { int ret = 0; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index c88e355f7635..000eae2782b6 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -408,7 +408,7 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat, struct page *page) { return ecryptfs_lower_header_size(crypt_stat) + - (page->index << PAGE_CACHE_SHIFT); + ((loff_t)page->index << PAGE_CACHE_SHIFT); } /** diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 7d52806c2119..4725a07f003c 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1149,7 +1149,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_msg_ctx *msg_ctx; struct ecryptfs_message *msg = NULL; char *auth_tok_sig; - char *payload; + char *payload = NULL; size_t payload_len = 0; int rc; @@ -1203,6 +1203,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, } out: kfree(msg); + kfree(payload); return rc; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 473e09da7d02..810c28fb8c3c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -34,7 +34,6 @@ #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <linux/device.h> -#include <linux/freezer.h> #include <asm/uaccess.h> #include <asm/io.h> #include <asm/mman.h> @@ -1605,8 +1604,7 @@ fetch_events: } spin_unlock_irqrestore(&ep->lock, flags); - if (!freezable_schedule_hrtimeout_range(to, slack, - HRTIMER_MODE_ABS)) + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) timed_out = 1; spin_lock_irqsave(&ep->lock, flags); diff --git a/fs/exec.c b/fs/exec.c index 8875dd10ae7a..2ea437e5acf4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1547,6 +1547,7 @@ static int do_execve_common(const char *filename, current->fs->in_exec = 0; current->in_execve = 0; acct_update_integrals(current); + task_numa_free(current); free_bprm(bprm); if (displaced) put_files_struct(displaced); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c260de6d7b6d..8a337640a46a 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -632,6 +632,8 @@ static int ext2_get_blocks(struct inode *inode, int count = 0; ext2_fsblk_t first_block = 0; + BUG_ON(maxblocks == 0); + depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary); if (depth == 0) diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index 1c3312858fcf..e98171a11cfe 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -35,6 +35,7 @@ __ext2_get_block(struct inode *inode, pgoff_t pgoff, int create, int rc; memset(&tmp, 0, sizeof(struct buffer_head)); + tmp.b_size = 1 << inode->i_blkbits; rc = ext2_get_block(inode, pgoff, &tmp, create); *result = tmp.b_blocknr; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 1194b1f0f839..f8cde46de9cd 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1783,7 +1783,7 @@ retry: d_tmpfile(dentry, inode); err = ext3_orphan_add(handle, inode); if (err) - goto err_drop_inode; + goto err_unlock_inode; mark_inode_dirty(inode); unlock_new_inode(inode); } @@ -1791,10 +1791,9 @@ retry: if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; -err_drop_inode: +err_unlock_inode: ext3_journal_stop(handle); unlock_new_inode(inode); - iput(inode); return err; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index c50c76190373..37fd31ed16e7 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2825,6 +2825,10 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) * bitmap, and an inode table. */ overhead += ngroups * (2 + sbi->s_itb_per_group); + + /* Add the journal blocks as well */ + overhead += sbi->s_journal->j_maxlen; + sbi->s_overhead_last = overhead; smp_wmb(); sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1bec5a5c1e45..5a0408d7b114 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2319,7 +2319,7 @@ retry: d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); if (err) - goto err_drop_inode; + goto err_unlock_inode; mark_inode_dirty(inode); unlock_new_inode(inode); } @@ -2328,10 +2328,9 @@ retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; -err_drop_inode: +err_unlock_inode: ext4_journal_stop(handle); unlock_new_inode(inode); - iput(inode); return err; } diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index e06e0995e00f..214fe1054fce 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -63,3 +63,11 @@ config F2FS_FS_SECURITY the extended attribute support in advance. If you are not using a security module, say N. + +config F2FS_CHECK_FS + bool "F2FS consistency checking feature" + depends on F2FS_FS + help + Enables BUG_ONs which check the file system consistency in runtime. + + If you want to improve the performance, say N. diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index b7826ec1b470..d0fc287efeff 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -205,7 +205,8 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) return acl; } -static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +static int f2fs_set_acl(struct inode *inode, int type, + struct posix_acl *acl, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -250,7 +251,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) } } - error = f2fs_setxattr(inode, name_index, "", value, size, NULL); + error = f2fs_setxattr(inode, name_index, "", value, size, ipage); kfree(value); if (!error) @@ -260,10 +261,10 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -int f2fs_init_acl(struct inode *inode, struct inode *dir) +int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage) { - struct posix_acl *acl = NULL; struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct posix_acl *acl = NULL; int error = 0; if (!S_ISLNK(inode->i_mode)) { @@ -276,19 +277,19 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir) inode->i_mode &= ~current_umask(); } - if (test_opt(sbi, POSIX_ACL) && acl) { + if (!test_opt(sbi, POSIX_ACL) || !acl) + goto cleanup; - if (S_ISDIR(inode->i_mode)) { - error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) - error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl); + if (S_ISDIR(inode->i_mode)) { + error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl, ipage); + if (error) + goto cleanup; } + error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); + if (error < 0) + return error; + if (error > 0) + error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, ipage); cleanup: posix_acl_release(acl); return error; @@ -313,7 +314,8 @@ int f2fs_acl_chmod(struct inode *inode) error = posix_acl_chmod(&acl, GFP_KERNEL, mode); if (error) return error; - error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl); + + error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, NULL); posix_acl_release(acl); return error; } @@ -388,7 +390,7 @@ static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name, acl = NULL; } - error = f2fs_set_acl(inode, type, acl); + error = f2fs_set_acl(inode, type, acl, NULL); release_and_out: posix_acl_release(acl); diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 80f430674417..49633131e038 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -36,9 +36,9 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL -extern struct posix_acl *f2fs_get_acl(struct inode *inode, int type); -extern int f2fs_acl_chmod(struct inode *inode); -extern int f2fs_init_acl(struct inode *inode, struct inode *dir); +extern struct posix_acl *f2fs_get_acl(struct inode *, int); +extern int f2fs_acl_chmod(struct inode *); +extern int f2fs_init_acl(struct inode *, struct inode *, struct page *); #else #define f2fs_check_acl NULL #define f2fs_get_acl NULL @@ -49,7 +49,8 @@ static inline int f2fs_acl_chmod(struct inode *inode) return 0; } -static inline int f2fs_init_acl(struct inode *inode, struct inode *dir) +static inline int f2fs_init_acl(struct inode *inode, struct inode *dir, + struct page *page) { return 0; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index bb312201ca95..5716e5eb4e8e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -81,7 +81,7 @@ static int f2fs_write_meta_page(struct page *page, struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); /* Should not write any meta pages, if any IO error was occurred */ - if (wbc->for_reclaim || + if (wbc->for_reclaim || sbi->por_doing || is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) { dec_page_count(sbi, F2FS_DIRTY_META); wbc->pages_skipped++; @@ -142,8 +142,8 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; lock_page(page); - BUG_ON(page->mapping != mapping); - BUG_ON(!PageDirty(page)); + f2fs_bug_on(page->mapping != mapping); + f2fs_bug_on(!PageDirty(page)); clear_page_dirty_for_io(page); if (f2fs_write_meta_page(page, &wbc)) { unlock_page(page); @@ -167,6 +167,8 @@ static int f2fs_set_meta_page_dirty(struct page *page) struct address_space *mapping = page->mapping; struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); + trace_f2fs_set_page_dirty(page, META); + SetPageUptodate(page); if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); @@ -206,6 +208,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) void release_orphan_inode(struct f2fs_sb_info *sbi) { mutex_lock(&sbi->orphan_inode_mutex); + f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; mutex_unlock(&sbi->orphan_inode_mutex); } @@ -225,12 +228,8 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) break; orphan = NULL; } -retry: - new = kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); - if (!new) { - cond_resched(); - goto retry; - } + + new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); new->ino = ino; /* add new_oentry into list which is sorted by inode number */ @@ -253,6 +252,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) if (orphan->ino == ino) { list_del(&orphan->list); kmem_cache_free(orphan_entry_slab, orphan); + f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; break; } @@ -263,7 +263,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode = f2fs_iget(sbi->sb, ino); - BUG_ON(IS_ERR(inode)); + f2fs_bug_on(IS_ERR(inode)); clear_nlink(inode); /* truncate all the data during iput */ @@ -277,7 +277,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) return 0; - sbi->por_doing = 1; + sbi->por_doing = true; start_blk = __start_cp_addr(sbi) + 1; orphan_blkaddr = __start_sum_addr(sbi) - 1; @@ -294,7 +294,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) } /* clear Orphan Flag */ clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); - sbi->por_doing = 0; + sbi->por_doing = false; return 0; } @@ -469,9 +469,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) return -EEXIST; } list_add_tail(&new->list, head); -#ifdef CONFIG_F2FS_STAT_FS - sbi->n_dirty_dirs++; -#endif + stat_inc_dirty_dir(sbi); return 0; } @@ -482,12 +480,8 @@ void set_dirty_dir_page(struct inode *inode, struct page *page) if (!S_ISDIR(inode->i_mode)) return; -retry: - new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - if (!new) { - cond_resched(); - goto retry; - } + + new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); new->inode = inode; INIT_LIST_HEAD(&new->list); @@ -504,13 +498,9 @@ retry: void add_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct dir_inode_entry *new; -retry: - new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - if (!new) { - cond_resched(); - goto retry; - } + struct dir_inode_entry *new = + f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + new->inode = inode; INIT_LIST_HEAD(&new->list); @@ -541,9 +531,7 @@ void remove_dirty_dir_inode(struct inode *inode) if (entry->inode == inode) { list_del(&entry->list); kmem_cache_free(inode_entry_slab, entry); -#ifdef CONFIG_F2FS_STAT_FS - sbi->n_dirty_dirs--; -#endif + stat_dec_dirty_dir(sbi); break; } } @@ -617,11 +605,10 @@ static void block_operations(struct f2fs_sb_info *sbi) blk_start_plug(&plug); retry_flush_dents: - mutex_lock_all(sbi); - + f2fs_lock_all(sbi); /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { - mutex_unlock_all(sbi); + f2fs_unlock_all(sbi); sync_dirty_dir_inodes(sbi); goto retry_flush_dents; } @@ -644,7 +631,22 @@ retry_flush_nodes: static void unblock_operations(struct f2fs_sb_info *sbi) { mutex_unlock(&sbi->node_write); - mutex_unlock_all(sbi); + f2fs_unlock_all(sbi); +} + +static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) +{ + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); + + if (!get_pages(sbi, F2FS_WRITEBACK)) + break; + + io_schedule(); + } + finish_wait(&sbi->cp_wait, &wait); } static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) @@ -756,8 +758,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) f2fs_put_page(cp_page, 1); /* wait for previous submitted node/meta pages writeback */ - while (get_pages(sbi, F2FS_WRITEBACK)) - congestion_wait(BLK_RW_ASYNC, HZ / 50); + wait_on_all_pages_writeback(sbi); filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 941f9b9ca3a5..aa3438c571fa 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -68,9 +68,6 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, struct buffer_head *bh_result) { struct f2fs_inode_info *fi = F2FS_I(inode); -#ifdef CONFIG_F2FS_STAT_FS - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); -#endif pgoff_t start_fofs, end_fofs; block_t start_blkaddr; @@ -80,9 +77,8 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, return 0; } -#ifdef CONFIG_F2FS_STAT_FS - sbi->total_hit_ext++; -#endif + stat_inc_total_hit(inode->i_sb); + start_fofs = fi->ext.fofs; end_fofs = fi->ext.fofs + fi->ext.len - 1; start_blkaddr = fi->ext.blk_addr; @@ -100,9 +96,7 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, else bh_result->b_size = UINT_MAX; -#ifdef CONFIG_F2FS_STAT_FS - sbi->read_hit_ext++; -#endif + stat_inc_read_hit(inode->i_sb); read_unlock(&fi->ext.ext_lock); return 1; } @@ -116,7 +110,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) pgoff_t fofs, start_fofs, end_fofs; block_t start_blkaddr, end_blkaddr; - BUG_ON(blk_addr == NEW_ADDR); + f2fs_bug_on(blk_addr == NEW_ADDR); fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + dn->ofs_in_node; @@ -442,7 +436,7 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock, } /* It does not support data allocation */ - BUG_ON(create); + f2fs_bug_on(create); if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) { int i; @@ -560,9 +554,9 @@ write: inode_dec_dirty_dents(inode); err = do_write_data_page(page); } else { - int ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); err = do_write_data_page(page); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); need_balance_fs = true; } if (err == -ENOENT) @@ -641,7 +635,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; struct dnode_of_data dn; int err = 0; - int ilock; f2fs_balance_fs(sbi); repeat: @@ -650,7 +643,7 @@ repeat: return -ENOMEM; *pagep = page; - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, index, ALLOC_NODE); @@ -664,7 +657,7 @@ repeat: if (err) goto err; - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) return 0; @@ -700,7 +693,7 @@ out: return 0; err: - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); f2fs_put_page(page, 1); return err; } @@ -763,6 +756,8 @@ static int f2fs_set_data_page_dirty(struct page *page) struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; + trace_f2fs_set_page_dirty(page, DATA); + SetPageUptodate(page); if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 384c6daf9a89..594fc1bb64ef 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -139,7 +139,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, bool room = false; int max_slots = 0; - BUG_ON(level > MAX_DIR_HASH_DEPTH); + f2fs_bug_on(level > MAX_DIR_HASH_DEPTH); nbucket = dir_buckets(level); nblock = bucket_blocks(level); @@ -346,7 +346,7 @@ static struct page *init_inode_metadata(struct inode *inode, goto error; } - err = f2fs_init_acl(inode, dir); + err = f2fs_init_acl(inode, dir, page); if (err) goto error; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 608f0df5b919..89dc7508faf2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -18,6 +18,13 @@ #include <linux/crc32.h> #include <linux/magic.h> #include <linux/kobject.h> +#include <linux/sched.h> + +#ifdef CONFIG_F2FS_CHECK_FS +#define f2fs_bug_on(condition) BUG_ON(condition) +#else +#define f2fs_bug_on(condition) +#endif /* * For mount options @@ -298,6 +305,9 @@ struct f2fs_sm_info { unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ unsigned int ovp_segments; /* # of overprovision segments */ + + /* a threshold to reclaim prefree segments */ + unsigned int rec_prefree_segments; }; /* @@ -318,14 +328,6 @@ enum count_type { }; /* - * Uses as sbi->fs_lock[NR_GLOBAL_LOCKS]. - * The checkpoint procedure blocks all the locks in this fs_lock array. - * Some FS operations grab free locks, and if there is no free lock, - * then wait to grab a lock in a round-robin manner. - */ -#define NR_GLOBAL_LOCKS 8 - -/* * The below are the page types of bios used in submti_bio(). * The available types are: * DATA User data pages. It operates as async mode. @@ -365,12 +367,12 @@ struct f2fs_sb_info { struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ - struct mutex fs_lock[NR_GLOBAL_LOCKS]; /* blocking FS operations */ + struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct mutex node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ - unsigned char next_lock_num; /* round-robin global locks */ - int por_doing; /* recovery is doing or not */ - int on_build_free_nids; /* build_free_nids is doing */ + bool por_doing; /* recovery is doing or not */ + bool on_build_free_nids; /* build_free_nids is doing */ + wait_queue_head_t cp_wait; /* for orphan inode management */ struct list_head orphan_inode_list; /* orphan inode list */ @@ -520,48 +522,24 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void mutex_lock_all(struct f2fs_sb_info *sbi) +static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { - int i; - - for (i = 0; i < NR_GLOBAL_LOCKS; i++) { - /* - * This is the only time we take multiple fs_lock[] - * instances; the order is immaterial since we - * always hold cp_mutex, which serializes multiple - * such operations. - */ - mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex); - } + down_read(&sbi->cp_rwsem); } -static inline void mutex_unlock_all(struct f2fs_sb_info *sbi) +static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { - int i = 0; - for (; i < NR_GLOBAL_LOCKS; i++) - mutex_unlock(&sbi->fs_lock[i]); + up_read(&sbi->cp_rwsem); } -static inline int mutex_lock_op(struct f2fs_sb_info *sbi) +static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - unsigned char next_lock = sbi->next_lock_num % NR_GLOBAL_LOCKS; - int i = 0; - - for (; i < NR_GLOBAL_LOCKS; i++) - if (mutex_trylock(&sbi->fs_lock[i])) - return i; - - mutex_lock(&sbi->fs_lock[next_lock]); - sbi->next_lock_num++; - return next_lock; + down_write_nest_lock(&sbi->cp_rwsem, &sbi->cp_mutex); } -static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, int ilock) +static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) { - if (ilock < 0) - return; - BUG_ON(ilock >= NR_GLOBAL_LOCKS); - mutex_unlock(&sbi->fs_lock[ilock]); + up_write(&sbi->cp_rwsem); } /* @@ -612,8 +590,8 @@ static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, blkcnt_t count) { spin_lock(&sbi->stat_lock); - BUG_ON(sbi->total_valid_block_count < (block_t) count); - BUG_ON(inode->i_blocks < count); + f2fs_bug_on(sbi->total_valid_block_count < (block_t) count); + f2fs_bug_on(inode->i_blocks < count); inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); @@ -745,9 +723,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, { spin_lock(&sbi->stat_lock); - BUG_ON(sbi->total_valid_block_count < count); - BUG_ON(sbi->total_valid_node_count < count); - BUG_ON(inode->i_blocks < count); + f2fs_bug_on(sbi->total_valid_block_count < count); + f2fs_bug_on(sbi->total_valid_node_count < count); + f2fs_bug_on(inode->i_blocks < count); inode->i_blocks -= count; sbi->total_valid_node_count -= count; @@ -768,7 +746,7 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { spin_lock(&sbi->stat_lock); - BUG_ON(sbi->total_valid_inode_count == sbi->total_node_count); + f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count); sbi->total_valid_inode_count++; spin_unlock(&sbi->stat_lock); } @@ -776,7 +754,7 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi) { spin_lock(&sbi->stat_lock); - BUG_ON(!sbi->total_valid_inode_count); + f2fs_bug_on(!sbi->total_valid_inode_count); sbi->total_valid_inode_count--; spin_unlock(&sbi->stat_lock); return 0; @@ -797,7 +775,7 @@ static inline void f2fs_put_page(struct page *page, int unlock) return; if (unlock) { - BUG_ON(!PageLocked(page)); + f2fs_bug_on(!PageLocked(page)); unlock_page(page); } page_cache_release(page); @@ -819,6 +797,20 @@ static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor); } +static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags) +{ + void *entry; +retry: + entry = kmem_cache_alloc(cachep, flags); + if (!entry) { + cond_resched(); + goto retry; + } + + return entry; +} + #define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) static inline bool IS_INODE(struct page *page) @@ -979,6 +971,7 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); */ void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); +int try_to_free_nats(struct f2fs_sb_info *, int); void update_inode(struct inode *, struct page *); int update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); @@ -1033,6 +1026,7 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); +int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); int remove_inode_page(struct inode *); struct page *new_inode_page(struct inode *, const struct qstr *); struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); @@ -1059,6 +1053,7 @@ void destroy_node_manager_caches(void); * segment.c */ void f2fs_balance_fs(struct f2fs_sb_info *); +void f2fs_balance_fs_bg(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); void clear_prefree_segments(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *); @@ -1172,7 +1167,16 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) return (struct f2fs_stat_info*)sbi->stat_info; } -#define stat_inc_call_count(si) ((si)->call_count++) +#define stat_inc_call_count(si) ((si)->call_count++) +#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) +#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) +#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) +#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) +#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) +#define stat_inc_seg_type(sbi, curseg) \ + ((sbi)->segment_count[(curseg)->alloc_type]++) +#define stat_inc_block_count(sbi, curseg) \ + ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_seg_count(sbi, type) \ do { \ @@ -1207,6 +1211,13 @@ void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define stat_inc_call_count(si) +#define stat_inc_bggc_count(si) +#define stat_inc_dirty_dir(sbi) +#define stat_dec_dirty_dir(sbi) +#define stat_inc_total_hit(sb) +#define stat_inc_read_hit(sb) +#define stat_inc_seg_type(sbi, curseg) +#define stat_inc_block_count(sbi, curseg) #define stat_inc_seg_count(si, type) #define stat_inc_tot_blk_count(si, blks) #define stat_inc_data_blk_count(si, blks) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 02c906971cc6..7d714f4972d5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -35,18 +35,18 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); block_t old_blk_addr; struct dnode_of_data dn; - int err, ilock; + int err; f2fs_balance_fs(sbi); sb_start_pagefault(inode->i_sb); /* block allocation */ - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, page->index, ALLOC_NODE); if (err) { - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); goto out; } @@ -56,12 +56,12 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, err = reserve_new_block(&dn); if (err) { f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); goto out; } } f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); file_update_time(vma->vm_file); lock_page(page); @@ -88,6 +88,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, set_page_dirty(page); SetPageUptodate(page); + trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ wait_on_page_writeback(page); @@ -188,8 +189,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; } - filemap_fdatawait_range(sbi->node_inode->i_mapping, - 0, LONG_MAX); + ret = wait_on_node_pages_writeback(sbi, inode->i_ino); + if (ret) + goto out; ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); } out: @@ -270,7 +272,7 @@ static int truncate_blocks(struct inode *inode, u64 from) unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; - int count = 0, ilock = -1; + int count = 0; int err; trace_f2fs_truncate_blocks_enter(inode, from); @@ -278,13 +280,13 @@ static int truncate_blocks(struct inode *inode, u64 from) free_from = (pgoff_t) ((from + blocksize - 1) >> (sbi->log_blocksize)); - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); if (err) { if (err == -ENOENT) goto free_next; - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); trace_f2fs_truncate_blocks_exit(inode, err); return err; } @@ -295,7 +297,7 @@ static int truncate_blocks(struct inode *inode, u64 from) count = ADDRS_PER_BLOCK; count -= dn.ofs_in_node; - BUG_ON(count < 0); + f2fs_bug_on(count < 0); if (dn.ofs_in_node || IS_INODE(dn.node_page)) { truncate_data_blocks_range(&dn, count); @@ -305,7 +307,7 @@ static int truncate_blocks(struct inode *inode, u64 from) f2fs_put_dnode(&dn); free_next: err = truncate_inode_blocks(inode, free_from); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); /* lastly zero out the first data page */ truncate_partial_data_page(inode, from); @@ -416,16 +418,15 @@ static void fill_zero(struct inode *inode, pgoff_t index, { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; - int ilock; if (!len) return; f2fs_balance_fs(sbi); - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); page = get_new_data_page(inode, NULL, index, false); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if (!IS_ERR(page)) { wait_on_page_writeback(page); @@ -484,7 +485,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) struct address_space *mapping = inode->i_mapping; loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int ilock; f2fs_balance_fs(sbi); @@ -493,9 +493,9 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) truncate_inode_pages_range(mapping, blk_start, blk_end - 1); - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); ret = truncate_hole(inode, pg_start, pg_end); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); } } @@ -529,13 +529,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, for (index = pg_start; index <= pg_end; index++) { struct dnode_of_data dn; - int ilock; - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); break; } @@ -543,12 +542,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, ret = reserve_new_block(&dn); if (ret) { f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); break; } } f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if (pg_start == pg_end) new_size = offset + len; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 2f157e883687..b7ad1ec7e4cc 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -77,13 +77,15 @@ static int gc_thread_func(void *data) else wait_ms = increase_sleep_time(gc_th, wait_ms); -#ifdef CONFIG_F2FS_STAT_FS - sbi->bg_gc++; -#endif + stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi)) wait_ms = gc_th->no_gc_sleep_time; + + /* balancing f2fs's metadata periodically */ + f2fs_balance_fs_bg(sbi); + } while (!kthread_should_stop()); return 0; } @@ -236,8 +238,8 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } -static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, - struct victim_sel_policy *p) +static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, + unsigned int segno, struct victim_sel_policy *p) { if (p->alloc_mode == SSR) return get_seg_entry(sbi, segno)->ckpt_valid_blocks; @@ -293,7 +295,11 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } break; } - p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit; + + p.offset = segno + p.ofs_unit; + if (p.ofs_unit > 1) + p.offset -= segno % p.ofs_unit; + secno = GET_SECNO(sbi, segno); if (sec_usage_check(sbi, secno)) @@ -306,10 +312,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, if (p.min_cost > cost) { p.min_segno = segno; p.min_cost = cost; - } - - if (cost == max_cost) + } else if (unlikely(cost == max_cost)) { continue; + } if (nsearched++ >= p.max_search) { sbi->last_victim[p.gc_mode] = segno; @@ -358,12 +363,8 @@ static void add_gc_inode(struct inode *inode, struct list_head *ilist) iput(inode); return; } -repeat: - new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); - if (!new_ie) { - cond_resched(); - goto repeat; - } + + new_ie = f2fs_kmem_cache_alloc(winode_slab, GFP_NOFS); new_ie->inode = inode; list_add_tail(&new_ie->list, ilist); } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 9339cd292047..d0eaa9faeca0 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -37,6 +37,31 @@ void f2fs_set_inode_flags(struct inode *inode) inode->i_flags |= S_DIRSYNC; } +static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) +{ + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + if (ri->i_addr[0]) + inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); + else + inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); + } +} + +static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) +{ + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + ri->i_addr[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); + ri->i_addr[1] = 0; + } else { + ri->i_addr[0] = 0; + ri->i_addr[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); + ri->i_addr[2] = 0; + } + } +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -73,10 +98,6 @@ static int do_read_inode(struct inode *inode) inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); inode->i_generation = le32_to_cpu(ri->i_generation); - if (ri->i_addr[0]) - inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); - else - inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); fi->i_current_depth = le32_to_cpu(ri->i_current_depth); fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); @@ -84,8 +105,13 @@ static int do_read_inode(struct inode *inode) fi->flags = 0; fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); + get_extent_info(&fi->ext, ri->i_ext); get_inline_info(fi, ri); + + /* get rdev by using inline_info */ + __get_inode_rdev(inode, ri); + f2fs_put_page(node_page, 1); return 0; } @@ -179,21 +205,10 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); ri->i_generation = cpu_to_le32(inode->i_generation); - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = - cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; - } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = - cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; - } - } - + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); set_page_dirty(node_page); + clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); } @@ -214,7 +229,7 @@ int update_inode_page(struct inode *inode) int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int ret, ilock; + int ret; if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) @@ -227,9 +242,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to lock here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); ret = update_inode_page(inode); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if (wbc) f2fs_balance_fs(sbi); @@ -243,7 +258,6 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int ilock; trace_f2fs_evict_inode(inode); truncate_inode_pages(&inode->i_data, 0); @@ -252,7 +266,7 @@ void f2fs_evict_inode(struct inode *inode) inode->i_ino == F2FS_META_INO(sbi)) goto no_delete; - BUG_ON(atomic_read(&F2FS_I(inode)->dirty_dents)); + f2fs_bug_on(atomic_read(&F2FS_I(inode)->dirty_dents)); remove_dirty_dir_inode(inode); if (inode->i_nlink || is_bad_inode(inode)) @@ -265,9 +279,9 @@ void f2fs_evict_inode(struct inode *inode) if (F2FS_HAS_BLOCKS(inode)) f2fs_truncate(inode); - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); remove_inode_page(inode); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); sb_end_intwrite(inode->i_sb); no_delete: diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2a5359c990fc..575adac17f8b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -27,19 +27,19 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_t ino; struct inode *inode; bool nid_free = false; - int err, ilock; + int err; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); if (!alloc_nid(sbi, &ino)) { - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); err = -ENOSPC; goto fail; } - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); inode->i_uid = current_fsuid(); @@ -115,7 +115,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; nid_t ino = 0; - int err, ilock; + int err; f2fs_balance_fs(sbi); @@ -131,9 +131,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if (err) goto out; @@ -157,7 +157,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct inode *inode = old_dentry->d_inode; struct super_block *sb = dir->i_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); - int err, ilock; + int err; f2fs_balance_fs(sbi); @@ -165,9 +165,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); set_inode_flag(F2FS_I(inode), FI_INC_LINK); - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if (err) goto out; @@ -220,7 +220,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) struct f2fs_dir_entry *de; struct page *page; int err = -ENOENT; - int ilock; trace_f2fs_unlink_enter(dir, dentry); f2fs_balance_fs(sbi); @@ -229,16 +228,16 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (!de) goto fail; + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) { + f2fs_unlock_op(sbi); kunmap(page); f2fs_put_page(page, 0); goto fail; } - - ilock = mutex_lock_op(sbi); f2fs_delete_entry(de, page, inode); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); /* In order to evict this inode, we set it dirty */ mark_inode_dirty(inode); @@ -254,7 +253,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; size_t symlen = strlen(symname) + 1; - int err, ilock; + int err; f2fs_balance_fs(sbi); @@ -265,9 +264,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if (err) goto out; @@ -290,7 +289,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode; - int err, ilock; + int err; f2fs_balance_fs(sbi); @@ -304,9 +303,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); set_inode_flag(F2FS_I(inode), FI_INC_LINK); - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if (err) goto out_fail; @@ -342,7 +341,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; int err = 0; - int ilock; if (!new_valid_dev(rdev)) return -EINVAL; @@ -356,9 +354,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); if (err) goto out; @@ -387,7 +385,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; - int err = -ENOENT, ilock = -1; + int err = -ENOENT; f2fs_balance_fs(sbi); @@ -402,7 +400,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_old; } - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); if (new_inode) { @@ -467,7 +465,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, update_inode_page(old_dir); } - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); return 0; put_out_dir: @@ -477,7 +475,7 @@ out_dir: kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); out_old: kunmap(old_page); f2fs_put_page(old_page, 0); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 51ef27894433..4ac4150d421d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -204,7 +204,7 @@ retry: } e->ni = *ni; e->checkpointed = true; - BUG_ON(ni->blk_addr == NEW_ADDR); + f2fs_bug_on(ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { /* * when nid is reallocated, @@ -212,19 +212,19 @@ retry: * So, reinitialize it with new information. */ e->ni = *ni; - BUG_ON(ni->blk_addr != NULL_ADDR); + f2fs_bug_on(ni->blk_addr != NULL_ADDR); } if (new_blkaddr == NEW_ADDR) e->checkpointed = false; /* sanity check */ - BUG_ON(nat_get_blkaddr(e) != ni->blk_addr); - BUG_ON(nat_get_blkaddr(e) == NULL_ADDR && + f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); + f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && new_blkaddr == NULL_ADDR); - BUG_ON(nat_get_blkaddr(e) == NEW_ADDR && + f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR && new_blkaddr == NEW_ADDR); - BUG_ON(nat_get_blkaddr(e) != NEW_ADDR && + f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR && nat_get_blkaddr(e) != NULL_ADDR && new_blkaddr == NEW_ADDR); @@ -240,7 +240,7 @@ retry: write_unlock(&nm_i->nat_tree_lock); } -static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) +int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -495,10 +495,10 @@ static void truncate_node(struct dnode_of_data *dn) get_node_info(sbi, dn->nid, &ni); if (dn->inode->i_blocks == 0) { - BUG_ON(ni.blk_addr != NULL_ADDR); + f2fs_bug_on(ni.blk_addr != NULL_ADDR); goto invalidate; } - BUG_ON(ni.blk_addr == NULL_ADDR); + f2fs_bug_on(ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); @@ -822,7 +822,7 @@ int remove_inode_page(struct inode *inode) } /* 0 is possible, after f2fs_new_inode() is failed */ - BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1); + f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); set_new_dnode(&dn, inode, page, page, ino); truncate_node(&dn); return 0; @@ -863,7 +863,7 @@ struct page *new_node_page(struct dnode_of_data *dn, get_node_info(sbi, dn->nid, &old_ni); /* Reinitialize old_ni with new node page */ - BUG_ON(old_ni.blk_addr != NULL_ADDR); + f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); new_ni = old_ni; new_ni.ino = dn->inode->i_ino; set_node_addr(sbi, &new_ni, NEW_ADDR); @@ -969,7 +969,7 @@ repeat: goto repeat; } got_it: - BUG_ON(nid != nid_of_node(page)); + f2fs_bug_on(nid != nid_of_node(page)); mark_page_accessed(page); return page; } @@ -1148,6 +1148,47 @@ continue_unlock: return nwritten; } +int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct address_space *mapping = sbi->node_inode->i_mapping; + pgoff_t index = 0, end = LONG_MAX; + struct pagevec pvec; + int nr_pages; + int ret2 = 0, ret = 0; + + pagevec_init(&pvec, 0); + while ((index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* until radix tree lookup accepts end_index */ + if (page->index > end) + continue; + + if (ino && ino_of_node(page) == ino) { + wait_on_page_writeback(page); + if (TestClearPageError(page)) + ret = -EIO; + } + } + pagevec_release(&pvec); + cond_resched(); + } + + if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret2 = -ENOSPC; + if (test_and_clear_bit(AS_EIO, &mapping->flags)) + ret2 = -EIO; + if (!ret) + ret = ret2; + return ret; +} + static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { @@ -1156,11 +1197,14 @@ static int f2fs_write_node_page(struct page *page, block_t new_addr; struct node_info ni; + if (sbi->por_doing) + goto redirty_out; + wait_on_page_writeback(page); /* get old block addr of this node page */ nid = nid_of_node(page); - BUG_ON(page->index != nid); + f2fs_bug_on(page->index != nid); get_node_info(sbi, nid, &ni); @@ -1171,12 +1215,8 @@ static int f2fs_write_node_page(struct page *page, return 0; } - if (wbc->for_reclaim) { - dec_page_count(sbi, F2FS_DIRTY_NODES); - wbc->pages_skipped++; - set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; - } + if (wbc->for_reclaim) + goto redirty_out; mutex_lock(&sbi->node_write); set_page_writeback(page); @@ -1186,6 +1226,12 @@ static int f2fs_write_node_page(struct page *page, mutex_unlock(&sbi->node_write); unlock_page(page); return 0; + +redirty_out: + dec_page_count(sbi, F2FS_DIRTY_NODES); + wbc->pages_skipped++; + set_page_dirty(page); + return AOP_WRITEPAGE_ACTIVATE; } /* @@ -1200,11 +1246,8 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); long nr_to_write = wbc->nr_to_write; - /* First check balancing cached NAT entries */ - if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { - f2fs_sync_fs(sbi->sb, true); - return 0; - } + /* balancing f2fs's metadata in background */ + f2fs_balance_fs_bg(sbi); /* collect a number of dirty node pages and write together */ if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) @@ -1223,6 +1266,8 @@ static int f2fs_set_node_page_dirty(struct page *page) struct address_space *mapping = page->mapping; struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); + trace_f2fs_set_page_dirty(page, NODE); + SetPageUptodate(page); if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); @@ -1291,23 +1336,18 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) if (nid == 0) return 0; - if (!build) - goto retry; - - /* do not add allocated nids */ - read_lock(&nm_i->nat_tree_lock); - ne = __lookup_nat_cache(nm_i, nid); - if (ne && nat_get_blkaddr(ne) != NULL_ADDR) - allocated = true; - read_unlock(&nm_i->nat_tree_lock); - if (allocated) - return 0; -retry: - i = kmem_cache_alloc(free_nid_slab, GFP_NOFS); - if (!i) { - cond_resched(); - goto retry; + if (build) { + /* do not add allocated nids */ + read_lock(&nm_i->nat_tree_lock); + ne = __lookup_nat_cache(nm_i, nid); + if (ne && nat_get_blkaddr(ne) != NULL_ADDR) + allocated = true; + read_unlock(&nm_i->nat_tree_lock); + if (allocated) + return 0; } + + i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; i->state = NID_NEW; @@ -1350,7 +1390,7 @@ static void scan_nat_page(struct f2fs_nm_info *nm_i, break; blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); - BUG_ON(blk_addr == NEW_ADDR); + f2fs_bug_on(blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) { if (add_free_nid(nm_i, start_nid, true) < 0) break; @@ -1421,14 +1461,14 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !sbi->on_build_free_nids) { - BUG_ON(list_empty(&nm_i->free_nid_list)); + f2fs_bug_on(list_empty(&nm_i->free_nid_list)); list_for_each(this, &nm_i->free_nid_list) { i = list_entry(this, struct free_nid, list); if (i->state == NID_NEW) break; } - BUG_ON(i->state != NID_NEW); + f2fs_bug_on(i->state != NID_NEW); *nid = i->nid; i->state = NID_ALLOC; nm_i->fcnt--; @@ -1439,9 +1479,9 @@ retry: /* Let's scan nat pages and its caches to get free nids */ mutex_lock(&nm_i->build_lock); - sbi->on_build_free_nids = 1; + sbi->on_build_free_nids = true; build_free_nids(sbi); - sbi->on_build_free_nids = 0; + sbi->on_build_free_nids = false; mutex_unlock(&nm_i->build_lock); goto retry; } @@ -1456,7 +1496,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); - BUG_ON(!i || i->state != NID_ALLOC); + f2fs_bug_on(!i || i->state != NID_ALLOC); __del_from_free_nid_list(i); spin_unlock(&nm_i->free_nid_list_lock); } @@ -1474,7 +1514,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); - BUG_ON(!i || i->state != NID_ALLOC); + f2fs_bug_on(!i || i->state != NID_ALLOC); if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { __del_from_free_nid_list(i); } else { @@ -1677,7 +1717,7 @@ to_nat_page: nat_blk = page_address(page); } - BUG_ON(!nat_blk); + f2fs_bug_on(!nat_blk); raw_ne = nat_blk->entries[nid - start_nid]; flush_now: new_blkaddr = nat_get_blkaddr(ne); @@ -1781,11 +1821,11 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy free nid list */ spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - BUG_ON(i->state == NID_ALLOC); + f2fs_bug_on(i->state == NID_ALLOC); __del_from_free_nid_list(i); nm_i->fcnt--; } - BUG_ON(nm_i->fcnt); + f2fs_bug_on(nm_i->fcnt); spin_unlock(&nm_i->free_nid_list_lock); /* destroy nat cache */ @@ -1799,7 +1839,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) __del_from_nat_cache(nm_i, e); } } - BUG_ON(nm_i->nat_cnt); + f2fs_bug_on(nm_i->nat_cnt); write_unlock(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 51ef5eec33d7..fdc81161f254 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -64,24 +64,31 @@ static int recover_dentry(struct page *ipage, struct inode *inode) name.name = raw_inode->i_name; retry: de = f2fs_find_entry(dir, &name, &page); - if (de && inode->i_ino == le32_to_cpu(de->ino)) { - kunmap(page); - f2fs_put_page(page, 0); - goto out; - } + if (de && inode->i_ino == le32_to_cpu(de->ino)) + goto out_unmap_put; if (de) { einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { WARN_ON(1); if (PTR_ERR(einode) == -ENOENT) err = -EEXIST; - goto out; + goto out_unmap_put; + } + err = acquire_orphan_inode(F2FS_SB(inode->i_sb)); + if (err) { + iput(einode); + goto out_unmap_put; } f2fs_delete_entry(de, page, einode); iput(einode); goto retry; } err = __f2fs_add_link(dir, &name, inode); + goto out; + +out_unmap_put: + kunmap(page); + f2fs_put_page(page, 0); out: f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: " "ino = %x, name = %s, dir = %lx, err = %d", @@ -285,7 +292,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct f2fs_summary sum; struct node_info ni; int err = 0, recovered = 0; - int ilock; start = start_bidx_of_node(ofs_of_node(page), fi); if (IS_INODE(page)) @@ -293,20 +299,20 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, else end = start + ADDRS_PER_BLOCK; - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); return err; } wait_on_page_writeback(dn.node_page); get_node_info(sbi, dn.nid, &ni); - BUG_ON(ni.ino != ino_of_node(page)); - BUG_ON(ofs_of_node(dn.node_page) != ofs_of_node(page)); + f2fs_bug_on(ni.ino != ino_of_node(page)); + f2fs_bug_on(ofs_of_node(dn.node_page) != ofs_of_node(page)); for (; start < end; start++) { block_t src, dest; @@ -316,9 +322,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) { if (src == NULL_ADDR) { - int err = reserve_new_block(&dn); + err = reserve_new_block(&dn); /* We should not get -ENOSPC */ - BUG_ON(err); + f2fs_bug_on(err); } /* Check the previous node page having this index */ @@ -349,7 +355,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); err: f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, " "recovered_data = %d blocks, err = %d", @@ -419,6 +425,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) { struct list_head inode_list; int err; + bool need_writecp = false; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry), NULL); @@ -428,7 +435,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&inode_list); /* step #1: find fsynced inode numbers */ - sbi->por_doing = 1; + sbi->por_doing = true; err = find_fsync_dnodes(sbi, &inode_list); if (err) goto out; @@ -436,14 +443,16 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) if (list_empty(&inode_list)) goto out; + need_writecp = true; + /* step #2: recover data */ err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); - BUG_ON(!list_empty(&inode_list)); + f2fs_bug_on(!list_empty(&inode_list)); out: destroy_fsync_dnodes(&inode_list); kmem_cache_destroy(fsync_entry_slab); - sbi->por_doing = 0; - if (!err) + sbi->por_doing = false; + if (!err && need_writecp) write_checkpoint(sbi, false); return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 09af9c7b0f52..fa284d397199 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -36,6 +36,14 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) } } +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) +{ + /* check the # of cached NAT entries and prefree segments */ + if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || + excess_prefree_segs(sbi)) + f2fs_sync_fs(sbi->sb, true); +} + static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { @@ -50,20 +58,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); - enum dirty_type t = DIRTY_HOT_DATA; - - dirty_type = sentry->type; - - if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) - dirty_i->nr_dirty[dirty_type]++; + enum dirty_type t = sentry->type; - /* Only one bitmap should be set */ - for (; t <= DIRTY_COLD_NODE; t++) { - if (t == dirty_type) - continue; - if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) - dirty_i->nr_dirty[t]--; - } + if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]++; } } @@ -76,12 +74,11 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, dirty_i->nr_dirty[dirty_type]--; if (dirty_type == DIRTY) { - enum dirty_type t = DIRTY_HOT_DATA; + struct seg_entry *sentry = get_seg_entry(sbi, segno); + enum dirty_type t = sentry->type; - /* clear all the bitmaps */ - for (; t <= DIRTY_COLD_NODE; t++) - if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) - dirty_i->nr_dirty[t]--; + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]--; if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) clear_bit(GET_SECNO(sbi, segno), @@ -142,27 +139,33 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno = -1; + unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int total_segs = TOTAL_SEGS(sbi); + unsigned int start = 0, end = -1; mutex_lock(&dirty_i->seglist_lock); + while (1) { - segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, - segno + 1); - if (segno >= total_segs) + int i; + start = find_next_bit(prefree_map, total_segs, end + 1); + if (start >= total_segs) break; + end = find_next_zero_bit(prefree_map, total_segs, start + 1); + + for (i = start; i < end; i++) + clear_bit(i, prefree_map); - if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE])) - dirty_i->nr_dirty[PRE]--; - - /* Let's use trim */ - if (test_opt(sbi, DISCARD)) - blkdev_issue_discard(sbi->sb->s_bdev, - START_BLOCK(sbi, segno) << - sbi->log_sectors_per_block, - 1 << (sbi->log_sectors_per_block + - sbi->log_blocks_per_seg), - GFP_NOFS, 0); + dirty_i->nr_dirty[PRE] -= end - start; + + if (!test_opt(sbi, DISCARD)) + continue; + + blkdev_issue_discard(sbi->sb->s_bdev, + START_BLOCK(sbi, start) << + sbi->log_sectors_per_block, + (1 << (sbi->log_sectors_per_block + + sbi->log_blocks_per_seg)) * (end - start), + GFP_NOFS, 0); } mutex_unlock(&dirty_i->seglist_lock); } @@ -195,7 +198,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) new_vblocks = se->valid_blocks + del; offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); - BUG_ON((new_vblocks >> (sizeof(unsigned short) << 3) || + f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || (new_vblocks > sbi->blocks_per_seg))); se->valid_blocks = new_vblocks; @@ -235,7 +238,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) unsigned int segno = GET_SEGNO(sbi, addr); struct sit_info *sit_i = SIT_I(sbi); - BUG_ON(addr == NULL_ADDR); + f2fs_bug_on(addr == NULL_ADDR); if (addr == NEW_ADDR) return; @@ -267,9 +270,8 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, */ int npages_for_summary_flush(struct f2fs_sb_info *sbi) { - int total_size_bytes = 0; int valid_sum_count = 0; - int i, sum_space; + int i, sum_in_page; for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { if (sbi->ckpt->alloc_type[i] == SSR) @@ -278,13 +280,12 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi) valid_sum_count += curseg_blkoff(sbi, i); } - total_size_bytes = valid_sum_count * (SUMMARY_SIZE + 1) - + sizeof(struct nat_journal) + 2 - + sizeof(struct sit_journal) + 2; - sum_space = PAGE_CACHE_SIZE - SUM_FOOTER_SIZE; - if (total_size_bytes < sum_space) + sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE - + SUM_FOOTER_SIZE) / SUMMARY_SIZE; + if (valid_sum_count <= sum_in_page) return 1; - else if (total_size_bytes < 2 * sum_space) + else if ((valid_sum_count - sum_in_page) <= + (PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } @@ -350,7 +351,7 @@ find_other_zone: if (dir == ALLOC_RIGHT) { secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), 0); - BUG_ON(secno >= TOTAL_SECS(sbi)); + f2fs_bug_on(secno >= TOTAL_SECS(sbi)); } else { go_left = 1; left_start = hint - 1; @@ -366,7 +367,7 @@ find_other_zone: } left_start = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), 0); - BUG_ON(left_start >= TOTAL_SECS(sbi)); + f2fs_bug_on(left_start >= TOTAL_SECS(sbi)); break; } secno = left_start; @@ -405,7 +406,7 @@ skip_left: } got_it: /* set it as dirty segment in free segmap */ - BUG_ON(test_bit(segno, free_i->free_segmap)); + f2fs_bug_on(test_bit(segno, free_i->free_segmap)); __set_inuse(sbi, segno); *newseg = segno; write_unlock(&free_i->segmap_lock); @@ -550,9 +551,8 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, change_curseg(sbi, type, true); else new_curseg(sbi, type, false); -#ifdef CONFIG_F2FS_STAT_FS - sbi->segment_count[curseg->alloc_type]++; -#endif + + stat_inc_seg_type(sbi, curseg); } void allocate_new_segments(struct f2fs_sb_info *sbi) @@ -597,6 +597,11 @@ static void f2fs_end_io_write(struct bio *bio, int err) if (p->is_sync) complete(p->wait); + + if (!get_pages(p->sbi, F2FS_WRITEBACK) && + !list_empty(&p->sbi->cp_wait.task_list)) + wake_up(&p->sbi->cp_wait); + kfree(p); bio_put(bio); } @@ -657,6 +662,7 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, block_t blk_addr, enum page_type type) { struct block_device *bdev = sbi->sb->s_bdev; + int bio_blocks; verify_block_addr(sbi, blk_addr); @@ -676,7 +682,8 @@ retry: goto retry; } - sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi)); + bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + sbi->bio[type] = f2fs_bio_alloc(bdev, bio_blocks); sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); sbi->bio[type]->bi_private = priv; /* @@ -771,7 +778,7 @@ static int __get_segment_type(struct page *page, enum page_type p_type) return __get_segment_type_4(page, p_type); } /* NR_CURSEG_TYPE(6) logs by default */ - BUG_ON(sbi->active_logs != NR_CURSEG_TYPE); + f2fs_bug_on(sbi->active_logs != NR_CURSEG_TYPE); return __get_segment_type_6(page, p_type); } @@ -801,9 +808,8 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, mutex_lock(&sit_i->sentry_lock); __refresh_next_blkoff(sbi, curseg); -#ifdef CONFIG_F2FS_STAT_FS - sbi->block_count[curseg->alloc_type]++; -#endif + + stat_inc_block_count(sbi, curseg); /* * SIT information should be updated before segment allocation, @@ -849,7 +855,7 @@ void write_data_page(struct inode *inode, struct page *page, struct f2fs_summary sum; struct node_info ni; - BUG_ON(old_blkaddr == NULL_ADDR); + f2fs_bug_on(old_blkaddr == NULL_ADDR); get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); @@ -1122,8 +1128,6 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; - set_page_dirty(page); - /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { unsigned short blkoff; @@ -1142,18 +1146,20 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) summary = (struct f2fs_summary *)(kaddr + written_size); *summary = seg_i->sum_blk->entries[j]; written_size += SUMMARY_SIZE; - set_page_dirty(page); if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) continue; + set_page_dirty(page); f2fs_put_page(page, 1); page = NULL; } } - if (page) + if (page) { + set_page_dirty(page); f2fs_put_page(page, 1); + } } static void write_normal_summaries(struct f2fs_sb_info *sbi, @@ -1239,7 +1245,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, /* get current sit block page without lock */ src_page = get_meta_page(sbi, src_off); dst_page = grab_meta_page(sbi, dst_off); - BUG_ON(PageDirty(src_page)); + f2fs_bug_on(PageDirty(src_page)); src_addr = page_address(src_page); dst_addr = page_address(dst_page); @@ -1271,9 +1277,9 @@ static bool flush_sits_in_journal(struct f2fs_sb_info *sbi) __mark_sit_entry_dirty(sbi, segno); } update_sits_in_cursum(sum, -sits_in_cursum(sum)); - return 1; + return true; } - return 0; + return false; } /* @@ -1637,6 +1643,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; err = build_sit_info(sbi); if (err) @@ -1744,6 +1751,8 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) void destroy_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); + if (!sm_info) + return; destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index bdd10eab8c40..269f690b4e24 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -14,6 +14,8 @@ #define NULL_SEGNO ((unsigned int)(~0)) #define NULL_SECNO ((unsigned int)(~0)) +#define DEF_RECLAIM_PREFREE_SEGMENTS 100 /* 200MB of prefree segments */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) @@ -90,6 +92,8 @@ (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) #define SECTOR_TO_BLOCK(sbi, sectors) \ (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) +#define MAX_BIO_BLOCKS(max_hw_blocks) \ + (min((int)max_hw_blocks, BIO_MAX_PAGES)) /* during checkpoint, bio_private is used to synchronize the last bio */ struct bio_private { @@ -470,6 +474,11 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) reserved_sections(sbi))); } +static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) +{ + return (prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments); +} + static inline int utilization(struct f2fs_sb_info *sbi) { return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); @@ -513,16 +522,13 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) return curseg->next_blkoff; } +#ifdef CONFIG_F2FS_CHECK_FS static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) { unsigned int end_segno = SM_I(sbi)->segment_count - 1; BUG_ON(segno > end_segno); } -/* - * This function is used for only debugging. - * NOTE: In future, we have to remove this function. - */ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { struct f2fs_sm_info *sm_info = SM_I(sbi); @@ -541,8 +547,9 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, { struct f2fs_sm_info *sm_info = SM_I(sbi); unsigned int end_segno = sm_info->segment_count - 1; + bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; int valid_blocks = 0; - int i; + int cur_pos = 0, next_pos; /* check segment usage */ BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); @@ -551,11 +558,26 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, BUG_ON(segno > end_segno); /* check bitmap with valid block count */ - for (i = 0; i < sbi->blocks_per_seg; i++) - if (f2fs_test_bit(i, raw_sit->valid_map)) - valid_blocks++; + do { + if (is_valid) { + next_pos = find_next_zero_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + cur_pos); + valid_blocks += next_pos - cur_pos; + } else + next_pos = find_next_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + cur_pos); + cur_pos = next_pos; + is_valid = !is_valid; + } while (cur_pos < sbi->blocks_per_seg); BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); } +#else +#define check_seg_range(sbi, segno) +#define verify_block_addr(sbi, blk_addr) +#define check_block_count(sbi, segno, raw_sit) +#endif static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, unsigned int start) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 13d0a0fe49dd..bafff72de8e8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -43,7 +43,9 @@ enum { Opt_disable_roll_forward, Opt_discard, Opt_noheap, + Opt_user_xattr, Opt_nouser_xattr, + Opt_acl, Opt_noacl, Opt_active_logs, Opt_disable_ext_identify, @@ -56,7 +58,9 @@ static match_table_t f2fs_tokens = { {Opt_disable_roll_forward, "disable_roll_forward"}, {Opt_discard, "discard"}, {Opt_noheap, "no_heap"}, + {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, @@ -65,24 +69,40 @@ static match_table_t f2fs_tokens = { }; /* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ +}; + struct f2fs_attr { struct attribute attr; ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, const char *, size_t); + int struct_type; int offset; }; +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + return NULL; +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - struct f2fs_gc_kthread *gc_kth = sbi->gc_thread; + unsigned char *ptr = NULL; unsigned int *ui; - if (!gc_kth) + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) return -EINVAL; - ui = (unsigned int *)(((char *)gc_kth) + a->offset); + ui = (unsigned int *)(ptr + a->offset); return snprintf(buf, PAGE_SIZE, "%u\n", *ui); } @@ -91,15 +111,16 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, struct f2fs_sb_info *sbi, const char *buf, size_t count) { - struct f2fs_gc_kthread *gc_kth = sbi->gc_thread; + unsigned char *ptr; unsigned long t; unsigned int *ui; ssize_t ret; - if (!gc_kth) + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) return -EINVAL; - ui = (unsigned int *)(((char *)gc_kth) + a->offset); + ui = (unsigned int *)(ptr + a->offset); ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret < 0) @@ -135,21 +156,25 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } -#define F2FS_ATTR_OFFSET(_name, _mode, _show, _store, _elname) \ +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ static struct f2fs_attr f2fs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ - .offset = offsetof(struct f2fs_gc_kthread, _elname), \ + .struct_type = _struct_type, \ + .offset = _offset \ } -#define F2FS_RW_ATTR(name, elname) \ - F2FS_ATTR_OFFSET(name, 0644, f2fs_sbi_show, f2fs_sbi_store, elname) +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) -F2FS_RW_ATTR(gc_min_sleep_time, min_sleep_time); -F2FS_RW_ATTR(gc_max_sleep_time, max_sleep_time); -F2FS_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(gc_idle, gc_idle); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -157,6 +182,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_max_sleep_time), ATTR_LIST(gc_no_gc_sleep_time), ATTR_LIST(gc_idle), + ATTR_LIST(reclaim_segments), NULL, }; @@ -237,6 +263,9 @@ static int parse_options(struct super_block *sb, char *options) set_opt(sbi, NOHEAP); break; #ifdef CONFIG_F2FS_FS_XATTR + case Opt_user_xattr: + set_opt(sbi, XATTR_USER); + break; case Opt_nouser_xattr: clear_opt(sbi, XATTR_USER); break; @@ -244,6 +273,10 @@ static int parse_options(struct super_block *sb, char *options) set_opt(sbi, INLINE_XATTR); break; #else + case Opt_user_xattr: + f2fs_msg(sb, KERN_INFO, + "user_xattr options not supported"); + break; case Opt_nouser_xattr: f2fs_msg(sb, KERN_INFO, "nouser_xattr options not supported"); @@ -254,10 +287,16 @@ static int parse_options(struct super_block *sb, char *options) break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi, POSIX_ACL); + break; case Opt_noacl: clear_opt(sbi, POSIX_ACL); break; #else + case Opt_acl: + f2fs_msg(sb, KERN_INFO, "acl options not supported"); + break; case Opt_noacl: f2fs_msg(sb, KERN_INFO, "noacl options not supported"); break; @@ -355,7 +394,9 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_stats(sbi); stop_gc_thread(sbi); - write_checkpoint(sbi, true); + /* We don't need to do checkpoint when it's clean */ + if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES)) + write_checkpoint(sbi, true); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -727,30 +768,47 @@ static void init_sb_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->nr_pages[i], 0); } -static int validate_superblock(struct super_block *sb, - struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf, sector_t block) +/* + * Read f2fs raw super block. + * Because we have two copies of super block, so read the first one at first, + * if the first one is invalid, move to read the second one. + */ +static int read_raw_super_block(struct super_block *sb, + struct f2fs_super_block **raw_super, + struct buffer_head **raw_super_buf) { - const char *super = (block == 0 ? "first" : "second"); + int block = 0; - /* read f2fs raw super block */ +retry: *raw_super_buf = sb_bread(sb, block); if (!*raw_super_buf) { - f2fs_msg(sb, KERN_ERR, "unable to read %s superblock", - super); - return -EIO; + f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", + block + 1); + if (block == 0) { + block++; + goto retry; + } else { + return -EIO; + } } *raw_super = (struct f2fs_super_block *) ((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET); /* sanity checking of raw super */ - if (!sanity_check_raw_super(sb, *raw_super)) - return 0; + if (sanity_check_raw_super(sb, *raw_super)) { + brelse(*raw_super_buf); + f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " + "in %dth superblock", block + 1); + if(block == 0) { + block++; + goto retry; + } else { + return -EINVAL; + } + } - f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " - "in %s superblock", super); - return -EINVAL; + return 0; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) @@ -760,7 +818,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct buffer_head *raw_super_buf; struct inode *root; long err = -EINVAL; - int i; /* allocate memory for f2fs-specific super block info */ sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); @@ -773,14 +830,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - err = validate_superblock(sb, &raw_super, &raw_super_buf, 0); - if (err) { - brelse(raw_super_buf); - /* check secondary superblock when primary failed */ - err = validate_superblock(sb, &raw_super, &raw_super_buf, 1); - if (err) - goto free_sb_buf; - } + err = read_raw_super_block(sb, &raw_super, &raw_super_buf); + if (err) + goto free_sbi; + sb->s_fs_info = sbi; /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; @@ -818,12 +871,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&sbi->gc_mutex); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); - for (i = 0; i < NR_GLOBAL_LOCKS; i++) - mutex_init(&sbi->fs_lock[i]); mutex_init(&sbi->node_write); - sbi->por_doing = 0; + sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); init_rwsem(&sbi->bio_sem); + init_rwsem(&sbi->cp_rwsem); + init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); /* get an inode for meta space */ @@ -922,12 +975,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) - goto fail; + goto free_gc; } err = f2fs_build_stats(sbi); if (err) - goto fail; + goto free_gc; if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -953,6 +1006,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) return 0; fail: + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry(sb->s_id, f2fs_proc_root); + } + f2fs_destroy_stats(sbi); +free_gc: stop_gc_thread(sbi); free_root_inode: dput(sb->s_root); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1ac8a5f6e380..aa7a3f139fe5 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -154,6 +154,9 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, } #ifdef CONFIG_F2FS_FS_SECURITY +static int __f2fs_setxattr(struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + struct page *ipage); static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *page) { @@ -161,7 +164,7 @@ static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, int err = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY, + err = __f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY, xattr->name, xattr->value, xattr->value_len, (struct page *)page); if (err < 0) @@ -369,7 +372,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, alloc_nid_failed(sbi, new_nid); return PTR_ERR(xpage); } - BUG_ON(new_nid); + f2fs_bug_on(new_nid); } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); @@ -469,16 +472,15 @@ cleanup: return error; } -int f2fs_setxattr(struct inode *inode, int name_index, const char *name, - const void *value, size_t value_len, struct page *ipage) +static int __f2fs_setxattr(struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *here, *last; void *base_addr; int found, newsize; size_t name_len; - int ilock; __u32 new_hsize; int error = -ENOMEM; @@ -493,10 +495,6 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN(inode)) return -ERANGE; - f2fs_balance_fs(sbi); - - ilock = mutex_lock_op(sbi); - base_addr = read_all_xattrs(inode, ipage); if (!base_addr) goto exit; @@ -522,7 +520,7 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, */ free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); if (found) - free = free - ENTRY_SIZE(here); + free = free + ENTRY_SIZE(here); if (free < newsize) { error = -ENOSPC; @@ -578,7 +576,21 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, else update_inode_page(inode); exit: - mutex_unlock_op(sbi, ilock); kzfree(base_addr); return error; } + +int f2fs_setxattr(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, struct page *ipage) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int err; + + f2fs_balance_fs(sbi); + + f2fs_lock_op(sbi); + err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage); + f2fs_unlock_op(sbi); + + return err; +} diff --git a/fs/file_table.c b/fs/file_table.c index abdd15ad13c9..e900ca518635 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -297,7 +297,7 @@ void flush_delayed_fput(void) delayed_fput(NULL); } -static DECLARE_WORK(delayed_fput_work, delayed_fput); +static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); void fput(struct file *file) { @@ -317,7 +317,7 @@ void fput(struct file *file) } if (llist_add(&file->f_u.fu_llist, &delayed_fput_list)) - schedule_work(&delayed_fput_work); + schedule_delayed_work(&delayed_fput_work, 1); } } diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index b2a86e324aac..29d7feb62cf7 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -58,15 +58,16 @@ void fscache_cookie_init_once(void *_cookie) struct fscache_cookie *__fscache_acquire_cookie( struct fscache_cookie *parent, const struct fscache_cookie_def *def, - void *netfs_data) + void *netfs_data, + bool enable) { struct fscache_cookie *cookie; BUG_ON(!def); - _enter("{%s},{%s},%p", + _enter("{%s},{%s},%p,%u", parent ? (char *) parent->def->name : "<no-parent>", - def->name, netfs_data); + def->name, netfs_data, enable); fscache_stat(&fscache_n_acquires); @@ -106,7 +107,7 @@ struct fscache_cookie *__fscache_acquire_cookie( cookie->def = def; cookie->parent = parent; cookie->netfs_data = netfs_data; - cookie->flags = 0; + cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET); /* radix tree insertion won't use the preallocation pool unless it's * told it may not wait */ @@ -124,16 +125,22 @@ struct fscache_cookie *__fscache_acquire_cookie( break; } - /* if the object is an index then we need do nothing more here - we - * create indices on disk when we need them as an index may exist in - * multiple caches */ - if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { - if (fscache_acquire_non_index_cookie(cookie) < 0) { - atomic_dec(&parent->n_children); - __fscache_cookie_put(cookie); - fscache_stat(&fscache_n_acquires_nobufs); - _leave(" = NULL"); - return NULL; + if (enable) { + /* if the object is an index then we need do nothing more here + * - we create indices on disk when we need them as an index + * may exist in multiple caches */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (fscache_acquire_non_index_cookie(cookie) == 0) { + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); + } else { + atomic_dec(&parent->n_children); + __fscache_cookie_put(cookie); + fscache_stat(&fscache_n_acquires_nobufs); + _leave(" = NULL"); + return NULL; + } + } else { + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); } } @@ -144,6 +151,39 @@ struct fscache_cookie *__fscache_acquire_cookie( EXPORT_SYMBOL(__fscache_acquire_cookie); /* + * Enable a cookie to permit it to accept new operations. + */ +void __fscache_enable_cookie(struct fscache_cookie *cookie, + bool (*can_enable)(void *data), + void *data) +{ + _enter("%p", cookie); + + wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + + if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) + goto out_unlock; + + if (can_enable && !can_enable(data)) { + /* The netfs decided it didn't want to enable after all */ + } else if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + /* Wait for outstanding disablement to complete */ + __fscache_wait_on_invalidate(cookie); + + if (fscache_acquire_non_index_cookie(cookie) == 0) + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); + } else { + set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); + } + +out_unlock: + clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); +} +EXPORT_SYMBOL(__fscache_enable_cookie); + +/* * acquire a non-index cookie * - this must make sure the index chain is instantiated and instantiate the * object representation too @@ -157,7 +197,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) _enter(""); - cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE; + set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); /* now we need to see whether the backing objects for this cookie yet * exist, if not there'll be nothing to search */ @@ -180,9 +220,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) _debug("cache %s", cache->tag->name); - cookie->flags = - (1 << FSCACHE_COOKIE_LOOKING_UP) | - (1 << FSCACHE_COOKIE_NO_DATA_YET); + set_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); /* ask the cache to allocate objects for this cookie and its parent * chain */ @@ -398,7 +436,8 @@ void __fscache_invalidate(struct fscache_cookie *cookie) if (!hlist_empty(&cookie->backing_objects)) { spin_lock(&cookie->lock); - if (!hlist_empty(&cookie->backing_objects) && + if (fscache_cookie_enabled(cookie) && + !hlist_empty(&cookie->backing_objects) && !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { object = hlist_entry(cookie->backing_objects.first, @@ -452,10 +491,14 @@ void __fscache_update_cookie(struct fscache_cookie *cookie) spin_lock(&cookie->lock); - /* update the index entry on disk in each cache backing this cookie */ - hlist_for_each_entry(object, - &cookie->backing_objects, cookie_link) { - fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); + if (fscache_cookie_enabled(cookie)) { + /* update the index entry on disk in each cache backing this + * cookie. + */ + hlist_for_each_entry(object, + &cookie->backing_objects, cookie_link) { + fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); + } } spin_unlock(&cookie->lock); @@ -464,28 +507,14 @@ void __fscache_update_cookie(struct fscache_cookie *cookie) EXPORT_SYMBOL(__fscache_update_cookie); /* - * release a cookie back to the cache - * - the object will be marked as recyclable on disk if retire is true - * - all dependents of this cookie must have already been unregistered - * (indices/files/pages) + * Disable a cookie to stop it from accepting new requests from the netfs. */ -void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) +void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) { struct fscache_object *object; + bool awaken = false; - fscache_stat(&fscache_n_relinquishes); - if (retire) - fscache_stat(&fscache_n_relinquishes_retire); - - if (!cookie) { - fscache_stat(&fscache_n_relinquishes_null); - _leave(" [no cookie]"); - return; - } - - _enter("%p{%s,%p,%d},%d", - cookie, cookie->def->name, cookie->netfs_data, - atomic_read(&cookie->n_active), retire); + _enter("%p,%u", cookie, invalidate); ASSERTCMP(atomic_read(&cookie->n_active), >, 0); @@ -495,24 +524,82 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) BUG(); } - /* No further netfs-accessing operations on this cookie permitted */ - set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags); - if (retire) - set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags); + wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) + goto out_unlock_enable; + + /* If the cookie is being invalidated, wait for that to complete first + * so that we can reuse the flag. + */ + __fscache_wait_on_invalidate(cookie); + + /* Dispose of the backing objects */ + set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags); spin_lock(&cookie->lock); - hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { - fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); + if (!hlist_empty(&cookie->backing_objects)) { + hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { + if (invalidate) + set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); + fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); + } + } else { + if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) + awaken = true; } spin_unlock(&cookie->lock); + if (awaken) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); /* Wait for cessation of activity requiring access to the netfs (when - * n_active reaches 0). + * n_active reaches 0). This makes sure outstanding reads and writes + * have completed. */ if (!atomic_dec_and_test(&cookie->n_active)) wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, TASK_UNINTERRUPTIBLE); + /* Reset the cookie state if it wasn't relinquished */ + if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) { + atomic_inc(&cookie->n_active); + set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + } + +out_unlock_enable: + clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); + _leave(""); +} +EXPORT_SYMBOL(__fscache_disable_cookie); + +/* + * release a cookie back to the cache + * - the object will be marked as recyclable on disk if retire is true + * - all dependents of this cookie must have already been unregistered + * (indices/files/pages) + */ +void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) +{ + fscache_stat(&fscache_n_relinquishes); + if (retire) + fscache_stat(&fscache_n_relinquishes_retire); + + if (!cookie) { + fscache_stat(&fscache_n_relinquishes_null); + _leave(" [no cookie]"); + return; + } + + _enter("%p{%s,%p,%d},%d", + cookie, cookie->def->name, cookie->netfs_data, + atomic_read(&cookie->n_active), retire); + + /* No further netfs-accessing operations on this cookie permitted */ + set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags); + + __fscache_disable_cookie(cookie, retire); + /* Clear pointers back to the netfs */ cookie->netfs_data = NULL; cookie->def = NULL; @@ -568,6 +655,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) { struct fscache_operation *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,", cookie); @@ -591,7 +679,8 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto inconsistent; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); @@ -600,7 +689,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) op->debug_id = atomic_inc_return(&fscache_op_debug_id); - atomic_inc(&cookie->n_active); + __fscache_use_cookie(cookie); if (fscache_submit_op(object, op) < 0) goto submit_failed; @@ -622,9 +711,11 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) return ret; submit_failed: - atomic_dec(&cookie->n_active); + wake_cookie = __fscache_unuse_cookie(cookie); inconsistent: spin_unlock(&cookie->lock); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); kfree(op); _leave(" = -ESTALE"); return -ESTALE; diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c index 10a2ade0bdf8..5a117df2a9ef 100644 --- a/fs/fscache/fsdef.c +++ b/fs/fscache/fsdef.c @@ -59,6 +59,7 @@ struct fscache_cookie fscache_fsdef_index = { .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), .backing_objects = HLIST_HEAD_INIT, .def = &fscache_fsdef_index_def, + .flags = 1 << FSCACHE_COOKIE_ENABLED, }; EXPORT_SYMBOL(fscache_fsdef_index); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index b1bb6117473a..989f39401547 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -45,6 +45,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) netfs->primary_index->def = &fscache_fsdef_netfs_def; netfs->primary_index->parent = &fscache_fsdef_index; netfs->primary_index->netfs_data = netfs; + netfs->primary_index->flags = 1 << FSCACHE_COOKIE_ENABLED; atomic_inc(&netfs->primary_index->parent->usage); atomic_inc(&netfs->primary_index->parent->n_children); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 86d75a60b20c..dcb821617774 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -495,6 +495,7 @@ void fscache_object_lookup_negative(struct fscache_object *object) * returning ENODATA. */ set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); _debug("wake up lookup %p", &cookie->flags); clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); @@ -527,6 +528,7 @@ void fscache_obtained_object(struct fscache_object *object) /* We do (presumably) have data */ clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); /* Allow write requests to begin stacking up and read requests * to begin shovelling data. @@ -679,7 +681,8 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob */ spin_lock(&cookie->lock); hlist_del_init(&object->cookie_link); - if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) + if (hlist_empty(&cookie->backing_objects) && + test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) awaken = true; spin_unlock(&cookie->lock); @@ -927,7 +930,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj */ if (!fscache_use_cookie(object)) { ASSERT(object->cookie->stores.rnode == NULL); - set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags); + set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); _leave(" [no cookie]"); return transit_to(KILL_OBJECT); } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 73899c1c3449..7f5c658af755 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -163,12 +163,10 @@ static void fscache_attr_changed_op(struct fscache_operation *op) fscache_stat(&fscache_n_attr_changed_calls); - if (fscache_object_is_active(object) && - fscache_use_cookie(object)) { + if (fscache_object_is_active(object)) { fscache_stat(&fscache_n_cop_attr_changed); ret = object->cache->ops->attr_changed(object); fscache_stat_d(&fscache_n_cop_attr_changed); - fscache_unuse_cookie(object); if (ret < 0) fscache_abort_object(object); } @@ -184,6 +182,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) { struct fscache_operation *op; struct fscache_object *object; + bool wake_cookie; _enter("%p", cookie); @@ -199,15 +198,19 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) } fscache_operation_init(op, fscache_attr_changed_op, NULL); - op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); + op->flags = FSCACHE_OP_ASYNC | + (1 << FSCACHE_OP_EXCLUSIVE) | + (1 << FSCACHE_OP_UNUSE_COOKIE); spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); + __fscache_use_cookie(cookie); if (fscache_submit_exclusive_op(object, op) < 0) goto nobufs; spin_unlock(&cookie->lock); @@ -217,8 +220,11 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return 0; nobufs: + wake_cookie = __fscache_unuse_cookie(cookie); spin_unlock(&cookie->lock); kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_attr_changed_nobufs); _leave(" = %d", -ENOBUFS); return -ENOBUFS; @@ -263,7 +269,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval( } fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); - atomic_inc(&cookie->n_active); op->op.flags = FSCACHE_OP_MYTHREAD | (1UL << FSCACHE_OP_WAITING) | (1UL << FSCACHE_OP_UNUSE_COOKIE); @@ -384,6 +389,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, { struct fscache_retrieval *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,%p,,,", cookie, page); @@ -405,7 +411,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, return -ERESTARTSYS; op = fscache_alloc_retrieval(cookie, page->mapping, - end_io_func,context); + end_io_func, context); if (!op) { _leave(" = -ENOMEM"); return -ENOMEM; @@ -414,13 +420,15 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs_unlock; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)); + __fscache_use_cookie(cookie); atomic_inc(&object->n_reads); __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); @@ -475,9 +483,11 @@ error: nobufs_unlock_dec: atomic_dec(&object->n_reads); + wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - atomic_dec(&cookie->n_active); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); kfree(op); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); @@ -514,6 +524,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, { struct fscache_retrieval *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,,%d,,,", cookie, *nr_pages); @@ -542,11 +553,13 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs_unlock; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); + __fscache_use_cookie(cookie); atomic_inc(&object->n_reads); __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); @@ -601,10 +614,12 @@ error: nobufs_unlock_dec: atomic_dec(&object->n_reads); + wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - atomic_dec(&cookie->n_active); kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); _leave(" = -ENOBUFS"); @@ -626,6 +641,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, { struct fscache_retrieval *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,%p,,,", cookie, page); @@ -653,13 +669,15 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs_unlock; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); + __fscache_use_cookie(cookie); if (fscache_submit_op(object, &op->op) < 0) - goto nobufs_unlock; + goto nobufs_unlock_dec; spin_unlock(&cookie->lock); fscache_stat(&fscache_n_alloc_ops); @@ -689,10 +707,13 @@ error: _leave(" = %d", ret); return ret; +nobufs_unlock_dec: + wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - atomic_dec(&cookie->n_active); kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); nobufs: fscache_stat(&fscache_n_allocs_nobufs); _leave(" = -ENOBUFS"); @@ -889,6 +910,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, { struct fscache_storage *op; struct fscache_object *object; + bool wake_cookie = false; int ret; _enter("%p,%x,", cookie, (u32) page->flags); @@ -920,7 +942,8 @@ int __fscache_write_page(struct fscache_cookie *cookie, ret = -ENOBUFS; spin_lock(&cookie->lock); - if (hlist_empty(&cookie->backing_objects)) + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) goto nobufs; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); @@ -957,7 +980,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); op->store_limit = object->store_limit; - atomic_inc(&cookie->n_active); + __fscache_use_cookie(cookie); if (fscache_submit_op(object, &op->op) < 0) goto submit_failed; @@ -984,10 +1007,10 @@ already_pending: return 0; submit_failed: - atomic_dec(&cookie->n_active); spin_lock(&cookie->stores_lock); radix_tree_delete(&cookie->stores, page->index); spin_unlock(&cookie->stores_lock); + wake_cookie = __fscache_unuse_cookie(cookie); page_cache_release(page); ret = -ENOBUFS; goto nobufs; @@ -999,6 +1022,8 @@ nobufs: spin_unlock(&cookie->lock); radix_tree_preload_end(); kfree(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_stores_nobufs); _leave(" = -ENOBUFS"); return -ENOBUFS; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index adbfd66b380f..24da581cb52b 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -589,11 +589,14 @@ static struct attribute *cuse_class_dev_attrs[] = { ATTRIBUTE_GROUPS(cuse_class_dev); static struct miscdevice cuse_miscdev = { - .minor = MISC_DYNAMIC_MINOR, + .minor = CUSE_MINOR, .name = "cuse", .fops = &cuse_channel_fops, }; +MODULE_ALIAS_MISCDEV(CUSE_MINOR); +MODULE_ALIAS("devname:cuse"); + static int __init cuse_init(void) { int i, rc; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4598345ab87d..7e70506297bc 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -334,7 +334,8 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) BUG_ON(req->inode != inode); curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; - if (curr_index == index) { + if (curr_index <= index && + index < curr_index + req->num_pages) { found = true; break; } @@ -1409,8 +1410,13 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { - __free_page(req->pages[0]); - fuse_file_put(req->ff, false); + int i; + + for (i = 0; i < req->num_pages; i++) + __free_page(req->pages[i]); + + if (req->ff) + fuse_file_put(req->ff, false); } static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) @@ -1418,30 +1424,34 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) struct inode *inode = req->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + int i; list_del(&req->writepages_entry); - dec_bdi_stat(bdi, BDI_WRITEBACK); - dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP); - bdi_writeout_inc(bdi); + for (i = 0; i < req->num_pages; i++) { + dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP); + bdi_writeout_inc(bdi); + } wake_up(&fi->page_waitq); } /* Called under fc->lock, may release and reacquire it */ -static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req, + loff_t size) __releases(fc->lock) __acquires(fc->lock) { struct fuse_inode *fi = get_fuse_inode(req->inode); - loff_t size = i_size_read(req->inode); struct fuse_write_in *inarg = &req->misc.write.in; + __u64 data_size = req->num_pages * PAGE_CACHE_SIZE; if (!fc->connected) goto out_free; - if (inarg->offset + PAGE_CACHE_SIZE <= size) { - inarg->size = PAGE_CACHE_SIZE; + if (inarg->offset + data_size <= size) { + inarg->size = data_size; } else if (inarg->offset < size) { - inarg->size = size & (PAGE_CACHE_SIZE - 1); + inarg->size = size - inarg->offset; } else { /* Got truncated off completely */ goto out_free; @@ -1472,12 +1482,13 @@ __acquires(fc->lock) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + size_t crop = i_size_read(inode); struct fuse_req *req; while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { req = list_entry(fi->queued_writes.next, struct fuse_req, list); list_del_init(&req->list); - fuse_send_writepage(fc, req); + fuse_send_writepage(fc, req, crop); } } @@ -1488,12 +1499,62 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) mapping_set_error(inode->i_mapping, req->out.h.error); spin_lock(&fc->lock); + while (req->misc.write.next) { + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_write_in *inarg = &req->misc.write.in; + struct fuse_req *next = req->misc.write.next; + req->misc.write.next = next->misc.write.next; + next->misc.write.next = NULL; + next->ff = fuse_file_get(req->ff); + list_add(&next->writepages_entry, &fi->writepages); + + /* + * Skip fuse_flush_writepages() to make it easy to crop requests + * based on primary request size. + * + * 1st case (trivial): there are no concurrent activities using + * fuse_set/release_nowrite. Then we're on safe side because + * fuse_flush_writepages() would call fuse_send_writepage() + * anyway. + * + * 2nd case: someone called fuse_set_nowrite and it is waiting + * now for completion of all in-flight requests. This happens + * rarely and no more than once per page, so this should be + * okay. + * + * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle + * of fuse_set_nowrite..fuse_release_nowrite section. The fact + * that fuse_set_nowrite returned implies that all in-flight + * requests were completed along with all of their secondary + * requests. Further primary requests are blocked by negative + * writectr. Hence there cannot be any in-flight requests and + * no invocations of fuse_writepage_end() while we're in + * fuse_set_nowrite..fuse_release_nowrite section. + */ + fuse_send_writepage(fc, next, inarg->offset + inarg->size); + } fi->writectr--; fuse_writepage_finish(fc, req); spin_unlock(&fc->lock); fuse_writepage_free(fc, req); } +static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) +{ + struct fuse_file *ff = NULL; + + spin_lock(&fc->lock); + if (!WARN_ON(list_empty(&fi->write_files))) { + ff = list_entry(fi->write_files.next, struct fuse_file, + write_entry); + fuse_file_get(ff); + } + spin_unlock(&fc->lock); + + return ff; +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; @@ -1501,8 +1562,8 @@ static int fuse_writepage_locked(struct page *page) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_req *req; - struct fuse_file *ff; struct page *tmp_page; + int error = -ENOMEM; set_page_writeback(page); @@ -1515,16 +1576,16 @@ static int fuse_writepage_locked(struct page *page) if (!tmp_page) goto err_free; - spin_lock(&fc->lock); - BUG_ON(list_empty(&fi->write_files)); - ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); - req->ff = fuse_file_get(ff); - spin_unlock(&fc->lock); + error = -EIO; + req->ff = fuse_write_file_get(fc, fi); + if (!req->ff) + goto err_free; - fuse_write_fill(req, ff, page_offset(page), 0); + fuse_write_fill(req, req->ff, page_offset(page), 0); copy_highpage(tmp_page, page); req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; + req->misc.write.next = NULL; req->in.argpages = 1; req->num_pages = 1; req->pages[0] = tmp_page; @@ -1550,19 +1611,263 @@ err_free: fuse_request_free(req); err: end_page_writeback(page); - return -ENOMEM; + return error; } static int fuse_writepage(struct page *page, struct writeback_control *wbc) { int err; + if (fuse_page_is_writeback(page->mapping->host, page->index)) { + /* + * ->writepages() should be called for sync() and friends. We + * should only get here on direct reclaim and then we are + * allowed to skip a page which is already in flight + */ + WARN_ON(wbc->sync_mode == WB_SYNC_ALL); + + redirty_page_for_writepage(wbc, page); + return 0; + } + err = fuse_writepage_locked(page); unlock_page(page); return err; } +struct fuse_fill_wb_data { + struct fuse_req *req; + struct fuse_file *ff; + struct inode *inode; + struct page **orig_pages; +}; + +static void fuse_writepages_send(struct fuse_fill_wb_data *data) +{ + struct fuse_req *req = data->req; + struct inode *inode = data->inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + int num_pages = req->num_pages; + int i; + + req->ff = fuse_file_get(data->ff); + spin_lock(&fc->lock); + list_add_tail(&req->list, &fi->queued_writes); + fuse_flush_writepages(inode); + spin_unlock(&fc->lock); + + for (i = 0; i < num_pages; i++) + end_page_writeback(data->orig_pages[i]); +} + +static bool fuse_writepage_in_flight(struct fuse_req *new_req, + struct page *page) +{ + struct fuse_conn *fc = get_fuse_conn(new_req->inode); + struct fuse_inode *fi = get_fuse_inode(new_req->inode); + struct fuse_req *tmp; + struct fuse_req *old_req; + bool found = false; + pgoff_t curr_index; + + BUG_ON(new_req->num_pages != 0); + + spin_lock(&fc->lock); + list_del(&new_req->writepages_entry); + list_for_each_entry(old_req, &fi->writepages, writepages_entry) { + BUG_ON(old_req->inode != new_req->inode); + curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT; + if (curr_index <= page->index && + page->index < curr_index + old_req->num_pages) { + found = true; + break; + } + } + if (!found) { + list_add(&new_req->writepages_entry, &fi->writepages); + goto out_unlock; + } + + new_req->num_pages = 1; + for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) { + BUG_ON(tmp->inode != new_req->inode); + curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT; + if (tmp->num_pages == 1 && + curr_index == page->index) { + old_req = tmp; + } + } + + if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || + old_req->state == FUSE_REQ_PENDING)) { + struct backing_dev_info *bdi = page->mapping->backing_dev_info; + + copy_highpage(old_req->pages[0], page); + spin_unlock(&fc->lock); + + dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_zone_page_state(page, NR_WRITEBACK_TEMP); + bdi_writeout_inc(bdi); + fuse_writepage_free(fc, new_req); + fuse_request_free(new_req); + goto out; + } else { + new_req->misc.write.next = old_req->misc.write.next; + old_req->misc.write.next = new_req; + } +out_unlock: + spin_unlock(&fc->lock); +out: + return found; +} + +static int fuse_writepages_fill(struct page *page, + struct writeback_control *wbc, void *_data) +{ + struct fuse_fill_wb_data *data = _data; + struct fuse_req *req = data->req; + struct inode *inode = data->inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct page *tmp_page; + bool is_writeback; + int err; + + if (!data->ff) { + err = -EIO; + data->ff = fuse_write_file_get(fc, get_fuse_inode(inode)); + if (!data->ff) + goto out_unlock; + } + + /* + * Being under writeback is unlikely but possible. For example direct + * read to an mmaped fuse file will set the page dirty twice; once when + * the pages are faulted with get_user_pages(), and then after the read + * completed. + */ + is_writeback = fuse_page_is_writeback(inode, page->index); + + if (req && req->num_pages && + (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write || + data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { + fuse_writepages_send(data); + data->req = NULL; + } + err = -ENOMEM; + tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!tmp_page) + goto out_unlock; + + /* + * The page must not be redirtied until the writeout is completed + * (i.e. userspace has sent a reply to the write request). Otherwise + * there could be more than one temporary page instance for each real + * page. + * + * This is ensured by holding the page lock in page_mkwrite() while + * checking fuse_page_is_writeback(). We already hold the page lock + * since clear_page_dirty_for_io() and keep it held until we add the + * request to the fi->writepages list and increment req->num_pages. + * After this fuse_page_is_writeback() will indicate that the page is + * under writeback, so we can release the page lock. + */ + if (data->req == NULL) { + struct fuse_inode *fi = get_fuse_inode(inode); + + err = -ENOMEM; + req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ); + if (!req) { + __free_page(tmp_page); + goto out_unlock; + } + + fuse_write_fill(req, data->ff, page_offset(page), 0); + req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; + req->misc.write.next = NULL; + req->in.argpages = 1; + req->background = 1; + req->num_pages = 0; + req->end = fuse_writepage_end; + req->inode = inode; + + spin_lock(&fc->lock); + list_add(&req->writepages_entry, &fi->writepages); + spin_unlock(&fc->lock); + + data->req = req; + } + set_page_writeback(page); + + copy_highpage(tmp_page, page); + req->pages[req->num_pages] = tmp_page; + req->page_descs[req->num_pages].offset = 0; + req->page_descs[req->num_pages].length = PAGE_SIZE; + + inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK); + inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); + + err = 0; + if (is_writeback && fuse_writepage_in_flight(req, page)) { + end_page_writeback(page); + data->req = NULL; + goto out_unlock; + } + data->orig_pages[req->num_pages] = page; + + /* + * Protected by fc->lock against concurrent access by + * fuse_page_is_writeback(). + */ + spin_lock(&fc->lock); + req->num_pages++; + spin_unlock(&fc->lock); + +out_unlock: + unlock_page(page); + + return err; +} + +static int fuse_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct fuse_fill_wb_data data; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + data.inode = inode; + data.req = NULL; + data.ff = NULL; + + err = -ENOMEM; + data.orig_pages = kzalloc(sizeof(struct page *) * + FUSE_MAX_PAGES_PER_REQ, + GFP_NOFS); + if (!data.orig_pages) + goto out; + + err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); + if (data.req) { + /* Ignore errors if we can write at least one page */ + BUG_ON(!data.req->num_pages); + fuse_writepages_send(&data); + err = 0; + } + if (data.ff) + fuse_file_put(data.ff, false); + + kfree(data.orig_pages); +out: + return err; +} + static int fuse_launder_page(struct page *page) { int err = 0; @@ -1602,14 +1907,17 @@ static void fuse_vma_close(struct vm_area_struct *vma) static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; - /* - * Don't use page->mapping as it may become NULL from a - * concurrent truncate. - */ - struct inode *inode = vma->vm_file->f_mapping->host; + struct inode *inode = file_inode(vma->vm_file); + + file_update_time(vma->vm_file); + lock_page(page); + if (page->mapping != inode->i_mapping) { + unlock_page(page); + return VM_FAULT_NOPAGE; + } fuse_wait_on_page_writeback(inode, page->index); - return 0; + return VM_FAULT_LOCKED; } static const struct vm_operations_struct fuse_file_vm_ops = { @@ -2581,6 +2889,7 @@ static const struct file_operations fuse_direct_io_file_operations = { static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, .writepage = fuse_writepage, + .writepages = fuse_writepages, .launder_page = fuse_launder_page, .readpages = fuse_readpages, .set_page_dirty = __set_page_dirty_nobuffers, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 5b9e6f3b6aef..643274852c8b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -321,6 +321,7 @@ struct fuse_req { struct { struct fuse_write_in in; struct fuse_write_out out; + struct fuse_req *next; } write; struct fuse_notify_retrieve_in retrieve_in; struct fuse_lk_in lk_in; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 1f7d8057ea68..b7fc035a6943 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -611,12 +611,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); if (alloc_required) { + struct gfs2_alloc_parms ap = { .aflags = 0, }; error = gfs2_quota_lock_check(ip); if (error) goto out_unlock; requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip, requested, 0); + ap.target = requested; + error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_qunlock; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 62a65fc448dc..fe0500c0af7a 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1216,6 +1216,7 @@ static int do_grow(struct inode *inode, u64 size) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_alloc_parms ap = { .target = 1, }; struct buffer_head *dibh; int error; int unstuff = 0; @@ -1226,7 +1227,7 @@ static int do_grow(struct inode *inode, u64 size) if (error) return error; - error = gfs2_inplace_reserve(ip, 1, 0); + error = gfs2_inplace_reserve(ip, &ap); if (error) goto do_grow_qunlock; unstuff = 1; @@ -1279,6 +1280,7 @@ do_grow_qunlock: int gfs2_setattr_size(struct inode *inode, u64 newsize) { + struct gfs2_inode *ip = GFS2_I(inode); int ret; u64 oldsize; @@ -1294,7 +1296,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) inode_dio_wait(inode); - ret = gfs2_rs_alloc(GFS2_I(inode)); + ret = gfs2_rs_alloc(ip); if (ret) goto out; @@ -1304,6 +1306,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) goto out; } + gfs2_rs_deltree(ip->i_res); ret = do_shrink(inode, oldsize, newsize); out: put_write_access(inode); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 0621b46d474d..efc078f0ee4e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -383,6 +383,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = file_inode(vma->vm_file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned long last_index; u64 pos = page->index << PAGE_CACHE_SHIFT; unsigned int data_blocks, ind_blocks, rblocks; @@ -430,7 +431,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out_unlock; gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); - ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); + ap.target = data_blocks + ind_blocks; + ret = gfs2_inplace_reserve(ip, &ap); if (ret) goto out_quota_unlock; @@ -620,7 +622,7 @@ static int gfs2_release(struct inode *inode, struct file *file) if (!(file->f_mode & FMODE_WRITE)) return 0; - gfs2_rs_delete(ip); + gfs2_rs_delete(ip, &inode->i_writecount); return 0; } @@ -800,6 +802,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, struct inode *inode = file_inode(file); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned int data_blocks = 0, ind_blocks = 0, rblocks; loff_t bytes, max_bytes; int error; @@ -850,7 +853,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, retry: gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); - error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); + ap.target = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip, &ap); if (error) { if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { bytes >>= 1; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c2f41b4d00b9..e66a8009aff1 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -31,6 +31,7 @@ #include <linux/bit_spinlock.h> #include <linux/percpu.h> #include <linux/list_sort.h> +#include <linux/lockref.h> #include "gfs2.h" #include "incore.h" @@ -129,10 +130,10 @@ void gfs2_glock_free(struct gfs2_glock *gl) * */ -void gfs2_glock_hold(struct gfs2_glock *gl) +static void gfs2_glock_hold(struct gfs2_glock *gl) { - GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0); - atomic_inc(&gl->gl_ref); + GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); + lockref_get(&gl->gl_lockref); } /** @@ -187,20 +188,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) } /** - * gfs2_glock_put_nolock() - Decrement reference count on glock - * @gl: The glock to put - * - * This function should only be used if the caller has its own reference - * to the glock, in addition to the one it is dropping. - */ - -void gfs2_glock_put_nolock(struct gfs2_glock *gl) -{ - if (atomic_dec_and_test(&gl->gl_ref)) - GLOCK_BUG_ON(gl, 1); -} - -/** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put * @@ -211,17 +198,22 @@ void gfs2_glock_put(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_sbd; struct address_space *mapping = gfs2_glock2aspace(gl); - if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) { - __gfs2_glock_remove_from_lru(gl); - spin_unlock(&lru_lock); - spin_lock_bucket(gl->gl_hash); - hlist_bl_del_rcu(&gl->gl_list); - spin_unlock_bucket(gl->gl_hash); - GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); - GLOCK_BUG_ON(gl, mapping && mapping->nrpages); - trace_gfs2_glock_put(gl); - sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); - } + if (lockref_put_or_lock(&gl->gl_lockref)) + return; + + lockref_mark_dead(&gl->gl_lockref); + + spin_lock(&lru_lock); + __gfs2_glock_remove_from_lru(gl); + spin_unlock(&lru_lock); + spin_unlock(&gl->gl_lockref.lock); + spin_lock_bucket(gl->gl_hash); + hlist_bl_del_rcu(&gl->gl_list); + spin_unlock_bucket(gl->gl_hash); + GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); + GLOCK_BUG_ON(gl, mapping && mapping->nrpages); + trace_gfs2_glock_put(gl); + sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } /** @@ -244,7 +236,7 @@ static struct gfs2_glock *search_bucket(unsigned int hash, continue; if (gl->gl_sbd != sdp) continue; - if (atomic_inc_not_zero(&gl->gl_ref)) + if (lockref_get_not_dead(&gl->gl_lockref)) return gl; } @@ -396,10 +388,11 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) held2 = (new_state != LM_ST_UNLOCKED); if (held1 != held2) { + GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); if (held2) - gfs2_glock_hold(gl); + gl->gl_lockref.count++; else - gfs2_glock_put_nolock(gl); + gl->gl_lockref.count--; } if (held1 && held2 && list_empty(&gl->gl_holders)) clear_bit(GLF_QUEUED, &gl->gl_flags); @@ -626,9 +619,9 @@ out: out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); smp_mb__after_clear_bit(); - gfs2_glock_hold(gl); + gl->gl_lockref.count++; if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put_nolock(gl); + gl->gl_lockref.count--; return; out_unlock: @@ -754,7 +747,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_sbd = sdp; gl->gl_flags = 0; gl->gl_name = name; - atomic_set(&gl->gl_ref, 1); + gl->gl_lockref.count = 1; gl->gl_state = LM_ST_UNLOCKED; gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; @@ -1356,10 +1349,10 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) } } - spin_unlock(&gl->gl_spin); + gl->gl_lockref.count++; set_bit(GLF_REPLY_PENDING, &gl->gl_flags); - smp_wmb(); - gfs2_glock_hold(gl); + spin_unlock(&gl->gl_spin); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); } @@ -1404,15 +1397,19 @@ __acquires(&lru_lock) while(!list_empty(list)) { gl = list_entry(list->next, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); + if (!spin_trylock(&gl->gl_spin)) { + list_add(&gl->gl_lru, &lru_list); + atomic_inc(&lru_count); + continue; + } clear_bit(GLF_LRU, &gl->gl_flags); - gfs2_glock_hold(gl); spin_unlock(&lru_lock); - spin_lock(&gl->gl_spin); + gl->gl_lockref.count++; if (demote_ok(gl)) handle_callback(gl, LM_ST_UNLOCKED, 0, false); WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put_nolock(gl); + gl->gl_lockref.count--; spin_unlock(&gl->gl_spin); spin_lock(&lru_lock); } @@ -1493,7 +1490,7 @@ static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp, rcu_read_lock(); hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) { - if ((gl->gl_sbd == sdp) && atomic_inc_not_zero(&gl->gl_ref)) + if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); } rcu_read_unlock(); @@ -1746,7 +1743,7 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) state2str(gl->gl_demote_state), dtime, atomic_read(&gl->gl_ail_count), atomic_read(&gl->gl_revokes), - atomic_read(&gl->gl_ref), gl->gl_hold_time); + (int)gl->gl_lockref.count, gl->gl_hold_time); list_for_each_entry(gh, &gl->gl_holders, gh_list) { error = dump_holder(seq, gh); @@ -1902,7 +1899,7 @@ static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) gi->nhash = 0; } /* Skip entries for other sb and dead entries */ - } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0); + } while (gi->sdp != gi->gl->gl_sbd || __lockref_is_dead(&gl->gl_lockref)); return 0; } diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 69f66e3d22bf..6647d77366ba 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -181,8 +181,6 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); -extern void gfs2_glock_hold(struct gfs2_glock *gl); -extern void gfs2_glock_put_nolock(struct gfs2_glock *gl); extern void gfs2_glock_put(struct gfs2_glock *gl); extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, struct gfs2_holder *gh); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index e2e0a90396e7..db908f697139 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -525,9 +525,9 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote) if (gl->gl_demote_state == LM_ST_UNLOCKED && gl->gl_state == LM_ST_SHARED && ip) { - gfs2_glock_hold(gl); + gl->gl_lockref.count++; if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) - gfs2_glock_put_nolock(gl); + gl->gl_lockref.count--; } } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 26aabd7caba7..ba1ea67f4eeb 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -21,6 +21,7 @@ #include <linux/rbtree.h> #include <linux/ktime.h> #include <linux/percpu.h> +#include <linux/lockref.h> #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -71,6 +72,7 @@ struct gfs2_bitmap { u32 bi_offset; u32 bi_start; u32 bi_len; + u32 bi_blocks; }; struct gfs2_rgrpd { @@ -101,19 +103,25 @@ struct gfs2_rgrpd { struct gfs2_rbm { struct gfs2_rgrpd *rgd; - struct gfs2_bitmap *bi; /* Bitmap must belong to the rgd */ u32 offset; /* The offset is bitmap relative */ + int bii; /* Bitmap index */ }; +static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm) +{ + return rbm->rgd->rd_bits + rbm->bii; +} + static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm) { - return rbm->rgd->rd_data0 + (rbm->bi->bi_start * GFS2_NBBY) + rbm->offset; + return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) + + rbm->offset; } static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1, const struct gfs2_rbm *rbm2) { - return (rbm1->rgd == rbm2->rgd) && (rbm1->bi == rbm2->bi) && + return (rbm1->rgd == rbm2->rgd) && (rbm1->bii == rbm2->bii) && (rbm1->offset == rbm2->offset); } @@ -278,6 +286,20 @@ struct gfs2_blkreserv { unsigned int rs_qa_qd_num; }; +/* + * Allocation parameters + * @target: The number of blocks we'd ideally like to allocate + * @aflags: The flags (e.g. Orlov flag) + * + * The intent is to gradually expand this structure over time in + * order to give more information, e.g. alignment, min extent size + * to the allocation code. + */ +struct gfs2_alloc_parms { + u32 target; + u32 aflags; +}; + enum { GLF_LOCK = 1, GLF_DEMOTE = 3, @@ -300,9 +322,9 @@ struct gfs2_glock { struct gfs2_sbd *gl_sbd; unsigned long gl_flags; /* GLF_... */ struct lm_lockname gl_name; - atomic_t gl_ref; - spinlock_t gl_spin; + struct lockref gl_lockref; +#define gl_spin gl_lockref.lock /* State fields protected by gl_spin */ unsigned int gl_state:2, /* Current state */ @@ -398,11 +420,10 @@ enum { struct gfs2_quota_data { struct list_head qd_list; - struct list_head qd_reclaim; - - atomic_t qd_count; - struct kqid qd_id; + struct lockref qd_lockref; + struct list_head qd_lru; + unsigned long qd_flags; /* QDF_... */ s64 qd_change; @@ -516,7 +537,6 @@ struct gfs2_tune { unsigned int gt_logd_secs; - unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ unsigned int gt_quota_scale_num; /* Numerator */ unsigned int gt_quota_scale_den; /* Denominator */ @@ -694,6 +714,7 @@ struct gfs2_sbd { struct list_head sd_quota_list; atomic_t sd_quota_count; struct mutex sd_quota_mutex; + struct mutex sd_quota_sync_mutex; wait_queue_head_t sd_quota_wait; struct list_head sd_trunc_list; spinlock_t sd_trunc_lock; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index ced3257f06e8..109ce9325b76 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -379,6 +379,7 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip, static int alloc_dinode(struct gfs2_inode *ip, u32 flags) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc_parms ap = { .target = RES_DINODE, .aflags = flags, }; int error; int dblocks = 1; @@ -386,7 +387,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags) if (error) goto out; - error = gfs2_inplace_reserve(ip, RES_DINODE, flags); + error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_quota; @@ -472,6 +473,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, struct gfs2_inode *ip, int arq) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; int error; if (arq) { @@ -479,7 +481,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_quota_locks; - error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0); + error = gfs2_inplace_reserve(dip, &ap); if (error) goto fail_quota_locks; @@ -584,17 +586,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (!IS_ERR(inode)) { d = d_splice_alias(inode, dentry); error = 0; - if (file && !IS_ERR(d)) { - if (d == NULL) - d = dentry; - if (S_ISREG(inode->i_mode)) - error = finish_open(file, d, gfs2_open_common, opened); - else + if (file) { + if (S_ISREG(inode->i_mode)) { + WARN_ON(d != NULL); + error = finish_open(file, dentry, gfs2_open_common, opened); + } else { error = finish_no_open(file, d); + } + } else { + dput(d); } gfs2_glock_dq_uninit(ghs); - if (IS_ERR(d)) - return PTR_ERR(d); return error; } else if (error != -ENOENT) { goto fail_gunlock; @@ -713,7 +715,7 @@ fail_gunlock2: fail_free_inode: if (ip->i_gl) gfs2_glock_put(ip->i_gl); - gfs2_rs_delete(ip); + gfs2_rs_delete(ip, NULL); free_inode_nonrcu(inode); inode = NULL; fail_gunlock: @@ -781,8 +783,10 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, error = finish_open(file, dentry, gfs2_open_common, opened); gfs2_glock_dq_uninit(&gh); - if (error) + if (error) { + dput(d); return ERR_PTR(error); + } return d; } @@ -874,11 +878,12 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, error = 0; if (alloc_required) { + struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; error = gfs2_quota_lock_check(dip); if (error) goto out_gunlock; - error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0); + error = gfs2_inplace_reserve(dip, &ap); if (error) goto out_gunlock_q; @@ -1163,14 +1168,16 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, d = __gfs2_lookup(dir, dentry, file, opened); if (IS_ERR(d)) return PTR_ERR(d); - if (d == NULL) - d = dentry; - if (d->d_inode) { + if (d != NULL) + dentry = d; + if (dentry->d_inode) { if (!(*opened & FILE_OPENED)) - return finish_no_open(file, d); + return finish_no_open(file, dentry); + dput(d); return 0; } + BUG_ON(d != NULL); if (!(flags & O_CREAT)) return -ENOENT; @@ -1385,11 +1392,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock; if (alloc_required) { + struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; error = gfs2_quota_lock_check(ndip); if (error) goto out_gunlock; - error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres, 0); + error = gfs2_inplace_reserve(ndip, &ap); if (error) goto out_gunlock_q; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 351586e24e30..0650db2541ef 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -31,12 +31,6 @@ struct workqueue_struct *gfs2_control_wq; -static struct shrinker qd_shrinker = { - .count_objects = gfs2_qd_shrink_count, - .scan_objects = gfs2_qd_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; - static void gfs2_init_inode_once(void *foo) { struct gfs2_inode *ip = foo; @@ -87,6 +81,10 @@ static int __init init_gfs2_fs(void) if (error) return error; + error = list_lru_init(&gfs2_qd_lru); + if (error) + goto fail_lru; + error = gfs2_glock_init(); if (error) goto fail; @@ -139,7 +137,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_rsrv_cachep) goto fail; - register_shrinker(&qd_shrinker); + register_shrinker(&gfs2_qd_shrinker); error = register_filesystem(&gfs2_fs_type); if (error) @@ -179,7 +177,9 @@ fail_wq: fail_unregister: unregister_filesystem(&gfs2_fs_type); fail: - unregister_shrinker(&qd_shrinker); + list_lru_destroy(&gfs2_qd_lru); +fail_lru: + unregister_shrinker(&gfs2_qd_shrinker); gfs2_glock_exit(); if (gfs2_rsrv_cachep) @@ -214,13 +214,14 @@ fail: static void __exit exit_gfs2_fs(void) { - unregister_shrinker(&qd_shrinker); + unregister_shrinker(&gfs2_qd_shrinker); gfs2_glock_exit(); gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); destroy_workqueue(gfs_recovery_wq); destroy_workqueue(gfs2_control_wq); + list_lru_destroy(&gfs2_qd_lru); rcu_barrier(); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 19ff5e8c285c..82303b474958 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -51,7 +51,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt) { spin_lock_init(>->gt_spin); - gt->gt_quota_simul_sync = 64; gt->gt_quota_warn_period = 10; gt->gt_quota_scale_num = 1; gt->gt_quota_scale_den = 1; @@ -94,6 +93,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) INIT_LIST_HEAD(&sdp->sd_quota_list); mutex_init(&sdp->sd_quota_mutex); + mutex_init(&sdp->sd_quota_sync_mutex); init_waitqueue_head(&sdp->sd_quota_wait); INIT_LIST_HEAD(&sdp->sd_trunc_list); spin_lock_init(&sdp->sd_trunc_lock); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index db441359ee8c..453b50eaddec 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -50,6 +50,8 @@ #include <linux/freezer.h> #include <linux/quota.h> #include <linux/dqblk_xfs.h> +#include <linux/lockref.h> +#include <linux/list_lru.h> #include "gfs2.h" #include "incore.h" @@ -71,29 +73,25 @@ struct gfs2_quota_change_host { struct kqid qc_id; }; -static LIST_HEAD(qd_lru_list); -static atomic_t qd_lru_count = ATOMIC_INIT(0); -static DEFINE_SPINLOCK(qd_lru_lock); +/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */ +static DEFINE_SPINLOCK(qd_lock); +struct list_lru gfs2_qd_lru; -unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc) +static void gfs2_qd_dispose(struct list_head *list) { struct gfs2_quota_data *qd; struct gfs2_sbd *sdp; - int nr_to_scan = sc->nr_to_scan; - long freed = 0; - if (!(sc->gfp_mask & __GFP_FS)) - return SHRINK_STOP; - - spin_lock(&qd_lru_lock); - while (nr_to_scan && !list_empty(&qd_lru_list)) { - qd = list_entry(qd_lru_list.next, - struct gfs2_quota_data, qd_reclaim); + while (!list_empty(list)) { + qd = list_entry(list->next, struct gfs2_quota_data, qd_lru); sdp = qd->qd_gl->gl_sbd; + list_del(&qd->qd_lru); + /* Free from the filesystem-specific list */ + spin_lock(&qd_lock); list_del(&qd->qd_list); + spin_unlock(&qd_lock); gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_count); @@ -103,24 +101,59 @@ unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, atomic_dec(&sdp->sd_quota_count); /* Delete it from the common reclaim list */ - list_del_init(&qd->qd_reclaim); - atomic_dec(&qd_lru_count); - spin_unlock(&qd_lru_lock); kmem_cache_free(gfs2_quotad_cachep, qd); - spin_lock(&qd_lru_lock); - nr_to_scan--; - freed++; } - spin_unlock(&qd_lru_lock); +} + + +static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg) +{ + struct list_head *dispose = arg; + struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); + + if (!spin_trylock(&qd->qd_lockref.lock)) + return LRU_SKIP; + + if (qd->qd_lockref.count == 0) { + lockref_mark_dead(&qd->qd_lockref); + list_move(&qd->qd_lru, dispose); + } + + spin_unlock(&qd->qd_lockref.lock); + return LRU_REMOVED; +} + +static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + LIST_HEAD(dispose); + unsigned long freed; + + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + + freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate, + &dispose, &sc->nr_to_scan); + + gfs2_qd_dispose(&dispose); + return freed; } -unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) { - return vfs_pressure_ratio(atomic_read(&qd_lru_count)); + return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid)); } +struct shrinker gfs2_qd_shrinker = { + .count_objects = gfs2_qd_shrink_count, + .scan_objects = gfs2_qd_shrink_scan, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, +}; + + static u64 qd2index(struct gfs2_quota_data *qd) { struct kqid qid = qd->qd_id; @@ -148,10 +181,11 @@ static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid, if (!qd) return -ENOMEM; - atomic_set(&qd->qd_count, 1); + qd->qd_lockref.count = 1; + spin_lock_init(&qd->qd_lockref.lock); qd->qd_id = qid; qd->qd_slot = -1; - INIT_LIST_HEAD(&qd->qd_reclaim); + INIT_LIST_HEAD(&qd->qd_lru); error = gfs2_glock_get(sdp, qd2index(qd), &gfs2_quota_glops, CREATE, &qd->qd_gl); @@ -177,16 +211,11 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, for (;;) { found = 0; - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { - if (qid_eq(qd->qd_id, qid)) { - if (!atomic_read(&qd->qd_count) && - !list_empty(&qd->qd_reclaim)) { - /* Remove it from reclaim list */ - list_del_init(&qd->qd_reclaim); - atomic_dec(&qd_lru_count); - } - atomic_inc(&qd->qd_count); + if (qid_eq(qd->qd_id, qid) && + lockref_get_not_dead(&qd->qd_lockref)) { + list_lru_del(&gfs2_qd_lru, &qd->qd_lru); found = 1; break; } @@ -202,7 +231,7 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, new_qd = NULL; } - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); if (qd) { if (new_qd) { @@ -222,18 +251,19 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, static void qd_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - gfs2_assert(sdp, atomic_read(&qd->qd_count)); - atomic_inc(&qd->qd_count); + gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref)); + lockref_get(&qd->qd_lockref); } static void qd_put(struct gfs2_quota_data *qd) { - if (atomic_dec_and_lock(&qd->qd_count, &qd_lru_lock)) { - /* Add to the reclaim list */ - list_add_tail(&qd->qd_reclaim, &qd_lru_list); - atomic_inc(&qd_lru_count); - spin_unlock(&qd_lru_lock); - } + if (lockref_put_or_lock(&qd->qd_lockref)) + return; + + qd->qd_lockref.count = 0; + list_lru_add(&gfs2_qd_lru, &qd->qd_lru); + spin_unlock(&qd->qd_lockref.lock); + } static int slot_get(struct gfs2_quota_data *qd) @@ -242,10 +272,10 @@ static int slot_get(struct gfs2_quota_data *qd) unsigned int c, o = 0, b; unsigned char byte = 0; - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); if (qd->qd_slot_count++) { - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); return 0; } @@ -269,13 +299,13 @@ found: sdp->sd_quota_bitmap[c][o] |= 1 << b; - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); return 0; fail: qd->qd_slot_count--; - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); return -ENOSPC; } @@ -283,23 +313,43 @@ static void slot_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); gfs2_assert(sdp, qd->qd_slot_count); qd->qd_slot_count++; - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); +} + +static void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, + unsigned int bit, int new_value) +{ + unsigned int c, o, b = bit; + int old_value; + + c = b / (8 * PAGE_SIZE); + b %= 8 * PAGE_SIZE; + o = b / 8; + b %= 8; + + old_value = (bitmap[c][o] & (1 << b)); + gfs2_assert_withdraw(sdp, !old_value != !new_value); + + if (new_value) + bitmap[c][o] |= 1 << b; + else + bitmap[c][o] &= ~(1 << b); } static void slot_put(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); gfs2_assert(sdp, qd->qd_slot_count); if (!--qd->qd_slot_count) { gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); qd->qd_slot = -1; } - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); } static int bh_get(struct gfs2_quota_data *qd) @@ -363,6 +413,25 @@ static void bh_put(struct gfs2_quota_data *qd) mutex_unlock(&sdp->sd_quota_mutex); } +static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, + u64 *sync_gen) +{ + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags) || + (sync_gen && (qd->qd_sync_gen >= *sync_gen))) + return 0; + + if (!lockref_get_not_dead(&qd->qd_lockref)) + return 0; + + list_move_tail(&qd->qd_list, &sdp->sd_quota_list); + set_bit(QDF_LOCKED, &qd->qd_flags); + qd->qd_change_sync = qd->qd_change; + gfs2_assert_warn(sdp, qd->qd_slot_count); + qd->qd_slot_count++; + return 1; +} + static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) { struct gfs2_quota_data *qd = NULL; @@ -374,31 +443,18 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) if (sdp->sd_vfs->s_flags & MS_RDONLY) return 0; - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { - if (test_bit(QDF_LOCKED, &qd->qd_flags) || - !test_bit(QDF_CHANGE, &qd->qd_flags) || - qd->qd_sync_gen >= sdp->sd_quota_sync_gen) - continue; - - list_move_tail(&qd->qd_list, &sdp->sd_quota_list); - - set_bit(QDF_LOCKED, &qd->qd_flags); - gfs2_assert_warn(sdp, atomic_read(&qd->qd_count)); - atomic_inc(&qd->qd_count); - qd->qd_change_sync = qd->qd_change; - gfs2_assert_warn(sdp, qd->qd_slot_count); - qd->qd_slot_count++; - found = 1; - - break; + found = qd_check_sync(sdp, qd, &sdp->sd_quota_sync_gen); + if (found) + break; } if (!found) qd = NULL; - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); if (qd) { gfs2_assert_warn(sdp, qd->qd_change_sync); @@ -416,43 +472,6 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) return 0; } -static int qd_trylock(struct gfs2_quota_data *qd) -{ - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - - if (sdp->sd_vfs->s_flags & MS_RDONLY) - return 0; - - spin_lock(&qd_lru_lock); - - if (test_bit(QDF_LOCKED, &qd->qd_flags) || - !test_bit(QDF_CHANGE, &qd->qd_flags)) { - spin_unlock(&qd_lru_lock); - return 0; - } - - list_move_tail(&qd->qd_list, &sdp->sd_quota_list); - - set_bit(QDF_LOCKED, &qd->qd_flags); - gfs2_assert_warn(sdp, atomic_read(&qd->qd_count)); - atomic_inc(&qd->qd_count); - qd->qd_change_sync = qd->qd_change; - gfs2_assert_warn(sdp, qd->qd_slot_count); - qd->qd_slot_count++; - - spin_unlock(&qd_lru_lock); - - gfs2_assert_warn(sdp, qd->qd_change_sync); - if (bh_get(qd)) { - clear_bit(QDF_LOCKED, &qd->qd_flags); - slot_put(qd); - qd_put(qd); - return 0; - } - - return 1; -} - static void qd_unlock(struct gfs2_quota_data *qd) { gfs2_assert_warn(qd->qd_gl->gl_sbd, @@ -602,9 +621,9 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) x = be64_to_cpu(qc->qc_change) + change; qc->qc_change = cpu_to_be64(x); - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); qd->qd_change = x; - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); if (!x) { gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags)); @@ -763,6 +782,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) { struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned int data_blocks, ind_blocks; struct gfs2_holder *ghs, i_gh; unsigned int qx, x; @@ -815,7 +835,8 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; reserved = 1 + (nalloc * (data_blocks + ind_blocks)); - error = gfs2_inplace_reserve(ip, reserved, 0); + ap.target = reserved; + error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_alloc; @@ -974,9 +995,9 @@ static int need_sync(struct gfs2_quota_data *qd) if (!qd->qd_qb.qb_limit) return 0; - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); value = qd->qd_change; - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); spin_lock(>->gt_spin); num = gt->gt_quota_scale_num; @@ -1001,9 +1022,11 @@ static int need_sync(struct gfs2_quota_data *qd) void gfs2_quota_unlock(struct gfs2_inode *ip) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qda[4]; unsigned int count = 0; unsigned int x; + int found; if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) goto out; @@ -1016,9 +1039,25 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) sync = need_sync(qd); gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]); + if (!sync) + continue; + + spin_lock(&qd_lock); + found = qd_check_sync(sdp, qd, NULL); + spin_unlock(&qd_lock); + + if (!found) + continue; - if (sync && qd_trylock(qd)) - qda[count++] = qd; + gfs2_assert_warn(sdp, qd->qd_change_sync); + if (bh_get(qd)) { + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + continue; + } + + qda[count++] = qd; } if (count) { @@ -1067,9 +1106,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) continue; value = (s64)be64_to_cpu(qd->qd_qb.qb_value); - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); value += qd->qd_change; - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { print_message(qd, "exceeded"); @@ -1118,17 +1157,18 @@ int gfs2_quota_sync(struct super_block *sb, int type) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_data **qda; - unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync); + unsigned int max_qd = PAGE_SIZE/sizeof(struct gfs2_holder); unsigned int num_qd; unsigned int x; int error = 0; - sdp->sd_quota_sync_gen++; - qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL); if (!qda) return -ENOMEM; + mutex_lock(&sdp->sd_quota_sync_mutex); + sdp->sd_quota_sync_gen++; + do { num_qd = 0; @@ -1153,6 +1193,7 @@ int gfs2_quota_sync(struct super_block *sb, int type) } } while (!error && num_qd == max_qd); + mutex_unlock(&sdp->sd_quota_sync_mutex); kfree(qda); return error; @@ -1258,11 +1299,11 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) qd->qd_slot = slot; qd->qd_slot_count = 1; - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); list_add(&qd->qd_list, &sdp->sd_quota_list); atomic_inc(&sdp->sd_quota_count); - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); found++; } @@ -1288,30 +1329,34 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) struct gfs2_quota_data *qd; unsigned int x; - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); while (!list_empty(head)) { qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); - if (atomic_read(&qd->qd_count) > 1 || - (atomic_read(&qd->qd_count) && - !test_bit(QDF_CHANGE, &qd->qd_flags))) { + /* + * To be removed in due course... we should be able to + * ensure that all refs to the qd have done by this point + * so that this rather odd test is not required + */ + spin_lock(&qd->qd_lockref.lock); + if (qd->qd_lockref.count > 1 || + (qd->qd_lockref.count && !test_bit(QDF_CHANGE, &qd->qd_flags))) { + spin_unlock(&qd->qd_lockref.lock); list_move(&qd->qd_list, head); - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); schedule(); - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); continue; } + spin_unlock(&qd->qd_lockref.lock); list_del(&qd->qd_list); /* Also remove if this qd exists in the reclaim list */ - if (!list_empty(&qd->qd_reclaim)) { - list_del_init(&qd->qd_reclaim); - atomic_dec(&qd_lru_count); - } + list_lru_del(&gfs2_qd_lru, &qd->qd_lru); atomic_dec(&sdp->sd_quota_count); - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); - if (!atomic_read(&qd->qd_count)) { + if (!qd->qd_lockref.count) { gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_count); } else @@ -1321,9 +1366,9 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) gfs2_glock_put(qd->qd_gl); kmem_cache_free(gfs2_quotad_cachep, qd); - spin_lock(&qd_lru_lock); + spin_lock(&qd_lock); } - spin_unlock(&qd_lru_lock); + spin_unlock(&qd_lock); gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); @@ -1462,7 +1507,7 @@ static int gfs2_quota_get_xstate(struct super_block *sb, } fqs->qs_uquota.qfs_nextents = 1; /* unsupported */ fqs->qs_gquota = fqs->qs_uquota; /* its the same inode in both cases */ - fqs->qs_incoredqs = atomic_read(&qd_lru_count); + fqs->qs_incoredqs = list_lru_count(&gfs2_qd_lru); return 0; } @@ -1573,10 +1618,12 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, if (gfs2_is_stuffed(ip)) alloc_required = 1; if (alloc_required) { + struct gfs2_alloc_parms ap = { .aflags = 0, }; gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); blocks = 1 + data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip, blocks, 0); + ap.target = blocks; + error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_i; blocks += gfs2_rg_blocks(ip, blocks); diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 0f64d9deb1b0..96e4f34a03b0 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -10,9 +10,10 @@ #ifndef __QUOTA_DOT_H__ #define __QUOTA_DOT_H__ +#include <linux/list_lru.h> + struct gfs2_inode; struct gfs2_sbd; -struct shrink_control; #define NO_UID_QUOTA_CHANGE INVALID_UID #define NO_GID_QUOTA_CHANGE INVALID_GID @@ -53,10 +54,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; } -extern unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, - struct shrink_control *sc); -extern unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc); extern const struct quotactl_ops gfs2_quotactl_ops; +extern struct shrinker gfs2_qd_shrinker; +extern struct list_lru gfs2_qd_lru; #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 69317435faa7..4d83abdd5635 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -81,11 +81,12 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone, unsigned char new_state) { unsigned char *byte1, *byte2, *end, cur_state; - unsigned int buflen = rbm->bi->bi_len; + struct gfs2_bitmap *bi = rbm_bi(rbm); + unsigned int buflen = bi->bi_len; const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE; - byte1 = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY); - end = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + buflen; + byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY); + end = bi->bi_bh->b_data + bi->bi_offset + buflen; BUG_ON(byte1 >= end); @@ -95,18 +96,17 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone, printk(KERN_WARNING "GFS2: buf_blk = 0x%x old_state=%d, " "new_state=%d\n", rbm->offset, cur_state, new_state); printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%x\n", - (unsigned long long)rbm->rgd->rd_addr, - rbm->bi->bi_start); + (unsigned long long)rbm->rgd->rd_addr, bi->bi_start); printk(KERN_WARNING "GFS2: bi_offset=0x%x bi_len=0x%x\n", - rbm->bi->bi_offset, rbm->bi->bi_len); + bi->bi_offset, bi->bi_len); dump_stack(); gfs2_consist_rgrpd(rbm->rgd); return; } *byte1 ^= (cur_state ^ new_state) << bit; - if (do_clone && rbm->bi->bi_clone) { - byte2 = rbm->bi->bi_clone + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY); + if (do_clone && bi->bi_clone) { + byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY); cur_state = (*byte2 >> bit) & GFS2_BIT_MASK; *byte2 ^= (cur_state ^ new_state) << bit; } @@ -121,7 +121,8 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone, static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm) { - const u8 *buffer = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset; + struct gfs2_bitmap *bi = rbm_bi(rbm); + const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset; const u8 *byte; unsigned int bit; @@ -252,29 +253,53 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) { u64 rblock = block - rbm->rgd->rd_data0; - u32 x; if (WARN_ON_ONCE(rblock > UINT_MAX)) return -EINVAL; if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data) return -E2BIG; - rbm->bi = rbm->rgd->rd_bits; + rbm->bii = 0; rbm->offset = (u32)(rblock); /* Check if the block is within the first block */ - if (rbm->offset < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY) + if (rbm->offset < rbm_bi(rbm)->bi_blocks) return 0; /* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */ rbm->offset += (sizeof(struct gfs2_rgrp) - sizeof(struct gfs2_meta_header)) * GFS2_NBBY; - x = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap; - rbm->offset -= x * rbm->rgd->rd_sbd->sd_blocks_per_bitmap; - rbm->bi += x; + rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap; + rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap; return 0; } /** + * gfs2_rbm_incr - increment an rbm structure + * @rbm: The rbm with rgd already set correctly + * + * This function takes an existing rbm structure and increments it to the next + * viable block offset. + * + * Returns: If incrementing the offset would cause the rbm to go past the + * end of the rgrp, true is returned, otherwise false. + * + */ + +static bool gfs2_rbm_incr(struct gfs2_rbm *rbm) +{ + if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */ + rbm->offset++; + return false; + } + if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */ + return true; + + rbm->offset = 0; + rbm->bii++; + return false; +} + +/** * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned * @rbm: Position to search (value/result) * @n_unaligned: Number of unaligned blocks to check @@ -285,7 +310,6 @@ static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len) { - u64 block; u32 n; u8 res; @@ -296,8 +320,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *le (*len)--; if (*len == 0) return true; - block = gfs2_rbm_to_block(rbm); - if (gfs2_rbm_from_block(rbm, block + 1)) + if (gfs2_rbm_incr(rbm)) return true; } @@ -328,6 +351,7 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len) u32 chunk_size; u8 *ptr, *start, *end; u64 block; + struct gfs2_bitmap *bi; if (n_unaligned && gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len)) @@ -336,11 +360,12 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len) n_unaligned = len & 3; /* Start is now byte aligned */ while (len > 3) { - start = rbm.bi->bi_bh->b_data; - if (rbm.bi->bi_clone) - start = rbm.bi->bi_clone; - end = start + rbm.bi->bi_bh->b_size; - start += rbm.bi->bi_offset; + bi = rbm_bi(&rbm); + start = bi->bi_bh->b_data; + if (bi->bi_clone) + start = bi->bi_clone; + end = start + bi->bi_bh->b_size; + start += bi->bi_offset; BUG_ON(rbm.offset & 3); start += (rbm.offset / GFS2_NBBY); bytes = min_t(u32, len / GFS2_NBBY, (end - start)); @@ -605,11 +630,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs) RB_CLEAR_NODE(&rs->rs_node); if (rs->rs_free) { + struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm); + /* return reserved blocks to the rgrp */ BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; rs->rs_free = 0; - clear_bit(GBF_FULL, &rs->rs_rbm.bi->bi_flags); + clear_bit(GBF_FULL, &bi->bi_flags); smp_mb__after_clear_bit(); } } @@ -634,14 +661,13 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) /** * gfs2_rs_delete - delete a multi-block reservation * @ip: The inode for this reservation + * @wcount: The inode's write count, or NULL * */ -void gfs2_rs_delete(struct gfs2_inode *ip) +void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount) { - struct inode *inode = &ip->i_inode; - down_write(&ip->i_rw_mutex); - if (ip->i_res && atomic_read(&inode->i_writecount) <= 1) { + if (ip->i_res && ((wcount == NULL) || (atomic_read(wcount) <= 1))) { gfs2_rs_deltree(ip->i_res); BUG_ON(ip->i_res->rs_free); kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); @@ -743,18 +769,21 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) bi->bi_offset = sizeof(struct gfs2_rgrp); bi->bi_start = 0; bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; /* header block */ } else if (x == 0) { bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp); bi->bi_offset = sizeof(struct gfs2_rgrp); bi->bi_start = 0; bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; /* last block */ } else if (x + 1 == length) { bytes = bytes_left; bi->bi_offset = sizeof(struct gfs2_meta_header); bi->bi_start = rgd->rd_bitbytes - bytes_left; bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; /* other blocks */ } else { bytes = sdp->sd_sb.sb_bsize - @@ -762,6 +791,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) bi->bi_offset = sizeof(struct gfs2_meta_header); bi->bi_start = rgd->rd_bitbytes - bytes_left; bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; } bytes_left -= bytes; @@ -1392,12 +1422,12 @@ static void rs_insert(struct gfs2_inode *ip) * rg_mblk_search - find a group of multiple free blocks to form a reservation * @rgd: the resource group descriptor * @ip: pointer to the inode for which we're reserving blocks - * @requested: number of blocks required for this allocation + * @ap: the allocation parameters * */ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, - unsigned requested) + const struct gfs2_alloc_parms *ap) { struct gfs2_rbm rbm = { .rgd = rgd, }; u64 goal; @@ -1410,7 +1440,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, if (S_ISDIR(inode->i_mode)) extlen = 1; else { - extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested); + extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target); extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks); } if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen)) @@ -1554,14 +1584,14 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, const struct gfs2_inode *ip, bool nowrap) { struct buffer_head *bh; - struct gfs2_bitmap *initial_bi; + int initial_bii; u32 initial_offset; u32 offset; u8 *buffer; - int index; int n = 0; int iters = rbm->rgd->rd_length; int ret; + struct gfs2_bitmap *bi; /* If we are not starting at the beginning of a bitmap, then we * need to add one to the bitmap count to ensure that we search @@ -1571,52 +1601,53 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, iters++; while(1) { - if (test_bit(GBF_FULL, &rbm->bi->bi_flags) && + bi = rbm_bi(rbm); + if (test_bit(GBF_FULL, &bi->bi_flags) && (state == GFS2_BLKST_FREE)) goto next_bitmap; - bh = rbm->bi->bi_bh; - buffer = bh->b_data + rbm->bi->bi_offset; + bh = bi->bi_bh; + buffer = bh->b_data + bi->bi_offset; WARN_ON(!buffer_uptodate(bh)); - if (state != GFS2_BLKST_UNLINKED && rbm->bi->bi_clone) - buffer = rbm->bi->bi_clone + rbm->bi->bi_offset; + if (state != GFS2_BLKST_UNLINKED && bi->bi_clone) + buffer = bi->bi_clone + bi->bi_offset; initial_offset = rbm->offset; - offset = gfs2_bitfit(buffer, rbm->bi->bi_len, rbm->offset, state); + offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state); if (offset == BFITNOENT) goto bitmap_full; rbm->offset = offset; if (ip == NULL) return 0; - initial_bi = rbm->bi; + initial_bii = rbm->bii; ret = gfs2_reservation_check_and_update(rbm, ip, minext); if (ret == 0) return 0; if (ret > 0) { - n += (rbm->bi - initial_bi); + n += (rbm->bii - initial_bii); goto next_iter; } if (ret == -E2BIG) { - index = 0; + rbm->bii = 0; rbm->offset = 0; - n += (rbm->bi - initial_bi); + n += (rbm->bii - initial_bii); goto res_covered_end_of_rgrp; } return ret; bitmap_full: /* Mark bitmap as full and fall through */ - if ((state == GFS2_BLKST_FREE) && initial_offset == 0) - set_bit(GBF_FULL, &rbm->bi->bi_flags); + if ((state == GFS2_BLKST_FREE) && initial_offset == 0) { + struct gfs2_bitmap *bi = rbm_bi(rbm); + set_bit(GBF_FULL, &bi->bi_flags); + } next_bitmap: /* Find next bitmap in the rgrp */ rbm->offset = 0; - index = rbm->bi - rbm->rgd->rd_bits; - index++; - if (index == rbm->rgd->rd_length) - index = 0; + rbm->bii++; + if (rbm->bii == rbm->rgd->rd_length) + rbm->bii = 0; res_covered_end_of_rgrp: - rbm->bi = &rbm->rgd->rd_bits[index]; - if ((index == 0) && nowrap) + if ((rbm->bii == 0) && nowrap) break; n++; next_iter: @@ -1645,7 +1676,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip struct gfs2_inode *ip; int error; int found = 0; - struct gfs2_rbm rbm = { .rgd = rgd, .bi = rgd->rd_bits, .offset = 0 }; + struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 }; while (1) { down_write(&sdp->sd_log_flush_lock); @@ -1800,12 +1831,12 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b /** * gfs2_inplace_reserve - Reserve space in the filesystem * @ip: the inode to reserve space for - * @requested: the number of blocks to be reserved + * @ap: the allocation parameters * * Returns: errno */ -int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags) +int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *begin = NULL; @@ -1817,17 +1848,16 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags) if (sdp->sd_args.ar_rgrplvb) flags |= GL_SKIP; - if (gfs2_assert_warn(sdp, requested)) + if (gfs2_assert_warn(sdp, ap->target)) return -EINVAL; if (gfs2_rs_active(rs)) { begin = rs->rs_rbm.rgd; - flags = 0; /* Yoda: Do or do not. There is no try */ } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { rs->rs_rbm.rgd = begin = ip->i_rgd; } else { rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); } - if (S_ISDIR(ip->i_inode.i_mode) && (aflags & GFS2_AF_ORLOV)) + if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV)) skip = gfs2_orlov_skip(ip); if (rs->rs_rbm.rgd == NULL) return -EBADSLT; @@ -1869,14 +1899,14 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags) /* Get a reservation if we don't already have one */ if (!gfs2_rs_active(rs)) - rg_mblk_search(rs->rs_rbm.rgd, ip, requested); + rg_mblk_search(rs->rs_rbm.rgd, ip, ap); /* Skip rgrps when we can't get a reservation on first pass */ if (!gfs2_rs_active(rs) && (loops < 1)) goto check_rgrp; /* If rgrp has enough free space, use it */ - if (rs->rs_rbm.rgd->rd_free_clone >= requested) { + if (rs->rs_rbm.rgd->rd_free_clone >= ap->target) { ip->i_rgd = rs->rs_rbm.rgd; return 0; } @@ -1973,14 +2003,14 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode, *n = 1; block = gfs2_rbm_to_block(rbm); - gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm->bi->bi_bh); + gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh); gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); block++; while (*n < elen) { ret = gfs2_rbm_from_block(&pos, block); if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE) break; - gfs2_trans_add_meta(pos.rgd->rd_gl, pos.bi->bi_bh); + gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh); gfs2_setbit(&pos, true, GFS2_BLKST_USED); (*n)++; block++; @@ -2001,6 +2031,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, u32 blen, unsigned char new_state) { struct gfs2_rbm rbm; + struct gfs2_bitmap *bi; rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1); if (!rbm.rgd) { @@ -2011,15 +2042,15 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, while (blen--) { gfs2_rbm_from_block(&rbm, bstart); + bi = rbm_bi(&rbm); bstart++; - if (!rbm.bi->bi_clone) { - rbm.bi->bi_clone = kmalloc(rbm.bi->bi_bh->b_size, - GFP_NOFS | __GFP_NOFAIL); - memcpy(rbm.bi->bi_clone + rbm.bi->bi_offset, - rbm.bi->bi_bh->b_data + rbm.bi->bi_offset, - rbm.bi->bi_len); + if (!bi->bi_clone) { + bi->bi_clone = kmalloc(bi->bi_bh->b_size, + GFP_NOFS | __GFP_NOFAIL); + memcpy(bi->bi_clone + bi->bi_offset, + bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); } - gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.bi->bi_bh); + gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh); gfs2_setbit(&rbm, false, new_state); } @@ -2103,6 +2134,35 @@ out: } /** + * gfs2_set_alloc_start - Set starting point for block allocation + * @rbm: The rbm which will be set to the required location + * @ip: The gfs2 inode + * @dinode: Flag to say if allocation includes a new inode + * + * This sets the starting point from the reservation if one is active + * otherwise it falls back to guessing a start point based on the + * inode's goal block or the last allocation point in the rgrp. + */ + +static void gfs2_set_alloc_start(struct gfs2_rbm *rbm, + const struct gfs2_inode *ip, bool dinode) +{ + u64 goal; + + if (gfs2_rs_active(ip->i_res)) { + *rbm = ip->i_res->rs_rbm; + return; + } + + if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal)) + goal = ip->i_goal; + else + goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0; + + gfs2_rbm_from_block(rbm, goal); +} + +/** * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode * @ip: the inode to allocate the block for * @bn: Used to return the starting block number @@ -2120,22 +2180,14 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, struct buffer_head *dibh; struct gfs2_rbm rbm = { .rgd = ip->i_rgd, }; unsigned int ndata; - u64 goal; u64 block; /* block, within the file system scope */ int error; - if (gfs2_rs_active(ip->i_res)) - goal = gfs2_rbm_to_block(&ip->i_res->rs_rbm); - else if (!dinode && rgrp_contains_block(rbm.rgd, ip->i_goal)) - goal = ip->i_goal; - else - goal = rbm.rgd->rd_last_alloc + rbm.rgd->rd_data0; - - gfs2_rbm_from_block(&rbm, goal); + gfs2_set_alloc_start(&rbm, ip, dinode); error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false); if (error == -ENOSPC) { - gfs2_rbm_from_block(&rbm, goal); + gfs2_set_alloc_start(&rbm, ip, dinode); error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false); } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 5b3f4a896e6c..3a10d2ffbbe7 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -40,7 +40,7 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); #define GFS2_AF_ORLOV 1 -extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 flags); +extern int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap); extern void gfs2_inplace_release(struct gfs2_inode *ip); extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, @@ -48,7 +48,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, extern int gfs2_rs_alloc(struct gfs2_inode *ip); extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); -extern void gfs2_rs_delete(struct gfs2_inode *ip); +extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount); extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index e5639dec66c4..35da5b19c0de 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1526,7 +1526,7 @@ out_unlock: out: /* Case 3 starts here */ truncate_inode_pages(&inode->i_data, 0); - gfs2_rs_delete(ip); + gfs2_rs_delete(ip, NULL); gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index aa5c48044966..d09f6edda0ff 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -587,7 +587,6 @@ TUNE_ATTR(max_readahead, 0); TUNE_ATTR(complain_secs, 0); TUNE_ATTR(statfs_slow, 0); TUNE_ATTR(new_files_jdata, 0); -TUNE_ATTR(quota_simul_sync, 1); TUNE_ATTR(statfs_quantum, 1); TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); @@ -597,7 +596,6 @@ static struct attribute *tune_attrs[] = { &tune_attr_max_readahead.attr, &tune_attr_complain_secs.attr, &tune_attr_statfs_slow.attr, - &tune_attr_quota_simul_sync.attr, &tune_attr_statfs_quantum.attr, &tune_attr_quota_scale.attr, &tune_attr_new_files_jdata.attr, diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 6402fb69d71b..f7109f689e61 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -268,23 +268,3 @@ int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, return rv; } -void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, - unsigned int bit, int new_value) -{ - unsigned int c, o, b = bit; - int old_value; - - c = b / (8 * PAGE_SIZE); - b %= 8 * PAGE_SIZE; - o = b / 8; - b %= 8; - - old_value = (bitmap[c][o] & (1 << b)); - gfs2_assert_withdraw(sdp, !old_value != !new_value); - - if (new_value) - bitmap[c][o] |= 1 << b; - else - bitmap[c][o] &= ~(1 << b); -} - diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 80535739ac7b..b7ffb09b99ea 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -164,8 +164,6 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, #define gfs2_tune_get(sdp, field) \ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) -void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, - unsigned int bit, int new_value); int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...); #endif /* __UTIL_DOT_H__ */ diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index ecd37f30ab91..8c6a6f6bdba9 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -723,6 +723,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, unsigned int blks, ea_skeleton_call_t skeleton_call, void *private) { + struct gfs2_alloc_parms ap = { .target = blks }; struct buffer_head *dibh; int error; @@ -734,7 +735,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) return error; - error = gfs2_inplace_reserve(ip, blks, 0); + error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_gunlock_q; diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index be0c39b66fe0..aa603e017d22 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -26,7 +26,6 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hrtimer.h> -#include <linux/backing-dev.h> static void __journal_temp_unlink_buffer(struct journal_head *jh); @@ -100,10 +99,11 @@ static int start_this_handle(journal_t *journal, handle_t *handle) alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS); + new_transaction = kzalloc(sizeof(*new_transaction), + GFP_NOFS|__GFP_NOFAIL); if (!new_transaction) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto alloc_transaction; + ret = -ENOMEM; + goto out; } } diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index c1a3e603279c..7f464c513ba0 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -95,7 +95,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) if (insert_inode_locked(inode) < 0) { rc = -EINVAL; - goto fail_unlock; + goto fail_put; } inode_init_owner(inode, parent, mode); @@ -156,7 +156,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode) fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; -fail_unlock: clear_nlink(inode); unlock_new_inode(inode); fail_put: diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig index 6624684dd5de..f2a0cfcef11d 100644 --- a/fs/minix/Kconfig +++ b/fs/minix/Kconfig @@ -18,7 +18,7 @@ config MINIX_FS config MINIX_FS_NATIVE_ENDIAN def_bool MINIX_FS - depends on H8300 || M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) + depends on M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED def_bool MINIX_FS diff --git a/fs/namei.c b/fs/namei.c index 645268f23eb6..caa28051e197 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2294,10 +2294,11 @@ out: * path_mountpoint - look up a path to be umounted * @dfd: directory file descriptor to start walk from * @name: full pathname to walk + * @path: pointer to container for result * @flags: lookup flags * * Look up the given name, but don't attempt to revalidate the last component. - * Returns 0 and "path" will be valid on success; Retuns error otherwise. + * Returns 0 and "path" will be valid on success; Returns error otherwise. */ static int path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags) diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index b5e80b0af315..38c1768b4142 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -140,6 +140,17 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN If the NFS client is unchanged from the upstream kernel, this option should be set to the default "kernel.org". +config NFS_V4_1_MIGRATION + bool "NFSv4.1 client support for migration" + depends on NFS_V4_1 + default n + help + This option makes the NFS client advertise to NFSv4.1 servers that + it can support NFSv4 migration. + + The NFSv4.1 pieces of the Linux NFSv4 migration implementation are + still experimental. If you are not an NFSv4 developer, say N here. + config NFS_V4_SECURITY_LABEL bool depends on NFS_V4_2 && SECURITY diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 67cd73213168..073b4cf67ed9 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -164,8 +164,7 @@ nfs41_callback_up(struct svc_serv *serv) svc_xprt_put(serv->sv_bc_xprt); serv->sv_bc_xprt = NULL; } - dprintk("--> %s return %ld\n", __func__, - IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0); + dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); return rqstp; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 2dceee4db076..1d09289c8f0e 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -590,6 +590,8 @@ int nfs_create_rpc_client(struct nfs_client *clp, if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_DISCRTRY; + if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT; if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) @@ -784,8 +786,10 @@ static int nfs_init_server(struct nfs_server *server, goto error; server->port = data->nfs_server.port; + server->auth_info = data->auth_info; - error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); + error = nfs_init_server_rpcclient(server, &timeparms, + data->selected_flavor); if (error < 0) goto error; @@ -926,6 +930,7 @@ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *sour target->acdirmax = source->acdirmax; target->caps = source->caps; target->options = source->options; + target->auth_info = source->auth_info; } EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); @@ -943,7 +948,7 @@ void nfs_server_insert_lists(struct nfs_server *server) } EXPORT_SYMBOL_GPL(nfs_server_insert_lists); -static void nfs_server_remove_lists(struct nfs_server *server) +void nfs_server_remove_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs_net *nn; @@ -960,6 +965,7 @@ static void nfs_server_remove_lists(struct nfs_server *server) synchronize_rcu(); } +EXPORT_SYMBOL_GPL(nfs_server_remove_lists); /* * Allocate and initialise a server record diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 02b0df769e2d..9a8676f33350 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1139,7 +1139,13 @@ out_zap_parent: if (inode && S_ISDIR(inode->i_mode)) { /* Purge readdir caches. */ nfs_zap_caches(inode); - if (dentry->d_flags & DCACHE_DISCONNECTED) + /* + * We can't d_drop the root of a disconnected tree: + * its d_hash is on the s_anon list and d_drop() would hide + * it from shrink_dcache_for_unmount(), leading to busy + * inodes on unmount and further oopses. + */ + if (IS_ROOT(dentry)) goto out_valid; } /* If we have submounts, don't unhash ! */ @@ -1381,7 +1387,7 @@ static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, i static int do_open(struct inode *inode, struct file *filp) { - nfs_fscache_set_inode_cookie(inode, filp); + nfs_fscache_open_file(inode, filp); return 0; } diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 24d1d1c5fcaf..3ef01f0ba0bc 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -39,7 +39,7 @@ void nfs_fscache_get_client_cookie(struct nfs_client *clp) /* create a cache index for looking up filehandles */ clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, &nfs_fscache_server_index_def, - clp); + clp, true); dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", clp, clp->fscache); } @@ -139,7 +139,7 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int /* create a cache index for looking up filehandles */ nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, &nfs_fscache_super_index_def, - nfss); + nfss, true); dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", nfss, nfss->fscache); return; @@ -178,163 +178,79 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) /* * Initialise the per-inode cache cookie pointer for an NFS inode. */ -void nfs_fscache_init_inode_cookie(struct inode *inode) +void nfs_fscache_init_inode(struct inode *inode) { - NFS_I(inode)->fscache = NULL; - if (S_ISREG(inode->i_mode)) - set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); -} - -/* - * Get the per-inode cache cookie for an NFS inode. - */ -static void nfs_fscache_enable_inode_cookie(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; struct nfs_inode *nfsi = NFS_I(inode); - if (nfsi->fscache || !NFS_FSCACHE(inode)) + nfsi->fscache = NULL; + if (!S_ISREG(inode->i_mode)) return; - - if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) { - nfsi->fscache = fscache_acquire_cookie( - NFS_SB(sb)->fscache, - &nfs_fscache_inode_object_def, - nfsi); - - dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n", - sb, nfsi, nfsi->fscache); - } + nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, + &nfs_fscache_inode_object_def, + nfsi, false); } /* * Release a per-inode cookie. */ -void nfs_fscache_release_inode_cookie(struct inode *inode) +void nfs_fscache_clear_inode(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = nfs_i_fscache(inode); - dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", - nfsi, nfsi->fscache); + dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); - fscache_relinquish_cookie(nfsi->fscache, 0); + fscache_relinquish_cookie(cookie, false); nfsi->fscache = NULL; } -/* - * Retire a per-inode cookie, destroying the data attached to it. - */ -void nfs_fscache_zap_inode_cookie(struct inode *inode) +static bool nfs_fscache_can_enable(void *data) { - struct nfs_inode *nfsi = NFS_I(inode); + struct inode *inode = data; - dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n", - nfsi, nfsi->fscache); - - fscache_relinquish_cookie(nfsi->fscache, 1); - nfsi->fscache = NULL; + return !inode_is_open_for_write(inode); } /* - * Turn off the cache with regard to a per-inode cookie if opened for writing, - * invalidating all the pages in the page cache relating to the associated - * inode to clear the per-page caching. - */ -static void nfs_fscache_disable_inode_cookie(struct inode *inode) -{ - clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); - - if (NFS_I(inode)->fscache) { - dfprintk(FSCACHE, - "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); - - /* Need to uncache any pages attached to this inode that - * fscache knows about before turning off the cache. - */ - fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode); - nfs_fscache_zap_inode_cookie(inode); - } -} - -/* - * wait_on_bit() sleep function for uninterruptible waiting - */ -static int nfs_fscache_wait_bit(void *flags) -{ - schedule(); - return 0; -} - -/* - * Lock against someone else trying to also acquire or relinquish a cookie - */ -static inline void nfs_fscache_inode_lock(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags)) - wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK, - nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE); -} - -/* - * Unlock cookie management lock - */ -static inline void nfs_fscache_inode_unlock(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - smp_mb__before_clear_bit(); - clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags); - smp_mb__after_clear_bit(); - wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK); -} - -/* - * Decide if we should enable or disable local caching for this inode. - * - For now, with NFS, only regular files that are open read-only will be able - * to use the cache. - * - May be invoked multiple times in parallel by parallel nfs_open() functions. - */ -void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) -{ - if (NFS_FSCACHE(inode)) { - nfs_fscache_inode_lock(inode); - if ((filp->f_flags & O_ACCMODE) != O_RDONLY) - nfs_fscache_disable_inode_cookie(inode); - else - nfs_fscache_enable_inode_cookie(inode); - nfs_fscache_inode_unlock(inode); - } -} -EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie); - -/* - * Replace a per-inode cookie due to revalidation detecting a file having - * changed on the server. + * Enable or disable caching for a file that is being opened as appropriate. + * The cookie is allocated when the inode is initialised, but is not enabled at + * that time. Enablement is deferred to file-open time to avoid stat() and + * access() thrashing the cache. + * + * For now, with NFS, only regular files that are open read-only will be able + * to use the cache. + * + * We enable the cache for an inode if we open it read-only and it isn't + * currently open for writing. We disable the cache if the inode is open + * write-only. + * + * The caller uses the file struct to pin i_writecount on the inode before + * calling us when a file is opened for writing, so we can make use of that. + * + * Note that this may be invoked multiple times in parallel by parallel + * nfs_open() functions. */ -void nfs_fscache_reset_inode_cookie(struct inode *inode) +void nfs_fscache_open_file(struct inode *inode, struct file *filp) { struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_server *nfss = NFS_SERVER(inode); - NFS_IFDEBUG(struct fscache_cookie *old = nfsi->fscache); + struct fscache_cookie *cookie = nfs_i_fscache(inode); - nfs_fscache_inode_lock(inode); - if (nfsi->fscache) { - /* retire the current fscache cache and get a new one */ - fscache_relinquish_cookie(nfsi->fscache, 1); - - nfsi->fscache = fscache_acquire_cookie( - nfss->nfs_client->fscache, - &nfs_fscache_inode_object_def, - nfsi); + if (!fscache_cookie_valid(cookie)) + return; - dfprintk(FSCACHE, - "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n", - nfss, nfsi, old, nfsi->fscache); + if (inode_is_open_for_write(inode)) { + dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); + clear_bit(NFS_INO_FSCACHE, &nfsi->flags); + fscache_disable_cookie(cookie, true); + fscache_uncache_all_inode_pages(cookie, inode); + } else { + dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi); + fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode); + if (fscache_cookie_enabled(cookie)) + set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); } - nfs_fscache_inode_unlock(inode); } +EXPORT_SYMBOL_GPL(nfs_fscache_open_file); /* * Release the caching state associated with a page, if the page isn't busy @@ -344,12 +260,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode) int nfs_fscache_release_page(struct page *page, gfp_t gfp) { if (PageFsCache(page)) { - struct nfs_inode *nfsi = NFS_I(page->mapping->host); - struct fscache_cookie *cookie = nfsi->fscache; + struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host); BUG_ON(!cookie); dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", - cookie, page, nfsi); + cookie, page, NFS_I(page->mapping->host)); if (!fscache_maybe_release_page(cookie, page, gfp)) return 0; @@ -367,13 +282,12 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp) */ void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode) { - struct nfs_inode *nfsi = NFS_I(inode); - struct fscache_cookie *cookie = nfsi->fscache; + struct fscache_cookie *cookie = nfs_i_fscache(inode); BUG_ON(!cookie); dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", - cookie, page, nfsi); + cookie, page, NFS_I(inode)); fscache_wait_on_page_write(cookie, page); @@ -417,9 +331,9 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, dfprintk(FSCACHE, "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", - NFS_I(inode)->fscache, page, page->index, page->flags, inode); + nfs_i_fscache(inode), page, page->index, page->flags, inode); - ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache, + ret = fscache_read_or_alloc_page(nfs_i_fscache(inode), page, nfs_readpage_from_fscache_complete, ctx, @@ -459,9 +373,9 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx, int ret; dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", - NFS_I(inode)->fscache, npages, inode); + nfs_i_fscache(inode), npages, inode); - ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache, + ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode), mapping, pages, nr_pages, nfs_readpage_from_fscache_complete, ctx, @@ -506,15 +420,15 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) dfprintk(FSCACHE, "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", - NFS_I(inode)->fscache, page, page->index, page->flags, sync); + nfs_i_fscache(inode), page, page->index, page->flags, sync); - ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL); + ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL); dfprintk(FSCACHE, "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", page, page->index, page->flags, ret); if (ret != 0) { - fscache_uncache_page(NFS_I(inode)->fscache, page); + fscache_uncache_page(nfs_i_fscache(inode), page); nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1); nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1); diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 4ecb76652eba..d7fe3e799f2f 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -76,11 +76,9 @@ extern void nfs_fscache_release_client_cookie(struct nfs_client *); extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int); extern void nfs_fscache_release_super_cookie(struct super_block *); -extern void nfs_fscache_init_inode_cookie(struct inode *); -extern void nfs_fscache_release_inode_cookie(struct inode *); -extern void nfs_fscache_zap_inode_cookie(struct inode *); -extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *); -extern void nfs_fscache_reset_inode_cookie(struct inode *); +extern void nfs_fscache_init_inode(struct inode *); +extern void nfs_fscache_clear_inode(struct inode *); +extern void nfs_fscache_open_file(struct inode *, struct file *); extern void __nfs_fscache_invalidate_page(struct page *, struct inode *); extern int nfs_fscache_release_page(struct page *, gfp_t); @@ -187,12 +185,10 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} -static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_set_inode_cookie(struct inode *inode, - struct file *filp) {} -static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_init_inode(struct inode *inode) {} +static inline void nfs_fscache_clear_inode(struct inode *inode) {} +static inline void nfs_fscache_open_file(struct inode *inode, + struct file *filp) {} static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index eda8879171c4..18ab2da4eeb6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -122,7 +122,7 @@ void nfs_clear_inode(struct inode *inode) WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files)); nfs_zap_acl_cache(inode); nfs_access_zap_cache(inode); - nfs_fscache_release_inode_cookie(inode); + nfs_fscache_clear_inode(inode); } EXPORT_SYMBOL_GPL(nfs_clear_inode); @@ -274,12 +274,6 @@ void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, if (label == NULL) return; - if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0) - return; - - if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2) - return; - if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) { error = security_inode_notifysecctx(inode, label->label, label->len); @@ -459,7 +453,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfsi->attrtimeo_timestamp = now; nfsi->access_cache = RB_ROOT; - nfs_fscache_init_inode_cookie(inode); + nfs_fscache_init_inode(inode); unlock_new_inode(inode); } else @@ -854,7 +848,7 @@ int nfs_open(struct inode *inode, struct file *filp) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); - nfs_fscache_set_inode_cookie(inode, filp); + nfs_fscache_open_file(inode, filp); return 0; } @@ -923,6 +917,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); + nfs_setsecurity(inode, fattr, label); + dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); @@ -1209,6 +1205,7 @@ u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh) * not on the result */ return nfs_fhandle_hash(fh); } +EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash); /* * _nfs_display_fhandle - display an NFS file handle on the console @@ -1253,6 +1250,7 @@ void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption) } } } +EXPORT_SYMBOL_GPL(_nfs_display_fhandle); #endif /** diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 38da8c2b81ac..bca6a3e3c49c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -88,8 +88,8 @@ struct nfs_parsed_mount_data { unsigned int namlen; unsigned int options; unsigned int bsize; - unsigned int auth_flavor_len; - rpc_authflavor_t auth_flavors[1]; + struct nfs_auth_info auth_info; + rpc_authflavor_t selected_flavor; char *client_address; unsigned int version; unsigned int minorversion; @@ -154,6 +154,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, rpc_authflavor_t); int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); +void nfs_server_remove_lists(struct nfs_server *); void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int); int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, rpc_authflavor_t); @@ -174,6 +175,8 @@ extern struct nfs_server *nfs4_create_server( struct nfs_subversion *); extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, struct nfs_fh *); +extern int nfs4_update_server(struct nfs_server *server, const char *hostname, + struct sockaddr *sap, size_t salen); extern void nfs_free_server(struct nfs_server *server); extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fh *, @@ -323,6 +326,7 @@ extern struct file_system_type nfs_xdev_fs_type; extern struct file_system_type nfs4_xdev_fs_type; extern struct file_system_type nfs4_referral_fs_type; #endif +bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); void nfs_initialise_sb(struct super_block *); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 28842abafab4..3ce79b04522e 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -29,6 +29,8 @@ enum nfs4_client_state { NFS4CLNT_SERVER_SCOPE_MISMATCH, NFS4CLNT_PURGE_STATE, NFS4CLNT_BIND_CONN_TO_SESSION, + NFS4CLNT_MOVED, + NFS4CLNT_LEASE_MOVED, }; #define NFS4_RENEW_TIMEOUT 0x01 @@ -50,6 +52,7 @@ struct nfs4_minor_version_ops { const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; const struct nfs4_state_maintenance_ops *state_renewal_ops; + const struct nfs4_mig_recovery_ops *mig_recovery_ops; }; #define NFS_SEQID_CONFIRMED 1 @@ -203,6 +206,12 @@ struct nfs4_state_maintenance_ops { int (*renew_lease)(struct nfs_client *, struct rpc_cred *); }; +struct nfs4_mig_recovery_ops { + int (*get_locations)(struct inode *, struct nfs4_fs_locations *, + struct page *, struct rpc_cred *); + int (*fsid_present)(struct inode *, struct rpc_cred *); +}; + extern const struct dentry_operations nfs4_dentry_operations; /* dir.c */ @@ -213,10 +222,11 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *, extern struct file_system_type nfs4_fs_type; /* nfs4namespace.c */ -rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *); struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *); struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); +int nfs4_replace_transport(struct nfs_server *server, + const struct nfs4_fs_locations *locations); /* nfs4proc.c */ extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); @@ -231,6 +241,9 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, struct nfs4_fs_locations *, struct page *); +extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *, + struct page *page, struct rpc_cred *); +extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *); extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *, struct nfs_fh *, struct nfs_fattr *); extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); @@ -411,6 +424,8 @@ extern int nfs4_client_recover_expired_lease(struct nfs_client *clp); extern void nfs4_schedule_state_manager(struct nfs_client *); extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); +extern int nfs4_schedule_migration_recovery(const struct nfs_server *); +extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index a860ab566d6e..b4a160a405ce 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -197,6 +197,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; + clp->cl_mig_gen = 1; return clp; error: @@ -368,6 +369,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, if (clp->cl_minorversion != 0) __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); + __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); @@ -924,7 +926,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, dprintk("Server FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); - dprintk("Mount FH: %d\n", mntfh->size); + nfs_display_fhandle(mntfh, "Pseudo-fs root FH"); nfs4_session_set_rwsize(server); @@ -947,9 +949,8 @@ out: * Create a version 4 volume record */ static int nfs4_init_server(struct nfs_server *server, - const struct nfs_parsed_mount_data *data) + struct nfs_parsed_mount_data *data) { - rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; struct rpc_timeout timeparms; int error; @@ -961,9 +962,15 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; server->options = data->options; + server->auth_info = data->auth_info; - if (data->auth_flavor_len >= 1) - pseudoflavor = data->auth_flavors[0]; + /* Use the first specified auth flavor. If this flavor isn't + * allowed by the server, use the SECINFO path to try the + * other specified flavors */ + if (data->auth_info.flavor_len >= 1) + data->selected_flavor = data->auth_info.flavors[0]; + else + data->selected_flavor = RPC_AUTH_UNIX; /* Get a client record */ error = nfs4_set_client(server, @@ -971,7 +978,7 @@ static int nfs4_init_server(struct nfs_server *server, (const struct sockaddr *)&data->nfs_server.address, data->nfs_server.addrlen, data->client_address, - pseudoflavor, + data->selected_flavor, data->nfs_server.protocol, &timeparms, data->minorversion, @@ -991,7 +998,8 @@ static int nfs4_init_server(struct nfs_server *server, server->port = data->nfs_server.port; - error = nfs_init_server_rpcclient(server, &timeparms, pseudoflavor); + error = nfs_init_server_rpcclient(server, &timeparms, + data->selected_flavor); error: /* Done */ @@ -1018,7 +1026,7 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, if (!server) return ERR_PTR(-ENOMEM); - auth_probe = mount_info->parsed->auth_flavor_len < 1; + auth_probe = mount_info->parsed->auth_info.flavor_len < 1; /* set up the general RPC client */ error = nfs4_init_server(server, mount_info->parsed); @@ -1046,6 +1054,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, { struct nfs_client *parent_client; struct nfs_server *server, *parent_server; + bool auth_probe; int error; dprintk("--> nfs4_create_referral_server()\n"); @@ -1078,8 +1087,9 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (error < 0) goto error; - error = nfs4_server_common_setup(server, mntfh, - !(parent_server->flags & NFS_MOUNT_SECFLAVOUR)); + auth_probe = parent_server->auth_info.flavor_len < 1; + + error = nfs4_server_common_setup(server, mntfh, auth_probe); if (error < 0) goto error; @@ -1091,3 +1101,111 @@ error: dprintk("<-- nfs4_create_referral_server() = error %d\n", error); return ERR_PTR(error); } + +/* + * Grab the destination's particulars, including lease expiry time. + * + * Returns zero if probe succeeded and retrieved FSID matches the FSID + * we have cached. + */ +static int nfs_probe_destination(struct nfs_server *server) +{ + struct inode *inode = server->super->s_root->d_inode; + struct nfs_fattr *fattr; + int error; + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return -ENOMEM; + + /* Sanity: the probe won't work if the destination server + * does not recognize the migrated FH. */ + error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr); + + nfs_free_fattr(fattr); + return error; +} + +/** + * nfs4_update_server - Move an nfs_server to a different nfs_client + * + * @server: represents FSID to be moved + * @hostname: new end-point's hostname + * @sap: new end-point's socket address + * @salen: size of "sap" + * + * The nfs_server must be quiescent before this function is invoked. + * Either its session is drained (NFSv4.1+), or its transport is + * plugged and drained (NFSv4.0). + * + * Returns zero on success, or a negative errno value. + */ +int nfs4_update_server(struct nfs_server *server, const char *hostname, + struct sockaddr *sap, size_t salen) +{ + struct nfs_client *clp = server->nfs_client; + struct rpc_clnt *clnt = server->client; + struct xprt_create xargs = { + .ident = clp->cl_proto, + .net = &init_net, + .dstaddr = sap, + .addrlen = salen, + .servername = hostname, + }; + char buf[INET6_ADDRSTRLEN + 1]; + struct sockaddr_storage address; + struct sockaddr *localaddr = (struct sockaddr *)&address; + int error; + + dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + hostname); + + error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout); + if (error != 0) { + dprintk("<-- %s(): rpc_switch_client_transport returned %d\n", + __func__, error); + goto out; + } + + error = rpc_localaddr(clnt, localaddr, sizeof(address)); + if (error != 0) { + dprintk("<-- %s(): rpc_localaddr returned %d\n", + __func__, error); + goto out; + } + + error = -EAFNOSUPPORT; + if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) { + dprintk("<-- %s(): rpc_ntop returned %d\n", + __func__, error); + goto out; + } + + nfs_server_remove_lists(server); + error = nfs4_set_client(server, hostname, sap, salen, buf, + clp->cl_rpcclient->cl_auth->au_flavor, + clp->cl_proto, clnt->cl_timeout, + clp->cl_minorversion, clp->cl_net); + nfs_put_client(clp); + if (error != 0) { + nfs_server_insert_lists(server); + dprintk("<-- %s(): nfs4_set_client returned %d\n", + __func__, error); + goto out; + } + + if (server->nfs_client->cl_hostname == NULL) + server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); + nfs_server_insert_lists(server); + + error = nfs_probe_destination(server); + if (error < 0) + goto out; + + dprintk("<-- %s() succeeded\n", __func__); + +out: + return error; +} diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 77efaf15ec90..1f01b55692ee 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -75,7 +75,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); nfs_file_set_open_context(filp, ctx); - nfs_fscache_set_inode_cookie(inode, filp); + nfs_fscache_open_file(inode, filp); err = 0; out_put_ctx: diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 2288cd3c9278..c08cbf40c59e 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -137,6 +137,7 @@ static size_t nfs_parse_server_name(char *string, size_t len, /** * nfs_find_best_sec - Find a security mechanism supported locally + * @server: NFS server struct * @flavors: List of security tuples returned by SECINFO procedure * * Return the pseudoflavor of the first security mechanism in @@ -145,7 +146,8 @@ static size_t nfs_parse_server_name(char *string, size_t len, * is searched in the order returned from the server, per RFC 3530 * recommendation. */ -rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) +static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server, + struct nfs4_secinfo_flavors *flavors) { rpc_authflavor_t pseudoflavor; struct nfs4_secinfo4 *secinfo; @@ -160,12 +162,19 @@ rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) case RPC_AUTH_GSS: pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor, &secinfo->flavor_info); - if (pseudoflavor != RPC_AUTH_MAXFLAVOR) + /* make sure pseudoflavor matches sec= mount opt */ + if (pseudoflavor != RPC_AUTH_MAXFLAVOR && + nfs_auth_info_match(&server->auth_info, + pseudoflavor)) return pseudoflavor; break; } } + /* if there were any sec= options then nothing matched */ + if (server->auth_info.flavor_len > 0) + return -EPERM; + return RPC_AUTH_UNIX; } @@ -187,7 +196,7 @@ static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr goto out; } - flavor = nfs_find_best_sec(flavors); + flavor = nfs_find_best_sec(NFS_SERVER(inode), flavors); out: put_page(page); @@ -390,7 +399,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, if (client->cl_auth->au_flavor != flavor) flavor = client->cl_auth->au_flavor; - else if (!(server->flags & NFS_MOUNT_SECFLAVOUR)) { + else { rpc_authflavor_t new = nfs4_negotiate_security(dir, name); if ((int)new >= 0) flavor = new; @@ -400,3 +409,104 @@ out: rpc_shutdown_client(client); return mnt; } + +/* + * Try one location from the fs_locations array. + * + * Returns zero on success, or a negative errno value. + */ +static int nfs4_try_replacing_one_location(struct nfs_server *server, + char *page, char *page2, + const struct nfs4_fs_location *location) +{ + const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct sockaddr *sap; + unsigned int s; + size_t salen; + int error; + + sap = kmalloc(addr_bufsize, GFP_KERNEL); + if (sap == NULL) + return -ENOMEM; + + error = -ENOENT; + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; + char *hostname; + + if (buf->len <= 0 || buf->len > PAGE_SIZE) + continue; + + if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len) != NULL) + continue; + + salen = nfs_parse_server_name(buf->data, buf->len, + sap, addr_bufsize, server); + if (salen == 0) + continue; + rpc_set_port(sap, NFS_PORT); + + error = -ENOMEM; + hostname = kstrndup(buf->data, buf->len, GFP_KERNEL); + if (hostname == NULL) + break; + + error = nfs4_update_server(server, hostname, sap, salen); + kfree(hostname); + if (error == 0) + break; + } + + kfree(sap); + return error; +} + +/** + * nfs4_replace_transport - set up transport to destination server + * + * @server: export being migrated + * @locations: fs_locations array + * + * Returns zero on success, or a negative errno value. + * + * The client tries all the entries in the "locations" array, in the + * order returned by the server, until one works or the end of the + * array is reached. + */ +int nfs4_replace_transport(struct nfs_server *server, + const struct nfs4_fs_locations *locations) +{ + char *page = NULL, *page2 = NULL; + int loc, error; + + error = -ENOENT; + if (locations == NULL || locations->nlocations <= 0) + goto out; + + error = -ENOMEM; + page = (char *) __get_free_page(GFP_USER); + if (!page) + goto out; + page2 = (char *) __get_free_page(GFP_USER); + if (!page2) + goto out; + + for (loc = 0; loc < locations->nlocations; loc++) { + const struct nfs4_fs_location *location = + &locations->locations[loc]; + + if (location == NULL || location->nservers <= 0 || + location->rootpath.ncomponents == 0) + continue; + + error = nfs4_try_replacing_one_location(server, page, + page2, location); + if (error == 0) + break; + } + +out: + free_page((unsigned long)page); + free_page((unsigned long)page2); + return error; +} diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d53d6785cba2..5ab33c0792df 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -105,9 +105,6 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry, if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0) return NULL; - if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2) - return NULL; - err = security_dentry_init_security(dentry, sattr->ia_mode, &dentry->d_name, (void **)&label->label, &label->len); if (err == 0) @@ -384,6 +381,14 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); goto wait_on_recovery; + case -NFS4ERR_MOVED: + ret = nfs4_schedule_migration_recovery(server); + if (ret < 0) + break; + goto wait_on_recovery; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(clp); + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -431,6 +436,8 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc return nfs4_map_errors(ret); wait_on_recovery: ret = nfs4_wait_clnt_recover(clp); + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + return -EIO; if (ret == 0) exception->retry = 1; return ret; @@ -1318,31 +1325,24 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) int ret; if (!data->rpc_done) { - ret = data->rpc_status; - goto err; + if (data->rpc_status) { + ret = data->rpc_status; + goto err; + } + /* cached opens have already been processed */ + goto update; } - ret = -ESTALE; - if (!(data->f_attr.valid & NFS_ATTR_FATTR_TYPE) || - !(data->f_attr.valid & NFS_ATTR_FATTR_FILEID) || - !(data->f_attr.valid & NFS_ATTR_FATTR_CHANGE)) - goto err; - - ret = -ENOMEM; - state = nfs4_get_open_state(inode, data->owner); - if (state == NULL) - goto err; - ret = nfs_refresh_inode(inode, &data->f_attr); if (ret) goto err; - nfs_setsecurity(inode, &data->f_attr, data->f_label); - if (data->o_res.delegation_type != 0) nfs4_opendata_check_deleg(data, state); +update: update_open_stateid(state, &data->o_res.stateid, NULL, data->o_arg.fmode); + atomic_inc(&state->count); return state; err: @@ -1575,6 +1575,12 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct /* Don't recall a delegation if it was lost */ nfs4_schedule_lease_recovery(server->nfs_client); return -EAGAIN; + case -NFS4ERR_MOVED: + nfs4_schedule_migration_recovery(server); + return -EAGAIN; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(server->nfs_client); + return -EAGAIN; case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: @@ -2697,6 +2703,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) nfs4_close_state(ctx->state, ctx->mode); } +#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) +#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_CHANGE_SECURITY_LABEL - 1UL) + static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { struct nfs4_server_caps_arg args = { @@ -2712,12 +2722,25 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { + /* Sanity check the server answers */ + switch (server->nfs_client->cl_minorversion) { + case 0: + res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK; + res.attr_bitmask[2] = 0; + break; + case 1: + res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK; + break; + case 2: + res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK; + } memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| NFS_CAP_SYMLINKS|NFS_CAP_FILEID| NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER| NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| - NFS_CAP_CTIME|NFS_CAP_MTIME); + NFS_CAP_CTIME|NFS_CAP_MTIME| + NFS_CAP_SECURITY_LABEL); if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) server->caps |= NFS_CAP_ACLS; if (res.has_links != 0) @@ -2746,14 +2769,12 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f #endif memcpy(server->attr_bitmask_nl, res.attr_bitmask, sizeof(server->attr_bitmask)); + server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; - if (server->caps & NFS_CAP_SECURITY_LABEL) { - server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; - res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; - } memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + server->cache_consistency_bitmask[2] = 0; server->acl_bitmask = res.acl_bitmask; server->fh_expire_type = res.fh_expire_type; } @@ -2864,11 +2885,24 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, int status = -EPERM; size_t i; - for (i = 0; i < ARRAY_SIZE(flav_array); i++) { - status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); - if (status == -NFS4ERR_WRONGSEC || status == -EACCES) - continue; - break; + if (server->auth_info.flavor_len > 0) { + /* try each flavor specified by user */ + for (i = 0; i < server->auth_info.flavor_len; i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, + server->auth_info.flavors[i]); + if (status == -NFS4ERR_WRONGSEC || status == -EACCES) + continue; + break; + } + } else { + /* no flavors specified by user, try default list */ + for (i = 0; i < ARRAY_SIZE(flav_array); i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, + flav_array[i]); + if (status == -NFS4ERR_WRONGSEC || status == -EACCES) + continue; + break; + } } /* @@ -2910,9 +2944,6 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, status = nfs4_lookup_root(server, fhandle, info); if (status != -NFS4ERR_WRONGSEC) break; - /* Did user force a 'sec=' mount option? */ - if (server->flags & NFS_MOUNT_SECFLAVOUR) - break; default: status = nfs4_do_find_root_sec(server, fhandle, info); } @@ -2981,11 +3012,16 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, status = nfs4_proc_fs_locations(client, dir, name, locations, page); if (status != 0) goto out; - /* Make sure server returned a different fsid for the referral */ + + /* + * If the fsid didn't change, this is a migration event, not a + * referral. Cause us to drop into the exception handler, which + * will kick off migration recovery. + */ if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { dprintk("%s: server did not return a different fsid for" " a referral at %s\n", __func__, name->name); - status = -EIO; + status = -NFS4ERR_MOVED; goto out; } /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ @@ -3165,9 +3201,6 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, err = -EPERM; if (client != *clnt) goto out; - /* No security negotiation if the user specified 'sec=' */ - if (NFS_SERVER(dir)->flags & NFS_MOUNT_SECFLAVOUR) - goto out; client = nfs4_create_sec_client(client, dir, name); if (IS_ERR(client)) return PTR_ERR(client); @@ -4221,7 +4254,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata) unsigned long timestamp = data->timestamp; trace_nfs4_renew_async(clp, task->tk_status); - if (task->tk_status < 0) { + switch (task->tk_status) { + case 0: + break; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(clp); + break; + default: /* Unless we're shutting down, schedule state recovery! */ if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0) return; @@ -4575,7 +4614,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, struct nfs4_label label = {0, 0, buflen, buf}; u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; - struct nfs4_getattr_arg args = { + struct nfs4_getattr_arg arg = { .fh = NFS_FH(inode), .bitmask = bitmask, }; @@ -4586,14 +4625,14 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR], - .rpc_argp = &args, + .rpc_argp = &arg, .rpc_resp = &res, }; int ret; nfs_fattr_init(&fattr); - ret = rpc_call_sync(server->client, &msg, 0); + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); if (ret) return ret; if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) @@ -4630,7 +4669,7 @@ static int _nfs4_do_set_security_label(struct inode *inode, struct iattr sattr = {0}; struct nfs_server *server = NFS_SERVER(inode); const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; - struct nfs_setattrargs args = { + struct nfs_setattrargs arg = { .fh = NFS_FH(inode), .iap = &sattr, .server = server, @@ -4644,14 +4683,14 @@ static int _nfs4_do_set_security_label(struct inode *inode, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], - .rpc_argp = &args, + .rpc_argp = &arg, .rpc_resp = &res, }; int status; - nfs4_stateid_copy(&args.stateid, &zero_stateid); + nfs4_stateid_copy(&arg.stateid, &zero_stateid); - status = rpc_call_sync(server->client, &msg, 0); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (status) dprintk("%s failed: %d\n", __func__, status); @@ -4735,17 +4774,24 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, if (state == NULL) break; if (nfs4_schedule_stateid_recovery(server, state) < 0) - goto stateid_invalid; + goto recovery_failed; goto wait_on_recovery; case -NFS4ERR_EXPIRED: if (state != NULL) { if (nfs4_schedule_stateid_recovery(server, state) < 0) - goto stateid_invalid; + goto recovery_failed; } case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); goto wait_on_recovery; + case -NFS4ERR_MOVED: + if (nfs4_schedule_migration_recovery(server) < 0) + goto recovery_failed; + goto wait_on_recovery; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(clp); + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -4757,29 +4803,28 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, dprintk("%s ERROR %d, Reset session\n", __func__, task->tk_status); nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); - task->tk_status = 0; - return -EAGAIN; + goto restart_call; #endif /* CONFIG_NFS_V4_1 */ case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: rpc_delay(task, NFS4_POLL_RETRY_MAX); - task->tk_status = 0; - return -EAGAIN; case -NFS4ERR_RETRY_UNCACHED_REP: case -NFS4ERR_OLD_STATEID: - task->tk_status = 0; - return -EAGAIN; + goto restart_call; } task->tk_status = nfs4_map_errors(task->tk_status); return 0; -stateid_invalid: +recovery_failed: task->tk_status = -EIO; return 0; wait_on_recovery: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + goto recovery_failed; +restart_call: task->tk_status = 0; return -EAGAIN; } @@ -5106,6 +5151,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock status = 0; } request->fl_ops->fl_release_private(request); + request->fl_ops = NULL; out: return status; } @@ -5779,6 +5825,7 @@ struct nfs_release_lockowner_data { struct nfs_release_lockowner_args args; struct nfs4_sequence_args seq_args; struct nfs4_sequence_res seq_res; + unsigned long timestamp; }; static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata) @@ -5786,12 +5833,27 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata struct nfs_release_lockowner_data *data = calldata; nfs40_setup_sequence(data->server, &data->seq_args, &data->seq_res, task); + data->timestamp = jiffies; } static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata) { struct nfs_release_lockowner_data *data = calldata; + struct nfs_server *server = data->server; + nfs40_sequence_done(task, &data->seq_res); + + switch (task->tk_status) { + case 0: + renew_lease(server, data->timestamp); + break; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_LEASE_MOVED: + case -NFS4ERR_DELAY: + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } } static void nfs4_release_lockowner_release(void *calldata) @@ -5990,6 +6052,283 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, return err; } +/* + * This operation also signals the server that this client is + * performing migration recovery. The server can stop returning + * NFS4ERR_LEASE_MOVED to this client. A RENEW operation is + * appended to this compound to identify the client ID which is + * performing recovery. + */ +static int _nfs40_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + u32 bitmask[2] = { + [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, + }; + struct nfs4_fs_locations_arg args = { + .clientid = server->nfs_client->cl_clientid, + .fh = NFS_FH(inode), + .page = page, + .bitmask = bitmask, + .migration = 1, /* skip LOOKUP */ + .renew = 1, /* append RENEW */ + }; + struct nfs4_fs_locations_res res = { + .fs_locations = locations, + .migration = 1, + .renew = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + unsigned long now = jiffies; + int status; + + nfs_fattr_init(&locations->fattr); + locations->server = server; + locations->nlocations = 0; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + if (status) + return status; + + renew_lease(server, now); + return 0; +} + +#ifdef CONFIG_NFS_V4_1 + +/* + * This operation also signals the server that this client is + * performing migration recovery. The server can stop asserting + * SEQ4_STATUS_LEASE_MOVED for this client. The client ID + * performing this operation is identified in the SEQUENCE + * operation in this compound. + * + * When the client supports GETATTR(fs_locations_info), it can + * be plumbed in here. + */ +static int _nfs41_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + u32 bitmask[2] = { + [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, + }; + struct nfs4_fs_locations_arg args = { + .fh = NFS_FH(inode), + .page = page, + .bitmask = bitmask, + .migration = 1, /* skip LOOKUP */ + }; + struct nfs4_fs_locations_res res = { + .fs_locations = locations, + .migration = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + nfs_fattr_init(&locations->fattr); + locations->server = server; + locations->nlocations = 0; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + if (status == NFS4_OK && + res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) + status = -NFS4ERR_LEASE_MOVED; + return status; +} + +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_proc_get_locations - discover locations for a migrated FSID + * @inode: inode on FSID that is migrating + * @locations: result of query + * @page: buffer + * @cred: credential to use for this operation + * + * Returns NFS4_OK on success, a negative NFS4ERR status code if the + * operation failed, or a negative errno if a local error occurred. + * + * On success, "locations" is filled in, but if the server has + * no locations information, NFS_ATTR_FATTR_V4_LOCATIONS is not + * asserted. + * + * -NFS4ERR_LEASE_MOVED is returned if the server still has leases + * from this client that require migration recovery. + */ +int nfs4_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + const struct nfs4_mig_recovery_ops *ops = + clp->cl_mvops->mig_recovery_ops; + struct nfs4_exception exception = { }; + int status; + + dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + nfs_display_fhandle(NFS_FH(inode), __func__); + + do { + status = ops->get_locations(inode, locations, page, cred); + if (status != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, status, &exception); + } while (exception.retry); + return status; +} + +/* + * This operation also signals the server that this client is + * performing "lease moved" recovery. The server can stop + * returning NFS4ERR_LEASE_MOVED to this client. A RENEW operation + * is appended to this compound to identify the client ID which is + * performing recovery. + */ +static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct rpc_clnt *clnt = server->client; + struct nfs4_fsid_present_arg args = { + .fh = NFS_FH(inode), + .clientid = clp->cl_clientid, + .renew = 1, /* append RENEW */ + }; + struct nfs4_fsid_present_res res = { + .renew = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + unsigned long now = jiffies; + int status; + + res.fh = nfs_alloc_fhandle(); + if (res.fh == NULL) + return -ENOMEM; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + nfs_free_fhandle(res.fh); + if (status) + return status; + + do_renew_lease(clp, now); + return 0; +} + +#ifdef CONFIG_NFS_V4_1 + +/* + * This operation also signals the server that this client is + * performing "lease moved" recovery. The server can stop asserting + * SEQ4_STATUS_LEASE_MOVED for this client. The client ID performing + * this operation is identified in the SEQUENCE operation in this + * compound. + */ +static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + struct nfs4_fsid_present_arg args = { + .fh = NFS_FH(inode), + }; + struct nfs4_fsid_present_res res = { + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + res.fh = nfs_alloc_fhandle(); + if (res.fh == NULL) + return -ENOMEM; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + nfs_free_fhandle(res.fh); + if (status == NFS4_OK && + res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) + status = -NFS4ERR_LEASE_MOVED; + return status; +} + +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_proc_fsid_present - Is this FSID present or absent on server? + * @inode: inode on FSID to check + * @cred: credential to use for this operation + * + * Server indicates whether the FSID is present, moved, or not + * recognized. This operation is necessary to clear a LEASE_MOVED + * condition for this client ID. + * + * Returns NFS4_OK if the FSID is present on this server, + * -NFS4ERR_MOVED if the FSID is no longer present, a negative + * NFS4ERR code if some error occurred on the server, or a + * negative errno if a local failure occurred. + */ +int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + const struct nfs4_mig_recovery_ops *ops = + clp->cl_mvops->mig_recovery_ops; + struct nfs4_exception exception = { }; + int status; + + dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + nfs_display_fhandle(NFS_FH(inode), __func__); + + do { + status = ops->fsid_present(inode, cred); + if (status != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, status, &exception); + } while (exception.retry); + return status; +} + /** * If 'use_integrity' is true and the state managment nfs_client * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient @@ -6276,8 +6615,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, struct nfs41_exchange_id_args args = { .verifier = &verifier, .client = clp, +#ifdef CONFIG_NFS_V4_1_MIGRATION .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID, + EXCHGID4_FLAG_BIND_PRINC_STATEID | + EXCHGID4_FLAG_SUPP_MOVED_MIGR, +#else + .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID, +#endif }; struct nfs41_exchange_id_res res = { 0 @@ -7616,6 +7961,9 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, break; } + if (!nfs_auth_info_match(&server->auth_info, flavor)) + flavor = RPC_AUTH_MAXFLAVOR; + if (flavor != RPC_AUTH_MAXFLAVOR) { err = nfs4_lookup_root_sec(server, fhandle, info, flavor); @@ -7887,6 +8235,18 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { }; #endif +static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = { + .get_locations = _nfs40_proc_get_locations, + .fsid_present = _nfs40_proc_fsid_present, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = { + .get_locations = _nfs41_proc_get_locations, + .fsid_present = _nfs41_proc_fsid_present, +}; +#endif /* CONFIG_NFS_V4_1 */ + static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, .init_caps = NFS_CAP_READDIRPLUS @@ -7902,6 +8262,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .reboot_recovery_ops = &nfs40_reboot_recovery_ops, .nograce_recovery_ops = &nfs40_nograce_recovery_ops, .state_renewal_ops = &nfs40_state_renewal_ops, + .mig_recovery_ops = &nfs40_mig_recovery_ops, }; #if defined(CONFIG_NFS_V4_1) @@ -7922,6 +8283,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, + .mig_recovery_ops = &nfs41_mig_recovery_ops, }; #endif diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index cc14cbb78b73..c8e729deb4f7 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -239,8 +239,6 @@ static void nfs4_end_drain_session(struct nfs_client *clp) } } -#if defined(CONFIG_NFS_V4_1) - static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl) { set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); @@ -270,6 +268,8 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) return nfs4_drain_slot_tbl(&ses->fc_slot_table); } +#if defined(CONFIG_NFS_V4_1) + static int nfs41_setup_state_renewal(struct nfs_client *clp) { int status; @@ -1197,20 +1197,74 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); +/** + * nfs4_schedule_migration_recovery - trigger migration recovery + * + * @server: FSID that is migrating + * + * Returns zero if recovery has started, otherwise a negative NFS4ERR + * value is returned. + */ +int nfs4_schedule_migration_recovery(const struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + if (server->fh_expire_type != NFS4_FH_PERSISTENT) { + pr_err("NFS: volatile file handles not supported (server %s)\n", + clp->cl_hostname); + return -NFS4ERR_IO; + } + + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + return -NFS4ERR_IO; + + dprintk("%s: scheduling migration recovery for (%llx:%llx) on %s\n", + __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + + set_bit(NFS_MIG_IN_TRANSITION, + &((struct nfs_server *)server)->mig_status); + set_bit(NFS4CLNT_MOVED, &clp->cl_state); + + nfs4_schedule_state_manager(clp); + return 0; +} +EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery); + +/** + * nfs4_schedule_lease_moved_recovery - start lease-moved recovery + * + * @clp: server to check for moved leases + * + */ +void nfs4_schedule_lease_moved_recovery(struct nfs_client *clp) +{ + dprintk("%s: scheduling lease-moved recovery for client ID %llx on %s\n", + __func__, clp->cl_clientid, clp->cl_hostname); + + set_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state); + nfs4_schedule_state_manager(clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_lease_moved_recovery); + int nfs4_wait_clnt_recover(struct nfs_client *clp) { int res; might_sleep(); + atomic_inc(&clp->cl_count); res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, nfs_wait_bit_killable, TASK_KILLABLE); if (res) - return res; - + goto out; if (clp->cl_cons_state < 0) - return clp->cl_cons_state; - return 0; + res = clp->cl_cons_state; +out: + nfs_put_client(clp); + return res; } int nfs4_client_recover_expired_lease(struct nfs_client *clp) @@ -1375,8 +1429,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: goto out; default: - printk(KERN_ERR "NFS: %s: unhandled error %d. " - "Zeroing state\n", __func__, status); + printk(KERN_ERR "NFS: %s: unhandled error %d\n", + __func__, status); case -ENOMEM: case -NFS4ERR_DENIED: case -NFS4ERR_RECLAIM_BAD: @@ -1422,7 +1476,7 @@ restart: if (status >= 0) { status = nfs4_reclaim_locks(state, ops); if (status >= 0) { - if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) { + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) @@ -1439,15 +1493,12 @@ restart: } switch (status) { default: - printk(KERN_ERR "NFS: %s: unhandled error %d. " - "Zeroing state\n", __func__, status); + printk(KERN_ERR "NFS: %s: unhandled error %d\n", + __func__, status); case -ENOENT: case -ENOMEM: case -ESTALE: - /* - * Open state on this file cannot be recovered - * All we can do is revert to using the zero stateid. - */ + /* Open state on this file cannot be recovered */ nfs4_state_mark_recovery_failed(state, status); break; case -EAGAIN: @@ -1628,7 +1679,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) nfs4_state_end_reclaim_reboot(clp); break; case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_LEASE_MOVED: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); nfs4_state_clear_reclaim_reboot(clp); nfs4_state_start_reclaim_reboot(clp); @@ -1829,6 +1879,168 @@ static int nfs4_purge_lease(struct nfs_client *clp) return 0; } +/* + * Try remote migration of one FSID from a source server to a + * destination server. The source server provides a list of + * potential destinations. + * + * Returns zero or a negative NFS4ERR status code. + */ +static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_fs_locations *locations = NULL; + struct inode *inode; + struct page *page; + int status, result; + + dprintk("--> %s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + + result = 0; + page = alloc_page(GFP_KERNEL); + locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (page == NULL || locations == NULL) { + dprintk("<-- %s: no memory\n", __func__); + goto out; + } + + inode = server->super->s_root->d_inode; + result = nfs4_proc_get_locations(inode, locations, page, cred); + if (result) { + dprintk("<-- %s: failed to retrieve fs_locations: %d\n", + __func__, result); + goto out; + } + + result = -NFS4ERR_NXIO; + if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { + dprintk("<-- %s: No fs_locations data, migration skipped\n", + __func__); + goto out; + } + + nfs4_begin_drain_session(clp); + + status = nfs4_replace_transport(server, locations); + if (status != 0) { + dprintk("<-- %s: failed to replace transport: %d\n", + __func__, status); + goto out; + } + + result = 0; + dprintk("<-- %s: migration succeeded\n", __func__); + +out: + if (page != NULL) + __free_page(page); + kfree(locations); + if (result) { + pr_err("NFS: migration recovery failed (server %s)\n", + clp->cl_hostname); + set_bit(NFS_MIG_FAILED, &server->mig_status); + } + return result; +} + +/* + * Returns zero or a negative NFS4ERR status code. + */ +static int nfs4_handle_migration(struct nfs_client *clp) +{ + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + struct nfs_server *server; + struct rpc_cred *cred; + + dprintk("%s: migration reported on \"%s\"\n", __func__, + clp->cl_hostname); + + spin_lock(&clp->cl_lock); + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) + return -NFS4ERR_NOENT; + + clp->cl_mig_gen++; +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + int status; + + if (server->mig_gen == clp->cl_mig_gen) + continue; + server->mig_gen = clp->cl_mig_gen; + + if (!test_and_clear_bit(NFS_MIG_IN_TRANSITION, + &server->mig_status)) + continue; + + rcu_read_unlock(); + status = nfs4_try_migration(server, cred); + if (status < 0) { + put_rpccred(cred); + return status; + } + goto restart; + } + rcu_read_unlock(); + put_rpccred(cred); + return 0; +} + +/* + * Test each nfs_server on the clp's cl_superblocks list to see + * if it's moved to another server. Stop when the server no longer + * returns NFS4ERR_LEASE_MOVED. + */ +static int nfs4_handle_lease_moved(struct nfs_client *clp) +{ + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + struct nfs_server *server; + struct rpc_cred *cred; + + dprintk("%s: lease moved reported on \"%s\"\n", __func__, + clp->cl_hostname); + + spin_lock(&clp->cl_lock); + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) + return -NFS4ERR_NOENT; + + clp->cl_mig_gen++; +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + struct inode *inode; + int status; + + if (server->mig_gen == clp->cl_mig_gen) + continue; + server->mig_gen = clp->cl_mig_gen; + + rcu_read_unlock(); + + inode = server->super->s_root->d_inode; + status = nfs4_proc_fsid_present(inode, cred); + if (status != -NFS4ERR_MOVED) + goto restart; /* wasn't this one */ + if (nfs4_try_migration(server, cred) == -NFS4ERR_LEASE_MOVED) + goto restart; /* there are more */ + goto out; + } + rcu_read_unlock(); + +out: + put_rpccred(cred); + return 0; +} + /** * nfs4_discover_server_trunking - Detect server IP address trunking * @@ -2017,9 +2229,10 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) nfs41_handle_server_reboot(clp); if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | - SEQ4_STATUS_ADMIN_STATE_REVOKED | - SEQ4_STATUS_LEASE_MOVED)) + SEQ4_STATUS_ADMIN_STATE_REVOKED)) nfs41_handle_state_revoked(clp); + if (flags & SEQ4_STATUS_LEASE_MOVED) + nfs4_schedule_lease_moved_recovery(clp); if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) nfs41_handle_recallable_state_revoked(clp); if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT) @@ -2157,7 +2370,20 @@ static void nfs4_state_manager(struct nfs_client *clp) status = nfs4_check_lease(clp); if (status < 0) goto out_error; - continue; + } + + if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) { + section = "migration"; + status = nfs4_handle_migration(clp); + if (status < 0) + goto out_error; + } + + if (test_and_clear_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state)) { + section = "lease moved"; + status = nfs4_handle_lease_moved(clp); + if (status < 0) + goto out_error; } /* First recover reboot state... */ diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index e26acdd1a645..65ab0a0ca1c4 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -261,9 +261,9 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name, res = nfs_follow_remote_path(root_mnt, export_path); - dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n", - IS_ERR(res) ? PTR_ERR(res) : 0, - IS_ERR(res) ? " [error]" : ""); + dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", + PTR_ERR_OR_ZERO(res), + IS_ERR(res) ? " [error]" : ""); return res; } @@ -319,9 +319,9 @@ static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, data->mnt_path = export_path; res = nfs_follow_remote_path(root_mnt, export_path); - dprintk("<-- nfs4_referral_mount() = %ld%s\n", - IS_ERR(res) ? PTR_ERR(res) : 0, - IS_ERR(res) ? " [error]" : ""); + dprintk("<-- nfs4_referral_mount() = %d%s\n", + PTR_ERR_OR_ZERO(res), + IS_ERR(res) ? " [error]" : ""); return res; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 79210d23f607..5be2868c02f1 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -105,12 +105,8 @@ static int nfs4_stat_to_errno(int); #ifdef CONFIG_NFS_V4_SECURITY_LABEL /* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */ #define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN)) -#define encode_readdir_space 24 -#define encode_readdir_bitmask_sz 3 #else #define nfs4_label_maxsz 0 -#define encode_readdir_space 20 -#define encode_readdir_bitmask_sz 2 #endif /* We support only one layout type per file system */ #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) @@ -595,11 +591,13 @@ static int nfs4_stat_to_errno(int); #define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_getattr_maxsz) + encode_getattr_maxsz + \ + encode_renew_maxsz) #define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ - decode_getattr_maxsz) + decode_getattr_maxsz + \ + decode_renew_maxsz) #define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -736,13 +734,15 @@ static int nfs4_stat_to_errno(int); encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_lookup_maxsz + \ - encode_fs_locations_maxsz) + encode_fs_locations_maxsz + \ + encode_renew_maxsz) #define NFS4_dec_fs_locations_sz \ (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_lookup_maxsz + \ - decode_fs_locations_maxsz) + decode_fs_locations_maxsz + \ + decode_renew_maxsz) #define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -751,6 +751,18 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_secinfo_maxsz) +#define NFS4_enc_fsid_present_sz \ + (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getfh_maxsz + \ + encode_renew_maxsz) +#define NFS4_dec_fsid_present_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getfh_maxsz + \ + decode_renew_maxsz) #if defined(CONFIG_NFS_V4_1) #define NFS4_enc_bind_conn_to_session_sz \ (compound_encode_hdr_maxsz + \ @@ -1565,6 +1577,8 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg }; uint32_t dircount = readdir->count >> 1; __be32 *p, verf[2]; + uint32_t attrlen = 0; + unsigned int i; if (readdir->plus) { attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| @@ -1573,26 +1587,27 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; dircount >>= 1; } /* Use mounted_on_fileid only if the server supports it */ if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) attrs[0] |= FATTR4_WORD0_FILEID; + for (i = 0; i < ARRAY_SIZE(attrs); i++) { + attrs[i] &= readdir->bitmask[i]; + if (attrs[i] != 0) + attrlen = i+1; + } encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr); encode_uint64(xdr, readdir->cookie); encode_nfs4_verifier(xdr, &readdir->verifier); - p = reserve_space(xdr, encode_readdir_space); + p = reserve_space(xdr, 12 + (attrlen << 2)); *p++ = cpu_to_be32(dircount); *p++ = cpu_to_be32(readdir->count); - *p++ = cpu_to_be32(encode_readdir_bitmask_sz); - *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); - *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); - if (encode_readdir_bitmask_sz > 2) { - if (hdr->minorversion > 1) - attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; - p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]); - } + *p++ = cpu_to_be32(attrlen); + for (i = 0; i < attrlen; i++) + *p++ = cpu_to_be32(attrs[i]); memcpy(verf, readdir->verifier.data, sizeof(verf)); dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n", @@ -2687,11 +2702,20 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); - encode_putfh(xdr, args->dir_fh, &hdr); - encode_lookup(xdr, args->name, &hdr); - replen = hdr.replen; /* get the attribute into args->page */ - encode_fs_locations(xdr, args->bitmask, &hdr); + if (args->migration) { + encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen; + encode_fs_locations(xdr, args->bitmask, &hdr); + if (args->renew) + encode_renew(xdr, args->clientid, &hdr); + } else { + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); + replen = hdr.replen; + encode_fs_locations(xdr, args->bitmask, &hdr); + } + /* Set up reply kvec to capture returned fs_locations array. */ xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page, 0, PAGE_SIZE); encode_nops(&hdr); @@ -2715,6 +2739,26 @@ static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode FSID_PRESENT request + */ +static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fsid_present_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getfh(xdr, &hdr); + if (args->renew) + encode_renew(xdr, args->clientid, &hdr); + encode_nops(&hdr); +} + #if defined(CONFIG_NFS_V4_1) /* * BIND_CONN_TO_SESSION request @@ -6824,13 +6868,26 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, status = decode_putfh(xdr); if (status) goto out; - status = decode_lookup(xdr); - if (status) - goto out; - xdr_enter_page(xdr, PAGE_SIZE); - status = decode_getfattr_generic(xdr, &res->fs_locations->fattr, + if (res->migration) { + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr_generic(xdr, + &res->fs_locations->fattr, + NULL, res->fs_locations, + NULL, res->fs_locations->server); + if (status) + goto out; + if (res->renew) + status = decode_renew(xdr); + } else { + status = decode_lookup(xdr); + if (status) + goto out; + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr_generic(xdr, + &res->fs_locations->fattr, NULL, res->fs_locations, NULL, res->fs_locations->server); + } out: return status; } @@ -6859,6 +6916,34 @@ out: return status; } +/* + * Decode FSID_PRESENT response + */ +static int nfs4_xdr_dec_fsid_present(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_fsid_present_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status) + goto out; + if (res->renew) + status = decode_renew(xdr); +out: + return status; +} + #if defined(CONFIG_NFS_V4_1) /* * Decode BIND_CONN_TO_SESSION response @@ -7373,6 +7458,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), PROC(SECINFO, enc_secinfo, dec_secinfo), + PROC(FSID_PRESENT, enc_fsid_present, dec_fsid_present), #if defined(CONFIG_NFS_V4_1) PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), PROC(CREATE_SESSION, enc_create_session, dec_create_session), diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a03b9c6f9489..317d6fc2160e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -497,7 +497,8 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) static const struct { rpc_authflavor_t flavour; const char *str; - } sec_flavours[] = { + } sec_flavours[NFS_AUTH_INFO_MAX_FLAVORS] = { + /* update NFS_AUTH_INFO_MAX_FLAVORS when this list changes! */ { RPC_AUTH_NULL, "null" }, { RPC_AUTH_UNIX, "sys" }, { RPC_AUTH_GSS_KRB5, "krb5" }, @@ -923,8 +924,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) data->mount_server.port = NFS_UNSPEC_PORT; data->nfs_server.port = NFS_UNSPEC_PORT; data->nfs_server.protocol = XPRT_TRANSPORT_TCP; - data->auth_flavors[0] = RPC_AUTH_MAXFLAVOR; - data->auth_flavor_len = 0; + data->selected_flavor = RPC_AUTH_MAXFLAVOR; data->minorversion = 0; data->need_mount = true; data->net = current->nsproxy->net_ns; @@ -1019,12 +1019,51 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) } } -static void nfs_set_auth_parsed_mount_data(struct nfs_parsed_mount_data *data, - rpc_authflavor_t pseudoflavor) +/* + * Add 'flavor' to 'auth_info' if not already present. + * Returns true if 'flavor' ends up in the list, false otherwise + */ +static bool nfs_auth_info_add(struct nfs_auth_info *auth_info, + rpc_authflavor_t flavor) +{ + unsigned int i; + unsigned int max_flavor_len = (sizeof(auth_info->flavors) / + sizeof(auth_info->flavors[0])); + + /* make sure this flavor isn't already in the list */ + for (i = 0; i < auth_info->flavor_len; i++) { + if (flavor == auth_info->flavors[i]) + return true; + } + + if (auth_info->flavor_len + 1 >= max_flavor_len) { + dfprintk(MOUNT, "NFS: too many sec= flavors\n"); + return false; + } + + auth_info->flavors[auth_info->flavor_len++] = flavor; + return true; +} + +/* + * Return true if 'match' is in auth_info or auth_info is empty. + * Return false otherwise. + */ +bool nfs_auth_info_match(const struct nfs_auth_info *auth_info, + rpc_authflavor_t match) { - data->auth_flavors[0] = pseudoflavor; - data->auth_flavor_len = 1; + int i; + + if (!auth_info->flavor_len) + return true; + + for (i = 0; i < auth_info->flavor_len; i++) { + if (auth_info->flavors[i] == match) + return true; + } + return false; } +EXPORT_SYMBOL_GPL(nfs_auth_info_match); /* * Parse the value of the 'sec=' option. @@ -1034,49 +1073,55 @@ static int nfs_parse_security_flavors(char *value, { substring_t args[MAX_OPT_ARGS]; rpc_authflavor_t pseudoflavor; + char *p; dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); - switch (match_token(value, nfs_secflavor_tokens, args)) { - case Opt_sec_none: - pseudoflavor = RPC_AUTH_NULL; - break; - case Opt_sec_sys: - pseudoflavor = RPC_AUTH_UNIX; - break; - case Opt_sec_krb5: - pseudoflavor = RPC_AUTH_GSS_KRB5; - break; - case Opt_sec_krb5i: - pseudoflavor = RPC_AUTH_GSS_KRB5I; - break; - case Opt_sec_krb5p: - pseudoflavor = RPC_AUTH_GSS_KRB5P; - break; - case Opt_sec_lkey: - pseudoflavor = RPC_AUTH_GSS_LKEY; - break; - case Opt_sec_lkeyi: - pseudoflavor = RPC_AUTH_GSS_LKEYI; - break; - case Opt_sec_lkeyp: - pseudoflavor = RPC_AUTH_GSS_LKEYP; - break; - case Opt_sec_spkm: - pseudoflavor = RPC_AUTH_GSS_SPKM; - break; - case Opt_sec_spkmi: - pseudoflavor = RPC_AUTH_GSS_SPKMI; - break; - case Opt_sec_spkmp: - pseudoflavor = RPC_AUTH_GSS_SPKMP; - break; - default: - return 0; + while ((p = strsep(&value, ":")) != NULL) { + switch (match_token(p, nfs_secflavor_tokens, args)) { + case Opt_sec_none: + pseudoflavor = RPC_AUTH_NULL; + break; + case Opt_sec_sys: + pseudoflavor = RPC_AUTH_UNIX; + break; + case Opt_sec_krb5: + pseudoflavor = RPC_AUTH_GSS_KRB5; + break; + case Opt_sec_krb5i: + pseudoflavor = RPC_AUTH_GSS_KRB5I; + break; + case Opt_sec_krb5p: + pseudoflavor = RPC_AUTH_GSS_KRB5P; + break; + case Opt_sec_lkey: + pseudoflavor = RPC_AUTH_GSS_LKEY; + break; + case Opt_sec_lkeyi: + pseudoflavor = RPC_AUTH_GSS_LKEYI; + break; + case Opt_sec_lkeyp: + pseudoflavor = RPC_AUTH_GSS_LKEYP; + break; + case Opt_sec_spkm: + pseudoflavor = RPC_AUTH_GSS_SPKM; + break; + case Opt_sec_spkmi: + pseudoflavor = RPC_AUTH_GSS_SPKMI; + break; + case Opt_sec_spkmp: + pseudoflavor = RPC_AUTH_GSS_SPKMP; + break; + default: + dfprintk(MOUNT, + "NFS: sec= option '%s' not recognized\n", p); + return 0; + } + + if (!nfs_auth_info_add(&mnt->auth_info, pseudoflavor)) + return 0; } - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - nfs_set_auth_parsed_mount_data(mnt, pseudoflavor); return 1; } @@ -1623,12 +1668,14 @@ out_security_failure: } /* - * Ensure that the specified authtype in args->auth_flavors[0] is supported by - * the server. Returns 0 if it's ok, and -EACCES if not. + * Ensure that a specified authtype in args->auth_info is supported by + * the server. Returns 0 and sets args->selected_flavor if it's ok, and + * -EACCES if not. */ -static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args, +static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, rpc_authflavor_t *server_authlist, unsigned int count) { + rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; unsigned int i; /* @@ -1640,17 +1687,20 @@ static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args, * can be used. */ for (i = 0; i < count; i++) { - if (args->auth_flavors[0] == server_authlist[i] || - server_authlist[i] == RPC_AUTH_NULL) + flavor = server_authlist[i]; + + if (nfs_auth_info_match(&args->auth_info, flavor) || + flavor == RPC_AUTH_NULL) goto out; } - dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n", - args->auth_flavors[0]); + dfprintk(MOUNT, + "NFS: specified auth flavors not supported by server\n"); return -EACCES; out: - dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]); + args->selected_flavor = flavor; + dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor); return 0; } @@ -1738,9 +1788,10 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf * Was a sec= authflavor specified in the options? First, verify * whether the server supports it, and then just try to use it if so. */ - if (args->auth_flavor_len > 0) { - status = nfs_verify_authflavor(args, authlist, authlist_len); - dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]); + if (args->auth_info.flavor_len > 0) { + status = nfs_verify_authflavors(args, authlist, authlist_len); + dfprintk(MOUNT, "NFS: using auth flavor %u\n", + args->selected_flavor); if (status) return ERR_PTR(status); return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); @@ -1769,7 +1820,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf /* Fallthrough */ } dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor); - nfs_set_auth_parsed_mount_data(args, flavor); + args->selected_flavor = flavor; server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); if (!IS_ERR(server)) return server; @@ -1785,7 +1836,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf /* Last chance! Try AUTH_UNIX */ dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX); - nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); + args->selected_flavor = RPC_AUTH_UNIX; return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); } @@ -1972,9 +2023,9 @@ static int nfs23_validate_mount_data(void *options, args->bsize = data->bsize; if (data->flags & NFS_MOUNT_SECFLAVOUR) - nfs_set_auth_parsed_mount_data(args, data->pseudoflavor); + args->selected_flavor = data->pseudoflavor; else - nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); + args->selected_flavor = RPC_AUTH_UNIX; if (!args->nfs_server.hostname) goto out_nomem; @@ -2108,9 +2159,6 @@ static int nfs_validate_text_mount_data(void *options, nfs_set_port(sap, &args->nfs_server.port, port); - if (args->auth_flavor_len > 1) - goto out_bad_auth; - return nfs_parse_devname(dev_name, &args->nfs_server.hostname, max_namelen, @@ -2130,10 +2178,6 @@ out_invalid_transport_udp: out_no_address: dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); return -EINVAL; - -out_bad_auth: - dfprintk(MOUNT, "NFS: Too many RPC auth flavours specified\n"); - return -EINVAL; } static int @@ -2143,8 +2187,10 @@ nfs_compare_remount_data(struct nfs_server *nfss, if (data->flags != nfss->flags || data->rsize != nfss->rsize || data->wsize != nfss->wsize || + data->version != nfss->nfs_client->rpc_ops->version || + data->minorversion != nfss->nfs_client->cl_minorversion || data->retrans != nfss->client->cl_timeout->to_retries || - data->auth_flavors[0] != nfss->client->cl_auth->au_flavor || + data->selected_flavor != nfss->client->cl_auth->au_flavor || data->acregmin != nfss->acregmin / HZ || data->acregmax != nfss->acregmax / HZ || data->acdirmin != nfss->acdirmin / HZ || @@ -2189,7 +2235,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->rsize = nfss->rsize; data->wsize = nfss->wsize; data->retrans = nfss->client->cl_timeout->to_retries; - nfs_set_auth_parsed_mount_data(data, nfss->client->cl_auth->au_flavor); + data->selected_flavor = nfss->client->cl_auth->au_flavor; + data->auth_info = nfss->auth_info; data->acregmin = nfss->acregmin / HZ; data->acregmax = nfss->acregmax / HZ; data->acdirmin = nfss->acdirmin / HZ; @@ -2197,12 +2244,14 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; data->nfs_server.port = nfss->port; data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; + data->version = nfsvers; + data->minorversion = nfss->nfs_client->cl_minorversion; memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, data->nfs_server.addrlen); /* overwrite those values with any that were specified */ - error = nfs_parse_mount_options((char *)options, data); - if (error < 0) + error = -EINVAL; + if (!nfs_parse_mount_options((char *)options, data)) goto out; /* @@ -2332,7 +2381,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n goto Ebusy; if (a->acdirmax != b->acdirmax) goto Ebusy; - if (b->flags & NFS_MOUNT_SECFLAVOUR && + if (b->auth_info.flavor_len > 0 && clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) goto Ebusy; return 1; @@ -2530,6 +2579,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, mntroot = ERR_PTR(error); goto error_splat_bdi; } + server->super = s; } if (!s->s_root) { @@ -2713,9 +2763,9 @@ static int nfs4_validate_mount_data(void *options, data->auth_flavours, sizeof(pseudoflavor))) return -EFAULT; - nfs_set_auth_parsed_mount_data(args, pseudoflavor); + args->selected_flavor = pseudoflavor; } else - nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); + args->selected_flavor = RPC_AUTH_UNIX; c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); if (IS_ERR(c)) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index bb939edd4c99..0c29b1bb3936 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -493,7 +493,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) unsigned long long fileid; struct dentry *sdentry; struct rpc_task *task; - int error = -EIO; + int error = -EBUSY; dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", dentry->d_parent->d_name.name, dentry->d_name.name, @@ -503,7 +503,6 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) /* * We don't allow a dentry to be silly-renamed twice. */ - error = -EBUSY; if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto out; diff --git a/fs/proc/array.c b/fs/proc/array.c index cbd0f1b324b9..1bd2077187fd 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -183,6 +183,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" + "Ngid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" "TracerPid:\t%d\n" @@ -190,6 +191,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), task_tgid_nr_ns(p, ns), + task_numa_group_id(p), pid_nr_ns(pid, ns), ppid, tpid, from_kuid_munged(user_ns, cred->uid), diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 9f8ef9b7674d..8eaa1ba793fc 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -288,10 +288,14 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct proc_dir_entry *pde = PDE(file_inode(file)); - int rv = -EIO; - unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + unsigned long rv = -EIO; + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL; if (use_pde(pde)) { - get_unmapped_area = pde->proc_fops->get_unmapped_area; +#ifdef CONFIG_MMU + get_unmapped_area = current->mm->get_unmapped_area; +#endif + if (pde->proc_fops->get_unmapped_area) + get_unmapped_area = pde->proc_fops->get_unmapped_area; if (get_unmapped_area) rv = get_unmapped_area(file, orig_addr, len, pgoff, flags); unuse_pde(pde); diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 106a83570630..70779b2fc209 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -14,16 +14,13 @@ #include <linux/of.h> #include <linux/export.h> #include <linux/slab.h> -#include <asm/prom.h> #include <asm/uaccess.h> #include "internal.h" static inline void set_node_proc_entry(struct device_node *np, struct proc_dir_entry *de) { -#ifdef HAVE_ARCH_DEVTREE_FIXUPS np->pde = de; -#endif } static struct proc_dir_entry *proc_device_tree; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7366e9d63cee..390bdab01c3c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -941,6 +941,8 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, frame = pte_pfn(pte); flags = PM_PRESENT; page = vm_normal_page(vma, addr, pte); + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) @@ -960,7 +962,7 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, if (page && !PageAnon(page)) flags |= PM_FILE; - if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte)) + if ((vma->vm_flags & VM_SOFTDIRTY)) flags2 |= __PM_SOFT_DIRTY; *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index dea86e8967ee..2b363e23f36e 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -117,6 +117,7 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr) static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src) { + memset(dst, 0, sizeof(*dst)); dst->dqb_bhardlimit = src->d_blk_hardlimit; dst->dqb_bsoftlimit = src->d_blk_softlimit; dst->dqb_curspace = src->d_bcount; diff --git a/fs/select.c b/fs/select.c index 35d4adc749d9..dfd5cb18c012 100644 --- a/fs/select.c +++ b/fs/select.c @@ -238,8 +238,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state, set_current_state(state); if (!pwq->triggered) - rc = freezable_schedule_hrtimeout_range(expires, slack, - HRTIMER_MODE_ABS); + rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* diff --git a/fs/seq_file.c b/fs/seq_file.c index 3135c2525c76..a290157265ef 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -328,6 +328,8 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) m->read_pos = offset; retval = file->f_pos = offset; } + } else { + file->f_pos = offset; } } file->f_version = m->version; diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile index 7a1ceb946b80..8876ac183373 100644 --- a/fs/sysfs/Makefile +++ b/fs/sysfs/Makefile @@ -2,5 +2,4 @@ # Makefile for the sysfs virtual filesystem # -obj-y := inode.o file.o dir.o symlink.o mount.o bin.o \ - group.o +obj-y := inode.o file.o dir.o symlink.o mount.o group.o diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c deleted file mode 100644 index c590cabd57bb..000000000000 --- a/fs/sysfs/bin.c +++ /dev/null @@ -1,502 +0,0 @@ -/* - * fs/sysfs/bin.c - sysfs binary file implementation - * - * Copyright (c) 2003 Patrick Mochel - * Copyright (c) 2003 Matthew Wilcox - * Copyright (c) 2004 Silicon Graphics, Inc. - * Copyright (c) 2007 SUSE Linux Products GmbH - * Copyright (c) 2007 Tejun Heo <teheo@suse.de> - * - * This file is released under the GPLv2. - * - * Please see Documentation/filesystems/sysfs.txt for more information. - */ - -#undef DEBUG - -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/kernel.h> -#include <linux/kobject.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/mutex.h> -#include <linux/mm.h> -#include <linux/uaccess.h> - -#include "sysfs.h" - -/* - * There's one bin_buffer for each open file. - * - * filp->private_data points to bin_buffer and - * sysfs_dirent->s_bin_attr.buffers points to a the bin_buffer s - * sysfs_dirent->s_bin_attr.buffers is protected by sysfs_bin_lock - */ -static DEFINE_MUTEX(sysfs_bin_lock); - -struct bin_buffer { - struct mutex mutex; - void *buffer; - int mmapped; - const struct vm_operations_struct *vm_ops; - struct file *file; - struct hlist_node list; -}; - -static int -fill_read(struct file *file, char *buffer, loff_t off, size_t count) -{ - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - int rc; - - /* need attr_sd for attr, its parent for kobj */ - if (!sysfs_get_active(attr_sd)) - return -ENODEV; - - rc = -EIO; - if (attr->read) - rc = attr->read(file, kobj, attr, buffer, off, count); - - sysfs_put_active(attr_sd); - - return rc; -} - -static ssize_t -read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) -{ - struct bin_buffer *bb = file->private_data; - int size = file_inode(file)->i_size; - loff_t offs = *off; - int count = min_t(size_t, bytes, PAGE_SIZE); - char *temp; - - if (!bytes) - return 0; - - if (size) { - if (offs > size) - return 0; - if (offs + count > size) - count = size - offs; - } - - temp = kmalloc(count, GFP_KERNEL); - if (!temp) - return -ENOMEM; - - mutex_lock(&bb->mutex); - - count = fill_read(file, bb->buffer, offs, count); - if (count < 0) { - mutex_unlock(&bb->mutex); - goto out_free; - } - - memcpy(temp, bb->buffer, count); - - mutex_unlock(&bb->mutex); - - if (copy_to_user(userbuf, temp, count)) { - count = -EFAULT; - goto out_free; - } - - pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); - - *off = offs + count; - - out_free: - kfree(temp); - return count; -} - -static int -flush_write(struct file *file, char *buffer, loff_t offset, size_t count) -{ - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - int rc; - - /* need attr_sd for attr, its parent for kobj */ - if (!sysfs_get_active(attr_sd)) - return -ENODEV; - - rc = -EIO; - if (attr->write) - rc = attr->write(file, kobj, attr, buffer, offset, count); - - sysfs_put_active(attr_sd); - - return rc; -} - -static ssize_t write(struct file *file, const char __user *userbuf, - size_t bytes, loff_t *off) -{ - struct bin_buffer *bb = file->private_data; - int size = file_inode(file)->i_size; - loff_t offs = *off; - int count = min_t(size_t, bytes, PAGE_SIZE); - char *temp; - - if (!bytes) - return 0; - - if (size) { - if (offs > size) - return 0; - if (offs + count > size) - count = size - offs; - } - - temp = memdup_user(userbuf, count); - if (IS_ERR(temp)) - return PTR_ERR(temp); - - mutex_lock(&bb->mutex); - - memcpy(bb->buffer, temp, count); - - count = flush_write(file, bb->buffer, offs, count); - mutex_unlock(&bb->mutex); - - if (count > 0) - *off = offs + count; - - kfree(temp); - return count; -} - -static void bin_vma_open(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - - if (!bb->vm_ops) - return; - - if (!sysfs_get_active(attr_sd)) - return; - - if (bb->vm_ops->open) - bb->vm_ops->open(vma); - - sysfs_put_active(attr_sd); -} - -static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct file *file = vma->vm_file; - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - int ret; - - if (!bb->vm_ops) - return VM_FAULT_SIGBUS; - - if (!sysfs_get_active(attr_sd)) - return VM_FAULT_SIGBUS; - - ret = VM_FAULT_SIGBUS; - if (bb->vm_ops->fault) - ret = bb->vm_ops->fault(vma, vmf); - - sysfs_put_active(attr_sd); - return ret; -} - -static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct file *file = vma->vm_file; - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - int ret; - - if (!bb->vm_ops) - return VM_FAULT_SIGBUS; - - if (!sysfs_get_active(attr_sd)) - return VM_FAULT_SIGBUS; - - ret = 0; - if (bb->vm_ops->page_mkwrite) - ret = bb->vm_ops->page_mkwrite(vma, vmf); - else - file_update_time(file); - - sysfs_put_active(attr_sd); - return ret; -} - -static int bin_access(struct vm_area_struct *vma, unsigned long addr, - void *buf, int len, int write) -{ - struct file *file = vma->vm_file; - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - int ret; - - if (!bb->vm_ops) - return -EINVAL; - - if (!sysfs_get_active(attr_sd)) - return -EINVAL; - - ret = -EINVAL; - if (bb->vm_ops->access) - ret = bb->vm_ops->access(vma, addr, buf, len, write); - - sysfs_put_active(attr_sd); - return ret; -} - -#ifdef CONFIG_NUMA -static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new) -{ - struct file *file = vma->vm_file; - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - int ret; - - if (!bb->vm_ops) - return 0; - - if (!sysfs_get_active(attr_sd)) - return -EINVAL; - - ret = 0; - if (bb->vm_ops->set_policy) - ret = bb->vm_ops->set_policy(vma, new); - - sysfs_put_active(attr_sd); - return ret; -} - -static struct mempolicy *bin_get_policy(struct vm_area_struct *vma, - unsigned long addr) -{ - struct file *file = vma->vm_file; - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct mempolicy *pol; - - if (!bb->vm_ops) - return vma->vm_policy; - - if (!sysfs_get_active(attr_sd)) - return vma->vm_policy; - - pol = vma->vm_policy; - if (bb->vm_ops->get_policy) - pol = bb->vm_ops->get_policy(vma, addr); - - sysfs_put_active(attr_sd); - return pol; -} - -static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from, - const nodemask_t *to, unsigned long flags) -{ - struct file *file = vma->vm_file; - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - int ret; - - if (!bb->vm_ops) - return 0; - - if (!sysfs_get_active(attr_sd)) - return 0; - - ret = 0; - if (bb->vm_ops->migrate) - ret = bb->vm_ops->migrate(vma, from, to, flags); - - sysfs_put_active(attr_sd); - return ret; -} -#endif - -static const struct vm_operations_struct bin_vm_ops = { - .open = bin_vma_open, - .fault = bin_fault, - .page_mkwrite = bin_page_mkwrite, - .access = bin_access, -#ifdef CONFIG_NUMA - .set_policy = bin_set_policy, - .get_policy = bin_get_policy, - .migrate = bin_migrate, -#endif -}; - -static int mmap(struct file *file, struct vm_area_struct *vma) -{ - struct bin_buffer *bb = file->private_data; - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - int rc; - - mutex_lock(&bb->mutex); - - /* need attr_sd for attr, its parent for kobj */ - rc = -ENODEV; - if (!sysfs_get_active(attr_sd)) - goto out_unlock; - - rc = -EINVAL; - if (!attr->mmap) - goto out_put; - - rc = attr->mmap(file, kobj, attr, vma); - if (rc) - goto out_put; - - /* - * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() - * to satisfy versions of X which crash if the mmap fails: that - * substitutes a new vm_file, and we don't then want bin_vm_ops. - */ - if (vma->vm_file != file) - goto out_put; - - rc = -EINVAL; - if (bb->mmapped && bb->vm_ops != vma->vm_ops) - goto out_put; - - /* - * It is not possible to successfully wrap close. - * So error if someone is trying to use close. - */ - rc = -EINVAL; - if (vma->vm_ops && vma->vm_ops->close) - goto out_put; - - rc = 0; - bb->mmapped = 1; - bb->vm_ops = vma->vm_ops; - vma->vm_ops = &bin_vm_ops; -out_put: - sysfs_put_active(attr_sd); -out_unlock: - mutex_unlock(&bb->mutex); - - return rc; -} - -static int open(struct inode *inode, struct file *file) -{ - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; - struct bin_buffer *bb = NULL; - int error; - - /* binary file operations requires both @sd and its parent */ - if (!sysfs_get_active(attr_sd)) - return -ENODEV; - - error = -EACCES; - if ((file->f_mode & FMODE_WRITE) && !(attr->write || attr->mmap)) - goto err_out; - if ((file->f_mode & FMODE_READ) && !(attr->read || attr->mmap)) - goto err_out; - - error = -ENOMEM; - bb = kzalloc(sizeof(*bb), GFP_KERNEL); - if (!bb) - goto err_out; - - bb->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!bb->buffer) - goto err_out; - - mutex_init(&bb->mutex); - bb->file = file; - file->private_data = bb; - - mutex_lock(&sysfs_bin_lock); - hlist_add_head(&bb->list, &attr_sd->s_bin_attr.buffers); - mutex_unlock(&sysfs_bin_lock); - - /* open succeeded, put active references */ - sysfs_put_active(attr_sd); - return 0; - - err_out: - sysfs_put_active(attr_sd); - kfree(bb); - return error; -} - -static int release(struct inode *inode, struct file *file) -{ - struct bin_buffer *bb = file->private_data; - - mutex_lock(&sysfs_bin_lock); - hlist_del(&bb->list); - mutex_unlock(&sysfs_bin_lock); - - kfree(bb->buffer); - kfree(bb); - return 0; -} - -const struct file_operations bin_fops = { - .read = read, - .write = write, - .mmap = mmap, - .llseek = generic_file_llseek, - .open = open, - .release = release, -}; - - -void unmap_bin_file(struct sysfs_dirent *attr_sd) -{ - struct bin_buffer *bb; - - if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR) - return; - - mutex_lock(&sysfs_bin_lock); - - hlist_for_each_entry(bb, &attr_sd->s_bin_attr.buffers, list) { - struct inode *inode = file_inode(bb->file); - - unmap_mapping_range(inode->i_mapping, 0, 0, 1); - } - - mutex_unlock(&sysfs_bin_lock); -} - -/** - * sysfs_create_bin_file - create binary file for object. - * @kobj: object. - * @attr: attribute descriptor. - */ -int sysfs_create_bin_file(struct kobject *kobj, - const struct bin_attribute *attr) -{ - BUG_ON(!kobj || !kobj->sd || !attr); - - return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); -} -EXPORT_SYMBOL_GPL(sysfs_create_bin_file); - -/** - * sysfs_remove_bin_file - remove binary file for object. - * @kobj: object. - * @attr: attribute descriptor. - */ -void sysfs_remove_bin_file(struct kobject *kobj, - const struct bin_attribute *attr) -{ - sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name); -} -EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 4d83cedb9fcb..5e73d6626e50 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -26,21 +26,21 @@ #include "sysfs.h" DEFINE_MUTEX(sysfs_mutex); -DEFINE_SPINLOCK(sysfs_assoc_lock); +DEFINE_SPINLOCK(sysfs_symlink_target_lock); -#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb); +#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb) static DEFINE_SPINLOCK(sysfs_ino_lock); static DEFINE_IDA(sysfs_ino_ida); /** * sysfs_name_hash - * @ns: Namespace tag to hash * @name: Null terminated string to hash + * @ns: Namespace tag to hash * * Returns 31 bit hash of ns + name (so it fits in an off_t ) */ -static unsigned int sysfs_name_hash(const void *ns, const char *name) +static unsigned int sysfs_name_hash(const char *name, const void *ns) { unsigned long hash = init_name_hash(); unsigned int len = strlen(name); @@ -56,8 +56,8 @@ static unsigned int sysfs_name_hash(const void *ns, const char *name) return hash; } -static int sysfs_name_compare(unsigned int hash, const void *ns, - const char *name, const struct sysfs_dirent *sd) +static int sysfs_name_compare(unsigned int hash, const char *name, + const void *ns, const struct sysfs_dirent *sd) { if (hash != sd->s_hash) return hash - sd->s_hash; @@ -69,7 +69,7 @@ static int sysfs_name_compare(unsigned int hash, const void *ns, static int sysfs_sd_compare(const struct sysfs_dirent *left, const struct sysfs_dirent *right) { - return sysfs_name_compare(left->s_hash, left->s_ns, left->s_name, + return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns, right); } @@ -132,24 +132,6 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children); } -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -/* Test for attributes that want to ignore lockdep for read-locking */ -static bool ignore_lockdep(struct sysfs_dirent *sd) -{ - return sysfs_type(sd) == SYSFS_KOBJ_ATTR && - sd->s_attr.attr->ignore_lockdep; -} - -#else - -static inline bool ignore_lockdep(struct sysfs_dirent *sd) -{ - return true; -} - -#endif - /** * sysfs_get_active - get an active reference to sysfs_dirent * @sd: sysfs_dirent to get an active reference to @@ -168,7 +150,7 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) if (!atomic_inc_unless_negative(&sd->s_active)) return NULL; - if (likely(!ignore_lockdep(sd))) + if (likely(!sysfs_ignore_lockdep(sd))) rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); return sd; } @@ -187,7 +169,7 @@ void sysfs_put_active(struct sysfs_dirent *sd) if (unlikely(!sd)) return; - if (likely(!ignore_lockdep(sd))) + if (likely(!sysfs_ignore_lockdep(sd))) rwsem_release(&sd->dep_map, 1, _RET_IP_); v = atomic_dec_return(&sd->s_active); if (likely(v != SD_DEACTIVATED_BIAS)) @@ -400,22 +382,19 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) /** * sysfs_addrm_start - prepare for sysfs_dirent add/remove * @acxt: pointer to sysfs_addrm_cxt to be used - * @parent_sd: parent sysfs_dirent * - * This function is called when the caller is about to add or - * remove sysfs_dirent under @parent_sd. This function acquires - * sysfs_mutex. @acxt is used to keep and pass context to - * other addrm functions. + * This function is called when the caller is about to add or remove + * sysfs_dirent. This function acquires sysfs_mutex. @acxt is used + * to keep and pass context to other addrm functions. * * LOCKING: * Kernel thread context (may sleep). sysfs_mutex is locked on * return. */ -void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *parent_sd) +void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) + __acquires(sysfs_mutex) { memset(acxt, 0, sizeof(*acxt)); - acxt->parent_sd = parent_sd; mutex_lock(&sysfs_mutex); } @@ -424,10 +403,11 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, * __sysfs_add_one - add sysfs_dirent to parent without warning * @acxt: addrm context to use * @sd: sysfs_dirent to be added + * @parent_sd: the parent sysfs_dirent to add @sd to * - * Get @acxt->parent_sd and set sd->s_parent to it and increment - * nlink of parent inode if @sd is a directory and link into the - * children list of the parent. + * Get @parent_sd and set @sd->s_parent to it and increment nlink of + * the parent inode if @sd is a directory and link into the children + * list of the parent. * * This function should be called between calls to * sysfs_addrm_start() and sysfs_addrm_finish() and should be @@ -440,27 +420,28 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, * 0 on success, -EEXIST if entry with the given name already * exists. */ -int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, + struct sysfs_dirent *parent_sd) { struct sysfs_inode_attrs *ps_iattr; int ret; - if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) { + if (!!sysfs_ns_type(parent_sd) != !!sd->s_ns) { WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(acxt->parent_sd) ? "required" : "invalid", - acxt->parent_sd->s_name, sd->s_name); + sysfs_ns_type(parent_sd) ? "required" : "invalid", + parent_sd->s_name, sd->s_name); return -EINVAL; } - sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name); - sd->s_parent = sysfs_get(acxt->parent_sd); + sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); + sd->s_parent = sysfs_get(parent_sd); ret = sysfs_link_sibling(sd); if (ret) return ret; /* Update timestamps on the parent */ - ps_iattr = acxt->parent_sd->s_iattr; + ps_iattr = parent_sd->s_iattr; if (ps_iattr) { struct iattr *ps_iattrs = &ps_iattr->ia_iattr; ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; @@ -490,14 +471,32 @@ static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) return path; } +void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) +{ + char *path; + + path = kzalloc(PATH_MAX, GFP_KERNEL); + if (path) { + sysfs_pathname(parent, path); + strlcat(path, "/", PATH_MAX); + strlcat(path, name, PATH_MAX); + } + + WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s'\n", + path ? path : name); + + kfree(path); +} + /** * sysfs_add_one - add sysfs_dirent to parent * @acxt: addrm context to use * @sd: sysfs_dirent to be added + * @parent_sd: the parent sysfs_dirent to add @sd to * - * Get @acxt->parent_sd and set sd->s_parent to it and increment - * nlink of parent inode if @sd is a directory and link into the - * children list of the parent. + * Get @parent_sd and set @sd->s_parent to it and increment nlink of + * the parent inode if @sd is a directory and link into the children + * list of the parent. * * This function should be called between calls to * sysfs_addrm_start() and sysfs_addrm_finish() and should be @@ -510,23 +509,15 @@ static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) * 0 on success, -EEXIST if entry with the given name already * exists. */ -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, + struct sysfs_dirent *parent_sd) { int ret; - ret = __sysfs_add_one(acxt, sd); - if (ret == -EEXIST) { - char *path = kzalloc(PATH_MAX, GFP_KERNEL); - WARN(1, KERN_WARNING - "sysfs: cannot create duplicate filename '%s'\n", - (path == NULL) ? sd->s_name - : (sysfs_pathname(acxt->parent_sd, path), - strlcat(path, "/", PATH_MAX), - strlcat(path, sd->s_name, PATH_MAX), - path)); - kfree(path); - } + ret = __sysfs_add_one(acxt, sd, parent_sd); + if (ret == -EEXIST) + sysfs_warn_dup(parent_sd, sd->s_name); return ret; } @@ -545,16 +536,22 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) * LOCKING: * Determined by sysfs_addrm_start(). */ -void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, + struct sysfs_dirent *sd) { struct sysfs_inode_attrs *ps_iattr; - BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED); + /* + * Removal can be called multiple times on the same node. Only the + * first invocation is effective and puts the base ref. + */ + if (sd->s_flags & SYSFS_FLAG_REMOVED) + return; sysfs_unlink_sibling(sd); /* Update timestamps on the parent */ - ps_iattr = acxt->parent_sd->s_iattr; + ps_iattr = sd->s_parent->s_iattr; if (ps_iattr) { struct iattr *ps_iattrs = &ps_iattr->ia_iattr; ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; @@ -577,6 +574,7 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) * sysfs_mutex is released. */ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) + __releases(sysfs_mutex) { /* release resources acquired by sysfs_addrm_start() */ mutex_unlock(&sysfs_mutex); @@ -588,7 +586,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) acxt->removed = sd->u.removed_list; sysfs_deactivate(sd); - unmap_bin_file(sd); + sysfs_unmap_bin_file(sd); sysfs_put(sd); } } @@ -597,6 +595,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) * sysfs_find_dirent - find sysfs_dirent with the given name * @parent_sd: sysfs_dirent to search under * @name: name to look for + * @ns: the namespace tag to use * * Look for sysfs_dirent with name @name under @parent_sd. * @@ -607,8 +606,8 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) * Pointer to sysfs_dirent if found, NULL if not. */ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, - const void *ns, - const unsigned char *name) + const unsigned char *name, + const void *ns) { struct rb_node *node = parent_sd->s_dir.children.rb_node; unsigned int hash; @@ -620,13 +619,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, return NULL; } - hash = sysfs_name_hash(ns, name); + hash = sysfs_name_hash(name, ns); while (node) { struct sysfs_dirent *sd; int result; sd = to_sysfs_dirent(node); - result = sysfs_name_compare(hash, ns, name, sd); + result = sysfs_name_compare(hash, name, ns, sd); if (result < 0) node = node->rb_left; else if (result > 0) @@ -638,9 +637,10 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, } /** - * sysfs_get_dirent - find and get sysfs_dirent with the given name + * sysfs_get_dirent_ns - find and get sysfs_dirent with the given name * @parent_sd: sysfs_dirent to search under * @name: name to look for + * @ns: the namespace tag to use * * Look for sysfs_dirent with name @name under @parent_sd and get * it if found. @@ -651,24 +651,25 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, * RETURNS: * Pointer to sysfs_dirent if found, NULL if not. */ -struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, - const void *ns, - const unsigned char *name) +struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd, + const unsigned char *name, + const void *ns) { struct sysfs_dirent *sd; mutex_lock(&sysfs_mutex); - sd = sysfs_find_dirent(parent_sd, ns, name); + sd = sysfs_find_dirent(parent_sd, name, ns); sysfs_get(sd); mutex_unlock(&sysfs_mutex); return sd; } -EXPORT_SYMBOL_GPL(sysfs_get_dirent); +EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns); static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, - enum kobj_ns_type type, const void *ns, const char *name, - struct sysfs_dirent **p_sd) + enum kobj_ns_type type, + const char *name, const void *ns, + struct sysfs_dirent **p_sd) { umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; struct sysfs_addrm_cxt acxt; @@ -685,8 +686,8 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, sd->s_dir.kobj = kobj; /* link in */ - sysfs_addrm_start(&acxt, parent_sd); - rc = sysfs_add_one(&acxt, sd); + sysfs_addrm_start(&acxt); + rc = sysfs_add_one(&acxt, sd, parent_sd); sysfs_addrm_finish(&acxt); if (rc == 0) @@ -701,7 +702,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd) { return create_dir(kobj, kobj->sd, - KOBJ_NS_TYPE_NONE, NULL, name, p_sd); + KOBJ_NS_TYPE_NONE, name, NULL, p_sd); } /** @@ -730,14 +731,14 @@ static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) } /** - * sysfs_create_dir - create a directory for an object. - * @kobj: object we're creating directory for. + * sysfs_create_dir_ns - create a directory for an object with a namespace tag + * @kobj: object we're creating directory for + * @ns: the namespace tag to use */ -int sysfs_create_dir(struct kobject *kobj) +int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { enum kobj_ns_type type; struct sysfs_dirent *parent_sd, *sd; - const void *ns = NULL; int error = 0; BUG_ON(!kobj); @@ -750,11 +751,9 @@ int sysfs_create_dir(struct kobject *kobj) if (!parent_sd) return -ENOENT; - if (sysfs_ns_type(parent_sd)) - ns = kobj->ktype->namespace(kobj); type = sysfs_read_ns_type(kobj); - error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd); + error = create_dir(kobj, parent_sd, type, kobject_name(kobj), ns, &sd); if (!error) kobj->sd = sd; return error; @@ -776,7 +775,7 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, type = sysfs_ns_type(parent_sd); ns = sysfs_info(dir->i_sb)->ns[type]; - sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name); + sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns); /* no such entry */ if (!sd) { @@ -807,41 +806,128 @@ const struct inode_operations sysfs_dir_inode_operations = { .setxattr = sysfs_setxattr, }; -static void remove_dir(struct sysfs_dirent *sd) +static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos) { - struct sysfs_addrm_cxt acxt; + struct sysfs_dirent *last; - sysfs_addrm_start(&acxt, sd->s_parent); - sysfs_remove_one(&acxt, sd); - sysfs_addrm_finish(&acxt); + while (true) { + struct rb_node *rbn; + + last = pos; + + if (sysfs_type(pos) != SYSFS_DIR) + break; + + rbn = rb_first(&pos->s_dir.children); + if (!rbn) + break; + + pos = to_sysfs_dirent(rbn); + } + + return last; } -void sysfs_remove_subdir(struct sysfs_dirent *sd) +/** + * sysfs_next_descendant_post - find the next descendant for post-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: sysfs_dirent whose descendants to walk + * + * Find the next descendant to visit for post-order traversal of @root's + * descendants. @root is included in the iteration and the last node to be + * visited. + */ +static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos, + struct sysfs_dirent *root) { - remove_dir(sd); + struct rb_node *rbn; + + lockdep_assert_held(&sysfs_mutex); + + /* if first iteration, visit leftmost descendant which may be root */ + if (!pos) + return sysfs_leftmost_descendant(root); + + /* if we visited @root, we're done */ + if (pos == root) + return NULL; + + /* if there's an unvisited sibling, visit its leftmost descendant */ + rbn = rb_next(&pos->s_rb); + if (rbn) + return sysfs_leftmost_descendant(to_sysfs_dirent(rbn)); + + /* no sibling left, visit parent */ + return pos->s_parent; } +static void __sysfs_remove(struct sysfs_addrm_cxt *acxt, + struct sysfs_dirent *sd) +{ + struct sysfs_dirent *pos, *next; + + if (!sd) + return; -static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) + pr_debug("sysfs %s: removing\n", sd->s_name); + + next = NULL; + do { + pos = next; + next = sysfs_next_descendant_post(pos, sd); + if (pos) + sysfs_remove_one(acxt, pos); + } while (next); +} + +/** + * sysfs_remove - remove a sysfs_dirent recursively + * @sd: the sysfs_dirent to remove + * + * Remove @sd along with all its subdirectories and files. + */ +void sysfs_remove(struct sysfs_dirent *sd) { struct sysfs_addrm_cxt acxt; - struct rb_node *pos; - if (!dir_sd) - return; + sysfs_addrm_start(&acxt); + __sysfs_remove(&acxt, sd); + sysfs_addrm_finish(&acxt); +} - pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); - sysfs_addrm_start(&acxt, dir_sd); - pos = rb_first(&dir_sd->s_dir.children); - while (pos) { - struct sysfs_dirent *sd = to_sysfs_dirent(pos); - pos = rb_next(pos); - if (sysfs_type(sd) != SYSFS_DIR) - sysfs_remove_one(&acxt, sd); +/** + * sysfs_hash_and_remove - find a sysfs_dirent by name and remove it + * @dir_sd: parent of the target + * @name: name of the sysfs_dirent to remove + * @ns: namespace tag of the sysfs_dirent to remove + * + * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove + * it. Returns 0 on success, -ENOENT if such entry doesn't exist. + */ +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name, + const void *ns) +{ + struct sysfs_addrm_cxt acxt; + struct sysfs_dirent *sd; + + if (!dir_sd) { + WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n", + name); + return -ENOENT; } + + sysfs_addrm_start(&acxt); + + sd = sysfs_find_dirent(dir_sd, name, ns); + if (sd) + __sysfs_remove(&acxt, sd); + sysfs_addrm_finish(&acxt); - remove_dir(dir_sd); + if (sd) + return 0; + else + return -ENOENT; } /** @@ -852,21 +938,34 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) * the directory before we remove the directory, and we've inlined * what used to be sysfs_rmdir() below, instead of calling separately. */ - void sysfs_remove_dir(struct kobject *kobj) { struct sysfs_dirent *sd = kobj->sd; - spin_lock(&sysfs_assoc_lock); + /* + * In general, kboject owner is responsible for ensuring removal + * doesn't race with other operations and sysfs doesn't provide any + * protection; however, when @kobj is used as a symlink target, the + * symlinking entity usually doesn't own @kobj and thus has no + * control over removal. @kobj->sd may be removed anytime and + * symlink code may end up dereferencing an already freed sd. + * + * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation + * against symlink operations so that symlink code can safely + * dereference @kobj->sd. + */ + spin_lock(&sysfs_symlink_target_lock); kobj->sd = NULL; - spin_unlock(&sysfs_assoc_lock); + spin_unlock(&sysfs_symlink_target_lock); - __sysfs_remove_dir(sd); + if (sd) { + WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); + sysfs_remove(sd); + } } -int sysfs_rename(struct sysfs_dirent *sd, - struct sysfs_dirent *new_parent_sd, const void *new_ns, - const char *new_name) +int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, + const char *new_name, const void *new_ns) { int error; @@ -878,7 +977,7 @@ int sysfs_rename(struct sysfs_dirent *sd, goto out; /* nothing to rename */ error = -EEXIST; - if (sysfs_find_dirent(new_parent_sd, new_ns, new_name)) + if (sysfs_find_dirent(new_parent_sd, new_name, new_ns)) goto out; /* rename sysfs_dirent */ @@ -899,7 +998,7 @@ int sysfs_rename(struct sysfs_dirent *sd, sysfs_get(new_parent_sd); sysfs_put(sd->s_parent); sd->s_ns = new_ns; - sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name); + sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); sd->s_parent = new_parent_sd; sysfs_link_sibling(sd); @@ -909,30 +1008,25 @@ int sysfs_rename(struct sysfs_dirent *sd, return error; } -int sysfs_rename_dir(struct kobject *kobj, const char *new_name) +int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, + const void *new_ns) { struct sysfs_dirent *parent_sd = kobj->sd->s_parent; - const void *new_ns = NULL; - - if (sysfs_ns_type(parent_sd)) - new_ns = kobj->ktype->namespace(kobj); - return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name); + return sysfs_rename(kobj->sd, parent_sd, new_name, new_ns); } -int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) +int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, + const void *new_ns) { struct sysfs_dirent *sd = kobj->sd; struct sysfs_dirent *new_parent_sd; - const void *new_ns = NULL; BUG_ON(!sd->s_parent); - if (sysfs_ns_type(sd->s_parent)) - new_ns = kobj->ktype->namespace(kobj); new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; - return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name); + return sysfs_rename(sd, new_parent_sd, sd->s_name, new_ns); } /* Relationship between s_mode and the DT_xxx types */ diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 15ef5eb13663..79b5da2acbe1 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -21,70 +21,114 @@ #include <linux/mutex.h> #include <linux/limits.h> #include <linux/uaccess.h> +#include <linux/seq_file.h> +#include <linux/mm.h> #include "sysfs.h" /* - * There's one sysfs_buffer for each open file and one - * sysfs_open_dirent for each sysfs_dirent with one or more open - * files. + * There's one sysfs_open_file for each open file and one sysfs_open_dirent + * for each sysfs_dirent with one or more open files. * - * filp->private_data points to sysfs_buffer and - * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open - * is protected by sysfs_open_dirent_lock. + * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open is + * protected by sysfs_open_dirent_lock. + * + * filp->private_data points to seq_file whose ->private points to + * sysfs_open_file. sysfs_open_files are chained at + * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex. */ static DEFINE_SPINLOCK(sysfs_open_dirent_lock); +static DEFINE_MUTEX(sysfs_open_file_mutex); struct sysfs_open_dirent { atomic_t refcnt; atomic_t event; wait_queue_head_t poll; - struct list_head buffers; /* goes through sysfs_buffer.list */ + struct list_head files; /* goes through sysfs_open_file.list */ }; -struct sysfs_buffer { - size_t count; - loff_t pos; - char *page; - const struct sysfs_ops *ops; +struct sysfs_open_file { + struct sysfs_dirent *sd; + struct file *file; struct mutex mutex; - int needs_read_fill; int event; struct list_head list; + + bool mmapped; + const struct vm_operations_struct *vm_ops; }; -/** - * fill_read_buffer - allocate and fill buffer from object. - * @dentry: dentry pointer. - * @buffer: data buffer for file. - * - * Allocate @buffer->page, if it hasn't been already, then call the - * kobject's show() method to fill the buffer with this attribute's - * data. - * This is called only once, on the file's first read unless an error - * is returned. +static bool sysfs_is_bin(struct sysfs_dirent *sd) +{ + return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR; +} + +static struct sysfs_open_file *sysfs_of(struct file *file) +{ + return ((struct seq_file *)file->private_data)->private; +} + +/* + * Determine ktype->sysfs_ops for the given sysfs_dirent. This function + * must be called while holding an active reference. */ -static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer) +static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) { - struct sysfs_dirent *attr_sd = dentry->d_fsdata; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - const struct sysfs_ops *ops = buffer->ops; - int ret = 0; + struct kobject *kobj = sd->s_parent->s_dir.kobj; + + if (!sysfs_ignore_lockdep(sd)) + lockdep_assert_held(sd); + return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; +} + +/* + * Reads on sysfs are handled through seq_file, which takes care of hairy + * details like buffering and seeking. The following function pipes + * sysfs_ops->show() result through seq_file. + */ +static int sysfs_seq_show(struct seq_file *sf, void *v) +{ + struct sysfs_open_file *of = sf->private; + struct kobject *kobj = of->sd->s_parent->s_dir.kobj; + const struct sysfs_ops *ops; + char *buf; ssize_t count; - if (!buffer->page) - buffer->page = (char *) get_zeroed_page(GFP_KERNEL); - if (!buffer->page) - return -ENOMEM; + /* acquire buffer and ensure that it's >= PAGE_SIZE */ + count = seq_get_buf(sf, &buf); + if (count < PAGE_SIZE) { + seq_commit(sf, -1); + return 0; + } - /* need attr_sd for attr and ops, its parent for kobj */ - if (!sysfs_get_active(attr_sd)) + /* + * Need @of->sd for attr and ops, its parent for kobj. @of->mutex + * nests outside active ref and is just to ensure that the ops + * aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!sysfs_get_active(of->sd)) { + mutex_unlock(&of->mutex); return -ENODEV; + } - buffer->event = atomic_read(&attr_sd->s_attr.open->event); - count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page); + of->event = atomic_read(&of->sd->s_attr.open->event); - sysfs_put_active(attr_sd); + /* + * Lookup @ops and invoke show(). Control may reach here via seq + * file lseek even if @ops->show() isn't implemented. + */ + ops = sysfs_file_ops(of->sd); + if (ops->show) + count = ops->show(kobj, of->sd->s_attr.attr, buf); + else + count = 0; + + sysfs_put_active(of->sd); + mutex_unlock(&of->mutex); + + if (count < 0) + return count; /* * The code works fine with PAGE_SIZE return but it's likely to @@ -96,155 +140,389 @@ static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer) /* Try to struggle along */ count = PAGE_SIZE - 1; } - if (count >= 0) { - buffer->needs_read_fill = 0; - buffer->count = count; - } else { - ret = count; - } - return ret; + seq_commit(sf, count); + return 0; } -/** - * sysfs_read_file - read an attribute. - * @file: file pointer. - * @buf: buffer to fill. - * @count: number of bytes to read. - * @ppos: starting offset in file. - * - * Userspace wants to read an attribute file. The attribute descriptor - * is in the file's ->d_fsdata. The target object is in the directory's - * ->d_fsdata. - * - * We call fill_read_buffer() to allocate and fill the buffer from the - * object's show() method exactly once (if the read is happening from - * the beginning of the file). That should fill the entire buffer with - * all the data the object has to offer for that attribute. - * We then call flush_read_buffer() to copy the buffer to userspace - * in the increments specified. +/* + * Read method for bin files. As reading a bin file can have side-effects, + * the exact offset and bytes specified in read(2) call should be passed to + * the read callback making it difficult to use seq_file. Implement + * simplistic custom buffering for bin files. */ - -static ssize_t -sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) +static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf, + size_t bytes, loff_t *off) { - struct sysfs_buffer *buffer = file->private_data; - ssize_t retval = 0; + struct sysfs_open_file *of = sysfs_of(file); + struct bin_attribute *battr = of->sd->s_attr.bin_attr; + struct kobject *kobj = of->sd->s_parent->s_dir.kobj; + loff_t size = file_inode(file)->i_size; + int count = min_t(size_t, bytes, PAGE_SIZE); + loff_t offs = *off; + char *buf; + + if (!bytes) + return 0; - mutex_lock(&buffer->mutex); - if (buffer->needs_read_fill || *ppos == 0) { - retval = fill_read_buffer(file->f_path.dentry, buffer); - if (retval) - goto out; + if (size) { + if (offs > size) + return 0; + if (offs + count > size) + count = size - offs; } - pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", - __func__, count, *ppos, buffer->page); - retval = simple_read_from_buffer(buf, count, ppos, buffer->page, - buffer->count); -out: - mutex_unlock(&buffer->mutex); - return retval; -} -/** - * fill_write_buffer - copy buffer from userspace. - * @buffer: data buffer for file. - * @buf: data from user. - * @count: number of bytes in @userbuf. - * - * Allocate @buffer->page if it hasn't been already, then - * copy the user-supplied buffer into it. - */ -static int fill_write_buffer(struct sysfs_buffer *buffer, - const char __user *buf, size_t count) -{ - int error; - - if (!buffer->page) - buffer->page = (char *)get_zeroed_page(GFP_KERNEL); - if (!buffer->page) + buf = kmalloc(count, GFP_KERNEL); + if (!buf) return -ENOMEM; - if (count >= PAGE_SIZE) - count = PAGE_SIZE - 1; - error = copy_from_user(buffer->page, buf, count); - buffer->needs_read_fill = 1; - /* if buf is assumed to contain a string, terminate it by \0, - so e.g. sscanf() can scan the string easily */ - buffer->page[count] = 0; - return error ? -EFAULT : count; -} + /* need of->sd for battr, its parent for kobj */ + mutex_lock(&of->mutex); + if (!sysfs_get_active(of->sd)) { + count = -ENODEV; + mutex_unlock(&of->mutex); + goto out_free; + } + + if (battr->read) + count = battr->read(file, kobj, battr, buf, offs, count); + else + count = -EIO; + sysfs_put_active(of->sd); + mutex_unlock(&of->mutex); + + if (count < 0) + goto out_free; + + if (copy_to_user(userbuf, buf, count)) { + count = -EFAULT; + goto out_free; + } + + pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); + + *off = offs + count; + + out_free: + kfree(buf); + return count; +} /** - * flush_write_buffer - push buffer to kobject. - * @dentry: dentry to the attribute - * @buffer: data buffer for file. - * @count: number of bytes + * flush_write_buffer - push buffer to kobject + * @of: open file + * @buf: data buffer for file + * @off: file offset to write to + * @count: number of bytes * - * Get the correct pointers for the kobject and the attribute we're - * dealing with, then call the store() method for the attribute, - * passing the buffer that we acquired in fill_write_buffer(). + * Get the correct pointers for the kobject and the attribute we're dealing + * with, then call the store() method for it with @buf. */ -static int flush_write_buffer(struct dentry *dentry, - struct sysfs_buffer *buffer, size_t count) +static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off, + size_t count) { - struct sysfs_dirent *attr_sd = dentry->d_fsdata; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - const struct sysfs_ops *ops = buffer->ops; - int rc; + struct kobject *kobj = of->sd->s_parent->s_dir.kobj; + int rc = 0; - /* need attr_sd for attr and ops, its parent for kobj */ - if (!sysfs_get_active(attr_sd)) + /* + * Need @of->sd for attr and ops, its parent for kobj. @of->mutex + * nests outside active ref and is just to ensure that the ops + * aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!sysfs_get_active(of->sd)) { + mutex_unlock(&of->mutex); return -ENODEV; + } - rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count); + if (sysfs_is_bin(of->sd)) { + struct bin_attribute *battr = of->sd->s_attr.bin_attr; - sysfs_put_active(attr_sd); + rc = -EIO; + if (battr->write) + rc = battr->write(of->file, kobj, battr, buf, off, + count); + } else { + const struct sysfs_ops *ops = sysfs_file_ops(of->sd); + + rc = ops->store(kobj, of->sd->s_attr.attr, buf, count); + } + + sysfs_put_active(of->sd); + mutex_unlock(&of->mutex); return rc; } - /** - * sysfs_write_file - write an attribute. - * @file: file pointer - * @buf: data to write - * @count: number of bytes - * @ppos: starting offset + * sysfs_write_file - write an attribute + * @file: file pointer + * @user_buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Copy data in from userland and pass it to the matching + * sysfs_ops->store() by invoking flush_write_buffer(). * - * Similar to sysfs_read_file(), though working in the opposite direction. - * We allocate and fill the data from the user in fill_write_buffer(), - * then push it to the kobject in flush_write_buffer(). - * There is no easy way for us to know if userspace is only doing a partial - * write, so we don't support them. We expect the entire buffer to come - * on the first write. - * Hint: if you're writing a value, first read the file, modify only the - * the value you're changing, then write entire buffer back. + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come on + * the first write. Hint: if you're writing a value, first read the file, + * modify only the the value you're changing, then write entire buffer + * back. */ -static ssize_t sysfs_write_file(struct file *file, const char __user *buf, +static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - struct sysfs_buffer *buffer = file->private_data; - ssize_t len; + struct sysfs_open_file *of = sysfs_of(file); + ssize_t len = min_t(size_t, count, PAGE_SIZE); + loff_t size = file_inode(file)->i_size; + char *buf; + + if (sysfs_is_bin(of->sd) && size) { + if (size <= *ppos) + return 0; + len = min_t(ssize_t, len, size - *ppos); + } - mutex_lock(&buffer->mutex); - len = fill_write_buffer(buffer, buf, count); - if (len > 0) - len = flush_write_buffer(file->f_path.dentry, buffer, len); + if (!len) + return 0; + + buf = kmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, user_buf, len)) { + len = -EFAULT; + goto out_free; + } + buf[len] = '\0'; /* guarantee string termination */ + + len = flush_write_buffer(of, buf, *ppos, len); if (len > 0) *ppos += len; - mutex_unlock(&buffer->mutex); +out_free: + kfree(buf); return len; } +static void sysfs_bin_vma_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct sysfs_open_file *of = sysfs_of(file); + + if (!of->vm_ops) + return; + + if (!sysfs_get_active(of->sd)) + return; + + if (of->vm_ops->open) + of->vm_ops->open(vma); + + sysfs_put_active(of->sd); +} + +static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct sysfs_open_file *of = sysfs_of(file); + int ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!sysfs_get_active(of->sd)) + return VM_FAULT_SIGBUS; + + ret = VM_FAULT_SIGBUS; + if (of->vm_ops->fault) + ret = of->vm_ops->fault(vma, vmf); + + sysfs_put_active(of->sd); + return ret; +} + +static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct sysfs_open_file *of = sysfs_of(file); + int ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!sysfs_get_active(of->sd)) + return VM_FAULT_SIGBUS; + + ret = 0; + if (of->vm_ops->page_mkwrite) + ret = of->vm_ops->page_mkwrite(vma, vmf); + else + file_update_time(file); + + sysfs_put_active(of->sd); + return ret; +} + +static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct file *file = vma->vm_file; + struct sysfs_open_file *of = sysfs_of(file); + int ret; + + if (!of->vm_ops) + return -EINVAL; + + if (!sysfs_get_active(of->sd)) + return -EINVAL; + + ret = -EINVAL; + if (of->vm_ops->access) + ret = of->vm_ops->access(vma, addr, buf, len, write); + + sysfs_put_active(of->sd); + return ret; +} + +#ifdef CONFIG_NUMA +static int sysfs_bin_set_policy(struct vm_area_struct *vma, + struct mempolicy *new) +{ + struct file *file = vma->vm_file; + struct sysfs_open_file *of = sysfs_of(file); + int ret; + + if (!of->vm_ops) + return 0; + + if (!sysfs_get_active(of->sd)) + return -EINVAL; + + ret = 0; + if (of->vm_ops->set_policy) + ret = of->vm_ops->set_policy(vma, new); + + sysfs_put_active(of->sd); + return ret; +} + +static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct file *file = vma->vm_file; + struct sysfs_open_file *of = sysfs_of(file); + struct mempolicy *pol; + + if (!of->vm_ops) + return vma->vm_policy; + + if (!sysfs_get_active(of->sd)) + return vma->vm_policy; + + pol = vma->vm_policy; + if (of->vm_ops->get_policy) + pol = of->vm_ops->get_policy(vma, addr); + + sysfs_put_active(of->sd); + return pol; +} + +static int sysfs_bin_migrate(struct vm_area_struct *vma, const nodemask_t *from, + const nodemask_t *to, unsigned long flags) +{ + struct file *file = vma->vm_file; + struct sysfs_open_file *of = sysfs_of(file); + int ret; + + if (!of->vm_ops) + return 0; + + if (!sysfs_get_active(of->sd)) + return 0; + + ret = 0; + if (of->vm_ops->migrate) + ret = of->vm_ops->migrate(vma, from, to, flags); + + sysfs_put_active(of->sd); + return ret; +} +#endif + +static const struct vm_operations_struct sysfs_bin_vm_ops = { + .open = sysfs_bin_vma_open, + .fault = sysfs_bin_fault, + .page_mkwrite = sysfs_bin_page_mkwrite, + .access = sysfs_bin_access, +#ifdef CONFIG_NUMA + .set_policy = sysfs_bin_set_policy, + .get_policy = sysfs_bin_get_policy, + .migrate = sysfs_bin_migrate, +#endif +}; + +static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sysfs_open_file *of = sysfs_of(file); + struct bin_attribute *battr = of->sd->s_attr.bin_attr; + struct kobject *kobj = of->sd->s_parent->s_dir.kobj; + int rc; + + mutex_lock(&of->mutex); + + /* need of->sd for battr, its parent for kobj */ + rc = -ENODEV; + if (!sysfs_get_active(of->sd)) + goto out_unlock; + + if (!battr->mmap) + goto out_put; + + rc = battr->mmap(file, kobj, battr, vma); + if (rc) + goto out_put; + + /* + * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() + * to satisfy versions of X which crash if the mmap fails: that + * substitutes a new vm_file, and we don't then want bin_vm_ops. + */ + if (vma->vm_file != file) + goto out_put; + + rc = -EINVAL; + if (of->mmapped && of->vm_ops != vma->vm_ops) + goto out_put; + + /* + * It is not possible to successfully wrap close. + * So error if someone is trying to use close. + */ + rc = -EINVAL; + if (vma->vm_ops && vma->vm_ops->close) + goto out_put; + + rc = 0; + of->mmapped = 1; + of->vm_ops = vma->vm_ops; + vma->vm_ops = &sysfs_bin_vm_ops; +out_put: + sysfs_put_active(of->sd); +out_unlock: + mutex_unlock(&of->mutex); + + return rc; +} + /** * sysfs_get_open_dirent - get or create sysfs_open_dirent * @sd: target sysfs_dirent - * @buffer: sysfs_buffer for this instance of open + * @of: sysfs_open_file for this instance of open * * If @sd->s_attr.open exists, increment its reference count; - * otherwise, create one. @buffer is chained to the buffers - * list. + * otherwise, create one. @of is chained to the files list. * * LOCKING: * Kernel thread context (may sleep). @@ -253,11 +531,12 @@ static ssize_t sysfs_write_file(struct file *file, const char __user *buf, * 0 on success, -errno on failure. */ static int sysfs_get_open_dirent(struct sysfs_dirent *sd, - struct sysfs_buffer *buffer) + struct sysfs_open_file *of) { struct sysfs_open_dirent *od, *new_od = NULL; retry: + mutex_lock(&sysfs_open_file_mutex); spin_lock_irq(&sysfs_open_dirent_lock); if (!sd->s_attr.open && new_od) { @@ -268,10 +547,11 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd, od = sd->s_attr.open; if (od) { atomic_inc(&od->refcnt); - list_add_tail(&buffer->list, &od->buffers); + list_add_tail(&of->list, &od->files); } spin_unlock_irq(&sysfs_open_dirent_lock); + mutex_unlock(&sysfs_open_file_mutex); if (od) { kfree(new_od); @@ -286,36 +566,40 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd, atomic_set(&new_od->refcnt, 0); atomic_set(&new_od->event, 1); init_waitqueue_head(&new_od->poll); - INIT_LIST_HEAD(&new_od->buffers); + INIT_LIST_HEAD(&new_od->files); goto retry; } /** * sysfs_put_open_dirent - put sysfs_open_dirent * @sd: target sysfs_dirent - * @buffer: associated sysfs_buffer + * @of: associated sysfs_open_file * - * Put @sd->s_attr.open and unlink @buffer from the buffers list. - * If reference count reaches zero, disassociate and free it. + * Put @sd->s_attr.open and unlink @of from the files list. If + * reference count reaches zero, disassociate and free it. * * LOCKING: * None. */ static void sysfs_put_open_dirent(struct sysfs_dirent *sd, - struct sysfs_buffer *buffer) + struct sysfs_open_file *of) { struct sysfs_open_dirent *od = sd->s_attr.open; unsigned long flags; + mutex_lock(&sysfs_open_file_mutex); spin_lock_irqsave(&sysfs_open_dirent_lock, flags); - list_del(&buffer->list); + if (of) + list_del(&of->list); + if (atomic_dec_and_test(&od->refcnt)) sd->s_attr.open = NULL; else od = NULL; spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); + mutex_unlock(&sysfs_open_file_mutex); kfree(od); } @@ -324,67 +608,81 @@ static int sysfs_open_file(struct inode *inode, struct file *file) { struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - struct sysfs_buffer *buffer; - const struct sysfs_ops *ops; + struct sysfs_open_file *of; + bool has_read, has_write; int error = -EACCES; /* need attr_sd for attr and ops, its parent for kobj */ if (!sysfs_get_active(attr_sd)) return -ENODEV; - /* every kobject with an attribute needs a ktype assigned */ - if (kobj->ktype && kobj->ktype->sysfs_ops) - ops = kobj->ktype->sysfs_ops; - else { - WARN(1, KERN_ERR - "missing sysfs attribute operations for kobject: %s\n", - kobject_name(kobj)); - goto err_out; - } + if (sysfs_is_bin(attr_sd)) { + struct bin_attribute *battr = attr_sd->s_attr.bin_attr; - /* File needs write support. - * The inode's perms must say it's ok, - * and we must have a store method. - */ - if (file->f_mode & FMODE_WRITE) { - if (!(inode->i_mode & S_IWUGO) || !ops->store) - goto err_out; - } + has_read = battr->read || battr->mmap; + has_write = battr->write || battr->mmap; + } else { + const struct sysfs_ops *ops = sysfs_file_ops(attr_sd); - /* File needs read support. - * The inode's perms must say it's ok, and we there - * must be a show method for it. - */ - if (file->f_mode & FMODE_READ) { - if (!(inode->i_mode & S_IRUGO) || !ops->show) + /* every kobject with an attribute needs a ktype assigned */ + if (WARN(!ops, KERN_ERR + "missing sysfs attribute operations for kobject: %s\n", + kobject_name(kobj))) goto err_out; + + has_read = ops->show; + has_write = ops->store; } - /* No error? Great, allocate a buffer for the file, and store it - * it in file->private_data for easy access. - */ + /* check perms and supported operations */ + if ((file->f_mode & FMODE_WRITE) && + (!(inode->i_mode & S_IWUGO) || !has_write)) + goto err_out; + + if ((file->f_mode & FMODE_READ) && + (!(inode->i_mode & S_IRUGO) || !has_read)) + goto err_out; + + /* allocate a sysfs_open_file for the file */ error = -ENOMEM; - buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL); - if (!buffer) + of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL); + if (!of) goto err_out; - mutex_init(&buffer->mutex); - buffer->needs_read_fill = 1; - buffer->ops = ops; - file->private_data = buffer; + mutex_init(&of->mutex); + of->sd = attr_sd; + of->file = file; - /* make sure we have open dirent struct */ - error = sysfs_get_open_dirent(attr_sd, buffer); + /* + * Always instantiate seq_file even if read access doesn't use + * seq_file or is not requested. This unifies private data access + * and readable regular files are the vast majority anyway. + */ + if (sysfs_is_bin(attr_sd)) + error = single_open(file, NULL, of); + else + error = single_open(file, sysfs_seq_show, of); if (error) goto err_free; + /* seq_file clears PWRITE unconditionally, restore it if WRITE */ + if (file->f_mode & FMODE_WRITE) + file->f_mode |= FMODE_PWRITE; + + /* make sure we have open dirent struct */ + error = sysfs_get_open_dirent(attr_sd, of); + if (error) + goto err_close; + /* open succeeded, put active references */ sysfs_put_active(attr_sd); return 0; - err_free: - kfree(buffer); - err_out: +err_close: + single_release(inode, file); +err_free: + kfree(of); +err_out: sysfs_put_active(attr_sd); return error; } @@ -392,17 +690,41 @@ static int sysfs_open_file(struct inode *inode, struct file *file) static int sysfs_release(struct inode *inode, struct file *filp) { struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata; - struct sysfs_buffer *buffer = filp->private_data; + struct sysfs_open_file *of = sysfs_of(filp); - sysfs_put_open_dirent(sd, buffer); - - if (buffer->page) - free_page((unsigned long)buffer->page); - kfree(buffer); + sysfs_put_open_dirent(sd, of); + single_release(inode, filp); + kfree(of); return 0; } +void sysfs_unmap_bin_file(struct sysfs_dirent *sd) +{ + struct sysfs_open_dirent *od; + struct sysfs_open_file *of; + + if (!sysfs_is_bin(sd)) + return; + + spin_lock_irq(&sysfs_open_dirent_lock); + od = sd->s_attr.open; + if (od) + atomic_inc(&od->refcnt); + spin_unlock_irq(&sysfs_open_dirent_lock); + if (!od) + return; + + mutex_lock(&sysfs_open_file_mutex); + list_for_each_entry(of, &od->files, list) { + struct inode *inode = file_inode(of->file); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); + } + mutex_unlock(&sysfs_open_file_mutex); + + sysfs_put_open_dirent(sd, NULL); +} + /* Sysfs attribute files are pollable. The idea is that you read * the content and then you use 'poll' or 'select' to wait for * the content to change. When the content changes (assuming the @@ -418,7 +740,7 @@ static int sysfs_release(struct inode *inode, struct file *filp) */ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) { - struct sysfs_buffer *buffer = filp->private_data; + struct sysfs_open_file *of = sysfs_of(filp); struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; struct sysfs_open_dirent *od = attr_sd->s_attr.open; @@ -430,13 +752,12 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) sysfs_put_active(attr_sd); - if (buffer->event != atomic_read(&od->event)) + if (of->event != atomic_read(&od->event)) goto trigger; return DEFAULT_POLLMASK; trigger: - buffer->needs_read_fill = 1; return DEFAULT_POLLMASK|POLLERR|POLLPRI; } @@ -466,9 +787,9 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr) mutex_lock(&sysfs_mutex); if (sd && dir) - sd = sysfs_find_dirent(sd, NULL, dir); + sd = sysfs_find_dirent(sd, dir, NULL); if (sd && attr) - sd = sysfs_find_dirent(sd, NULL, attr); + sd = sysfs_find_dirent(sd, attr, NULL); if (sd) sysfs_notify_dirent(sd); @@ -477,7 +798,7 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr) EXPORT_SYMBOL_GPL(sysfs_notify); const struct file_operations sysfs_file_operations = { - .read = sysfs_read_file, + .read = seq_read, .write = sysfs_write_file, .llseek = generic_file_llseek, .open = sysfs_open_file, @@ -485,58 +806,25 @@ const struct file_operations sysfs_file_operations = { .poll = sysfs_poll, }; -static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, - const void **pns) -{ - struct sysfs_dirent *dir_sd = kobj->sd; - const struct sysfs_ops *ops; - const void *ns = NULL; - int err; - - if (!dir_sd) { - WARN(1, KERN_ERR "sysfs: kobject %s without dirent\n", - kobject_name(kobj)); - return -ENOENT; - } - - err = 0; - if (!sysfs_ns_type(dir_sd)) - goto out; - - err = -EINVAL; - if (!kobj->ktype) - goto out; - ops = kobj->ktype->sysfs_ops; - if (!ops) - goto out; - if (!ops->namespace) - goto out; - - err = 0; - ns = ops->namespace(kobj, attr); -out: - if (err) { - WARN(1, KERN_ERR - "missing sysfs namespace attribute operation for kobject: %s\n", - kobject_name(kobj)); - } - *pns = ns; - return err; -} +const struct file_operations sysfs_bin_operations = { + .read = sysfs_bin_read, + .write = sysfs_write_file, + .llseek = generic_file_llseek, + .mmap = sysfs_bin_mmap, + .open = sysfs_open_file, + .release = sysfs_release, + .poll = sysfs_poll, +}; -int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, umode_t amode) +int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, + const struct attribute *attr, int type, + umode_t amode, const void *ns) { umode_t mode = (amode & S_IALLUGO) | S_IFREG; struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; - const void *ns; int rc; - rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns); - if (rc) - return rc; - sd = sysfs_new_dirent(attr->name, mode, type); if (!sd) return -ENOMEM; @@ -545,8 +833,8 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, sd->s_attr.attr = (void *)attr; sysfs_dirent_init_lockdep(sd); - sysfs_addrm_start(&acxt, dir_sd); - rc = sysfs_add_one(&acxt, sd); + sysfs_addrm_start(&acxt); + rc = sysfs_add_one(&acxt, sd, dir_sd); sysfs_addrm_finish(&acxt); if (rc) @@ -559,23 +847,25 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, int type) { - return sysfs_add_file_mode(dir_sd, attr, type, attr->mode); + return sysfs_add_file_mode_ns(dir_sd, attr, type, attr->mode, NULL); } - /** - * sysfs_create_file - create an attribute file for an object. - * @kobj: object we're creating for. - * @attr: attribute descriptor. + * sysfs_create_file_ns - create an attribute file for an object with custom ns + * @kobj: object we're creating for + * @attr: attribute descriptor + * @ns: namespace the new file should belong to */ -int sysfs_create_file(struct kobject *kobj, const struct attribute *attr) +int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, + const void *ns) { BUG_ON(!kobj || !kobj->sd || !attr); - return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR); + return sysfs_add_file_mode_ns(kobj->sd, attr, SYSFS_KOBJ_ATTR, + attr->mode, ns); } -EXPORT_SYMBOL_GPL(sysfs_create_file); +EXPORT_SYMBOL_GPL(sysfs_create_file_ns); int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr) { @@ -604,7 +894,7 @@ int sysfs_add_file_to_group(struct kobject *kobj, int error; if (group) - dir_sd = sysfs_get_dirent(kobj->sd, NULL, group); + dir_sd = sysfs_get_dirent(kobj->sd, group); else dir_sd = sysfs_get(kobj->sd); @@ -630,17 +920,12 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, { struct sysfs_dirent *sd; struct iattr newattrs; - const void *ns; int rc; - rc = sysfs_attr_ns(kobj, attr, &ns); - if (rc) - return rc; - mutex_lock(&sysfs_mutex); rc = -ENOENT; - sd = sysfs_find_dirent(kobj->sd, ns, attr->name); + sd = sysfs_find_dirent(kobj->sd, attr->name, NULL); if (!sd) goto out; @@ -655,22 +940,21 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, EXPORT_SYMBOL_GPL(sysfs_chmod_file); /** - * sysfs_remove_file - remove an object attribute. - * @kobj: object we're acting for. - * @attr: attribute descriptor. + * sysfs_remove_file_ns - remove an object attribute with a custom ns tag + * @kobj: object we're acting for + * @attr: attribute descriptor + * @ns: namespace tag of the file to remove * - * Hash the attribute name and kill the victim. + * Hash the attribute name and namespace tag and kill the victim. */ -void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) +void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, + const void *ns) { - const void *ns; - - if (sysfs_attr_ns(kobj, attr, &ns)) - return; + struct sysfs_dirent *dir_sd = kobj->sd; - sysfs_hash_and_remove(kobj->sd, ns, attr->name); + sysfs_hash_and_remove(dir_sd, attr->name, ns); } -EXPORT_SYMBOL_GPL(sysfs_remove_file); +EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) { @@ -692,16 +976,42 @@ void sysfs_remove_file_from_group(struct kobject *kobj, struct sysfs_dirent *dir_sd; if (group) - dir_sd = sysfs_get_dirent(kobj->sd, NULL, group); + dir_sd = sysfs_get_dirent(kobj->sd, group); else dir_sd = sysfs_get(kobj->sd); if (dir_sd) { - sysfs_hash_and_remove(dir_sd, NULL, attr->name); + sysfs_hash_and_remove(dir_sd, attr->name, NULL); sysfs_put(dir_sd); } } EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); +/** + * sysfs_create_bin_file - create binary file for object. + * @kobj: object. + * @attr: attribute descriptor. + */ +int sysfs_create_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) +{ + BUG_ON(!kobj || !kobj->sd || !attr); + + return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); +} +EXPORT_SYMBOL_GPL(sysfs_create_bin_file); + +/** + * sysfs_remove_bin_file - remove binary file for object. + * @kobj: object. + * @attr: attribute descriptor. + */ +void sysfs_remove_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) +{ + sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL); +} +EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); + struct sysfs_schedule_callback_struct { struct list_head workq_list; struct kobject *kobj; diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 5f92cd2f61c1..1898a10e38ce 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -26,7 +26,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, if (grp->attrs) for (attr = grp->attrs; *attr; attr++) - sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); + sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) sysfs_remove_bin_file(kobj, *bin_attr); @@ -49,16 +49,17 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, * re-adding (if required) the file. */ if (update) - sysfs_hash_and_remove(dir_sd, NULL, - (*attr)->name); + sysfs_hash_and_remove(dir_sd, (*attr)->name, + NULL); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); if (!mode) continue; } - error = sysfs_add_file_mode(dir_sd, *attr, - SYSFS_KOBJ_ATTR, - (*attr)->mode | mode); + error = sysfs_add_file_mode_ns(dir_sd, *attr, + SYSFS_KOBJ_ATTR, + (*attr)->mode | mode, + NULL); if (unlikely(error)) break; } @@ -110,7 +111,7 @@ static int internal_create_group(struct kobject *kobj, int update, error = create_files(sd, kobj, grp, update); if (error) { if (grp->name) - sysfs_remove_subdir(sd); + sysfs_remove(sd); } sysfs_put(sd); return error; @@ -206,7 +207,7 @@ void sysfs_remove_group(struct kobject *kobj, struct sysfs_dirent *sd; if (grp->name) { - sd = sysfs_get_dirent(dir_sd, NULL, grp->name); + sd = sysfs_get_dirent(dir_sd, grp->name); if (!sd) { WARN(!sd, KERN_WARNING "sysfs group %p not found for kobject '%s'\n", @@ -218,7 +219,7 @@ void sysfs_remove_group(struct kobject *kobj, remove_files(sd, kobj, grp); if (grp->name) - sysfs_remove_subdir(sd); + sysfs_remove(sd); sysfs_put(sd); } @@ -261,7 +262,7 @@ int sysfs_merge_group(struct kobject *kobj, struct attribute *const *attr; int i; - dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); + dir_sd = sysfs_get_dirent(kobj->sd, grp->name); if (!dir_sd) return -ENOENT; @@ -269,7 +270,7 @@ int sysfs_merge_group(struct kobject *kobj, error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); if (error) { while (--i >= 0) - sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name); + sysfs_hash_and_remove(dir_sd, (*--attr)->name, NULL); } sysfs_put(dir_sd); @@ -288,10 +289,10 @@ void sysfs_unmerge_group(struct kobject *kobj, struct sysfs_dirent *dir_sd; struct attribute *const *attr; - dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); + dir_sd = sysfs_get_dirent(kobj->sd, grp->name); if (dir_sd) { for (attr = grp->attrs; *attr; ++attr) - sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); + sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); sysfs_put(dir_sd); } } @@ -310,7 +311,7 @@ int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct sysfs_dirent *dir_sd; int error = 0; - dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); + dir_sd = sysfs_get_dirent(kobj->sd, group_name); if (!dir_sd) return -ENOENT; @@ -332,9 +333,9 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, { struct sysfs_dirent *dir_sd; - dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); + dir_sd = sysfs_get_dirent(kobj->sd, group_name); if (dir_sd) { - sysfs_hash_and_remove(dir_sd, NULL, link_name); + sysfs_hash_and_remove(dir_sd, link_name, NULL); sysfs_put(dir_sd); } } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 963f910c8034..1750f790af3b 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -258,9 +258,9 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) inode->i_fop = &sysfs_file_operations; break; case SYSFS_KOBJ_BIN_ATTR: - bin_attr = sd->s_bin_attr.bin_attr; + bin_attr = sd->s_attr.bin_attr; inode->i_size = bin_attr->size; - inode->i_fop = &bin_fops; + inode->i_fop = &sysfs_bin_operations; break; case SYSFS_KOBJ_LINK: inode->i_op = &sysfs_symlink_inode_operations; @@ -314,32 +314,6 @@ void sysfs_evict_inode(struct inode *inode) sysfs_put(sd); } -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, - const char *name) -{ - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - - if (!dir_sd) { - WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n", - name); - return -ENOENT; - } - - sysfs_addrm_start(&acxt, dir_sd); - - sd = sysfs_find_dirent(dir_sd, ns, name); - if (sd) - sysfs_remove_one(&acxt, sd); - - sysfs_addrm_finish(&acxt); - - if (sd) - return 0; - else - return -ENOENT; -} - int sysfs_permission(struct inode *inode, int mask) { struct sysfs_dirent *sd; diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 2dd4507d9edd..3ae3f1bf1a09 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -33,13 +33,15 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, BUG_ON(!name || !parent_sd); - /* target->sd can go away beneath us but is protected with - * sysfs_assoc_lock. Fetch target_sd from it. + /* + * We don't own @target and it may be removed at any time. + * Synchronize using sysfs_symlink_target_lock. See + * sysfs_remove_dir() for details. */ - spin_lock(&sysfs_assoc_lock); + spin_lock(&sysfs_symlink_target_lock); if (target->sd) target_sd = sysfs_get(target->sd); - spin_unlock(&sysfs_assoc_lock); + spin_unlock(&sysfs_symlink_target_lock); error = -ENOENT; if (!target_sd) @@ -52,18 +54,18 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, ns_type = sysfs_ns_type(parent_sd); if (ns_type) - sd->s_ns = target->ktype->namespace(target); + sd->s_ns = target_sd->s_ns; sd->s_symlink.target_sd = target_sd; target_sd = NULL; /* reference is now owned by the symlink */ - sysfs_addrm_start(&acxt, parent_sd); + sysfs_addrm_start(&acxt); /* Symlinks must be between directories with the same ns_type */ if (!ns_type || (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) { if (warn) - error = sysfs_add_one(&acxt, sd); + error = sysfs_add_one(&acxt, sd, parent_sd); else - error = __sysfs_add_one(&acxt, sd); + error = __sysfs_add_one(&acxt, sd, parent_sd); } else { error = -EINVAL; WARN(1, KERN_WARNING @@ -155,11 +157,17 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, const char *name) { const void *ns = NULL; - spin_lock(&sysfs_assoc_lock); + + /* + * We don't own @target and it may be removed at any time. + * Synchronize using sysfs_symlink_target_lock. See + * sysfs_remove_dir() for details. + */ + spin_lock(&sysfs_symlink_target_lock); if (targ->sd && sysfs_ns_type(kobj->sd)) ns = targ->sd->s_ns; - spin_unlock(&sysfs_assoc_lock); - sysfs_hash_and_remove(kobj->sd, ns, name); + spin_unlock(&sysfs_symlink_target_lock); + sysfs_hash_and_remove(kobj->sd, name, ns); } /** @@ -176,24 +184,25 @@ void sysfs_remove_link(struct kobject *kobj, const char *name) else parent_sd = kobj->sd; - sysfs_hash_and_remove(parent_sd, NULL, name); + sysfs_hash_and_remove(parent_sd, name, NULL); } EXPORT_SYMBOL_GPL(sysfs_remove_link); /** - * sysfs_rename_link - rename symlink in object's directory. + * sysfs_rename_link_ns - rename symlink in object's directory. * @kobj: object we're acting for. * @targ: object we're pointing to. * @old: previous name of the symlink. * @new: new name of the symlink. + * @new_ns: new namespace of the symlink. * * A helper function for the common rename symlink idiom. */ -int sysfs_rename_link(struct kobject *kobj, struct kobject *targ, - const char *old, const char *new) +int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, + const char *old, const char *new, const void *new_ns) { struct sysfs_dirent *parent_sd, *sd = NULL; - const void *old_ns = NULL, *new_ns = NULL; + const void *old_ns = NULL; int result; if (!kobj) @@ -205,7 +214,7 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ, old_ns = targ->sd->s_ns; result = -ENOENT; - sd = sysfs_get_dirent(parent_sd, old_ns, old); + sd = sysfs_get_dirent_ns(parent_sd, old, old_ns); if (!sd) goto out; @@ -215,16 +224,13 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ, if (sd->s_symlink.target_sd->s_dir.kobj != targ) goto out; - if (sysfs_ns_type(parent_sd)) - new_ns = targ->ktype->namespace(targ); - - result = sysfs_rename(sd, parent_sd, new_ns, new); + result = sysfs_rename(sd, parent_sd, new, new_ns); out: sysfs_put(sd); return result; } -EXPORT_SYMBOL_GPL(sysfs_rename_link); +EXPORT_SYMBOL_GPL(sysfs_rename_link_ns); static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, struct sysfs_dirent *target_sd, char *path) diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index b6deca3e301d..0af09fbfb3f6 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -29,15 +29,13 @@ struct sysfs_elem_symlink { }; struct sysfs_elem_attr { - struct attribute *attr; + union { + struct attribute *attr; + struct bin_attribute *bin_attr; + }; struct sysfs_open_dirent *open; }; -struct sysfs_elem_bin_attr { - struct bin_attribute *bin_attr; - struct hlist_head buffers; -}; - struct sysfs_inode_attrs { struct iattr ia_iattr; void *ia_secdata; @@ -74,7 +72,6 @@ struct sysfs_dirent { struct sysfs_elem_dir s_dir; struct sysfs_elem_symlink s_symlink; struct sysfs_elem_attr s_attr; - struct sysfs_elem_bin_attr s_bin_attr; }; unsigned short s_flags; @@ -115,6 +112,7 @@ static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd) } #ifdef CONFIG_DEBUG_LOCK_ALLOC + #define sysfs_dirent_init_lockdep(sd) \ do { \ struct attribute *attr = sd->s_attr.attr; \ @@ -124,15 +122,31 @@ do { \ \ lockdep_init_map(&sd->dep_map, "s_active", key, 0); \ } while (0) + +/* Test for attributes that want to ignore lockdep for read-locking */ +static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd) +{ + int type = sysfs_type(sd); + + return (type == SYSFS_KOBJ_ATTR || type == SYSFS_KOBJ_BIN_ATTR) && + sd->s_attr.attr->ignore_lockdep; +} + #else + #define sysfs_dirent_init_lockdep(sd) do {} while (0) + +static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd) +{ + return true; +} + #endif /* * Context structure to be used while adding/removing nodes. */ struct sysfs_addrm_cxt { - struct sysfs_dirent *parent_sd; struct sysfs_dirent *removed; }; @@ -156,38 +170,37 @@ extern struct kmem_cache *sysfs_dir_cachep; * dir.c */ extern struct mutex sysfs_mutex; -extern spinlock_t sysfs_assoc_lock; +extern spinlock_t sysfs_symlink_target_lock; extern const struct dentry_operations sysfs_dentry_ops; extern const struct file_operations sysfs_dir_operations; extern const struct inode_operations sysfs_dir_inode_operations; -struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd); struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); void sysfs_put_active(struct sysfs_dirent *sd); -void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *parent_sd); -int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); -void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); +void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt); +void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name); +int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, + struct sysfs_dirent *parent_sd); +int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, + struct sysfs_dirent *parent_sd); +void sysfs_remove(struct sysfs_dirent *sd); +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name, + const void *ns); void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, - const void *ns, - const unsigned char *name); -struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, - const void *ns, - const unsigned char *name); + const unsigned char *name, + const void *ns); struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); void release_sysfs_dirent(struct sysfs_dirent *sd); int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd); -void sysfs_remove_subdir(struct sysfs_dirent *sd); int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, - const void *ns, const char *new_name); + const char *new_name, const void *new_ns); static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) { @@ -218,25 +231,21 @@ int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, - const char *name); int sysfs_inode_init(void); /* * file.c */ extern const struct file_operations sysfs_file_operations; +extern const struct file_operations sysfs_bin_operations; int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, int type); -int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, umode_t amode); -/* - * bin.c - */ -extern const struct file_operations bin_fops; -void unmap_bin_file(struct sysfs_dirent *attr_sd); +int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, + const struct attribute *attr, int type, + umode_t amode, const void *ns); +void sysfs_unmap_bin_file(struct sysfs_dirent *sd); /* * symlink.c diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 6e025e02ffde..cc1febd8fadf 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2563,9 +2563,9 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; - from = prandom_u32() % (len + 1); - /* Corruption may only span one max. write unit */ - to = min(len, ALIGN(from, c->max_write_size)); + from = prandom_u32() % len; + /* Corruption span max to end of write unit */ + to = min(len, ALIGN(from + 1, c->max_write_size)); ubifs_warn("filled bytes %u-%u with %s", from, to - 1, ffs ? "0xFFs" : "random data"); diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 76ca53cd3eee..9718da86ad01 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -668,8 +668,7 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) ubifs_assert(!wbuf->used); for (i = 0; ; i++) { - int space_before = c->leb_size - wbuf->offs - wbuf->used; - int space_after; + int space_before, space_after; cond_resched(); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 3e4aa7281e04..f69daa514a57 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1630,8 +1630,10 @@ static int ubifs_remount_rw(struct ubifs_info *c) } c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL); - if (!c->write_reserve_buf) + if (!c->write_reserve_buf) { + err = -ENOMEM; goto out; + } err = ubifs_lpt_init(c, 0, 1); if (err) @@ -2064,8 +2066,10 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) } sb->s_root = d_make_root(root); - if (!sb->s_root) + if (!sb->s_root) { + err = -ENOMEM; goto out_umount; + } mutex_unlock(&c->umount_mutex); return 0; diff --git a/fs/udf/super.c b/fs/udf/super.c index 91219385691d..3306b9f69bed 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -76,6 +76,9 @@ #define UDF_DEFAULT_BLOCKSIZE 2048 +#define VSD_FIRST_SECTOR_OFFSET 32768 +#define VSD_MAX_SECTOR_OFFSET 0x800000 + enum { UDF_MAX_LINKS = 0xffff }; /* These are the "meat" - everything else is stuffing */ @@ -685,7 +688,7 @@ out_unlock: static loff_t udf_check_vsd(struct super_block *sb) { struct volStructDesc *vsd = NULL; - loff_t sector = 32768; + loff_t sector = VSD_FIRST_SECTOR_OFFSET; int sectorsize; struct buffer_head *bh = NULL; int nsr02 = 0; @@ -703,8 +706,18 @@ static loff_t udf_check_vsd(struct super_block *sb) udf_debug("Starting at sector %u (%ld byte sectors)\n", (unsigned int)(sector >> sb->s_blocksize_bits), sb->s_blocksize); - /* Process the sequence (if applicable) */ - for (; !nsr02 && !nsr03; sector += sectorsize) { + /* Process the sequence (if applicable). The hard limit on the sector + * offset is arbitrary, hopefully large enough so that all valid UDF + * filesystems will be recognised. There is no mention of an upper + * bound to the size of the volume recognition area in the standard. + * The limit will prevent the code to read all the sectors of a + * specially crafted image (like a bluray disc full of CD001 sectors), + * potentially causing minutes or even hours of uninterruptible I/O + * activity. This actually happened with uninitialised SSD partitions + * (all 0xFF) before the check for the limit and all valid IDs were + * added */ + for (; !nsr02 && !nsr03 && sector < VSD_MAX_SECTOR_OFFSET; + sector += sectorsize) { /* Read a block */ bh = udf_tread(sb, sector >> sb->s_blocksize_bits); if (!bh) @@ -714,10 +727,7 @@ static loff_t udf_check_vsd(struct super_block *sb) vsd = (struct volStructDesc *)(bh->b_data + (sector & (sb->s_blocksize - 1))); - if (vsd->stdIdent[0] == 0) { - brelse(bh); - break; - } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, + if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, VSD_STD_ID_LEN)) { switch (vsd->structType) { case 0: @@ -753,6 +763,17 @@ static loff_t udf_check_vsd(struct super_block *sb) else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR03, VSD_STD_ID_LEN)) nsr03 = sector; + else if (!strncmp(vsd->stdIdent, VSD_STD_ID_BOOT2, + VSD_STD_ID_LEN)) + ; /* nothing */ + else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CDW02, + VSD_STD_ID_LEN)) + ; /* nothing */ + else { + /* invalid id : end of volume recognition area */ + brelse(bh); + break; + } brelse(bh); } @@ -760,7 +781,8 @@ static loff_t udf_check_vsd(struct super_block *sb) return nsr03; else if (nsr02) return nsr02; - else if (sector - (sbi->s_session << sb->s_blocksize_bits) == 32768) + else if (!bh && sector - (sbi->s_session << sb->s_blocksize_bits) == + VSD_FIRST_SECTOR_OFFSET) return -1; else return 0; @@ -1270,6 +1292,9 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) * PHYSICAL partitions are already set up */ type1_idx = i; +#ifdef UDFFS_DEBUG + map = NULL; /* supress 'maybe used uninitialized' warning */ +#endif for (i = 0; i < sbi->s_partitions; i++) { map = &sbi->s_partmaps[i]; @@ -1891,7 +1916,9 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, return 0; } if (nsr_off == -1) - udf_debug("Failed to read byte 32768. Assuming open disc. Skipping validity check\n"); + udf_debug("Failed to read sector at offset %d. " + "Assuming open disc. Skipping validity " + "check\n", VSD_FIRST_SECTOR_OFFSET); if (!sbi->s_last_block) sbi->s_last_block = udf_get_last_block(sb); } else { |