summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig20
-rw-r--r--fs/9p/vfs_file.c5
-rw-r--r--fs/9p/vfs_inode.c2
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/cmservice.c14
-rw-r--r--fs/afs/dir.c108
-rw-r--r--fs/afs/dir_silly.c22
-rw-r--r--fs/afs/fs_probe.c2
-rw-r--r--fs/afs/fsclient.c27
-rw-r--r--fs/afs/internal.h12
-rw-r--r--fs/afs/rxrpc.c74
-rw-r--r--fs/afs/yfsclient.c26
-rw-r--r--fs/autofs/dev-ioctl.c6
-rw-r--r--fs/binfmt_elf.c48
-rw-r--r--fs/block_dev.c24
-rw-r--r--fs/buffer.c62
-rw-r--r--fs/ceph/addr.c90
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c536
-rw-r--r--fs/ceph/debugfs.c16
-rw-r--r--fs/ceph/dir.c132
-rw-r--r--fs/ceph/export.c5
-rw-r--r--fs/ceph/file.c500
-rw-r--r--fs/ceph/inode.c84
-rw-r--r--fs/ceph/ioctl.c2
-rw-r--r--fs/ceph/locks.c31
-rw-r--r--fs/ceph/mds_client.c240
-rw-r--r--fs/ceph/mds_client.h32
-rw-r--r--fs/ceph/snap.c1
-rw-r--r--fs/ceph/super.c28
-rw-r--r--fs/ceph/super.h70
-rw-r--r--fs/cifs/Kconfig2
-rw-r--r--fs/cifs/cifs_debug.c6
-rw-r--r--fs/cifs/cifsacl.c5
-rw-r--r--fs/cifs/cifsfs.c8
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h4
-rw-r--r--fs/cifs/cifspdu.h19
-rw-r--r--fs/cifs/cifsproto.h5
-rw-r--r--fs/cifs/cifssmb.c22
-rw-r--r--fs/cifs/connect.c89
-rw-r--r--fs/cifs/dfs_cache.c38
-rw-r--r--fs/cifs/dfs_cache.h4
-rw-r--r--fs/cifs/file.c63
-rw-r--r--fs/cifs/inode.c51
-rw-r--r--fs/cifs/link.c4
-rw-r--r--fs/cifs/misc.c80
-rw-r--r--fs/cifs/readdir.c82
-rw-r--r--fs/cifs/smb2file.c9
-rw-r--r--fs/cifs/smb2misc.c14
-rw-r--r--fs/cifs/smb2ops.c68
-rw-r--r--fs/cifs/smb2pdu.c202
-rw-r--r--fs/cifs/smb2pdu.h138
-rw-r--r--fs/cifs/smb2proto.h13
-rw-r--r--fs/cifs/smb2transport.c95
-rw-r--r--fs/cifs/smbdirect.c342
-rw-r--r--fs/cifs/smbdirect.h8
-rw-r--r--fs/cifs/transport.c28
-rw-r--r--fs/crypto/fscrypt_private.h20
-rw-r--r--fs/crypto/keysetup.c16
-rw-r--r--fs/crypto/policy.c21
-rw-r--r--fs/dax.c59
-rw-r--r--fs/debugfs/file.c26
-rw-r--r--fs/debugfs/inode.c18
-rw-r--r--fs/efivarfs/super.c2
-rw-r--r--fs/erofs/decompressor.c22
-rw-r--r--fs/erofs/internal.h8
-rw-r--r--fs/erofs/super.c2
-rw-r--r--fs/erofs/utils.c90
-rw-r--r--fs/erofs/zdata.c76
-rw-r--r--fs/eventpoll.c64
-rw-r--r--fs/exec.c106
-rw-r--r--fs/exfat/Kconfig21
-rw-r--r--fs/exfat/Makefile8
-rw-r--r--fs/exfat/balloc.c280
-rw-r--r--fs/exfat/cache.c325
-rw-r--r--fs/exfat/dir.c1238
-rw-r--r--fs/exfat/exfat_fs.h519
-rw-r--r--fs/exfat/exfat_raw.h184
-rw-r--r--fs/exfat/fatent.c463
-rw-r--r--fs/exfat/file.c360
-rw-r--r--fs/exfat/inode.c671
-rw-r--r--fs/exfat/misc.c163
-rw-r--r--fs/exfat/namei.c1448
-rw-r--r--fs/exfat/nls.c831
-rw-r--r--fs/exfat/super.c722
-rw-r--r--fs/ext2/xattr.c18
-rw-r--r--fs/ext2/xattr.h2
-rw-r--r--fs/ext4/balloc.c7
-rw-r--r--fs/ext4/block_validity.c18
-rw-r--r--fs/ext4/dir.c2
-rw-r--r--fs/ext4/ext4.h59
-rw-r--r--fs/ext4/ext4_jbd2.c14
-rw-r--r--fs/ext4/ext4_jbd2.h3
-rw-r--r--fs/ext4/extents.c480
-rw-r--r--fs/ext4/file.c1
-rw-r--r--fs/ext4/ialloc.c36
-rw-r--r--fs/ext4/indirect.c2
-rw-r--r--fs/ext4/inline.c54
-rw-r--r--fs/ext4/inode.c57
-rw-r--r--fs/ext4/ioctl.c18
-rw-r--r--fs/ext4/mballoc.c38
-rw-r--r--fs/ext4/mmp.c13
-rw-r--r--fs/ext4/move_extent.c4
-rw-r--r--fs/ext4/namei.c28
-rw-r--r--fs/ext4/page-io.c8
-rw-r--r--fs/ext4/super.c220
-rw-r--r--fs/ext4/sysfs.c1
-rw-r--r--fs/ext4/xattr.c10
-rw-r--r--fs/ext4/xattr.h4
-rw-r--r--fs/f2fs/Kconfig9
-rw-r--r--fs/f2fs/checkpoint.c42
-rw-r--r--fs/f2fs/compress.c317
-rw-r--r--fs/f2fs/data.c141
-rw-r--r--fs/f2fs/debug.c3
-rw-r--r--fs/f2fs/dir.c16
-rw-r--r--fs/f2fs/f2fs.h207
-rw-r--r--fs/f2fs/file.c102
-rw-r--r--fs/f2fs/gc.c51
-rw-r--r--fs/f2fs/inode.c29
-rw-r--r--fs/f2fs/namei.c12
-rw-r--r--fs/f2fs/node.c33
-rw-r--r--fs/f2fs/recovery.c12
-rw-r--r--fs/f2fs/segment.c54
-rw-r--r--fs/f2fs/segment.h2
-rw-r--r--fs/f2fs/shrinker.c2
-rw-r--r--fs/f2fs/super.c90
-rw-r--r--fs/f2fs/sysfs.c50
-rw-r--r--fs/f2fs/xattr.c67
-rw-r--r--fs/f2fs/xattr.h9
-rw-r--r--fs/filesystems.c4
-rw-r--r--fs/fs_parser.c2
-rw-r--r--fs/gfs2/acl.c7
-rw-r--r--fs/gfs2/aops.c11
-rw-r--r--fs/gfs2/bmap.c9
-rw-r--r--fs/gfs2/dir.c3
-rw-r--r--fs/gfs2/file.c43
-rw-r--r--fs/gfs2/glock.c137
-rw-r--r--fs/gfs2/glops.c157
-rw-r--r--fs/gfs2/incore.h27
-rw-r--r--fs/gfs2/inode.c53
-rw-r--r--fs/gfs2/lock_dlm.c52
-rw-r--r--fs/gfs2/log.c288
-rw-r--r--fs/gfs2/log.h1
-rw-r--r--fs/gfs2/lops.c14
-rw-r--r--fs/gfs2/meta_io.c3
-rw-r--r--fs/gfs2/ops_fstype.c59
-rw-r--r--fs/gfs2/quota.c76
-rw-r--r--fs/gfs2/quota.h4
-rw-r--r--fs/gfs2/recovery.c12
-rw-r--r--fs/gfs2/rgrp.c88
-rw-r--r--fs/gfs2/rgrp.h4
-rw-r--r--fs/gfs2/super.c112
-rw-r--r--fs/gfs2/super.h1
-rw-r--r--fs/gfs2/sys.c5
-rw-r--r--fs/gfs2/trans.c4
-rw-r--r--fs/gfs2/util.c419
-rw-r--r--fs/gfs2/util.h76
-rw-r--r--fs/gfs2/xattr.c12
-rw-r--r--fs/hfsplus/attributes.c4
-rw-r--r--fs/hostfs/hostfs_kern.c12
-rw-r--r--fs/hugetlbfs/inode.c30
-rw-r--r--fs/internal.h2
-rw-r--r--fs/io-wq.c380
-rw-r--r--fs/io-wq.h67
-rw-r--r--fs/io_uring.c2365
-rw-r--r--fs/iomap/buffered-io.c24
-rw-r--r--fs/iomap/direct-io.c4
-rw-r--r--fs/iomap/trace.h27
-rw-r--r--fs/jbd2/commit.c7
-rw-r--r--fs/kernfs/inode.c91
-rw-r--r--fs/kernfs/kernfs-internal.h2
-rw-r--r--fs/libfs.c8
-rw-r--r--fs/namei.c1474
-rw-r--r--fs/namespace.c96
-rw-r--r--fs/nfs/Kconfig2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/callback.h4
-rw-r--r--fs/nfs/callback_proc.c69
-rw-r--r--fs/nfs/delegation.c319
-rw-r--r--fs/nfs/dir.c81
-rw-r--r--fs/nfs/direct.c197
-rw-r--r--fs/nfs/dns_resolve.c11
-rw-r--r--fs/nfs/filelayout/filelayout.c165
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c229
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h2
-rw-r--r--fs/nfs/fs_context.c9
-rw-r--r--fs/nfs/getroot.c39
-rw-r--r--fs/nfs/inode.c28
-rw-r--r--fs/nfs/internal.h36
-rw-r--r--fs/nfs/namespace.c67
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4file.c3
-rw-r--r--fs/nfs/nfs4namespace.c2
-rw-r--r--fs/nfs/nfs4proc.c31
-rw-r--r--fs/nfs/nfs4state.c24
-rw-r--r--fs/nfs/nfs4trace.h8
-rw-r--r--fs/nfs/nfsroot.c2
-rw-r--r--fs/nfs/nfstrace.h1
-rw-r--r--fs/nfs/pagelist.c367
-rw-r--r--fs/nfs/pnfs.c244
-rw-r--r--fs/nfs/pnfs.h143
-rw-r--r--fs/nfs/pnfs_nfs.c515
-rw-r--r--fs/nfs/read.c2
-rw-r--r--fs/nfs/super.c60
-rw-r--r--fs/nfs/unlink.c4
-rw-r--r--fs/nfs/write.c276
-rw-r--r--fs/nfsd/Kconfig2
-rw-r--r--fs/nfsd/export.c45
-rw-r--r--fs/nfsd/filecache.c2
-rw-r--r--fs/nfsd/netns.h2
-rw-r--r--fs/nfsd/nfs4idmap.c14
-rw-r--r--fs/nfsd/nfs4state.c87
-rw-r--r--fs/nfsd/nfs4xdr.c38
-rw-r--r--fs/nfsd/nfsctl.c1
-rw-r--r--fs/nfsd/nfsfh.c13
-rw-r--r--fs/nfsd/nfssvc.c3
-rw-r--r--fs/nfsd/trace.h122
-rw-r--r--fs/notify/fanotify/fanotify.c302
-rw-r--r--fs/notify/fanotify/fanotify.h189
-rw-r--r--fs/notify/fanotify/fanotify_user.c220
-rw-r--r--fs/notify/fsnotify.c22
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c12
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/nsfs.c14
-rw-r--r--fs/ntfs/aops.c9
-rw-r--r--fs/ocfs2/alloc.c7
-rw-r--r--fs/ocfs2/cluster/heartbeat.c12
-rw-r--r--fs/ocfs2/cluster/netdebug.c4
-rw-r--r--fs/ocfs2/cluster/tcp.c27
-rw-r--r--fs/ocfs2/cluster/tcp.h2
-rw-r--r--fs/ocfs2/dir.c4
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h8
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c100
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c2
-rw-r--r--fs/ocfs2/dlm/dlmthread.c3
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/journal.c2
-rw-r--r--fs/ocfs2/namei.c15
-rw-r--r--fs/ocfs2/ocfs2_fs.h18
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/reservations.c3
-rw-r--r--fs/ocfs2/stackglue.c2
-rw-r--r--fs/ocfs2/suballoc.c5
-rw-r--r--fs/ocfs2/super.c46
-rw-r--r--fs/open.c4
-rw-r--r--fs/orangefs/file.c34
-rw-r--r--fs/orangefs/inode.c39
-rw-r--r--fs/orangefs/orangefs-kernel.h4
-rw-r--r--fs/overlayfs/copy_up.c16
-rw-r--r--fs/overlayfs/dir.c31
-rw-r--r--fs/overlayfs/export.c40
-rw-r--r--fs/overlayfs/inode.c99
-rw-r--r--fs/overlayfs/namei.c5
-rw-r--r--fs/overlayfs/overlayfs.h25
-rw-r--r--fs/overlayfs/ovl_entry.h2
-rw-r--r--fs/overlayfs/readdir.c25
-rw-r--r--fs/overlayfs/super.c258
-rw-r--r--fs/overlayfs/util.c40
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/proc/array.c39
-rw-r--r--fs/proc/base.c121
-rw-r--r--fs/proc/cpuinfo.c1
-rw-r--r--fs/proc/generic.c31
-rw-r--r--fs/proc/inode.c261
-rw-r--r--fs/proc/internal.h10
-rw-r--r--fs/proc/kmsg.c1
-rw-r--r--fs/proc/proc_sysctl.c45
-rw-r--r--fs/proc/root.c36
-rw-r--r--fs/proc/stat.c1
-rw-r--r--fs/proc/task_mmu.c95
-rw-r--r--fs/pstore/inode.c5
-rw-r--r--fs/pstore/platform.c4
-rw-r--r--fs/pstore/ram.c1
-rw-r--r--fs/pstore/ram_core.c2
-rw-r--r--fs/read_write.c3
-rw-r--r--fs/reiserfs/do_balan.c2
-rw-r--r--fs/reiserfs/ioctl.c11
-rw-r--r--fs/reiserfs/journal.c5
-rw-r--r--fs/reiserfs/namei.c10
-rw-r--r--fs/seq_file.c35
-rw-r--r--fs/splice.c6
-rw-r--r--fs/sysfs/file.c148
-rw-r--r--fs/sysfs/group.c135
-rw-r--r--fs/ubifs/io.c16
-rw-r--r--fs/ubifs/ioctl.c4
-rw-r--r--fs/ubifs/journal.c1
-rw-r--r--fs/ubifs/orphan.c13
-rw-r--r--fs/udf/ecma_167.h2
-rw-r--r--fs/udf/osta_udf.h2
-rw-r--r--fs/udf/udf_sb.h2
-rw-r--r--fs/unicode/.gitignore1
-rw-r--r--fs/userfaultfd.c168
-rw-r--r--fs/xattr.c17
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/libxfs/xfs_ag.c16
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c99
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h9
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c119
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h7
-rw-r--r--fs/xfs/libxfs/xfs_attr.c351
-rw-r--r--fs/xfs/libxfs/xfs_attr.h114
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c130
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h1
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c88
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h3
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c50
-rw-r--r--fs/xfs/libxfs/xfs_btree.c93
-rw-r--r--fs/xfs/libxfs/xfs_btree.h82
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.c879
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.h123
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c17
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h11
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h12
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c33
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c32
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c11
-rw-r--r--fs/xfs/libxfs/xfs_format.h48
-rw-r--r--fs/xfs/libxfs/xfs_fs.h32
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c35
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c104
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h6
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c43
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h5
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c2
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h9
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h10
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c110
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c104
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.h6
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c123
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c99
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h5
-rw-r--r--fs/xfs/libxfs/xfs_sb.c49
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c2
-rw-r--r--fs/xfs/scrub/agheader.c20
-rw-r--r--fs/xfs/scrub/agheader_repair.c78
-rw-r--r--fs/xfs/scrub/alloc.c2
-rw-r--r--fs/xfs/scrub/attr.c20
-rw-r--r--fs/xfs/scrub/bitmap.c87
-rw-r--r--fs/xfs/scrub/bitmap.h23
-rw-r--r--fs/xfs/scrub/bmap.c4
-rw-r--r--fs/xfs/scrub/dabtree.c42
-rw-r--r--fs/xfs/scrub/dir.c13
-rw-r--r--fs/xfs/scrub/ialloc.c8
-rw-r--r--fs/xfs/scrub/refcount.c2
-rw-r--r--fs/xfs/scrub/repair.c28
-rw-r--r--fs/xfs/scrub/repair.h6
-rw-r--r--fs/xfs/scrub/rmap.c2
-rw-r--r--fs/xfs/scrub/scrub.c9
-rw-r--r--fs/xfs/scrub/trace.c4
-rw-r--r--fs/xfs/scrub/trace.h4
-rw-r--r--fs/xfs/xfs_acl.c132
-rw-r--r--fs/xfs/xfs_acl.h6
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/xfs/xfs_attr_inactive.c6
-rw-r--r--fs/xfs/xfs_attr_list.c169
-rw-r--r--fs/xfs/xfs_bmap_util.c73
-rw-r--r--fs/xfs/xfs_buf.c40
-rw-r--r--fs/xfs/xfs_buf.h2
-rw-r--r--fs/xfs/xfs_buf_item.c2
-rw-r--r--fs/xfs/xfs_dir2_readdir.c12
-rw-r--r--fs/xfs/xfs_discard.c7
-rw-r--r--fs/xfs/xfs_dquot.c10
-rw-r--r--fs/xfs/xfs_dquot_item.c47
-rw-r--r--fs/xfs/xfs_dquot_item.h1
-rw-r--r--fs/xfs/xfs_error.c7
-rw-r--r--fs/xfs/xfs_error.h2
-rw-r--r--fs/xfs/xfs_export.c14
-rw-r--r--fs/xfs/xfs_file.c16
-rw-r--r--fs/xfs/xfs_fsmap.c13
-rw-r--r--fs/xfs/xfs_icache.c4
-rw-r--r--fs/xfs/xfs_inode.c231
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_inode_item.c47
-rw-r--r--fs/xfs/xfs_ioctl.c355
-rw-r--r--fs/xfs/xfs_ioctl.h35
-rw-r--r--fs/xfs/xfs_ioctl32.c99
-rw-r--r--fs/xfs/xfs_iops.c25
-rw-r--r--fs/xfs/xfs_itable.c6
-rw-r--r--fs/xfs/xfs_linux.h27
-rw-r--r--fs/xfs/xfs_log.c818
-rw-r--r--fs/xfs/xfs_log.h9
-rw-r--r--fs/xfs/xfs_log_cil.c113
-rw-r--r--fs/xfs/xfs_log_priv.h84
-rw-r--r--fs/xfs/xfs_log_recover.c18
-rw-r--r--fs/xfs/xfs_mount.c2
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_qm.c69
-rw-r--r--fs/xfs/xfs_qm_syscalls.c13
-rw-r--r--fs/xfs/xfs_quota.h4
-rw-r--r--fs/xfs/xfs_stats.c10
-rw-r--r--fs/xfs/xfs_super.c17
-rw-r--r--fs/xfs/xfs_symlink.c7
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trace.h224
-rw-r--r--fs/xfs/xfs_trans.c34
-rw-r--r--fs/xfs/xfs_trans_ail.c93
-rw-r--r--fs/xfs/xfs_trans_priv.h6
-rw-r--r--fs/xfs/xfs_xattr.c92
-rw-r--r--fs/zonefs/super.c28
404 files changed, 22772 insertions, 9869 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index ac2ec4543fe1..09fd4a185fd2 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -32,13 +32,13 @@ endif
config 9P_FS_SECURITY
- bool "9P Security Labels"
- depends on 9P_FS
- help
- Security labels support alternative access control models
- implemented by security modules like SELinux. This option
- enables an extended attribute handler for file security
- labels in the 9P filesystem.
-
- If you are not using a security module that requires using
- extended attributes for file security labels, say N.
+ bool "9P Security Labels"
+ depends on 9P_FS
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the 9P filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index fe7f0bd2048e..92cd1d80218d 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -388,7 +388,10 @@ v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n",
iov_iter_count(to), iocb->ki_pos);
- ret = p9_client_read(fid, iocb->ki_pos, to, &err);
+ if (iocb->ki_filp->f_flags & O_NONBLOCK)
+ ret = p9_client_read_once(fid, iocb->ki_pos, to, &err);
+ else
+ ret = p9_client_read(fid, iocb->ki_pos, to, &err);
if (!ret)
return err;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b82423a72f68..c9255d399917 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -143,7 +143,7 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
default:
p9_debug(P9_DEBUG_ERROR, "Unknown special type %c %s\n",
type, stat->extension);
- };
+ }
*rdev = MKDEV(major, minor);
} else
res |= S_IFREG;
diff --git a/fs/Kconfig b/fs/Kconfig
index 708ba336e689..f08fbbfafd9a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -140,9 +140,10 @@ endmenu
endif # BLOCK
if BLOCK
-menu "DOS/FAT/NT Filesystems"
+menu "DOS/FAT/EXFAT/NT Filesystems"
source "fs/fat/Kconfig"
+source "fs/exfat/Kconfig"
source "fs/ntfs/Kconfig"
endmenu
diff --git a/fs/Makefile b/fs/Makefile
index 505e51166973..2ce5112b02c8 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -83,6 +83,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlbfs/
obj-$(CONFIG_CODA_FS) += coda/
obj-$(CONFIG_MINIX_FS) += minix/
obj-$(CONFIG_FAT_FS) += fat/
+obj-$(CONFIG_EXFAT_FS) += exfat/
obj-$(CONFIG_BFS_FS) += bfs/
obj-$(CONFIG_ISO9660_FS) += isofs/
obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index ff3994a6be23..6765949b3aab 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -244,6 +244,17 @@ static void afs_cm_destructor(struct afs_call *call)
}
/*
+ * Abort a service call from within an action function.
+ */
+static void afs_abort_service_call(struct afs_call *call, u32 abort_code, int error,
+ const char *why)
+{
+ rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
+ abort_code, error, why);
+ afs_set_call_complete(call, error, 0);
+}
+
+/*
* The server supplied a list of callbacks that it wanted to break.
*/
static void SRXAFSCB_CallBack(struct work_struct *work)
@@ -510,8 +521,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0)
afs_send_empty_reply(call);
else
- rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- 1, 1, "K-1");
+ afs_abort_service_call(call, 1, 1, "K-1");
afs_put_call(call);
_leave("");
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 5c794f4b051a..d1e1caa23c8b 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1032,7 +1032,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
struct dentry *parent;
struct inode *inode;
struct key *key;
- afs_dataversion_t dir_version;
+ afs_dataversion_t dir_version, invalid_before;
long de_version;
int ret;
@@ -1084,8 +1084,8 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
if (de_version == (long)dir_version)
goto out_valid_noupdate;
- dir_version = dir->invalid_before;
- if (de_version - (long)dir_version >= 0)
+ invalid_before = dir->invalid_before;
+ if (de_version - (long)invalid_before >= 0)
goto out_valid;
_debug("dir modified");
@@ -1275,6 +1275,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct afs_fs_cursor fc;
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct key *key;
+ afs_dataversion_t data_version;
int ret;
mode |= S_IFDIR;
@@ -1295,7 +1296,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1316,10 +1317,14 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto error_key;
}
- if (ret == 0 &&
- test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
- afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
- afs_edit_dir_for_create);
+ if (ret == 0) {
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
+ afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
+ afs_edit_dir_for_create);
+ up_write(&dvnode->validate_lock);
+ }
key_put(key);
kfree(scb);
@@ -1360,6 +1365,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
struct afs_fs_cursor fc;
struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL;
struct key *key;
+ afs_dataversion_t data_version;
int ret;
_enter("{%llx:%llu},{%pd}",
@@ -1391,7 +1397,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1404,9 +1410,12 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
ret = afs_end_vnode_operation(&fc);
if (ret == 0) {
afs_dir_remove_subdir(dentry);
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
afs_edit_dir_remove(dvnode, &dentry->d_name,
afs_edit_dir_for_rmdir);
+ up_write(&dvnode->validate_lock);
}
}
@@ -1544,10 +1553,15 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
ret = afs_end_vnode_operation(&fc);
if (ret == 0 && !(scb[1].have_status || scb[1].have_error))
ret = afs_dir_remove_link(dvnode, dentry, key);
- if (ret == 0 &&
- test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
- afs_edit_dir_remove(dvnode, &dentry->d_name,
- afs_edit_dir_for_unlink);
+
+ if (ret == 0) {
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
+ afs_edit_dir_remove(dvnode, &dentry->d_name,
+ afs_edit_dir_for_unlink);
+ up_write(&dvnode->validate_lock);
+ }
}
if (need_rehash && ret < 0 && ret != -ENOENT)
@@ -1573,6 +1587,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct afs_status_cb *scb;
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct key *key;
+ afs_dataversion_t data_version;
int ret;
mode |= S_IFREG;
@@ -1597,7 +1612,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1618,9 +1633,12 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
goto error_key;
}
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
afs_edit_dir_for_create);
+ up_write(&dvnode->validate_lock);
kfree(scb);
key_put(key);
@@ -1648,6 +1666,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct afs_vnode *vnode = AFS_FS_I(d_inode(from));
struct key *key;
+ afs_dataversion_t data_version;
int ret;
_enter("{%llx:%llu},{%llx:%llu},{%pd}",
@@ -1672,7 +1691,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
if (mutex_lock_interruptible_nested(&vnode->io_lock, 1) < 0) {
afs_end_vnode_operation(&fc);
@@ -1702,9 +1721,12 @@ static int afs_link(struct dentry *from, struct inode *dir,
goto error_key;
}
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
afs_edit_dir_add(dvnode, &dentry->d_name, &vnode->fid,
afs_edit_dir_for_link);
+ up_write(&dvnode->validate_lock);
key_put(key);
kfree(scb);
@@ -1732,6 +1754,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
struct afs_status_cb *scb;
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct key *key;
+ afs_dataversion_t data_version;
int ret;
_enter("{%llx:%llu},{%pd},%s",
@@ -1759,7 +1782,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1780,9 +1803,12 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
goto error_key;
}
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
afs_edit_dir_for_symlink);
+ up_write(&dvnode->validate_lock);
key_put(key);
kfree(scb);
@@ -1812,6 +1838,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct dentry *tmp = NULL, *rehash = NULL;
struct inode *new_inode;
struct key *key;
+ afs_dataversion_t orig_data_version;
+ afs_dataversion_t new_data_version;
bool new_negative = d_is_negative(new_dentry);
int ret;
@@ -1890,10 +1918,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, orig_dvnode, key, true)) {
- afs_dataversion_t orig_data_version;
- afs_dataversion_t new_data_version;
- struct afs_status_cb *new_scb = &scb[1];
-
orig_data_version = orig_dvnode->status.data_version + 1;
if (orig_dvnode != new_dvnode) {
@@ -1904,7 +1928,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_data_version = new_dvnode->status.data_version + 1;
} else {
new_data_version = orig_data_version;
- new_scb = &scb[0];
}
while (afs_select_fileserver(&fc)) {
@@ -1912,7 +1935,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
fc.cb_break_2 = afs_calc_vnode_cb_break(new_dvnode);
afs_fs_rename(&fc, old_dentry->d_name.name,
new_dvnode, new_dentry->d_name.name,
- &scb[0], new_scb);
+ &scb[0], &scb[1]);
}
afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break,
@@ -1930,18 +1953,25 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret == 0) {
if (rehash)
d_rehash(rehash);
- if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags))
- afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name,
- afs_edit_dir_for_rename_0);
+ down_write(&orig_dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+ orig_dvnode->status.data_version == orig_data_version)
+ afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name,
+ afs_edit_dir_for_rename_0);
+ if (orig_dvnode != new_dvnode) {
+ up_write(&orig_dvnode->validate_lock);
- if (!new_negative &&
- test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags))
- afs_edit_dir_remove(new_dvnode, &new_dentry->d_name,
- afs_edit_dir_for_rename_1);
+ down_write(&new_dvnode->validate_lock);
+ }
+ if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) &&
+ orig_dvnode->status.data_version == new_data_version) {
+ if (!new_negative)
+ afs_edit_dir_remove(new_dvnode, &new_dentry->d_name,
+ afs_edit_dir_for_rename_1);
- if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags))
afs_edit_dir_add(new_dvnode, &new_dentry->d_name,
&vnode->fid, afs_edit_dir_for_rename_2);
+ }
new_inode = d_inode(new_dentry);
if (new_inode) {
@@ -1957,14 +1987,10 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
* Note that if we ever implement RENAME_EXCHANGE, we'll have
* to update both dentries with opposing dir versions.
*/
- if (new_dvnode != orig_dvnode) {
- afs_update_dentry_version(&fc, old_dentry, &scb[1]);
- afs_update_dentry_version(&fc, new_dentry, &scb[1]);
- } else {
- afs_update_dentry_version(&fc, old_dentry, &scb[0]);
- afs_update_dentry_version(&fc, new_dentry, &scb[0]);
- }
+ afs_update_dentry_version(&fc, old_dentry, &scb[1]);
+ afs_update_dentry_version(&fc, new_dentry, &scb[1]);
d_move(old_dentry, new_dentry);
+ up_write(&new_dvnode->validate_lock);
goto error_tmp;
}
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 361088a5edb9..d94e2b7cddff 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -21,6 +21,7 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
{
struct afs_fs_cursor fc;
struct afs_status_cb *scb;
+ afs_dataversion_t dir_data_version;
int ret = -ERESTARTSYS;
_enter("%pd,%pd", old, new);
@@ -31,7 +32,7 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
trace_afs_silly_rename(vnode, false);
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t dir_data_version = dvnode->status.data_version + 1;
+ dir_data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -54,12 +55,15 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
dvnode->silly_key = key_get(key);
}
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == dir_data_version) {
afs_edit_dir_remove(dvnode, &old->d_name,
afs_edit_dir_for_silly_0);
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
afs_edit_dir_add(dvnode, &new->d_name,
&vnode->fid, afs_edit_dir_for_silly_1);
+ }
+ up_write(&dvnode->validate_lock);
}
kfree(scb);
@@ -181,10 +185,14 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
}
}
- if (ret == 0 &&
- test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
- afs_edit_dir_remove(dvnode, &dentry->d_name,
- afs_edit_dir_for_unlink);
+ if (ret == 0) {
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == dir_data_version)
+ afs_edit_dir_remove(dvnode, &dentry->d_name,
+ afs_edit_dir_for_unlink);
+ up_write(&dvnode->validate_lock);
+ }
}
kfree(scb);
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index cfe62b154f68..e1b9ed679045 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -145,6 +145,7 @@ static int afs_do_probe_fileserver(struct afs_net *net,
read_lock(&server->fs_lock);
ac.alist = rcu_dereference_protected(server->addresses,
lockdep_is_held(&server->fs_lock));
+ afs_get_addrlist(ac.alist);
read_unlock(&server->fs_lock);
atomic_set(&server->probe_outstanding, ac.alist->nr_addrs);
@@ -163,6 +164,7 @@ static int afs_do_probe_fileserver(struct afs_net *net,
if (!in_progress)
afs_fs_probe_done(server);
+ afs_put_addrlist(ac.alist);
return in_progress;
}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 1f9c5d8e6fe5..68fc46634346 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -65,6 +65,7 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
bool inline_error = (call->operation_ID == afs_FS_InlineBulkStatus);
u64 data_version, size;
u32 type, abort_code;
+ int ret;
abort_code = ntohl(xdr->abort_code);
@@ -78,7 +79,7 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
*/
status->abort_code = abort_code;
scb->have_error = true;
- return 0;
+ goto good;
}
pr_warn("Unknown AFSFetchStatus version %u\n", ntohl(xdr->if_version));
@@ -87,7 +88,8 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
if (abort_code != 0 && inline_error) {
status->abort_code = abort_code;
- return 0;
+ scb->have_error = true;
+ goto good;
}
type = ntohl(xdr->type);
@@ -123,13 +125,16 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
data_version |= (u64)ntohl(xdr->data_version_hi) << 32;
status->data_version = data_version;
scb->have_status = true;
-
+good:
+ ret = 0;
+advance:
*_bp = (const void *)*_bp + sizeof(*xdr);
- return 0;
+ return ret;
bad:
xdr_dump_bad(*_bp);
- return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+ ret = afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+ goto advance;
}
static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry)
@@ -981,16 +986,16 @@ static int afs_deliver_fs_rename(struct afs_call *call)
if (ret < 0)
return ret;
- /* unmarshall the reply once we've received all of it */
+ /* If the two dirs are the same, we have two copies of the same status
+ * report, so we just decode it twice.
+ */
bp = call->buffer;
ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
if (ret < 0)
return ret;
- if (call->out_dir_scb != call->out_scb) {
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
- }
+ ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+ if (ret < 0)
+ return ret;
xdr_decode_AFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 35f951ac296f..ef732dd4e7ef 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -154,7 +154,7 @@ struct afs_call {
};
unsigned char unmarshall; /* unmarshalling phase */
unsigned char addr_ix; /* Address in ->alist */
- bool incoming; /* T if incoming call */
+ bool drop_ref; /* T if need to drop ref for incoming call */
bool send_pages; /* T if data from mapping should be sent */
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
@@ -1209,8 +1209,16 @@ static inline void afs_set_call_complete(struct afs_call *call,
ok = true;
}
spin_unlock_bh(&call->state_lock);
- if (ok)
+ if (ok) {
trace_afs_call_done(call);
+
+ /* Asynchronous calls have two refs to release - one from the alloc and
+ * one queued with the work item - and we can't just deallocate the
+ * call because the work item may be queued again.
+ */
+ if (call->drop_ref)
+ afs_put_call(call);
+ }
}
/*
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 58d396592250..1ecc67da6c1a 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -18,7 +18,6 @@ struct workqueue_struct *afs_async_calls;
static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long);
-static void afs_delete_async_call(struct work_struct *);
static void afs_process_async_call(struct work_struct *);
static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long);
@@ -169,7 +168,7 @@ void afs_put_call(struct afs_call *call)
int n = atomic_dec_return(&call->usage);
int o = atomic_read(&net->nr_outstanding_calls);
- trace_afs_call(call, afs_call_trace_put, n + 1, o,
+ trace_afs_call(call, afs_call_trace_put, n, o,
__builtin_return_address(0));
ASSERTCMP(n, >=, 0);
@@ -402,8 +401,10 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
/* If the call is going to be asynchronous, we need an extra ref for
* the call to hold itself so the caller need not hang on to its ref.
*/
- if (call->async)
+ if (call->async) {
afs_get_call(call, afs_call_trace_get);
+ call->drop_ref = true;
+ }
/* create a call */
rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key,
@@ -413,7 +414,8 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
afs_wake_up_async_call :
afs_wake_up_call_waiter),
call->upgrade,
- call->intr,
+ (call->intr ? RXRPC_PREINTERRUPTIBLE :
+ RXRPC_UNINTERRUPTIBLE),
call->debug_id);
if (IS_ERR(rxcall)) {
ret = PTR_ERR(rxcall);
@@ -584,8 +586,6 @@ static void afs_deliver_to_call(struct afs_call *call)
done:
if (call->type->done)
call->type->done(call);
- if (state == AFS_CALL_COMPLETE && call->incoming)
- afs_put_call(call);
out:
_leave("");
return;
@@ -604,11 +604,7 @@ call_complete:
long afs_wait_for_call_to_complete(struct afs_call *call,
struct afs_addr_cursor *ac)
{
- signed long rtt2, timeout;
long ret;
- bool stalled = false;
- u64 rtt;
- u32 life, last_life;
bool rxrpc_complete = false;
DECLARE_WAITQUEUE(myself, current);
@@ -619,14 +615,6 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
if (ret < 0)
goto out;
- rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
- rtt2 = nsecs_to_jiffies64(rtt) * 2;
- if (rtt2 < 2)
- rtt2 = 2;
-
- timeout = rtt2;
- rxrpc_kernel_check_life(call->net->socket, call->rxcall, &last_life);
-
add_wait_queue(&call->waitq, &myself);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -637,37 +625,19 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
call->need_attention = false;
__set_current_state(TASK_RUNNING);
afs_deliver_to_call(call);
- timeout = rtt2;
continue;
}
if (afs_check_call_state(call, AFS_CALL_COMPLETE))
break;
- if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall, &life)) {
+ if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) {
/* rxrpc terminated the call. */
rxrpc_complete = true;
break;
}
- if (call->intr && timeout == 0 &&
- life == last_life && signal_pending(current)) {
- if (stalled)
- break;
- __set_current_state(TASK_RUNNING);
- rxrpc_kernel_probe_life(call->net->socket, call->rxcall);
- timeout = rtt2;
- stalled = true;
- continue;
- }
-
- if (life != last_life) {
- timeout = rtt2;
- last_life = life;
- stalled = false;
- }
-
- timeout = schedule_timeout(timeout);
+ schedule();
}
remove_wait_queue(&call->waitq, &myself);
@@ -735,7 +705,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
u = atomic_fetch_add_unless(&call->usage, 1, 0);
if (u != 0) {
- trace_afs_call(call, afs_call_trace_wake, u,
+ trace_afs_call(call, afs_call_trace_wake, u + 1,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
@@ -745,21 +715,6 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
}
/*
- * Delete an asynchronous call. The work item carries a ref to the call struct
- * that we need to release.
- */
-static void afs_delete_async_call(struct work_struct *work)
-{
- struct afs_call *call = container_of(work, struct afs_call, async_work);
-
- _enter("");
-
- afs_put_call(call);
-
- _leave("");
-}
-
-/*
* Perform I/O processing on an asynchronous call. The work item carries a ref
* to the call struct that we either need to release or to pass on.
*/
@@ -774,16 +729,6 @@ static void afs_process_async_call(struct work_struct *work)
afs_deliver_to_call(call);
}
- if (call->state == AFS_CALL_COMPLETE) {
- /* We have two refs to release - one from the alloc and one
- * queued with the work item - and we can't just deallocate the
- * call because the work item may be queued again.
- */
- call->async_work.func = afs_delete_async_call;
- if (!queue_work(afs_async_calls, &call->async_work))
- afs_put_call(call);
- }
-
afs_put_call(call);
_leave("");
}
@@ -810,6 +755,7 @@ void afs_charge_preallocation(struct work_struct *work)
if (!call)
break;
+ call->drop_ref = true;
call->async = true;
call->state = AFS_CALL_SV_AWAIT_OP_ID;
init_waitqueue_head(&call->waitq);
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index a26126ac7bf1..b5b45c57e1b1 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -165,15 +165,15 @@ static void xdr_dump_bad(const __be32 *bp)
int i;
pr_notice("YFS XDR: Bad status record\n");
- for (i = 0; i < 5 * 4 * 4; i += 16) {
+ for (i = 0; i < 6 * 4 * 4; i += 16) {
memcpy(x, bp, 16);
bp += 4;
pr_notice("%03x: %08x %08x %08x %08x\n",
i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3]));
}
- memcpy(x, bp, 4);
- pr_notice("0x50: %08x\n", ntohl(x[0]));
+ memcpy(x, bp, 8);
+ pr_notice("0x60: %08x %08x\n", ntohl(x[0]), ntohl(x[1]));
}
/*
@@ -186,13 +186,14 @@ static int xdr_decode_YFSFetchStatus(const __be32 **_bp,
const struct yfs_xdr_YFSFetchStatus *xdr = (const void *)*_bp;
struct afs_file_status *status = &scb->status;
u32 type;
+ int ret;
status->abort_code = ntohl(xdr->abort_code);
if (status->abort_code != 0) {
if (status->abort_code == VNOVNODE)
status->nlink = 0;
scb->have_error = true;
- return 0;
+ goto good;
}
type = ntohl(xdr->type);
@@ -220,13 +221,16 @@ static int xdr_decode_YFSFetchStatus(const __be32 **_bp,
status->size = xdr_to_u64(xdr->size);
status->data_version = xdr_to_u64(xdr->data_version);
scb->have_status = true;
-
+good:
+ ret = 0;
+advance:
*_bp += xdr_size(xdr);
- return 0;
+ return ret;
bad:
xdr_dump_bad(*_bp);
- return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+ ret = afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+ goto advance;
}
/*
@@ -1153,11 +1157,9 @@ static int yfs_deliver_fs_rename(struct afs_call *call)
ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
if (ret < 0)
return ret;
- if (call->out_dir_scb != call->out_scb) {
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
- }
+ ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+ if (ret < 0)
+ return ret;
xdr_decode_YFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index a3cdb0036c5d..f3a0f412b43b 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -186,7 +186,7 @@ static int find_autofs_mount(const char *pathname,
struct path path;
int err;
- err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
+ err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path);
if (err)
return err;
err = -ENOENT;
@@ -519,8 +519,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
if (!fp || param->ioctlfd == -1) {
if (autofs_type_any(type))
- err = kern_path_mountpoint(AT_FDCWD,
- name, &path, LOOKUP_FOLLOW);
+ err = kern_path(name, LOOKUP_FOLLOW | LOOKUP_MOUNTPOINT,
+ &path);
else
err = find_autofs_mount(name, &path,
test_by_type, &type);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f4713ea76e82..13f25e241ac4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -27,6 +27,7 @@
#include <linux/highuid.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
+#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
@@ -698,19 +699,11 @@ static int load_elf_binary(struct linux_binprm *bprm)
unsigned long reloc_func_desc __maybe_unused = 0;
int executable_stack = EXSTACK_DEFAULT;
struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf;
- struct {
- struct elfhdr interp_elf_ex;
- } *loc;
+ struct elfhdr *interp_elf_ex = NULL;
struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
struct mm_struct *mm;
struct pt_regs *regs;
- loc = kmalloc(sizeof(*loc), GFP_KERNEL);
- if (!loc) {
- retval = -ENOMEM;
- goto out_ret;
- }
-
retval = -ENOEXEC;
/* First of all, some simple consistency checks */
if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
@@ -770,9 +763,15 @@ static int load_elf_binary(struct linux_binprm *bprm)
*/
would_dump(bprm, interpreter);
+ interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL);
+ if (!interp_elf_ex) {
+ retval = -ENOMEM;
+ goto out_free_ph;
+ }
+
/* Get the exec headers */
- retval = elf_read(interpreter, &loc->interp_elf_ex,
- sizeof(loc->interp_elf_ex), 0);
+ retval = elf_read(interpreter, interp_elf_ex,
+ sizeof(*interp_elf_ex), 0);
if (retval < 0)
goto out_free_dentry;
@@ -806,25 +805,25 @@ out_free_interp:
if (interpreter) {
retval = -ELIBBAD;
/* Not an ELF interpreter */
- if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
+ if (memcmp(interp_elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
goto out_free_dentry;
/* Verify the interpreter has a valid arch */
- if (!elf_check_arch(&loc->interp_elf_ex) ||
- elf_check_fdpic(&loc->interp_elf_ex))
+ if (!elf_check_arch(interp_elf_ex) ||
+ elf_check_fdpic(interp_elf_ex))
goto out_free_dentry;
/* Load the interpreter program headers */
- interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex,
+ interp_elf_phdata = load_elf_phdrs(interp_elf_ex,
interpreter);
if (!interp_elf_phdata)
goto out_free_dentry;
/* Pass PT_LOPROC..PT_HIPROC headers to arch code */
elf_ppnt = interp_elf_phdata;
- for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++)
+ for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++)
switch (elf_ppnt->p_type) {
case PT_LOPROC ... PT_HIPROC:
- retval = arch_elf_pt_proc(&loc->interp_elf_ex,
+ retval = arch_elf_pt_proc(interp_elf_ex,
elf_ppnt, interpreter,
true, &arch_state);
if (retval)
@@ -839,7 +838,7 @@ out_free_interp:
* the exec syscall.
*/
retval = arch_check_elf(elf_ex,
- !!interpreter, &loc->interp_elf_ex,
+ !!interpreter, interp_elf_ex,
&arch_state);
if (retval)
goto out_free_dentry;
@@ -1055,7 +1054,7 @@ out_free_interp:
}
if (interpreter) {
- elf_entry = load_elf_interp(&loc->interp_elf_ex,
+ elf_entry = load_elf_interp(interp_elf_ex,
interpreter,
load_bias, interp_elf_phdata);
if (!IS_ERR((void *)elf_entry)) {
@@ -1064,7 +1063,7 @@ out_free_interp:
* adjustment
*/
interp_load_addr = elf_entry;
- elf_entry += loc->interp_elf_ex.e_entry;
+ elf_entry += interp_elf_ex->e_entry;
}
if (BAD_ADDR(elf_entry)) {
retval = IS_ERR((void *)elf_entry) ?
@@ -1075,6 +1074,9 @@ out_free_interp:
allow_write_access(interpreter);
fput(interpreter);
+
+ kfree(interp_elf_ex);
+ kfree(interp_elf_phdata);
} else {
elf_entry = e_entry;
if (BAD_ADDR(elf_entry)) {
@@ -1083,7 +1085,6 @@ out_free_interp:
}
}
- kfree(interp_elf_phdata);
kfree(elf_phdata);
set_binfmt(&elf_format);
@@ -1153,12 +1154,11 @@ out_free_interp:
start_thread(regs, elf_entry, bprm->p);
retval = 0;
out:
- kfree(loc);
-out_ret:
return retval;
/* error cleanup */
out_free_dentry:
+ kfree(interp_elf_ex);
kfree(interp_elf_phdata);
allow_write_access(interpreter);
if (interpreter)
@@ -1317,7 +1317,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
}
/* Hugetlb memory check */
- if (vma->vm_flags & VM_HUGETLB) {
+ if (is_vm_hugetlb_page(vma)) {
if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
goto whole;
if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 69bf2fb6f7cd..52b6f646cdbd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -34,6 +34,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/falloc.h>
#include <linux/uaccess.h>
+#include <linux/suspend.h>
#include "internal.h"
struct bdev_inode {
@@ -1520,10 +1521,22 @@ rescan:
if (ret)
return ret;
- if (invalidate)
- set_capacity(disk, 0);
- else if (disk->fops->revalidate_disk)
- disk->fops->revalidate_disk(disk);
+ /*
+ * Historically we only set the capacity to zero for devices that
+ * support partitions (independ of actually having partitions created).
+ * Doing that is rather inconsistent, but changing it broke legacy
+ * udisks polling for legacy ide-cdrom devices. Use the crude check
+ * below to get the sane behavior for most device while not breaking
+ * userspace for this particular setup.
+ */
+ if (invalidate) {
+ if (disk_part_scan_enabled(disk) ||
+ !(disk->flags & GENHD_FL_REMOVABLE))
+ set_capacity(disk, 0);
+ } else {
+ if (disk->fops->revalidate_disk)
+ disk->fops->revalidate_disk(disk);
+ }
check_disk_size_change(disk, bdev, !invalidate);
@@ -2001,7 +2014,8 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (bdev_read_only(I_BDEV(bd_inode)))
return -EPERM;
- if (IS_SWAPFILE(bd_inode))
+ /* uswsusp needs write permission to the swap */
+ if (IS_SWAPFILE(bd_inode) && !hibernation_available())
return -ETXTBSY;
if (!iov_iter_count(from))
diff --git a/fs/buffer.c b/fs/buffer.c
index b8d28370cfd7..f73276d746bb 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -274,8 +274,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
* decide that the page is now completely done.
*/
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
@@ -288,8 +287,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
}
tmp = tmp->b_this_page;
} while (tmp != bh);
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
/*
* If none of the buffers had errors and they are all
@@ -301,8 +299,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
@@ -371,8 +368,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
}
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_write(bh);
unlock_buffer(bh);
@@ -384,14 +380,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
}
tmp = tmp->b_this_page;
}
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
end_page_writeback(page);
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
EXPORT_SYMBOL(end_buffer_async_write);
@@ -3019,49 +3013,6 @@ static void end_bio_bh_io_sync(struct bio *bio)
bio_put(bio);
}
-/*
- * This allows us to do IO even on the odd last sectors
- * of a device, even if the block size is some multiple
- * of the physical sector size.
- *
- * We'll just truncate the bio to the size of the device,
- * and clear the end of the buffer head manually.
- *
- * Truly out-of-range accesses will turn into actual IO
- * errors, this only handles the "we need to be able to
- * do IO at the final sector" case.
- */
-void guard_bio_eod(struct bio *bio)
-{
- sector_t maxsector;
- struct hd_struct *part;
-
- rcu_read_lock();
- part = __disk_get_part(bio->bi_disk, bio->bi_partno);
- if (part)
- maxsector = part_nr_sects_read(part);
- else
- maxsector = get_capacity(bio->bi_disk);
- rcu_read_unlock();
-
- if (!maxsector)
- return;
-
- /*
- * If the *whole* IO is past the end of the device,
- * let it through, and the IO layer will turn it into
- * an EIO.
- */
- if (unlikely(bio->bi_iter.bi_sector >= maxsector))
- return;
-
- maxsector -= bio->bi_iter.bi_sector;
- if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
- return;
-
- bio_truncate(bio, maxsector << 9);
-}
-
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
enum rw_hint write_hint, struct writeback_control *wbc)
{
@@ -3385,6 +3336,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
if (ret) {
INIT_LIST_HEAD(&ret->b_assoc_buffers);
+ spin_lock_init(&ret->b_uptodate_lock);
preempt_disable();
__this_cpu_inc(bh_accounting.nr);
recalc_bh_state();
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7ab616601141..6f4678d98df7 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -159,8 +159,6 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
if (!PagePrivate(page))
return;
- ClearPageChecked(page);
-
dout("%p invalidatepage %p idx %lu full dirty page\n",
inode, page, page->index);
@@ -183,6 +181,47 @@ static int ceph_releasepage(struct page *page, gfp_t g)
}
/*
+ * Read some contiguous pages. If we cross a stripe boundary, shorten
+ * *plen. Return number of bytes read, or error.
+ */
+static int ceph_sync_readpages(struct ceph_fs_client *fsc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u32 truncate_seq, u64 truncate_size,
+ struct page **pages, int num_pages,
+ int page_align)
+{
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_osd_request *req;
+ int rc = 0;
+
+ dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+ vino.snap, off, *plen);
+ req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, truncate_seq, truncate_size,
+ false);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /* it may be a short read due to an object boundary */
+ osd_req_op_extent_osd_data_pages(req, 0,
+ pages, *plen, page_align, false, false);
+
+ dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
+ off, *plen, *plen, page_align);
+
+ rc = ceph_osdc_start_request(osdc, req, false);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ dout("readpages result %d\n", rc);
+ return rc;
+}
+
+/*
* read a single page, without unlocking it.
*/
static int ceph_do_readpage(struct file *filp, struct page *page)
@@ -218,7 +257,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
- err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
+ err = ceph_sync_readpages(fsc, ceph_vino(inode),
&ci->i_layout, off, &len,
ci->i_truncate_seq, ci->i_truncate_size,
&page, 1, 0);
@@ -571,6 +610,47 @@ static u64 get_writepages_data_length(struct inode *inode,
}
/*
+ * do a synchronous write on N pages
+ */
+static int ceph_sync_writepages(struct ceph_fs_client *fsc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ struct ceph_snap_context *snapc,
+ u64 off, u64 len,
+ u32 truncate_seq, u64 truncate_size,
+ struct timespec64 *mtime,
+ struct page **pages, int num_pages)
+{
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_osd_request *req;
+ int rc = 0;
+ int page_align = off & ~PAGE_MASK;
+
+ req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
+ snapc, truncate_seq, truncate_size,
+ true);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /* it may be a short write due to an object boundary */
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
+ false, false);
+ dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
+
+ req->r_mtime = *mtime;
+ rc = ceph_osdc_start_request(osdc, req, true);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ if (rc == 0)
+ rc = len;
+ dout("writepages result %d\n", rc);
+ return rc;
+}
+
+/*
* Write a single page, but leave the page locked.
*
* If we get a write error, mark the mapping for error, but still adjust the
@@ -628,7 +708,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
set_page_writeback(page);
- err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode),
+ err = ceph_sync_writepages(fsc, ceph_vino(inode),
&ci->i_layout, snapc, page_off, len,
ceph_wbc.truncate_seq,
ceph_wbc.truncate_size,
@@ -1575,7 +1655,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
do {
lock_page(page);
- if ((off > size) || (page->mapping != inode->i_mapping)) {
+ if (page_mkwrite_check_truncate(page, inode) < 0) {
unlock_page(page);
ret = VM_FAULT_NOPAGE;
break;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 270b769607a2..2f5cb6bc78e1 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -32,7 +32,7 @@ struct ceph_fscache_entry {
size_t uniq_len;
/* The following members must be last */
struct ceph_fsid fsid;
- char uniquifier[0];
+ char uniquifier[];
};
static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 28ae0c134700..185db76300b3 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -490,13 +490,10 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
-
- ci->i_hold_caps_min = round_jiffies(jiffies +
- opt->caps_wanted_delay_min * HZ);
ci->i_hold_caps_max = round_jiffies(jiffies +
opt->caps_wanted_delay_max * HZ);
- dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
- ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+ dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
+ ci->i_hold_caps_max - jiffies);
}
/*
@@ -508,10 +505,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
* -> we take mdsc->cap_delay_lock
*/
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
- struct ceph_inode_info *ci,
- bool set_timeout)
+ struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+ dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
ci->i_ceph_flags, ci->i_hold_caps_max);
if (!mdsc->stopping) {
spin_lock(&mdsc->cap_delay_lock);
@@ -520,8 +516,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
goto no_change;
list_del_init(&ci->i_cap_delay_list);
}
- if (set_timeout)
- __cap_set_timeouts(mdsc, ci);
+ __cap_set_timeouts(mdsc, ci);
list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
no_change:
spin_unlock(&mdsc->cap_delay_lock);
@@ -561,19 +556,20 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
spin_unlock(&mdsc->cap_delay_lock);
}
-/*
- * Common issue checks for add_cap, handle_cap_grant.
- */
+/* Common issue checks for add_cap, handle_cap_grant. */
static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
unsigned issued)
{
unsigned had = __ceph_caps_issued(ci, NULL);
+ lockdep_assert_held(&ci->i_ceph_lock);
+
/*
* Each time we receive FILE_CACHE anew, we increment
* i_rdcache_gen.
*/
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
+ (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
(had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
ci->i_rdcache_gen++;
}
@@ -592,6 +588,13 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
__ceph_dir_clear_complete(ci);
}
}
+
+ /* Wipe saved layout if we're losing DIR_CREATE caps */
+ if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
+ !(issued & CEPH_CAP_DIR_CREATE)) {
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
+ }
}
/*
@@ -605,7 +608,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
*/
void ceph_add_cap(struct inode *inode,
struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
+ unsigned issued, unsigned wanted,
unsigned seq, unsigned mseq, u64 realmino, int flags,
struct ceph_cap **new_cap)
{
@@ -621,13 +624,6 @@ void ceph_add_cap(struct inode *inode,
dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
session->s_mds, cap_id, ceph_cap_string(issued), seq);
- /*
- * If we are opening the file, include file mode wanted bits
- * in wanted.
- */
- if (fmode >= 0)
- wanted |= ceph_caps_for_mode(fmode);
-
spin_lock(&session->s_gen_ttl_lock);
gen = session->s_cap_gen;
spin_unlock(&session->s_gen_ttl_lock);
@@ -725,7 +721,7 @@ void ceph_add_cap(struct inode *inode,
dout(" issued %s, mds wanted %s, actual %s, queueing\n",
ceph_cap_string(issued), ceph_cap_string(wanted),
ceph_cap_string(actual_wanted));
- __cap_delay_requeue(mdsc, ci, true);
+ __cap_delay_requeue(mdsc, ci);
}
if (flags & CEPH_CAP_FLAG_AUTH) {
@@ -752,9 +748,6 @@ void ceph_add_cap(struct inode *inode,
cap->issue_seq = seq;
cap->mseq = mseq;
cap->cap_gen = gen;
-
- if (fmode >= 0)
- __ceph_get_fmode(ci, fmode);
}
/*
@@ -958,29 +951,97 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
if (ci->i_rd_ref)
used |= CEPH_CAP_FILE_RD;
if (ci->i_rdcache_ref ||
- (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
+ (S_ISREG(ci->vfs_inode.i_mode) &&
ci->vfs_inode.i_data.nrpages))
used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR;
if (ci->i_wb_ref || ci->i_wrbuffer_ref)
used |= CEPH_CAP_FILE_BUFFER;
+ if (ci->i_fx_ref)
+ used |= CEPH_CAP_FILE_EXCL;
return used;
}
+#define FMODE_WAIT_BIAS 1000
+
/*
* wanted, by virtue of open file modes
*/
int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
{
- int i, bits = 0;
- for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
- if (ci->i_nr_by_mode[i])
- bits |= 1 << i;
+ const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
+ const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
+ const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
+ const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
+ struct ceph_mount_options *opt =
+ ceph_inode_to_client(&ci->vfs_inode)->mount_options;
+ unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
+ unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
+
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ int want = 0;
+
+ /* use used_cutoff here, to keep dir's wanted caps longer */
+ if (ci->i_nr_by_mode[RD_SHIFT] > 0 ||
+ time_after(ci->i_last_rd, used_cutoff))
+ want |= CEPH_CAP_ANY_SHARED;
+
+ if (ci->i_nr_by_mode[WR_SHIFT] > 0 ||
+ time_after(ci->i_last_wr, used_cutoff)) {
+ want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+ want |= CEPH_CAP_ANY_DIR_OPS;
+ }
+
+ if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0)
+ want |= CEPH_CAP_PIN;
+
+ return want;
+ } else {
+ int bits = 0;
+
+ if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
+ if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
+ time_after(ci->i_last_rd, used_cutoff))
+ bits |= 1 << RD_SHIFT;
+ } else if (time_after(ci->i_last_rd, idle_cutoff)) {
+ bits |= 1 << RD_SHIFT;
+ }
+
+ if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
+ if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
+ time_after(ci->i_last_wr, used_cutoff))
+ bits |= 1 << WR_SHIFT;
+ } else if (time_after(ci->i_last_wr, idle_cutoff)) {
+ bits |= 1 << WR_SHIFT;
+ }
+
+ /* check lazyio only when read/write is wanted */
+ if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
+ ci->i_nr_by_mode[LAZY_SHIFT] > 0)
+ bits |= 1 << LAZY_SHIFT;
+
+ return bits ? ceph_caps_for_mode(bits >> 1) : 0;
}
- if (bits == 0)
- return 0;
- return ceph_caps_for_mode(bits >> 1);
+}
+
+/*
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
+ */
+int __ceph_caps_wanted(struct ceph_inode_info *ci)
+{
+ int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ /* we want EXCL if holding caps of dir ops */
+ if (w & CEPH_CAP_ANY_DIR_OPS)
+ w |= CEPH_CAP_FILE_EXCL;
+ } else {
+ /* we want EXCL if dirty data */
+ if (w & CEPH_CAP_FILE_BUFFER)
+ w |= CEPH_CAP_FILE_EXCL;
+ }
+ return w;
}
/*
@@ -1004,14 +1065,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
return mds_wanted;
}
-/*
- * called under i_ceph_lock
- */
-static int __ceph_is_single_caps(struct ceph_inode_info *ci)
-{
- return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
-}
-
int ceph_is_any_caps(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1274,9 +1327,15 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
struct cap_msg_args arg;
int held, revoking;
int wake = 0;
- int delayed = 0;
int ret;
+ /* Don't send anything if it's still being created. Return delayed */
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ spin_unlock(&ci->i_ceph_lock);
+ dout("%s async create in flight for %p\n", __func__, inode);
+ return 1;
+ }
+
held = cap->issued | cap->implemented;
revoking = cap->implemented & ~cap->issued;
retain &= ~revoking;
@@ -1287,28 +1346,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ceph_cap_string(revoking));
BUG_ON((retain & CEPH_CAP_PIN) == 0);
- arg.session = cap->session;
-
- /* don't release wanted unless we've waited a bit. */
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
- time_before(jiffies, ci->i_hold_caps_min)) {
- dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & retain),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(want));
- want |= cap->mds_wanted;
- retain |= cap->issued;
- delayed = 1;
- }
- ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
- if (want & ~cap->mds_wanted) {
- /* user space may open/close single file frequently.
- * This avoids droping mds_wanted immediately after
- * requesting new mds_wanted.
- */
- __cap_set_timeouts(mdsc, ci);
- }
+ ci->i_ceph_flags &= ~CEPH_I_FLUSH;
cap->issued &= retain; /* drop bits we don't want */
if (cap->implemented & ~cap->issued) {
@@ -1323,6 +1361,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
cap->implemented &= cap->issued | used;
cap->mds_wanted = want;
+ arg.session = cap->session;
arg.ino = ceph_vino(inode).ino;
arg.cid = cap->cap_id;
arg.follows = flushing ? ci->i_head_snapc->seq : 0;
@@ -1332,7 +1371,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
arg.size = inode->i_size;
ci->i_reported_size = arg.size;
arg.max_size = ci->i_wanted_max_size;
- ci->i_requested_max_size = arg.max_size;
+ if (cap == ci->i_auth_cap)
+ ci->i_requested_max_size = arg.max_size;
if (flushing & CEPH_CAP_XATTR_EXCL) {
old_blob = __ceph_build_xattrs_blob(ci);
@@ -1383,14 +1423,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ret = send_cap_msg(&arg);
if (ret < 0) {
- dout("error sending cap msg, must requeue %p\n", inode);
- delayed = 1;
+ pr_err("error sending cap msg, ino (%llx.%llx) "
+ "flushing %s tid %llu, requeue\n",
+ ceph_vinop(inode), ceph_cap_string(flushing),
+ flush_tid);
+ spin_lock(&ci->i_ceph_lock);
+ __cap_delay_requeue(mdsc, ci);
+ spin_unlock(&ci->i_ceph_lock);
}
if (wake)
wake_up_all(&ci->i_cap_wq);
- return delayed;
+ return ret;
}
static inline int __send_flush_snap(struct inode *inode,
@@ -1617,6 +1662,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
int was = ci->i_dirty_caps;
int dirty = 0;
+ lockdep_assert_held(&ci->i_ceph_lock);
+
if (!ci->i_auth_cap) {
pr_warn("__mark_dirty_caps %p %llx mask %s, "
"but no auth cap (session was closed?)\n",
@@ -1654,7 +1701,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
(mask & CEPH_CAP_FILE_BUFFER))
dirty |= I_DIRTY_DATASYNC;
- __cap_delay_requeue(mdsc, ci, true);
+ __cap_delay_requeue(mdsc, ci);
return dirty;
}
@@ -1726,6 +1773,7 @@ static u64 __mark_caps_flushing(struct inode *inode,
struct ceph_cap_flush *cf = NULL;
int flushing;
+ lockdep_assert_held(&ci->i_ceph_lock);
BUG_ON(ci->i_dirty_caps == 0);
BUG_ON(list_empty(&ci->i_dirty_item));
BUG_ON(!ci->i_prealloc_cap_flush);
@@ -1805,8 +1853,6 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
* versus held caps. Release, flush, ack revoked caps to mds as
* appropriate.
*
- * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
- * cap release further.
* CHECK_CAPS_AUTHONLY - we should only check the auth cap
* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
* further delay.
@@ -1825,24 +1871,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */
struct rb_node *p;
- int delayed = 0, sent = 0;
- bool no_delay = flags & CHECK_CAPS_NODELAY;
bool queue_invalidate = false;
bool tried_invalidate = false;
- /* if we are unmounting, flush any unused caps immediately. */
- if (mdsc->stopping)
- no_delay = true;
-
spin_lock(&ci->i_ceph_lock);
-
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
- if (!(flags & CHECK_CAPS_AUTHONLY) ||
- (ci->i_auth_cap && __ceph_is_single_caps(ci)))
- __cap_delay_cancel(mdsc, ci);
-
goto retry_locked;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1866,10 +1901,11 @@ retry_locked:
* revoking the shared cap on every create/unlink
* operation.
*/
- if (IS_RDONLY(inode))
+ if (IS_RDONLY(inode)) {
want = CEPH_CAP_ANY_SHARED;
- else
- want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ } else {
+ want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ }
retain |= want;
} else {
@@ -1885,14 +1921,13 @@ retry_locked:
}
dout("check_caps %p file_want %s used %s dirty %s flushing %s"
- " issued %s revoking %s retain %s %s%s%s\n", inode,
+ " issued %s revoking %s retain %s %s%s\n", inode,
ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(issued), ceph_cap_string(revoking),
ceph_cap_string(retain),
(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
- (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
(flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
/*
@@ -1900,8 +1935,8 @@ retry_locked:
* have cached pages, but don't want them, then try to invalidate.
* If we fail, it's because pages are locked.... try again later.
*/
- if ((!no_delay || mdsc->stopping) &&
- !S_ISDIR(inode->i_mode) && /* ignore readdir cache */
+ if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
+ S_ISREG(inode->i_mode) &&
!(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
(revoking & (CEPH_CAP_FILE_CACHE|
@@ -1973,28 +2008,17 @@ retry_locked:
}
/* want more caps from mds? */
- if (want & ~(cap->mds_wanted | cap->issued))
- goto ack;
+ if (want & ~cap->mds_wanted) {
+ if (want & ~(cap->mds_wanted | cap->issued))
+ goto ack;
+ if (!__cap_is_valid(cap))
+ goto ack;
+ }
/* things we might delay */
if ((cap->issued & ~retain) == 0)
continue; /* nope, all good */
- if (no_delay)
- goto ack;
-
- /* delay? */
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
- time_before(jiffies, ci->i_hold_caps_max)) {
- dout(" delaying issued %s -> %s, wanted %s -> %s\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & retain),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(want));
- delayed++;
- continue;
- }
-
ack:
if (session && session != cap->session) {
dout("oops, wrong session %p mutex\n", session);
@@ -2055,18 +2079,20 @@ ack:
}
mds = cap->mds; /* remember mds, so we don't repeat */
- sent++;
/* __send_cap drops i_ceph_lock */
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
- cap_used, want, retain, flushing,
- flush_tid, oldest_flush_tid);
+ __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want,
+ retain, flushing, flush_tid, oldest_flush_tid);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
- /* Reschedule delayed caps release if we delayed anything */
- if (delayed)
- __cap_delay_requeue(mdsc, ci, false);
+ /* periodically re-calculate caps wanted by open files */
+ if (__ceph_is_any_real_caps(ci) &&
+ list_empty(&ci->i_cap_delay_list) &&
+ (file_wanted & ~CEPH_CAP_PIN) &&
+ !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+ __cap_delay_requeue(mdsc, ci);
+ }
spin_unlock(&ci->i_ceph_lock);
@@ -2095,7 +2121,6 @@ retry:
retry_locked:
if (ci->i_dirty_caps && ci->i_auth_cap) {
struct ceph_cap *cap = ci->i_auth_cap;
- int delayed;
if (session != cap->session) {
spin_unlock(&ci->i_ceph_lock);
@@ -2124,18 +2149,10 @@ retry_locked:
&oldest_flush_tid);
/* __send_cap drops i_ceph_lock */
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- CEPH_CLIENT_CAPS_SYNC,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- (cap->issued | cap->implemented),
- flushing, flush_tid, oldest_flush_tid);
-
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci, true);
- spin_unlock(&ci->i_ceph_lock);
- }
+ __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
+ __ceph_caps_used(ci), __ceph_caps_wanted(ci),
+ (cap->issued | cap->implemented),
+ flushing, flush_tid, oldest_flush_tid);
} else {
if (!list_empty(&ci->i_cap_flush_list)) {
struct ceph_cap_flush *cf =
@@ -2233,6 +2250,10 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (datasync)
goto out;
+ ret = ceph_wait_on_async_create(inode);
+ if (ret)
+ goto out;
+
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
@@ -2335,22 +2356,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
if (cf->caps) {
dout("kick_flushing_caps %p cap %p tid %llu %s\n",
inode, cap, cf->tid, ceph_cap_string(cf->caps));
- ci->i_ceph_flags |= CEPH_I_NODELAY;
-
- ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
(cf->tid < last_snap_flush ?
CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
__ceph_caps_used(ci),
__ceph_caps_wanted(ci),
(cap->issued | cap->implemented),
cf->caps, cf->tid, oldest_flush_tid);
- if (ret) {
- pr_err("kick_flushing_caps: error sending "
- "cap flush, ino (%llx.%llx) "
- "tid %llu flushing %s\n",
- ceph_vinop(inode), cf->tid,
- ceph_cap_string(cf->caps));
- }
} else {
struct ceph_cap_snap *capsnap =
container_of(cf, struct ceph_cap_snap,
@@ -2457,16 +2469,15 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
}
}
-static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- struct inode *inode)
- __releases(ci->i_ceph_lock)
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
+ struct ceph_inode_info *ci)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *cap;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_cap *cap = ci->i_auth_cap;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- dout("kick_flushing_inode_caps %p flushing %s\n", inode,
+ dout("%s %p flushing %s\n", __func__, &ci->vfs_inode,
ceph_cap_string(ci->i_flushing_caps));
if (!list_empty(&ci->i_cap_flush_list)) {
@@ -2478,9 +2489,6 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
spin_unlock(&mdsc->cap_dirty_lock);
__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
- spin_unlock(&ci->i_ceph_lock);
- } else {
- spin_unlock(&ci->i_ceph_lock);
}
}
@@ -2488,18 +2496,20 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
/*
* Take references to capabilities we hold, so that we don't release
* them to the MDS prematurely.
- *
- * Protected by i_ceph_lock.
*/
-static void __take_cap_refs(struct ceph_inode_info *ci, int got,
+void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
bool snap_rwsem_locked)
{
+ lockdep_assert_held(&ci->i_ceph_lock);
+
if (got & CEPH_CAP_PIN)
ci->i_pin_ref++;
if (got & CEPH_CAP_FILE_RD)
ci->i_rd_ref++;
if (got & CEPH_CAP_FILE_CACHE)
ci->i_rdcache_ref++;
+ if (got & CEPH_CAP_FILE_EXCL)
+ ci->i_fx_ref++;
if (got & CEPH_CAP_FILE_WR) {
if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
BUG_ON(!snap_rwsem_locked);
@@ -2512,7 +2522,7 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
if (ci->i_wb_ref == 0)
ihold(&ci->vfs_inode);
ci->i_wb_ref++;
- dout("__take_cap_refs %p wb %d -> %d (?)\n",
+ dout("%s %p wb %d -> %d (?)\n", __func__,
&ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
}
}
@@ -2524,14 +2534,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
* Note that caller is responsible for ensuring max_size increases are
* requested from the MDS.
*
- * Returns 0 if caps were not able to be acquired (yet), a 1 if they were,
- * or a negative error code.
- *
- * FIXME: how does a 0 return differ from -EAGAIN?
+ * Returns 0 if caps were not able to be acquired (yet), 1 if succeed,
+ * or a negative error code. There are 3 speical error codes:
+ * -EAGAIN: need to sleep but non-blocking is specified
+ * -EFBIG: ask caller to call check_max_size() and try again.
+ * -ESTALE: ask caller to call ceph_renew_caps() and try again.
*/
enum {
- NON_BLOCKING = 1,
- CHECK_FILELOCK = 2,
+ /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
+ NON_BLOCKING = (1 << 8),
+ CHECK_FILELOCK = (1 << 9),
};
static int try_get_cap_refs(struct inode *inode, int need, int want,
@@ -2541,7 +2553,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
int ret = 0;
int have, implemented;
- int file_wanted;
bool snap_rwsem_locked = false;
dout("get_cap_refs %p need %s want %s\n", inode,
@@ -2557,15 +2568,6 @@ again:
goto out_unlock;
}
- /* make sure file is actually open */
- file_wanted = __ceph_caps_file_wanted(ci);
- if ((file_wanted & need) != need) {
- dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
- ceph_cap_string(need), ceph_cap_string(file_wanted));
- ret = -EBADF;
- goto out_unlock;
- }
-
/* finish pending truncate */
while (ci->i_truncate_pending) {
spin_unlock(&ci->i_ceph_lock);
@@ -2584,7 +2586,7 @@ again:
dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
inode, endoff, ci->i_max_size);
if (endoff > ci->i_requested_max_size)
- ret = -EAGAIN;
+ ret = ci->i_auth_cap ? -EFBIG : -ESTALE;
goto out_unlock;
}
/*
@@ -2630,51 +2632,55 @@ again:
}
snap_rwsem_locked = true;
}
- *got = need | (have & want);
- if ((need & CEPH_CAP_FILE_RD) &&
+ if ((have & want) == want)
+ *got = need | want;
+ else
+ *got = need;
+ if (S_ISREG(inode->i_mode) &&
+ (need & CEPH_CAP_FILE_RD) &&
!(*got & CEPH_CAP_FILE_CACHE))
ceph_disable_fscache_readpage(ci);
- __take_cap_refs(ci, *got, true);
+ ceph_take_cap_refs(ci, *got, true);
ret = 1;
}
} else {
int session_readonly = false;
- if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
+ int mds_wanted;
+ if (ci->i_auth_cap &&
+ (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
struct ceph_mds_session *s = ci->i_auth_cap->session;
spin_lock(&s->s_cap_lock);
session_readonly = s->s_readonly;
spin_unlock(&s->s_cap_lock);
}
if (session_readonly) {
- dout("get_cap_refs %p needed %s but mds%d readonly\n",
+ dout("get_cap_refs %p need %s but mds%d readonly\n",
inode, ceph_cap_string(need), ci->i_auth_cap->mds);
ret = -EROFS;
goto out_unlock;
}
- if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
- int mds_wanted;
- if (READ_ONCE(mdsc->fsc->mount_state) ==
- CEPH_MOUNT_SHUTDOWN) {
- dout("get_cap_refs %p forced umount\n", inode);
- ret = -EIO;
- goto out_unlock;
- }
- mds_wanted = __ceph_caps_mds_wanted(ci, false);
- if (need & ~(mds_wanted & need)) {
- dout("get_cap_refs %p caps were dropped"
- " (session killed?)\n", inode);
- ret = -ESTALE;
- goto out_unlock;
- }
- if (!(file_wanted & ~mds_wanted))
- ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ dout("get_cap_refs %p forced umount\n", inode);
+ ret = -EIO;
+ goto out_unlock;
+ }
+ mds_wanted = __ceph_caps_mds_wanted(ci, false);
+ if (need & ~mds_wanted) {
+ dout("get_cap_refs %p need %s > mds_wanted %s\n",
+ inode, ceph_cap_string(need),
+ ceph_cap_string(mds_wanted));
+ ret = -ESTALE;
+ goto out_unlock;
}
- dout("get_cap_refs %p have %s needed %s\n", inode,
+ dout("get_cap_refs %p have %s need %s\n", inode,
ceph_cap_string(have), ceph_cap_string(need));
}
out_unlock:
+
+ __ceph_touch_fmode(ci, mdsc, flags);
+
spin_unlock(&ci->i_ceph_lock);
if (snap_rwsem_locked)
up_read(&mdsc->snap_rwsem);
@@ -2712,20 +2718,40 @@ static void check_max_size(struct inode *inode, loff_t endoff)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}
+static inline int get_used_fmode(int caps)
+{
+ int fmode = 0;
+ if (caps & CEPH_CAP_FILE_RD)
+ fmode |= CEPH_FILE_MODE_RD;
+ if (caps & CEPH_CAP_FILE_WR)
+ fmode |= CEPH_FILE_MODE_WR;
+ return fmode;
+}
+
int ceph_try_get_caps(struct inode *inode, int need, int want,
bool nonblock, int *got)
{
- int ret;
+ int ret, flags;
BUG_ON(need & ~CEPH_CAP_FILE_RD);
- BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
- ret = ceph_pool_perm_check(inode, need);
- if (ret < 0)
- return ret;
+ BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO |
+ CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_ANY_DIR_OPS));
+ if (need) {
+ ret = ceph_pool_perm_check(inode, need);
+ if (ret < 0)
+ return ret;
+ }
+
+ flags = get_used_fmode(need | want);
+ if (nonblock)
+ flags |= NON_BLOCKING;
- ret = try_get_cap_refs(inode, need, want, 0,
- (nonblock ? NON_BLOCKING : 0), got);
- return ret == -EAGAIN ? 0 : ret;
+ ret = try_get_cap_refs(inode, need, want, 0, flags, got);
+ /* three special error codes */
+ if (ret == -EAGAIN || ret == -EFBIG || ret == -EAGAIN)
+ ret = 0;
+ return ret;
}
/*
@@ -2750,16 +2776,16 @@ int ceph_get_caps(struct file *filp, int need, int want,
fi->filp_gen != READ_ONCE(fsc->filp_gen))
return -EBADF;
- while (true) {
- if (endoff > 0)
- check_max_size(inode, endoff);
+ flags = get_used_fmode(need | want);
- flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
+ while (true) {
+ flags &= CEPH_FILE_MODE_MASK;
+ if (atomic_read(&fi->num_locks))
+ flags |= CHECK_FILELOCK;
_got = 0;
ret = try_get_cap_refs(inode, need, want, endoff,
flags, &_got);
- if (ret == -EAGAIN)
- continue;
+ WARN_ON_ONCE(ret == -EAGAIN);
if (!ret) {
struct ceph_mds_client *mdsc = fsc->mdsc;
struct cap_wait cw;
@@ -2774,6 +2800,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
list_add(&cw.list, &mdsc->cap_wait_list);
spin_unlock(&mdsc->caps_list_lock);
+ /* make sure used fmode not timeout */
+ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
add_wait_queue(&ci->i_cap_wq, &wait);
flags |= NON_BLOCKING;
@@ -2787,6 +2815,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
remove_wait_queue(&ci->i_cap_wq, &wait);
+ ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
spin_lock(&mdsc->caps_list_lock);
list_del(&cw.list);
@@ -2804,16 +2833,26 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
if (ret < 0) {
+ if (ret == -EFBIG || ret == -ESTALE) {
+ int ret2 = ceph_wait_on_async_create(inode);
+ if (ret2 < 0)
+ return ret2;
+ }
+ if (ret == -EFBIG) {
+ check_max_size(inode, endoff);
+ continue;
+ }
if (ret == -ESTALE) {
/* session was killed, try renew caps */
- ret = ceph_renew_caps(inode);
+ ret = ceph_renew_caps(inode, flags);
if (ret == 0)
continue;
}
return ret;
}
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
+ ci->i_inline_version != CEPH_INLINE_NONE &&
(_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
i_size_read(inode) > 0) {
struct page *page =
@@ -2846,7 +2885,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
break;
}
- if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
+ (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
ceph_fscache_revalidate_cookie(ci);
*got = _got;
@@ -2860,7 +2900,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
{
spin_lock(&ci->i_ceph_lock);
- __take_cap_refs(ci, caps, false);
+ ceph_take_cap_refs(ci, caps, false);
spin_unlock(&ci->i_ceph_lock);
}
@@ -2911,6 +2951,9 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
if (had & CEPH_CAP_FILE_CACHE)
if (--ci->i_rdcache_ref == 0)
last++;
+ if (had & CEPH_CAP_FILE_EXCL)
+ if (--ci->i_fx_ref == 0)
+ last++;
if (had & CEPH_CAP_FILE_BUFFER) {
if (--ci->i_wb_ref == 0) {
last++;
@@ -2950,7 +2993,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
last ? " last" : "", put ? " put" : "");
- if (last && !flushsnaps)
+ if (last)
ceph_check_caps(ci, 0, NULL);
else if (flushsnaps)
ceph_flush_snaps(ci, NULL);
@@ -3032,7 +3075,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
spin_unlock(&ci->i_ceph_lock);
if (last) {
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, 0, NULL);
} else if (flush_snaps) {
ceph_flush_snaps(ci, NULL);
}
@@ -3133,7 +3176,7 @@ static void handle_cap_grant(struct inode *inode,
* try to invalidate (once). (If there are dirty buffers, we
* will invalidate _after_ writeback.)
*/
- if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
+ if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */
((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
!(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
@@ -3297,11 +3340,12 @@ static void handle_cap_grant(struct inode *inode,
ceph_cap_string(cap->issued),
ceph_cap_string(newcaps),
ceph_cap_string(revoking));
- if (revoking & used & CEPH_CAP_FILE_BUFFER)
+ if (S_ISREG(inode->i_mode) &&
+ (revoking & used & CEPH_CAP_FILE_BUFFER))
writeback = true; /* initiate writeback; will delay ack */
- else if (revoking == CEPH_CAP_FILE_CACHE &&
- (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
- queue_invalidate)
+ else if (queue_invalidate &&
+ revoking == CEPH_CAP_FILE_CACHE &&
+ (newcaps & CEPH_CAP_FILE_LAZYIO) == 0)
; /* do nothing yet, invalidation will be queued */
else if (cap == ci->i_auth_cap)
check_caps = 1; /* check auth cap only */
@@ -3339,7 +3383,8 @@ static void handle_cap_grant(struct inode *inode,
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
if (newcaps & ~extra_info->issued)
wake = true;
- kick_flushing_inode_caps(session->s_mdsc, session, inode);
+ ceph_kick_flushing_inode_caps(session, ci);
+ spin_unlock(&ci->i_ceph_lock);
up_read(&session->s_mdsc->snap_rwsem);
} else {
spin_unlock(&ci->i_ceph_lock);
@@ -3367,10 +3412,10 @@ static void handle_cap_grant(struct inode *inode,
wake_up_all(&ci->i_cap_wq);
if (check_caps == 1)
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
session);
else if (check_caps == 2)
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+ ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
else
mutex_unlock(&session->s_mutex);
}
@@ -3619,8 +3664,6 @@ retry:
goto out_unlock;
if (target < 0) {
- if (cap->mds_wanted | cap->issued)
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
__ceph_remove_cap(cap, false);
goto out_unlock;
}
@@ -3668,7 +3711,7 @@ retry:
/* add placeholder for the export tagert */
int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
tcap = new_cap;
- ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
+ ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
if (!list_empty(&ci->i_cap_flush_list) &&
@@ -3773,7 +3816,7 @@ retry:
__ceph_caps_issued(ci, &issued);
issued |= __ceph_caps_dirty(ci);
- ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
+ ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
@@ -4047,7 +4090,6 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
struct inode *inode;
struct ceph_inode_info *ci;
- int flags = CHECK_CAPS_NODELAY;
dout("check_delayed_caps\n");
while (1) {
@@ -4067,7 +4109,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
if (inode) {
dout("check_delayed_caps on %p\n", inode);
- ceph_check_caps(ci, flags, NULL);
+ ceph_check_caps(ci, 0, NULL);
/* avoid calling iput_final() in tick thread */
ceph_async_iput(inode);
}
@@ -4092,7 +4134,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
ihold(inode);
dout("flush_dirty_caps %p\n", inode);
spin_unlock(&mdsc->cap_dirty_lock);
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
iput(inode);
spin_lock(&mdsc->cap_dirty_lock);
}
@@ -4100,14 +4142,31 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
dout("flush_dirty_caps done\n");
}
-void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
+void __ceph_touch_fmode(struct ceph_inode_info *ci,
+ struct ceph_mds_client *mdsc, int fmode)
+{
+ unsigned long now = jiffies;
+ if (fmode & CEPH_FILE_MODE_RD)
+ ci->i_last_rd = now;
+ if (fmode & CEPH_FILE_MODE_WR)
+ ci->i_last_wr = now;
+ /* queue periodic check */
+ if (fmode &&
+ __ceph_is_any_real_caps(ci) &&
+ list_empty(&ci->i_cap_delay_list))
+ __cap_delay_requeue(mdsc, ci);
+}
+
+void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
int i;
int bits = (fmode << 1) | 1;
+ spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i))
- ci->i_nr_by_mode[i]++;
+ ci->i_nr_by_mode[i] += count;
}
+ spin_unlock(&ci->i_ceph_lock);
}
/*
@@ -4115,26 +4174,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
* we may need to release capabilities to the MDS (or schedule
* their delayed release).
*/
-void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- int i, last = 0;
+ int i;
int bits = (fmode << 1) | 1;
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i)) {
- BUG_ON(ci->i_nr_by_mode[i] == 0);
- if (--ci->i_nr_by_mode[i] == 0)
- last++;
+ BUG_ON(ci->i_nr_by_mode[i] < count);
+ ci->i_nr_by_mode[i] -= count;
}
}
- dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
- &ci->vfs_inode, fmode,
- ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
- ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
spin_unlock(&ci->i_ceph_lock);
-
- if (last && ci->i_vino.snap == CEPH_NOSNAP)
- ceph_check_caps(ci, 0, NULL);
}
/*
@@ -4152,7 +4203,6 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
if (inode->i_nlink == 1) {
drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
- ci->i_ceph_flags |= CEPH_I_NODELAY;
if (__ceph_caps_dirty(ci)) {
struct ceph_mds_client *mdsc =
ceph_inode_to_client(inode)->mdsc;
@@ -4208,8 +4258,6 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
if (force || (cap->issued & drop)) {
if (cap->issued & drop) {
int wanted = __ceph_caps_wanted(ci);
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
- wanted |= cap->mds_wanted;
dout("encode_inode_release %p cap %p "
"%s -> %s, wanted %s -> %s\n", inode, cap,
ceph_cap_string(cap->issued),
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fb7cabd98e7b..481ac97b4d25 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -218,10 +218,10 @@ static int mds_sessions_show(struct seq_file *s, void *ptr)
return 0;
}
-CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
-CEPH_DEFINE_SHOW_FUNC(mdsc_show)
-CEPH_DEFINE_SHOW_FUNC(caps_show)
-CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
+DEFINE_SHOW_ATTRIBUTE(mdsmap);
+DEFINE_SHOW_ATTRIBUTE(mdsc);
+DEFINE_SHOW_ATTRIBUTE(caps);
+DEFINE_SHOW_ATTRIBUTE(mds_sessions);
/*
@@ -281,25 +281,25 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
0400,
fsc->client->debugfs_dir,
fsc,
- &mdsmap_show_fops);
+ &mdsmap_fops);
fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
0400,
fsc->client->debugfs_dir,
fsc,
- &mds_sessions_show_fops);
+ &mds_sessions_fops);
fsc->debugfs_mdsc = debugfs_create_file("mdsc",
0400,
fsc->client->debugfs_dir,
fsc,
- &mdsc_show_fops);
+ &mdsc_fops);
fsc->debugfs_caps = debugfs_create_file("caps",
0400,
fsc->client->debugfs_dir,
fsc,
- &caps_show_fops);
+ &caps_fops);
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index d0cd0aba5843..4c4202c93b71 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -335,8 +335,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ctx->pos = 2;
}
- /* can we use the dcache? */
spin_lock(&ci->i_ceph_lock);
+ /* request Fx cap. if have Fx, we don't need to release Fs cap
+ * for later create/unlink. */
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_WR);
+ /* can we use the dcache? */
if (ceph_test_mount_opt(fsc, DCACHE) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
@@ -752,7 +755,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct ceph_dentry_info *di = ceph_dentry(dentry);
spin_lock(&ci->i_ceph_lock);
- dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
+ dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
if (strncmp(dentry->d_name.name,
fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
@@ -760,6 +763,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
ceph_test_mount_opt(fsc, DCACHE) &&
__ceph_dir_is_complete(ci) &&
(__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
spin_unlock(&ci->i_ceph_lock);
dout(" dir %p complete, -ENOENT\n", dir);
d_add(dentry, NULL);
@@ -1036,6 +1040,78 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
return err;
}
+static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ int result = req->r_err ? req->r_err :
+ le32_to_cpu(req->r_reply_info.head->result);
+
+ if (result == -EJUKEBOX)
+ goto out;
+
+ /* If op failed, mark everyone involved for errors */
+ if (result) {
+ int pathlen = 0;
+ u64 base = 0;
+ char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ &base, 0);
+
+ /* mark error on parent + clear complete */
+ mapping_set_error(req->r_parent->i_mapping, result);
+ ceph_dir_clear_complete(req->r_parent);
+
+ /* drop the dentry -- we don't know its status */
+ if (!d_unhashed(req->r_dentry))
+ d_drop(req->r_dentry);
+
+ /* mark inode itself for an error (since metadata is bogus) */
+ mapping_set_error(req->r_old_inode->i_mapping, result);
+
+ pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path(path, pathlen);
+ }
+out:
+ iput(req->r_old_inode);
+ ceph_mdsc_release_dir_caps(req);
+}
+
+static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di;
+ int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK;
+
+ spin_lock(&ci->i_ceph_lock);
+ if ((__ceph_caps_issued(ci, NULL) & want) == want) {
+ ceph_take_cap_refs(ci, want, false);
+ got = want;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ /* If we didn't get anything, return 0 */
+ if (!got)
+ return 0;
+
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ /*
+ * - We are holding Fx, which implies Fs caps.
+ * - Only support async unlink for primary linkage
+ */
+ if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen ||
+ !(di->flags & CEPH_DENTRY_PRIMARY_LINK))
+ want = 0;
+ spin_unlock(&dentry->d_lock);
+
+ /* Do we still want what we've got? */
+ if (want == got)
+ return got;
+
+ ceph_put_cap_refs(ci, got);
+ return 0;
+}
+
/*
* rmdir and unlink are differ only by the metadata op code
*/
@@ -1045,6 +1121,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = d_inode(dentry);
struct ceph_mds_request *req;
+ bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
int err = -EROFS;
int op;
@@ -1059,6 +1136,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
} else
goto out;
+retry:
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -1067,13 +1145,39 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_parent = dir;
- set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
- err = ceph_mdsc_do_request(mdsc, dir, req);
- if (!err && !req->r_reply_info.head->is_dentry)
- d_delete(dentry);
+
+ if (try_async && op == CEPH_MDS_OP_UNLINK &&
+ (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
+ dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
+ dentry->d_name.len, dentry->d_name.name,
+ ceph_cap_string(req->r_dir_caps));
+ set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
+ req->r_callback = ceph_async_unlink_cb;
+ req->r_old_inode = d_inode(dentry);
+ ihold(req->r_old_inode);
+ err = ceph_mdsc_submit_request(mdsc, dir, req);
+ if (!err) {
+ /*
+ * We have enough caps, so we assume that the unlink
+ * will succeed. Fix up the target inode and dcache.
+ */
+ drop_nlink(inode);
+ d_delete(dentry);
+ } else if (err == -EJUKEBOX) {
+ try_async = false;
+ ceph_mdsc_put_request(req);
+ goto retry;
+ }
+ } else {
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ d_delete(dentry);
+ }
+
ceph_mdsc_put_request(req);
out:
return err;
@@ -1411,6 +1515,7 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
spin_lock(&dentry->d_lock);
di->time = jiffies;
di->lease_shared_gen = 0;
+ di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
__dentry_lease_unlist(di);
spin_unlock(&dentry->d_lock);
}
@@ -1520,7 +1625,8 @@ static int __dir_lease_try_check(const struct dentry *dentry)
/*
* Check if directory-wide content lease/cap is valid.
*/
-static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
+static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
+ struct ceph_mds_client *mdsc)
{
struct ceph_inode_info *ci = ceph_inode(dir);
int valid;
@@ -1528,7 +1634,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
spin_lock(&ci->i_ceph_lock);
valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
- shared_gen = atomic_read(&ci->i_shared_gen);
+ if (valid) {
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
+ shared_gen = atomic_read(&ci->i_shared_gen);
+ }
spin_unlock(&ci->i_ceph_lock);
if (valid) {
struct ceph_dentry_info *di;
@@ -1554,6 +1663,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
int valid = 0;
struct dentry *parent;
struct inode *dir, *inode;
+ struct ceph_mds_client *mdsc;
if (flags & LOOKUP_RCU) {
parent = READ_ONCE(dentry->d_parent);
@@ -1570,6 +1680,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry,
dentry, inode, ceph_dentry(dentry)->offset);
+ mdsc = ceph_sb_to_client(dir->i_sb)->mdsc;
+
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
@@ -1581,7 +1693,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
valid = dentry_lease_is_valid(dentry, flags);
if (valid == -ECHILD)
return valid;
- if (valid || dir_lease_is_valid(dir, dentry)) {
+ if (valid || dir_lease_is_valid(dir, dentry, mdsc)) {
if (inode)
valid = ceph_is_any_caps(inode);
else
@@ -1590,8 +1702,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
}
if (!valid) {
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(dir->i_sb)->mdsc;
struct ceph_mds_request *req;
int op, err;
u32 mask;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index b6bfa94332c3..79dc06881e78 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -315,6 +315,11 @@ static struct dentry *__get_parent(struct super_block *sb,
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err) {
+ ceph_mdsc_put_request(req);
+ return ERR_PTR(err);
+ }
+
inode = req->r_target_inode;
if (inode)
ihold(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7e0190b1f821..afdfca965a7f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -212,10 +212,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
if (isdir) {
struct ceph_dir_file_info *dfi =
kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
- if (!dfi) {
- ceph_put_fmode(ci, fmode); /* clean up */
+ if (!dfi)
return -ENOMEM;
- }
file->private_data = dfi;
fi = &dfi->file_info;
@@ -223,15 +221,15 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
dfi->readdir_cache_idx = -1;
} else {
fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
- if (!fi) {
- ceph_put_fmode(ci, fmode); /* clean up */
+ if (!fi)
return -ENOMEM;
- }
file->private_data = fi;
}
+ ceph_get_fmode(ci, fmode, 1);
fi->fmode = fmode;
+
spin_lock_init(&fi->rw_contexts_lock);
INIT_LIST_HEAD(&fi->rw_contexts);
fi->meta_err = errseq_sample(&ci->i_meta_err);
@@ -263,7 +261,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFLNK:
dout("init_file %p %p 0%o (symlink)\n", inode, file,
inode->i_mode);
- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
break;
default:
@@ -273,7 +270,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
* we need to drop the open ref now, since we don't
* have .release set to ceph_release.
*/
- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
BUG_ON(inode->i_fop->release == ceph_release);
/* call the proper open fop */
@@ -285,14 +281,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
/*
* try renew caps after session gets killed.
*/
-int ceph_renew_caps(struct inode *inode)
+int ceph_renew_caps(struct inode *inode, int fmode)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
int err, flags, wanted;
spin_lock(&ci->i_ceph_lock);
+ __ceph_touch_fmode(ci, mdsc, fmode);
wanted = __ceph_caps_file_wanted(ci);
if (__ceph_is_any_real_caps(ci) &&
(!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
@@ -326,7 +323,6 @@ int ceph_renew_caps(struct inode *inode)
req->r_inode = inode;
ihold(inode);
req->r_num_caps = 1;
- req->r_fmode = -1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
@@ -372,9 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
/* trivially open snapdir */
if (ceph_snap(inode) == CEPH_SNAPDIR) {
- spin_lock(&ci->i_ceph_lock);
- __ceph_get_fmode(ci, fmode);
- spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
@@ -392,7 +385,7 @@ int ceph_open(struct inode *inode, struct file *file)
dout("open %p fmode %d want %s issued %s using existing\n",
inode, fmode, ceph_cap_string(wanted),
ceph_cap_string(issued));
- __ceph_get_fmode(ci, fmode);
+ __ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
/* adjust wanted? */
@@ -404,7 +397,7 @@ int ceph_open(struct inode *inode, struct file *file)
return ceph_init_file(inode, file, fmode);
} else if (ceph_snap(inode) != CEPH_NOSNAP &&
(ci->i_snap_caps & wanted) == wanted) {
- __ceph_get_fmode(ci, fmode);
+ __ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
@@ -430,6 +423,236 @@ out:
return err;
}
+/* Clone the layout from a synchronous create, if the dir now has Dc caps */
+static void
+cache_file_layout(struct inode *dst, struct inode *src)
+{
+ struct ceph_inode_info *cdst = ceph_inode(dst);
+ struct ceph_inode_info *csrc = ceph_inode(src);
+
+ spin_lock(&cdst->i_ceph_lock);
+ if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
+ !ceph_file_layout_is_valid(&cdst->i_cached_layout)) {
+ memcpy(&cdst->i_cached_layout, &csrc->i_layout,
+ sizeof(cdst->i_cached_layout));
+ rcu_assign_pointer(cdst->i_cached_layout.pool_ns,
+ ceph_try_get_string(csrc->i_layout.pool_ns));
+ }
+ spin_unlock(&cdst->i_ceph_lock);
+}
+
+/*
+ * Try to set up an async create. We need caps, a file layout, and inode number,
+ * and either a lease on the dentry or complete dir info. If any of those
+ * criteria are not satisfied, then return false and the caller can go
+ * synchronous.
+ */
+static int try_prep_async_create(struct inode *dir, struct dentry *dentry,
+ struct ceph_file_layout *lo, u64 *pino)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE;
+ u64 ino;
+
+ spin_lock(&ci->i_ceph_lock);
+ /* No auth cap means no chance for Dc caps */
+ if (!ci->i_auth_cap)
+ goto no_async;
+
+ /* Any delegated inos? */
+ if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos))
+ goto no_async;
+
+ if (!ceph_file_layout_is_valid(&ci->i_cached_layout))
+ goto no_async;
+
+ if ((__ceph_caps_issued(ci, NULL) & want) != want)
+ goto no_async;
+
+ if (d_in_lookup(dentry)) {
+ if (!__ceph_dir_is_complete(ci))
+ goto no_async;
+ spin_lock(&dentry->d_lock);
+ di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
+ spin_unlock(&dentry->d_lock);
+ } else if (atomic_read(&ci->i_shared_gen) !=
+ READ_ONCE(di->lease_shared_gen)) {
+ goto no_async;
+ }
+
+ ino = ceph_get_deleg_ino(ci->i_auth_cap->session);
+ if (!ino)
+ goto no_async;
+
+ *pino = ino;
+ ceph_take_cap_refs(ci, want, false);
+ memcpy(lo, &ci->i_cached_layout, sizeof(*lo));
+ rcu_assign_pointer(lo->pool_ns,
+ ceph_try_get_string(ci->i_cached_layout.pool_ns));
+ got = want;
+no_async:
+ spin_unlock(&ci->i_ceph_lock);
+ return got;
+}
+
+static void restore_deleg_ino(struct inode *dir, u64 ino)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_mds_session *s = NULL;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap)
+ s = ceph_get_mds_session(ci->i_auth_cap->session);
+ spin_unlock(&ci->i_ceph_lock);
+ if (s) {
+ int err = ceph_restore_deleg_ino(s, ino);
+ if (err)
+ pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n",
+ ino, err);
+ ceph_put_mds_session(s);
+ }
+}
+
+static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ int result = req->r_err ? req->r_err :
+ le32_to_cpu(req->r_reply_info.head->result);
+
+ if (result == -EJUKEBOX)
+ goto out;
+
+ mapping_set_error(req->r_parent->i_mapping, result);
+
+ if (result) {
+ struct dentry *dentry = req->r_dentry;
+ int pathlen = 0;
+ u64 base = 0;
+ char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ &base, 0);
+
+ ceph_dir_clear_complete(req->r_parent);
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+
+ /* FIXME: start returning I/O errors on all accesses? */
+ pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path(path, pathlen);
+ }
+
+ if (req->r_target_inode) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ u64 ino = ceph_vino(req->r_target_inode).ino;
+
+ if (req->r_deleg_ino != ino)
+ pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
+ __func__, req->r_err, req->r_deleg_ino, ino);
+ mapping_set_error(req->r_target_inode->i_mapping, result);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
+ wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
+ }
+ ceph_kick_flushing_inode_caps(req->r_session, ci);
+ spin_unlock(&ci->i_ceph_lock);
+ } else {
+ pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
+ req->r_deleg_ino);
+ }
+out:
+ ceph_mdsc_release_dir_caps(req);
+}
+
+static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
+ struct file *file, umode_t mode,
+ struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as_ctx,
+ struct ceph_file_layout *lo)
+{
+ int ret;
+ char xattr_buf[4];
+ struct ceph_mds_reply_inode in = { };
+ struct ceph_mds_reply_info_in iinfo = { .in = &in };
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct inode *inode;
+ struct timespec64 now;
+ struct ceph_vino vino = { .ino = req->r_deleg_ino,
+ .snap = CEPH_NOSNAP };
+
+ ktime_get_real_ts64(&now);
+
+ inode = ceph_get_inode(dentry->d_sb, vino);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ iinfo.inline_version = CEPH_INLINE_NONE;
+ iinfo.change_attr = 1;
+ ceph_encode_timespec64(&iinfo.btime, &now);
+
+ iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
+ iinfo.xattr_data = xattr_buf;
+ memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+
+ in.ino = cpu_to_le64(vino.ino);
+ in.snapid = cpu_to_le64(CEPH_NOSNAP);
+ in.version = cpu_to_le64(1); // ???
+ in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE);
+ in.cap.cap_id = cpu_to_le64(1);
+ in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
+ in.cap.flags = CEPH_CAP_FLAG_AUTH;
+ in.ctime = in.mtime = in.atime = iinfo.btime;
+ in.mode = cpu_to_le32((u32)mode);
+ in.truncate_seq = cpu_to_le32(1);
+ in.truncate_size = cpu_to_le64(-1ULL);
+ in.xattr_version = cpu_to_le64(1);
+ in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ?
+ dir->i_gid : current_fsgid()));
+ in.nlink = cpu_to_le32(1);
+ in.max_size = cpu_to_le64(lo->stripe_unit);
+
+ ceph_file_layout_to_legacy(lo, &in.layout);
+
+ ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
+ req->r_fmode, NULL);
+ if (ret) {
+ dout("%s failed to fill inode: %d\n", __func__, ret);
+ ceph_dir_clear_complete(dir);
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+ if (inode->i_state & I_NEW)
+ discard_new_inode(inode);
+ } else {
+ struct dentry *dn;
+
+ dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
+ vino.ino, dir->i_ino, dentry->d_name.name);
+ ceph_dir_clear_ordered(dir);
+ ceph_init_inode_acls(inode, as_ctx);
+ if (inode->i_state & I_NEW) {
+ /*
+ * If it's not I_NEW, then someone created this before
+ * we got here. Assume the server is aware of it at
+ * that point and don't worry about setting
+ * CEPH_I_ASYNC_CREATE.
+ */
+ ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE;
+ unlock_new_inode(inode);
+ }
+ if (d_in_lookup(dentry) || d_really_is_negative(dentry)) {
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+ dn = d_splice_alias(inode, dentry);
+ WARN_ON_ONCE(dn && dn != dentry);
+ }
+ file->f_mode |= FMODE_CREATED;
+ ret = finish_open(file, dentry, ceph_open);
+ }
+ return ret;
+}
/*
* Do a lookup + open with a single request. If we get a non-existent
@@ -443,6 +666,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct ceph_mds_request *req;
struct dentry *dn;
struct ceph_acl_sec_ctx as_ctx = {};
+ bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
int mask;
int err;
@@ -466,7 +690,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
/* If it's not being looked up, it's negative */
return -ENOENT;
}
-
+retry:
/* do the open */
req = prepare_open_request(dir->i_sb, flags, mode);
if (IS_ERR(req)) {
@@ -475,21 +699,43 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.open.mask = cpu_to_le32(mask);
+ req->r_parent = dir;
+
if (flags & O_CREAT) {
+ struct ceph_file_layout lo;
+
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
if (as_ctx.pagelist) {
req->r_pagelist = as_ctx.pagelist;
as_ctx.pagelist = NULL;
}
+ if (try_async &&
+ (req->r_dir_caps =
+ try_prep_async_create(dir, dentry, &lo,
+ &req->r_deleg_ino))) {
+ set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
+ req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL);
+ req->r_callback = ceph_async_create_cb;
+ err = ceph_mdsc_submit_request(mdsc, dir, req);
+ if (!err) {
+ err = ceph_finish_async_create(dir, dentry,
+ file, mode, req,
+ &as_ctx, &lo);
+ } else if (err == -EJUKEBOX) {
+ restore_deleg_ino(dir, req->r_deleg_ino);
+ ceph_mdsc_put_request(req);
+ try_async = false;
+ goto retry;
+ }
+ goto out_req;
+ }
}
- mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
- if (ceph_security_xattr_wanted(dir))
- mask |= CEPH_CAP_XATTR_SHARED;
- req->r_args.open.mask = cpu_to_le32(mask);
-
- req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
@@ -518,14 +764,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
} else {
dout("atomic_open finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
- ceph_init_inode_acls(d_inode(dentry), &as_ctx);
+ struct inode *newino = d_inode(dentry);
+
+ cache_file_layout(dir, newino);
+ ceph_init_inode_acls(newino, &as_ctx);
file->f_mode |= FMODE_CREATED;
}
err = finish_open(file, dentry, ceph_open);
}
out_req:
- if (!req->r_err && req->r_target_inode)
- ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
ceph_mdsc_put_request(req);
out_ctx:
ceph_release_acl_sec_ctx(&as_ctx);
@@ -542,7 +789,7 @@ int ceph_release(struct inode *inode, struct file *file)
dout("release inode %p dir file %p\n", inode, file);
WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
- ceph_put_fmode(ci, dfi->file_info.fmode);
+ ceph_put_fmode(ci, dfi->file_info.fmode, 1);
if (dfi->last_readdir)
ceph_mdsc_put_request(dfi->last_readdir);
@@ -554,7 +801,8 @@ int ceph_release(struct inode *inode, struct file *file)
dout("release inode %p regular file %p\n", inode, file);
WARN_ON(!list_empty(&fi->rw_contexts));
- ceph_put_fmode(ci, fi->fmode);
+ ceph_put_fmode(ci, fi->fmode, 1);
+
kmem_cache_free(ceph_file_cachep, fi);
}
@@ -1415,10 +1663,13 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0;
int err, want, got;
bool direct_lock = false;
+ u32 map_flags;
+ u64 pool_flags;
loff_t pos;
loff_t limit = max(i_size_read(inode), fsc->max_file_size);
@@ -1481,8 +1732,12 @@ retry_snap:
goto out;
}
- /* FIXME: not complete since it doesn't account for being at quota */
- if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL)) {
+ down_read(&osdc->lock);
+ map_flags = osdc->osdmap->flags;
+ pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
+ up_read(&osdc->lock);
+ if ((map_flags & CEPH_OSDMAP_FULL) ||
+ (pool_flags & CEPH_POOL_FLAG_FULL)) {
err = -ENOSPC;
goto out;
}
@@ -1560,7 +1815,7 @@ retry_snap:
if (dirty)
__mark_inode_dirty(inode, dirty);
if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
+ ceph_check_caps(ci, 0, NULL);
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
@@ -1575,7 +1830,8 @@ retry_snap:
}
if (written >= 0) {
- if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_NEARFULL))
+ if ((map_flags & CEPH_OSDMAP_NEARFULL) ||
+ (pool_flags & CEPH_POOL_FLAG_NEARFULL))
iocb->ki_flags |= IOCB_DSYNC;
written = generic_write_sync(iocb, written);
}
@@ -1936,6 +2192,71 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
return 0;
}
+static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
+ struct ceph_inode_info *dst_ci, u64 *dst_off,
+ struct ceph_fs_client *fsc,
+ size_t len, unsigned int flags)
+{
+ struct ceph_object_locator src_oloc, dst_oloc;
+ struct ceph_object_id src_oid, dst_oid;
+ size_t bytes = 0;
+ u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
+ u32 src_objlen, dst_objlen;
+ u32 object_size = src_ci->i_layout.object_size;
+ int ret;
+
+ src_oloc.pool = src_ci->i_layout.pool_id;
+ src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
+ dst_oloc.pool = dst_ci->i_layout.pool_id;
+ dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
+
+ while (len >= object_size) {
+ ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
+ object_size, &src_objnum,
+ &src_objoff, &src_objlen);
+ ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off,
+ object_size, &dst_objnum,
+ &dst_objoff, &dst_objlen);
+ ceph_oid_init(&src_oid);
+ ceph_oid_printf(&src_oid, "%llx.%08llx",
+ src_ci->i_vino.ino, src_objnum);
+ ceph_oid_init(&dst_oid);
+ ceph_oid_printf(&dst_oid, "%llx.%08llx",
+ dst_ci->i_vino.ino, dst_objnum);
+ /* Do an object remote copy */
+ ret = ceph_osdc_copy_from(&fsc->client->osdc,
+ src_ci->i_vino.snap, 0,
+ &src_oid, &src_oloc,
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
+ &dst_oid, &dst_oloc,
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
+ dst_ci->i_truncate_seq,
+ dst_ci->i_truncate_size,
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
+ if (ret) {
+ if (ret == -EOPNOTSUPP) {
+ fsc->have_copy_from2 = false;
+ pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
+ }
+ dout("ceph_osdc_copy_from returned %d\n", ret);
+ if (!bytes)
+ bytes = ret;
+ goto out;
+ }
+ len -= object_size;
+ bytes += object_size;
+ *src_off += object_size;
+ *dst_off += object_size;
+ }
+
+out:
+ ceph_oloc_destroy(&src_oloc);
+ ceph_oloc_destroy(&dst_oloc);
+ return bytes;
+}
+
static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off,
size_t len, unsigned int flags)
@@ -1946,14 +2267,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
struct ceph_cap_flush *prealloc_cf;
struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
- struct ceph_object_locator src_oloc, dst_oloc;
- struct ceph_object_id src_oid, dst_oid;
- loff_t endoff = 0, size;
- ssize_t ret = -EIO;
+ loff_t size;
+ ssize_t ret = -EIO, bytes;
u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
- u32 src_objlen, dst_objlen, object_size;
+ u32 src_objlen, dst_objlen;
int src_got = 0, dst_got = 0, err, dirty;
- bool do_final_copy = false;
if (src_inode->i_sb != dst_inode->i_sb) {
struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
@@ -2031,22 +2349,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (ret < 0)
goto out_caps;
- size = i_size_read(dst_inode);
- endoff = dst_off + len;
-
/* Drop dst file cached pages */
ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
dst_off >> PAGE_SHIFT,
- endoff >> PAGE_SHIFT);
+ (dst_off + len) >> PAGE_SHIFT);
if (ret < 0) {
dout("Failed to invalidate inode pages (%zd)\n", ret);
ret = 0; /* XXX */
}
- src_oloc.pool = src_ci->i_layout.pool_id;
- src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
- dst_oloc.pool = dst_ci->i_layout.pool_id;
- dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
-
ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
src_ci->i_layout.object_size,
&src_objnum, &src_objoff, &src_objlen);
@@ -2065,6 +2375,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
* starting at the src_off
*/
if (src_objoff) {
+ dout("Initial partial copy of %u bytes\n", src_objlen);
+
/*
* we need to temporarily drop all caps as we'll be calling
* {read,write}_iter, which will get caps again.
@@ -2072,8 +2384,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
ret = do_splice_direct(src_file, &src_off, dst_file,
&dst_off, src_objlen, flags);
- if (ret < 0) {
- dout("do_splice_direct returned %d\n", err);
+ /* Abort on short copies or on error */
+ if (ret < src_objlen) {
+ dout("Failed partial copy (%zd)\n", ret);
goto out;
}
len -= ret;
@@ -2086,65 +2399,27 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (err < 0)
goto out_caps;
}
- object_size = src_ci->i_layout.object_size;
- while (len >= object_size) {
- ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
- object_size, &src_objnum,
- &src_objoff, &src_objlen);
- ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
- object_size, &dst_objnum,
- &dst_objoff, &dst_objlen);
- ceph_oid_init(&src_oid);
- ceph_oid_printf(&src_oid, "%llx.%08llx",
- src_ci->i_vino.ino, src_objnum);
- ceph_oid_init(&dst_oid);
- ceph_oid_printf(&dst_oid, "%llx.%08llx",
- dst_ci->i_vino.ino, dst_objnum);
- /* Do an object remote copy */
- err = ceph_osdc_copy_from(
- &src_fsc->client->osdc,
- src_ci->i_vino.snap, 0,
- &src_oid, &src_oloc,
- CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
- CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
- &dst_oid, &dst_oloc,
- CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
- CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
- dst_ci->i_truncate_seq, dst_ci->i_truncate_size,
- CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
- if (err) {
- if (err == -EOPNOTSUPP) {
- src_fsc->have_copy_from2 = false;
- pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
- }
- dout("ceph_osdc_copy_from returned %d\n", err);
- if (!ret)
- ret = err;
- goto out_caps;
- }
- len -= object_size;
- src_off += object_size;
- dst_off += object_size;
- ret += object_size;
- }
- if (len)
- /* We still need one final local copy */
- do_final_copy = true;
+ size = i_size_read(dst_inode);
+ bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off,
+ src_fsc, len, flags);
+ if (bytes <= 0) {
+ if (!ret)
+ ret = bytes;
+ goto out_caps;
+ }
+ dout("Copied %zu bytes out of %zu\n", bytes, len);
+ len -= bytes;
+ ret += bytes;
file_update_time(dst_file);
inode_inc_iversion_raw(dst_inode);
- if (endoff > size) {
- int caps_flags = 0;
-
+ if (dst_off > size) {
/* Let the MDS know about dst file size change */
- if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
- caps_flags |= CHECK_CAPS_NODELAY;
- if (ceph_inode_set_size(dst_inode, endoff))
- caps_flags |= CHECK_CAPS_AUTHONLY;
- if (caps_flags)
- ceph_check_caps(dst_ci, caps_flags, NULL);
+ if (ceph_inode_set_size(dst_inode, dst_off) ||
+ ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
+ ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL);
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
@@ -2157,15 +2432,18 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
out_caps:
put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
- if (do_final_copy) {
- err = do_splice_direct(src_file, &src_off, dst_file,
- &dst_off, len, flags);
- if (err < 0) {
- dout("do_splice_direct returned %d\n", err);
- goto out;
- }
- len -= err;
- ret += err;
+ /*
+ * Do the final manual copy if we still have some bytes left, unless
+ * there were errors in remote object copies (len >= object_size).
+ */
+ if (len && (len < src_ci->i_layout.object_size)) {
+ dout("Final partial copy of %zu bytes\n", len);
+ bytes = do_splice_direct(src_file, &src_off, dst_file,
+ &dst_off, len, flags);
+ if (bytes > 0)
+ ret += bytes;
+ else
+ dout("Failed partial copy (%zd)\n", bytes);
}
out:
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d01710a16a4a..7fef94fd1e55 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -82,10 +82,14 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
+ inode->i_mtime = parent->i_mtime;
+ inode->i_ctime = parent->i_ctime;
+ inode->i_atime = parent->i_atime;
inode->i_op = &ceph_snapdir_iops;
inode->i_fop = &ceph_snapdir_fops;
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
ci->i_rbytes = 0;
+ ci->i_btime = ceph_inode(parent)->i_btime;
if (inode->i_state & I_NEW)
unlock_new_inode(inode);
@@ -447,6 +451,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_max_files = 0;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
ci->i_fragtree = RB_ROOT;
@@ -471,13 +476,13 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_prealloc_cap_flush = NULL;
INIT_LIST_HEAD(&ci->i_cap_flush_list);
init_waitqueue_head(&ci->i_cap_wq);
- ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
INIT_LIST_HEAD(&ci->i_cap_delay_list);
INIT_LIST_HEAD(&ci->i_cap_snaps);
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
+ ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
ci->i_nr_by_mode[i] = 0;
@@ -496,6 +501,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_rdcache_ref = 0;
ci->i_wr_ref = 0;
ci->i_wb_ref = 0;
+ ci->i_fx_ref = 0;
ci->i_wrbuffer_ref = 0;
ci->i_wrbuffer_ref_head = 0;
atomic_set(&ci->i_filelock_ref, 0);
@@ -586,6 +592,7 @@ void ceph_evict_inode(struct inode *inode)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
}
static inline blkcnt_t calc_inode_blocks(u64 size)
@@ -636,7 +643,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
if ((issued & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_BUFFER)) ||
mapping_mapped(inode->i_mapping) ||
- __ceph_caps_file_wanted(ci)) {
+ __ceph_is_file_opened(ci)) {
ci->i_truncate_pending++;
queue_trunc = 1;
}
@@ -727,11 +734,11 @@ void ceph_fill_file_time(struct inode *inode, int issued,
* Populate an inode based on info from mds. May be called on new or
* existing inodes.
*/
-static int fill_inode(struct inode *inode, struct page *locked_page,
- struct ceph_mds_reply_info_in *iinfo,
- struct ceph_mds_reply_dirfrag *dirinfo,
- struct ceph_mds_session *session, int cap_fmode,
- struct ceph_cap_reservation *caps_reservation)
+int ceph_fill_inode(struct inode *inode, struct page *locked_page,
+ struct ceph_mds_reply_info_in *iinfo,
+ struct ceph_mds_reply_dirfrag *dirinfo,
+ struct ceph_mds_session *session, int cap_fmode,
+ struct ceph_cap_reservation *caps_reservation)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_reply_inode *info = iinfo->in;
@@ -748,7 +755,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
bool new_version = false;
bool fill_inline = false;
- dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
+ dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
inode, ceph_vinop(inode), le64_to_cpu(info->version),
ci->i_version);
@@ -769,7 +776,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (iinfo->xattr_len > 4) {
xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
if (!xattr_blob)
- pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
+ pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
iinfo->xattr_len);
}
@@ -932,8 +939,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
spin_unlock(&ci->i_ceph_lock);
if (symlen != i_size_read(inode)) {
- pr_err("fill_inode %llx.%llx BAD symlink "
- "size %lld\n", ceph_vinop(inode),
+ pr_err("%s %llx.%llx BAD symlink "
+ "size %lld\n", __func__,
+ ceph_vinop(inode),
i_size_read(inode));
i_size_write(inode, symlen);
inode->i_blocks = calc_inode_blocks(symlen);
@@ -957,7 +965,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
inode->i_fop = &ceph_dir_fops;
break;
default:
- pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
+ pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
ceph_vinop(inode), inode->i_mode);
}
@@ -966,7 +974,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (ceph_snap(inode) == CEPH_NOSNAP) {
ceph_add_cap(inode, session,
le64_to_cpu(info->cap.cap_id),
- cap_fmode, info_caps,
+ info_caps,
le32_to_cpu(info->cap.wanted),
le32_to_cpu(info->cap.seq),
le32_to_cpu(info->cap.mseq),
@@ -991,13 +999,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
dout(" %p got snap_caps %s\n", inode,
ceph_cap_string(info_caps));
ci->i_snap_caps |= info_caps;
- if (cap_fmode >= 0)
- __ceph_get_fmode(ci, cap_fmode);
}
- } else if (cap_fmode >= 0) {
- pr_warn("mds issued no caps on %llx.%llx\n",
- ceph_vinop(inode));
- __ceph_get_fmode(ci, cap_fmode);
}
if (iinfo->inline_version > 0 &&
@@ -1009,6 +1011,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
fill_inline = true;
}
+ if (cap_fmode >= 0) {
+ if (!info_caps)
+ pr_warn("mds issued no caps on %llx.%llx\n",
+ ceph_vinop(inode));
+ __ceph_touch_fmode(ci, mdsc, cap_fmode);
+ }
+
spin_unlock(&ci->i_ceph_lock);
if (fill_inline)
@@ -1050,6 +1059,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
struct ceph_mds_session **old_lease_session)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
+ unsigned mask = le16_to_cpu(lease->mask);
long unsigned duration = le32_to_cpu(lease->duration_ms);
long unsigned ttl = from_time + (duration * HZ) / 1000;
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
@@ -1061,8 +1071,13 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
if (ceph_snap(dir) != CEPH_NOSNAP)
return;
+ if (mask & CEPH_LEASE_PRIMARY_LINK)
+ di->flags |= CEPH_DENTRY_PRIMARY_LINK;
+ else
+ di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
+
di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
- if (duration == 0) {
+ if (!(mask & CEPH_LEASE_VALID)) {
__ceph_dentry_dir_lease_touch(di);
return;
}
@@ -1239,10 +1254,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct inode *dir = req->r_parent;
if (dir) {
- err = fill_inode(dir, NULL,
- &rinfo->diri, rinfo->dirfrag,
- session, -1,
- &req->r_caps_reservation);
+ err = ceph_fill_inode(dir, NULL, &rinfo->diri,
+ rinfo->dirfrag, session, -1,
+ &req->r_caps_reservation);
if (err < 0)
goto done;
} else {
@@ -1307,13 +1321,14 @@ retry_lookup:
goto done;
}
- err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
- session,
+ err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
+ NULL, session,
(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+ !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
rinfo->head->result == 0) ? req->r_fmode : -1,
&req->r_caps_reservation);
if (err < 0) {
- pr_err("fill_inode badness %p %llx.%llx\n",
+ pr_err("ceph_fill_inode badness %p %llx.%llx\n",
in, ceph_vinop(in));
if (in->i_state & I_NEW)
discard_new_inode(in);
@@ -1500,10 +1515,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
dout("new_inode badness got %d\n", err);
continue;
}
- rc = fill_inode(in, NULL, &rde->inode, NULL, session,
- -1, &req->r_caps_reservation);
+ rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
+ -1, &req->r_caps_reservation);
if (rc < 0) {
- pr_err("fill_inode badness on %p got %d\n", in, rc);
+ pr_err("ceph_fill_inode badness on %p got %d\n",
+ in, rc);
err = rc;
if (in->i_state & I_NEW) {
ihold(in);
@@ -1707,10 +1723,10 @@ retry_lookup:
}
}
- ret = fill_inode(in, NULL, &rde->inode, NULL, session,
- -1, &req->r_caps_reservation);
+ ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
+ -1, &req->r_caps_reservation);
if (ret < 0) {
- pr_err("fill_inode badness on %p\n", in);
+ pr_err("ceph_fill_inode badness on %p\n", in);
if (d_really_is_negative(dn)) {
/* avoid calling iput_final() in mds
* dispatch threads */
@@ -1972,7 +1988,7 @@ retry:
mutex_unlock(&ci->i_truncate_mutex);
if (wrbuffer_refs == 0)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, 0, NULL);
wake_up_all(&ci->i_cap_wq);
}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index c90f03beb15d..6e061bf62ad4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
spin_lock(&ci->i_ceph_lock);
fi->fmode |= CEPH_FILE_MODE_LAZY;
ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
+ __ceph_touch_fmode(ci, mdsc, fi->fmode);
spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 544e9e85b120..d6b9166e71e4 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -210,6 +210,21 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
return 0;
}
+static int try_unlock_file(struct file *file, struct file_lock *fl)
+{
+ int err;
+ unsigned int orig_flags = fl->fl_flags;
+ fl->fl_flags |= FL_EXISTS;
+ err = locks_lock_file_wait(file, fl);
+ fl->fl_flags = orig_flags;
+ if (err == -ENOENT) {
+ if (!(orig_flags & FL_EXISTS))
+ err = 0;
+ return err;
+ }
+ return 1;
+}
+
/**
* Attempt to set an fcntl lock.
* For now, this just goes away to the server. Later it may be more awesome.
@@ -255,9 +270,15 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
else
lock_cmd = CEPH_LOCK_UNLOCK;
+ if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) {
+ err = try_unlock_file(file, fl);
+ if (err <= 0)
+ return err;
+ }
+
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
if (!err) {
- if (op == CEPH_MDS_OP_SETFILELOCK) {
+ if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
dout("mds locked, locking locally\n");
err = posix_lock_file(file, fl, NULL);
if (err) {
@@ -311,9 +332,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
else
lock_cmd = CEPH_LOCK_UNLOCK;
+ if (F_UNLCK == fl->fl_type) {
+ err = try_unlock_file(file, fl);
+ if (err <= 0)
+ return err;
+ }
+
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
inode, lock_cmd, wait, fl);
- if (!err) {
+ if (!err && F_UNLCK != fl->fl_type) {
err = locks_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index bbbbddf71326..486f91f9685b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -415,21 +415,121 @@ bad:
return -EIO;
}
+
+#if BITS_PER_LONG == 64
+
+#define DELEGATED_INO_AVAILABLE xa_mk_value(1)
+
+static int ceph_parse_deleg_inos(void **p, void *end,
+ struct ceph_mds_session *s)
+{
+ u32 sets;
+
+ ceph_decode_32_safe(p, end, sets, bad);
+ dout("got %u sets of delegated inodes\n", sets);
+ while (sets--) {
+ u64 start, len, ino;
+
+ ceph_decode_64_safe(p, end, start, bad);
+ ceph_decode_64_safe(p, end, len, bad);
+ while (len--) {
+ int err = xa_insert(&s->s_delegated_inos, ino = start++,
+ DELEGATED_INO_AVAILABLE,
+ GFP_KERNEL);
+ if (!err) {
+ dout("added delegated inode 0x%llx\n",
+ start - 1);
+ } else if (err == -EBUSY) {
+ pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
+ start - 1);
+ } else {
+ return err;
+ }
+ }
+ }
+ return 0;
+bad:
+ return -EIO;
+}
+
+u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
+{
+ unsigned long ino;
+ void *val;
+
+ xa_for_each(&s->s_delegated_inos, ino, val) {
+ val = xa_erase(&s->s_delegated_inos, ino);
+ if (val == DELEGATED_INO_AVAILABLE)
+ return ino;
+ }
+ return 0;
+}
+
+int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
+{
+ return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
+ GFP_KERNEL);
+}
+#else /* BITS_PER_LONG == 64 */
+/*
+ * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
+ * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
+ * and bottom words?
+ */
+static int ceph_parse_deleg_inos(void **p, void *end,
+ struct ceph_mds_session *s)
+{
+ u32 sets;
+
+ ceph_decode_32_safe(p, end, sets, bad);
+ if (sets)
+ ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
+ return 0;
+bad:
+ return -EIO;
+}
+
+u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
+{
+ return 0;
+}
+
+int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
+{
+ return 0;
+}
+#endif /* BITS_PER_LONG == 64 */
+
/*
* parse create results
*/
static int parse_reply_info_create(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- u64 features)
+ u64 features, struct ceph_mds_session *s)
{
+ int ret;
+
if (features == (u64)-1 ||
(features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
- /* Malformed reply? */
if (*p == end) {
+ /* Malformed reply? */
info->has_create_ino = false;
- } else {
+ } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
+ u8 struct_v, struct_compat;
+ u32 len;
+
info->has_create_ino = true;
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_64_safe(p, end, info->ino, bad);
+ ret = ceph_parse_deleg_inos(p, end, s);
+ if (ret)
+ return ret;
+ } else {
+ /* legacy */
ceph_decode_64_safe(p, end, info->ino, bad);
+ info->has_create_ino = true;
}
} else {
if (*p != end)
@@ -448,7 +548,7 @@ bad:
*/
static int parse_reply_info_extra(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- u64 features)
+ u64 features, struct ceph_mds_session *s)
{
u32 op = le32_to_cpu(info->head->op);
@@ -457,7 +557,7 @@ static int parse_reply_info_extra(void **p, void *end,
else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
return parse_reply_info_readdir(p, end, info, features);
else if (op == CEPH_MDS_OP_CREATE)
- return parse_reply_info_create(p, end, info, features);
+ return parse_reply_info_create(p, end, info, features, s);
else
return -EIO;
}
@@ -465,7 +565,7 @@ static int parse_reply_info_extra(void **p, void *end,
/*
* parse entire mds reply
*/
-static int parse_reply_info(struct ceph_msg *msg,
+static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
@@ -490,7 +590,7 @@ static int parse_reply_info(struct ceph_msg *msg,
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
ceph_decode_need(&p, end, len, bad);
- err = parse_reply_info_extra(&p, p+len, info, features);
+ err = parse_reply_info_extra(&p, p+len, info, features, s);
if (err < 0)
goto out_bad;
}
@@ -558,6 +658,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
if (refcount_dec_and_test(&s->s_ref)) {
if (s->s_auth.authorizer)
ceph_auth_destroy_authorizer(s->s_auth.authorizer);
+ xa_destroy(&s->s_delegated_inos);
kfree(s);
}
}
@@ -645,6 +746,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
refcount_set(&s->s_ref, 1);
INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe);
+ xa_init(&s->s_delegated_inos);
s->s_num_cap_releases = 0;
s->s_cap_reconnect = 0;
s->s_cap_iterator = NULL;
@@ -699,6 +801,7 @@ void ceph_mdsc_release_request(struct kref *kref)
struct ceph_mds_request *req = container_of(kref,
struct ceph_mds_request,
r_kref);
+ ceph_mdsc_release_dir_caps(req);
destroy_reply_info(&req->r_reply_info);
if (req->r_request)
ceph_msg_put(req->r_request);
@@ -736,7 +839,7 @@ void ceph_mdsc_release_request(struct kref *kref)
put_request_session(req);
ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
WARN_ON_ONCE(!list_empty(&req->r_wait));
- kfree(req);
+ kmem_cache_free(ceph_mds_request_cachep, req);
}
DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
@@ -793,8 +896,13 @@ static void __register_request(struct ceph_mds_client *mdsc,
mdsc->oldest_tid = req->r_tid;
if (dir) {
+ struct ceph_inode_info *ci = ceph_inode(dir);
+
ihold(dir);
req->r_unsafe_dir = dir;
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
+ spin_unlock(&ci->i_unsafe_lock);
}
}
@@ -822,8 +930,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
erase_request(&mdsc->request_tree, req);
- if (req->r_unsafe_dir &&
- test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ if (req->r_unsafe_dir) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
@@ -1407,8 +1514,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
spin_lock(&ci->i_ceph_lock);
- if (cap->mds_wanted | cap->issued)
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
__ceph_remove_cap(cap, false);
if (!ci->i_auth_cap) {
struct ceph_cap_flush *cf;
@@ -1574,9 +1679,6 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
/* mds did not re-issue stale cap */
spin_lock(&ci->i_ceph_lock);
cap->issued = cap->implemented = CEPH_CAP_PIN;
- /* make sure mds knows what we want */
- if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
spin_unlock(&ci->i_ceph_lock);
}
} else if (ev == FORCE_RO) {
@@ -1772,7 +1874,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
}
/* The inode has cached pages, but it's no longer used.
* we can safely drop it */
- if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
+ if (S_ISREG(inode->i_mode) &&
+ wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
!(oissued & CEPH_CAP_FILE_CACHE)) {
used = 0;
oissued = 0;
@@ -2089,8 +2192,9 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
{
- struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+ struct ceph_mds_request *req;
+ req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS);
if (!req)
return ERR_PTR(-ENOMEM);
@@ -2368,7 +2472,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
head->op = cpu_to_le32(req->r_op);
head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
- head->ino = 0;
+ head->ino = cpu_to_le64(req->r_deleg_ino);
head->args = req->r_args;
ceph_encode_filepath(&p, end, ino1, path1);
@@ -2382,7 +2486,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_inode_drop)
releases += ceph_encode_inode_release(&p,
req->r_inode ? req->r_inode : d_inode(req->r_dentry),
- mds, req->r_inode_drop, req->r_inode_unless, 0);
+ mds, req->r_inode_drop, req->r_inode_unless,
+ req->r_op == CEPH_MDS_OP_READDIR);
if (req->r_dentry_drop)
releases += ceph_encode_dentry_release(&p, req->r_dentry,
req->r_parent, mds, req->r_dentry_drop,
@@ -2522,12 +2627,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
flags |= CEPH_MDS_FLAG_REPLAY;
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
+ flags |= CEPH_MDS_FLAG_ASYNC;
if (req->r_parent)
flags |= CEPH_MDS_FLAG_WANT_DENTRY;
rhead->flags = cpu_to_le32(flags);
rhead->num_fwd = req->r_num_fwd;
rhead->num_retry = req->r_attempts - 1;
- rhead->ino = 0;
dout(" r_parent = %p\n", req->r_parent);
return 0;
@@ -2573,7 +2679,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (req->r_timeout &&
time_after_eq(jiffies, req->r_started + req->r_timeout)) {
dout("do_request timed out\n");
- err = -EIO;
+ err = -ETIMEDOUT;
goto finish;
}
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
@@ -2605,6 +2711,10 @@ static void __do_request(struct ceph_mds_client *mdsc,
mds = __choose_mds(mdsc, req, &random);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
+ err = -EJUKEBOX;
+ goto finish;
+ }
dout("do_request no mds or not active, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
return;
@@ -2629,6 +2739,15 @@ static void __do_request(struct ceph_mds_client *mdsc,
err = -EACCES;
goto out_session;
}
+ /*
+ * We cannot queue async requests since the caps and delegated
+ * inodes are bound to the session. Just return -EJUKEBOX and
+ * let the caller retry a sync request in that case.
+ */
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
+ err = -EJUKEBOX;
+ goto out_session;
+ }
if (session->s_state == CEPH_MDS_SESSION_NEW ||
session->s_state == CEPH_MDS_SESSION_CLOSING) {
__open_session(mdsc, session);
@@ -2709,19 +2828,43 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
struct ceph_mds_request *req)
{
- int err;
+ int err = 0;
/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
if (req->r_inode)
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
if (req->r_parent) {
- ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ struct ceph_inode_info *ci = ceph_inode(req->r_parent);
+ int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ?
+ CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD;
+ spin_lock(&ci->i_ceph_lock);
+ ceph_take_cap_refs(ci, CEPH_CAP_PIN, false);
+ __ceph_touch_fmode(ci, mdsc, fmode);
+ spin_unlock(&ci->i_ceph_lock);
ihold(req->r_parent);
}
if (req->r_old_dentry_dir)
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
+ if (req->r_inode) {
+ err = ceph_wait_on_async_create(req->r_inode);
+ if (err) {
+ dout("%s: wait for async create returned: %d\n",
+ __func__, err);
+ return err;
+ }
+ }
+
+ if (!err && req->r_old_inode) {
+ err = ceph_wait_on_async_create(req->r_old_inode);
+ if (err) {
+ dout("%s: wait for async create returned: %d\n",
+ __func__, err);
+ return err;
+ }
+ }
+
dout("submit_request on %p for inode %p\n", req, dir);
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
@@ -2747,7 +2890,7 @@ static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
if (timeleft > 0)
err = 0;
else if (!timeleft)
- err = -EIO; /* timed out */
+ err = -ETIMEDOUT; /* timed out */
else
err = timeleft; /* killed */
}
@@ -2935,22 +3078,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
} else {
set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
- if (req->r_unsafe_dir) {
- struct ceph_inode_info *ci =
- ceph_inode(req->r_unsafe_dir);
- spin_lock(&ci->i_unsafe_lock);
- list_add_tail(&req->r_unsafe_dir_item,
- &ci->i_unsafe_dirops);
- spin_unlock(&ci->i_unsafe_lock);
- }
}
dout("handle_reply tid %lld result %d\n", tid, result);
rinfo = &req->r_reply_info;
if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
- err = parse_reply_info(msg, rinfo, (u64)-1);
+ err = parse_reply_info(session, msg, rinfo, (u64)-1);
else
- err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
+ err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
@@ -3249,6 +3384,17 @@ bad:
return;
}
+void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
+{
+ int dcaps;
+
+ dcaps = xchg(&req->r_dir_caps, 0);
+ if (dcaps) {
+ dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
+ ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
+ }
+}
+
/*
* called under session->mutex.
*/
@@ -3276,9 +3422,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
continue;
if (req->r_attempts == 0)
continue; /* only old requests */
- if (req->r_session &&
- req->r_session->s_mds == session->s_mds)
- __send_request(mdsc, session, req, true);
+ if (!req->r_session)
+ continue;
+ if (req->r_session->s_mds != session->s_mds)
+ continue;
+
+ ceph_mdsc_release_dir_caps(req);
+
+ __send_request(mdsc, session, req, true);
}
mutex_unlock(&mdsc->mutex);
}
@@ -3362,7 +3513,7 @@ fail_msg:
/*
* Encode information about a cap for a reconnect with the MDS.
*/
-static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
+static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
void *arg)
{
union {
@@ -3385,6 +3536,15 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
cap->mseq = 0; /* and migrate_seq */
cap->cap_gen = cap->session->s_cap_gen;
+ /* These are lost when the session goes away */
+ if (S_ISDIR(inode->i_mode)) {
+ if (cap->issued & CEPH_CAP_DIR_CREATE) {
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
+ }
+ cap->issued &= ~CEPH_CAP_ANY_DIR_OPS;
+ }
+
if (recon_state->msg_version >= 2) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
@@ -3626,6 +3786,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
if (!reply)
goto fail_nomsg;
+ xa_destroy(&session->s_delegated_inos);
+
mutex_lock(&session->s_mutex);
session->s_state = CEPH_MDS_SESSION_RECONNECTING;
session->s_seq = 0;
@@ -3681,7 +3843,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
recon_state.msg_version = 2;
}
/* trsaverse this session's caps */
- err = ceph_iterate_session_caps(session, encode_caps_cb, &recon_state);
+ err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);
spin_lock(&session->s_cap_lock);
session->s_cap_reconnect = 0;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 27a7446e10d3..903d9edfd4bf 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -23,8 +23,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_RECLAIM_CLIENT,
CEPHFS_FEATURE_LAZY_CAP_WANTED,
CEPHFS_FEATURE_MULTI_RECONNECT,
+ CEPHFS_FEATURE_DELEG_INO,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
};
/*
@@ -37,6 +38,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_REPLY_ENCODING, \
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
CEPHFS_FEATURE_MULTI_RECONNECT, \
+ CEPHFS_FEATURE_DELEG_INO, \
\
CEPHFS_FEATURE_MAX, \
}
@@ -201,6 +203,7 @@ struct ceph_mds_session {
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
+ struct xarray s_delegated_inos;
};
/*
@@ -255,6 +258,7 @@ struct ceph_mds_request {
#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */
#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */
#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */
+#define CEPH_MDS_R_ASYNC (8) /* async request */
unsigned long r_req_flags;
struct mutex r_fill_mutex;
@@ -263,6 +267,7 @@ struct ceph_mds_request {
int r_fmode; /* file mode, if expecting cap */
kuid_t r_uid;
kgid_t r_gid;
+ int r_request_release_offset;
struct timespec64 r_stamp;
/* for choosing which mds to send this request to */
@@ -280,12 +285,16 @@ struct ceph_mds_request {
int r_old_inode_drop, r_old_inode_unless;
struct ceph_msg *r_request; /* original request */
- int r_request_release_offset;
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
- struct page *r_locked_page;
int r_err;
+
+ struct page *r_locked_page;
+ int r_dir_caps;
+ int r_num_caps;
+ u32 r_readdir_offset;
+
unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */
unsigned long r_request_started; /* start time for mds request only,
@@ -304,6 +313,7 @@ struct ceph_mds_request {
int r_num_fwd; /* number of forward attempts */
int r_resend_mds; /* mds to resend to next, if any*/
u32 r_sent_on_mseq; /* cap mseq request was sent at*/
+ u64 r_deleg_ino;
struct list_head r_wait;
struct completion r_completion;
@@ -315,10 +325,8 @@ struct ceph_mds_request {
long long r_dir_release_cnt;
long long r_dir_ordered_cnt;
int r_readdir_cache_idx;
- u32 r_readdir_offset;
struct ceph_cap_reservation r_caps_reservation;
- int r_num_caps;
};
struct ceph_pool_perm {
@@ -488,6 +496,7 @@ extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
+extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req);
static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
{
kref_get(&req->r_kref);
@@ -512,7 +521,7 @@ extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
static inline void ceph_mdsc_free_path(char *path, int len)
{
- if (path)
+ if (!IS_ERR_OR_NULL(path))
__putname(path - (PATH_MAX - 1 - len));
}
@@ -537,4 +546,15 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
int max_caps);
+
+static inline int ceph_wait_on_async_create(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
+ TASK_INTERRUPTIBLE);
+}
+
+extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
+extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index ccfcc66aaf44..923be9399b21 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1155,5 +1155,6 @@ void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
pr_err("snapid map %llx -> %x still in use\n",
sm->snap, sm->dev);
}
+ kfree(sm);
}
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c7f150686a53..c9784eb1159a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -155,6 +155,7 @@ enum {
Opt_acl,
Opt_quotadf,
Opt_copyfrom,
+ Opt_wsync,
};
enum ceph_recover_session_mode {
@@ -194,6 +195,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
fsparam_string ("snapdirname", Opt_snapdirname),
fsparam_string ("source", Opt_source),
fsparam_u32 ("wsize", Opt_wsize),
+ fsparam_flag_no ("wsync", Opt_wsync),
{}
};
@@ -444,6 +446,12 @@ static int ceph_parse_mount_param(struct fs_context *fc,
fc->sb_flags &= ~SB_POSIXACL;
}
break;
+ case Opt_wsync:
+ if (!result.negated)
+ fsopt->flags &= ~CEPH_MOUNT_OPT_ASYNC_DIROPS;
+ else
+ fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
+ break;
default:
BUG();
}
@@ -567,6 +575,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
seq_show_option(m, "recover_session", "clean");
+ if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+ seq_puts(m, ",nowsync");
+
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
seq_printf(m, ",wsize=%u", fsopt->wsize);
if (fsopt->rsize != CEPH_MAX_READ_SIZE)
@@ -729,6 +740,7 @@ struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
struct kmem_cache *ceph_dir_file_cachep;
+struct kmem_cache *ceph_mds_request_cachep;
static void ceph_inode_init_once(void *foo)
{
@@ -769,6 +781,10 @@ static int __init init_caches(void)
if (!ceph_dir_file_cachep)
goto bad_dir_file;
+ ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, SLAB_MEM_SPREAD);
+ if (!ceph_mds_request_cachep)
+ goto bad_mds_req;
+
error = ceph_fscache_register();
if (error)
goto bad_fscache;
@@ -776,6 +792,8 @@ static int __init init_caches(void)
return 0;
bad_fscache:
+ kmem_cache_destroy(ceph_mds_request_cachep);
+bad_mds_req:
kmem_cache_destroy(ceph_dir_file_cachep);
bad_dir_file:
kmem_cache_destroy(ceph_file_cachep);
@@ -804,6 +822,7 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
kmem_cache_destroy(ceph_dir_file_cachep);
+ kmem_cache_destroy(ceph_mds_request_cachep);
ceph_fscache_unregister();
}
@@ -1107,6 +1126,15 @@ static void ceph_free_fc(struct fs_context *fc)
static int ceph_reconfigure_fc(struct fs_context *fc)
{
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb);
+
+ if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+ ceph_set_mount_opt(fsc, ASYNC_DIROPS);
+ else
+ ceph_clear_mount_opt(fsc, ASYNC_DIROPS);
+
sync_filesystem(fc->root->d_sb);
return 0;
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 037cdfb2ad4f..60aac3aee055 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -43,13 +43,16 @@
#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */
#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
+#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */
#define CEPH_MOUNT_OPT_DEFAULT \
(CEPH_MOUNT_OPT_DCACHE | \
CEPH_MOUNT_OPT_NOCOPYFROM)
#define ceph_set_mount_opt(fsc, opt) \
- (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt
+#define ceph_clear_mount_opt(fsc, opt) \
+ (fsc)->mount_options->flags &= ~CEPH_MOUNT_OPT_##opt
#define ceph_test_mount_opt(fsc, opt) \
(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
@@ -170,9 +173,9 @@ struct ceph_cap {
struct list_head caps_item;
};
-#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
-#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
-#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
+#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */
+#define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */
+#define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */
struct ceph_cap_flush {
u64 tid;
@@ -284,6 +287,7 @@ struct ceph_dentry_info {
#define CEPH_DENTRY_REFERENCED 1
#define CEPH_DENTRY_LEASE_LIST 2
#define CEPH_DENTRY_SHRINK_LIST 4
+#define CEPH_DENTRY_PRIMARY_LINK 8
struct ceph_inode_xattrs_info {
/*
@@ -315,13 +319,14 @@ struct ceph_inode_info {
u64 i_inline_version;
u32 i_time_warp_seq;
- unsigned i_ceph_flags;
+ unsigned long i_ceph_flags;
atomic64_t i_release_count;
atomic64_t i_ordered_count;
atomic64_t i_complete_seq[2];
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
+ struct ceph_file_layout i_cached_layout; // for async creates
char *i_symlink;
/* for dirs */
@@ -352,7 +357,6 @@ struct ceph_inode_info {
struct ceph_cap_flush *i_prealloc_cap_flush;
struct list_head i_cap_flush_list;
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
- unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
struct list_head i_cap_delay_list; /* for delayed cap release to mds */
struct ceph_cap_reservation i_cap_migration_resv;
@@ -361,6 +365,8 @@ struct ceph_inode_info {
dirty|flushing caps */
unsigned i_snap_caps; /* cap bits for snapped files */
+ unsigned long i_last_rd;
+ unsigned long i_last_wr;
int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */
struct mutex i_truncate_mutex;
@@ -375,7 +381,7 @@ struct ceph_inode_info {
/* held references to caps */
int i_pin_ref;
- int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
+ int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref, i_fx_ref;
int i_wrbuffer_ref, i_wrbuffer_ref_head;
atomic_t i_filelock_ref;
atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */
@@ -511,18 +517,18 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
* Ceph inode.
*/
#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */
-#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */
#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */
#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */
#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */
#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */
#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */
-#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */
-#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */
-#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */
-#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */
-#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */
-#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */
+#define CEPH_I_KICK_FLUSH (1 << 7) /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS (1 << 8) /* need flush snapss */
+#define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */
+#define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */
+#define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */
+#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */
+#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT)
/*
* Masks of ceph inode work.
@@ -674,18 +680,12 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
extern int __ceph_caps_used(struct ceph_inode_info *ci);
-extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
-
-/*
- * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
- */
-static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
+static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
{
- int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
- if (w & CEPH_CAP_FILE_BUFFER)
- w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
- return w;
+ return ci->i_nr_by_mode[0];
}
+extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
+extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
/* what the mds thinks we want */
extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
@@ -899,6 +899,9 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
}
/* inode.c */
+struct ceph_mds_reply_info_in;
+struct ceph_mds_reply_dirfrag;
+
extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
@@ -914,6 +917,11 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
u64 time_warp_seq, struct timespec64 *ctime,
struct timespec64 *mtime,
struct timespec64 *atime);
+extern int ceph_fill_inode(struct inode *inode, struct page *locked_page,
+ struct ceph_mds_reply_info_in *iinfo,
+ struct ceph_mds_reply_dirfrag *dirinfo,
+ struct ceph_mds_session *session, int cap_fmode,
+ struct ceph_cap_reservation *caps_reservation);
extern int ceph_fill_trace(struct super_block *sb,
struct ceph_mds_request *req);
extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
@@ -1042,7 +1050,7 @@ extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
extern void ceph_add_cap(struct inode *inode,
struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
+ unsigned issued, unsigned wanted,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
@@ -1058,8 +1066,12 @@ extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
+ struct ceph_inode_info *ci);
extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
int mds);
+extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
+ bool snap_rwsem_locked);
extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
@@ -1084,8 +1096,10 @@ extern int ceph_try_get_caps(struct inode *inode,
int need, int want, bool nonblock, int *got);
/* for counting open files by mode */
-extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
-extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
+ struct ceph_mds_client *mdsc, int fmode);
/* addr.c */
extern const struct address_space_operations ceph_aops;
@@ -1097,7 +1111,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
/* file.c */
extern const struct file_operations ceph_file_fops;
-extern int ceph_renew_caps(struct inode *inode);
+extern int ceph_renew_caps(struct inode *inode, int fmode);
extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 22cf04fb32d3..604f65f4b6c5 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -202,7 +202,7 @@ config CIFS_SMB_DIRECT
help
Enables SMB Direct support for SMB 3.0, 3.02 and 3.1.1.
SMB Direct allows transferring SMB packets over RDMA. If unsure,
- say N.
+ say Y.
config CIFS_FSCACHE
bool "Provide CIFS client caching support"
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 276e4b5ea8e0..916567d770f5 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -323,10 +323,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
atomic_read(&server->smbd_conn->send_credits),
atomic_read(&server->smbd_conn->receive_credits),
server->smbd_conn->receive_credit_target);
- seq_printf(m, "\nPending send_pending: %x "
- "send_payload_pending: %x",
- atomic_read(&server->smbd_conn->send_pending),
- atomic_read(&server->smbd_conn->send_payload_pending));
+ seq_printf(m, "\nPending send_pending: %x ",
+ atomic_read(&server->smbd_conn->send_pending));
seq_printf(m, "\nReceive buffers count_receive_queue: %x "
"count_empty_packet_queue: %x",
server->smbd_conn->count_receive_queue,
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 716574aab3b6..ae421634aa42 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -342,7 +342,7 @@ static int
sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
struct cifs_fattr *fattr, uint sidtype)
{
- int rc;
+ int rc = 0;
struct key *sidkey;
char *sidstr;
const struct cred *saved_cred;
@@ -450,11 +450,12 @@ out_revert_creds:
* fails then we just fall back to using the mnt_uid/mnt_gid.
*/
got_valid_id:
+ rc = 0;
if (sidtype == SIDOWNER)
fattr->cf_uid = fuid;
else
fattr->cf_gid = fgid;
- return 0;
+ return rc;
}
int
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index fa77fe5258b0..c31f362fa098 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1018,7 +1018,7 @@ struct file_system_type cifs_fs_type = {
.name = "cifs",
.mount = cifs_do_mount,
.kill_sb = cifs_kill_sb,
- /* .fs_flags */
+ .fs_flags = FS_RENAME_DOES_D_MOVE,
};
MODULE_ALIAS_FS("cifs");
@@ -1027,7 +1027,7 @@ static struct file_system_type smb3_fs_type = {
.name = "smb3",
.mount = smb3_do_mount,
.kill_sb = cifs_kill_sb,
- /* .fs_flags */
+ .fs_flags = FS_RENAME_DOES_D_MOVE,
};
MODULE_ALIAS_FS("smb3");
MODULE_ALIAS("smb3");
@@ -1208,6 +1208,10 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
{
unsigned int xid = get_xid();
ssize_t rc;
+ struct cifsFileInfo *cfile = dst_file->private_data;
+
+ if (cfile->swapfile)
+ return -EOPNOTSUPP;
rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff,
len, flags);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index b87456bae1a1..c9e2e6bbca13 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -156,5 +156,5 @@ extern int cifs_truncate_page(struct address_space *mapping, loff_t from);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.25"
+#define CIFS_VERSION "2.26"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0d956360e984..05dd3dea684b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -426,7 +426,8 @@ struct smb_version_operations {
/* generate new lease key */
void (*new_lease_key)(struct cifs_fid *);
int (*generate_signingkey)(struct cifs_ses *);
- int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *);
+ int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *,
+ bool allocate_crypto);
int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon,
struct cifsFileInfo *src_file);
int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon,
@@ -1312,6 +1313,7 @@ struct cifsFileInfo {
struct tcon_link *tlink;
unsigned int f_flags;
bool invalidHandle:1; /* file closed via session abend */
+ bool swapfile:1;
bool oplock_break_cancelled:1;
unsigned int oplock_epoch; /* epoch from the lease break */
__u32 oplock_level; /* oplock/lease level from the lease break */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 79d842e7240c..593d826820c3 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1021,7 +1021,7 @@ typedef struct smb_com_writex_req {
__le16 ByteCount;
__u8 Pad; /* BB check for whether padded to DWORD
boundary and optimum performance here */
- char Data[0];
+ char Data[];
} __attribute__((packed)) WRITEX_REQ;
typedef struct smb_com_write_req {
@@ -1041,7 +1041,7 @@ typedef struct smb_com_write_req {
__le16 ByteCount;
__u8 Pad; /* BB check for whether padded to DWORD
boundary and optimum performance here */
- char Data[0];
+ char Data[];
} __attribute__((packed)) WRITE_REQ;
typedef struct smb_com_write_rsp {
@@ -1306,7 +1306,7 @@ typedef struct smb_com_ntransact_req {
/* SetupCount words follow then */
__le16 ByteCount;
__u8 Pad[3];
- __u8 Parms[0];
+ __u8 Parms[];
} __attribute__((packed)) NTRANSACT_REQ;
typedef struct smb_com_ntransact_rsp {
@@ -1523,7 +1523,7 @@ struct file_notify_information {
__le32 NextEntryOffset;
__le32 Action;
__le32 FileNameLength;
- __u8 FileName[0];
+ __u8 FileName[];
} __attribute__((packed));
/* For IO_REPARSE_TAG_SYMLINK */
@@ -1536,7 +1536,7 @@ struct reparse_symlink_data {
__le16 PrintNameOffset;
__le16 PrintNameLength;
__le32 Flags;
- char PathBuffer[0];
+ char PathBuffer[];
} __attribute__((packed));
/* Flag above */
@@ -1553,7 +1553,7 @@ struct reparse_posix_data {
__le16 ReparseDataLength;
__u16 Reserved;
__le64 InodeType; /* LNK, FIFO, CHR etc. */
- char PathBuffer[0];
+ char PathBuffer[];
} __attribute__((packed));
struct cifs_quota_data {
@@ -1691,6 +1691,7 @@ struct smb_t2_rsp {
#define SMB_FIND_FILE_ID_FULL_DIR_INFO 0x105
#define SMB_FIND_FILE_ID_BOTH_DIR_INFO 0x106
#define SMB_FIND_FILE_UNIX 0x202
+#define SMB_FIND_FILE_POSIX_INFO 0x064
typedef struct smb_com_transaction2_qpi_req {
struct smb_hdr hdr; /* wct = 14+ */
@@ -1761,7 +1762,7 @@ struct set_file_rename {
__le32 overwrite; /* 1 = overwrite dest */
__u32 root_fid; /* zero */
__le32 target_name_len;
- char target_name[0]; /* Must be unicode */
+ char target_name[]; /* Must be unicode */
} __attribute__((packed));
struct smb_com_transaction2_sfi_req {
@@ -2450,7 +2451,7 @@ struct cifs_posix_acl { /* access conrol list (ACL) */
__le16 version;
__le16 access_entry_count; /* access ACL - count of entries */
__le16 default_entry_count; /* default ACL - count of entries */
- struct cifs_posix_ace ace_array[0];
+ struct cifs_posix_ace ace_array[];
/* followed by
struct cifs_posix_ace default_ace_arraay[] */
} __attribute__((packed)); /* level 0x204 */
@@ -2756,7 +2757,7 @@ typedef struct file_xattr_info {
/* BB do we need another field for flags? BB */
__u32 xattr_name_len;
__u32 xattr_value_len;
- char xattr_name[0];
+ char xattr_name[];
/* followed by xattr_value[xattr_value_len], no pad */
} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info
level 0x205 */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index e5cb681ec138..12a895e02db4 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -602,6 +602,11 @@ int smb2_parse_query_directory(struct cifs_tcon *tcon, struct kvec *rsp_iov,
int resp_buftype,
struct cifs_search_info *srch_inf);
+struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server);
+void cifs_put_tcp_super(struct super_block *sb);
+int update_super_prepath(struct cifs_tcon *tcon, const char *prefix,
+ size_t prefix_len);
+
#ifdef CONFIG_CIFS_DFS_UPCALL
static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
const char *old_path,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 6f6fb3606a5d..140efc1a9374 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -162,9 +162,18 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
for (it = dfs_cache_get_tgt_iterator(&tl); it;
it = dfs_cache_get_next_tgt(&tl, it)) {
- const char *tgt = dfs_cache_get_tgt_name(it);
+ const char *share, *prefix;
+ size_t share_len, prefix_len;
- extract_unc_hostname(tgt, &dfs_host, &dfs_host_len);
+ rc = dfs_cache_get_tgt_share(it, &share, &share_len, &prefix,
+ &prefix_len);
+ if (rc) {
+ cifs_dbg(VFS, "%s: failed to parse target share %d\n",
+ __func__, rc);
+ continue;
+ }
+
+ extract_unc_hostname(share, &dfs_host, &dfs_host_len);
if (dfs_host_len != tcp_host_len
|| strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) {
@@ -175,11 +184,13 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
continue;
}
- scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
+ scnprintf(tree, MAX_TREE_SIZE, "\\%.*s", (int)share_len, share);
rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
- if (!rc)
+ if (!rc) {
+ rc = update_super_prepath(tcon, prefix, prefix_len);
break;
+ }
if (rc == -EREMOTE)
break;
}
@@ -320,7 +331,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
atomic_inc(&tconInfoReconnectCount);
/* tell server Unix caps we support */
- if (ses->capabilities & CAP_UNIX)
+ if (cap_unix(ses))
reset_cifs_unix_caps(0, tcon, NULL, NULL);
/*
@@ -1591,7 +1602,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
if (server->ops->is_session_expired &&
server->ops->is_session_expired(buf)) {
cifs_reconnect(server);
- wake_up(&server->response_q);
return -1;
}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4804d1df8c1c..95b3ab0ca8c0 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -21,6 +21,7 @@
#include <linux/fs.h>
#include <linux/net.h>
#include <linux/string.h>
+#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/list.h>
#include <linux/wait.h>
@@ -57,7 +58,6 @@
#include "smb2proto.h"
#include "smbdirect.h"
#include "dns_resolve.h"
-#include "cifsfs.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
#include "dfs_cache.h"
#endif
@@ -389,54 +389,7 @@ static inline int reconn_set_ipaddr(struct TCP_Server_Info *server)
#endif
#ifdef CONFIG_CIFS_DFS_UPCALL
-struct super_cb_data {
- struct TCP_Server_Info *server;
- struct super_block *sb;
-};
-
/* These functions must be called with server->srv_mutex held */
-
-static void super_cb(struct super_block *sb, void *arg)
-{
- struct super_cb_data *d = arg;
- struct cifs_sb_info *cifs_sb;
- struct cifs_tcon *tcon;
-
- if (d->sb)
- return;
-
- cifs_sb = CIFS_SB(sb);
- tcon = cifs_sb_master_tcon(cifs_sb);
- if (tcon->ses->server == d->server)
- d->sb = sb;
-}
-
-static struct super_block *get_tcp_super(struct TCP_Server_Info *server)
-{
- struct super_cb_data d = {
- .server = server,
- .sb = NULL,
- };
-
- iterate_supers_type(&cifs_fs_type, super_cb, &d);
-
- if (unlikely(!d.sb))
- return ERR_PTR(-ENOENT);
- /*
- * Grab an active reference in order to prevent automounts (DFS links)
- * of expiring and then freeing up our cifs superblock pointer while
- * we're doing failover.
- */
- cifs_sb_active(d.sb);
- return d.sb;
-}
-
-static inline void put_tcp_super(struct super_block *sb)
-{
- if (!IS_ERR_OR_NULL(sb))
- cifs_sb_deactive(sb);
-}
-
static void reconn_inval_dfs_target(struct TCP_Server_Info *server,
struct cifs_sb_info *cifs_sb,
struct dfs_cache_tgt_list *tgt_list,
@@ -508,7 +461,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
server->nr_targets = 1;
#ifdef CONFIG_CIFS_DFS_UPCALL
spin_unlock(&GlobalMid_Lock);
- sb = get_tcp_super(server);
+ sb = cifs_get_tcp_super(server);
if (IS_ERR(sb)) {
rc = PTR_ERR(sb);
cifs_dbg(FYI, "%s: will not do DFS failover: rc = %d\n",
@@ -535,8 +488,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
spin_unlock(&GlobalMid_Lock);
#ifdef CONFIG_CIFS_DFS_UPCALL
dfs_cache_free_tgts(&tgt_list);
- put_tcp_super(sb);
+ cifs_put_tcp_super(sb);
#endif
+ wake_up(&server->response_q);
return rc;
} else
server->tcpStatus = CifsNeedReconnect;
@@ -666,11 +620,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
}
- put_tcp_super(sb);
+ cifs_put_tcp_super(sb);
#endif
if (server->tcpStatus == CifsNeedNegotiate)
mod_delayed_work(cifsiod_wq, &server->echo, 0);
+ wake_up(&server->response_q);
return rc;
}
@@ -765,7 +720,6 @@ server_unresponsive(struct TCP_Server_Info *server)
cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n",
(3 * server->echo_interval) / HZ);
cifs_reconnect(server);
- wake_up(&server->response_q);
return true;
}
@@ -898,7 +852,6 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
*/
cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT);
cifs_reconnect(server);
- wake_up(&server->response_q);
break;
default:
cifs_server_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type);
@@ -1070,7 +1023,6 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
server->vals->header_preamble_size) {
cifs_server_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
cifs_reconnect(server);
- wake_up(&server->response_q);
return -ECONNABORTED;
}
@@ -1118,7 +1070,6 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
if (server->ops->is_session_expired &&
server->ops->is_session_expired(buf)) {
cifs_reconnect(server);
- wake_up(&server->response_q);
return -1;
}
@@ -1164,8 +1115,9 @@ cifs_demultiplex_thread(void *p)
struct task_struct *task_to_wake = NULL;
struct mid_q_entry *mids[MAX_COMPOUND];
char *bufs[MAX_COMPOUND];
+ unsigned int noreclaim_flag;
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current));
length = atomic_inc_return(&tcpSesAllocCount);
@@ -1212,7 +1164,6 @@ next_pdu:
cifs_server_dbg(VFS, "SMB response too short (%u bytes)\n",
server->pdu_size);
cifs_reconnect(server);
- wake_up(&server->response_q);
continue;
}
@@ -1320,6 +1271,7 @@ next_pdu:
set_current_state(TASK_RUNNING);
}
+ memalloc_noreclaim_restore(noreclaim_flag);
module_put_and_exit(0);
}
@@ -1522,6 +1474,9 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3)
cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n");
return 1;
}
+ cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 "
+ "is not recommended unless required for "
+ "access to very old servers\n");
vol->ops = &smb1_operations;
vol->vals = &smb1_values;
break;
@@ -2517,11 +2472,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
pr_notice("CIFS: ignoring forcegid mount option specified with no gid= option.\n");
if (got_version == false)
- pr_warn("No dialect specified on mount. Default has changed to "
- "a more secure dialect, SMB2.1 or later (e.g. SMB3), from CIFS "
- "(SMB1). To use the less secure SMB1 dialect to access "
- "old servers which do not support SMB3 (or SMB2.1) specify vers=1.0"
- " on mount.\n");
+ pr_warn_once("No dialect specified on mount. Default has changed"
+ " to a more secure dialect, SMB2.1 or later (e.g. "
+ "SMB3.1.1), from CIFS (SMB1). To use the less secure "
+ "SMB1 dialect to access old servers which do not "
+ "support SMB3.1.1 (or even SMB3 or SMB2.1) specify "
+ "vers=1.0 on mount.\n");
kfree(mountdata_copy);
return 0;
@@ -4999,6 +4955,15 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
* dentry revalidation to think the dentry are stale (ESTALE).
*/
cifs_autodisable_serverino(cifs_sb);
+ /*
+ * Force the use of prefix path to support failover on DFS paths that
+ * resolve to targets that have different prefix paths.
+ */
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
+ kfree(cifs_sb->prepath);
+ cifs_sb->prepath = vol->prepath;
+ vol->prepath = NULL;
+
out:
free_xid(xid);
cifs_try_adding_channels(ses);
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 43c1b43a07ec..a67f88bf7ae1 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -1260,6 +1260,44 @@ void dfs_cache_del_vol(const char *fullpath)
kref_put(&vi->refcnt, vol_release);
}
+/**
+ * dfs_cache_get_tgt_share - parse a DFS target
+ *
+ * @it: DFS target iterator.
+ * @share: tree name.
+ * @share_len: length of tree name.
+ * @prefix: prefix path.
+ * @prefix_len: length of prefix path.
+ *
+ * Return zero if target was parsed correctly, otherwise non-zero.
+ */
+int dfs_cache_get_tgt_share(const struct dfs_cache_tgt_iterator *it,
+ const char **share, size_t *share_len,
+ const char **prefix, size_t *prefix_len)
+{
+ char *s, sep;
+
+ if (!it || !share || !share_len || !prefix || !prefix_len)
+ return -EINVAL;
+
+ sep = it->it_name[0];
+ if (sep != '\\' && sep != '/')
+ return -EINVAL;
+
+ s = strchr(it->it_name + 1, sep);
+ if (!s)
+ return -EINVAL;
+
+ s = strchrnul(s + 1, sep);
+
+ *share = it->it_name;
+ *share_len = s - it->it_name;
+ *prefix = *s ? s + 1 : s;
+ *prefix_len = &it->it_name[strlen(it->it_name)] - *prefix;
+
+ return 0;
+}
+
/* Get all tcons that are within a DFS namespace and can be refreshed */
static void get_tcons(struct TCP_Server_Info *server, struct list_head *head)
{
diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h
index 99ee44f8ad07..bf94d08cfb5a 100644
--- a/fs/cifs/dfs_cache.h
+++ b/fs/cifs/dfs_cache.h
@@ -49,6 +49,10 @@ extern int dfs_cache_update_vol(const char *fullpath,
struct TCP_Server_Info *server);
extern void dfs_cache_del_vol(const char *fullpath);
+extern int dfs_cache_get_tgt_share(const struct dfs_cache_tgt_iterator *it,
+ const char **share, size_t *share_len,
+ const char **prefix, size_t *prefix_len);
+
static inline struct dfs_cache_tgt_iterator *
dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl,
struct dfs_cache_tgt_iterator *it)
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8f9d849a0012..0b1528edebcf 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3841,7 +3841,7 @@ again:
if (rc == -ENODATA)
rc = 0;
- ctx->rc = (rc == 0) ? ctx->total_len : rc;
+ ctx->rc = (rc == 0) ? (ssize_t)ctx->total_len : rc;
mutex_unlock(&ctx->aio_mutex);
@@ -4808,6 +4808,60 @@ cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter)
return -EINVAL;
}
+static int cifs_swap_activate(struct swap_info_struct *sis,
+ struct file *swap_file, sector_t *span)
+{
+ struct cifsFileInfo *cfile = swap_file->private_data;
+ struct inode *inode = swap_file->f_mapping->host;
+ unsigned long blocks;
+ long long isize;
+
+ cifs_dbg(FYI, "swap activate\n");
+
+ spin_lock(&inode->i_lock);
+ blocks = inode->i_blocks;
+ isize = inode->i_size;
+ spin_unlock(&inode->i_lock);
+ if (blocks*512 < isize) {
+ pr_warn("swap activate: swapfile has holes\n");
+ return -EINVAL;
+ }
+ *span = sis->pages;
+
+ printk_once(KERN_WARNING "Swap support over SMB3 is experimental\n");
+
+ /*
+ * TODO: consider adding ACL (or documenting how) to prevent other
+ * users (on this or other systems) from reading it
+ */
+
+
+ /* TODO: add sk_set_memalloc(inet) or similar */
+
+ if (cfile)
+ cfile->swapfile = true;
+ /*
+ * TODO: Since file already open, we can't open with DENY_ALL here
+ * but we could add call to grab a byte range lock to prevent others
+ * from reading or writing the file
+ */
+
+ return 0;
+}
+
+static void cifs_swap_deactivate(struct file *file)
+{
+ struct cifsFileInfo *cfile = file->private_data;
+
+ cifs_dbg(FYI, "swap deactivate\n");
+
+ /* TODO: undo sk_set_memalloc(inet) will eventually be needed */
+
+ if (cfile)
+ cfile->swapfile = false;
+
+ /* do we need to unpin (or unlock) the file */
+}
const struct address_space_operations cifs_addr_ops = {
.readpage = cifs_readpage,
@@ -4821,6 +4875,13 @@ const struct address_space_operations cifs_addr_ops = {
.direct_IO = cifs_direct_io,
.invalidatepage = cifs_invalidate_page,
.launder_page = cifs_launder_page,
+ /*
+ * TODO: investigate and if useful we could add an cifs_migratePage
+ * helper (under an CONFIG_MIGRATION) in the future, and also
+ * investigate and add an is_dirty_writeback helper if needed
+ */
+ .swap_activate = cifs_swap_activate,
+ .swap_deactivate = cifs_swap_deactivate,
};
/*
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b16f8d23e97b..8fbbdcdad8ff 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1835,6 +1835,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
CIFSSMBClose(xid, tcon, fid.netfid);
}
do_rename_exit:
+ if (rc == 0)
+ d_move(from_dentry, to_dentry);
cifs_put_tlink(tlink);
return rc;
}
@@ -2024,6 +2026,10 @@ cifs_revalidate_mapping(struct inode *inode)
int rc;
unsigned long *flags = &CIFS_I(inode)->flags;
+ /* swapfiles are not supposed to be shared */
+ if (IS_SWAPFILE(inode))
+ return 0;
+
rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable,
TASK_KILLABLE);
if (rc)
@@ -2148,8 +2154,9 @@ int cifs_getattr(const struct path *path, struct kstat *stat,
* We need to be sure that all dirty pages are written and the server
* has actual ctime, mtime and file length.
*/
- if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping &&
- inode->i_mapping->nrpages != 0) {
+ if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_SIZE)) &&
+ !CIFS_CACHE_READ(CIFS_I(inode)) &&
+ inode->i_mapping && inode->i_mapping->nrpages != 0) {
rc = filemap_fdatawait(inode->i_mapping);
if (rc) {
mapping_set_error(inode->i_mapping, rc);
@@ -2157,9 +2164,20 @@ int cifs_getattr(const struct path *path, struct kstat *stat,
}
}
- rc = cifs_revalidate_dentry_attr(dentry);
- if (rc)
- return rc;
+ if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_FORCE_SYNC)
+ CIFS_I(inode)->time = 0; /* force revalidate */
+
+ /*
+ * If the caller doesn't require syncing, only sync if
+ * necessary (e.g. due to earlier truncate or setattr
+ * invalidating the cached metadata)
+ */
+ if (((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) ||
+ (CIFS_I(inode)->time == 0)) {
+ rc = cifs_revalidate_dentry_attr(dentry);
+ if (rc)
+ return rc;
+ }
generic_fillattr(inode, stat);
stat->blksize = cifs_sb->bsize;
@@ -2516,25 +2534,26 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
/*
* Attempt to flush data before changing attributes. We need to do
- * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the
- * ownership or mode then we may also need to do this. Here, we take
- * the safe way out and just do the flush on all setattr requests. If
- * the flush returns error, store it to report later and continue.
+ * this for ATTR_SIZE and ATTR_MTIME. If the flush of the data
+ * returns error, store it to report later and continue.
*
* BB: This should be smarter. Why bother flushing pages that
* will be truncated anyway? Also, should we error out here if
- * the flush returns error?
+ * the flush returns error? Do we need to check for ATTR_MTIME_SET flag?
*/
- rc = filemap_write_and_wait(inode->i_mapping);
- if (is_interrupt_error(rc)) {
- rc = -ERESTARTSYS;
- goto cifs_setattr_exit;
+ if (attrs->ia_valid & (ATTR_MTIME | ATTR_SIZE | ATTR_CTIME)) {
+ rc = filemap_write_and_wait(inode->i_mapping);
+ if (is_interrupt_error(rc)) {
+ rc = -ERESTARTSYS;
+ goto cifs_setattr_exit;
+ }
+ mapping_set_error(inode->i_mapping, rc);
}
- mapping_set_error(inode->i_mapping, rc);
rc = 0;
- if (attrs->ia_valid & ATTR_MTIME) {
+ if ((attrs->ia_valid & ATTR_MTIME) &&
+ !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
rc = cifs_get_writable_file(cifsInode, FIND_WR_ANY, &wfile);
if (!rc) {
tcon = tlink_tcon(wfile->tlink);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 852aa00ec729..a25ef35b023e 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -416,7 +416,7 @@ smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
}
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, pfile_info, NULL,
- NULL);
+ NULL, NULL);
if (rc)
goto qmf_out_open_fail;
@@ -470,7 +470,7 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
oparms.reconnect = false;
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
- NULL);
+ NULL, NULL);
if (rc) {
kfree(utf16_path);
return rc;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 40ca394fd5de..a456febd4109 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -31,6 +31,7 @@
#include "nterr.h"
#include "cifs_unicode.h"
#include "smb2pdu.h"
+#include "cifsfs.h"
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
@@ -1022,3 +1023,82 @@ int copy_path_name(char *dst, const char *src)
name_len++;
return name_len;
}
+
+struct super_cb_data {
+ struct TCP_Server_Info *server;
+ struct super_block *sb;
+};
+
+static void super_cb(struct super_block *sb, void *arg)
+{
+ struct super_cb_data *d = arg;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
+
+ if (d->sb)
+ return;
+
+ cifs_sb = CIFS_SB(sb);
+ tcon = cifs_sb_master_tcon(cifs_sb);
+ if (tcon->ses->server == d->server)
+ d->sb = sb;
+}
+
+struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server)
+{
+ struct super_cb_data d = {
+ .server = server,
+ .sb = NULL,
+ };
+
+ iterate_supers_type(&cifs_fs_type, super_cb, &d);
+
+ if (unlikely(!d.sb))
+ return ERR_PTR(-ENOENT);
+ /*
+ * Grab an active reference in order to prevent automounts (DFS links)
+ * of expiring and then freeing up our cifs superblock pointer while
+ * we're doing failover.
+ */
+ cifs_sb_active(d.sb);
+ return d.sb;
+}
+
+void cifs_put_tcp_super(struct super_block *sb)
+{
+ if (!IS_ERR_OR_NULL(sb))
+ cifs_sb_deactive(sb);
+}
+
+int update_super_prepath(struct cifs_tcon *tcon, const char *prefix,
+ size_t prefix_len)
+{
+ struct super_block *sb;
+ struct cifs_sb_info *cifs_sb;
+ int rc = 0;
+
+ sb = cifs_get_tcp_super(tcon->ses->server);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ cifs_sb = CIFS_SB(sb);
+
+ kfree(cifs_sb->prepath);
+
+ if (*prefix && prefix_len) {
+ cifs_sb->prepath = kstrndup(prefix, prefix_len, GFP_ATOMIC);
+ if (!cifs_sb->prepath) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb));
+ } else
+ cifs_sb->prepath = NULL;
+
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
+
+out:
+ cifs_put_tcp_super(sb);
+ return rc;
+}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index ba9dadf3be24..50f776a8d4ba 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -32,6 +32,7 @@
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
#include "cifsfs.h"
+#include "smb2proto.h"
/*
* To be safe - for UCS to UTF-8 with strings loaded with the rare long
@@ -217,6 +218,60 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
}
}
+/* Fill a cifs_fattr struct with info from SMB_FIND_FILE_POSIX_INFO. */
+static void
+cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info,
+ struct cifs_sb_info *cifs_sb)
+{
+ struct smb2_posix_info_parsed parsed;
+
+ posix_info_parse(info, NULL, &parsed);
+
+ memset(fattr, 0, sizeof(*fattr));
+ fattr->cf_uniqueid = le64_to_cpu(info->Inode);
+ fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+ fattr->cf_eof = le64_to_cpu(info->EndOfFile);
+
+ fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
+ fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
+ fattr->cf_ctime = cifs_NTtimeToUnix(info->CreationTime);
+
+ fattr->cf_nlink = le32_to_cpu(info->HardLinks);
+ fattr->cf_cifsattrs = le32_to_cpu(info->DosAttributes);
+
+ /*
+ * Since we set the inode type below we need to mask off
+ * to avoid strange results if bits set above.
+ * XXX: why not make server&client use the type bits?
+ */
+ fattr->cf_mode = le32_to_cpu(info->Mode) & ~S_IFMT;
+
+ cifs_dbg(FYI, "posix fattr: dev %d, reparse %d, mode %o",
+ le32_to_cpu(info->DeviceId),
+ le32_to_cpu(info->ReparseTag),
+ le32_to_cpu(info->Mode));
+
+ if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
+ fattr->cf_mode |= S_IFDIR;
+ fattr->cf_dtype = DT_DIR;
+ } else {
+ /*
+ * mark anything that is not a dir as regular
+ * file. special files should have the REPARSE
+ * attribute and will be marked as needing revaluation
+ */
+ fattr->cf_mode |= S_IFREG;
+ fattr->cf_dtype = DT_REG;
+ }
+
+ if (reparse_file_needs_reval(fattr))
+ fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
+
+ /* TODO map SIDs */
+ fattr->cf_uid = cifs_sb->mnt_uid;
+ fattr->cf_gid = cifs_sb->mnt_gid;
+}
+
static void __dir_info_to_fattr(struct cifs_fattr *fattr, const void *info)
{
const FILE_DIRECTORY_INFO *fi = info;
@@ -359,6 +414,8 @@ ffirst_retry:
/* if (cap_unix(tcon->ses) { */
if (tcon->unix_ext)
cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX;
+ else if (tcon->posix_extensions)
+ cifsFile->srch_inf.info_level = SMB_FIND_FILE_POSIX_INFO;
else if ((tcon->ses->capabilities &
tcon->ses->server->vals->cap_nt_find) == 0) {
cifsFile->srch_inf.info_level = SMB_FIND_FILE_INFO_STANDARD;
@@ -451,6 +508,23 @@ struct cifs_dirent {
u64 ino;
};
+static void cifs_fill_dirent_posix(struct cifs_dirent *de,
+ const struct smb2_posix_info *info)
+{
+ struct smb2_posix_info_parsed parsed;
+
+ /* payload should have already been checked at this point */
+ if (posix_info_parse(info, NULL, &parsed) < 0) {
+ cifs_dbg(VFS, "invalid POSIX info payload");
+ return;
+ }
+
+ de->name = parsed.name;
+ de->namelen = parsed.name_len;
+ de->resume_key = info->Ignored;
+ de->ino = le64_to_cpu(info->Inode);
+}
+
static void cifs_fill_dirent_unix(struct cifs_dirent *de,
const FILE_UNIX_INFO *info, bool is_unicode)
{
@@ -511,6 +585,9 @@ static int cifs_fill_dirent(struct cifs_dirent *de, const void *info,
memset(de, 0, sizeof(*de));
switch (level) {
+ case SMB_FIND_FILE_POSIX_INFO:
+ cifs_fill_dirent_posix(de, info);
+ break;
case SMB_FIND_FILE_UNIX:
cifs_fill_dirent_unix(de, info, is_unicode);
break;
@@ -786,6 +863,11 @@ static int cifs_filldir(char *find_entry, struct file *file,
}
switch (file_info->srch_inf.info_level) {
+ case SMB_FIND_FILE_POSIX_INFO:
+ cifs_posix_to_fattr(&fattr,
+ (struct smb2_posix_info *)find_entry,
+ cifs_sb);
+ break;
case SMB_FIND_FILE_UNIX:
cifs_unix_basic_to_fattr(&fattr,
&((FILE_UNIX_INFO *)find_entry)->basic,
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index afe1f03aabe3..2fa3ba354cc9 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -62,7 +62,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL,
- NULL);
+ NULL, NULL);
if (rc)
goto out;
@@ -152,7 +152,12 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
(li->offset + li->length))
continue;
if (current->tgid != li->pid)
- continue;
+ /*
+ * flock and OFD lock are associated with an open
+ * file description, not the process.
+ */
+ if (!(flock->fl_flags & (FL_FLOCK | FL_OFDLCK)))
+ continue;
if (cinode->can_cache_brlcks) {
/*
* We can cache brlock requests - simply remove a lock
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 0511aaf451d4..497afb0b9960 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -766,6 +766,20 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
spin_lock(&cifs_tcp_ses_lock);
+ if (tcon->tc_count <= 0) {
+ struct TCP_Server_Info *server = NULL;
+
+ WARN_ONCE(tcon->tc_count < 0, "tcon refcount is negative");
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ if (tcon->ses)
+ server = tcon->ses->server;
+
+ cifs_server_dbg(FYI, "tid=%u: tcon is closing, skipping async close retry of fid %llu %llu\n",
+ tcon->tid, persistent_fid, volatile_fid);
+
+ return 0;
+ }
tcon->tc_count++;
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index cfe9b800ea8c..b36c46f48705 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -328,16 +328,6 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
/* start with specified wsize, or default */
wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->rdma) {
- if (server->sign)
- wsize = min_t(unsigned int,
- wsize, server->smbd_conn->max_fragmented_send_size);
- else
- wsize = min_t(unsigned int,
- wsize, server->smbd_conn->max_readwrite_size);
- }
-#endif
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
@@ -356,8 +346,15 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
if (server->sign)
+ /*
+ * Account for SMB2 data transfer packet header and
+ * possible encryption header
+ */
wsize = min_t(unsigned int,
- wsize, server->smbd_conn->max_fragmented_send_size);
+ wsize,
+ server->smbd_conn->max_fragmented_send_size -
+ SMB2_READWRITE_PDU_HEADER_SIZE -
+ sizeof(struct smb2_transform_hdr));
else
wsize = min_t(unsigned int,
wsize, server->smbd_conn->max_readwrite_size);
@@ -378,16 +375,6 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
/* start with specified rsize, or default */
rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->rdma) {
- if (server->sign)
- rsize = min_t(unsigned int,
- rsize, server->smbd_conn->max_fragmented_recv_size);
- else
- rsize = min_t(unsigned int,
- rsize, server->smbd_conn->max_readwrite_size);
- }
-#endif
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
@@ -407,8 +394,15 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
if (server->sign)
+ /*
+ * Account for SMB2 data transfer packet header and
+ * possible encryption header
+ */
rsize = min_t(unsigned int,
- rsize, server->smbd_conn->max_fragmented_recv_size);
+ rsize,
+ server->smbd_conn->max_fragmented_recv_size -
+ SMB2_READWRITE_PDU_HEADER_SIZE -
+ sizeof(struct smb2_transform_hdr));
else
rsize = min_t(unsigned int,
rsize, server->smbd_conn->max_readwrite_size);
@@ -794,7 +788,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
tcon->crfid.has_lease = true;
smb2_parse_contexts(server, o_rsp,
&oparms.fid->epoch,
- oparms.fid->lease_key, &oplock, NULL);
+ oparms.fid->lease_key, &oplock,
+ NULL, NULL);
} else
goto oshr_exit;
@@ -838,7 +833,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
if (no_cached_open)
rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL,
- NULL);
+ NULL, NULL);
else
rc = open_shroot(xid, tcon, cifs_sb, &fid);
@@ -878,7 +873,8 @@ smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL);
+ rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL,
+ NULL, NULL);
if (rc)
return;
@@ -913,7 +909,8 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL);
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL,
+ NULL);
if (rc) {
kfree(utf16_path);
return rc;
@@ -2122,7 +2119,8 @@ smb3_notify(const unsigned int xid, struct file *pfile,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL);
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL,
+ NULL);
if (rc)
goto notify_exit;
@@ -2543,7 +2541,8 @@ smb311_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL);
+ rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL,
+ NULL, NULL);
if (rc)
return rc;
@@ -3028,7 +3027,8 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL);
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL,
+ NULL);
kfree(utf16_path);
if (!rc) {
rc = SMB2_query_acl(xid, tlink_tcon(tlink), fid.persistent_fid,
@@ -3086,7 +3086,8 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL);
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
+ NULL, NULL);
kfree(utf16_path);
if (!rc) {
rc = SMB2_set_acl(xid, tlink_tcon(tlink), fid.persistent_fid,
@@ -3248,6 +3249,10 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
* Extending the file
*/
if ((keep_size == false) && i_size_read(inode) < off + len) {
+ rc = inode_newsize_ok(inode, off + len);
+ if (rc)
+ goto out;
+
if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0)
smb2_set_sparse(xid, tcon, cfile, inode, false);
@@ -4151,7 +4156,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
if (server->ops->is_session_expired &&
server->ops->is_session_expired(buf)) {
cifs_reconnect(server);
- wake_up(&server->response_q);
return -1;
}
@@ -4515,14 +4519,12 @@ smb3_receive_transform(struct TCP_Server_Info *server,
cifs_server_dbg(VFS, "Transform message is too small (%u)\n",
pdu_length);
cifs_reconnect(server);
- wake_up(&server->response_q);
return -ECONNABORTED;
}
if (pdu_length < orig_len + sizeof(struct smb2_transform_hdr)) {
cifs_server_dbg(VFS, "Transform message is broken\n");
cifs_reconnect(server);
- wake_up(&server->response_q);
return -ECONNABORTED;
}
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 28c0be5e69b7..47d3e382ecaa 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -193,9 +193,18 @@ static int __smb2_reconnect(const struct nls_table *nlsc,
for (it = dfs_cache_get_tgt_iterator(&tl); it;
it = dfs_cache_get_next_tgt(&tl, it)) {
- const char *tgt = dfs_cache_get_tgt_name(it);
+ const char *share, *prefix;
+ size_t share_len, prefix_len;
- extract_unc_hostname(tgt, &dfs_host, &dfs_host_len);
+ rc = dfs_cache_get_tgt_share(it, &share, &share_len, &prefix,
+ &prefix_len);
+ if (rc) {
+ cifs_dbg(VFS, "%s: failed to parse target share %d\n",
+ __func__, rc);
+ continue;
+ }
+
+ extract_unc_hostname(share, &dfs_host, &dfs_host_len);
if (dfs_host_len != tcp_host_len
|| strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) {
@@ -206,11 +215,13 @@ static int __smb2_reconnect(const struct nls_table *nlsc,
continue;
}
- scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
+ scnprintf(tree, MAX_TREE_SIZE, "\\%.*s", (int)share_len, share);
rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
- if (!rc)
+ if (!rc) {
+ rc = update_super_prepath(tcon, prefix, prefix_len);
break;
+ }
if (rc == -EREMOTE)
break;
}
@@ -378,7 +389,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
}
if (smb2_command != SMB2_INTERNAL_CMD)
- queue_delayed_work(cifsiod_wq, &server->reconnect, 0);
+ mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
atomic_inc(&tconInfoReconnectCount);
out:
@@ -1940,20 +1951,46 @@ parse_query_id_ctxt(struct create_context *cc, struct smb2_file_all_info *buf)
}
static void
-parse_posix_ctxt(struct create_context *cc, struct smb_posix_info *pposix_inf)
+parse_posix_ctxt(struct create_context *cc, struct smb2_file_all_info *info,
+ struct create_posix_rsp *posix)
{
- /* struct smb_posix_info *ppinf = (struct smb_posix_info *)cc; */
+ int sid_len;
+ u8 *beg = (u8 *)cc + le16_to_cpu(cc->DataOffset);
+ u8 *end = beg + le32_to_cpu(cc->DataLength);
+ u8 *sid;
+
+ memset(posix, 0, sizeof(*posix));
+
+ posix->nlink = le32_to_cpu(*(__le32 *)(beg + 0));
+ posix->reparse_tag = le32_to_cpu(*(__le32 *)(beg + 4));
+ posix->mode = le32_to_cpu(*(__le32 *)(beg + 8));
- /* TODO: Need to add parsing for the context and return */
- printk_once(KERN_WARNING
- "SMB3 3.11 POSIX response context not completed yet\n");
+ sid = beg + 12;
+ sid_len = posix_info_sid_size(sid, end);
+ if (sid_len < 0) {
+ cifs_dbg(VFS, "bad owner sid in posix create response\n");
+ return;
+ }
+ memcpy(&posix->owner, sid, sid_len);
+
+ sid = sid + sid_len;
+ sid_len = posix_info_sid_size(sid, end);
+ if (sid_len < 0) {
+ cifs_dbg(VFS, "bad group sid in posix create response\n");
+ return;
+ }
+ memcpy(&posix->group, sid, sid_len);
+
+ cifs_dbg(FYI, "nlink=%d mode=%o reparse_tag=%x\n",
+ posix->nlink, posix->mode, posix->reparse_tag);
}
void
smb2_parse_contexts(struct TCP_Server_Info *server,
- struct smb2_create_rsp *rsp,
- unsigned int *epoch, char *lease_key, __u8 *oplock,
- struct smb2_file_all_info *buf)
+ struct smb2_create_rsp *rsp,
+ unsigned int *epoch, char *lease_key, __u8 *oplock,
+ struct smb2_file_all_info *buf,
+ struct create_posix_rsp *posix)
{
char *data_offset;
struct create_context *cc;
@@ -1983,8 +2020,9 @@ smb2_parse_contexts(struct TCP_Server_Info *server,
strncmp(name, SMB2_CREATE_QUERY_ON_DISK_ID, 4) == 0)
parse_query_id_ctxt(cc, buf);
else if ((le16_to_cpu(cc->NameLength) == 16)) {
- if (memcmp(name, smb3_create_tag_posix, 16) == 0)
- parse_posix_ctxt(cc, NULL);
+ if (posix &&
+ memcmp(name, smb3_create_tag_posix, 16) == 0)
+ parse_posix_ctxt(cc, buf, posix);
}
/* else {
cifs_dbg(FYI, "Context not matched with len %d\n",
@@ -2709,6 +2747,7 @@ SMB2_open_free(struct smb_rqst *rqst)
int
SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
__u8 *oplock, struct smb2_file_all_info *buf,
+ struct create_posix_rsp *posix,
struct kvec *err_iov, int *buftype)
{
struct smb_rqst rqst;
@@ -2787,7 +2826,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
smb2_parse_contexts(server, rsp, &oparms->fid->epoch,
- oparms->fid->lease_key, oplock, buf);
+ oparms->fid->lease_key, oplock, buf, posix);
creat_exit:
SMB2_open_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
@@ -3559,7 +3598,7 @@ SMB2_echo(struct TCP_Server_Info *server)
if (server->tcpStatus == CifsNeedNegotiate) {
/* No need to send echo on newly established connections */
- queue_delayed_work(cifsiod_wq, &server->reconnect, 0);
+ mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
return rc;
}
@@ -4286,8 +4325,104 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
return rc;
}
+int posix_info_sid_size(const void *beg, const void *end)
+{
+ size_t subauth;
+ int total;
+
+ if (beg + 1 > end)
+ return -1;
+
+ subauth = *(u8 *)(beg+1);
+ if (subauth < 1 || subauth > 15)
+ return -1;
+
+ total = 1 + 1 + 6 + 4*subauth;
+ if (beg + total > end)
+ return -1;
+
+ return total;
+}
+
+int posix_info_parse(const void *beg, const void *end,
+ struct smb2_posix_info_parsed *out)
+
+{
+ int total_len = 0;
+ int sid_len;
+ int name_len;
+ const void *owner_sid;
+ const void *group_sid;
+ const void *name;
+
+ /* if no end bound given, assume payload to be correct */
+ if (!end) {
+ const struct smb2_posix_info *p = beg;
+
+ end = beg + le32_to_cpu(p->NextEntryOffset);
+ /* last element will have a 0 offset, pick a sensible bound */
+ if (end == beg)
+ end += 0xFFFF;
+ }
+
+ /* check base buf */
+ if (beg + sizeof(struct smb2_posix_info) > end)
+ return -1;
+ total_len = sizeof(struct smb2_posix_info);
+
+ /* check owner sid */
+ owner_sid = beg + total_len;
+ sid_len = posix_info_sid_size(owner_sid, end);
+ if (sid_len < 0)
+ return -1;
+ total_len += sid_len;
+
+ /* check group sid */
+ group_sid = beg + total_len;
+ sid_len = posix_info_sid_size(group_sid, end);
+ if (sid_len < 0)
+ return -1;
+ total_len += sid_len;
+
+ /* check name len */
+ if (beg + total_len + 4 > end)
+ return -1;
+ name_len = le32_to_cpu(*(__le32 *)(beg + total_len));
+ if (name_len < 1 || name_len > 0xFFFF)
+ return -1;
+ total_len += 4;
+
+ /* check name */
+ name = beg + total_len;
+ if (name + name_len > end)
+ return -1;
+ total_len += name_len;
+
+ if (out) {
+ out->base = beg;
+ out->size = total_len;
+ out->name_len = name_len;
+ out->name = name;
+ memcpy(&out->owner, owner_sid,
+ posix_info_sid_size(owner_sid, end));
+ memcpy(&out->group, group_sid,
+ posix_info_sid_size(group_sid, end));
+ }
+ return total_len;
+}
+
+static int posix_info_extra_size(const void *beg, const void *end)
+{
+ int len = posix_info_parse(beg, end, NULL);
+
+ if (len < 0)
+ return -1;
+ return len - sizeof(struct smb2_posix_info);
+}
+
static unsigned int
-num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size)
+num_entries(int infotype, char *bufstart, char *end_of_buf, char **lastentry,
+ size_t size)
{
int len;
unsigned int entrycount = 0;
@@ -4311,8 +4446,13 @@ num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size)
entryptr = entryptr + next_offset;
dir_info = (FILE_DIRECTORY_INFO *)entryptr;
- len = le32_to_cpu(dir_info->FileNameLength);
- if (entryptr + len < entryptr ||
+ if (infotype == SMB_FIND_FILE_POSIX_INFO)
+ len = posix_info_extra_size(entryptr, end_of_buf);
+ else
+ len = le32_to_cpu(dir_info->FileNameLength);
+
+ if (len < 0 ||
+ entryptr + len < entryptr ||
entryptr + len > end_of_buf ||
entryptr + len + size > end_of_buf) {
cifs_dbg(VFS, "directory entry name would overflow frame end of buf %p\n",
@@ -4362,6 +4502,9 @@ int SMB2_query_directory_init(const unsigned int xid,
case SMB_FIND_FILE_ID_FULL_DIR_INFO:
req->FileInformationClass = FILEID_FULL_DIRECTORY_INFORMATION;
break;
+ case SMB_FIND_FILE_POSIX_INFO:
+ req->FileInformationClass = SMB_FIND_FILE_POSIX_INFO;
+ break;
default:
cifs_tcon_dbg(VFS, "info level %u isn't supported\n",
info_level);
@@ -4427,6 +4570,10 @@ smb2_parse_query_directory(struct cifs_tcon *tcon,
case SMB_FIND_FILE_ID_FULL_DIR_INFO:
info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1;
break;
+ case SMB_FIND_FILE_POSIX_INFO:
+ /* note that posix payload are variable size */
+ info_buf_size = sizeof(struct smb2_posix_info);
+ break;
default:
cifs_tcon_dbg(VFS, "info level %u isn't supported\n",
srch_inf->info_level);
@@ -4436,8 +4583,10 @@ smb2_parse_query_directory(struct cifs_tcon *tcon,
rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength), rsp_iov,
info_buf_size);
- if (rc)
+ if (rc) {
+ cifs_tcon_dbg(VFS, "bad info payload");
return rc;
+ }
srch_inf->unicode = true;
@@ -4451,9 +4600,14 @@ smb2_parse_query_directory(struct cifs_tcon *tcon,
srch_inf->srch_entries_start = srch_inf->last_entry =
(char *)rsp + le16_to_cpu(rsp->OutputBufferOffset);
end_of_smb = rsp_iov->iov_len + (char *)rsp;
- srch_inf->entries_in_buffer =
- num_entries(srch_inf->srch_entries_start, end_of_smb,
- &srch_inf->last_entry, info_buf_size);
+
+ srch_inf->entries_in_buffer = num_entries(
+ srch_inf->info_level,
+ srch_inf->srch_entries_start,
+ end_of_smb,
+ &srch_inf->last_entry,
+ info_buf_size);
+
srch_inf->index_of_last_entry += srch_inf->entries_in_buffer;
cifs_dbg(FYI, "num entries %d last_index %lld srch start %p srch end %p\n",
srch_inf->entries_in_buffer, srch_inf->index_of_last_entry,
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index fa03df130f1a..10acf90f858d 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -91,6 +91,7 @@
#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
+#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc)
/*
* SMB2 Header Definition
@@ -119,6 +120,9 @@ struct smb2_sync_hdr {
__u8 Signature[16];
} __packed;
+/* The total header size for SMB2 read and write */
+#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_sync_hdr))
+
struct smb2_sync_pdu {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize2; /* size of wct area (varies, request specific) */
@@ -127,16 +131,33 @@ struct smb2_sync_pdu {
#define SMB3_AES128CCM_NONCE 11
#define SMB3_AES128GCM_NONCE 12
+/* Transform flags (for 3.0 dialect this flag indicates CCM */
+#define TRANSFORM_FLAG_ENCRYPTED 0x0001
struct smb2_transform_hdr {
__le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */
__u8 Signature[16];
__u8 Nonce[16];
__le32 OriginalMessageSize;
__u16 Reserved1;
- __le16 Flags; /* EncryptionAlgorithm */
+ __le16 Flags; /* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */
__u64 SessionId;
} __packed;
+/* See MS-SMB2 2.2.42.1 */
+struct compression_playload_header {
+ __le16 AlgorithmId;
+ __le16 Reserved;
+ __le32 Length;
+} __packed;
+
+/* See MS-SMB2 2.2.42.2 */
+struct compression_pattern_payload_v1 {
+ __le16 Pattern;
+ __le16 Reserved1;
+ __le16 Reserved2;
+ __le32 Repetitions;
+} __packed;
+
/*
* SMB2 flag definitions
*/
@@ -182,7 +203,7 @@ struct smb2_symlink_err_rsp {
__le16 PrintNameOffset;
__le16 PrintNameLength;
__le32 Flags;
- __u8 PathBuffer[0];
+ __u8 PathBuffer[];
} __packed;
/* SMB 3.1.1 and later dialects. See MS-SMB2 section 2.2.2.1 */
@@ -192,6 +213,10 @@ struct smb2_error_context_rsp {
__u8 ErrorContextData; /* ErrorDataLength long array */
} __packed;
+/* ErrorId values */
+#define SMB2_ERROR_ID_DEFAULT 0x00000000
+#define SMB2_ERROR_ID_SHARE_REDIRECT cpu_to_le32(0x72645253) /* "rdRS" */
+
/* Defines for Type field below (see MS-SMB2 2.2.2.2.2.1) */
#define MOVE_DST_IPADDR_V4 cpu_to_le32(0x00000001)
#define MOVE_DST_IPADDR_V6 cpu_to_le32(0x00000002)
@@ -210,7 +235,7 @@ struct share_redirect_error_context_rsp {
__le16 Flags;
__le16 TargetType;
__le32 IPAddrCount;
- struct move_dst_ipaddr IpAddrMoveList[0];
+ struct move_dst_ipaddr IpAddrMoveList[];
/* __u8 ResourceName[] */ /* Name of share as counted Unicode string */
} __packed;
@@ -307,11 +332,17 @@ struct smb2_encryption_neg_context {
#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001)
#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002)
#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003)
+/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */
+#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004)
+
+/* Compression Flags */
+#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000)
+#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED cpu_to_le32(0x00000001)
struct smb2_compression_capabilities_context {
__le16 ContextType; /* 3 */
__le16 DataLength;
- __u32 Reserved;
+ __u32 Flags;
__le16 CompressionAlgorithmCount;
__u16 Padding;
__u32 Reserved1;
@@ -326,7 +357,7 @@ struct smb2_netname_neg_context {
__le16 ContextType; /* 0x100 */
__le16 DataLength;
__le32 Reserved;
- __le16 NetName[0]; /* hostname of target converted to UCS-2 */
+ __le16 NetName[]; /* hostname of target converted to UCS-2 */
} __packed;
#define POSIX_CTXT_DATA_LEN 16
@@ -406,7 +437,7 @@ struct smb2_logoff_rsp {
struct smb2_tree_connect_req {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 9 */
- __le16 Reserved; /* Flags in SMB3.1.1 */
+ __le16 Flags; /* Reserved MBZ for dialects prior to SMB3.1.1 */
__le16 PathOffset;
__le16 PathLength;
__u8 Buffer[1]; /* variable length */
@@ -421,13 +452,13 @@ struct tree_connect_contexts {
__le16 ContextType;
__le16 DataLength;
__le32 Reserved;
- __u8 Data[0];
+ __u8 Data[];
} __packed;
/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */
struct smb3_blob_data {
__le16 BlobSize;
- __u8 BlobData[0];
+ __u8 BlobData[];
} __packed;
/* Valid values for Attr */
@@ -477,14 +508,14 @@ struct remoted_identity_tcon_context {
__le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */
__le16 UserClaims; /* offset to BLOB_DATA struct */
__le16 DeviceClaims; /* offset to BLOB_DATA struct */
- __u8 TicketInfo[0]; /* variable length buf - remoted identity data */
+ __u8 TicketInfo[]; /* variable length buf - remoted identity data */
} __packed;
struct smb2_tree_connect_req_extension {
__le32 TreeConnectContextOffset;
__le16 TreeConnectContextCount;
__u8 Reserved[10];
- __u8 PathName[0]; /* variable sized array */
+ __u8 PathName[]; /* variable sized array */
/* followed by array of TreeConnectContexts */
} __packed;
@@ -633,7 +664,7 @@ struct smb2_tree_disconnect_rsp {
| FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE)
#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE)
-/* Impersonation Levels */
+/* Impersonation Levels. See MS-WPO section 9.7 and MSDN-IMPERS */
#define IL_ANONYMOUS cpu_to_le32(0x00000000)
#define IL_IDENTIFICATION cpu_to_le32(0x00000001)
#define IL_IMPERSONATION cpu_to_le32(0x00000002)
@@ -689,7 +720,7 @@ struct smb2_create_req {
__le16 NameLength;
__le32 CreateContextsOffset;
__le32 CreateContextsLength;
- __u8 Buffer[0];
+ __u8 Buffer[];
} __packed;
/*
@@ -727,7 +758,7 @@ struct create_context {
__le16 Reserved;
__le16 DataOffset;
__le32 DataLength;
- __u8 Buffer[0];
+ __u8 Buffer[];
} __packed;
#define SMB2_LEASE_READ_CACHING_HE 0x01
@@ -739,7 +770,7 @@ struct create_context {
#define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02)
#define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04)
-#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x02)
+#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x00000002)
#define SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET cpu_to_le32(0x00000004)
#define SMB2_LEASE_KEY_SIZE 16
@@ -869,7 +900,7 @@ struct crt_sd_ctxt {
struct resume_key_req {
char ResumeKey[COPY_CHUNK_RES_KEY_SIZE];
__le32 ContextLength; /* MBZ */
- char Context[0]; /* ignored, Windows sets to 4 bytes of zero */
+ char Context[]; /* ignored, Windows sets to 4 bytes of zero */
} __packed;
/* this goes in the ioctl buffer when doing a copychunk request */
@@ -931,7 +962,7 @@ struct reparse_data_buffer {
__le32 ReparseTag;
__le16 ReparseDataLength;
__u16 Reserved;
- __u8 DataBuffer[0]; /* Variable Length */
+ __u8 DataBuffer[]; /* Variable Length */
} __packed;
struct reparse_guid_data_buffer {
@@ -939,7 +970,7 @@ struct reparse_guid_data_buffer {
__le16 ReparseDataLength;
__u16 Reserved;
__u8 ReparseGuid[16];
- __u8 DataBuffer[0]; /* Variable Length */
+ __u8 DataBuffer[]; /* Variable Length */
} __packed;
struct reparse_mount_point_data_buffer {
@@ -950,7 +981,7 @@ struct reparse_mount_point_data_buffer {
__le16 SubstituteNameLength;
__le16 PrintNameOffset;
__le16 PrintNameLength;
- __u8 PathBuffer[0]; /* Variable Length */
+ __u8 PathBuffer[]; /* Variable Length */
} __packed;
#define SYMLINK_FLAG_RELATIVE 0x00000001
@@ -964,7 +995,7 @@ struct reparse_symlink_data_buffer {
__le16 PrintNameOffset;
__le16 PrintNameLength;
__le32 Flags;
- __u8 PathBuffer[0]; /* Variable Length */
+ __u8 PathBuffer[]; /* Variable Length */
} __packed;
/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */
@@ -1066,7 +1097,7 @@ struct smb2_ioctl_req {
__le32 MaxOutputResponse;
__le32 Flags;
__u32 Reserved2;
- __u8 Buffer[0];
+ __u8 Buffer[];
} __packed;
struct smb2_ioctl_rsp {
@@ -1180,7 +1211,7 @@ struct smb2_write_req {
__le64 Offset;
__u64 PersistentFileId; /* opaque endianness */
__u64 VolatileFileId; /* opaque endianness */
- __le32 Channel; /* Reserved MBZ */
+ __le32 Channel; /* MBZ unless SMB3.02 or later */
__le32 RemainingBytes;
__le16 WriteChannelInfoOffset;
__le16 WriteChannelInfoLength;
@@ -1469,7 +1500,7 @@ struct smb3_fs_vol_info {
__le32 VolumeLabelLength; /* includes trailing null */
__u8 SupportsObjects; /* True if eg like NTFS, supports objects */
__u8 Reserved;
- __u8 VolumeLabel[0]; /* variable len */
+ __u8 VolumeLabel[]; /* variable len */
} __packed;
/* partial list of QUERY INFO levels */
@@ -1531,7 +1562,7 @@ struct smb2_file_rename_info { /* encoding of request for level 10 */
__u8 Reserved[7];
__u64 RootDirectory; /* MBZ for network operations (why says spec?) */
__le32 FileNameLength;
- char FileName[0]; /* New name to be assigned */
+ char FileName[]; /* New name to be assigned */
} __packed; /* level 10 Set */
struct smb2_file_link_info { /* encoding of request for level 11 */
@@ -1540,7 +1571,7 @@ struct smb2_file_link_info { /* encoding of request for level 11 */
__u8 Reserved[7];
__u64 RootDirectory; /* MBZ for network operations (why says spec?) */
__le32 FileNameLength;
- char FileName[0]; /* Name to be assigned to new link */
+ char FileName[]; /* Name to be assigned to new link */
} __packed; /* level 11 Set */
struct smb2_file_full_ea_info { /* encoding of response for level 15 */
@@ -1548,7 +1579,7 @@ struct smb2_file_full_ea_info { /* encoding of response for level 15 */
__u8 flags;
__u8 ea_name_length;
__le16 ea_value_length;
- char ea_data[0]; /* \0 terminated name plus value */
+ char ea_data[]; /* \0 terminated name plus value */
} __packed; /* level 15 Set */
/*
@@ -1604,11 +1635,56 @@ struct smb2_file_id_information {
extern char smb2_padding[7];
/* equivalent of the contents of SMB3.1.1 POSIX open context response */
-struct smb_posix_info {
- __le32 nlink;
- __le32 reparse_tag;
- __le32 mode;
- kuid_t uid;
- kuid_t gid;
+struct create_posix_rsp {
+ u32 nlink;
+ u32 reparse_tag;
+ u32 mode;
+ struct cifs_sid owner; /* var-sized on the wire */
+ struct cifs_sid group; /* var-sized on the wire */
+} __packed;
+
+/*
+ * SMB2-only POSIX info level
+ *
+ * See posix_info_sid_size(), posix_info_extra_size() and
+ * posix_info_parse() to help with the handling of this struct.
+ */
+struct smb2_posix_info {
+ __le32 NextEntryOffset;
+ __u32 Ignored;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 EndOfFile;
+ __le64 AllocationSize;
+ __le32 DosAttributes;
+ __le64 Inode;
+ __le32 DeviceId;
+ __le32 Zero;
+ /* beginning of POSIX Create Context Response */
+ __le32 HardLinks;
+ __le32 ReparseTag;
+ __le32 Mode;
+ /*
+ * var sized owner SID
+ * var sized group SID
+ * le32 filenamelength
+ * u8 filename[]
+ */
+} __packed;
+
+/*
+ * Parsed version of the above struct. Allows direct access to the
+ * variable length fields
+ */
+struct smb2_posix_info_parsed {
+ const struct smb2_posix_info *base;
+ size_t size;
+ struct cifs_sid owner;
+ struct cifs_sid group;
+ int name_len;
+ const u8 *name;
};
+
#endif /* _SMB2PDU_H */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index de6388ef344f..087d5f14320b 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -55,9 +55,11 @@ extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server,
extern struct cifs_tcon *smb2_find_smb_tcon(struct TCP_Server_Info *server,
__u64 ses_id, __u32 tid);
extern int smb2_calc_signature(struct smb_rqst *rqst,
- struct TCP_Server_Info *server);
+ struct TCP_Server_Info *server,
+ bool allocate_crypto);
extern int smb3_calc_signature(struct smb_rqst *rqst,
- struct TCP_Server_Info *server);
+ struct TCP_Server_Info *server,
+ bool allocate_crypto);
extern void smb2_echo_request(struct work_struct *work);
extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
extern bool smb2_is_valid_oplock_break(char *buffer,
@@ -139,6 +141,7 @@ extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon);
extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms,
__le16 *path, __u8 *oplock,
struct smb2_file_all_info *buf,
+ struct create_posix_rsp *posix,
struct kvec *err_iov, int *resp_buftype);
extern int SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
__u8 *oplock, struct cifs_open_parms *oparms,
@@ -252,7 +255,8 @@ extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *,
extern void smb2_parse_contexts(struct TCP_Server_Info *server,
struct smb2_create_rsp *rsp,
unsigned int *epoch, char *lease_key,
- __u8 *oplock, struct smb2_file_all_info *buf);
+ __u8 *oplock, struct smb2_file_all_info *buf,
+ struct create_posix_rsp *posix);
extern int smb3_encryption_required(const struct cifs_tcon *tcon);
extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length,
struct kvec *iov, unsigned int min_buf_size);
@@ -272,4 +276,7 @@ extern int smb2_query_info_compound(const unsigned int xid,
u32 class, u32 type, u32 output_len,
struct kvec *rsp, int *buftype,
struct cifs_sb_info *cifs_sb);
+int posix_info_parse(const void *beg, const void *end,
+ struct smb2_posix_info_parsed *out);
+int posix_info_sid_size(const void *beg, const void *end);
#endif /* _SMB2PROTO_H */
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 08b703b7a15e..1a6c227ada8f 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -41,14 +41,6 @@
#include "smb2glob.h"
static int
-smb2_crypto_shash_allocate(struct TCP_Server_Info *server)
-{
- return cifs_alloc_hash("hmac(sha256)",
- &server->secmech.hmacsha256,
- &server->secmech.sdeschmacsha256);
-}
-
-static int
smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
{
struct cifs_secmech *p = &server->secmech;
@@ -219,7 +211,8 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid)
}
int
-smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
+ bool allocate_crypto)
{
int rc;
unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
@@ -228,6 +221,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
struct cifs_ses *ses;
struct shash_desc *shash;
+ struct crypto_shash *hash;
+ struct sdesc *sdesc = NULL;
struct smb_rqst drqst;
ses = smb2_find_smb_ses(server, shdr->SessionId);
@@ -239,24 +234,32 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
- rc = smb2_crypto_shash_allocate(server);
- if (rc) {
- cifs_server_dbg(VFS, "%s: sha256 alloc failed\n", __func__);
- return rc;
+ if (allocate_crypto) {
+ rc = cifs_alloc_hash("hmac(sha256)", &hash, &sdesc);
+ if (rc) {
+ cifs_server_dbg(VFS,
+ "%s: sha256 alloc failed\n", __func__);
+ return rc;
+ }
+ shash = &sdesc->shash;
+ } else {
+ hash = server->secmech.hmacsha256;
+ shash = &server->secmech.sdeschmacsha256->shash;
}
- rc = crypto_shash_setkey(server->secmech.hmacsha256,
- ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
+ rc = crypto_shash_setkey(hash, ses->auth_key.response,
+ SMB2_NTLMV2_SESSKEY_SIZE);
if (rc) {
- cifs_server_dbg(VFS, "%s: Could not update with response\n", __func__);
- return rc;
+ cifs_server_dbg(VFS,
+ "%s: Could not update with response\n",
+ __func__);
+ goto out;
}
- shash = &server->secmech.sdeschmacsha256->shash;
rc = crypto_shash_init(shash);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not init sha256", __func__);
- return rc;
+ goto out;
}
/*
@@ -271,9 +274,10 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
rc = crypto_shash_update(shash, iov[0].iov_base,
iov[0].iov_len);
if (rc) {
- cifs_server_dbg(VFS, "%s: Could not update with payload\n",
- __func__);
- return rc;
+ cifs_server_dbg(VFS,
+ "%s: Could not update with payload\n",
+ __func__);
+ goto out;
}
drqst.rq_iov++;
drqst.rq_nvec--;
@@ -283,6 +287,9 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (!rc)
memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+out:
+ if (allocate_crypto)
+ cifs_free_hash(&hash, &sdesc);
return rc;
}
@@ -504,14 +511,17 @@ generate_smb311signingkey(struct cifs_ses *ses)
}
int
-smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
+ bool allocate_crypto)
{
int rc;
unsigned char smb3_signature[SMB2_CMACAES_SIZE];
unsigned char *sigptr = smb3_signature;
struct kvec *iov = rqst->rq_iov;
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
- struct shash_desc *shash = &server->secmech.sdesccmacaes->shash;
+ struct shash_desc *shash;
+ struct crypto_shash *hash;
+ struct sdesc *sdesc = NULL;
struct smb_rqst drqst;
u8 key[SMB3_SIGN_KEY_SIZE];
@@ -519,14 +529,24 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (rc)
return 0;
+ if (allocate_crypto) {
+ rc = cifs_alloc_hash("cmac(aes)", &hash, &sdesc);
+ if (rc)
+ return rc;
+
+ shash = &sdesc->shash;
+ } else {
+ hash = server->secmech.cmacaes;
+ shash = &server->secmech.sdesccmacaes->shash;
+ }
+
memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
- rc = crypto_shash_setkey(server->secmech.cmacaes,
- key, SMB2_CMACAES_SIZE);
+ rc = crypto_shash_setkey(hash, key, SMB2_CMACAES_SIZE);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
- return rc;
+ goto out;
}
/*
@@ -537,7 +557,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
rc = crypto_shash_init(shash);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
- return rc;
+ goto out;
}
/*
@@ -554,7 +574,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with payload\n",
__func__);
- return rc;
+ goto out;
}
drqst.rq_iov++;
drqst.rq_nvec--;
@@ -564,6 +584,9 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (!rc)
memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+out:
+ if (allocate_crypto)
+ cifs_free_hash(&hash, &sdesc);
return rc;
}
@@ -593,7 +616,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
return 0;
}
- rc = server->ops->calc_signature(rqst, server);
+ rc = server->ops->calc_signature(rqst, server, false);
return rc;
}
@@ -602,7 +625,7 @@ int
smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
unsigned int rc;
- char server_response_sig[16];
+ char server_response_sig[SMB2_SIGNATURE_SIZE];
struct smb2_sync_hdr *shdr =
(struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
@@ -631,16 +654,16 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
memset(shdr->Signature, 0, SMB2_SIGNATURE_SIZE);
- mutex_lock(&server->srv_mutex);
- rc = server->ops->calc_signature(rqst, server);
- mutex_unlock(&server->srv_mutex);
+ rc = server->ops->calc_signature(rqst, server, true);
if (rc)
return rc;
- if (memcmp(server_response_sig, shdr->Signature, SMB2_SIGNATURE_SIZE))
+ if (memcmp(server_response_sig, shdr->Signature, SMB2_SIGNATURE_SIZE)) {
+ dump_stack();
+ cifs_dbg(VFS, "sign fail cmd 0x%x message id 0x%llx\n", shdr->Command, shdr->MessageId);
return -EACCES;
- else
+ } else
return 0;
}
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5b1b97e9e0c9..1a5834a5d597 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -284,13 +284,10 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
request->sge[i].length,
DMA_TO_DEVICE);
- if (request->has_payload) {
- if (atomic_dec_and_test(&request->info->send_payload_pending))
- wake_up(&request->info->wait_send_payload_pending);
- } else {
- if (atomic_dec_and_test(&request->info->send_pending))
- wake_up(&request->info->wait_send_pending);
- }
+ if (atomic_dec_and_test(&request->info->send_pending))
+ wake_up(&request->info->wait_send_pending);
+
+ wake_up(&request->info->wait_post_send);
mempool_free(request, request->info->request_mempool);
}
@@ -383,27 +380,6 @@ static bool process_negotiation_response(
return true;
}
-/*
- * Check and schedule to send an immediate packet
- * This is used to extend credtis to remote peer to keep the transport busy
- */
-static void check_and_send_immediate(struct smbd_connection *info)
-{
- if (info->transport_status != SMBD_CONNECTED)
- return;
-
- info->send_immediate = true;
-
- /*
- * Promptly send a packet if our peer is running low on receive
- * credits
- */
- if (atomic_read(&info->receive_credits) <
- info->receive_credit_target - 1)
- queue_delayed_work(
- info->workqueue, &info->send_immediate_work, 0);
-}
-
static void smbd_post_send_credits(struct work_struct *work)
{
int ret = 0;
@@ -453,29 +429,16 @@ static void smbd_post_send_credits(struct work_struct *work)
info->new_credits_offered += ret;
spin_unlock(&info->lock_new_credits_offered);
- atomic_add(ret, &info->receive_credits);
-
- /* Check if we can post new receive and grant credits to peer */
- check_and_send_immediate(info);
-}
-
-static void smbd_recv_done_work(struct work_struct *work)
-{
- struct smbd_connection *info =
- container_of(work, struct smbd_connection, recv_done_work);
-
- /*
- * We may have new send credits granted from remote peer
- * If any sender is blcoked on lack of credets, unblock it
- */
- if (atomic_read(&info->send_credits))
- wake_up_interruptible(&info->wait_send_queue);
-
- /*
- * Check if we need to send something to remote peer to
- * grant more credits or respond to KEEP_ALIVE packet
- */
- check_and_send_immediate(info);
+ /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */
+ info->send_immediate = true;
+ if (atomic_read(&info->receive_credits) <
+ info->receive_credit_target - 1) {
+ if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
+ info->send_immediate) {
+ log_keep_alive(INFO, "send an empty message\n");
+ smbd_post_send_empty(info);
+ }
+ }
}
/* Called from softirq, when recv is done */
@@ -546,8 +509,15 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
atomic_dec(&info->receive_credits);
info->receive_credit_target =
le16_to_cpu(data_transfer->credits_requested);
- atomic_add(le16_to_cpu(data_transfer->credits_granted),
- &info->send_credits);
+ if (le16_to_cpu(data_transfer->credits_granted)) {
+ atomic_add(le16_to_cpu(data_transfer->credits_granted),
+ &info->send_credits);
+ /*
+ * We have new send credits granted from remote peer
+ * If any sender is waiting for credits, unblock it
+ */
+ wake_up_interruptible(&info->wait_send_queue);
+ }
log_incoming(INFO, "data flags %d data_offset %d "
"data_length %d remaining_data_length %d\n",
@@ -563,7 +533,6 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
info->keep_alive_requested = KEEP_ALIVE_PENDING;
}
- queue_work(info->workqueue, &info->recv_done_work);
return;
default:
@@ -756,7 +725,6 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info)
request->sge[0].addr,
request->sge[0].length, request->sge[0].lkey);
- request->has_payload = false;
atomic_inc(&info->send_pending);
rc = ib_post_send(info->id->qp, &send_wr, NULL);
if (!rc)
@@ -813,45 +781,96 @@ static int manage_keep_alive_before_sending(struct smbd_connection *info)
return 0;
}
-/*
- * Build and prepare the SMBD packet header
- * This function waits for avaialbe send credits and build a SMBD packet
- * header. The caller then optional append payload to the packet after
- * the header
- * intput values
- * size: the size of the payload
- * remaining_data_length: remaining data to send if this is part of a
- * fragmented packet
- * output values
- * request_out: the request allocated from this function
- * return values: 0 on success, otherwise actual error code returned
- */
-static int smbd_create_header(struct smbd_connection *info,
- int size, int remaining_data_length,
- struct smbd_request **request_out)
+/* Post the send request */
+static int smbd_post_send(struct smbd_connection *info,
+ struct smbd_request *request)
{
+ struct ib_send_wr send_wr;
+ int rc, i;
+
+ for (i = 0; i < request->num_sge; i++) {
+ log_rdma_send(INFO,
+ "rdma_request sge[%d] addr=%llu length=%u\n",
+ i, request->sge[i].addr, request->sge[i].length);
+ ib_dma_sync_single_for_device(
+ info->id->device,
+ request->sge[i].addr,
+ request->sge[i].length,
+ DMA_TO_DEVICE);
+ }
+
+ request->cqe.done = send_done;
+
+ send_wr.next = NULL;
+ send_wr.wr_cqe = &request->cqe;
+ send_wr.sg_list = request->sge;
+ send_wr.num_sge = request->num_sge;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.send_flags = IB_SEND_SIGNALED;
+
+ rc = ib_post_send(info->id->qp, &send_wr, NULL);
+ if (rc) {
+ log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
+ smbd_disconnect_rdma_connection(info);
+ rc = -EAGAIN;
+ } else
+ /* Reset timer for idle connection after packet is sent */
+ mod_delayed_work(info->workqueue, &info->idle_timer_work,
+ info->keep_alive_interval*HZ);
+
+ return rc;
+}
+
+static int smbd_post_send_sgl(struct smbd_connection *info,
+ struct scatterlist *sgl, int data_length, int remaining_data_length)
+{
+ int num_sgs;
+ int i, rc;
+ int header_length;
struct smbd_request *request;
struct smbd_data_transfer *packet;
- int header_length;
- int rc;
+ int new_credits;
+ struct scatterlist *sg;
+wait_credit:
/* Wait for send credits. A SMBD packet needs one credit */
rc = wait_event_interruptible(info->wait_send_queue,
atomic_read(&info->send_credits) > 0 ||
info->transport_status != SMBD_CONNECTED);
if (rc)
- return rc;
+ goto err_wait_credit;
if (info->transport_status != SMBD_CONNECTED) {
- log_outgoing(ERR, "disconnected not sending\n");
- return -EAGAIN;
+ log_outgoing(ERR, "disconnected not sending on wait_credit\n");
+ rc = -EAGAIN;
+ goto err_wait_credit;
+ }
+ if (unlikely(atomic_dec_return(&info->send_credits) < 0)) {
+ atomic_inc(&info->send_credits);
+ goto wait_credit;
+ }
+
+wait_send_queue:
+ wait_event(info->wait_post_send,
+ atomic_read(&info->send_pending) < info->send_credit_target ||
+ info->transport_status != SMBD_CONNECTED);
+
+ if (info->transport_status != SMBD_CONNECTED) {
+ log_outgoing(ERR, "disconnected not sending on wait_send_queue\n");
+ rc = -EAGAIN;
+ goto err_wait_send_queue;
+ }
+
+ if (unlikely(atomic_inc_return(&info->send_pending) >
+ info->send_credit_target)) {
+ atomic_dec(&info->send_pending);
+ goto wait_send_queue;
}
- atomic_dec(&info->send_credits);
request = mempool_alloc(info->request_mempool, GFP_KERNEL);
if (!request) {
rc = -ENOMEM;
- goto err;
+ goto err_alloc;
}
request->info = info;
@@ -859,8 +878,11 @@ static int smbd_create_header(struct smbd_connection *info,
/* Fill in the packet header */
packet = smbd_request_payload(request);
packet->credits_requested = cpu_to_le16(info->send_credit_target);
- packet->credits_granted =
- cpu_to_le16(manage_credits_prior_sending(info));
+
+ new_credits = manage_credits_prior_sending(info);
+ atomic_add(new_credits, &info->receive_credits);
+ packet->credits_granted = cpu_to_le16(new_credits);
+
info->send_immediate = false;
packet->flags = 0;
@@ -868,11 +890,11 @@ static int smbd_create_header(struct smbd_connection *info,
packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED);
packet->reserved = 0;
- if (!size)
+ if (!data_length)
packet->data_offset = 0;
else
packet->data_offset = cpu_to_le32(24);
- packet->data_length = cpu_to_le32(size);
+ packet->data_length = cpu_to_le32(data_length);
packet->remaining_data_length = cpu_to_le32(remaining_data_length);
packet->padding = 0;
@@ -887,7 +909,7 @@ static int smbd_create_header(struct smbd_connection *info,
/* Map the packet to DMA */
header_length = sizeof(struct smbd_data_transfer);
/* If this is a packet without payload, don't send padding */
- if (!size)
+ if (!data_length)
header_length = offsetof(struct smbd_data_transfer, padding);
request->num_sge = 1;
@@ -896,102 +918,15 @@ static int smbd_create_header(struct smbd_connection *info,
header_length,
DMA_TO_DEVICE);
if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
- mempool_free(request, info->request_mempool);
rc = -EIO;
- goto err;
+ request->sge[0].addr = 0;
+ goto err_dma;
}
request->sge[0].length = header_length;
request->sge[0].lkey = info->pd->local_dma_lkey;
- *request_out = request;
- return 0;
-
-err:
- atomic_inc(&info->send_credits);
- return rc;
-}
-
-static void smbd_destroy_header(struct smbd_connection *info,
- struct smbd_request *request)
-{
-
- ib_dma_unmap_single(info->id->device,
- request->sge[0].addr,
- request->sge[0].length,
- DMA_TO_DEVICE);
- mempool_free(request, info->request_mempool);
- atomic_inc(&info->send_credits);
-}
-
-/* Post the send request */
-static int smbd_post_send(struct smbd_connection *info,
- struct smbd_request *request, bool has_payload)
-{
- struct ib_send_wr send_wr;
- int rc, i;
-
- for (i = 0; i < request->num_sge; i++) {
- log_rdma_send(INFO,
- "rdma_request sge[%d] addr=%llu length=%u\n",
- i, request->sge[i].addr, request->sge[i].length);
- ib_dma_sync_single_for_device(
- info->id->device,
- request->sge[i].addr,
- request->sge[i].length,
- DMA_TO_DEVICE);
- }
-
- request->cqe.done = send_done;
-
- send_wr.next = NULL;
- send_wr.wr_cqe = &request->cqe;
- send_wr.sg_list = request->sge;
- send_wr.num_sge = request->num_sge;
- send_wr.opcode = IB_WR_SEND;
- send_wr.send_flags = IB_SEND_SIGNALED;
-
- if (has_payload) {
- request->has_payload = true;
- atomic_inc(&info->send_payload_pending);
- } else {
- request->has_payload = false;
- atomic_inc(&info->send_pending);
- }
-
- rc = ib_post_send(info->id->qp, &send_wr, NULL);
- if (rc) {
- log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
- if (has_payload) {
- if (atomic_dec_and_test(&info->send_payload_pending))
- wake_up(&info->wait_send_payload_pending);
- } else {
- if (atomic_dec_and_test(&info->send_pending))
- wake_up(&info->wait_send_pending);
- }
- smbd_disconnect_rdma_connection(info);
- rc = -EAGAIN;
- } else
- /* Reset timer for idle connection after packet is sent */
- mod_delayed_work(info->workqueue, &info->idle_timer_work,
- info->keep_alive_interval*HZ);
-
- return rc;
-}
-
-static int smbd_post_send_sgl(struct smbd_connection *info,
- struct scatterlist *sgl, int data_length, int remaining_data_length)
-{
- int num_sgs;
- int i, rc;
- struct smbd_request *request;
- struct scatterlist *sg;
-
- rc = smbd_create_header(
- info, data_length, remaining_data_length, &request);
- if (rc)
- return rc;
-
+ /* Fill in the packet data payload */
num_sgs = sgl ? sg_nents(sgl) : 0;
for_each_sg(sgl, sg, num_sgs, i) {
request->sge[i+1].addr =
@@ -1001,25 +936,41 @@ static int smbd_post_send_sgl(struct smbd_connection *info,
info->id->device, request->sge[i+1].addr)) {
rc = -EIO;
request->sge[i+1].addr = 0;
- goto dma_mapping_failure;
+ goto err_dma;
}
request->sge[i+1].length = sg->length;
request->sge[i+1].lkey = info->pd->local_dma_lkey;
request->num_sge++;
}
- rc = smbd_post_send(info, request, data_length);
+ rc = smbd_post_send(info, request);
if (!rc)
return 0;
-dma_mapping_failure:
- for (i = 1; i < request->num_sge; i++)
+err_dma:
+ for (i = 0; i < request->num_sge; i++)
if (request->sge[i].addr)
ib_dma_unmap_single(info->id->device,
request->sge[i].addr,
request->sge[i].length,
DMA_TO_DEVICE);
- smbd_destroy_header(info, request);
+ mempool_free(request, info->request_mempool);
+
+ /* roll back receive credits and credits to be offered */
+ spin_lock(&info->lock_new_credits_offered);
+ info->new_credits_offered += new_credits;
+ spin_unlock(&info->lock_new_credits_offered);
+ atomic_sub(new_credits, &info->receive_credits);
+
+err_alloc:
+ if (atomic_dec_and_test(&info->send_pending))
+ wake_up(&info->wait_send_pending);
+
+err_wait_send_queue:
+ /* roll back send credits and pending */
+ atomic_inc(&info->send_credits);
+
+err_wait_credit:
return rc;
}
@@ -1341,25 +1292,6 @@ static void destroy_receive_buffers(struct smbd_connection *info)
mempool_free(response, info->response_mempool);
}
-/*
- * Check and send an immediate or keep alive packet
- * The condition to send those packets are defined in [MS-SMBD] 3.1.1.1
- * Connection.KeepaliveRequested and Connection.SendImmediate
- * The idea is to extend credits to server as soon as it becomes available
- */
-static void send_immediate_work(struct work_struct *work)
-{
- struct smbd_connection *info = container_of(
- work, struct smbd_connection,
- send_immediate_work.work);
-
- if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
- info->send_immediate) {
- log_keep_alive(INFO, "send an empty message\n");
- smbd_post_send_empty(info);
- }
-}
-
/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
static void idle_connection_timer(struct work_struct *work)
{
@@ -1414,14 +1346,10 @@ void smbd_destroy(struct TCP_Server_Info *server)
log_rdma_event(INFO, "cancelling idle timer\n");
cancel_delayed_work_sync(&info->idle_timer_work);
- log_rdma_event(INFO, "cancelling send immediate work\n");
- cancel_delayed_work_sync(&info->send_immediate_work);
log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
wait_event(info->wait_send_pending,
atomic_read(&info->send_pending) == 0);
- wait_event(info->wait_send_payload_pending,
- atomic_read(&info->send_payload_pending) == 0);
/* It's not posssible for upper layer to get to reassembly */
log_rdma_event(INFO, "drain the reassembly queue\n");
@@ -1751,18 +1679,15 @@ static struct smbd_connection *_smbd_get_connection(
init_waitqueue_head(&info->wait_send_queue);
INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
- INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work);
queue_delayed_work(info->workqueue, &info->idle_timer_work,
info->keep_alive_interval*HZ);
init_waitqueue_head(&info->wait_send_pending);
atomic_set(&info->send_pending, 0);
- init_waitqueue_head(&info->wait_send_payload_pending);
- atomic_set(&info->send_payload_pending, 0);
+ init_waitqueue_head(&info->wait_post_send);
INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
- INIT_WORK(&info->recv_done_work, smbd_recv_done_work);
INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
info->new_credits_offered = 0;
spin_lock_init(&info->lock_new_credits_offered);
@@ -2097,8 +2022,7 @@ int smbd_send(struct TCP_Server_Info *server,
for (i = 0; i < num_rqst; i++)
remaining_data_length += smb_rqst_len(server, &rqst_array[i]);
- if (remaining_data_length + sizeof(struct smbd_data_transfer) >
- info->max_fragmented_send_size) {
+ if (remaining_data_length > info->max_fragmented_send_size) {
log_write(ERR, "payload size %d > max size %d\n",
remaining_data_length, info->max_fragmented_send_size);
rc = -EINVAL;
@@ -2235,8 +2159,8 @@ done:
* that means all the I/Os have been out and we are good to return
*/
- wait_event(info->wait_send_payload_pending,
- atomic_read(&info->send_payload_pending) == 0);
+ wait_event(info->wait_send_pending,
+ atomic_read(&info->send_pending) == 0);
return rc;
}
diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h
index 6ff880a1e186..a87fca82a796 100644
--- a/fs/cifs/smbdirect.h
+++ b/fs/cifs/smbdirect.h
@@ -67,7 +67,6 @@ struct smbd_connection {
bool negotiate_done;
struct work_struct disconnect_work;
- struct work_struct recv_done_work;
struct work_struct post_send_credits_work;
spinlock_t lock_new_credits_offered;
@@ -115,8 +114,7 @@ struct smbd_connection {
/* Activity accoutning */
atomic_t send_pending;
wait_queue_head_t wait_send_pending;
- atomic_t send_payload_pending;
- wait_queue_head_t wait_send_payload_pending;
+ wait_queue_head_t wait_post_send;
/* Receive queue */
struct list_head receive_queue;
@@ -155,7 +153,6 @@ struct smbd_connection {
struct workqueue_struct *workqueue;
struct delayed_work idle_timer_work;
- struct delayed_work send_immediate_work;
/* Memory pool for preallocating buffers */
/* request pool for RDMA send */
@@ -235,9 +232,6 @@ struct smbd_request {
struct smbd_connection *info;
struct ib_cqe cqe;
- /* true if this request carries upper layer payload */
- bool has_payload;
-
/* the SGE entries for this packet */
struct ib_sge sge[SMBDIRECT_MAX_SGE];
int num_sge;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index cb3ee916f527..c97570eb2c18 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -466,7 +466,7 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
struct smb_rqst *rqst, int flags)
{
struct kvec iov;
- struct smb2_transform_hdr tr_hdr;
+ struct smb2_transform_hdr *tr_hdr;
struct smb_rqst cur_rqst[MAX_COMPOUND];
int rc;
@@ -476,28 +476,34 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
if (num_rqst > MAX_COMPOUND - 1)
return -ENOMEM;
- memset(&cur_rqst[0], 0, sizeof(cur_rqst));
- memset(&iov, 0, sizeof(iov));
- memset(&tr_hdr, 0, sizeof(tr_hdr));
-
- iov.iov_base = &tr_hdr;
- iov.iov_len = sizeof(tr_hdr);
- cur_rqst[0].rq_iov = &iov;
- cur_rqst[0].rq_nvec = 1;
-
if (!server->ops->init_transform_rq) {
cifs_server_dbg(VFS, "Encryption requested but transform "
"callback is missing\n");
return -EIO;
}
+ tr_hdr = kmalloc(sizeof(*tr_hdr), GFP_NOFS);
+ if (!tr_hdr)
+ return -ENOMEM;
+
+ memset(&cur_rqst[0], 0, sizeof(cur_rqst));
+ memset(&iov, 0, sizeof(iov));
+ memset(tr_hdr, 0, sizeof(*tr_hdr));
+
+ iov.iov_base = tr_hdr;
+ iov.iov_len = sizeof(*tr_hdr);
+ cur_rqst[0].rq_iov = &iov;
+ cur_rqst[0].rq_nvec = 1;
+
rc = server->ops->init_transform_rq(server, num_rqst + 1,
&cur_rqst[0], rqst);
if (rc)
- return rc;
+ goto out;
rc = __smb_send_rqst(server, num_rqst + 1, &cur_rqst[0]);
smb3_free_compound_rqst(num_rqst, &cur_rqst[1]);
+out:
+ kfree(tr_hdr);
return rc;
}
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 9aae851409e5..dbced2937ec8 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -76,6 +76,26 @@ static inline int fscrypt_context_size(const union fscrypt_context *ctx)
return 0;
}
+/* Check whether an fscrypt_context has a recognized version number and size */
+static inline bool fscrypt_context_is_valid(const union fscrypt_context *ctx,
+ int ctx_size)
+{
+ return ctx_size >= 1 && ctx_size == fscrypt_context_size(ctx);
+}
+
+/* Retrieve the context's nonce, assuming the context was already validated */
+static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx)
+{
+ switch (ctx->version) {
+ case FSCRYPT_CONTEXT_V1:
+ return ctx->v1.nonce;
+ case FSCRYPT_CONTEXT_V2:
+ return ctx->v2.nonce;
+ }
+ WARN_ON(1);
+ return NULL;
+}
+
#undef fscrypt_policy
union fscrypt_policy {
u8 version;
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 08c9f216a54d..302375e9f719 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -425,20 +425,8 @@ int fscrypt_get_encryption_info(struct inode *inode)
goto out;
}
- switch (ctx.version) {
- case FSCRYPT_CONTEXT_V1:
- memcpy(crypt_info->ci_nonce, ctx.v1.nonce,
- FS_KEY_DERIVATION_NONCE_SIZE);
- break;
- case FSCRYPT_CONTEXT_V2:
- memcpy(crypt_info->ci_nonce, ctx.v2.nonce,
- FS_KEY_DERIVATION_NONCE_SIZE);
- break;
- default:
- WARN_ON(1);
- res = -EINVAL;
- goto out;
- }
+ memcpy(crypt_info->ci_nonce, fscrypt_context_nonce(&ctx),
+ FS_KEY_DERIVATION_NONCE_SIZE);
if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) {
res = -EINVAL;
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index cf2a9d26ef7d..10ccf945020c 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -258,7 +258,7 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
{
memset(policy_u, 0, sizeof(*policy_u));
- if (ctx_size <= 0 || ctx_size != fscrypt_context_size(ctx_u))
+ if (!fscrypt_context_is_valid(ctx_u, ctx_size))
return -EINVAL;
switch (ctx_u->version) {
@@ -481,6 +481,25 @@ int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *uarg)
}
EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_policy_ex);
+/* FS_IOC_GET_ENCRYPTION_NONCE: retrieve file's encryption nonce for testing */
+int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg)
+{
+ struct inode *inode = file_inode(filp);
+ union fscrypt_context ctx;
+ int ret;
+
+ ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (ret < 0)
+ return ret;
+ if (!fscrypt_context_is_valid(&ctx, ret))
+ return -EINVAL;
+ if (copy_to_user(arg, fscrypt_context_nonce(&ctx),
+ FS_KEY_DERIVATION_NONCE_SIZE))
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_nonce);
+
/**
* fscrypt_has_permitted_context() - is a file's encryption policy permitted
* within its directory?
diff --git a/fs/dax.c b/fs/dax.c
index 35da144375a0..11b16729b86f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1038,50 +1038,43 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
return ret;
}
-static bool dax_range_is_aligned(struct block_device *bdev,
- unsigned int offset, unsigned int length)
+int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
+ struct iomap *iomap)
{
- unsigned short sector_size = bdev_logical_block_size(bdev);
+ sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
+ pgoff_t pgoff;
+ long rc, id;
+ void *kaddr;
+ bool page_aligned = false;
- if (!IS_ALIGNED(offset, sector_size))
- return false;
- if (!IS_ALIGNED(length, sector_size))
- return false;
- return true;
-}
+ if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
+ IS_ALIGNED(size, PAGE_SIZE))
+ page_aligned = true;
-int __dax_zero_page_range(struct block_device *bdev,
- struct dax_device *dax_dev, sector_t sector,
- unsigned int offset, unsigned int size)
-{
- if (dax_range_is_aligned(bdev, offset, size)) {
- sector_t start_sector = sector + (offset >> 9);
+ rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
+ if (rc)
+ return rc;
- return blkdev_issue_zeroout(bdev, start_sector,
- size >> 9, GFP_NOFS, 0);
- } else {
- pgoff_t pgoff;
- long rc, id;
- void *kaddr;
+ id = dax_read_lock();
- rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
- if (rc)
- return rc;
+ if (page_aligned)
+ rc = dax_zero_page_range(iomap->dax_dev, pgoff,
+ size >> PAGE_SHIFT);
+ else
+ rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
+ if (rc < 0) {
+ dax_read_unlock(id);
+ return rc;
+ }
- id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
- if (rc < 0) {
- dax_read_unlock(id);
- return rc;
- }
+ if (!page_aligned) {
memset(kaddr + offset, 0, size);
- dax_flush(dax_dev, kaddr + offset, size);
- dax_read_unlock(id);
+ dax_flush(iomap->dax_dev, kaddr + offset, size);
}
+ dax_read_unlock(id);
return 0;
}
-EXPORT_SYMBOL_GPL(__dax_zero_page_range);
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index db987b5110a9..2d357680094c 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/atomic.h>
#include <linux/device.h>
+#include <linux/pm_runtime.h>
#include <linux/poll.h>
#include <linux/security.h>
@@ -175,8 +176,13 @@ static int open_proxy_open(struct inode *inode, struct file *filp)
if (r)
goto out;
- real_fops = fops_get(real_fops);
- if (!real_fops) {
+ if (!fops_get(real_fops)) {
+#ifdef MODULE
+ if (real_fops->owner &&
+ real_fops->owner->state == MODULE_STATE_GOING)
+ goto out;
+#endif
+
/* Huh? Module did not clean up after itself at exit? */
WARN(1, "debugfs file owner did not clean up at exit: %pd",
dentry);
@@ -305,8 +311,13 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
if (r)
goto out;
- real_fops = fops_get(real_fops);
- if (!real_fops) {
+ if (!fops_get(real_fops)) {
+#ifdef MODULE
+ if (real_fops->owner &&
+ real_fops->owner->state == MODULE_STATE_GOING)
+ goto out;
+#endif
+
/* Huh? Module did not cleanup after itself at exit? */
WARN(1, "debugfs file owner did not clean up at exit: %pd",
dentry);
@@ -1060,7 +1071,14 @@ static int debugfs_show_regset32(struct seq_file *s, void *data)
{
struct debugfs_regset32 *regset = s->private;
+ if (regset->dev)
+ pm_runtime_get_sync(regset->dev);
+
debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, "");
+
+ if (regset->dev)
+ pm_runtime_put(regset->dev);
+
return 0;
}
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e742dfc66933..b7f2e971ecbc 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -501,26 +501,16 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe);
* wide range of flexibility in creating a file, or a directory (if you want
* to create a directory, the debugfs_create_dir() function is
* recommended to be used instead.)
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
*/
-struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
- struct dentry *parent, void *data,
- const struct file_operations *fops,
- loff_t file_size)
+void debugfs_create_file_size(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops,
+ loff_t file_size)
{
struct dentry *de = debugfs_create_file(name, mode, parent, data, fops);
if (de)
d_inode(de)->i_size = file_size;
- return de;
}
EXPORT_SYMBOL_GPL(debugfs_create_file_size);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index fa4f6447ddad..12c66f5d92dd 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -252,7 +252,7 @@ static struct file_system_type efivarfs_type = {
static __init int efivarfs_init(void)
{
- if (!efi_enabled(EFI_RUNTIME_SERVICES))
+ if (!efi_rt_services_supported(EFI_RT_SUPPORTED_VARIABLE_SERVICES))
return -ENODEV;
if (!efivars_kobject())
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 5779a15c2cd6..5d2d81940679 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -157,17 +157,27 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
}
}
- ret = LZ4_decompress_safe_partial(src + inputmargin, out,
- inlen, rq->outputsize,
- rq->outputsize);
- if (ret < 0) {
- erofs_err(rq->sb, "failed to decompress, in[%u, %u] out[%u]",
- inlen, inputmargin, rq->outputsize);
+ /* legacy format could compress extra data in a pcluster. */
+ if (rq->partial_decoding || !support_0padding)
+ ret = LZ4_decompress_safe_partial(src + inputmargin, out,
+ inlen, rq->outputsize,
+ rq->outputsize);
+ else
+ ret = LZ4_decompress_safe(src + inputmargin, out,
+ inlen, rq->outputsize);
+
+ if (ret != rq->outputsize) {
+ erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
+ ret, inlen, inputmargin, rq->outputsize);
+
WARN_ON(1);
print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
16, 1, src + inputmargin, inlen, true);
print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
16, 1, out, rq->outputsize, true);
+
+ if (ret >= 0)
+ memset(out + ret, 0, rq->outputsize - ret);
ret = -EIO;
}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index c4c6dcdc89ad..5eead7fdc7a6 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -52,8 +52,8 @@ struct erofs_sb_info {
struct list_head list;
struct mutex umount_mutex;
- /* the dedicated workstation for compression */
- struct radix_tree_root workstn_tree;
+ /* managed XArray arranged in physical block number */
+ struct xarray managed_pslots;
/* threshold for decompression synchronously */
unsigned int max_sync_decompress_pages;
@@ -402,8 +402,8 @@ static inline void *erofs_get_pcpubuf(unsigned int pagenr)
int erofs_workgroup_put(struct erofs_workgroup *grp);
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
pgoff_t index);
-int erofs_register_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp);
+struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
+ struct erofs_workgroup *grp);
void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
void erofs_shrinker_register(struct super_block *sb);
void erofs_shrinker_unregister(struct super_block *sb);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 057e6d7b5b7f..b514c67e5fc2 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -425,7 +425,7 @@ static int erofs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags &= ~SB_POSIXACL;
#ifdef CONFIG_EROFS_FS_ZIP
- INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC);
+ xa_init(&sbi->managed_pslots);
#endif
/* get the root inode */
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index fddc5059c930..52d0be10f1aa 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -37,9 +37,6 @@ void *erofs_get_pcpubuf(unsigned int pagenr)
/* global shrink count (for all mounted EROFS instances) */
static atomic_long_t erofs_global_shrink_cnt;
-#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount)
-#define __erofs_workgroup_put(grp) atomic_dec(&(grp)->refcount)
-
static int erofs_workgroup_get(struct erofs_workgroup *grp)
{
int o;
@@ -66,7 +63,7 @@ struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
repeat:
rcu_read_lock();
- grp = radix_tree_lookup(&sbi->workstn_tree, index);
+ grp = xa_load(&sbi->managed_pslots, index);
if (grp) {
if (erofs_workgroup_get(grp)) {
/* prefer to relax rcu read side */
@@ -80,43 +77,37 @@ repeat:
return grp;
}
-int erofs_register_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp)
+struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
+ struct erofs_workgroup *grp)
{
- struct erofs_sb_info *sbi;
- int err;
-
- /* grp shouldn't be broken or used before */
- if (atomic_read(&grp->refcount) != 1) {
- DBG_BUGON(1);
- return -EINVAL;
- }
-
- err = radix_tree_preload(GFP_NOFS);
- if (err)
- return err;
-
- sbi = EROFS_SB(sb);
- xa_lock(&sbi->workstn_tree);
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+ struct erofs_workgroup *pre;
/*
- * Bump up reference count before making this workgroup
- * visible to other users in order to avoid potential UAF
- * without serialized by workstn_lock.
+ * Bump up a reference count before making this visible
+ * to others for the XArray in order to avoid potential
+ * UAF without serialized by xa_lock.
*/
- __erofs_workgroup_get(grp);
-
- err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp);
- if (err)
- /*
- * it's safe to decrease since the workgroup isn't visible
- * and refcount >= 2 (cannot be freezed).
- */
- __erofs_workgroup_put(grp);
+ atomic_inc(&grp->refcount);
- xa_unlock(&sbi->workstn_tree);
- radix_tree_preload_end();
- return err;
+repeat:
+ xa_lock(&sbi->managed_pslots);
+ pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
+ NULL, grp, GFP_NOFS);
+ if (pre) {
+ if (xa_is_err(pre)) {
+ pre = ERR_PTR(xa_err(pre));
+ } else if (erofs_workgroup_get(pre)) {
+ /* try to legitimize the current in-tree one */
+ xa_unlock(&sbi->managed_pslots);
+ cond_resched();
+ goto repeat;
+ }
+ atomic_dec(&grp->refcount);
+ grp = pre;
+ }
+ xa_unlock(&sbi->managed_pslots);
+ return grp;
}
static void __erofs_workgroup_free(struct erofs_workgroup *grp)
@@ -155,7 +146,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
/*
* Note that all cached pages should be unattached
- * before deleted from the radix tree. Otherwise some
+ * before deleted from the XArray. Otherwise some
* cached pages could be still attached to the orphan
* old workgroup when the new one is available in the tree.
*/
@@ -169,7 +160,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
* however in order to avoid some race conditions, add a
* DBG_BUGON to observe this in advance.
*/
- DBG_BUGON(radix_tree_delete(&sbi->workstn_tree, grp->index) != grp);
+ DBG_BUGON(xa_erase(&sbi->managed_pslots, grp->index) != grp);
/*
* If managed cache is on, last refcount should indicate
@@ -182,22 +173,11 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
unsigned long nr_shrink)
{
- pgoff_t first_index = 0;
- void *batch[PAGEVEC_SIZE];
+ struct erofs_workgroup *grp;
unsigned int freed = 0;
+ unsigned long index;
- int i, found;
-repeat:
- xa_lock(&sbi->workstn_tree);
-
- found = radix_tree_gang_lookup(&sbi->workstn_tree,
- batch, first_index, PAGEVEC_SIZE);
-
- for (i = 0; i < found; ++i) {
- struct erofs_workgroup *grp = batch[i];
-
- first_index = grp->index + 1;
-
+ xa_for_each(&sbi->managed_pslots, index, grp) {
/* try to shrink each valid workgroup */
if (!erofs_try_to_release_workgroup(sbi, grp))
continue;
@@ -206,10 +186,6 @@ repeat:
if (!--nr_shrink)
break;
}
- xa_unlock(&sbi->workstn_tree);
-
- if (i && nr_shrink)
- goto repeat;
return freed;
}
@@ -286,7 +262,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink,
spin_unlock(&erofs_sb_list_lock);
sbi->shrinker_run_no = run_no;
- freed += erofs_shrink_workstation(sbi, nr);
+ freed += erofs_shrink_workstation(sbi, nr - freed);
spin_lock(&erofs_sb_list_lock);
/* Get the next list element before we move this one */
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 80e47f07d946..c4b6c9aa87ec 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -67,16 +67,6 @@ static void z_erofs_pcluster_init_once(void *ptr)
pcl->compressed_pages[i] = NULL;
}
-static void z_erofs_pcluster_init_always(struct z_erofs_pcluster *pcl)
-{
- struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
-
- atomic_set(&pcl->obj.refcount, 1);
-
- DBG_BUGON(cl->nr_pages);
- DBG_BUGON(cl->vcnt);
-}
-
int __init z_erofs_init_zip_subsystem(void)
{
pcluster_cachep = kmem_cache_create("erofs_compress",
@@ -341,26 +331,19 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
struct inode *inode,
struct erofs_map_blocks *map)
{
- struct erofs_workgroup *grp;
- struct z_erofs_pcluster *pcl;
+ struct z_erofs_pcluster *pcl = clt->pcl;
struct z_erofs_collection *cl;
unsigned int length;
- grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
- if (!grp)
- return -ENOENT;
-
- pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ /* to avoid unexpected loop formed by corrupted images */
if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
DBG_BUGON(1);
- erofs_workgroup_put(grp);
return -EFSCORRUPTED;
}
cl = z_erofs_primarycollection(pcl);
if (cl->pageofs != (map->m_la & ~PAGE_MASK)) {
DBG_BUGON(1);
- erofs_workgroup_put(grp);
return -EFSCORRUPTED;
}
@@ -368,7 +351,6 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
DBG_BUGON(1);
- erofs_workgroup_put(grp);
return -EFSCORRUPTED;
}
} else {
@@ -391,7 +373,6 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
/* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */
if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
clt->tailpcl = NULL;
- clt->pcl = pcl;
clt->cl = cl;
return 0;
}
@@ -402,6 +383,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
{
struct z_erofs_pcluster *pcl;
struct z_erofs_collection *cl;
+ struct erofs_workgroup *grp;
int err;
/* no available workgroup, let's allocate one */
@@ -409,7 +391,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
if (!pcl)
return -ENOMEM;
- z_erofs_pcluster_init_always(pcl);
+ atomic_set(&pcl->obj.refcount, 1);
pcl->obj.index = map->m_pa >> PAGE_SHIFT;
pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
@@ -429,19 +411,29 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
clt->mode = COLLECT_PRIMARY_FOLLOWED;
cl = z_erofs_primarycollection(pcl);
+
+ /* must be cleaned before freeing to slab */
+ DBG_BUGON(cl->nr_pages);
+ DBG_BUGON(cl->vcnt);
+
cl->pageofs = map->m_la & ~PAGE_MASK;
/*
* lock all primary followed works before visible to others
* and mutex_trylock *never* fails for a new pcluster.
*/
- mutex_trylock(&cl->lock);
+ DBG_BUGON(!mutex_trylock(&cl->lock));
- err = erofs_register_workgroup(inode->i_sb, &pcl->obj);
- if (err) {
- mutex_unlock(&cl->lock);
- kmem_cache_free(pcluster_cachep, pcl);
- return -EAGAIN;
+ grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj);
+ if (IS_ERR(grp)) {
+ err = PTR_ERR(grp);
+ goto err_out;
+ }
+
+ if (grp != &pcl->obj) {
+ clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ err = -EEXIST;
+ goto err_out;
}
/* used to check tail merging loop due to corrupted images */
if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
@@ -450,12 +442,18 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
clt->pcl = pcl;
clt->cl = cl;
return 0;
+
+err_out:
+ mutex_unlock(&cl->lock);
+ kmem_cache_free(pcluster_cachep, pcl);
+ return err;
}
static int z_erofs_collector_begin(struct z_erofs_collector *clt,
struct inode *inode,
struct erofs_map_blocks *map)
{
+ struct erofs_workgroup *grp;
int ret;
DBG_BUGON(clt->cl);
@@ -469,21 +467,25 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt,
return -EINVAL;
}
-repeat:
- ret = z_erofs_lookup_collection(clt, inode, map);
- if (ret == -ENOENT) {
+ grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
+ if (grp) {
+ clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ } else {
ret = z_erofs_register_collection(clt, inode, map);
- /* someone registered at the same time, give another try */
- if (ret == -EAGAIN) {
- cond_resched();
- goto repeat;
- }
+ if (!ret)
+ goto out;
+ if (ret != -EEXIST)
+ return ret;
}
- if (ret)
+ ret = z_erofs_lookup_collection(clt, inode, map);
+ if (ret) {
+ erofs_workgroup_put(&clt->pcl->obj);
return ret;
+ }
+out:
z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
clt->cl->pagevec, clt->cl->vcnt);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index eee3c92a9ebf..8c596641a72b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -218,13 +218,18 @@ struct eventpoll {
struct file *file;
/* used to optimize loop detection check */
- int visited;
struct list_head visited_list_link;
+ int visited;
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /* tracks wakeup nests for lockdep validation */
+ u8 nests;
+#endif
};
/* Wait structure used by the poll hooks */
@@ -545,30 +550,47 @@ out_unlock:
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static DEFINE_PER_CPU(int, wakeup_nest);
-
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
+ struct eventpoll *ep_src;
unsigned long flags;
- int subclass;
+ u8 nests = 0;
- local_irq_save(flags);
- preempt_disable();
- subclass = __this_cpu_read(wakeup_nest);
- spin_lock_nested(&wq->lock, subclass + 1);
- __this_cpu_inc(wakeup_nest);
- wake_up_locked_poll(wq, POLLIN);
- __this_cpu_dec(wakeup_nest);
- spin_unlock(&wq->lock);
- local_irq_restore(flags);
- preempt_enable();
+ /*
+ * To set the subclass or nesting level for spin_lock_irqsave_nested()
+ * it might be natural to create a per-cpu nest count. However, since
+ * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
+ * schedule() in the -rt kernel, the per-cpu variable are no longer
+ * protected. Thus, we are introducing a per eventpoll nest field.
+ * If we are not being call from ep_poll_callback(), epi is NULL and
+ * we are at the first level of nesting, 0. Otherwise, we are being
+ * called from ep_poll_callback() and if a previous wakeup source is
+ * not an epoll file itself, we are at depth 1 since the wakeup source
+ * is depth 0. If the wakeup source is a previous epoll file in the
+ * wakeup chain then we use its nests value and record ours as
+ * nests + 1. The previous epoll file nests value is stable since its
+ * already holding its own poll_wait.lock.
+ */
+ if (epi) {
+ if ((is_file_epoll(epi->ffd.file))) {
+ ep_src = epi->ffd.file->private_data;
+ nests = ep_src->nests;
+ } else {
+ nests = 1;
+ }
+ }
+ spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
+ ep->nests = nests + 1;
+ wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
+ ep->nests = 0;
+ spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}
#else
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
- wake_up_poll(wq, EPOLLIN);
+ wake_up_poll(&ep->poll_wait, EPOLLIN);
}
#endif
@@ -789,7 +811,7 @@ static void ep_free(struct eventpoll *ep)
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);
/*
* We need to lock this because we could be hit by
@@ -1258,7 +1280,7 @@ out_unlock:
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, epi);
if (!(epi->event.events & EPOLLEXCLUSIVE))
ewake = 1;
@@ -1562,7 +1584,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);
return 0;
@@ -1666,7 +1688,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);
return 0;
}
diff --git a/fs/exec.c b/fs/exec.c
index db17be51b112..06b4c550af5d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -985,6 +985,32 @@ int kernel_read_file_from_path(const char *path, void **buf, loff_t *size,
}
EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
+int kernel_read_file_from_path_initns(const char *path, void **buf,
+ loff_t *size, loff_t max_size,
+ enum kernel_read_file_id id)
+{
+ struct file *file;
+ struct path root;
+ int ret;
+
+ if (!path || !*path)
+ return -EINVAL;
+
+ task_lock(&init_task);
+ get_fs_root(init_task.fs, &root);
+ task_unlock(&init_task);
+
+ file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0);
+ path_put(&root);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ ret = kernel_read_file(file, buf, size, max_size, id);
+ fput(file);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns);
+
int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
enum kernel_read_file_id id)
{
@@ -1010,16 +1036,26 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
}
EXPORT_SYMBOL(read_code);
+/*
+ * Maps the mm_struct mm into the current task struct.
+ * On success, this function returns with the mutex
+ * exec_update_mutex locked.
+ */
static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
struct mm_struct *old_mm, *active_mm;
+ int ret;
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
old_mm = current->mm;
exec_mm_release(tsk, old_mm);
+ ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
+ if (ret)
+ return ret;
+
if (old_mm) {
sync_mm_rss(old_mm);
/*
@@ -1031,9 +1067,11 @@ static int exec_mmap(struct mm_struct *mm)
down_read(&old_mm->mmap_sem);
if (unlikely(old_mm->core_state)) {
up_read(&old_mm->mmap_sem);
+ mutex_unlock(&tsk->signal->exec_update_mutex);
return -EINTR;
}
}
+
task_lock(tsk);
active_mm = tsk->active_mm;
membarrier_exec_mmap(mm);
@@ -1189,10 +1227,22 @@ no_thread_group:
/* we have changed execution domain */
tsk->exit_signal = SIGCHLD;
-#ifdef CONFIG_POSIX_TIMERS
- exit_itimers(sig);
- flush_itimer_signals();
-#endif
+ BUG_ON(!thread_group_leader(tsk));
+ return 0;
+
+killed:
+ /* protects against exit_notify() and __exit_signal() */
+ read_lock(&tasklist_lock);
+ sig->group_exit_task = NULL;
+ sig->notify_count = 0;
+ read_unlock(&tasklist_lock);
+ return -EAGAIN;
+}
+
+
+static int unshare_sighand(struct task_struct *me)
+{
+ struct sighand_struct *oldsighand = me->sighand;
if (refcount_read(&oldsighand->count) != 1) {
struct sighand_struct *newsighand;
@@ -1210,23 +1260,13 @@ no_thread_group:
write_lock_irq(&tasklist_lock);
spin_lock(&oldsighand->siglock);
- rcu_assign_pointer(tsk->sighand, newsighand);
+ rcu_assign_pointer(me->sighand, newsighand);
spin_unlock(&oldsighand->siglock);
write_unlock_irq(&tasklist_lock);
__cleanup_sighand(oldsighand);
}
-
- BUG_ON(!thread_group_leader(tsk));
return 0;
-
-killed:
- /* protects against exit_notify() and __exit_signal() */
- read_lock(&tasklist_lock);
- sig->group_exit_task = NULL;
- sig->notify_count = 0;
- read_unlock(&tasklist_lock);
- return -EAGAIN;
}
char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
@@ -1260,13 +1300,13 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
*/
int flush_old_exec(struct linux_binprm * bprm)
{
+ struct task_struct *me = current;
int retval;
/*
- * Make sure we have a private signal table and that
- * we are unassociated from the previous thread group.
+ * Make this the only thread in the thread group.
*/
- retval = de_thread(current);
+ retval = de_thread(me);
if (retval)
goto out;
@@ -1286,18 +1326,31 @@ int flush_old_exec(struct linux_binprm * bprm)
goto out;
/*
- * After clearing bprm->mm (to mark that current is using the
- * prepared mm now), we have nothing left of the original
+ * After setting bprm->called_exec_mmap (to mark that current is
+ * using the prepared mm now), we have nothing left of the original
* process. If anything from here on returns an error, the check
* in search_binary_handler() will SEGV current.
*/
+ bprm->called_exec_mmap = 1;
bprm->mm = NULL;
+#ifdef CONFIG_POSIX_TIMERS
+ exit_itimers(me->signal);
+ flush_itimer_signals();
+#endif
+
+ /*
+ * Make the signal table private.
+ */
+ retval = unshare_sighand(me);
+ if (retval)
+ goto out;
+
set_fs(USER_DS);
- current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
+ me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
PF_NOFREEZE | PF_NO_SETAFFINITY);
flush_thread();
- current->personality &= ~bprm->per_clear;
+ me->personality &= ~bprm->per_clear;
/*
* We have to apply CLOEXEC before we change whether the process is
@@ -1305,7 +1358,7 @@ int flush_old_exec(struct linux_binprm * bprm)
* trying to access the should-be-closed file descriptors of a process
* undergoing exec(2).
*/
- do_close_on_exec(current->files);
+ do_close_on_exec(me->files);
return 0;
out:
@@ -1386,7 +1439,7 @@ void setup_new_exec(struct linux_binprm * bprm)
/* An exec changes our domain. We are no longer part of the thread
group */
- current->self_exec_id++;
+ WRITE_ONCE(current->self_exec_id, current->self_exec_id + 1);
flush_signal_handlers(current, 0);
}
EXPORT_SYMBOL(setup_new_exec);
@@ -1424,6 +1477,8 @@ static void free_bprm(struct linux_binprm *bprm)
{
free_arg_pages(bprm);
if (bprm->cred) {
+ if (bprm->called_exec_mmap)
+ mutex_unlock(&current->signal->exec_update_mutex);
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
@@ -1473,6 +1528,7 @@ void install_exec_creds(struct linux_binprm *bprm)
* credentials; any time after this it may be unlocked.
*/
security_bprm_committed_creds(bprm);
+ mutex_unlock(&current->signal->exec_update_mutex);
mutex_unlock(&current->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(install_exec_creds);
@@ -1664,7 +1720,7 @@ int search_binary_handler(struct linux_binprm *bprm)
read_lock(&binfmt_lock);
put_binfmt(fmt);
- if (retval < 0 && !bprm->mm) {
+ if (retval < 0 && bprm->called_exec_mmap) {
/* we got to flush_old_exec() and failed after it */
read_unlock(&binfmt_lock);
force_sigsegv(SIGSEGV);
diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig
new file mode 100644
index 000000000000..2d3636dc5b8c
--- /dev/null
+++ b/fs/exfat/Kconfig
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+config EXFAT_FS
+ tristate "exFAT filesystem support"
+ select NLS
+ help
+ This allows you to mount devices formatted with the exFAT file system.
+ exFAT is typically used on SD-Cards or USB sticks.
+
+ To compile this as a module, choose M here: the module will be called
+ exfat.
+
+config EXFAT_DEFAULT_IOCHARSET
+ string "Default iocharset for exFAT"
+ default "utf8"
+ depends on EXFAT_FS
+ help
+ Set this to the default input/output character set to use for
+ converting between the encoding is used for user visible filename and
+ UTF-16 character that exfat filesystem use, and can be overridden with
+ the "iocharset" mount option for exFAT filesystems.
diff --git a/fs/exfat/Makefile b/fs/exfat/Makefile
new file mode 100644
index 000000000000..ed51926a4971
--- /dev/null
+++ b/fs/exfat/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Makefile for the linux exFAT filesystem support.
+#
+obj-$(CONFIG_EXFAT_FS) += exfat.o
+
+exfat-y := inode.o namei.o dir.o super.o fatent.o cache.o nls.o misc.o \
+ file.o balloc.o
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
new file mode 100644
index 000000000000..6a04cc02565a
--- /dev/null
+++ b/fs/exfat/balloc.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+
+#include "exfat_raw.h"
+#include "exfat_fs.h"
+
+static const unsigned char free_bit[] = {
+ 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,/* 0 ~ 19*/
+ 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3,/* 20 ~ 39*/
+ 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/* 40 ~ 59*/
+ 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,/* 60 ~ 79*/
+ 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2,/* 80 ~ 99*/
+ 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3,/*100 ~ 119*/
+ 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/*120 ~ 139*/
+ 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5,/*140 ~ 159*/
+ 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,/*160 ~ 179*/
+ 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3,/*180 ~ 199*/
+ 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/*200 ~ 219*/
+ 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,/*220 ~ 239*/
+ 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 /*240 ~ 254*/
+};
+
+static const unsigned char used_bit[] = {
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3,/* 0 ~ 19*/
+ 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4,/* 20 ~ 39*/
+ 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5,/* 40 ~ 59*/
+ 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,/* 60 ~ 79*/
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4,/* 80 ~ 99*/
+ 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,/*100 ~ 119*/
+ 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4,/*120 ~ 139*/
+ 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,/*140 ~ 159*/
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5,/*160 ~ 179*/
+ 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5,/*180 ~ 199*/
+ 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6,/*200 ~ 219*/
+ 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,/*220 ~ 239*/
+ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 /*240 ~ 255*/
+};
+
+/*
+ * Allocation Bitmap Management Functions
+ */
+static int exfat_allocate_bitmap(struct super_block *sb,
+ struct exfat_dentry *ep)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ long long map_size;
+ unsigned int i, need_map_size;
+ sector_t sector;
+
+ sbi->map_clu = le32_to_cpu(ep->dentry.bitmap.start_clu);
+ map_size = le64_to_cpu(ep->dentry.bitmap.size);
+ need_map_size = ((EXFAT_DATA_CLUSTER_COUNT(sbi) - 1) / BITS_PER_BYTE)
+ + 1;
+ if (need_map_size != map_size) {
+ exfat_msg(sb, KERN_ERR,
+ "bogus allocation bitmap size(need : %u, cur : %lld)",
+ need_map_size, map_size);
+ /*
+ * Only allowed when bogus allocation
+ * bitmap size is large
+ */
+ if (need_map_size > map_size)
+ return -EIO;
+ }
+ sbi->map_sectors = ((need_map_size - 1) >>
+ (sb->s_blocksize_bits)) + 1;
+ sbi->vol_amap = kmalloc_array(sbi->map_sectors,
+ sizeof(struct buffer_head *), GFP_KERNEL);
+ if (!sbi->vol_amap)
+ return -ENOMEM;
+
+ sector = exfat_cluster_to_sector(sbi, sbi->map_clu);
+ for (i = 0; i < sbi->map_sectors; i++) {
+ sbi->vol_amap[i] = sb_bread(sb, sector + i);
+ if (!sbi->vol_amap[i]) {
+ /* release all buffers and free vol_amap */
+ int j = 0;
+
+ while (j < i)
+ brelse(sbi->vol_amap[j++]);
+
+ kfree(sbi->vol_amap);
+ sbi->vol_amap = NULL;
+ return -EIO;
+ }
+ }
+
+ sbi->pbr_bh = NULL;
+ return 0;
+}
+
+int exfat_load_bitmap(struct super_block *sb)
+{
+ unsigned int i, type;
+ struct exfat_chain clu;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ exfat_chain_set(&clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
+ while (clu.dir != EXFAT_EOF_CLUSTER) {
+ for (i = 0; i < sbi->dentries_per_clu; i++) {
+ struct exfat_dentry *ep;
+ struct buffer_head *bh;
+
+ ep = exfat_get_dentry(sb, &clu, i, &bh, NULL);
+ if (!ep)
+ return -EIO;
+
+ type = exfat_get_entry_type(ep);
+ if (type == TYPE_UNUSED)
+ break;
+ if (type != TYPE_BITMAP)
+ continue;
+ if (ep->dentry.bitmap.flags == 0x0) {
+ int err;
+
+ err = exfat_allocate_bitmap(sb, ep);
+ brelse(bh);
+ return err;
+ }
+ brelse(bh);
+ }
+
+ if (exfat_get_next_cluster(sb, &clu.dir))
+ return -EIO;
+ }
+
+ return -EINVAL;
+}
+
+void exfat_free_bitmap(struct exfat_sb_info *sbi)
+{
+ int i;
+
+ brelse(sbi->pbr_bh);
+
+ for (i = 0; i < sbi->map_sectors; i++)
+ __brelse(sbi->vol_amap[i]);
+
+ kfree(sbi->vol_amap);
+}
+
+/*
+ * If the value of "clu" is 0, it means cluster 2 which is the first cluster of
+ * the cluster heap.
+ */
+int exfat_set_bitmap(struct inode *inode, unsigned int clu)
+{
+ int i, b;
+ unsigned int ent_idx;
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ WARN_ON(clu < EXFAT_FIRST_CLUSTER);
+ ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
+ i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
+ b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
+
+ set_bit_le(b, sbi->vol_amap[i]->b_data);
+ exfat_update_bh(sb, sbi->vol_amap[i], IS_DIRSYNC(inode));
+ return 0;
+}
+
+/*
+ * If the value of "clu" is 0, it means cluster 2 which is the first cluster of
+ * the cluster heap.
+ */
+void exfat_clear_bitmap(struct inode *inode, unsigned int clu)
+{
+ int i, b;
+ unsigned int ent_idx;
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_mount_options *opts = &sbi->options;
+
+ WARN_ON(clu < EXFAT_FIRST_CLUSTER);
+ ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
+ i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
+ b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
+
+ clear_bit_le(b, sbi->vol_amap[i]->b_data);
+ exfat_update_bh(sb, sbi->vol_amap[i], IS_DIRSYNC(inode));
+
+ if (opts->discard) {
+ int ret_discard;
+
+ ret_discard = sb_issue_discard(sb,
+ exfat_cluster_to_sector(sbi, clu +
+ EXFAT_RESERVED_CLUSTERS),
+ (1 << sbi->sect_per_clus_bits), GFP_NOFS, 0);
+
+ if (ret_discard == -EOPNOTSUPP) {
+ exfat_msg(sb, KERN_ERR,
+ "discard not supported by device, disabling");
+ opts->discard = 0;
+ }
+ }
+}
+
+/*
+ * If the value of "clu" is 0, it means cluster 2 which is the first cluster of
+ * the cluster heap.
+ */
+unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu)
+{
+ unsigned int i, map_i, map_b, ent_idx;
+ unsigned int clu_base, clu_free;
+ unsigned char k, clu_mask;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ WARN_ON(clu < EXFAT_FIRST_CLUSTER);
+ ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
+ clu_base = BITMAP_ENT_TO_CLUSTER(ent_idx & ~(BITS_PER_BYTE_MASK));
+ clu_mask = IGNORED_BITS_REMAINED(clu, clu_base);
+
+ map_i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
+ map_b = BITMAP_OFFSET_BYTE_IN_SECTOR(sb, ent_idx);
+
+ for (i = EXFAT_FIRST_CLUSTER; i < sbi->num_clusters;
+ i += BITS_PER_BYTE) {
+ k = *(sbi->vol_amap[map_i]->b_data + map_b);
+ if (clu_mask > 0) {
+ k |= clu_mask;
+ clu_mask = 0;
+ }
+ if (k < 0xFF) {
+ clu_free = clu_base + free_bit[k];
+ if (clu_free < sbi->num_clusters)
+ return clu_free;
+ }
+ clu_base += BITS_PER_BYTE;
+
+ if (++map_b >= sb->s_blocksize ||
+ clu_base >= sbi->num_clusters) {
+ if (++map_i >= sbi->map_sectors) {
+ clu_base = EXFAT_FIRST_CLUSTER;
+ map_i = 0;
+ }
+ map_b = 0;
+ }
+ }
+
+ return EXFAT_EOF_CLUSTER;
+}
+
+int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ unsigned int count = 0;
+ unsigned int i, map_i = 0, map_b = 0;
+ unsigned int total_clus = EXFAT_DATA_CLUSTER_COUNT(sbi);
+ unsigned int last_mask = total_clus & BITS_PER_BYTE_MASK;
+ unsigned char clu_bits;
+ const unsigned char last_bit_mask[] = {0, 0b00000001, 0b00000011,
+ 0b00000111, 0b00001111, 0b00011111, 0b00111111, 0b01111111};
+
+ total_clus &= ~last_mask;
+ for (i = 0; i < total_clus; i += BITS_PER_BYTE) {
+ clu_bits = *(sbi->vol_amap[map_i]->b_data + map_b);
+ count += used_bit[clu_bits];
+ if (++map_b >= (unsigned int)sb->s_blocksize) {
+ map_i++;
+ map_b = 0;
+ }
+ }
+
+ if (last_mask) {
+ clu_bits = *(sbi->vol_amap[map_i]->b_data + map_b);
+ clu_bits &= last_bit_mask[last_mask];
+ count += used_bit[clu_bits];
+ }
+
+ *ret_count = count;
+ return 0;
+}
diff --git a/fs/exfat/cache.c b/fs/exfat/cache.c
new file mode 100644
index 000000000000..03d0824fc368
--- /dev/null
+++ b/fs/exfat/cache.c
@@ -0,0 +1,325 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * linux/fs/fat/cache.c
+ *
+ * Written 1992,1993 by Werner Almesberger
+ *
+ * Mar 1999. AV. Changed cache, so that it uses the starting cluster instead
+ * of inode number.
+ * May 1999. AV. Fixed the bogosity with FAT32 (read "FAT28"). Fscking lusers.
+ * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+#include <linux/buffer_head.h>
+
+#include "exfat_raw.h"
+#include "exfat_fs.h"
+
+#define EXFAT_CACHE_VALID 0
+#define EXFAT_MAX_CACHE 16
+
+struct exfat_cache {
+ struct list_head cache_list;
+ unsigned int nr_contig; /* number of contiguous clusters */
+ unsigned int fcluster; /* cluster number in the file. */
+ unsigned int dcluster; /* cluster number on disk. */
+};
+
+struct exfat_cache_id {
+ unsigned int id;
+ unsigned int nr_contig;
+ unsigned int fcluster;
+ unsigned int dcluster;
+};
+
+static struct kmem_cache *exfat_cachep;
+
+static void exfat_cache_init_once(void *c)
+{
+ struct exfat_cache *cache = (struct exfat_cache *)c;
+
+ INIT_LIST_HEAD(&cache->cache_list);
+}
+
+int exfat_cache_init(void)
+{
+ exfat_cachep = kmem_cache_create("exfat_cache",
+ sizeof(struct exfat_cache),
+ 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+ exfat_cache_init_once);
+ if (!exfat_cachep)
+ return -ENOMEM;
+ return 0;
+}
+
+void exfat_cache_shutdown(void)
+{
+ if (!exfat_cachep)
+ return;
+ kmem_cache_destroy(exfat_cachep);
+}
+
+void exfat_cache_init_inode(struct inode *inode)
+{
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+
+ spin_lock_init(&ei->cache_lru_lock);
+ ei->nr_caches = 0;
+ ei->cache_valid_id = EXFAT_CACHE_VALID + 1;
+ INIT_LIST_HEAD(&ei->cache_lru);
+}
+
+static inline struct exfat_cache *exfat_cache_alloc(void)
+{
+ return kmem_cache_alloc(exfat_cachep, GFP_NOFS);
+}
+
+static inline void exfat_cache_free(struct exfat_cache *cache)
+{
+ WARN_ON(!list_empty(&cache->cache_list));
+ kmem_cache_free(exfat_cachep, cache);
+}
+
+static inline void exfat_cache_update_lru(struct inode *inode,
+ struct exfat_cache *cache)
+{
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+
+ if (ei->cache_lru.next != &cache->cache_list)
+ list_move(&cache->cache_list, &ei->cache_lru);
+}
+
+static unsigned int exfat_cache_lookup(struct inode *inode,
+ unsigned int fclus, struct exfat_cache_id *cid,
+ unsigned int *cached_fclus, unsigned int *cached_dclus)
+{
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ static struct exfat_cache nohit = { .fcluster = 0, };
+ struct exfat_cache *hit = &nohit, *p;
+ unsigned int offset = EXFAT_EOF_CLUSTER;
+
+ spin_lock(&ei->cache_lru_lock);
+ list_for_each_entry(p, &ei->cache_lru, cache_list) {
+ /* Find the cache of "fclus" or nearest cache. */
+ if (p->fcluster <= fclus && hit->fcluster < p->fcluster) {
+ hit = p;
+ if (hit->fcluster + hit->nr_contig < fclus) {
+ offset = hit->nr_contig;
+ } else {
+ offset = fclus - hit->fcluster;
+ break;
+ }
+ }
+ }
+ if (hit != &nohit) {
+ exfat_cache_update_lru(inode, hit);
+
+ cid->id = ei->cache_valid_id;
+ cid->nr_contig = hit->nr_contig;
+ cid->fcluster = hit->fcluster;
+ cid->dcluster = hit->dcluster;
+ *cached_fclus = cid->fcluster + offset;
+ *cached_dclus = cid->dcluster + offset;
+ }
+ spin_unlock(&ei->cache_lru_lock);
+
+ return offset;
+}
+
+static struct exfat_cache *exfat_cache_merge(struct inode *inode,
+ struct exfat_cache_id *new)
+{
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct exfat_cache *p;
+
+ list_for_each_entry(p, &ei->cache_lru, cache_list) {
+ /* Find the same part as "new" in cluster-chain. */
+ if (p->fcluster == new->fcluster) {
+ if (new->nr_contig > p->nr_contig)
+ p->nr_contig = new->nr_contig;
+ return p;
+ }
+ }
+ return NULL;
+}
+
+static void exfat_cache_add(struct inode *inode,
+ struct exfat_cache_id *new)
+{
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct exfat_cache *cache, *tmp;
+
+ if (new->fcluster == EXFAT_EOF_CLUSTER) /* dummy cache */
+ return;
+
+ spin_lock(&ei->cache_lru_lock);
+ if (new->id != EXFAT_CACHE_VALID &&
+ new->id != ei->cache_valid_id)
+ goto unlock; /* this cache was invalidated */
+
+ cache = exfat_cache_merge(inode, new);
+ if (cache == NULL) {
+ if (ei->nr_caches < EXFAT_MAX_CACHE) {
+ ei->nr_caches++;
+ spin_unlock(&ei->cache_lru_lock);
+
+ tmp = exfat_cache_alloc();
+ if (!tmp) {
+ spin_lock(&ei->cache_lru_lock);
+ ei->nr_caches--;
+ spin_unlock(&ei->cache_lru_lock);
+ return;
+ }
+
+ spin_lock(&ei->cache_lru_lock);
+ cache = exfat_cache_merge(inode, new);
+ if (cache != NULL) {
+ ei->nr_caches--;
+ exfat_cache_free(tmp);
+ goto out_update_lru;
+ }
+ cache = tmp;
+ } else {
+ struct list_head *p = ei->cache_lru.prev;
+
+ cache = list_entry(p,
+ struct exfat_cache, cache_list);
+ }
+ cache->fcluster = new->fcluster;
+ cache->dcluster = new->dcluster;
+ cache->nr_contig = new->nr_contig;
+ }
+out_update_lru:
+ exfat_cache_update_lru(inode, cache);
+unlock:
+ spin_unlock(&ei->cache_lru_lock);
+}
+
+/*
+ * Cache invalidation occurs rarely, thus the LRU chain is not updated. It
+ * fixes itself after a while.
+ */
+static void __exfat_cache_inval_inode(struct inode *inode)
+{
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct exfat_cache *cache;
+
+ while (!list_empty(&ei->cache_lru)) {
+ cache = list_entry(ei->cache_lru.next,
+ struct exfat_cache, cache_list);
+ list_del_init(&cache->cache_list);
+ ei->nr_caches--;
+ exfat_cache_free(cache);
+ }
+ /* Update. The copy of caches before this id is discarded. */
+ ei->cache_valid_id++;
+ if (ei->cache_valid_id == EXFAT_CACHE_VALID)
+ ei->cache_valid_id++;
+}
+
+void exfat_cache_inval_inode(struct inode *inode)
+{
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+
+ spin_lock(&ei->cache_lru_lock);
+ __exfat_cache_inval_inode(inode);
+ spin_unlock(&ei->cache_lru_lock);
+}
+
+static inline int cache_contiguous(struct exfat_cache_id *cid,
+ unsigned int dclus)
+{
+ cid->nr_contig++;
+ return cid->dcluster + cid->nr_contig == dclus;
+}
+
+static inline void cache_init(struct exfat_cache_id *cid,
+ unsigned int fclus, unsigned int dclus)
+{
+ cid->id = EXFAT_CACHE_VALID;
+ cid->fcluster = fclus;
+ cid->dcluster = dclus;
+ cid->nr_contig = 0;
+}
+
+int exfat_get_cluster(struct inode *inode, unsigned int cluster,
+ unsigned int *fclus, unsigned int *dclus,
+ unsigned int *last_dclus, int allow_eof)
+{
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ unsigned int limit = sbi->num_clusters;
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct exfat_cache_id cid;
+ unsigned int content;
+
+ if (ei->start_clu == EXFAT_FREE_CLUSTER) {
+ exfat_fs_error(sb,
+ "invalid access to exfat cache (entry 0x%08x)",
+ ei->start_clu);
+ return -EIO;
+ }
+
+ *fclus = 0;
+ *dclus = ei->start_clu;
+ *last_dclus = *dclus;
+
+ /*
+ * Don`t use exfat_cache if zero offset or non-cluster allocation
+ */
+ if (cluster == 0 || *dclus == EXFAT_EOF_CLUSTER)
+ return 0;
+
+ cache_init(&cid, EXFAT_EOF_CLUSTER, EXFAT_EOF_CLUSTER);
+
+ if (exfat_cache_lookup(inode, cluster, &cid, fclus, dclus) ==
+ EXFAT_EOF_CLUSTER) {
+ /*
+ * dummy, always not contiguous
+ * This is reinitialized by cache_init(), later.
+ */
+ WARN_ON(cid.id != EXFAT_CACHE_VALID ||
+ cid.fcluster != EXFAT_EOF_CLUSTER ||
+ cid.dcluster != EXFAT_EOF_CLUSTER ||
+ cid.nr_contig != 0);
+ }
+
+ if (*fclus == cluster)
+ return 0;
+
+ while (*fclus < cluster) {
+ /* prevent the infinite loop of cluster chain */
+ if (*fclus > limit) {
+ exfat_fs_error(sb,
+ "detected the cluster chain loop (i_pos %u)",
+ (*fclus));
+ return -EIO;
+ }
+
+ if (exfat_ent_get(sb, *dclus, &content))
+ return -EIO;
+
+ *last_dclus = *dclus;
+ *dclus = content;
+ (*fclus)++;
+
+ if (content == EXFAT_EOF_CLUSTER) {
+ if (!allow_eof) {
+ exfat_fs_error(sb,
+ "invalid cluster chain (i_pos %u, last_clus 0x%08x is EOF)",
+ *fclus, (*last_dclus));
+ return -EIO;
+ }
+
+ break;
+ }
+
+ if (!cache_contiguous(&cid, *dclus))
+ cache_init(&cid, *fclus, *dclus);
+ }
+
+ exfat_cache_add(inode, &cid);
+ return 0;
+}
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
new file mode 100644
index 000000000000..4b91afb0f051
--- /dev/null
+++ b/fs/exfat/dir.c
@@ -0,0 +1,1238 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+
+#include "exfat_raw.h"
+#include "exfat_fs.h"
+
+static int exfat_extract_uni_name(struct exfat_dentry *ep,
+ unsigned short *uniname)
+{
+ int i, len = 0;
+
+ for (i = 0; i < EXFAT_FILE_NAME_LEN; i++) {
+ *uniname = le16_to_cpu(ep->dentry.name.unicode_0_14[i]);
+ if (*uniname == 0x0)
+ return len;
+ uniname++;
+ len++;
+ }
+
+ *uniname = 0x0;
+ return len;
+
+}
+
+static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
+ struct exfat_chain *p_dir, int entry, unsigned short *uniname)
+{
+ int i;
+ struct exfat_dentry *ep;
+ struct exfat_entry_set_cache *es;
+
+ es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES, &ep);
+ if (!es)
+ return;
+
+ if (es->num_entries < 3)
+ goto free_es;
+
+ ep += 2;
+
+ /*
+ * First entry : file entry
+ * Second entry : stream-extension entry
+ * Third entry : first file-name entry
+ * So, the index of first file-name dentry should start from 2.
+ */
+ for (i = 2; i < es->num_entries; i++, ep++) {
+ /* end of name entry */
+ if (exfat_get_entry_type(ep) != TYPE_EXTEND)
+ goto free_es;
+
+ exfat_extract_uni_name(ep, uniname);
+ uniname += EXFAT_FILE_NAME_LEN;
+ }
+
+free_es:
+ kfree(es);
+}
+
+/* read a directory entry from the opened directory */
+static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry)
+{
+ int i, dentries_per_clu, dentries_per_clu_bits = 0;
+ unsigned int type, clu_offset;
+ sector_t sector;
+ struct exfat_chain dir, clu;
+ struct exfat_uni_name uni_name;
+ struct exfat_dentry *ep;
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ unsigned int dentry = ei->rwoffset & 0xFFFFFFFF;
+ struct buffer_head *bh;
+
+ /* check if the given file ID is opened */
+ if (ei->type != TYPE_DIR)
+ return -EPERM;
+
+ if (ei->entry == -1)
+ exfat_chain_set(&dir, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
+ else
+ exfat_chain_set(&dir, ei->start_clu,
+ EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags);
+
+ dentries_per_clu = sbi->dentries_per_clu;
+ dentries_per_clu_bits = ilog2(dentries_per_clu);
+
+ clu_offset = dentry >> dentries_per_clu_bits;
+ exfat_chain_dup(&clu, &dir);
+
+ if (clu.flags == ALLOC_NO_FAT_CHAIN) {
+ clu.dir += clu_offset;
+ clu.size -= clu_offset;
+ } else {
+ /* hint_information */
+ if (clu_offset > 0 && ei->hint_bmap.off != EXFAT_EOF_CLUSTER &&
+ ei->hint_bmap.off > 0 && clu_offset >= ei->hint_bmap.off) {
+ clu_offset -= ei->hint_bmap.off;
+ clu.dir = ei->hint_bmap.clu;
+ }
+
+ while (clu_offset > 0) {
+ if (exfat_get_next_cluster(sb, &(clu.dir)))
+ return -EIO;
+
+ clu_offset--;
+ }
+ }
+
+ while (clu.dir != EXFAT_EOF_CLUSTER) {
+ i = dentry & (dentries_per_clu - 1);
+
+ for ( ; i < dentries_per_clu; i++, dentry++) {
+ ep = exfat_get_dentry(sb, &clu, i, &bh, &sector);
+ if (!ep)
+ return -EIO;
+
+ type = exfat_get_entry_type(ep);
+ if (type == TYPE_UNUSED) {
+ brelse(bh);
+ break;
+ }
+
+ if (type != TYPE_FILE && type != TYPE_DIR) {
+ brelse(bh);
+ continue;
+ }
+
+ dir_entry->attr = le16_to_cpu(ep->dentry.file.attr);
+ exfat_get_entry_time(sbi, &dir_entry->crtime,
+ ep->dentry.file.create_tz,
+ ep->dentry.file.create_time,
+ ep->dentry.file.create_date,
+ ep->dentry.file.create_time_ms);
+ exfat_get_entry_time(sbi, &dir_entry->mtime,
+ ep->dentry.file.modify_tz,
+ ep->dentry.file.modify_time,
+ ep->dentry.file.modify_date,
+ ep->dentry.file.modify_time_ms);
+ exfat_get_entry_time(sbi, &dir_entry->atime,
+ ep->dentry.file.access_tz,
+ ep->dentry.file.access_time,
+ ep->dentry.file.access_date,
+ 0);
+
+ *uni_name.name = 0x0;
+ exfat_get_uniname_from_ext_entry(sb, &dir, dentry,
+ uni_name.name);
+ exfat_utf16_to_nls(sb, &uni_name,
+ dir_entry->namebuf.lfn,
+ dir_entry->namebuf.lfnbuf_len);
+ brelse(bh);
+
+ ep = exfat_get_dentry(sb, &clu, i + 1, &bh, NULL);
+ if (!ep)
+ return -EIO;
+ dir_entry->size =
+ le64_to_cpu(ep->dentry.stream.valid_size);
+ brelse(bh);
+
+ ei->hint_bmap.off = dentry >> dentries_per_clu_bits;
+ ei->hint_bmap.clu = clu.dir;
+
+ ei->rwoffset = ++dentry;
+ return 0;
+ }
+
+ if (clu.flags == ALLOC_NO_FAT_CHAIN) {
+ if (--clu.size > 0)
+ clu.dir++;
+ else
+ clu.dir = EXFAT_EOF_CLUSTER;
+ } else {
+ if (exfat_get_next_cluster(sb, &(clu.dir)))
+ return -EIO;
+ }
+ }
+
+ dir_entry->namebuf.lfn[0] = '\0';
+ ei->rwoffset = dentry;
+ return 0;
+}
+
+static void exfat_init_namebuf(struct exfat_dentry_namebuf *nb)
+{
+ nb->lfn = NULL;
+ nb->lfnbuf_len = 0;
+}
+
+static int exfat_alloc_namebuf(struct exfat_dentry_namebuf *nb)
+{
+ nb->lfn = __getname();
+ if (!nb->lfn)
+ return -ENOMEM;
+ nb->lfnbuf_len = MAX_VFSNAME_BUF_SIZE;
+ return 0;
+}
+
+static void exfat_free_namebuf(struct exfat_dentry_namebuf *nb)
+{
+ if (!nb->lfn)
+ return;
+
+ __putname(nb->lfn);
+ exfat_init_namebuf(nb);
+}
+
+/* skip iterating emit_dots when dir is empty */
+#define ITER_POS_FILLED_DOTS (2)
+static int exfat_iterate(struct file *filp, struct dir_context *ctx)
+{
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct inode *tmp;
+ struct exfat_dir_entry de;
+ struct exfat_dentry_namebuf *nb = &(de.namebuf);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ unsigned long inum;
+ loff_t cpos, i_pos;
+ int err = 0, fake_offset = 0;
+
+ exfat_init_namebuf(nb);
+ mutex_lock(&EXFAT_SB(sb)->s_lock);
+
+ cpos = ctx->pos;
+ if (!dir_emit_dots(filp, ctx))
+ goto unlock;
+
+ if (ctx->pos == ITER_POS_FILLED_DOTS) {
+ cpos = 0;
+ fake_offset = 1;
+ }
+
+ if (cpos & (DENTRY_SIZE - 1)) {
+ err = -ENOENT;
+ goto unlock;
+ }
+
+ /* name buffer should be allocated before use */
+ err = exfat_alloc_namebuf(nb);
+ if (err)
+ goto unlock;
+get_new:
+ ei->rwoffset = EXFAT_B_TO_DEN(cpos);
+
+ if (cpos >= i_size_read(inode))
+ goto end_of_dir;
+
+ err = exfat_readdir(inode, &de);
+ if (err) {
+ /*
+ * At least we tried to read a sector. Move cpos to next sector
+ * position (should be aligned).
+ */
+ if (err == -EIO) {
+ cpos += 1 << (sb->s_blocksize_bits);
+ cpos &= ~(sb->s_blocksize - 1);
+ }
+
+ err = -EIO;
+ goto end_of_dir;
+ }
+
+ cpos = EXFAT_DEN_TO_B(ei->rwoffset);
+
+ if (!nb->lfn[0])
+ goto end_of_dir;
+
+ i_pos = ((loff_t)ei->start_clu << 32) |
+ ((ei->rwoffset - 1) & 0xffffffff);
+ tmp = exfat_iget(sb, i_pos);
+ if (tmp) {
+ inum = tmp->i_ino;
+ iput(tmp);
+ } else {
+ inum = iunique(sb, EXFAT_ROOT_INO);
+ }
+
+ /*
+ * Before calling dir_emit(), sb_lock should be released.
+ * Because page fault can occur in dir_emit() when the size
+ * of buffer given from user is larger than one page size.
+ */
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ if (!dir_emit(ctx, nb->lfn, strlen(nb->lfn), inum,
+ (de.attr & ATTR_SUBDIR) ? DT_DIR : DT_REG))
+ goto out_unlocked;
+ mutex_lock(&EXFAT_SB(sb)->s_lock);
+ ctx->pos = cpos;
+ goto get_new;
+
+end_of_dir:
+ if (!cpos && fake_offset)
+ cpos = ITER_POS_FILLED_DOTS;
+ ctx->pos = cpos;
+unlock:
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+out_unlocked:
+ /*
+ * To improve performance, free namebuf after unlock sb_lock.
+ * If namebuf is not allocated, this function do nothing
+ */
+ exfat_free_namebuf(nb);
+ return err;
+}
+
+const struct file_operations exfat_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate = exfat_iterate,
+ .fsync = generic_file_fsync,
+};
+
+int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu)
+{
+ int ret;
+
+ exfat_chain_set(clu, EXFAT_EOF_CLUSTER, 0, ALLOC_NO_FAT_CHAIN);
+
+ ret = exfat_alloc_cluster(inode, 1, clu);
+ if (ret)
+ return ret;
+
+ return exfat_zeroed_cluster(inode, clu->dir);
+}
+
+int exfat_calc_num_entries(struct exfat_uni_name *p_uniname)
+{
+ int len;
+
+ len = p_uniname->name_len;
+ if (len == 0)
+ return -EINVAL;
+
+ /* 1 file entry + 1 stream entry + name entries */
+ return ((len - 1) / EXFAT_FILE_NAME_LEN + 3);
+}
+
+unsigned int exfat_get_entry_type(struct exfat_dentry *ep)
+{
+ if (ep->type == EXFAT_UNUSED)
+ return TYPE_UNUSED;
+ if (IS_EXFAT_DELETED(ep->type))
+ return TYPE_DELETED;
+ if (ep->type == EXFAT_INVAL)
+ return TYPE_INVALID;
+ if (IS_EXFAT_CRITICAL_PRI(ep->type)) {
+ if (ep->type == EXFAT_BITMAP)
+ return TYPE_BITMAP;
+ if (ep->type == EXFAT_UPCASE)
+ return TYPE_UPCASE;
+ if (ep->type == EXFAT_VOLUME)
+ return TYPE_VOLUME;
+ if (ep->type == EXFAT_FILE) {
+ if (le16_to_cpu(ep->dentry.file.attr) & ATTR_SUBDIR)
+ return TYPE_DIR;
+ return TYPE_FILE;
+ }
+ return TYPE_CRITICAL_PRI;
+ }
+ if (IS_EXFAT_BENIGN_PRI(ep->type)) {
+ if (ep->type == EXFAT_GUID)
+ return TYPE_GUID;
+ if (ep->type == EXFAT_PADDING)
+ return TYPE_PADDING;
+ if (ep->type == EXFAT_ACLTAB)
+ return TYPE_ACLTAB;
+ return TYPE_BENIGN_PRI;
+ }
+ if (IS_EXFAT_CRITICAL_SEC(ep->type)) {
+ if (ep->type == EXFAT_STREAM)
+ return TYPE_STREAM;
+ if (ep->type == EXFAT_NAME)
+ return TYPE_EXTEND;
+ if (ep->type == EXFAT_ACL)
+ return TYPE_ACL;
+ return TYPE_CRITICAL_SEC;
+ }
+ return TYPE_BENIGN_SEC;
+}
+
+static void exfat_set_entry_type(struct exfat_dentry *ep, unsigned int type)
+{
+ if (type == TYPE_UNUSED) {
+ ep->type = EXFAT_UNUSED;
+ } else if (type == TYPE_DELETED) {
+ ep->type &= EXFAT_DELETE;
+ } else if (type == TYPE_STREAM) {
+ ep->type = EXFAT_STREAM;
+ } else if (type == TYPE_EXTEND) {
+ ep->type = EXFAT_NAME;
+ } else if (type == TYPE_BITMAP) {
+ ep->type = EXFAT_BITMAP;
+ } else if (type == TYPE_UPCASE) {
+ ep->type = EXFAT_UPCASE;
+ } else if (type == TYPE_VOLUME) {
+ ep->type = EXFAT_VOLUME;
+ } else if (type == TYPE_DIR) {
+ ep->type = EXFAT_FILE;
+ ep->dentry.file.attr = cpu_to_le16(ATTR_SUBDIR);
+ } else if (type == TYPE_FILE) {
+ ep->type = EXFAT_FILE;
+ ep->dentry.file.attr = cpu_to_le16(ATTR_ARCHIVE);
+ }
+}
+
+static void exfat_init_stream_entry(struct exfat_dentry *ep,
+ unsigned char flags, unsigned int start_clu,
+ unsigned long long size)
+{
+ exfat_set_entry_type(ep, TYPE_STREAM);
+ ep->dentry.stream.flags = flags;
+ ep->dentry.stream.start_clu = cpu_to_le32(start_clu);
+ ep->dentry.stream.valid_size = cpu_to_le64(size);
+ ep->dentry.stream.size = cpu_to_le64(size);
+}
+
+static void exfat_init_name_entry(struct exfat_dentry *ep,
+ unsigned short *uniname)
+{
+ int i;
+
+ exfat_set_entry_type(ep, TYPE_EXTEND);
+ ep->dentry.name.flags = 0x0;
+
+ for (i = 0; i < EXFAT_FILE_NAME_LEN; i++) {
+ ep->dentry.name.unicode_0_14[i] = cpu_to_le16(*uniname);
+ if (*uniname == 0x0)
+ break;
+ uniname++;
+ }
+}
+
+int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
+ int entry, unsigned int type, unsigned int start_clu,
+ unsigned long long size)
+{
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct timespec64 ts = current_time(inode);
+ sector_t sector;
+ struct exfat_dentry *ep;
+ struct buffer_head *bh;
+
+ /*
+ * We cannot use exfat_get_dentry_set here because file ep is not
+ * initialized yet.
+ */
+ ep = exfat_get_dentry(sb, p_dir, entry, &bh, &sector);
+ if (!ep)
+ return -EIO;
+
+ exfat_set_entry_type(ep, type);
+ exfat_set_entry_time(sbi, &ts,
+ &ep->dentry.file.create_tz,
+ &ep->dentry.file.create_time,
+ &ep->dentry.file.create_date,
+ &ep->dentry.file.create_time_ms);
+ exfat_set_entry_time(sbi, &ts,
+ &ep->dentry.file.modify_tz,
+ &ep->dentry.file.modify_time,
+ &ep->dentry.file.modify_date,
+ &ep->dentry.file.modify_time_ms);
+ exfat_set_entry_time(sbi, &ts,
+ &ep->dentry.file.access_tz,
+ &ep->dentry.file.access_time,
+ &ep->dentry.file.access_date,
+ NULL);
+
+ exfat_update_bh(sb, bh, IS_DIRSYNC(inode));
+ brelse(bh);
+
+ ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, &sector);
+ if (!ep)
+ return -EIO;
+
+ exfat_init_stream_entry(ep,
+ (type == TYPE_FILE) ? ALLOC_FAT_CHAIN : ALLOC_NO_FAT_CHAIN,
+ start_clu, size);
+ exfat_update_bh(sb, bh, IS_DIRSYNC(inode));
+ brelse(bh);
+
+ return 0;
+}
+
+int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
+ int entry)
+{
+ struct super_block *sb = inode->i_sb;
+ int ret = 0;
+ int i, num_entries;
+ sector_t sector;
+ unsigned short chksum;
+ struct exfat_dentry *ep, *fep;
+ struct buffer_head *fbh, *bh;
+
+ fep = exfat_get_dentry(sb, p_dir, entry, &fbh, &sector);
+ if (!fep)
+ return -EIO;
+
+ num_entries = fep->dentry.file.num_ext + 1;
+ chksum = exfat_calc_chksum_2byte(fep, DENTRY_SIZE, 0, CS_DIR_ENTRY);
+
+ for (i = 1; i < num_entries; i++) {
+ ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, NULL);
+ if (!ep) {
+ ret = -EIO;
+ goto release_fbh;
+ }
+ chksum = exfat_calc_chksum_2byte(ep, DENTRY_SIZE, chksum,
+ CS_DEFAULT);
+ brelse(bh);
+ }
+
+ fep->dentry.file.checksum = cpu_to_le16(chksum);
+ exfat_update_bh(sb, fbh, IS_DIRSYNC(inode));
+release_fbh:
+ brelse(fbh);
+ return ret;
+}
+
+int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
+ int entry, int num_entries, struct exfat_uni_name *p_uniname)
+{
+ struct super_block *sb = inode->i_sb;
+ int i;
+ sector_t sector;
+ unsigned short *uniname = p_uniname->name;
+ struct exfat_dentry *ep;
+ struct buffer_head *bh;
+ int sync = IS_DIRSYNC(inode);
+
+ ep = exfat_get_dentry(sb, p_dir, entry, &bh, &sector);
+ if (!ep)
+ return -EIO;
+
+ ep->dentry.file.num_ext = (unsigned char)(num_entries - 1);
+ exfat_update_bh(sb, bh, sync);
+ brelse(bh);
+
+ ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, &sector);
+ if (!ep)
+ return -EIO;
+
+ ep->dentry.stream.name_len = p_uniname->name_len;
+ ep->dentry.stream.name_hash = cpu_to_le16(p_uniname->name_hash);
+ exfat_update_bh(sb, bh, sync);
+ brelse(bh);
+
+ for (i = EXFAT_FIRST_CLUSTER; i < num_entries; i++) {
+ ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, &sector);
+ if (!ep)
+ return -EIO;
+
+ exfat_init_name_entry(ep, uniname);
+ exfat_update_bh(sb, bh, sync);
+ brelse(bh);
+ uniname += EXFAT_FILE_NAME_LEN;
+ }
+
+ exfat_update_dir_chksum(inode, p_dir, entry);
+ return 0;
+}
+
+int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir,
+ int entry, int order, int num_entries)
+{
+ struct super_block *sb = inode->i_sb;
+ int i;
+ sector_t sector;
+ struct exfat_dentry *ep;
+ struct buffer_head *bh;
+
+ for (i = order; i < num_entries; i++) {
+ ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, &sector);
+ if (!ep)
+ return -EIO;
+
+ exfat_set_entry_type(ep, TYPE_DELETED);
+ exfat_update_bh(sb, bh, IS_DIRSYNC(inode));
+ brelse(bh);
+ }
+
+ return 0;
+}
+
+int exfat_update_dir_chksum_with_entry_set(struct super_block *sb,
+ struct exfat_entry_set_cache *es, int sync)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct buffer_head *bh;
+ sector_t sec = es->sector;
+ unsigned int off = es->offset;
+ int chksum_type = CS_DIR_ENTRY, i, num_entries = es->num_entries;
+ unsigned int buf_off = (off - es->offset);
+ unsigned int remaining_byte_in_sector, copy_entries, clu;
+ unsigned short chksum = 0;
+
+ for (i = 0; i < num_entries; i++) {
+ chksum = exfat_calc_chksum_2byte(&es->entries[i], DENTRY_SIZE,
+ chksum, chksum_type);
+ chksum_type = CS_DEFAULT;
+ }
+
+ es->entries[0].dentry.file.checksum = cpu_to_le16(chksum);
+
+ while (num_entries) {
+ /* write per sector base */
+ remaining_byte_in_sector = (1 << sb->s_blocksize_bits) - off;
+ copy_entries = min_t(int,
+ EXFAT_B_TO_DEN(remaining_byte_in_sector),
+ num_entries);
+ bh = sb_bread(sb, sec);
+ if (!bh)
+ goto err_out;
+ memcpy(bh->b_data + off,
+ (unsigned char *)&es->entries[0] + buf_off,
+ EXFAT_DEN_TO_B(copy_entries));
+ exfat_update_bh(sb, bh, sync);
+ brelse(bh);
+ num_entries -= copy_entries;
+
+ if (num_entries) {
+ /* get next sector */
+ if (exfat_is_last_sector_in_cluster(sbi, sec)) {
+ clu = exfat_sector_to_cluster(sbi, sec);
+ if (es->alloc_flag == ALLOC_NO_FAT_CHAIN)
+ clu++;
+ else if (exfat_get_next_cluster(sb, &clu))
+ goto err_out;
+ sec = exfat_cluster_to_sector(sbi, clu);
+ } else {
+ sec++;
+ }
+ off = 0;
+ buf_off += EXFAT_DEN_TO_B(copy_entries);
+ }
+ }
+
+ return 0;
+err_out:
+ return -EIO;
+}
+
+static int exfat_walk_fat_chain(struct super_block *sb,
+ struct exfat_chain *p_dir, unsigned int byte_offset,
+ unsigned int *clu)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ unsigned int clu_offset;
+ unsigned int cur_clu;
+
+ clu_offset = EXFAT_B_TO_CLU(byte_offset, sbi);
+ cur_clu = p_dir->dir;
+
+ if (p_dir->flags == ALLOC_NO_FAT_CHAIN) {
+ cur_clu += clu_offset;
+ } else {
+ while (clu_offset > 0) {
+ if (exfat_get_next_cluster(sb, &cur_clu))
+ return -EIO;
+ if (cur_clu == EXFAT_EOF_CLUSTER) {
+ exfat_fs_error(sb,
+ "invalid dentry access beyond EOF (clu : %u, eidx : %d)",
+ p_dir->dir,
+ EXFAT_B_TO_DEN(byte_offset));
+ return -EIO;
+ }
+ clu_offset--;
+ }
+ }
+
+ *clu = cur_clu;
+ return 0;
+}
+
+int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir,
+ int entry, sector_t *sector, int *offset)
+{
+ int ret;
+ unsigned int off, clu = 0;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ off = EXFAT_DEN_TO_B(entry);
+
+ ret = exfat_walk_fat_chain(sb, p_dir, off, &clu);
+ if (ret)
+ return ret;
+
+ /* byte offset in cluster */
+ off = EXFAT_CLU_OFFSET(off, sbi);
+
+ /* byte offset in sector */
+ *offset = EXFAT_BLK_OFFSET(off, sb);
+
+ /* sector offset in cluster */
+ *sector = EXFAT_B_TO_BLK(off, sb);
+ *sector += exfat_cluster_to_sector(sbi, clu);
+ return 0;
+}
+
+#define EXFAT_MAX_RA_SIZE (128*1024)
+static int exfat_dir_readahead(struct super_block *sb, sector_t sec)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct buffer_head *bh;
+ unsigned int max_ra_count = EXFAT_MAX_RA_SIZE >> sb->s_blocksize_bits;
+ unsigned int page_ra_count = PAGE_SIZE >> sb->s_blocksize_bits;
+ unsigned int adj_ra_count = max(sbi->sect_per_clus, page_ra_count);
+ unsigned int ra_count = min(adj_ra_count, max_ra_count);
+
+ /* Read-ahead is not required */
+ if (sbi->sect_per_clus == 1)
+ return 0;
+
+ if (sec < sbi->data_start_sector) {
+ exfat_msg(sb, KERN_ERR,
+ "requested sector is invalid(sect:%llu, root:%llu)",
+ (unsigned long long)sec, sbi->data_start_sector);
+ return -EIO;
+ }
+
+ /* Not sector aligned with ra_count, resize ra_count to page size */
+ if ((sec - sbi->data_start_sector) & (ra_count - 1))
+ ra_count = page_ra_count;
+
+ bh = sb_find_get_block(sb, sec);
+ if (!bh || !buffer_uptodate(bh)) {
+ unsigned int i;
+
+ for (i = 0; i < ra_count; i++)
+ sb_breadahead(sb, (sector_t)(sec + i));
+ }
+ brelse(bh);
+ return 0;
+}
+
+struct exfat_dentry *exfat_get_dentry(struct super_block *sb,
+ struct exfat_chain *p_dir, int entry, struct buffer_head **bh,
+ sector_t *sector)
+{
+ unsigned int dentries_per_page = EXFAT_B_TO_DEN(PAGE_SIZE);
+ int off;
+ sector_t sec;
+
+ if (p_dir->dir == DIR_DELETED) {
+ exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry\n");
+ return NULL;
+ }
+
+ if (exfat_find_location(sb, p_dir, entry, &sec, &off))
+ return NULL;
+
+ if (p_dir->dir != EXFAT_FREE_CLUSTER &&
+ !(entry & (dentries_per_page - 1)))
+ exfat_dir_readahead(sb, sec);
+
+ *bh = sb_bread(sb, sec);
+ if (!*bh)
+ return NULL;
+
+ if (sector)
+ *sector = sec;
+ return (struct exfat_dentry *)((*bh)->b_data + off);
+}
+
+enum exfat_validate_dentry_mode {
+ ES_MODE_STARTED,
+ ES_MODE_GET_FILE_ENTRY,
+ ES_MODE_GET_STRM_ENTRY,
+ ES_MODE_GET_NAME_ENTRY,
+ ES_MODE_GET_CRITICAL_SEC_ENTRY,
+};
+
+static bool exfat_validate_entry(unsigned int type,
+ enum exfat_validate_dentry_mode *mode)
+{
+ if (type == TYPE_UNUSED || type == TYPE_DELETED)
+ return false;
+
+ switch (*mode) {
+ case ES_MODE_STARTED:
+ if (type != TYPE_FILE && type != TYPE_DIR)
+ return false;
+ *mode = ES_MODE_GET_FILE_ENTRY;
+ return true;
+ case ES_MODE_GET_FILE_ENTRY:
+ if (type != TYPE_STREAM)
+ return false;
+ *mode = ES_MODE_GET_STRM_ENTRY;
+ return true;
+ case ES_MODE_GET_STRM_ENTRY:
+ if (type != TYPE_EXTEND)
+ return false;
+ *mode = ES_MODE_GET_NAME_ENTRY;
+ return true;
+ case ES_MODE_GET_NAME_ENTRY:
+ if (type == TYPE_STREAM)
+ return false;
+ if (type != TYPE_EXTEND) {
+ if (!(type & TYPE_CRITICAL_SEC))
+ return false;
+ *mode = ES_MODE_GET_CRITICAL_SEC_ENTRY;
+ }
+ return true;
+ case ES_MODE_GET_CRITICAL_SEC_ENTRY:
+ if (type == TYPE_EXTEND || type == TYPE_STREAM)
+ return false;
+ if ((type & TYPE_CRITICAL_SEC) != TYPE_CRITICAL_SEC)
+ return false;
+ return true;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+}
+
+/*
+ * Returns a set of dentries for a file or dir.
+ *
+ * Note that this is a copy (dump) of dentries so that user should
+ * call write_entry_set() to apply changes made in this entry set
+ * to the real device.
+ *
+ * in:
+ * sb+p_dir+entry: indicates a file/dir
+ * type: specifies how many dentries should be included.
+ * out:
+ * file_ep: will point the first dentry(= file dentry) on success
+ * return:
+ * pointer of entry set on success,
+ * NULL on failure.
+ */
+struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
+ struct exfat_chain *p_dir, int entry, unsigned int type,
+ struct exfat_dentry **file_ep)
+{
+ int ret;
+ unsigned int off, byte_offset, clu = 0;
+ unsigned int entry_type;
+ sector_t sec;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_entry_set_cache *es;
+ struct exfat_dentry *ep, *pos;
+ unsigned char num_entries;
+ enum exfat_validate_dentry_mode mode = ES_MODE_STARTED;
+ struct buffer_head *bh;
+
+ if (p_dir->dir == DIR_DELETED) {
+ exfat_msg(sb, KERN_ERR, "access to deleted dentry\n");
+ return NULL;
+ }
+
+ byte_offset = EXFAT_DEN_TO_B(entry);
+ ret = exfat_walk_fat_chain(sb, p_dir, byte_offset, &clu);
+ if (ret)
+ return NULL;
+
+ /* byte offset in cluster */
+ byte_offset = EXFAT_CLU_OFFSET(byte_offset, sbi);
+
+ /* byte offset in sector */
+ off = EXFAT_BLK_OFFSET(byte_offset, sb);
+
+ /* sector offset in cluster */
+ sec = EXFAT_B_TO_BLK(byte_offset, sb);
+ sec += exfat_cluster_to_sector(sbi, clu);
+
+ bh = sb_bread(sb, sec);
+ if (!bh)
+ return NULL;
+
+ ep = (struct exfat_dentry *)(bh->b_data + off);
+ entry_type = exfat_get_entry_type(ep);
+
+ if (entry_type != TYPE_FILE && entry_type != TYPE_DIR)
+ goto release_bh;
+
+ num_entries = type == ES_ALL_ENTRIES ?
+ ep->dentry.file.num_ext + 1 : type;
+ es = kmalloc(struct_size(es, entries, num_entries), GFP_KERNEL);
+ if (!es)
+ goto release_bh;
+
+ es->num_entries = num_entries;
+ es->sector = sec;
+ es->offset = off;
+ es->alloc_flag = p_dir->flags;
+
+ pos = &es->entries[0];
+
+ while (num_entries) {
+ if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
+ goto free_es;
+
+ /* copy dentry */
+ memcpy(pos, ep, sizeof(struct exfat_dentry));
+
+ if (--num_entries == 0)
+ break;
+
+ if (((off + DENTRY_SIZE) & (sb->s_blocksize - 1)) <
+ (off & (sb->s_blocksize - 1))) {
+ /* get the next sector */
+ if (exfat_is_last_sector_in_cluster(sbi, sec)) {
+ if (es->alloc_flag == ALLOC_NO_FAT_CHAIN)
+ clu++;
+ else if (exfat_get_next_cluster(sb, &clu))
+ goto free_es;
+ sec = exfat_cluster_to_sector(sbi, clu);
+ } else {
+ sec++;
+ }
+
+ brelse(bh);
+ bh = sb_bread(sb, sec);
+ if (!bh)
+ goto free_es;
+ off = 0;
+ ep = (struct exfat_dentry *)bh->b_data;
+ } else {
+ ep++;
+ off += DENTRY_SIZE;
+ }
+ pos++;
+ }
+
+ if (file_ep)
+ *file_ep = &es->entries[0];
+ brelse(bh);
+ return es;
+
+free_es:
+ kfree(es);
+release_bh:
+ brelse(bh);
+ return NULL;
+}
+
+enum {
+ DIRENT_STEP_FILE,
+ DIRENT_STEP_STRM,
+ DIRENT_STEP_NAME,
+ DIRENT_STEP_SECD,
+};
+
+/*
+ * return values:
+ * >= 0 : return dir entiry position with the name in dir
+ * -EEXIST : (root dir, ".") it is the root dir itself
+ * -ENOENT : entry with the name does not exist
+ * -EIO : I/O error
+ */
+int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
+ struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname,
+ int num_entries, unsigned int type)
+{
+ int i, rewind = 0, dentry = 0, end_eidx = 0, num_ext = 0, len;
+ int order, step, name_len = 0;
+ int dentries_per_clu, num_empty = 0;
+ unsigned int entry_type;
+ unsigned short *uniname = NULL;
+ struct exfat_chain clu;
+ struct exfat_hint *hint_stat = &ei->hint_stat;
+ struct exfat_hint_femp candi_empty;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ dentries_per_clu = sbi->dentries_per_clu;
+
+ exfat_chain_dup(&clu, p_dir);
+
+ if (hint_stat->eidx) {
+ clu.dir = hint_stat->clu;
+ dentry = hint_stat->eidx;
+ end_eidx = dentry;
+ }
+
+ candi_empty.eidx = EXFAT_HINT_NONE;
+rewind:
+ order = 0;
+ step = DIRENT_STEP_FILE;
+ while (clu.dir != EXFAT_EOF_CLUSTER) {
+ i = dentry & (dentries_per_clu - 1);
+ for (; i < dentries_per_clu; i++, dentry++) {
+ struct exfat_dentry *ep;
+ struct buffer_head *bh;
+
+ if (rewind && dentry == end_eidx)
+ goto not_found;
+
+ ep = exfat_get_dentry(sb, &clu, i, &bh, NULL);
+ if (!ep)
+ return -EIO;
+
+ entry_type = exfat_get_entry_type(ep);
+
+ if (entry_type == TYPE_UNUSED ||
+ entry_type == TYPE_DELETED) {
+ step = DIRENT_STEP_FILE;
+
+ num_empty++;
+ if (candi_empty.eidx == EXFAT_HINT_NONE &&
+ num_empty == 1) {
+ exfat_chain_set(&candi_empty.cur,
+ clu.dir, clu.size, clu.flags);
+ }
+
+ if (candi_empty.eidx == EXFAT_HINT_NONE &&
+ num_empty >= num_entries) {
+ candi_empty.eidx =
+ dentry - (num_empty - 1);
+ WARN_ON(candi_empty.eidx < 0);
+ candi_empty.count = num_empty;
+
+ if (ei->hint_femp.eidx ==
+ EXFAT_HINT_NONE ||
+ candi_empty.eidx <=
+ ei->hint_femp.eidx) {
+ memcpy(&ei->hint_femp,
+ &candi_empty,
+ sizeof(candi_empty));
+ }
+ }
+
+ brelse(bh);
+ if (entry_type == TYPE_UNUSED)
+ goto not_found;
+ continue;
+ }
+
+ num_empty = 0;
+ candi_empty.eidx = EXFAT_HINT_NONE;
+
+ if (entry_type == TYPE_FILE || entry_type == TYPE_DIR) {
+ step = DIRENT_STEP_FILE;
+ if (type == TYPE_ALL || type == entry_type) {
+ num_ext = ep->dentry.file.num_ext;
+ step = DIRENT_STEP_STRM;
+ }
+ brelse(bh);
+ continue;
+ }
+
+ if (entry_type == TYPE_STREAM) {
+ unsigned short name_hash;
+
+ if (step != DIRENT_STEP_STRM) {
+ step = DIRENT_STEP_FILE;
+ brelse(bh);
+ continue;
+ }
+ step = DIRENT_STEP_FILE;
+ name_hash = le16_to_cpu(
+ ep->dentry.stream.name_hash);
+ if (p_uniname->name_hash == name_hash &&
+ p_uniname->name_len ==
+ ep->dentry.stream.name_len) {
+ step = DIRENT_STEP_NAME;
+ order = 1;
+ name_len = 0;
+ }
+ brelse(bh);
+ continue;
+ }
+
+ brelse(bh);
+ if (entry_type == TYPE_EXTEND) {
+ unsigned short entry_uniname[16], unichar;
+
+ if (step != DIRENT_STEP_NAME) {
+ step = DIRENT_STEP_FILE;
+ continue;
+ }
+
+ if (++order == 2)
+ uniname = p_uniname->name;
+ else
+ uniname += EXFAT_FILE_NAME_LEN;
+
+ len = exfat_extract_uni_name(ep, entry_uniname);
+ name_len += len;
+
+ unichar = *(uniname+len);
+ *(uniname+len) = 0x0;
+
+ if (exfat_uniname_ncmp(sb, uniname,
+ entry_uniname, len)) {
+ step = DIRENT_STEP_FILE;
+ } else if (p_uniname->name_len == name_len) {
+ if (order == num_ext)
+ goto found;
+ step = DIRENT_STEP_SECD;
+ }
+
+ *(uniname+len) = unichar;
+ continue;
+ }
+
+ if (entry_type &
+ (TYPE_CRITICAL_SEC | TYPE_BENIGN_SEC)) {
+ if (step == DIRENT_STEP_SECD) {
+ if (++order == num_ext)
+ goto found;
+ continue;
+ }
+ }
+ step = DIRENT_STEP_FILE;
+ }
+
+ if (clu.flags == ALLOC_NO_FAT_CHAIN) {
+ if (--clu.size > 0)
+ clu.dir++;
+ else
+ clu.dir = EXFAT_EOF_CLUSTER;
+ } else {
+ if (exfat_get_next_cluster(sb, &clu.dir))
+ return -EIO;
+ }
+ }
+
+not_found:
+ /*
+ * We started at not 0 index,so we should try to find target
+ * from 0 index to the index we started at.
+ */
+ if (!rewind && end_eidx) {
+ rewind = 1;
+ dentry = 0;
+ clu.dir = p_dir->dir;
+ /* reset empty hint */
+ num_empty = 0;
+ candi_empty.eidx = EXFAT_HINT_NONE;
+ goto rewind;
+ }
+
+ /* initialized hint_stat */
+ hint_stat->clu = p_dir->dir;
+ hint_stat->eidx = 0;
+ return -ENOENT;
+
+found:
+ /* next dentry we'll find is out of this cluster */
+ if (!((dentry + 1) & (dentries_per_clu - 1))) {
+ int ret = 0;
+
+ if (clu.flags == ALLOC_NO_FAT_CHAIN) {
+ if (--clu.size > 0)
+ clu.dir++;
+ else
+ clu.dir = EXFAT_EOF_CLUSTER;
+ } else {
+ ret = exfat_get_next_cluster(sb, &clu.dir);
+ }
+
+ if (ret || clu.dir != EXFAT_EOF_CLUSTER) {
+ /* just initialized hint_stat */
+ hint_stat->clu = p_dir->dir;
+ hint_stat->eidx = 0;
+ return (dentry - num_ext);
+ }
+ }
+
+ hint_stat->clu = clu.dir;
+ hint_stat->eidx = dentry + 1;
+ return dentry - num_ext;
+}
+
+int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir,
+ int entry, struct exfat_dentry *ep)
+{
+ int i, count = 0;
+ unsigned int type;
+ struct exfat_dentry *ext_ep;
+ struct buffer_head *bh;
+
+ for (i = 0, entry++; i < ep->dentry.file.num_ext; i++, entry++) {
+ ext_ep = exfat_get_dentry(sb, p_dir, entry, &bh, NULL);
+ if (!ext_ep)
+ return -EIO;
+
+ type = exfat_get_entry_type(ext_ep);
+ brelse(bh);
+ if (type == TYPE_EXTEND || type == TYPE_STREAM)
+ count++;
+ else
+ break;
+ }
+ return count;
+}
+
+int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
+{
+ int i, count = 0;
+ int dentries_per_clu;
+ unsigned int entry_type;
+ struct exfat_chain clu;
+ struct exfat_dentry *ep;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct buffer_head *bh;
+
+ dentries_per_clu = sbi->dentries_per_clu;
+
+ exfat_chain_dup(&clu, p_dir);
+
+ while (clu.dir != EXFAT_EOF_CLUSTER) {
+ for (i = 0; i < dentries_per_clu; i++) {
+ ep = exfat_get_dentry(sb, &clu, i, &bh, NULL);
+ if (!ep)
+ return -EIO;
+ entry_type = exfat_get_entry_type(ep);
+ brelse(bh);
+
+ if (entry_type == TYPE_UNUSED)
+ return count;
+ if (entry_type != TYPE_DIR)
+ continue;
+ count++;
+ }
+
+ if (clu.flags == ALLOC_NO_FAT_CHAIN) {
+ if (--clu.size > 0)
+ clu.dir++;
+ else
+ clu.dir = EXFAT_EOF_CLUSTER;
+ } else {
+ if (exfat_get_next_cluster(sb, &(clu.dir)))
+ return -EIO;
+ }
+ }
+
+ return count;
+}
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
new file mode 100644
index 000000000000..67d4e46fb810
--- /dev/null
+++ b/fs/exfat/exfat_fs.h
@@ -0,0 +1,519 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef _EXFAT_FS_H
+#define _EXFAT_FS_H
+
+#include <linux/fs.h>
+#include <linux/ratelimit.h>
+#include <linux/nls.h>
+
+#define EXFAT_SUPER_MAGIC 0x2011BAB0UL
+#define EXFAT_ROOT_INO 1
+
+#define EXFAT_SB_DIRTY 0
+
+#define EXFAT_CLUSTERS_UNTRACKED (~0u)
+
+/*
+ * exfat error flags
+ */
+enum exfat_error_mode {
+ EXFAT_ERRORS_CONT, /* ignore error and continue */
+ EXFAT_ERRORS_PANIC, /* panic on error */
+ EXFAT_ERRORS_RO, /* remount r/o on error */
+};
+
+/*
+ * exfat nls lossy flag
+ */
+enum {
+ NLS_NAME_NO_LOSSY, /* no lossy */
+ NLS_NAME_LOSSY, /* just detected incorrect filename(s) */
+ NLS_NAME_OVERLEN, /* the length is over than its limit */
+};
+
+#define EXFAT_HASH_BITS 8
+#define EXFAT_HASH_SIZE (1UL << EXFAT_HASH_BITS)
+
+/*
+ * Type Definitions
+ */
+#define ES_2_ENTRIES 2
+#define ES_ALL_ENTRIES 0
+
+#define DIR_DELETED 0xFFFF0321
+
+/* type values */
+#define TYPE_UNUSED 0x0000
+#define TYPE_DELETED 0x0001
+#define TYPE_INVALID 0x0002
+#define TYPE_CRITICAL_PRI 0x0100
+#define TYPE_BITMAP 0x0101
+#define TYPE_UPCASE 0x0102
+#define TYPE_VOLUME 0x0103
+#define TYPE_DIR 0x0104
+#define TYPE_FILE 0x011F
+#define TYPE_CRITICAL_SEC 0x0200
+#define TYPE_STREAM 0x0201
+#define TYPE_EXTEND 0x0202
+#define TYPE_ACL 0x0203
+#define TYPE_BENIGN_PRI 0x0400
+#define TYPE_GUID 0x0401
+#define TYPE_PADDING 0x0402
+#define TYPE_ACLTAB 0x0403
+#define TYPE_BENIGN_SEC 0x0800
+#define TYPE_ALL 0x0FFF
+
+#define MAX_CHARSET_SIZE 6 /* max size of multi-byte character */
+#define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */
+#define MAX_VFSNAME_BUF_SIZE ((MAX_NAME_LENGTH + 1) * MAX_CHARSET_SIZE)
+
+#define FAT_CACHE_SIZE 128
+#define FAT_CACHE_HASH_SIZE 64
+#define BUF_CACHE_SIZE 256
+#define BUF_CACHE_HASH_SIZE 64
+
+#define EXFAT_HINT_NONE -1
+#define EXFAT_MIN_SUBDIR 2
+
+/*
+ * helpers for cluster size to byte conversion.
+ */
+#define EXFAT_CLU_TO_B(b, sbi) ((b) << (sbi)->cluster_size_bits)
+#define EXFAT_B_TO_CLU(b, sbi) ((b) >> (sbi)->cluster_size_bits)
+#define EXFAT_B_TO_CLU_ROUND_UP(b, sbi) \
+ (((b - 1) >> (sbi)->cluster_size_bits) + 1)
+#define EXFAT_CLU_OFFSET(off, sbi) ((off) & ((sbi)->cluster_size - 1))
+
+/*
+ * helpers for block size to byte conversion.
+ */
+#define EXFAT_BLK_TO_B(b, sb) ((b) << (sb)->s_blocksize_bits)
+#define EXFAT_B_TO_BLK(b, sb) ((b) >> (sb)->s_blocksize_bits)
+#define EXFAT_B_TO_BLK_ROUND_UP(b, sb) \
+ (((b - 1) >> (sb)->s_blocksize_bits) + 1)
+#define EXFAT_BLK_OFFSET(off, sb) ((off) & ((sb)->s_blocksize - 1))
+
+/*
+ * helpers for block size to dentry size conversion.
+ */
+#define EXFAT_B_TO_DEN_IDX(b, sbi) \
+ ((b) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS))
+#define EXFAT_B_TO_DEN(b) ((b) >> DENTRY_SIZE_BITS)
+#define EXFAT_DEN_TO_B(b) ((b) << DENTRY_SIZE_BITS)
+
+/*
+ * helpers for fat entry.
+ */
+#define FAT_ENT_SIZE (4)
+#define FAT_ENT_SIZE_BITS (2)
+#define FAT_ENT_OFFSET_SECTOR(sb, loc) (EXFAT_SB(sb)->FAT1_start_sector + \
+ (((u64)loc << FAT_ENT_SIZE_BITS) >> sb->s_blocksize_bits))
+#define FAT_ENT_OFFSET_BYTE_IN_SECTOR(sb, loc) \
+ ((loc << FAT_ENT_SIZE_BITS) & (sb->s_blocksize - 1))
+
+/*
+ * helpers for bitmap.
+ */
+#define CLUSTER_TO_BITMAP_ENT(clu) ((clu) - EXFAT_RESERVED_CLUSTERS)
+#define BITMAP_ENT_TO_CLUSTER(ent) ((ent) + EXFAT_RESERVED_CLUSTERS)
+#define BITS_PER_SECTOR(sb) ((sb)->s_blocksize * BITS_PER_BYTE)
+#define BITS_PER_SECTOR_MASK(sb) (BITS_PER_SECTOR(sb) - 1)
+#define BITMAP_OFFSET_SECTOR_INDEX(sb, ent) \
+ ((ent / BITS_PER_BYTE) >> (sb)->s_blocksize_bits)
+#define BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent) (ent & BITS_PER_SECTOR_MASK(sb))
+#define BITMAP_OFFSET_BYTE_IN_SECTOR(sb, ent) \
+ ((ent / BITS_PER_BYTE) & ((sb)->s_blocksize - 1))
+#define BITS_PER_BYTE_MASK 0x7
+#define IGNORED_BITS_REMAINED(clu, clu_base) ((1 << ((clu) - (clu_base))) - 1)
+
+struct exfat_dentry_namebuf {
+ char *lfn;
+ int lfnbuf_len; /* usally MAX_UNINAME_BUF_SIZE */
+};
+
+/* unicode name structure */
+struct exfat_uni_name {
+ /* +3 for null and for converting */
+ unsigned short name[MAX_NAME_LENGTH + 3];
+ unsigned short name_hash;
+ unsigned char name_len;
+};
+
+/* directory structure */
+struct exfat_chain {
+ unsigned int dir;
+ unsigned int size;
+ unsigned char flags;
+};
+
+/* first empty entry hint information */
+struct exfat_hint_femp {
+ /* entry index of a directory */
+ int eidx;
+ /* count of continuous empty entry */
+ int count;
+ /* the cluster that first empty slot exists in */
+ struct exfat_chain cur;
+};
+
+/* hint structure */
+struct exfat_hint {
+ unsigned int clu;
+ union {
+ unsigned int off; /* cluster offset */
+ int eidx; /* entry index */
+ };
+};
+
+struct exfat_entry_set_cache {
+ /* sector number that contains file_entry */
+ sector_t sector;
+ /* byte offset in the sector */
+ unsigned int offset;
+ /* flag in stream entry. 01 for cluster chain, 03 for contig. */
+ int alloc_flag;
+ unsigned int num_entries;
+ struct exfat_dentry entries[];
+};
+
+struct exfat_dir_entry {
+ struct exfat_chain dir;
+ int entry;
+ unsigned int type;
+ unsigned int start_clu;
+ unsigned char flags;
+ unsigned short attr;
+ loff_t size;
+ unsigned int num_subdirs;
+ struct timespec64 atime;
+ struct timespec64 mtime;
+ struct timespec64 crtime;
+ struct exfat_dentry_namebuf namebuf;
+};
+
+/*
+ * exfat mount in-memory data
+ */
+struct exfat_mount_options {
+ kuid_t fs_uid;
+ kgid_t fs_gid;
+ unsigned short fs_fmask;
+ unsigned short fs_dmask;
+ /* permission for setting the [am]time */
+ unsigned short allow_utime;
+ /* charset for filename input/display */
+ char *iocharset;
+ /* on error: continue, panic, remount-ro */
+ enum exfat_error_mode errors;
+ unsigned utf8:1, /* Use of UTF-8 character set */
+ discard:1; /* Issue discard requests on deletions */
+ int time_offset; /* Offset of timestamps from UTC (in minutes) */
+};
+
+/*
+ * EXFAT file system superblock in-memory data
+ */
+struct exfat_sb_info {
+ unsigned long long num_sectors; /* num of sectors in volume */
+ unsigned int num_clusters; /* num of clusters in volume */
+ unsigned int cluster_size; /* cluster size in bytes */
+ unsigned int cluster_size_bits;
+ unsigned int sect_per_clus; /* cluster size in sectors */
+ unsigned int sect_per_clus_bits;
+ unsigned long long FAT1_start_sector; /* FAT1 start sector */
+ unsigned long long FAT2_start_sector; /* FAT2 start sector */
+ unsigned long long data_start_sector; /* data area start sector */
+ unsigned int num_FAT_sectors; /* num of FAT sectors */
+ unsigned int root_dir; /* root dir cluster */
+ unsigned int dentries_per_clu; /* num of dentries per cluster */
+ unsigned int vol_flag; /* volume dirty flag */
+ struct buffer_head *pbr_bh; /* buffer_head of PBR sector */
+
+ unsigned int map_clu; /* allocation bitmap start cluster */
+ unsigned int map_sectors; /* num of allocation bitmap sectors */
+ struct buffer_head **vol_amap; /* allocation bitmap */
+
+ unsigned short *vol_utbl; /* upcase table */
+
+ unsigned int clu_srch_ptr; /* cluster search pointer */
+ unsigned int used_clusters; /* number of used clusters */
+
+ unsigned long s_state;
+ struct mutex s_lock; /* superblock lock */
+ struct exfat_mount_options options;
+ struct nls_table *nls_io; /* Charset used for input and display */
+ struct ratelimit_state ratelimit;
+
+ spinlock_t inode_hash_lock;
+ struct hlist_head inode_hashtable[EXFAT_HASH_SIZE];
+
+ struct rcu_head rcu;
+};
+
+/*
+ * EXFAT file system inode in-memory data
+ */
+struct exfat_inode_info {
+ struct exfat_chain dir;
+ int entry;
+ unsigned int type;
+ unsigned short attr;
+ unsigned int start_clu;
+ unsigned char flags;
+ /*
+ * the copy of low 32bit of i_version to check
+ * the validation of hint_stat.
+ */
+ unsigned int version;
+ /* file offset or dentry index for readdir */
+ loff_t rwoffset;
+
+ /* hint for cluster last accessed */
+ struct exfat_hint hint_bmap;
+ /* hint for entry index we try to lookup next time */
+ struct exfat_hint hint_stat;
+ /* hint for first empty entry */
+ struct exfat_hint_femp hint_femp;
+
+ spinlock_t cache_lru_lock;
+ struct list_head cache_lru;
+ int nr_caches;
+ /* for avoiding the race between alloc and free */
+ unsigned int cache_valid_id;
+
+ /*
+ * NOTE: i_size_ondisk is 64bits, so must hold ->inode_lock to access.
+ * physically allocated size.
+ */
+ loff_t i_size_ondisk;
+ /* block-aligned i_size (used in cont_write_begin) */
+ loff_t i_size_aligned;
+ /* on-disk position of directory entry or 0 */
+ loff_t i_pos;
+ /* hash by i_location */
+ struct hlist_node i_hash_fat;
+ /* protect bmap against truncate */
+ struct rw_semaphore truncate_lock;
+ struct inode vfs_inode;
+ /* File creation time */
+ struct timespec64 i_crtime;
+};
+
+static inline struct exfat_sb_info *EXFAT_SB(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+static inline struct exfat_inode_info *EXFAT_I(struct inode *inode)
+{
+ return container_of(inode, struct exfat_inode_info, vfs_inode);
+}
+
+/*
+ * If ->i_mode can't hold 0222 (i.e. ATTR_RO), we use ->i_attrs to
+ * save ATTR_RO instead of ->i_mode.
+ *
+ * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only
+ * bit, it's just used as flag for app.
+ */
+static inline int exfat_mode_can_hold_ro(struct inode *inode)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+
+ if (S_ISDIR(inode->i_mode))
+ return 0;
+
+ if ((~sbi->options.fs_fmask) & 0222)
+ return 1;
+ return 0;
+}
+
+/* Convert attribute bits and a mask to the UNIX mode. */
+static inline mode_t exfat_make_mode(struct exfat_sb_info *sbi,
+ unsigned short attr, mode_t mode)
+{
+ if ((attr & ATTR_READONLY) && !(attr & ATTR_SUBDIR))
+ mode &= ~0222;
+
+ if (attr & ATTR_SUBDIR)
+ return (mode & ~sbi->options.fs_dmask) | S_IFDIR;
+
+ return (mode & ~sbi->options.fs_fmask) | S_IFREG;
+}
+
+/* Return the FAT attribute byte for this inode */
+static inline unsigned short exfat_make_attr(struct inode *inode)
+{
+ unsigned short attr = EXFAT_I(inode)->attr;
+
+ if (S_ISDIR(inode->i_mode))
+ attr |= ATTR_SUBDIR;
+ if (exfat_mode_can_hold_ro(inode) && !(inode->i_mode & 0222))
+ attr |= ATTR_READONLY;
+ return attr;
+}
+
+static inline void exfat_save_attr(struct inode *inode, unsigned short attr)
+{
+ if (exfat_mode_can_hold_ro(inode))
+ EXFAT_I(inode)->attr = attr & (ATTR_RWMASK | ATTR_READONLY);
+ else
+ EXFAT_I(inode)->attr = attr & ATTR_RWMASK;
+}
+
+static inline bool exfat_is_last_sector_in_cluster(struct exfat_sb_info *sbi,
+ sector_t sec)
+{
+ return ((sec - sbi->data_start_sector + 1) &
+ ((1 << sbi->sect_per_clus_bits) - 1)) == 0;
+}
+
+static inline sector_t exfat_cluster_to_sector(struct exfat_sb_info *sbi,
+ unsigned int clus)
+{
+ return ((clus - EXFAT_RESERVED_CLUSTERS) << sbi->sect_per_clus_bits) +
+ sbi->data_start_sector;
+}
+
+static inline int exfat_sector_to_cluster(struct exfat_sb_info *sbi,
+ sector_t sec)
+{
+ return ((sec - sbi->data_start_sector) >> sbi->sect_per_clus_bits) +
+ EXFAT_RESERVED_CLUSTERS;
+}
+
+/* super.c */
+int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag);
+
+/* fatent.c */
+#define exfat_get_next_cluster(sb, pclu) exfat_ent_get(sb, *(pclu), pclu)
+
+int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
+ struct exfat_chain *p_chain);
+int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain);
+int exfat_ent_get(struct super_block *sb, unsigned int loc,
+ unsigned int *content);
+int exfat_ent_set(struct super_block *sb, unsigned int loc,
+ unsigned int content);
+int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir,
+ int entry, struct exfat_dentry *p_entry);
+int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain,
+ unsigned int len);
+int exfat_zeroed_cluster(struct inode *dir, unsigned int clu);
+int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain,
+ unsigned int *ret_clu);
+int exfat_count_num_clusters(struct super_block *sb,
+ struct exfat_chain *p_chain, unsigned int *ret_count);
+
+/* balloc.c */
+int exfat_load_bitmap(struct super_block *sb);
+void exfat_free_bitmap(struct exfat_sb_info *sbi);
+int exfat_set_bitmap(struct inode *inode, unsigned int clu);
+void exfat_clear_bitmap(struct inode *inode, unsigned int clu);
+unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu);
+int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count);
+
+/* file.c */
+extern const struct file_operations exfat_file_operations;
+int __exfat_truncate(struct inode *inode, loff_t new_size);
+void exfat_truncate(struct inode *inode, loff_t size);
+int exfat_setattr(struct dentry *dentry, struct iattr *attr);
+int exfat_getattr(const struct path *path, struct kstat *stat,
+ unsigned int request_mask, unsigned int query_flags);
+
+/* namei.c */
+extern const struct dentry_operations exfat_dentry_ops;
+extern const struct dentry_operations exfat_utf8_dentry_ops;
+
+/* cache.c */
+int exfat_cache_init(void);
+void exfat_cache_shutdown(void);
+void exfat_cache_init_inode(struct inode *inode);
+void exfat_cache_inval_inode(struct inode *inode);
+int exfat_get_cluster(struct inode *inode, unsigned int cluster,
+ unsigned int *fclus, unsigned int *dclus,
+ unsigned int *last_dclus, int allow_eof);
+
+/* dir.c */
+extern const struct inode_operations exfat_dir_inode_operations;
+extern const struct file_operations exfat_dir_operations;
+unsigned int exfat_get_entry_type(struct exfat_dentry *p_entry);
+int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
+ int entry, unsigned int type, unsigned int start_clu,
+ unsigned long long size);
+int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
+ int entry, int num_entries, struct exfat_uni_name *p_uniname);
+int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir,
+ int entry, int order, int num_entries);
+int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
+ int entry);
+int exfat_update_dir_chksum_with_entry_set(struct super_block *sb,
+ struct exfat_entry_set_cache *es, int sync);
+int exfat_calc_num_entries(struct exfat_uni_name *p_uniname);
+int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
+ struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname,
+ int num_entries, unsigned int type);
+int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu);
+int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir,
+ int entry, sector_t *sector, int *offset);
+struct exfat_dentry *exfat_get_dentry(struct super_block *sb,
+ struct exfat_chain *p_dir, int entry, struct buffer_head **bh,
+ sector_t *sector);
+struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
+ struct exfat_chain *p_dir, int entry, unsigned int type,
+ struct exfat_dentry **file_ep);
+int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir);
+
+/* inode.c */
+extern const struct inode_operations exfat_file_inode_operations;
+void exfat_sync_inode(struct inode *inode);
+struct inode *exfat_build_inode(struct super_block *sb,
+ struct exfat_dir_entry *info, loff_t i_pos);
+void exfat_hash_inode(struct inode *inode, loff_t i_pos);
+void exfat_unhash_inode(struct inode *inode);
+struct inode *exfat_iget(struct super_block *sb, loff_t i_pos);
+int exfat_write_inode(struct inode *inode, struct writeback_control *wbc);
+void exfat_evict_inode(struct inode *inode);
+int exfat_block_truncate_page(struct inode *inode, loff_t from);
+
+/* exfat/nls.c */
+unsigned short exfat_toupper(struct super_block *sb, unsigned short a);
+int exfat_uniname_ncmp(struct super_block *sb, unsigned short *a,
+ unsigned short *b, unsigned int len);
+int exfat_utf16_to_nls(struct super_block *sb,
+ struct exfat_uni_name *uniname, unsigned char *p_cstring,
+ int len);
+int exfat_nls_to_utf16(struct super_block *sb,
+ const unsigned char *p_cstring, const int len,
+ struct exfat_uni_name *uniname, int *p_lossy);
+int exfat_create_upcase_table(struct super_block *sb);
+void exfat_free_upcase_table(struct exfat_sb_info *sbi);
+unsigned short exfat_high_surrogate(unicode_t u);
+unsigned short exfat_low_surrogate(unicode_t u);
+
+/* exfat/misc.c */
+void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
+ __printf(3, 4) __cold;
+#define exfat_fs_error(sb, fmt, args...) \
+ __exfat_fs_error(sb, 1, fmt, ## args)
+#define exfat_fs_error_ratelimit(sb, fmt, args...) \
+ __exfat_fs_error(sb, __ratelimit(&EXFAT_SB(sb)->ratelimit), \
+ fmt, ## args)
+void exfat_msg(struct super_block *sb, const char *lv, const char *fmt, ...)
+ __printf(3, 4) __cold;
+void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
+ u8 tz, __le16 time, __le16 date, u8 time_ms);
+void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
+ u8 *tz, __le16 *time, __le16 *date, u8 *time_ms);
+unsigned short exfat_calc_chksum_2byte(void *data, int len,
+ unsigned short chksum, int type);
+void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync);
+void exfat_chain_set(struct exfat_chain *ec, unsigned int dir,
+ unsigned int size, unsigned char flags);
+void exfat_chain_dup(struct exfat_chain *dup, struct exfat_chain *ec);
+
+#endif /* !_EXFAT_FS_H */
diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h
new file mode 100644
index 000000000000..2a841010e649
--- /dev/null
+++ b/fs/exfat/exfat_raw.h
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef _EXFAT_RAW_H
+#define _EXFAT_RAW_H
+
+#include <linux/types.h>
+
+#define PBR_SIGNATURE 0xAA55
+
+#define EXFAT_MAX_FILE_LEN 255
+
+#define VOL_CLEAN 0x0000
+#define VOL_DIRTY 0x0002
+
+#define EXFAT_EOF_CLUSTER 0xFFFFFFFFu
+#define EXFAT_BAD_CLUSTER 0xFFFFFFF7u
+#define EXFAT_FREE_CLUSTER 0
+/* Cluster 0, 1 are reserved, the first cluster is 2 in the cluster heap. */
+#define EXFAT_RESERVED_CLUSTERS 2
+#define EXFAT_FIRST_CLUSTER 2
+#define EXFAT_DATA_CLUSTER_COUNT(sbi) \
+ ((sbi)->num_clusters - EXFAT_RESERVED_CLUSTERS)
+
+/* AllocationPossible and NoFatChain field in GeneralSecondaryFlags Field */
+#define ALLOC_FAT_CHAIN 0x01
+#define ALLOC_NO_FAT_CHAIN 0x03
+
+#define DENTRY_SIZE 32 /* directory entry size */
+#define DENTRY_SIZE_BITS 5
+/* exFAT allows 8388608(256MB) directory entries */
+#define MAX_EXFAT_DENTRIES 8388608
+
+/* dentry types */
+#define EXFAT_UNUSED 0x00 /* end of directory */
+#define EXFAT_DELETE (~0x80)
+#define IS_EXFAT_DELETED(x) ((x) < 0x80) /* deleted file (0x01~0x7F) */
+#define EXFAT_INVAL 0x80 /* invalid value */
+#define EXFAT_BITMAP 0x81 /* allocation bitmap */
+#define EXFAT_UPCASE 0x82 /* upcase table */
+#define EXFAT_VOLUME 0x83 /* volume label */
+#define EXFAT_FILE 0x85 /* file or dir */
+#define EXFAT_GUID 0xA0
+#define EXFAT_PADDING 0xA1
+#define EXFAT_ACLTAB 0xA2
+#define EXFAT_STREAM 0xC0 /* stream entry */
+#define EXFAT_NAME 0xC1 /* file name entry */
+#define EXFAT_ACL 0xC2 /* stream entry */
+
+#define IS_EXFAT_CRITICAL_PRI(x) (x < 0xA0)
+#define IS_EXFAT_BENIGN_PRI(x) (x < 0xC0)
+#define IS_EXFAT_CRITICAL_SEC(x) (x < 0xE0)
+
+/* checksum types */
+#define CS_DIR_ENTRY 0
+#define CS_PBR_SECTOR 1
+#define CS_DEFAULT 2
+
+/* file attributes */
+#define ATTR_READONLY 0x0001
+#define ATTR_HIDDEN 0x0002
+#define ATTR_SYSTEM 0x0004
+#define ATTR_VOLUME 0x0008
+#define ATTR_SUBDIR 0x0010
+#define ATTR_ARCHIVE 0x0020
+
+#define ATTR_RWMASK (ATTR_HIDDEN | ATTR_SYSTEM | ATTR_VOLUME | \
+ ATTR_SUBDIR | ATTR_ARCHIVE)
+
+#define PBR64_JUMP_BOOT_LEN 3
+#define PBR64_OEM_NAME_LEN 8
+#define PBR64_RESERVED_LEN 53
+
+#define EXFAT_FILE_NAME_LEN 15
+
+/* EXFAT BIOS parameter block (64 bytes) */
+struct bpb64 {
+ __u8 jmp_boot[PBR64_JUMP_BOOT_LEN];
+ __u8 oem_name[PBR64_OEM_NAME_LEN];
+ __u8 res_zero[PBR64_RESERVED_LEN];
+} __packed;
+
+/* EXFAT EXTEND BIOS parameter block (56 bytes) */
+struct bsx64 {
+ __le64 vol_offset;
+ __le64 vol_length;
+ __le32 fat_offset;
+ __le32 fat_length;
+ __le32 clu_offset;
+ __le32 clu_count;
+ __le32 root_cluster;
+ __le32 vol_serial;
+ __u8 fs_version[2];
+ __le16 vol_flags;
+ __u8 sect_size_bits;
+ __u8 sect_per_clus_bits;
+ __u8 num_fats;
+ __u8 phy_drv_no;
+ __u8 perc_in_use;
+ __u8 reserved2[7];
+} __packed;
+
+/* EXFAT PBR[BPB+BSX] (120 bytes) */
+struct pbr64 {
+ struct bpb64 bpb;
+ struct bsx64 bsx;
+} __packed;
+
+/* Common PBR[Partition Boot Record] (512 bytes) */
+struct pbr {
+ union {
+ __u8 raw[64];
+ struct bpb64 f64;
+ } bpb;
+ union {
+ __u8 raw[56];
+ struct bsx64 f64;
+ } bsx;
+ __u8 boot_code[390];
+ __le16 signature;
+} __packed;
+
+struct exfat_dentry {
+ __u8 type;
+ union {
+ struct {
+ __u8 num_ext;
+ __le16 checksum;
+ __le16 attr;
+ __le16 reserved1;
+ __le16 create_time;
+ __le16 create_date;
+ __le16 modify_time;
+ __le16 modify_date;
+ __le16 access_time;
+ __le16 access_date;
+ __u8 create_time_ms;
+ __u8 modify_time_ms;
+ __u8 create_tz;
+ __u8 modify_tz;
+ __u8 access_tz;
+ __u8 reserved2[7];
+ } __packed file; /* file directory entry */
+ struct {
+ __u8 flags;
+ __u8 reserved1;
+ __u8 name_len;
+ __le16 name_hash;
+ __le16 reserved2;
+ __le64 valid_size;
+ __le32 reserved3;
+ __le32 start_clu;
+ __le64 size;
+ } __packed stream; /* stream extension directory entry */
+ struct {
+ __u8 flags;
+ __le16 unicode_0_14[EXFAT_FILE_NAME_LEN];
+ } __packed name; /* file name directory entry */
+ struct {
+ __u8 flags;
+ __u8 reserved[18];
+ __le32 start_clu;
+ __le64 size;
+ } __packed bitmap; /* allocation bitmap directory entry */
+ struct {
+ __u8 reserved1[3];
+ __le32 checksum;
+ __u8 reserved2[12];
+ __le32 start_clu;
+ __le64 size;
+ } __packed upcase; /* up-case table directory entry */
+ } __packed dentry;
+} __packed;
+
+#define EXFAT_TZ_VALID (1 << 7)
+
+/* Jan 1 GMT 00:00:00 1980 */
+#define EXFAT_MIN_TIMESTAMP_SECS 315532800LL
+/* Dec 31 GMT 23:59:59 2107 */
+#define EXFAT_MAX_TIMESTAMP_SECS 4354819199LL
+
+#endif /* !_EXFAT_RAW_H */
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
new file mode 100644
index 000000000000..a855b1769a96
--- /dev/null
+++ b/fs/exfat/fatent.c
@@ -0,0 +1,463 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+#include <linux/buffer_head.h>
+
+#include "exfat_raw.h"
+#include "exfat_fs.h"
+
+static int exfat_mirror_bh(struct super_block *sb, sector_t sec,
+ struct buffer_head *bh)
+{
+ struct buffer_head *c_bh;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ sector_t sec2;
+ int err = 0;
+
+ if (sbi->FAT2_start_sector != sbi->FAT1_start_sector) {
+ sec2 = sec - sbi->FAT1_start_sector + sbi->FAT2_start_sector;
+ c_bh = sb_getblk(sb, sec2);
+ if (!c_bh)
+ return -ENOMEM;
+ memcpy(c_bh->b_data, bh->b_data, sb->s_blocksize);
+ set_buffer_uptodate(c_bh);
+ mark_buffer_dirty(c_bh);
+ if (sb->s_flags & SB_SYNCHRONOUS)
+ err = sync_dirty_buffer(c_bh);
+ brelse(c_bh);
+ }
+
+ return err;
+}
+
+static int __exfat_ent_get(struct super_block *sb, unsigned int loc,
+ unsigned int *content)
+{
+ unsigned int off;
+ sector_t sec;
+ struct buffer_head *bh;
+
+ sec = FAT_ENT_OFFSET_SECTOR(sb, loc);
+ off = FAT_ENT_OFFSET_BYTE_IN_SECTOR(sb, loc);
+
+ bh = sb_bread(sb, sec);
+ if (!bh)
+ return -EIO;
+
+ *content = le32_to_cpu(*(__le32 *)(&bh->b_data[off]));
+
+ /* remap reserved clusters to simplify code */
+ if (*content > EXFAT_BAD_CLUSTER)
+ *content = EXFAT_EOF_CLUSTER;
+
+ brelse(bh);
+ return 0;
+}
+
+int exfat_ent_set(struct super_block *sb, unsigned int loc,
+ unsigned int content)
+{
+ unsigned int off;
+ sector_t sec;
+ __le32 *fat_entry;
+ struct buffer_head *bh;
+
+ sec = FAT_ENT_OFFSET_SECTOR(sb, loc);
+ off = FAT_ENT_OFFSET_BYTE_IN_SECTOR(sb, loc);
+
+ bh = sb_bread(sb, sec);
+ if (!bh)
+ return -EIO;
+
+ fat_entry = (__le32 *)&(bh->b_data[off]);
+ *fat_entry = cpu_to_le32(content);
+ exfat_update_bh(sb, bh, sb->s_flags & SB_SYNCHRONOUS);
+ exfat_mirror_bh(sb, sec, bh);
+ brelse(bh);
+ return 0;
+}
+
+static inline bool is_valid_cluster(struct exfat_sb_info *sbi,
+ unsigned int clus)
+{
+ if (clus < EXFAT_FIRST_CLUSTER || sbi->num_clusters <= clus)
+ return false;
+ return true;
+}
+
+int exfat_ent_get(struct super_block *sb, unsigned int loc,
+ unsigned int *content)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ int err;
+
+ if (!is_valid_cluster(sbi, loc)) {
+ exfat_fs_error(sb, "invalid access to FAT (entry 0x%08x)",
+ loc);
+ return -EIO;
+ }
+
+ err = __exfat_ent_get(sb, loc, content);
+ if (err) {
+ exfat_fs_error(sb,
+ "failed to access to FAT (entry 0x%08x, err:%d)",
+ loc, err);
+ return err;
+ }
+
+ if (*content == EXFAT_FREE_CLUSTER) {
+ exfat_fs_error(sb,
+ "invalid access to FAT free cluster (entry 0x%08x)",
+ loc);
+ return -EIO;
+ }
+
+ if (*content == EXFAT_BAD_CLUSTER) {
+ exfat_fs_error(sb,
+ "invalid access to FAT bad cluster (entry 0x%08x)",
+ loc);
+ return -EIO;
+ }
+
+ if (*content != EXFAT_EOF_CLUSTER && !is_valid_cluster(sbi, *content)) {
+ exfat_fs_error(sb,
+ "invalid access to FAT (entry 0x%08x) bogus content (0x%08x)",
+ loc, *content);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain,
+ unsigned int len)
+{
+ if (!len)
+ return 0;
+
+ while (len > 1) {
+ if (exfat_ent_set(sb, chain, chain + 1))
+ return -EIO;
+ chain++;
+ len--;
+ }
+
+ if (exfat_ent_set(sb, chain, EXFAT_EOF_CLUSTER))
+ return -EIO;
+ return 0;
+}
+
+int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain)
+{
+ unsigned int num_clusters = 0;
+ unsigned int clu;
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ /* invalid cluster number */
+ if (p_chain->dir == EXFAT_FREE_CLUSTER ||
+ p_chain->dir == EXFAT_EOF_CLUSTER ||
+ p_chain->dir < EXFAT_FIRST_CLUSTER)
+ return 0;
+
+ /* no cluster to truncate */
+ if (p_chain->size == 0)
+ return 0;
+
+ /* check cluster validation */
+ if (p_chain->dir < 2 && p_chain->dir >= sbi->num_clusters) {
+ exfat_msg(sb, KERN_ERR, "invalid start cluster (%u)",
+ p_chain->dir);
+ return -EIO;
+ }
+
+ set_bit(EXFAT_SB_DIRTY, &sbi->s_state);
+ clu = p_chain->dir;
+
+ if (p_chain->flags == ALLOC_NO_FAT_CHAIN) {
+ do {
+ exfat_clear_bitmap(inode, clu);
+ clu++;
+
+ num_clusters++;
+ } while (num_clusters < p_chain->size);
+ } else {
+ do {
+ exfat_clear_bitmap(inode, clu);
+
+ if (exfat_get_next_cluster(sb, &clu))
+ goto dec_used_clus;
+
+ num_clusters++;
+ } while (clu != EXFAT_EOF_CLUSTER);
+ }
+
+dec_used_clus:
+ sbi->used_clusters -= num_clusters;
+ return 0;
+}
+
+int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain,
+ unsigned int *ret_clu)
+{
+ unsigned int clu, next;
+ unsigned int count = 0;
+
+ next = p_chain->dir;
+ if (p_chain->flags == ALLOC_NO_FAT_CHAIN) {
+ *ret_clu = next + p_chain->size - 1;
+ return 0;
+ }
+
+ do {
+ count++;
+ clu = next;
+ if (exfat_ent_get(sb, clu, &next))
+ return -EIO;
+ } while (next != EXFAT_EOF_CLUSTER);
+
+ if (p_chain->size != count) {
+ exfat_fs_error(sb,
+ "bogus directory size (clus : ondisk(%d) != counted(%d))",
+ p_chain->size, count);
+ return -EIO;
+ }
+
+ *ret_clu = clu;
+ return 0;
+}
+
+static inline int exfat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
+{
+ int i, err = 0;
+
+ for (i = 0; i < nr_bhs; i++)
+ write_dirty_buffer(bhs[i], 0);
+
+ for (i = 0; i < nr_bhs; i++) {
+ wait_on_buffer(bhs[i]);
+ if (!err && !buffer_uptodate(bhs[i]))
+ err = -EIO;
+ }
+ return err;
+}
+
+int exfat_zeroed_cluster(struct inode *dir, unsigned int clu)
+{
+ struct super_block *sb = dir->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct buffer_head *bhs[MAX_BUF_PER_PAGE];
+ int nr_bhs = MAX_BUF_PER_PAGE;
+ sector_t blknr, last_blknr;
+ int err, i, n;
+
+ blknr = exfat_cluster_to_sector(sbi, clu);
+ last_blknr = blknr + sbi->sect_per_clus;
+
+ if (last_blknr > sbi->num_sectors && sbi->num_sectors > 0) {
+ exfat_fs_error_ratelimit(sb,
+ "%s: out of range(sect:%llu len:%u)",
+ __func__, (unsigned long long)blknr,
+ sbi->sect_per_clus);
+ return -EIO;
+ }
+
+ /* Zeroing the unused blocks on this cluster */
+ n = 0;
+ while (blknr < last_blknr) {
+ bhs[n] = sb_getblk(sb, blknr);
+ if (!bhs[n]) {
+ err = -ENOMEM;
+ goto release_bhs;
+ }
+ memset(bhs[n]->b_data, 0, sb->s_blocksize);
+ exfat_update_bh(sb, bhs[n], 0);
+
+ n++;
+ blknr++;
+
+ if (n == nr_bhs) {
+ if (IS_DIRSYNC(dir)) {
+ err = exfat_sync_bhs(bhs, n);
+ if (err)
+ goto release_bhs;
+ }
+
+ for (i = 0; i < n; i++)
+ brelse(bhs[i]);
+ n = 0;
+ }
+ }
+
+ if (IS_DIRSYNC(dir)) {
+ err = exfat_sync_bhs(bhs, n);
+ if (err)
+ goto release_bhs;
+ }
+
+ for (i = 0; i < n; i++)
+ brelse(bhs[i]);
+
+ return 0;
+
+release_bhs:
+ exfat_msg(sb, KERN_ERR, "failed zeroed sect %llu\n",
+ (unsigned long long)blknr);
+ for (i = 0; i < n; i++)
+ bforget(bhs[i]);
+ return err;
+}
+
+int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
+ struct exfat_chain *p_chain)
+{
+ int ret = -ENOSPC;
+ unsigned int num_clusters = 0, total_cnt;
+ unsigned int hint_clu, new_clu, last_clu = EXFAT_EOF_CLUSTER;
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ total_cnt = EXFAT_DATA_CLUSTER_COUNT(sbi);
+
+ if (unlikely(total_cnt < sbi->used_clusters)) {
+ exfat_fs_error_ratelimit(sb,
+ "%s: invalid used clusters(t:%u,u:%u)\n",
+ __func__, total_cnt, sbi->used_clusters);
+ return -EIO;
+ }
+
+ if (num_alloc > total_cnt - sbi->used_clusters)
+ return -ENOSPC;
+
+ hint_clu = p_chain->dir;
+ /* find new cluster */
+ if (hint_clu == EXFAT_EOF_CLUSTER) {
+ if (sbi->clu_srch_ptr < EXFAT_FIRST_CLUSTER) {
+ exfat_msg(sb, KERN_ERR,
+ "sbi->clu_srch_ptr is invalid (%u)\n",
+ sbi->clu_srch_ptr);
+ sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER;
+ }
+
+ hint_clu = exfat_find_free_bitmap(sb, sbi->clu_srch_ptr);
+ if (hint_clu == EXFAT_EOF_CLUSTER)
+ return -ENOSPC;
+ }
+
+ /* check cluster validation */
+ if (hint_clu < EXFAT_FIRST_CLUSTER && hint_clu >= sbi->num_clusters) {
+ exfat_msg(sb, KERN_ERR, "hint_cluster is invalid (%u)\n",
+ hint_clu);
+ hint_clu = EXFAT_FIRST_CLUSTER;
+ if (p_chain->flags == ALLOC_NO_FAT_CHAIN) {
+ if (exfat_chain_cont_cluster(sb, p_chain->dir,
+ num_clusters))
+ return -EIO;
+ p_chain->flags = ALLOC_FAT_CHAIN;
+ }
+ }
+
+ set_bit(EXFAT_SB_DIRTY, &sbi->s_state);
+
+ p_chain->dir = EXFAT_EOF_CLUSTER;
+
+ while ((new_clu = exfat_find_free_bitmap(sb, hint_clu)) !=
+ EXFAT_EOF_CLUSTER) {
+ if (new_clu != hint_clu &&
+ p_chain->flags == ALLOC_NO_FAT_CHAIN) {
+ if (exfat_chain_cont_cluster(sb, p_chain->dir,
+ num_clusters)) {
+ ret = -EIO;
+ goto free_cluster;
+ }
+ p_chain->flags = ALLOC_FAT_CHAIN;
+ }
+
+ /* update allocation bitmap */
+ if (exfat_set_bitmap(inode, new_clu)) {
+ ret = -EIO;
+ goto free_cluster;
+ }
+
+ num_clusters++;
+
+ /* update FAT table */
+ if (p_chain->flags == ALLOC_FAT_CHAIN) {
+ if (exfat_ent_set(sb, new_clu, EXFAT_EOF_CLUSTER)) {
+ ret = -EIO;
+ goto free_cluster;
+ }
+ }
+
+ if (p_chain->dir == EXFAT_EOF_CLUSTER) {
+ p_chain->dir = new_clu;
+ } else if (p_chain->flags == ALLOC_FAT_CHAIN) {
+ if (exfat_ent_set(sb, last_clu, new_clu)) {
+ ret = -EIO;
+ goto free_cluster;
+ }
+ }
+ last_clu = new_clu;
+
+ if (--num_alloc == 0) {
+ sbi->clu_srch_ptr = hint_clu;
+ sbi->used_clusters += num_clusters;
+
+ p_chain->size += num_clusters;
+ return 0;
+ }
+
+ hint_clu = new_clu + 1;
+ if (hint_clu >= sbi->num_clusters) {
+ hint_clu = EXFAT_FIRST_CLUSTER;
+
+ if (p_chain->flags == ALLOC_NO_FAT_CHAIN) {
+ if (exfat_chain_cont_cluster(sb, p_chain->dir,
+ num_clusters)) {
+ ret = -EIO;
+ goto free_cluster;
+ }
+ p_chain->flags = ALLOC_FAT_CHAIN;
+ }
+ }
+ }
+free_cluster:
+ if (num_clusters)
+ exfat_free_cluster(inode, p_chain);
+ return ret;
+}
+
+int exfat_count_num_clusters(struct super_block *sb,
+ struct exfat_chain *p_chain, unsigned int *ret_count)
+{
+ unsigned int i, count;
+ unsigned int clu;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ if (!p_chain->dir || p_chain->dir == EXFAT_EOF_CLUSTER) {
+ *ret_count = 0;
+ return 0;
+ }
+
+ if (p_chain->flags == ALLOC_NO_FAT_CHAIN) {
+ *ret_count = p_chain->size;
+ return 0;
+ }
+
+ clu = p_chain->dir;
+ count = 0;
+ for (i = EXFAT_FIRST_CLUSTER; i < sbi->num_clusters; i++) {
+ count++;
+ if (exfat_ent_get(sb, clu, &clu))
+ return -EIO;
+ if (clu == EXFAT_EOF_CLUSTER)
+ break;
+ }
+
+ *ret_count = count;
+ return 0;
+}
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
new file mode 100644
index 000000000000..483f683757aa
--- /dev/null
+++ b/fs/exfat/file.c
@@ -0,0 +1,360 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/buffer_head.h>
+
+#include "exfat_raw.h"
+#include "exfat_fs.h"
+
+static int exfat_cont_expand(struct inode *inode, loff_t size)
+{
+ struct address_space *mapping = inode->i_mapping;
+ loff_t start = i_size_read(inode), count = size - i_size_read(inode);
+ int err, err2;
+
+ err = generic_cont_expand_simple(inode, size);
+ if (err)
+ return err;
+
+ inode->i_ctime = inode->i_mtime = current_time(inode);
+ mark_inode_dirty(inode);
+
+ if (!IS_SYNC(inode))
+ return 0;
+
+ err = filemap_fdatawrite_range(mapping, start, start + count - 1);
+ err2 = sync_mapping_buffers(mapping);
+ if (!err)
+ err = err2;
+ err2 = write_inode_now(inode, 1);
+ if (!err)
+ err = err2;
+ if (err)
+ return err;
+
+ return filemap_fdatawait_range(mapping, start, start + count - 1);
+}
+
+static bool exfat_allow_set_time(struct exfat_sb_info *sbi, struct inode *inode)
+{
+ mode_t allow_utime = sbi->options.allow_utime;
+
+ if (!uid_eq(current_fsuid(), inode->i_uid)) {
+ if (in_group_p(inode->i_gid))
+ allow_utime >>= 3;
+ if (allow_utime & MAY_WRITE)
+ return true;
+ }
+
+ /* use a default check */
+ return false;
+}
+
+static int exfat_sanitize_mode(const struct exfat_sb_info *sbi,
+ struct inode *inode, umode_t *mode_ptr)
+{
+ mode_t i_mode, mask, perm;
+
+ i_mode = inode->i_mode;
+
+ mask = (S_ISREG(i_mode) || S_ISLNK(i_mode)) ?
+ sbi->options.fs_fmask : sbi->options.fs_dmask;
+ perm = *mode_ptr & ~(S_IFMT | mask);
+
+ /* Of the r and x bits, all (subject to umask) must be present.*/
+ if ((perm & 0555) != (i_mode & 0555))
+ return -EPERM;
+
+ if (exfat_mode_can_hold_ro(inode)) {
+ /*
+ * Of the w bits, either all (subject to umask) or none must
+ * be present.
+ */
+ if ((perm & 0222) && ((perm & 0222) != (0222 & ~mask)))
+ return -EPERM;
+ } else {
+ /*
+ * If exfat_mode_can_hold_ro(inode) is false, can't change
+ * w bits.
+ */
+ if ((perm & 0222) != (0222 & ~mask))
+ return -EPERM;
+ }
+
+ *mode_ptr &= S_IFMT | perm;
+
+ return 0;
+}
+
+/* resize the file length */
+int __exfat_truncate(struct inode *inode, loff_t new_size)
+{
+ unsigned int num_clusters_new, num_clusters_phys;
+ unsigned int last_clu = EXFAT_FREE_CLUSTER;
+ struct exfat_chain clu;
+ struct exfat_dentry *ep, *ep2;
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct exfat_entry_set_cache *es = NULL;
+ int evict = (ei->dir.dir == DIR_DELETED) ? 1 : 0;
+
+ /* check if the given file ID is opened */
+ if (ei->type != TYPE_FILE && ei->type != TYPE_DIR)
+ return -EPERM;
+
+ exfat_set_vol_flags(sb, VOL_DIRTY);
+
+ num_clusters_new = EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi);
+ num_clusters_phys =
+ EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk, sbi);
+
+ exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags);
+
+ if (new_size > 0) {
+ /*
+ * Truncate FAT chain num_clusters after the first cluster
+ * num_clusters = min(new, phys);
+ */
+ unsigned int num_clusters =
+ min(num_clusters_new, num_clusters_phys);
+
+ /*
+ * Follow FAT chain
+ * (defensive coding - works fine even with corrupted FAT table
+ */
+ if (clu.flags == ALLOC_NO_FAT_CHAIN) {
+ clu.dir += num_clusters;
+ clu.size -= num_clusters;
+ } else {
+ while (num_clusters > 0) {
+ last_clu = clu.dir;
+ if (exfat_get_next_cluster(sb, &(clu.dir)))
+ return -EIO;
+
+ num_clusters--;
+ clu.size--;
+ }
+ }
+ } else {
+ ei->flags = ALLOC_NO_FAT_CHAIN;
+ ei->start_clu = EXFAT_EOF_CLUSTER;
+ }
+
+ i_size_write(inode, new_size);
+
+ if (ei->type == TYPE_FILE)
+ ei->attr |= ATTR_ARCHIVE;
+
+ /* update the directory entry */
+ if (!evict) {
+ struct timespec64 ts;
+
+ es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry,
+ ES_ALL_ENTRIES, &ep);
+ if (!es)
+ return -EIO;
+ ep2 = ep + 1;
+
+ ts = current_time(inode);
+ exfat_set_entry_time(sbi, &ts,
+ &ep->dentry.file.modify_tz,
+ &ep->dentry.file.modify_time,
+ &ep->dentry.file.modify_date,
+ &ep->dentry.file.modify_time_ms);
+ ep->dentry.file.attr = cpu_to_le16(ei->attr);
+
+ /* File size should be zero if there is no cluster allocated */
+ if (ei->start_clu == EXFAT_EOF_CLUSTER) {
+ ep->dentry.stream.valid_size = 0;
+ ep->dentry.stream.size = 0;
+ } else {
+ ep->dentry.stream.valid_size = cpu_to_le64(new_size);
+ ep->dentry.stream.size = ep->dentry.stream.valid_size;
+ }
+
+ if (new_size == 0) {
+ /* Any directory can not be truncated to zero */
+ WARN_ON(ei->type != TYPE_FILE);
+
+ ep2->dentry.stream.flags = ALLOC_FAT_CHAIN;
+ ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER;
+ }
+
+ if (exfat_update_dir_chksum_with_entry_set(sb, es,
+ inode_needs_sync(inode)))
+ return -EIO;
+ kfree(es);
+ }
+
+ /* cut off from the FAT chain */
+ if (ei->flags == ALLOC_FAT_CHAIN && last_clu != EXFAT_FREE_CLUSTER &&
+ last_clu != EXFAT_EOF_CLUSTER) {
+ if (exfat_ent_set(sb, last_clu, EXFAT_EOF_CLUSTER))
+ return -EIO;
+ }
+
+ /* invalidate cache and free the clusters */
+ /* clear exfat cache */
+ exfat_cache_inval_inode(inode);
+
+ /* hint information */
+ ei->hint_bmap.off = EXFAT_EOF_CLUSTER;
+ ei->hint_bmap.clu = EXFAT_EOF_CLUSTER;
+ if (ei->rwoffset > new_size)
+ ei->rwoffset = new_size;
+
+ /* hint_stat will be used if this is directory. */
+ ei->hint_stat.eidx = 0;
+ ei->hint_stat.clu = ei->start_clu;
+ ei->hint_femp.eidx = EXFAT_HINT_NONE;
+
+ /* free the clusters */
+ if (exfat_free_cluster(inode, &clu))
+ return -EIO;
+
+ exfat_set_vol_flags(sb, VOL_CLEAN);
+
+ return 0;
+}
+
+void exfat_truncate(struct inode *inode, loff_t size)
+{
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ unsigned int blocksize = 1 << inode->i_blkbits;
+ loff_t aligned_size;
+ int err;
+
+ mutex_lock(&sbi->s_lock);
+ if (EXFAT_I(inode)->start_clu == 0) {
+ /*
+ * Empty start_clu != ~0 (not allocated)
+ */
+ exfat_fs_error(sb, "tried to truncate zeroed cluster.");
+ goto write_size;
+ }
+
+ err = __exfat_truncate(inode, i_size_read(inode));
+ if (err)
+ goto write_size;
+
+ inode->i_ctime = inode->i_mtime = current_time(inode);
+ if (IS_DIRSYNC(inode))
+ exfat_sync_inode(inode);
+ else
+ mark_inode_dirty(inode);
+
+ inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) &
+ ~(sbi->cluster_size - 1)) >> inode->i_blkbits;
+write_size:
+ aligned_size = i_size_read(inode);
+ if (aligned_size & (blocksize - 1)) {
+ aligned_size |= (blocksize - 1);
+ aligned_size++;
+ }
+
+ if (EXFAT_I(inode)->i_size_ondisk > i_size_read(inode))
+ EXFAT_I(inode)->i_size_ondisk = aligned_size;
+
+ if (EXFAT_I(inode)->i_size_aligned > i_size_read(inode))
+ EXFAT_I(inode)->i_size_aligned = aligned_size;
+ mutex_unlock(&sbi->s_lock);
+}
+
+int exfat_getattr(const struct path *path, struct kstat *stat,
+ unsigned int request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_backing_inode(path->dentry);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+
+ generic_fillattr(inode, stat);
+ stat->result_mask |= STATX_BTIME;
+ stat->btime.tv_sec = ei->i_crtime.tv_sec;
+ stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
+ stat->blksize = EXFAT_SB(inode->i_sb)->cluster_size;
+ return 0;
+}
+
+int exfat_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb);
+ struct inode *inode = dentry->d_inode;
+ unsigned int ia_valid;
+ int error;
+
+ if ((attr->ia_valid & ATTR_SIZE) &&
+ attr->ia_size > i_size_read(inode)) {
+ error = exfat_cont_expand(inode, attr->ia_size);
+ if (error || attr->ia_valid == ATTR_SIZE)
+ return error;
+ attr->ia_valid &= ~ATTR_SIZE;
+ }
+
+ /* Check for setting the inode time. */
+ ia_valid = attr->ia_valid;
+ if ((ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) &&
+ exfat_allow_set_time(sbi, inode)) {
+ attr->ia_valid &= ~(ATTR_MTIME_SET | ATTR_ATIME_SET |
+ ATTR_TIMES_SET);
+ }
+
+ error = setattr_prepare(dentry, attr);
+ attr->ia_valid = ia_valid;
+ if (error)
+ goto out;
+
+ if (((attr->ia_valid & ATTR_UID) &&
+ !uid_eq(attr->ia_uid, sbi->options.fs_uid)) ||
+ ((attr->ia_valid & ATTR_GID) &&
+ !gid_eq(attr->ia_gid, sbi->options.fs_gid)) ||
+ ((attr->ia_valid & ATTR_MODE) &&
+ (attr->ia_mode & ~(S_IFREG | S_IFLNK | S_IFDIR | 0777)))) {
+ error = -EPERM;
+ goto out;
+ }
+
+ /*
+ * We don't return -EPERM here. Yes, strange, but this is too
+ * old behavior.
+ */
+ if (attr->ia_valid & ATTR_MODE) {
+ if (exfat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0)
+ attr->ia_valid &= ~ATTR_MODE;
+ }
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ error = exfat_block_truncate_page(inode, attr->ia_size);
+ if (error)
+ goto out;
+
+ down_write(&EXFAT_I(inode)->truncate_lock);
+ truncate_setsize(inode, attr->ia_size);
+ exfat_truncate(inode, attr->ia_size);
+ up_write(&EXFAT_I(inode)->truncate_lock);
+ }
+
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
+
+out:
+ return error;
+}
+
+const struct file_operations exfat_file_operations = {
+ .llseek = generic_file_llseek,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
+ .mmap = generic_file_mmap,
+ .fsync = generic_file_fsync,
+ .splice_read = generic_file_splice_read,
+};
+
+const struct inode_operations exfat_file_inode_operations = {
+ .setattr = exfat_setattr,
+ .getattr = exfat_getattr,
+};
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
new file mode 100644
index 000000000000..06887492f54b
--- /dev/null
+++ b/fs/exfat/inode.c
@@ -0,0 +1,671 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/init.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/time.h>
+#include <linux/writeback.h>
+#include <linux/uio.h>
+#include <linux/random.h>
+#include <linux/iversion.h>
+
+#include "exfat_raw.h"
+#include "exfat_fs.h"
+
+static int __exfat_write_inode(struct inode *inode, int sync)
+{
+ int ret = -EIO;
+ unsigned long long on_disk_size;
+ struct exfat_dentry *ep, *ep2;
+ struct exfat_entry_set_cache *es = NULL;
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ bool is_dir = (ei->type == TYPE_DIR) ? true : false;
+
+ if (inode->i_ino == EXFAT_ROOT_INO)
+ return 0;
+
+ /*
+ * If the indode is already unlinked, there is no need for updating it.
+ */
+ if (ei->dir.dir == DIR_DELETED)
+ return 0;
+
+ if (is_dir && ei->dir.dir == sbi->root_dir && ei->entry == -1)
+ return 0;
+
+ exfat_set_vol_flags(sb, VOL_DIRTY);
+
+ /* get the directory entry of given file or directory */
+ es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES,
+ &ep);
+ if (!es)
+ return -EIO;
+ ep2 = ep + 1;
+
+ ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode));
+
+ /* set FILE_INFO structure using the acquired struct exfat_dentry */
+ exfat_set_entry_time(sbi, &ei->i_crtime,
+ &ep->dentry.file.create_tz,
+ &ep->dentry.file.create_time,
+ &ep->dentry.file.create_date,
+ &ep->dentry.file.create_time_ms);
+ exfat_set_entry_time(sbi, &inode->i_mtime,
+ &ep->dentry.file.modify_tz,
+ &ep->dentry.file.modify_time,
+ &ep->dentry.file.modify_date,
+ &ep->dentry.file.modify_time_ms);
+ exfat_set_entry_time(sbi, &inode->i_atime,
+ &ep->dentry.file.access_tz,
+ &ep->dentry.file.access_time,
+ &ep->dentry.file.access_date,
+ NULL);
+
+ /* File size should be zero if there is no cluster allocated */
+ on_disk_size = i_size_read(inode);
+
+ if (ei->start_clu == EXFAT_EOF_CLUSTER)
+ on_disk_size = 0;
+
+ ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size);
+ ep2->dentry.stream.size = ep2->dentry.stream.valid_size;
+
+ ret = exfat_update_dir_chksum_with_entry_set(sb, es, sync);
+ kfree(es);
+ return ret;
+}
+
+int exfat_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ int ret;
+
+ mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock);
+ ret = __exfat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+ mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock);
+
+ return ret;
+}
+
+void exfat_sync_inode(struct inode *inode)
+{
+ lockdep_assert_held(&EXFAT_SB(inode->i_sb)->s_lock);
+ __exfat_write_inode(inode, 1);
+}
+
+/*
+ * Input: inode, (logical) clu_offset, target allocation area
+ * Output: errcode, cluster number
+ * *clu = (~0), if it's unable to allocate a new cluster
+ */
+static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
+ unsigned int *clu, int create)
+{
+ int ret, modified = false;
+ unsigned int last_clu;
+ struct exfat_chain new_clu;
+ struct exfat_dentry *ep;
+ struct exfat_entry_set_cache *es = NULL;
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ unsigned int local_clu_offset = clu_offset;
+ unsigned int num_to_be_allocated = 0, num_clusters = 0;
+
+ ei->rwoffset = EXFAT_CLU_TO_B(clu_offset, sbi);
+
+ if (EXFAT_I(inode)->i_size_ondisk > 0)
+ num_clusters =
+ EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk,
+ sbi);
+
+ if (clu_offset >= num_clusters)
+ num_to_be_allocated = clu_offset - num_clusters + 1;
+
+ if (!create && (num_to_be_allocated > 0)) {
+ *clu = EXFAT_EOF_CLUSTER;
+ return 0;
+ }
+
+ *clu = last_clu = ei->start_clu;
+
+ if (ei->flags == ALLOC_NO_FAT_CHAIN) {
+ if (clu_offset > 0 && *clu != EXFAT_EOF_CLUSTER) {
+ last_clu += clu_offset - 1;
+
+ if (clu_offset == num_clusters)
+ *clu = EXFAT_EOF_CLUSTER;
+ else
+ *clu += clu_offset;
+ }
+ } else if (ei->type == TYPE_FILE) {
+ unsigned int fclus = 0;
+ int err = exfat_get_cluster(inode, clu_offset,
+ &fclus, clu, &last_clu, 1);
+ if (err)
+ return -EIO;
+
+ clu_offset -= fclus;
+ } else {
+ /* hint information */
+ if (clu_offset > 0 && ei->hint_bmap.off != EXFAT_EOF_CLUSTER &&
+ ei->hint_bmap.off > 0 && clu_offset >= ei->hint_bmap.off) {
+ clu_offset -= ei->hint_bmap.off;
+ /* hint_bmap.clu should be valid */
+ WARN_ON(ei->hint_bmap.clu < 2);
+ *clu = ei->hint_bmap.clu;
+ }
+
+ while (clu_offset > 0 && *clu != EXFAT_EOF_CLUSTER) {
+ last_clu = *clu;
+ if (exfat_get_next_cluster(sb, clu))
+ return -EIO;
+ clu_offset--;
+ }
+ }
+
+ if (*clu == EXFAT_EOF_CLUSTER) {
+ exfat_set_vol_flags(sb, VOL_DIRTY);
+
+ new_clu.dir = (last_clu == EXFAT_EOF_CLUSTER) ?
+ EXFAT_EOF_CLUSTER : last_clu + 1;
+ new_clu.size = 0;
+ new_clu.flags = ei->flags;
+
+ /* allocate a cluster */
+ if (num_to_be_allocated < 1) {
+ /* Broken FAT (i_sze > allocated FAT) */
+ exfat_fs_error(sb, "broken FAT chain.");
+ return -EIO;
+ }
+
+ ret = exfat_alloc_cluster(inode, num_to_be_allocated, &new_clu);
+ if (ret)
+ return ret;
+
+ if (new_clu.dir == EXFAT_EOF_CLUSTER ||
+ new_clu.dir == EXFAT_FREE_CLUSTER) {
+ exfat_fs_error(sb,
+ "bogus cluster new allocated (last_clu : %u, new_clu : %u)",
+ last_clu, new_clu.dir);
+ return -EIO;
+ }
+
+ /* append to the FAT chain */
+ if (last_clu == EXFAT_EOF_CLUSTER) {
+ if (new_clu.flags == ALLOC_FAT_CHAIN)
+ ei->flags = ALLOC_FAT_CHAIN;
+ ei->start_clu = new_clu.dir;
+ modified = true;
+ } else {
+ if (new_clu.flags != ei->flags) {
+ /* no-fat-chain bit is disabled,
+ * so fat-chain should be synced with
+ * alloc-bitmap
+ */
+ exfat_chain_cont_cluster(sb, ei->start_clu,
+ num_clusters);
+ ei->flags = ALLOC_FAT_CHAIN;
+ modified = true;
+ }
+ if (new_clu.flags == ALLOC_FAT_CHAIN)
+ if (exfat_ent_set(sb, last_clu, new_clu.dir))
+ return -EIO;
+ }
+
+ num_clusters += num_to_be_allocated;
+ *clu = new_clu.dir;
+
+ if (ei->dir.dir != DIR_DELETED) {
+ es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry,
+ ES_ALL_ENTRIES, &ep);
+ if (!es)
+ return -EIO;
+ /* get stream entry */
+ ep++;
+
+ /* update directory entry */
+ if (modified) {
+ if (ep->dentry.stream.flags != ei->flags)
+ ep->dentry.stream.flags = ei->flags;
+
+ if (le32_to_cpu(ep->dentry.stream.start_clu) !=
+ ei->start_clu)
+ ep->dentry.stream.start_clu =
+ cpu_to_le32(ei->start_clu);
+
+ ep->dentry.stream.valid_size =
+ cpu_to_le64(i_size_read(inode));
+ ep->dentry.stream.size =
+ ep->dentry.stream.valid_size;
+ }
+
+ if (exfat_update_dir_chksum_with_entry_set(sb, es,
+ inode_needs_sync(inode)))
+ return -EIO;
+ kfree(es);
+
+ } /* end of if != DIR_DELETED */
+
+ inode->i_blocks +=
+ num_to_be_allocated << sbi->sect_per_clus_bits;
+
+ /*
+ * Move *clu pointer along FAT chains (hole care) because the
+ * caller of this function expect *clu to be the last cluster.
+ * This only works when num_to_be_allocated >= 2,
+ * *clu = (the first cluster of the allocated chain) =>
+ * (the last cluster of ...)
+ */
+ if (ei->flags == ALLOC_NO_FAT_CHAIN) {
+ *clu += num_to_be_allocated - 1;
+ } else {
+ while (num_to_be_allocated > 1) {
+ if (exfat_get_next_cluster(sb, clu))
+ return -EIO;
+ num_to_be_allocated--;
+ }
+ }
+
+ }
+
+ /* hint information */
+ ei->hint_bmap.off = local_clu_offset;
+ ei->hint_bmap.clu = *clu;
+
+ return 0;
+}
+
+static int exfat_map_new_buffer(struct exfat_inode_info *ei,
+ struct buffer_head *bh, loff_t pos)
+{
+ if (buffer_delay(bh) && pos > ei->i_size_aligned)
+ return -EIO;
+ set_buffer_new(bh);
+
+ /*
+ * Adjust i_size_aligned if i_size_ondisk is bigger than it.
+ */
+ if (ei->i_size_ondisk > ei->i_size_aligned)
+ ei->i_size_aligned = ei->i_size_ondisk;
+ return 0;
+}
+
+static int exfat_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int err = 0;
+ unsigned long mapped_blocks = 0;
+ unsigned int cluster, sec_offset;
+ sector_t last_block;
+ sector_t phys = 0;
+ loff_t pos;
+
+ mutex_lock(&sbi->s_lock);
+ last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size_read(inode), sb);
+ if (iblock >= last_block && !create)
+ goto done;
+
+ /* Is this block already allocated? */
+ err = exfat_map_cluster(inode, iblock >> sbi->sect_per_clus_bits,
+ &cluster, create);
+ if (err) {
+ if (err != -ENOSPC)
+ exfat_fs_error_ratelimit(sb,
+ "failed to bmap (inode : %p iblock : %llu, err : %d)",
+ inode, (unsigned long long)iblock, err);
+ goto unlock_ret;
+ }
+
+ if (cluster == EXFAT_EOF_CLUSTER)
+ goto done;
+
+ /* sector offset in cluster */
+ sec_offset = iblock & (sbi->sect_per_clus - 1);
+
+ phys = exfat_cluster_to_sector(sbi, cluster) + sec_offset;
+ mapped_blocks = sbi->sect_per_clus - sec_offset;
+ max_blocks = min(mapped_blocks, max_blocks);
+
+ /* Treat newly added block / cluster */
+ if (iblock < last_block)
+ create = 0;
+
+ if (create || buffer_delay(bh_result)) {
+ pos = EXFAT_BLK_TO_B((iblock + 1), sb);
+ if (ei->i_size_ondisk < pos)
+ ei->i_size_ondisk = pos;
+ }
+
+ if (create) {
+ err = exfat_map_new_buffer(ei, bh_result, pos);
+ if (err) {
+ exfat_fs_error(sb,
+ "requested for bmap out of range(pos : (%llu) > i_size_aligned(%llu)\n",
+ pos, ei->i_size_aligned);
+ goto unlock_ret;
+ }
+ }
+
+ if (buffer_delay(bh_result))
+ clear_buffer_delay(bh_result);
+ map_bh(bh_result, sb, phys);
+done:
+ bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb);
+unlock_ret:
+ mutex_unlock(&sbi->s_lock);
+ return err;
+}
+
+static int exfat_readpage(struct file *file, struct page *page)
+{
+ return mpage_readpage(page, exfat_get_block);
+}
+
+static int exfat_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned int nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, exfat_get_block);
+}
+
+static int exfat_writepage(struct page *page, struct writeback_control *wbc)
+{
+ return block_write_full_page(page, exfat_get_block, wbc);
+}
+
+static int exfat_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ return mpage_writepages(mapping, wbc, exfat_get_block);
+}
+
+static void exfat_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > i_size_read(inode)) {
+ truncate_pagecache(inode, i_size_read(inode));
+ exfat_truncate(inode, EXFAT_I(inode)->i_size_aligned);
+ }
+}
+
+static int exfat_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int flags,
+ struct page **pagep, void **fsdata)
+{
+ int ret;
+
+ *pagep = NULL;
+ ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ exfat_get_block,
+ &EXFAT_I(mapping->host)->i_size_ondisk);
+
+ if (ret < 0)
+ exfat_write_failed(mapping, pos+len);
+
+ return ret;
+}
+
+static int exfat_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int copied,
+ struct page *pagep, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ int err;
+
+ err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
+
+ if (EXFAT_I(inode)->i_size_aligned < i_size_read(inode)) {
+ exfat_fs_error(inode->i_sb,
+ "invalid size(size(%llu) > aligned(%llu)\n",
+ i_size_read(inode), EXFAT_I(inode)->i_size_aligned);
+ return -EIO;
+ }
+
+ if (err < len)
+ exfat_write_failed(mapping, pos+len);
+
+ if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) {
+ inode->i_mtime = inode->i_ctime = current_time(inode);
+ ei->attr |= ATTR_ARCHIVE;
+ mark_inode_dirty(inode);
+ }
+
+ return err;
+}
+
+static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
+ loff_t size = iocb->ki_pos + iov_iter_count(iter);
+ int rw = iov_iter_rw(iter);
+ ssize_t ret;
+
+ if (rw == WRITE) {
+ /*
+ * FIXME: blockdev_direct_IO() doesn't use ->write_begin(),
+ * so we need to update the ->i_size_aligned to block boundary.
+ *
+ * But we must fill the remaining area or hole by nul for
+ * updating ->i_size_aligned
+ *
+ * Return 0, and fallback to normal buffered write.
+ */
+ if (EXFAT_I(inode)->i_size_aligned < size)
+ return 0;
+ }
+
+ /*
+ * Need to use the DIO_LOCKING for avoiding the race
+ * condition of exfat_get_block() and ->truncate().
+ */
+ ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block);
+ if (ret < 0 && (rw & WRITE))
+ exfat_write_failed(mapping, size);
+ return ret;
+}
+
+static sector_t exfat_aop_bmap(struct address_space *mapping, sector_t block)
+{
+ sector_t blocknr;
+
+ /* exfat_get_cluster() assumes the requested blocknr isn't truncated. */
+ down_read(&EXFAT_I(mapping->host)->truncate_lock);
+ blocknr = generic_block_bmap(mapping, block, exfat_get_block);
+ up_read(&EXFAT_I(mapping->host)->truncate_lock);
+ return blocknr;
+}
+
+/*
+ * exfat_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This is required during truncate to physically zeroout the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ * Also, avoid causing failure from fsx for cases of "data past EOF"
+ */
+int exfat_block_truncate_page(struct inode *inode, loff_t from)
+{
+ return block_truncate_page(inode->i_mapping, from, exfat_get_block);
+}
+
+static const struct address_space_operations exfat_aops = {
+ .readpage = exfat_readpage,
+ .readpages = exfat_readpages,
+ .writepage = exfat_writepage,
+ .writepages = exfat_writepages,
+ .write_begin = exfat_write_begin,
+ .write_end = exfat_write_end,
+ .direct_IO = exfat_direct_IO,
+ .bmap = exfat_aop_bmap
+};
+
+static inline unsigned long exfat_hash(loff_t i_pos)
+{
+ return hash_32(i_pos, EXFAT_HASH_BITS);
+}
+
+void exfat_hash_inode(struct inode *inode, loff_t i_pos)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+ struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos);
+
+ spin_lock(&sbi->inode_hash_lock);
+ EXFAT_I(inode)->i_pos = i_pos;
+ hlist_add_head(&EXFAT_I(inode)->i_hash_fat, head);
+ spin_unlock(&sbi->inode_hash_lock);
+}
+
+void exfat_unhash_inode(struct inode *inode)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+
+ spin_lock(&sbi->inode_hash_lock);
+ hlist_del_init(&EXFAT_I(inode)->i_hash_fat);
+ EXFAT_I(inode)->i_pos = 0;
+ spin_unlock(&sbi->inode_hash_lock);
+}
+
+struct inode *exfat_iget(struct super_block *sb, loff_t i_pos)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_inode_info *info;
+ struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos);
+ struct inode *inode = NULL;
+
+ spin_lock(&sbi->inode_hash_lock);
+ hlist_for_each_entry(info, head, i_hash_fat) {
+ WARN_ON(info->vfs_inode.i_sb != sb);
+
+ if (i_pos != info->i_pos)
+ continue;
+ inode = igrab(&info->vfs_inode);
+ if (inode)
+ break;
+ }
+ spin_unlock(&sbi->inode_hash_lock);
+ return inode;
+}
+
+/* doesn't deal with root inode */
+static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ loff_t size = info->size;
+
+ memcpy(&ei->dir, &info->dir, sizeof(struct exfat_chain));
+ ei->entry = info->entry;
+ ei->attr = info->attr;
+ ei->start_clu = info->start_clu;
+ ei->flags = info->flags;
+ ei->type = info->type;
+
+ ei->version = 0;
+ ei->hint_stat.eidx = 0;
+ ei->hint_stat.clu = info->start_clu;
+ ei->hint_femp.eidx = EXFAT_HINT_NONE;
+ ei->rwoffset = 0;
+ ei->hint_bmap.off = EXFAT_EOF_CLUSTER;
+ ei->i_pos = 0;
+
+ inode->i_uid = sbi->options.fs_uid;
+ inode->i_gid = sbi->options.fs_gid;
+ inode_inc_iversion(inode);
+ inode->i_generation = prandom_u32();
+
+ if (info->attr & ATTR_SUBDIR) { /* directory */
+ inode->i_generation &= ~1;
+ inode->i_mode = exfat_make_mode(sbi, info->attr, 0777);
+ inode->i_op = &exfat_dir_inode_operations;
+ inode->i_fop = &exfat_dir_operations;
+ set_nlink(inode, info->num_subdirs);
+ } else { /* regular file */
+ inode->i_generation |= 1;
+ inode->i_mode = exfat_make_mode(sbi, info->attr, 0777);
+ inode->i_op = &exfat_file_inode_operations;
+ inode->i_fop = &exfat_file_operations;
+ inode->i_mapping->a_ops = &exfat_aops;
+ inode->i_mapping->nrpages = 0;
+ }
+
+ i_size_write(inode, size);
+
+ /* ondisk and aligned size should be aligned with block size */
+ if (size & (inode->i_sb->s_blocksize - 1)) {
+ size |= (inode->i_sb->s_blocksize - 1);
+ size++;
+ }
+
+ ei->i_size_aligned = size;
+ ei->i_size_ondisk = size;
+
+ exfat_save_attr(inode, info->attr);
+
+ inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) &
+ ~(sbi->cluster_size - 1)) >> inode->i_blkbits;
+ inode->i_mtime = info->mtime;
+ inode->i_ctime = info->mtime;
+ ei->i_crtime = info->crtime;
+ inode->i_atime = info->atime;
+
+ exfat_cache_init_inode(inode);
+
+ return 0;
+}
+
+struct inode *exfat_build_inode(struct super_block *sb,
+ struct exfat_dir_entry *info, loff_t i_pos)
+{
+ struct inode *inode;
+ int err;
+
+ inode = exfat_iget(sb, i_pos);
+ if (inode)
+ goto out;
+ inode = new_inode(sb);
+ if (!inode) {
+ inode = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ inode->i_ino = iunique(sb, EXFAT_ROOT_INO);
+ inode_set_iversion(inode, 1);
+ err = exfat_fill_inode(inode, info);
+ if (err) {
+ iput(inode);
+ inode = ERR_PTR(err);
+ goto out;
+ }
+ exfat_hash_inode(inode, i_pos);
+ insert_inode_hash(inode);
+out:
+ return inode;
+}
+
+void exfat_evict_inode(struct inode *inode)
+{
+ truncate_inode_pages(&inode->i_data, 0);
+
+ if (!inode->i_nlink) {
+ i_size_write(inode, 0);
+ mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock);
+ __exfat_truncate(inode, 0);
+ mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock);
+ }
+
+ invalidate_inode_buffers(inode);
+ clear_inode(inode);
+ exfat_cache_inval_inode(inode);
+ exfat_unhash_inode(inode);
+}
diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c
new file mode 100644
index 000000000000..14a3300848f6
--- /dev/null
+++ b/fs/exfat/misc.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Written 1992,1993 by Werner Almesberger
+ * 22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980
+ * and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru)
+ * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+
+#include "exfat_raw.h"
+#include "exfat_fs.h"
+
+/*
+ * exfat_fs_error reports a file system problem that might indicate fa data
+ * corruption/inconsistency. Depending on 'errors' mount option the
+ * panic() is called, or error message is printed FAT and nothing is done,
+ * or filesystem is remounted read-only (default behavior).
+ * In case the file system is remounted read-only, it can be made writable
+ * again by remounting it.
+ */
+void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
+{
+ struct exfat_mount_options *opts = &EXFAT_SB(sb)->options;
+ va_list args;
+ struct va_format vaf;
+
+ if (report) {
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ exfat_msg(sb, KERN_ERR, "error, %pV\n", &vaf);
+ va_end(args);
+ }
+
+ if (opts->errors == EXFAT_ERRORS_PANIC) {
+ panic("exFAT-fs (%s): fs panic from previous error\n",
+ sb->s_id);
+ } else if (opts->errors == EXFAT_ERRORS_RO && !sb_rdonly(sb)) {
+ sb->s_flags |= SB_RDONLY;
+ exfat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
+ }
+}
+
+/*
+ * exfat_msg() - print preformated EXFAT specific messages.
+ * All logs except what uses exfat_fs_error() should be written by exfat_msg()
+ */
+void exfat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ /* level means KERN_ pacility level */
+ printk("%sexFAT-fs (%s): %pV\n", level, sb->s_id, &vaf);
+ va_end(args);
+}
+
+#define SECS_PER_MIN (60)
+#define TIMEZONE_SEC(x) ((x) * 15 * SECS_PER_MIN)
+
+static void exfat_adjust_tz(struct timespec64 *ts, u8 tz_off)
+{
+ if (tz_off <= 0x3F)
+ ts->tv_sec -= TIMEZONE_SEC(tz_off);
+ else /* 0x40 <= (tz_off & 0x7F) <=0x7F */
+ ts->tv_sec += TIMEZONE_SEC(0x80 - tz_off);
+}
+
+/* Convert a EXFAT time/date pair to a UNIX date (seconds since 1 1 70). */
+void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
+ u8 tz, __le16 time, __le16 date, u8 time_ms)
+{
+ u16 t = le16_to_cpu(time);
+ u16 d = le16_to_cpu(date);
+
+ ts->tv_sec = mktime64(1980 + (d >> 9), d >> 5 & 0x000F, d & 0x001F,
+ t >> 11, (t >> 5) & 0x003F, (t & 0x001F) << 1);
+
+
+ /* time_ms field represent 0 ~ 199(1990 ms) */
+ if (time_ms) {
+ ts->tv_sec += time_ms / 100;
+ ts->tv_nsec = (time_ms % 100) * 10 * NSEC_PER_MSEC;
+ }
+
+ if (tz & EXFAT_TZ_VALID)
+ /* Adjust timezone to UTC0. */
+ exfat_adjust_tz(ts, tz & ~EXFAT_TZ_VALID);
+ else
+ /* Convert from local time to UTC using time_offset. */
+ ts->tv_sec -= sbi->options.time_offset * SECS_PER_MIN;
+}
+
+/* Convert linear UNIX date to a EXFAT time/date pair. */
+void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
+ u8 *tz, __le16 *time, __le16 *date, u8 *time_ms)
+{
+ struct tm tm;
+ u16 t, d;
+
+ time64_to_tm(ts->tv_sec, 0, &tm);
+ t = (tm.tm_hour << 11) | (tm.tm_min << 5) | (tm.tm_sec >> 1);
+ d = ((tm.tm_year - 80) << 9) | ((tm.tm_mon + 1) << 5) | tm.tm_mday;
+
+ *time = cpu_to_le16(t);
+ *date = cpu_to_le16(d);
+
+ /* time_ms field represent 0 ~ 199(1990 ms) */
+ if (time_ms)
+ *time_ms = (tm.tm_sec & 1) * 100 +
+ ts->tv_nsec / (10 * NSEC_PER_MSEC);
+
+ /*
+ * Record 00h value for OffsetFromUtc field and 1 value for OffsetValid
+ * to indicate that local time and UTC are the same.
+ */
+ *tz = EXFAT_TZ_VALID;
+}
+
+unsigned short exfat_calc_chksum_2byte(void *data, int len,
+ unsigned short chksum, int type)
+{
+ int i;
+ unsigned char *c = (unsigned char *)data;
+
+ for (i = 0; i < len; i++, c++) {
+ if (((i == 2) || (i == 3)) && (type == CS_DIR_ENTRY))
+ continue;
+ chksum = (((chksum & 1) << 15) | ((chksum & 0xFFFE) >> 1)) +
+ (unsigned short)*c;
+ }
+ return chksum;
+}
+
+void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync)
+{
+ set_bit(EXFAT_SB_DIRTY, &EXFAT_SB(sb)->s_state);
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+
+ if (sync)
+ sync_dirty_buffer(bh);
+}
+
+void exfat_chain_set(struct exfat_chain *ec, unsigned int dir,
+ unsigned int size, unsigned char flags)
+{
+ ec->dir = dir;
+ ec->size = size;
+ ec->flags = flags;
+}
+
+void exfat_chain_dup(struct exfat_chain *dup, struct exfat_chain *ec)
+{
+ return exfat_chain_set(dup, ec->dir, ec->size, ec->flags);
+}
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
new file mode 100644
index 000000000000..a8681d91f569
--- /dev/null
+++ b/fs/exfat/namei.c
@@ -0,0 +1,1448 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/iversion.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/nls.h>
+
+#include "exfat_raw.h"
+#include "exfat_fs.h"
+
+static inline unsigned long exfat_d_version(struct dentry *dentry)
+{
+ return (unsigned long) dentry->d_fsdata;
+}
+
+static inline void exfat_d_version_set(struct dentry *dentry,
+ unsigned long version)
+{
+ dentry->d_fsdata = (void *) version;
+}
+
+/*
+ * If new entry was created in the parent, it could create the 8.3 alias (the
+ * shortname of logname). So, the parent may have the negative-dentry which
+ * matches the created 8.3 alias.
+ *
+ * If it happened, the negative dentry isn't actually negative anymore. So,
+ * drop it.
+ */
+static int exfat_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ int ret;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ /*
+ * This is not negative dentry. Always valid.
+ *
+ * Note, rename() to existing directory entry will have ->d_inode, and
+ * will use existing name which isn't specified name by user.
+ *
+ * We may be able to drop this positive dentry here. But dropping
+ * positive dentry isn't good idea. So it's unsupported like
+ * rename("filename", "FILENAME") for now.
+ */
+ if (d_really_is_positive(dentry))
+ return 1;
+
+ /*
+ * Drop the negative dentry, in order to make sure to use the case
+ * sensitive name which is specified by user if this is for creation.
+ */
+ if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+ return 0;
+
+ spin_lock(&dentry->d_lock);
+ ret = inode_eq_iversion(d_inode(dentry->d_parent),
+ exfat_d_version(dentry));
+ spin_unlock(&dentry->d_lock);
+ return ret;
+}
+
+/* returns the length of a struct qstr, ignoring trailing dots */
+static unsigned int exfat_striptail_len(unsigned int len, const char *name)
+{
+ while (len && name[len - 1] == '.')
+ len--;
+ return len;
+}
+
+/*
+ * Compute the hash for the exfat name corresponding to the dentry. If the name
+ * is invalid, we leave the hash code unchanged so that the existing dentry can
+ * be used. The exfat fs routines will return ENOENT or EINVAL as appropriate.
+ */
+static int exfat_d_hash(const struct dentry *dentry, struct qstr *qstr)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct nls_table *t = EXFAT_SB(sb)->nls_io;
+ const unsigned char *name = qstr->name;
+ unsigned int len = exfat_striptail_len(qstr->len, qstr->name);
+ unsigned long hash = init_name_hash(dentry);
+ int i, charlen;
+ wchar_t c;
+
+ for (i = 0; i < len; i += charlen) {
+ charlen = t->char2uni(&name[i], len - i, &c);
+ if (charlen < 0)
+ return charlen;
+ hash = partial_name_hash(exfat_toupper(sb, c), hash);
+ }
+
+ qstr->hash = end_name_hash(hash);
+ return 0;
+}
+
+static int exfat_d_cmp(const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct nls_table *t = EXFAT_SB(sb)->nls_io;
+ unsigned int alen = exfat_striptail_len(name->len, name->name);
+ unsigned int blen = exfat_striptail_len(len, str);
+ wchar_t c1, c2;
+ int charlen, i;
+
+ if (alen != blen)
+ return 1;
+
+ for (i = 0; i < len; i += charlen) {
+ charlen = t->char2uni(&name->name[i], alen - i, &c1);
+ if (charlen < 0)
+ return 1;
+ if (charlen != t->char2uni(&str[i], blen - i, &c2))
+ return 1;
+
+ if (exfat_toupper(sb, c1) != exfat_toupper(sb, c2))
+ return 1;
+ }
+
+ return 0;
+}
+
+const struct dentry_operations exfat_dentry_ops = {
+ .d_revalidate = exfat_d_revalidate,
+ .d_hash = exfat_d_hash,
+ .d_compare = exfat_d_cmp,
+};
+
+static int exfat_utf8_d_hash(const struct dentry *dentry, struct qstr *qstr)
+{
+ struct super_block *sb = dentry->d_sb;
+ const unsigned char *name = qstr->name;
+ unsigned int len = exfat_striptail_len(qstr->len, qstr->name);
+ unsigned long hash = init_name_hash(dentry);
+ int i, charlen;
+ unicode_t u;
+
+ for (i = 0; i < len; i += charlen) {
+ charlen = utf8_to_utf32(&name[i], len - i, &u);
+ if (charlen < 0)
+ return charlen;
+
+ /*
+ * Convert to UTF-16: code points above U+FFFF are encoded as
+ * surrogate pairs.
+ * exfat_toupper() works only for code points up to the U+FFFF.
+ */
+ if (u > 0xFFFF) {
+ hash = partial_name_hash(exfat_high_surrogate(u), hash);
+ hash = partial_name_hash(exfat_low_surrogate(u), hash);
+ } else {
+ hash = partial_name_hash(exfat_toupper(sb, u), hash);
+ }
+ }
+
+ qstr->hash = end_name_hash(hash);
+ return 0;
+}
+
+static int exfat_utf8_d_cmp(const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name)
+{
+ struct super_block *sb = dentry->d_sb;
+ unsigned int alen = exfat_striptail_len(name->len, name->name);
+ unsigned int blen = exfat_striptail_len(len, str);
+ unicode_t u_a, u_b;
+ int charlen, i;
+
+ if (alen != blen)
+ return 1;
+
+ for (i = 0; i < alen; i += charlen) {
+ charlen = utf8_to_utf32(&name->name[i], alen - i, &u_a);
+ if (charlen < 0)
+ return 1;
+ if (charlen != utf8_to_utf32(&str[i], blen - i, &u_b))
+ return 1;
+
+ if (u_a <= 0xFFFF && u_b <= 0xFFFF) {
+ if (exfat_toupper(sb, u_a) != exfat_toupper(sb, u_b))
+ return 1;
+ } else if (u_a > 0xFFFF && u_b > 0xFFFF) {
+ if (exfat_low_surrogate(u_a) !=
+ exfat_low_surrogate(u_b) ||
+ exfat_high_surrogate(u_a) !=
+ exfat_high_surrogate(u_b))
+ return 1;
+ } else {
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+const struct dentry_operations exfat_utf8_dentry_ops = {
+ .d_revalidate = exfat_d_revalidate,
+ .d_hash = exfat_utf8_d_hash,
+ .d_compare = exfat_utf8_d_cmp,
+};
+
+/* used only in search empty_slot() */
+#define CNT_UNUSED_NOHIT (-1)
+#define CNT_UNUSED_HIT (-2)
+/* search EMPTY CONTINUOUS "num_entries" entries */
+static int exfat_search_empty_slot(struct super_block *sb,
+ struct exfat_hint_femp *hint_femp, struct exfat_chain *p_dir,
+ int num_entries)
+{
+ int i, dentry, num_empty = 0;
+ int dentries_per_clu;
+ unsigned int type;
+ struct exfat_chain clu;
+ struct exfat_dentry *ep;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct buffer_head *bh;
+
+ dentries_per_clu = sbi->dentries_per_clu;
+
+ if (hint_femp->eidx != EXFAT_HINT_NONE) {
+ dentry = hint_femp->eidx;
+ if (num_entries <= hint_femp->count) {
+ hint_femp->eidx = EXFAT_HINT_NONE;
+ return dentry;
+ }
+
+ exfat_chain_dup(&clu, &hint_femp->cur);
+ } else {
+ exfat_chain_dup(&clu, p_dir);
+ dentry = 0;
+ }
+
+ while (clu.dir != EXFAT_EOF_CLUSTER) {
+ i = dentry & (dentries_per_clu - 1);
+
+ for (; i < dentries_per_clu; i++, dentry++) {
+ ep = exfat_get_dentry(sb, &clu, i, &bh, NULL);
+ if (!ep)
+ return -EIO;
+ type = exfat_get_entry_type(ep);
+ brelse(bh);
+
+ if (type == TYPE_UNUSED || type == TYPE_DELETED) {
+ num_empty++;
+ if (hint_femp->eidx == EXFAT_HINT_NONE) {
+ hint_femp->eidx = dentry;
+ hint_femp->count = CNT_UNUSED_NOHIT;
+ exfat_chain_set(&hint_femp->cur,
+ clu.dir, clu.size, clu.flags);
+ }
+
+ if (type == TYPE_UNUSED &&
+ hint_femp->count != CNT_UNUSED_HIT)
+ hint_femp->count = CNT_UNUSED_HIT;
+ } else {
+ if (hint_femp->eidx != EXFAT_HINT_NONE &&
+ hint_femp->count == CNT_UNUSED_HIT) {
+ /* unused empty group means
+ * an empty group which includes
+ * unused dentry
+ */
+ exfat_fs_error(sb,
+ "found bogus dentry(%d) beyond unused empty group(%d) (start_clu : %u, cur_clu : %u)",
+ dentry, hint_femp->eidx,
+ p_dir->dir, clu.dir);
+ return -EIO;
+ }
+
+ num_empty = 0;
+ hint_femp->eidx = EXFAT_HINT_NONE;
+ }
+
+ if (num_empty >= num_entries) {
+ /* found and invalidate hint_femp */
+ hint_femp->eidx = EXFAT_HINT_NONE;
+ return (dentry - (num_entries - 1));
+ }
+ }
+
+ if (clu.flags == ALLOC_NO_FAT_CHAIN) {
+ if (--clu.size > 0)
+ clu.dir++;
+ else
+ clu.dir = EXFAT_EOF_CLUSTER;
+ } else {
+ if (exfat_get_next_cluster(sb, &clu.dir))
+ return -EIO;
+ }
+ }
+
+ return -ENOSPC;
+}
+
+static int exfat_check_max_dentries(struct inode *inode)
+{
+ if (EXFAT_B_TO_DEN(i_size_read(inode)) >= MAX_EXFAT_DENTRIES) {
+ /*
+ * exFAT spec allows a dir to grow upto 8388608(256MB)
+ * dentries
+ */
+ return -ENOSPC;
+ }
+ return 0;
+}
+
+/* find empty directory entry.
+ * if there isn't any empty slot, expand cluster chain.
+ */
+static int exfat_find_empty_entry(struct inode *inode,
+ struct exfat_chain *p_dir, int num_entries)
+{
+ int dentry;
+ unsigned int ret, last_clu;
+ sector_t sector;
+ loff_t size = 0;
+ struct exfat_chain clu;
+ struct exfat_dentry *ep = NULL;
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct exfat_hint_femp hint_femp;
+
+ hint_femp.eidx = EXFAT_HINT_NONE;
+
+ if (ei->hint_femp.eidx != EXFAT_HINT_NONE) {
+ memcpy(&hint_femp, &ei->hint_femp,
+ sizeof(struct exfat_hint_femp));
+ ei->hint_femp.eidx = EXFAT_HINT_NONE;
+ }
+
+ while ((dentry = exfat_search_empty_slot(sb, &hint_femp, p_dir,
+ num_entries)) < 0) {
+ if (dentry == -EIO)
+ break;
+
+ if (exfat_check_max_dentries(inode))
+ return -ENOSPC;
+
+ /* we trust p_dir->size regardless of FAT type */
+ if (exfat_find_last_cluster(sb, p_dir, &last_clu))
+ return -EIO;
+
+ /*
+ * Allocate new cluster to this directory
+ */
+ exfat_chain_set(&clu, last_clu + 1, 0, p_dir->flags);
+
+ /* allocate a cluster */
+ ret = exfat_alloc_cluster(inode, 1, &clu);
+ if (ret)
+ return ret;
+
+ if (exfat_zeroed_cluster(inode, clu.dir))
+ return -EIO;
+
+ /* append to the FAT chain */
+ if (clu.flags != p_dir->flags) {
+ /* no-fat-chain bit is disabled,
+ * so fat-chain should be synced with alloc-bitmap
+ */
+ exfat_chain_cont_cluster(sb, p_dir->dir, p_dir->size);
+ p_dir->flags = ALLOC_FAT_CHAIN;
+ hint_femp.cur.flags = ALLOC_FAT_CHAIN;
+ }
+
+ if (clu.flags == ALLOC_FAT_CHAIN)
+ if (exfat_ent_set(sb, last_clu, clu.dir))
+ return -EIO;
+
+ if (hint_femp.eidx == EXFAT_HINT_NONE) {
+ /* the special case that new dentry
+ * should be allocated from the start of new cluster
+ */
+ hint_femp.eidx = EXFAT_B_TO_DEN_IDX(p_dir->size, sbi);
+ hint_femp.count = sbi->dentries_per_clu;
+
+ exfat_chain_set(&hint_femp.cur, clu.dir, 0, clu.flags);
+ }
+ hint_femp.cur.size++;
+ p_dir->size++;
+ size = EXFAT_CLU_TO_B(p_dir->size, sbi);
+
+ /* update the directory entry */
+ if (p_dir->dir != sbi->root_dir) {
+ struct buffer_head *bh;
+
+ ep = exfat_get_dentry(sb,
+ &(ei->dir), ei->entry + 1, &bh, &sector);
+ if (!ep)
+ return -EIO;
+
+ ep->dentry.stream.valid_size = cpu_to_le64(size);
+ ep->dentry.stream.size = ep->dentry.stream.valid_size;
+ ep->dentry.stream.flags = p_dir->flags;
+ exfat_update_bh(sb, bh, IS_DIRSYNC(inode));
+ brelse(bh);
+ if (exfat_update_dir_chksum(inode, &(ei->dir),
+ ei->entry))
+ return -EIO;
+ }
+
+ /* directory inode should be updated in here */
+ i_size_write(inode, size);
+ EXFAT_I(inode)->i_size_ondisk += sbi->cluster_size;
+ EXFAT_I(inode)->i_size_aligned += sbi->cluster_size;
+ EXFAT_I(inode)->flags = p_dir->flags;
+ inode->i_blocks += 1 << sbi->sect_per_clus_bits;
+ }
+
+ return dentry;
+}
+
+/*
+ * Name Resolution Functions :
+ * Zero if it was successful; otherwise nonzero.
+ */
+static int __exfat_resolve_path(struct inode *inode, const unsigned char *path,
+ struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname,
+ int lookup)
+{
+ int namelen;
+ int lossy = NLS_NAME_NO_LOSSY;
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+
+ /* strip all trailing periods */
+ namelen = exfat_striptail_len(strlen(path), path);
+ if (!namelen)
+ return -ENOENT;
+
+ if (strlen(path) > (MAX_NAME_LENGTH * MAX_CHARSET_SIZE))
+ return -ENAMETOOLONG;
+
+ /*
+ * strip all leading spaces :
+ * "MS windows 7" supports leading spaces.
+ * So we should skip this preprocessing for compatibility.
+ */
+
+ /* file name conversion :
+ * If lookup case, we allow bad-name for compatibility.
+ */
+ namelen = exfat_nls_to_utf16(sb, path, namelen, p_uniname,
+ &lossy);
+ if (namelen < 0)
+ return namelen; /* return error value */
+
+ if ((lossy && !lookup) || !namelen)
+ return -EINVAL;
+
+ exfat_chain_set(p_dir, ei->start_clu,
+ EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags);
+
+ return 0;
+}
+
+static inline int exfat_resolve_path(struct inode *inode,
+ const unsigned char *path, struct exfat_chain *dir,
+ struct exfat_uni_name *uni)
+{
+ return __exfat_resolve_path(inode, path, dir, uni, 0);
+}
+
+static inline int exfat_resolve_path_for_lookup(struct inode *inode,
+ const unsigned char *path, struct exfat_chain *dir,
+ struct exfat_uni_name *uni)
+{
+ return __exfat_resolve_path(inode, path, dir, uni, 1);
+}
+
+static inline loff_t exfat_make_i_pos(struct exfat_dir_entry *info)
+{
+ return ((loff_t) info->dir.dir << 32) | (info->entry & 0xffffffff);
+}
+
+static int exfat_add_entry(struct inode *inode, const char *path,
+ struct exfat_chain *p_dir, unsigned int type,
+ struct exfat_dir_entry *info)
+{
+ int ret, dentry, num_entries;
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_uni_name uniname;
+ struct exfat_chain clu;
+ int clu_size = 0;
+ unsigned int start_clu = EXFAT_FREE_CLUSTER;
+
+ ret = exfat_resolve_path(inode, path, p_dir, &uniname);
+ if (ret)
+ goto out;
+
+ num_entries = exfat_calc_num_entries(&uniname);
+ if (num_entries < 0) {
+ ret = num_entries;
+ goto out;
+ }
+
+ /* exfat_find_empty_entry must be called before alloc_cluster() */
+ dentry = exfat_find_empty_entry(inode, p_dir, num_entries);
+ if (dentry < 0) {
+ ret = dentry; /* -EIO or -ENOSPC */
+ goto out;
+ }
+
+ if (type == TYPE_DIR) {
+ ret = exfat_alloc_new_dir(inode, &clu);
+ if (ret)
+ goto out;
+ start_clu = clu.dir;
+ clu_size = sbi->cluster_size;
+ }
+
+ /* update the directory entry */
+ /* fill the dos name directory entry information of the created file.
+ * the first cluster is not determined yet. (0)
+ */
+ ret = exfat_init_dir_entry(inode, p_dir, dentry, type,
+ start_clu, clu_size);
+ if (ret)
+ goto out;
+
+ ret = exfat_init_ext_entry(inode, p_dir, dentry, num_entries, &uniname);
+ if (ret)
+ goto out;
+
+ memcpy(&info->dir, p_dir, sizeof(struct exfat_chain));
+ info->entry = dentry;
+ info->flags = ALLOC_NO_FAT_CHAIN;
+ info->type = type;
+
+ if (type == TYPE_FILE) {
+ info->attr = ATTR_ARCHIVE;
+ info->start_clu = EXFAT_EOF_CLUSTER;
+ info->size = 0;
+ info->num_subdirs = 0;
+ } else {
+ int count;
+ struct exfat_chain cdir;
+
+ info->attr = ATTR_SUBDIR;
+ info->start_clu = start_clu;
+ info->size = clu_size;
+
+ exfat_chain_set(&cdir, info->start_clu,
+ EXFAT_B_TO_CLU(info->size, sbi), info->flags);
+ count = exfat_count_dir_entries(sb, &cdir);
+ if (count < 0)
+ return -EIO;
+ info->num_subdirs = count + EXFAT_MIN_SUBDIR;
+ }
+ memset(&info->crtime, 0, sizeof(info->crtime));
+ memset(&info->mtime, 0, sizeof(info->mtime));
+ memset(&info->atime, 0, sizeof(info->atime));
+out:
+ return ret;
+}
+
+static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
+{
+ struct super_block *sb = dir->i_sb;
+ struct inode *inode;
+ struct exfat_chain cdir;
+ struct exfat_dir_entry info;
+ loff_t i_pos;
+ int err;
+
+ mutex_lock(&EXFAT_SB(sb)->s_lock);
+ exfat_set_vol_flags(sb, VOL_DIRTY);
+ err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_FILE,
+ &info);
+ exfat_set_vol_flags(sb, VOL_CLEAN);
+ if (err)
+ goto unlock;
+
+ inode_inc_iversion(dir);
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ if (IS_DIRSYNC(dir))
+ exfat_sync_inode(dir);
+ else
+ mark_inode_dirty(dir);
+
+ i_pos = exfat_make_i_pos(&info);
+ inode = exfat_build_inode(sb, &info, i_pos);
+ if (IS_ERR(inode))
+ goto unlock;
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = inode->i_atime = inode->i_ctime =
+ EXFAT_I(inode)->i_crtime = current_time(inode);
+ /* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+ d_instantiate(dentry, inode);
+unlock:
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ return err;
+}
+
+/* lookup a file */
+static int exfat_find(struct inode *dir, struct qstr *qname,
+ struct exfat_dir_entry *info)
+{
+ int ret, dentry, num_entries, count;
+ struct exfat_chain cdir;
+ struct exfat_uni_name uni_name;
+ struct exfat_dentry *ep, *ep2;
+ struct exfat_entry_set_cache *es = NULL;
+ struct super_block *sb = dir->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_inode_info *ei = EXFAT_I(dir);
+
+ if (qname->len == 0)
+ return -ENOENT;
+
+ /* check the validity of directory name in the given pathname */
+ ret = exfat_resolve_path_for_lookup(dir, qname->name, &cdir, &uni_name);
+ if (ret)
+ return ret;
+
+ num_entries = exfat_calc_num_entries(&uni_name);
+ if (num_entries < 0)
+ return num_entries;
+
+ /* check the validation of hint_stat and initialize it if required */
+ if (ei->version != (inode_peek_iversion_raw(dir) & 0xffffffff)) {
+ ei->hint_stat.clu = cdir.dir;
+ ei->hint_stat.eidx = 0;
+ ei->version = (inode_peek_iversion_raw(dir) & 0xffffffff);
+ ei->hint_femp.eidx = EXFAT_HINT_NONE;
+ }
+
+ /* search the file name for directories */
+ dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name,
+ num_entries, TYPE_ALL);
+
+ if ((dentry < 0) && (dentry != -EEXIST))
+ return dentry; /* -error value */
+
+ memcpy(&info->dir, &cdir.dir, sizeof(struct exfat_chain));
+ info->entry = dentry;
+ info->num_subdirs = 0;
+
+ /* root directory itself */
+ if (unlikely(dentry == -EEXIST)) {
+ int num_clu = 0;
+
+ info->type = TYPE_DIR;
+ info->attr = ATTR_SUBDIR;
+ info->flags = ALLOC_FAT_CHAIN;
+ info->start_clu = sbi->root_dir;
+ memset(&info->crtime, 0, sizeof(info->crtime));
+ memset(&info->mtime, 0, sizeof(info->mtime));
+ memset(&info->atime, 0, sizeof(info->atime));
+
+ exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
+ if (exfat_count_num_clusters(sb, &cdir, &num_clu))
+ return -EIO;
+ info->size = num_clu << sbi->cluster_size_bits;
+
+ count = exfat_count_dir_entries(sb, &cdir);
+ if (count < 0)
+ return -EIO;
+
+ info->num_subdirs = count;
+ } else {
+ es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES, &ep);
+ if (!es)
+ return -EIO;
+ ep2 = ep + 1;
+
+ info->type = exfat_get_entry_type(ep);
+ info->attr = le16_to_cpu(ep->dentry.file.attr);
+ info->size = le64_to_cpu(ep2->dentry.stream.valid_size);
+ if ((info->type == TYPE_FILE) && (info->size == 0)) {
+ info->flags = ALLOC_NO_FAT_CHAIN;
+ info->start_clu = EXFAT_EOF_CLUSTER;
+ } else {
+ info->flags = ep2->dentry.stream.flags;
+ info->start_clu =
+ le32_to_cpu(ep2->dentry.stream.start_clu);
+ }
+
+ if (ei->start_clu == EXFAT_FREE_CLUSTER) {
+ exfat_fs_error(sb,
+ "non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)",
+ i_size_read(dir), ei->dir.dir, ei->entry);
+ return -EIO;
+ }
+
+ exfat_get_entry_time(sbi, &info->crtime,
+ ep->dentry.file.create_tz,
+ ep->dentry.file.create_time,
+ ep->dentry.file.create_date,
+ ep->dentry.file.create_time_ms);
+ exfat_get_entry_time(sbi, &info->mtime,
+ ep->dentry.file.modify_tz,
+ ep->dentry.file.modify_time,
+ ep->dentry.file.modify_date,
+ ep->dentry.file.modify_time_ms);
+ exfat_get_entry_time(sbi, &info->atime,
+ ep->dentry.file.access_tz,
+ ep->dentry.file.access_time,
+ ep->dentry.file.access_date,
+ 0);
+ kfree(es);
+
+ if (info->type == TYPE_DIR) {
+ exfat_chain_set(&cdir, info->start_clu,
+ EXFAT_B_TO_CLU(info->size, sbi), info->flags);
+ count = exfat_count_dir_entries(sb, &cdir);
+ if (count < 0)
+ return -EIO;
+
+ info->num_subdirs = count + EXFAT_MIN_SUBDIR;
+ }
+ }
+ return 0;
+}
+
+static int exfat_d_anon_disconn(struct dentry *dentry)
+{
+ return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED);
+}
+
+static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct super_block *sb = dir->i_sb;
+ struct inode *inode;
+ struct dentry *alias;
+ struct exfat_dir_entry info;
+ int err;
+ loff_t i_pos;
+ mode_t i_mode;
+
+ mutex_lock(&EXFAT_SB(sb)->s_lock);
+ err = exfat_find(dir, &dentry->d_name, &info);
+ if (err) {
+ if (err == -ENOENT) {
+ inode = NULL;
+ goto out;
+ }
+ goto unlock;
+ }
+
+ i_pos = exfat_make_i_pos(&info);
+ inode = exfat_build_inode(sb, &info, i_pos);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto unlock;
+ }
+
+ i_mode = inode->i_mode;
+ alias = d_find_alias(inode);
+
+ /*
+ * Checking "alias->d_parent == dentry->d_parent" to make sure
+ * FS is not corrupted (especially double linked dir).
+ */
+ if (alias && alias->d_parent == dentry->d_parent &&
+ !exfat_d_anon_disconn(alias)) {
+
+ /*
+ * Unhashed alias is able to exist because of revalidate()
+ * called by lookup_fast. You can easily make this status
+ * by calling create and lookup concurrently
+ * In such case, we reuse an alias instead of new dentry
+ */
+ if (d_unhashed(alias)) {
+ WARN_ON(alias->d_name.hash_len !=
+ dentry->d_name.hash_len);
+ exfat_msg(sb, KERN_INFO,
+ "rehashed a dentry(%p) in read lookup", alias);
+ d_drop(dentry);
+ d_rehash(alias);
+ } else if (!S_ISDIR(i_mode)) {
+ /*
+ * This inode has non anonymous-DCACHE_DISCONNECTED
+ * dentry. This means, the user did ->lookup() by an
+ * another name (longname vs 8.3 alias of it) in past.
+ *
+ * Switch to new one for reason of locality if possible.
+ */
+ d_move(alias, dentry);
+ }
+ iput(inode);
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ return alias;
+ }
+ dput(alias);
+out:
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ if (!inode)
+ exfat_d_version_set(dentry, inode_query_iversion(dir));
+
+ return d_splice_alias(inode, dentry);
+unlock:
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ return ERR_PTR(err);
+}
+
+/* remove an entry, BUT don't truncate */
+static int exfat_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct exfat_chain cdir;
+ struct exfat_dentry *ep;
+ struct super_block *sb = dir->i_sb;
+ struct inode *inode = dentry->d_inode;
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct buffer_head *bh;
+ sector_t sector;
+ int num_entries, entry, err = 0;
+
+ mutex_lock(&EXFAT_SB(sb)->s_lock);
+ exfat_chain_dup(&cdir, &ei->dir);
+ entry = ei->entry;
+ if (ei->dir.dir == DIR_DELETED) {
+ exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry");
+ err = -ENOENT;
+ goto unlock;
+ }
+
+ ep = exfat_get_dentry(sb, &cdir, entry, &bh, &sector);
+ if (!ep) {
+ err = -EIO;
+ goto unlock;
+ }
+ num_entries = exfat_count_ext_entries(sb, &cdir, entry, ep);
+ if (num_entries < 0) {
+ err = -EIO;
+ brelse(bh);
+ goto unlock;
+ }
+ num_entries++;
+ brelse(bh);
+
+ exfat_set_vol_flags(sb, VOL_DIRTY);
+ /* update the directory entry */
+ if (exfat_remove_entries(dir, &cdir, entry, 0, num_entries)) {
+ err = -EIO;
+ goto unlock;
+ }
+
+ /* This doesn't modify ei */
+ ei->dir.dir = DIR_DELETED;
+ exfat_set_vol_flags(sb, VOL_CLEAN);
+
+ inode_inc_iversion(dir);
+ dir->i_mtime = dir->i_atime = current_time(dir);
+ if (IS_DIRSYNC(dir))
+ exfat_sync_inode(dir);
+ else
+ mark_inode_dirty(dir);
+
+ clear_nlink(inode);
+ inode->i_mtime = inode->i_atime = current_time(inode);
+ exfat_unhash_inode(inode);
+ exfat_d_version_set(dentry, inode_query_iversion(dir));
+unlock:
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ return err;
+}
+
+static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct super_block *sb = dir->i_sb;
+ struct inode *inode;
+ struct exfat_dir_entry info;
+ struct exfat_chain cdir;
+ loff_t i_pos;
+ int err;
+
+ mutex_lock(&EXFAT_SB(sb)->s_lock);
+ exfat_set_vol_flags(sb, VOL_DIRTY);
+ err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_DIR,
+ &info);
+ exfat_set_vol_flags(sb, VOL_CLEAN);
+ if (err)
+ goto unlock;
+
+ inode_inc_iversion(dir);
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ if (IS_DIRSYNC(dir))
+ exfat_sync_inode(dir);
+ else
+ mark_inode_dirty(dir);
+ inc_nlink(dir);
+
+ i_pos = exfat_make_i_pos(&info);
+ inode = exfat_build_inode(sb, &info, i_pos);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto unlock;
+ }
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = inode->i_atime = inode->i_ctime =
+ EXFAT_I(inode)->i_crtime = current_time(inode);
+ /* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+ d_instantiate(dentry, inode);
+
+unlock:
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ return err;
+}
+
+static int exfat_check_dir_empty(struct super_block *sb,
+ struct exfat_chain *p_dir)
+{
+ int i, dentries_per_clu;
+ unsigned int type;
+ struct exfat_chain clu;
+ struct exfat_dentry *ep;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct buffer_head *bh;
+
+ dentries_per_clu = sbi->dentries_per_clu;
+
+ exfat_chain_dup(&clu, p_dir);
+
+ while (clu.dir != EXFAT_EOF_CLUSTER) {
+ for (i = 0; i < dentries_per_clu; i++) {
+ ep = exfat_get_dentry(sb, &clu, i, &bh, NULL);
+ if (!ep)
+ return -EIO;
+ type = exfat_get_entry_type(ep);
+ brelse(bh);
+ if (type == TYPE_UNUSED)
+ return 0;
+
+ if (type != TYPE_FILE && type != TYPE_DIR)
+ continue;
+
+ return -ENOTEMPTY;
+ }
+
+ if (clu.flags == ALLOC_NO_FAT_CHAIN) {
+ if (--clu.size > 0)
+ clu.dir++;
+ else
+ clu.dir = EXFAT_EOF_CLUSTER;
+ } else {
+ if (exfat_get_next_cluster(sb, &(clu.dir)))
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ struct exfat_dentry *ep;
+ struct exfat_chain cdir, clu_to_free;
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct buffer_head *bh;
+ sector_t sector;
+ int num_entries, entry, err;
+
+ mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock);
+
+ exfat_chain_dup(&cdir, &ei->dir);
+ entry = ei->entry;
+
+ if (ei->dir.dir == DIR_DELETED) {
+ exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry");
+ err = -ENOENT;
+ goto unlock;
+ }
+
+ exfat_set_vol_flags(sb, VOL_DIRTY);
+ exfat_chain_set(&clu_to_free, ei->start_clu,
+ EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi), ei->flags);
+
+ err = exfat_check_dir_empty(sb, &clu_to_free);
+ if (err) {
+ if (err == -EIO)
+ exfat_msg(sb, KERN_ERR,
+ "failed to exfat_check_dir_empty : err(%d)",
+ err);
+ goto unlock;
+ }
+
+ ep = exfat_get_dentry(sb, &cdir, entry, &bh, &sector);
+ if (!ep) {
+ err = -EIO;
+ goto unlock;
+ }
+
+ num_entries = exfat_count_ext_entries(sb, &cdir, entry, ep);
+ if (num_entries < 0) {
+ err = -EIO;
+ brelse(bh);
+ goto unlock;
+ }
+ num_entries++;
+ brelse(bh);
+
+ err = exfat_remove_entries(dir, &cdir, entry, 0, num_entries);
+ if (err) {
+ exfat_msg(sb, KERN_ERR,
+ "failed to exfat_remove_entries : err(%d)",
+ err);
+ goto unlock;
+ }
+ ei->dir.dir = DIR_DELETED;
+ exfat_set_vol_flags(sb, VOL_CLEAN);
+
+ inode_inc_iversion(dir);
+ dir->i_mtime = dir->i_atime = current_time(dir);
+ if (IS_DIRSYNC(dir))
+ exfat_sync_inode(dir);
+ else
+ mark_inode_dirty(dir);
+ drop_nlink(dir);
+
+ clear_nlink(inode);
+ inode->i_mtime = inode->i_atime = current_time(inode);
+ exfat_unhash_inode(inode);
+ exfat_d_version_set(dentry, inode_query_iversion(dir));
+unlock:
+ mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock);
+ return err;
+}
+
+static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
+ int oldentry, struct exfat_uni_name *p_uniname,
+ struct exfat_inode_info *ei)
+{
+ int ret, num_old_entries, num_new_entries;
+ sector_t sector_old, sector_new;
+ struct exfat_dentry *epold, *epnew;
+ struct super_block *sb = inode->i_sb;
+ struct buffer_head *new_bh, *old_bh;
+ int sync = IS_DIRSYNC(inode);
+
+ epold = exfat_get_dentry(sb, p_dir, oldentry, &old_bh, &sector_old);
+ if (!epold)
+ return -EIO;
+
+ num_old_entries = exfat_count_ext_entries(sb, p_dir, oldentry, epold);
+ if (num_old_entries < 0)
+ return -EIO;
+ num_old_entries++;
+
+ num_new_entries = exfat_calc_num_entries(p_uniname);
+ if (num_new_entries < 0)
+ return num_new_entries;
+
+ if (num_old_entries < num_new_entries) {
+ int newentry;
+
+ newentry =
+ exfat_find_empty_entry(inode, p_dir, num_new_entries);
+ if (newentry < 0)
+ return newentry; /* -EIO or -ENOSPC */
+
+ epnew = exfat_get_dentry(sb, p_dir, newentry, &new_bh,
+ &sector_new);
+ if (!epnew)
+ return -EIO;
+
+ memcpy(epnew, epold, DENTRY_SIZE);
+ if (exfat_get_entry_type(epnew) == TYPE_FILE) {
+ epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
+ ei->attr |= ATTR_ARCHIVE;
+ }
+ exfat_update_bh(sb, new_bh, sync);
+ brelse(old_bh);
+ brelse(new_bh);
+
+ epold = exfat_get_dentry(sb, p_dir, oldentry + 1, &old_bh,
+ &sector_old);
+ epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh,
+ &sector_new);
+ if (!epold || !epnew)
+ return -EIO;
+
+ memcpy(epnew, epold, DENTRY_SIZE);
+ exfat_update_bh(sb, new_bh, sync);
+ brelse(old_bh);
+ brelse(new_bh);
+
+ ret = exfat_init_ext_entry(inode, p_dir, newentry,
+ num_new_entries, p_uniname);
+ if (ret)
+ return ret;
+
+ exfat_remove_entries(inode, p_dir, oldentry, 0,
+ num_old_entries);
+ ei->entry = newentry;
+ } else {
+ if (exfat_get_entry_type(epold) == TYPE_FILE) {
+ epold->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
+ ei->attr |= ATTR_ARCHIVE;
+ }
+ exfat_update_bh(sb, old_bh, sync);
+ brelse(old_bh);
+ ret = exfat_init_ext_entry(inode, p_dir, oldentry,
+ num_new_entries, p_uniname);
+ if (ret)
+ return ret;
+
+ exfat_remove_entries(inode, p_dir, oldentry, num_new_entries,
+ num_old_entries);
+ }
+ return 0;
+}
+
+static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
+ int oldentry, struct exfat_chain *p_newdir,
+ struct exfat_uni_name *p_uniname, struct exfat_inode_info *ei)
+{
+ int ret, newentry, num_new_entries, num_old_entries;
+ sector_t sector_mov, sector_new;
+ struct exfat_dentry *epmov, *epnew;
+ struct super_block *sb = inode->i_sb;
+ struct buffer_head *mov_bh, *new_bh;
+
+ epmov = exfat_get_dentry(sb, p_olddir, oldentry, &mov_bh, &sector_mov);
+ if (!epmov)
+ return -EIO;
+
+ /* check if the source and target directory is the same */
+ if (exfat_get_entry_type(epmov) == TYPE_DIR &&
+ le32_to_cpu(epmov->dentry.stream.start_clu) == p_newdir->dir)
+ return -EINVAL;
+
+ num_old_entries = exfat_count_ext_entries(sb, p_olddir, oldentry,
+ epmov);
+ if (num_old_entries < 0)
+ return -EIO;
+ num_old_entries++;
+
+ num_new_entries = exfat_calc_num_entries(p_uniname);
+ if (num_new_entries < 0)
+ return num_new_entries;
+
+ newentry = exfat_find_empty_entry(inode, p_newdir, num_new_entries);
+ if (newentry < 0)
+ return newentry; /* -EIO or -ENOSPC */
+
+ epnew = exfat_get_dentry(sb, p_newdir, newentry, &new_bh, &sector_new);
+ if (!epnew)
+ return -EIO;
+
+ memcpy(epnew, epmov, DENTRY_SIZE);
+ if (exfat_get_entry_type(epnew) == TYPE_FILE) {
+ epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
+ ei->attr |= ATTR_ARCHIVE;
+ }
+ exfat_update_bh(sb, new_bh, IS_DIRSYNC(inode));
+ brelse(mov_bh);
+ brelse(new_bh);
+
+ epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh,
+ &sector_mov);
+ epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh,
+ &sector_new);
+ if (!epmov || !epnew)
+ return -EIO;
+
+ memcpy(epnew, epmov, DENTRY_SIZE);
+ exfat_update_bh(sb, new_bh, IS_DIRSYNC(inode));
+ brelse(mov_bh);
+ brelse(new_bh);
+
+ ret = exfat_init_ext_entry(inode, p_newdir, newentry, num_new_entries,
+ p_uniname);
+ if (ret)
+ return ret;
+
+ exfat_remove_entries(inode, p_olddir, oldentry, 0, num_old_entries);
+
+ exfat_chain_set(&ei->dir, p_newdir->dir, p_newdir->size,
+ p_newdir->flags);
+
+ ei->entry = newentry;
+ return 0;
+}
+
+static void exfat_update_parent_info(struct exfat_inode_info *ei,
+ struct inode *parent_inode)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(parent_inode->i_sb);
+ struct exfat_inode_info *parent_ei = EXFAT_I(parent_inode);
+ loff_t parent_isize = i_size_read(parent_inode);
+
+ /*
+ * the problem that struct exfat_inode_info caches wrong parent info.
+ *
+ * because of flag-mismatch of ei->dir,
+ * there is abnormal traversing cluster chain.
+ */
+ if (unlikely(parent_ei->flags != ei->dir.flags ||
+ parent_isize != EXFAT_CLU_TO_B(ei->dir.size, sbi) ||
+ parent_ei->start_clu != ei->dir.dir)) {
+ exfat_chain_set(&ei->dir, parent_ei->start_clu,
+ EXFAT_B_TO_CLU_ROUND_UP(parent_isize, sbi),
+ parent_ei->flags);
+ }
+}
+
+/* rename or move a old file into a new file */
+static int __exfat_rename(struct inode *old_parent_inode,
+ struct exfat_inode_info *ei, struct inode *new_parent_inode,
+ struct dentry *new_dentry)
+{
+ int ret;
+ int dentry;
+ struct exfat_chain olddir, newdir;
+ struct exfat_chain *p_dir = NULL;
+ struct exfat_uni_name uni_name;
+ struct exfat_dentry *ep;
+ struct super_block *sb = old_parent_inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ const unsigned char *new_path = new_dentry->d_name.name;
+ struct inode *new_inode = new_dentry->d_inode;
+ int num_entries;
+ struct exfat_inode_info *new_ei = NULL;
+ unsigned int new_entry_type = TYPE_UNUSED;
+ int new_entry = 0;
+ struct buffer_head *old_bh, *new_bh = NULL;
+
+ /* check the validity of pointer parameters */
+ if (new_path == NULL || strlen(new_path) == 0)
+ return -EINVAL;
+
+ if (ei->dir.dir == DIR_DELETED) {
+ exfat_msg(sb, KERN_ERR,
+ "abnormal access to deleted source dentry");
+ return -ENOENT;
+ }
+
+ exfat_update_parent_info(ei, old_parent_inode);
+
+ exfat_chain_dup(&olddir, &ei->dir);
+ dentry = ei->entry;
+
+ ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh, NULL);
+ if (!ep) {
+ ret = -EIO;
+ goto out;
+ }
+ brelse(old_bh);
+
+ /* check whether new dir is existing directory and empty */
+ if (new_inode) {
+ ret = -EIO;
+ new_ei = EXFAT_I(new_inode);
+
+ if (new_ei->dir.dir == DIR_DELETED) {
+ exfat_msg(sb, KERN_ERR,
+ "abnormal access to deleted target dentry");
+ goto out;
+ }
+
+ exfat_update_parent_info(new_ei, new_parent_inode);
+
+ p_dir = &(new_ei->dir);
+ new_entry = new_ei->entry;
+ ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh, NULL);
+ if (!ep)
+ goto out;
+
+ new_entry_type = exfat_get_entry_type(ep);
+ brelse(new_bh);
+
+ /* if new_inode exists, update ei */
+ if (new_entry_type == TYPE_DIR) {
+ struct exfat_chain new_clu;
+
+ new_clu.dir = new_ei->start_clu;
+ new_clu.size =
+ EXFAT_B_TO_CLU_ROUND_UP(i_size_read(new_inode),
+ sbi);
+ new_clu.flags = new_ei->flags;
+
+ ret = exfat_check_dir_empty(sb, &new_clu);
+ if (ret)
+ goto out;
+ }
+ }
+
+ /* check the validity of directory name in the given new pathname */
+ ret = exfat_resolve_path(new_parent_inode, new_path, &newdir,
+ &uni_name);
+ if (ret)
+ goto out;
+
+ exfat_set_vol_flags(sb, VOL_DIRTY);
+
+ if (olddir.dir == newdir.dir)
+ ret = exfat_rename_file(new_parent_inode, &olddir, dentry,
+ &uni_name, ei);
+ else
+ ret = exfat_move_file(new_parent_inode, &olddir, dentry,
+ &newdir, &uni_name, ei);
+
+ if (!ret && new_inode) {
+ /* delete entries of new_dir */
+ ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh, NULL);
+ if (!ep) {
+ ret = -EIO;
+ goto del_out;
+ }
+
+ num_entries = exfat_count_ext_entries(sb, p_dir, new_entry, ep);
+ if (num_entries < 0) {
+ ret = -EIO;
+ goto del_out;
+ }
+ brelse(new_bh);
+
+ if (exfat_remove_entries(new_inode, p_dir, new_entry, 0,
+ num_entries + 1)) {
+ ret = -EIO;
+ goto del_out;
+ }
+
+ /* Free the clusters if new_inode is a dir(as if exfat_rmdir) */
+ if (new_entry_type == TYPE_DIR) {
+ /* new_ei, new_clu_to_free */
+ struct exfat_chain new_clu_to_free;
+
+ exfat_chain_set(&new_clu_to_free, new_ei->start_clu,
+ EXFAT_B_TO_CLU_ROUND_UP(i_size_read(new_inode),
+ sbi), new_ei->flags);
+
+ if (exfat_free_cluster(new_inode, &new_clu_to_free)) {
+ /* just set I/O error only */
+ ret = -EIO;
+ }
+
+ i_size_write(new_inode, 0);
+ new_ei->start_clu = EXFAT_EOF_CLUSTER;
+ new_ei->flags = ALLOC_NO_FAT_CHAIN;
+ }
+del_out:
+ /* Update new_inode ei
+ * Prevent syncing removed new_inode
+ * (new_ei is already initialized above code ("if (new_inode)")
+ */
+ new_ei->dir.dir = DIR_DELETED;
+ }
+ exfat_set_vol_flags(sb, VOL_CLEAN);
+out:
+ return ret;
+}
+
+static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ struct inode *old_inode, *new_inode;
+ struct super_block *sb = old_dir->i_sb;
+ loff_t i_pos;
+ int err;
+
+ /*
+ * The VFS already checks for existence, so for local filesystems
+ * the RENAME_NOREPLACE implementation is equivalent to plain rename.
+ * Don't support any other flags
+ */
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
+ mutex_lock(&EXFAT_SB(sb)->s_lock);
+ old_inode = old_dentry->d_inode;
+ new_inode = new_dentry->d_inode;
+
+ err = __exfat_rename(old_dir, EXFAT_I(old_inode), new_dir, new_dentry);
+ if (err)
+ goto unlock;
+
+ inode_inc_iversion(new_dir);
+ new_dir->i_ctime = new_dir->i_mtime = new_dir->i_atime =
+ EXFAT_I(new_dir)->i_crtime = current_time(new_dir);
+ if (IS_DIRSYNC(new_dir))
+ exfat_sync_inode(new_dir);
+ else
+ mark_inode_dirty(new_dir);
+
+ i_pos = ((loff_t)EXFAT_I(old_inode)->dir.dir << 32) |
+ (EXFAT_I(old_inode)->entry & 0xffffffff);
+ exfat_unhash_inode(old_inode);
+ exfat_hash_inode(old_inode, i_pos);
+ if (IS_DIRSYNC(new_dir))
+ exfat_sync_inode(old_inode);
+ else
+ mark_inode_dirty(old_inode);
+
+ if (S_ISDIR(old_inode->i_mode) && old_dir != new_dir) {
+ drop_nlink(old_dir);
+ if (!new_inode)
+ inc_nlink(new_dir);
+ }
+
+ inode_inc_iversion(old_dir);
+ old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
+ if (IS_DIRSYNC(old_dir))
+ exfat_sync_inode(old_dir);
+ else
+ mark_inode_dirty(old_dir);
+
+ if (new_inode) {
+ exfat_unhash_inode(new_inode);
+
+ /* skip drop_nlink if new_inode already has been dropped */
+ if (new_inode->i_nlink) {
+ drop_nlink(new_inode);
+ if (S_ISDIR(new_inode->i_mode))
+ drop_nlink(new_inode);
+ } else {
+ exfat_msg(sb, KERN_WARNING,
+ "abnormal access to an inode dropped");
+ WARN_ON(new_inode->i_nlink == 0);
+ }
+ new_inode->i_ctime = EXFAT_I(new_inode)->i_crtime =
+ current_time(new_inode);
+ }
+
+unlock:
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ return err;
+}
+
+const struct inode_operations exfat_dir_inode_operations = {
+ .create = exfat_create,
+ .lookup = exfat_lookup,
+ .unlink = exfat_unlink,
+ .mkdir = exfat_mkdir,
+ .rmdir = exfat_rmdir,
+ .rename = exfat_rename,
+ .setattr = exfat_setattr,
+ .getattr = exfat_getattr,
+};
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
new file mode 100644
index 000000000000..6d1c3ae130ff
--- /dev/null
+++ b/fs/exfat/nls.c
@@ -0,0 +1,831 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <asm/unaligned.h>
+
+#include "exfat_raw.h"
+#include "exfat_fs.h"
+
+/* Upcase tabel macro */
+#define EXFAT_NUM_UPCASE (2918)
+#define UTBL_COUNT (0x10000)
+
+/*
+ * Upcase table in compressed format (7.2.5.1 Recommended Up-case Table
+ * in exfat specification, See:
+ * https://docs.microsoft.com/en-us/windows/win32/fileio/exfat-specification).
+ */
+static const unsigned short uni_def_upcase[EXFAT_NUM_UPCASE] = {
+ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
+ 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
+ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
+ 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
+ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
+ 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
+ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
+ 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
+ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+ 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
+ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
+ 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
+ 0x0060, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+ 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
+ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
+ 0x0058, 0x0059, 0x005a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
+ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
+ 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
+ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
+ 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
+ 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
+ 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
+ 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
+ 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
+ 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
+ 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
+ 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
+ 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
+ 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
+ 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
+ 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00f7,
+ 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178,
+ 0x0100, 0x0100, 0x0102, 0x0102, 0x0104, 0x0104, 0x0106, 0x0106,
+ 0x0108, 0x0108, 0x010a, 0x010a, 0x010c, 0x010c, 0x010e, 0x010e,
+ 0x0110, 0x0110, 0x0112, 0x0112, 0x0114, 0x0114, 0x0116, 0x0116,
+ 0x0118, 0x0118, 0x011a, 0x011a, 0x011c, 0x011c, 0x011e, 0x011e,
+ 0x0120, 0x0120, 0x0122, 0x0122, 0x0124, 0x0124, 0x0126, 0x0126,
+ 0x0128, 0x0128, 0x012a, 0x012a, 0x012c, 0x012c, 0x012e, 0x012e,
+ 0x0130, 0x0131, 0x0132, 0x0132, 0x0134, 0x0134, 0x0136, 0x0136,
+ 0x0138, 0x0139, 0x0139, 0x013b, 0x013b, 0x013d, 0x013d, 0x013f,
+ 0x013f, 0x0141, 0x0141, 0x0143, 0x0143, 0x0145, 0x0145, 0x0147,
+ 0x0147, 0x0149, 0x014a, 0x014a, 0x014c, 0x014c, 0x014e, 0x014e,
+ 0x0150, 0x0150, 0x0152, 0x0152, 0x0154, 0x0154, 0x0156, 0x0156,
+ 0x0158, 0x0158, 0x015a, 0x015a, 0x015c, 0x015c, 0x015e, 0x015e,
+ 0x0160, 0x0160, 0x0162, 0x0162, 0x0164, 0x0164, 0x0166, 0x0166,
+ 0x0168, 0x0168, 0x016a, 0x016a, 0x016c, 0x016c, 0x016e, 0x016e,
+ 0x0170, 0x0170, 0x0172, 0x0172, 0x0174, 0x0174, 0x0176, 0x0176,
+ 0x0178, 0x0179, 0x0179, 0x017b, 0x017b, 0x017d, 0x017d, 0x017f,
+ 0x0243, 0x0181, 0x0182, 0x0182, 0x0184, 0x0184, 0x0186, 0x0187,
+ 0x0187, 0x0189, 0x018a, 0x018b, 0x018b, 0x018d, 0x018e, 0x018f,
+ 0x0190, 0x0191, 0x0191, 0x0193, 0x0194, 0x01f6, 0x0196, 0x0197,
+ 0x0198, 0x0198, 0x023d, 0x019b, 0x019c, 0x019d, 0x0220, 0x019f,
+ 0x01a0, 0x01a0, 0x01a2, 0x01a2, 0x01a4, 0x01a4, 0x01a6, 0x01a7,
+ 0x01a7, 0x01a9, 0x01aa, 0x01ab, 0x01ac, 0x01ac, 0x01ae, 0x01af,
+ 0x01af, 0x01b1, 0x01b2, 0x01b3, 0x01b3, 0x01b5, 0x01b5, 0x01b7,
+ 0x01b8, 0x01b8, 0x01ba, 0x01bb, 0x01bc, 0x01bc, 0x01be, 0x01f7,
+ 0x01c0, 0x01c1, 0x01c2, 0x01c3, 0x01c4, 0x01c5, 0x01c4, 0x01c7,
+ 0x01c8, 0x01c7, 0x01ca, 0x01cb, 0x01ca, 0x01cd, 0x01cd, 0x01cf,
+ 0x01cf, 0x01d1, 0x01d1, 0x01d3, 0x01d3, 0x01d5, 0x01d5, 0x01d7,
+ 0x01d7, 0x01d9, 0x01d9, 0x01db, 0x01db, 0x018e, 0x01de, 0x01de,
+ 0x01e0, 0x01e0, 0x01e2, 0x01e2, 0x01e4, 0x01e4, 0x01e6, 0x01e6,
+ 0x01e8, 0x01e8, 0x01ea, 0x01ea, 0x01ec, 0x01ec, 0x01ee, 0x01ee,
+ 0x01f0, 0x01f1, 0x01f2, 0x01f1, 0x01f4, 0x01f4, 0x01f6, 0x01f7,
+ 0x01f8, 0x01f8, 0x01fa, 0x01fa, 0x01fc, 0x01fc, 0x01fe, 0x01fe,
+ 0x0200, 0x0200, 0x0202, 0x0202, 0x0204, 0x0204, 0x0206, 0x0206,
+ 0x0208, 0x0208, 0x020a, 0x020a, 0x020c, 0x020c, 0x020e, 0x020e,
+ 0x0210, 0x0210, 0x0212, 0x0212, 0x0214, 0x0214, 0x0216, 0x0216,
+ 0x0218, 0x0218, 0x021a, 0x021a, 0x021c, 0x021c, 0x021e, 0x021e,
+ 0x0220, 0x0221, 0x0222, 0x0222, 0x0224, 0x0224, 0x0226, 0x0226,
+ 0x0228, 0x0228, 0x022a, 0x022a, 0x022c, 0x022c, 0x022e, 0x022e,
+ 0x0230, 0x0230, 0x0232, 0x0232, 0x0234, 0x0235, 0x0236, 0x0237,
+ 0x0238, 0x0239, 0x2c65, 0x023b, 0x023b, 0x023d, 0x2c66, 0x023f,
+ 0x0240, 0x0241, 0x0241, 0x0243, 0x0244, 0x0245, 0x0246, 0x0246,
+ 0x0248, 0x0248, 0x024a, 0x024a, 0x024c, 0x024c, 0x024e, 0x024e,
+ 0x0250, 0x0251, 0x0252, 0x0181, 0x0186, 0x0255, 0x0189, 0x018a,
+ 0x0258, 0x018f, 0x025a, 0x0190, 0x025c, 0x025d, 0x025e, 0x025f,
+ 0x0193, 0x0261, 0x0262, 0x0194, 0x0264, 0x0265, 0x0266, 0x0267,
+ 0x0197, 0x0196, 0x026a, 0x2c62, 0x026c, 0x026d, 0x026e, 0x019c,
+ 0x0270, 0x0271, 0x019d, 0x0273, 0x0274, 0x019f, 0x0276, 0x0277,
+ 0x0278, 0x0279, 0x027a, 0x027b, 0x027c, 0x2c64, 0x027e, 0x027f,
+ 0x01a6, 0x0281, 0x0282, 0x01a9, 0x0284, 0x0285, 0x0286, 0x0287,
+ 0x01ae, 0x0244, 0x01b1, 0x01b2, 0x0245, 0x028d, 0x028e, 0x028f,
+ 0x0290, 0x0291, 0x01b7, 0x0293, 0x0294, 0x0295, 0x0296, 0x0297,
+ 0x0298, 0x0299, 0x029a, 0x029b, 0x029c, 0x029d, 0x029e, 0x029f,
+ 0x02a0, 0x02a1, 0x02a2, 0x02a3, 0x02a4, 0x02a5, 0x02a6, 0x02a7,
+ 0x02a8, 0x02a9, 0x02aa, 0x02ab, 0x02ac, 0x02ad, 0x02ae, 0x02af,
+ 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x02b5, 0x02b6, 0x02b7,
+ 0x02b8, 0x02b9, 0x02ba, 0x02bb, 0x02bc, 0x02bd, 0x02be, 0x02bf,
+ 0x02c0, 0x02c1, 0x02c2, 0x02c3, 0x02c4, 0x02c5, 0x02c6, 0x02c7,
+ 0x02c8, 0x02c9, 0x02ca, 0x02cb, 0x02cc, 0x02cd, 0x02ce, 0x02cf,
+ 0x02d0, 0x02d1, 0x02d2, 0x02d3, 0x02d4, 0x02d5, 0x02d6, 0x02d7,
+ 0x02d8, 0x02d9, 0x02da, 0x02db, 0x02dc, 0x02dd, 0x02de, 0x02df,
+ 0x02e0, 0x02e1, 0x02e2, 0x02e3, 0x02e4, 0x02e5, 0x02e6, 0x02e7,
+ 0x02e8, 0x02e9, 0x02ea, 0x02eb, 0x02ec, 0x02ed, 0x02ee, 0x02ef,
+ 0x02f0, 0x02f1, 0x02f2, 0x02f3, 0x02f4, 0x02f5, 0x02f6, 0x02f7,
+ 0x02f8, 0x02f9, 0x02fa, 0x02fb, 0x02fc, 0x02fd, 0x02fe, 0x02ff,
+ 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 0x0307,
+ 0x0308, 0x0309, 0x030a, 0x030b, 0x030c, 0x030d, 0x030e, 0x030f,
+ 0x0310, 0x0311, 0x0312, 0x0313, 0x0314, 0x0315, 0x0316, 0x0317,
+ 0x0318, 0x0319, 0x031a, 0x031b, 0x031c, 0x031d, 0x031e, 0x031f,
+ 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327,
+ 0x0328, 0x0329, 0x032a, 0x032b, 0x032c, 0x032d, 0x032e, 0x032f,
+ 0x0330, 0x0331, 0x0332, 0x0333, 0x0334, 0x0335, 0x0336, 0x0337,
+ 0x0338, 0x0339, 0x033a, 0x033b, 0x033c, 0x033d, 0x033e, 0x033f,
+ 0x0340, 0x0341, 0x0342, 0x0343, 0x0344, 0x0345, 0x0346, 0x0347,
+ 0x0348, 0x0349, 0x034a, 0x034b, 0x034c, 0x034d, 0x034e, 0x034f,
+ 0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357,
+ 0x0358, 0x0359, 0x035a, 0x035b, 0x035c, 0x035d, 0x035e, 0x035f,
+ 0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367,
+ 0x0368, 0x0369, 0x036a, 0x036b, 0x036c, 0x036d, 0x036e, 0x036f,
+ 0x0370, 0x0371, 0x0372, 0x0373, 0x0374, 0x0375, 0x0376, 0x0377,
+ 0x0378, 0x0379, 0x037a, 0x03fd, 0x03fe, 0x03ff, 0x037e, 0x037f,
+ 0x0380, 0x0381, 0x0382, 0x0383, 0x0384, 0x0385, 0x0386, 0x0387,
+ 0x0388, 0x0389, 0x038a, 0x038b, 0x038c, 0x038d, 0x038e, 0x038f,
+ 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
+ 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,
+ 0x03a0, 0x03a1, 0x03a2, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,
+ 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x0386, 0x0388, 0x0389, 0x038a,
+ 0x03b0, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
+ 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,
+ 0x03a0, 0x03a1, 0x03a3, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,
+ 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x038c, 0x038e, 0x038f, 0x03cf,
+ 0x03d0, 0x03d1, 0x03d2, 0x03d3, 0x03d4, 0x03d5, 0x03d6, 0x03d7,
+ 0x03d8, 0x03d8, 0x03da, 0x03da, 0x03dc, 0x03dc, 0x03de, 0x03de,
+ 0x03e0, 0x03e0, 0x03e2, 0x03e2, 0x03e4, 0x03e4, 0x03e6, 0x03e6,
+ 0x03e8, 0x03e8, 0x03ea, 0x03ea, 0x03ec, 0x03ec, 0x03ee, 0x03ee,
+ 0x03f0, 0x03f1, 0x03f9, 0x03f3, 0x03f4, 0x03f5, 0x03f6, 0x03f7,
+ 0x03f7, 0x03f9, 0x03fa, 0x03fa, 0x03fc, 0x03fd, 0x03fe, 0x03ff,
+ 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
+ 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f,
+ 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
+ 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
+ 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
+ 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
+ 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
+ 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
+ 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
+ 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
+ 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
+ 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f,
+ 0x0460, 0x0460, 0x0462, 0x0462, 0x0464, 0x0464, 0x0466, 0x0466,
+ 0x0468, 0x0468, 0x046a, 0x046a, 0x046c, 0x046c, 0x046e, 0x046e,
+ 0x0470, 0x0470, 0x0472, 0x0472, 0x0474, 0x0474, 0x0476, 0x0476,
+ 0x0478, 0x0478, 0x047a, 0x047a, 0x047c, 0x047c, 0x047e, 0x047e,
+ 0x0480, 0x0480, 0x0482, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487,
+ 0x0488, 0x0489, 0x048a, 0x048a, 0x048c, 0x048c, 0x048e, 0x048e,
+ 0x0490, 0x0490, 0x0492, 0x0492, 0x0494, 0x0494, 0x0496, 0x0496,
+ 0x0498, 0x0498, 0x049a, 0x049a, 0x049c, 0x049c, 0x049e, 0x049e,
+ 0x04a0, 0x04a0, 0x04a2, 0x04a2, 0x04a4, 0x04a4, 0x04a6, 0x04a6,
+ 0x04a8, 0x04a8, 0x04aa, 0x04aa, 0x04ac, 0x04ac, 0x04ae, 0x04ae,
+ 0x04b0, 0x04b0, 0x04b2, 0x04b2, 0x04b4, 0x04b4, 0x04b6, 0x04b6,
+ 0x04b8, 0x04b8, 0x04ba, 0x04ba, 0x04bc, 0x04bc, 0x04be, 0x04be,
+ 0x04c0, 0x04c1, 0x04c1, 0x04c3, 0x04c3, 0x04c5, 0x04c5, 0x04c7,
+ 0x04c7, 0x04c9, 0x04c9, 0x04cb, 0x04cb, 0x04cd, 0x04cd, 0x04c0,
+ 0x04d0, 0x04d0, 0x04d2, 0x04d2, 0x04d4, 0x04d4, 0x04d6, 0x04d6,
+ 0x04d8, 0x04d8, 0x04da, 0x04da, 0x04dc, 0x04dc, 0x04de, 0x04de,
+ 0x04e0, 0x04e0, 0x04e2, 0x04e2, 0x04e4, 0x04e4, 0x04e6, 0x04e6,
+ 0x04e8, 0x04e8, 0x04ea, 0x04ea, 0x04ec, 0x04ec, 0x04ee, 0x04ee,
+ 0x04f0, 0x04f0, 0x04f2, 0x04f2, 0x04f4, 0x04f4, 0x04f6, 0x04f6,
+ 0x04f8, 0x04f8, 0x04fa, 0x04fa, 0x04fc, 0x04fc, 0x04fe, 0x04fe,
+ 0x0500, 0x0500, 0x0502, 0x0502, 0x0504, 0x0504, 0x0506, 0x0506,
+ 0x0508, 0x0508, 0x050a, 0x050a, 0x050c, 0x050c, 0x050e, 0x050e,
+ 0x0510, 0x0510, 0x0512, 0x0512, 0x0514, 0x0515, 0x0516, 0x0517,
+ 0x0518, 0x0519, 0x051a, 0x051b, 0x051c, 0x051d, 0x051e, 0x051f,
+ 0x0520, 0x0521, 0x0522, 0x0523, 0x0524, 0x0525, 0x0526, 0x0527,
+ 0x0528, 0x0529, 0x052a, 0x052b, 0x052c, 0x052d, 0x052e, 0x052f,
+ 0x0530, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537,
+ 0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f,
+ 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547,
+ 0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f,
+ 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0x0557,
+ 0x0558, 0x0559, 0x055a, 0x055b, 0x055c, 0x055d, 0x055e, 0x055f,
+ 0x0560, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537,
+ 0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f,
+ 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547,
+ 0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f,
+ 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0xffff,
+ 0x17f6, 0x2c63, 0x1d7e, 0x1d7f, 0x1d80, 0x1d81, 0x1d82, 0x1d83,
+ 0x1d84, 0x1d85, 0x1d86, 0x1d87, 0x1d88, 0x1d89, 0x1d8a, 0x1d8b,
+ 0x1d8c, 0x1d8d, 0x1d8e, 0x1d8f, 0x1d90, 0x1d91, 0x1d92, 0x1d93,
+ 0x1d94, 0x1d95, 0x1d96, 0x1d97, 0x1d98, 0x1d99, 0x1d9a, 0x1d9b,
+ 0x1d9c, 0x1d9d, 0x1d9e, 0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3,
+ 0x1da4, 0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9, 0x1daa, 0x1dab,
+ 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0, 0x1db1, 0x1db2, 0x1db3,
+ 0x1db4, 0x1db5, 0x1db6, 0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb,
+ 0x1dbc, 0x1dbd, 0x1dbe, 0x1dbf, 0x1dc0, 0x1dc1, 0x1dc2, 0x1dc3,
+ 0x1dc4, 0x1dc5, 0x1dc6, 0x1dc7, 0x1dc8, 0x1dc9, 0x1dca, 0x1dcb,
+ 0x1dcc, 0x1dcd, 0x1dce, 0x1dcf, 0x1dd0, 0x1dd1, 0x1dd2, 0x1dd3,
+ 0x1dd4, 0x1dd5, 0x1dd6, 0x1dd7, 0x1dd8, 0x1dd9, 0x1dda, 0x1ddb,
+ 0x1ddc, 0x1ddd, 0x1dde, 0x1ddf, 0x1de0, 0x1de1, 0x1de2, 0x1de3,
+ 0x1de4, 0x1de5, 0x1de6, 0x1de7, 0x1de8, 0x1de9, 0x1dea, 0x1deb,
+ 0x1dec, 0x1ded, 0x1dee, 0x1def, 0x1df0, 0x1df1, 0x1df2, 0x1df3,
+ 0x1df4, 0x1df5, 0x1df6, 0x1df7, 0x1df8, 0x1df9, 0x1dfa, 0x1dfb,
+ 0x1dfc, 0x1dfd, 0x1dfe, 0x1dff, 0x1e00, 0x1e00, 0x1e02, 0x1e02,
+ 0x1e04, 0x1e04, 0x1e06, 0x1e06, 0x1e08, 0x1e08, 0x1e0a, 0x1e0a,
+ 0x1e0c, 0x1e0c, 0x1e0e, 0x1e0e, 0x1e10, 0x1e10, 0x1e12, 0x1e12,
+ 0x1e14, 0x1e14, 0x1e16, 0x1e16, 0x1e18, 0x1e18, 0x1e1a, 0x1e1a,
+ 0x1e1c, 0x1e1c, 0x1e1e, 0x1e1e, 0x1e20, 0x1e20, 0x1e22, 0x1e22,
+ 0x1e24, 0x1e24, 0x1e26, 0x1e26, 0x1e28, 0x1e28, 0x1e2a, 0x1e2a,
+ 0x1e2c, 0x1e2c, 0x1e2e, 0x1e2e, 0x1e30, 0x1e30, 0x1e32, 0x1e32,
+ 0x1e34, 0x1e34, 0x1e36, 0x1e36, 0x1e38, 0x1e38, 0x1e3a, 0x1e3a,
+ 0x1e3c, 0x1e3c, 0x1e3e, 0x1e3e, 0x1e40, 0x1e40, 0x1e42, 0x1e42,
+ 0x1e44, 0x1e44, 0x1e46, 0x1e46, 0x1e48, 0x1e48, 0x1e4a, 0x1e4a,
+ 0x1e4c, 0x1e4c, 0x1e4e, 0x1e4e, 0x1e50, 0x1e50, 0x1e52, 0x1e52,
+ 0x1e54, 0x1e54, 0x1e56, 0x1e56, 0x1e58, 0x1e58, 0x1e5a, 0x1e5a,
+ 0x1e5c, 0x1e5c, 0x1e5e, 0x1e5e, 0x1e60, 0x1e60, 0x1e62, 0x1e62,
+ 0x1e64, 0x1e64, 0x1e66, 0x1e66, 0x1e68, 0x1e68, 0x1e6a, 0x1e6a,
+ 0x1e6c, 0x1e6c, 0x1e6e, 0x1e6e, 0x1e70, 0x1e70, 0x1e72, 0x1e72,
+ 0x1e74, 0x1e74, 0x1e76, 0x1e76, 0x1e78, 0x1e78, 0x1e7a, 0x1e7a,
+ 0x1e7c, 0x1e7c, 0x1e7e, 0x1e7e, 0x1e80, 0x1e80, 0x1e82, 0x1e82,
+ 0x1e84, 0x1e84, 0x1e86, 0x1e86, 0x1e88, 0x1e88, 0x1e8a, 0x1e8a,
+ 0x1e8c, 0x1e8c, 0x1e8e, 0x1e8e, 0x1e90, 0x1e90, 0x1e92, 0x1e92,
+ 0x1e94, 0x1e94, 0x1e96, 0x1e97, 0x1e98, 0x1e99, 0x1e9a, 0x1e9b,
+ 0x1e9c, 0x1e9d, 0x1e9e, 0x1e9f, 0x1ea0, 0x1ea0, 0x1ea2, 0x1ea2,
+ 0x1ea4, 0x1ea4, 0x1ea6, 0x1ea6, 0x1ea8, 0x1ea8, 0x1eaa, 0x1eaa,
+ 0x1eac, 0x1eac, 0x1eae, 0x1eae, 0x1eb0, 0x1eb0, 0x1eb2, 0x1eb2,
+ 0x1eb4, 0x1eb4, 0x1eb6, 0x1eb6, 0x1eb8, 0x1eb8, 0x1eba, 0x1eba,
+ 0x1ebc, 0x1ebc, 0x1ebe, 0x1ebe, 0x1ec0, 0x1ec0, 0x1ec2, 0x1ec2,
+ 0x1ec4, 0x1ec4, 0x1ec6, 0x1ec6, 0x1ec8, 0x1ec8, 0x1eca, 0x1eca,
+ 0x1ecc, 0x1ecc, 0x1ece, 0x1ece, 0x1ed0, 0x1ed0, 0x1ed2, 0x1ed2,
+ 0x1ed4, 0x1ed4, 0x1ed6, 0x1ed6, 0x1ed8, 0x1ed8, 0x1eda, 0x1eda,
+ 0x1edc, 0x1edc, 0x1ede, 0x1ede, 0x1ee0, 0x1ee0, 0x1ee2, 0x1ee2,
+ 0x1ee4, 0x1ee4, 0x1ee6, 0x1ee6, 0x1ee8, 0x1ee8, 0x1eea, 0x1eea,
+ 0x1eec, 0x1eec, 0x1eee, 0x1eee, 0x1ef0, 0x1ef0, 0x1ef2, 0x1ef2,
+ 0x1ef4, 0x1ef4, 0x1ef6, 0x1ef6, 0x1ef8, 0x1ef8, 0x1efa, 0x1efb,
+ 0x1efc, 0x1efd, 0x1efe, 0x1eff, 0x1f08, 0x1f09, 0x1f0a, 0x1f0b,
+ 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f, 0x1f08, 0x1f09, 0x1f0a, 0x1f0b,
+ 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f, 0x1f18, 0x1f19, 0x1f1a, 0x1f1b,
+ 0x1f1c, 0x1f1d, 0x1f16, 0x1f17, 0x1f18, 0x1f19, 0x1f1a, 0x1f1b,
+ 0x1f1c, 0x1f1d, 0x1f1e, 0x1f1f, 0x1f28, 0x1f29, 0x1f2a, 0x1f2b,
+ 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f, 0x1f28, 0x1f29, 0x1f2a, 0x1f2b,
+ 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f, 0x1f38, 0x1f39, 0x1f3a, 0x1f3b,
+ 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f, 0x1f38, 0x1f39, 0x1f3a, 0x1f3b,
+ 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f, 0x1f48, 0x1f49, 0x1f4a, 0x1f4b,
+ 0x1f4c, 0x1f4d, 0x1f46, 0x1f47, 0x1f48, 0x1f49, 0x1f4a, 0x1f4b,
+ 0x1f4c, 0x1f4d, 0x1f4e, 0x1f4f, 0x1f50, 0x1f59, 0x1f52, 0x1f5b,
+ 0x1f54, 0x1f5d, 0x1f56, 0x1f5f, 0x1f58, 0x1f59, 0x1f5a, 0x1f5b,
+ 0x1f5c, 0x1f5d, 0x1f5e, 0x1f5f, 0x1f68, 0x1f69, 0x1f6a, 0x1f6b,
+ 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f, 0x1f68, 0x1f69, 0x1f6a, 0x1f6b,
+ 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f, 0x1fba, 0x1fbb, 0x1fc8, 0x1fc9,
+ 0x1fca, 0x1fcb, 0x1fda, 0x1fdb, 0x1ff8, 0x1ff9, 0x1fea, 0x1feb,
+ 0x1ffa, 0x1ffb, 0x1f7e, 0x1f7f, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b,
+ 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b,
+ 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b,
+ 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b,
+ 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1fa8, 0x1fa9, 0x1faa, 0x1fab,
+ 0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fa8, 0x1fa9, 0x1faa, 0x1fab,
+ 0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fb8, 0x1fb9, 0x1fb2, 0x1fbc,
+ 0x1fb4, 0x1fb5, 0x1fb6, 0x1fb7, 0x1fb8, 0x1fb9, 0x1fba, 0x1fbb,
+ 0x1fbc, 0x1fbd, 0x1fbe, 0x1fbf, 0x1fc0, 0x1fc1, 0x1fc2, 0x1fc3,
+ 0x1fc4, 0x1fc5, 0x1fc6, 0x1fc7, 0x1fc8, 0x1fc9, 0x1fca, 0x1fcb,
+ 0x1fc3, 0x1fcd, 0x1fce, 0x1fcf, 0x1fd8, 0x1fd9, 0x1fd2, 0x1fd3,
+ 0x1fd4, 0x1fd5, 0x1fd6, 0x1fd7, 0x1fd8, 0x1fd9, 0x1fda, 0x1fdb,
+ 0x1fdc, 0x1fdd, 0x1fde, 0x1fdf, 0x1fe8, 0x1fe9, 0x1fe2, 0x1fe3,
+ 0x1fe4, 0x1fec, 0x1fe6, 0x1fe7, 0x1fe8, 0x1fe9, 0x1fea, 0x1feb,
+ 0x1fec, 0x1fed, 0x1fee, 0x1fef, 0x1ff0, 0x1ff1, 0x1ff2, 0x1ff3,
+ 0x1ff4, 0x1ff5, 0x1ff6, 0x1ff7, 0x1ff8, 0x1ff9, 0x1ffa, 0x1ffb,
+ 0x1ff3, 0x1ffd, 0x1ffe, 0x1fff, 0x2000, 0x2001, 0x2002, 0x2003,
+ 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200a, 0x200b,
+ 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2013,
+ 0x2014, 0x2015, 0x2016, 0x2017, 0x2018, 0x2019, 0x201a, 0x201b,
+ 0x201c, 0x201d, 0x201e, 0x201f, 0x2020, 0x2021, 0x2022, 0x2023,
+ 0x2024, 0x2025, 0x2026, 0x2027, 0x2028, 0x2029, 0x202a, 0x202b,
+ 0x202c, 0x202d, 0x202e, 0x202f, 0x2030, 0x2031, 0x2032, 0x2033,
+ 0x2034, 0x2035, 0x2036, 0x2037, 0x2038, 0x2039, 0x203a, 0x203b,
+ 0x203c, 0x203d, 0x203e, 0x203f, 0x2040, 0x2041, 0x2042, 0x2043,
+ 0x2044, 0x2045, 0x2046, 0x2047, 0x2048, 0x2049, 0x204a, 0x204b,
+ 0x204c, 0x204d, 0x204e, 0x204f, 0x2050, 0x2051, 0x2052, 0x2053,
+ 0x2054, 0x2055, 0x2056, 0x2057, 0x2058, 0x2059, 0x205a, 0x205b,
+ 0x205c, 0x205d, 0x205e, 0x205f, 0x2060, 0x2061, 0x2062, 0x2063,
+ 0x2064, 0x2065, 0x2066, 0x2067, 0x2068, 0x2069, 0x206a, 0x206b,
+ 0x206c, 0x206d, 0x206e, 0x206f, 0x2070, 0x2071, 0x2072, 0x2073,
+ 0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x207a, 0x207b,
+ 0x207c, 0x207d, 0x207e, 0x207f, 0x2080, 0x2081, 0x2082, 0x2083,
+ 0x2084, 0x2085, 0x2086, 0x2087, 0x2088, 0x2089, 0x208a, 0x208b,
+ 0x208c, 0x208d, 0x208e, 0x208f, 0x2090, 0x2091, 0x2092, 0x2093,
+ 0x2094, 0x2095, 0x2096, 0x2097, 0x2098, 0x2099, 0x209a, 0x209b,
+ 0x209c, 0x209d, 0x209e, 0x209f, 0x20a0, 0x20a1, 0x20a2, 0x20a3,
+ 0x20a4, 0x20a5, 0x20a6, 0x20a7, 0x20a8, 0x20a9, 0x20aa, 0x20ab,
+ 0x20ac, 0x20ad, 0x20ae, 0x20af, 0x20b0, 0x20b1, 0x20b2, 0x20b3,
+ 0x20b4, 0x20b5, 0x20b6, 0x20b7, 0x20b8, 0x20b9, 0x20ba, 0x20bb,
+ 0x20bc, 0x20bd, 0x20be, 0x20bf, 0x20c0, 0x20c1, 0x20c2, 0x20c3,
+ 0x20c4, 0x20c5, 0x20c6, 0x20c7, 0x20c8, 0x20c9, 0x20ca, 0x20cb,
+ 0x20cc, 0x20cd, 0x20ce, 0x20cf, 0x20d0, 0x20d1, 0x20d2, 0x20d3,
+ 0x20d4, 0x20d5, 0x20d6, 0x20d7, 0x20d8, 0x20d9, 0x20da, 0x20db,
+ 0x20dc, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x20e1, 0x20e2, 0x20e3,
+ 0x20e4, 0x20e5, 0x20e6, 0x20e7, 0x20e8, 0x20e9, 0x20ea, 0x20eb,
+ 0x20ec, 0x20ed, 0x20ee, 0x20ef, 0x20f0, 0x20f1, 0x20f2, 0x20f3,
+ 0x20f4, 0x20f5, 0x20f6, 0x20f7, 0x20f8, 0x20f9, 0x20fa, 0x20fb,
+ 0x20fc, 0x20fd, 0x20fe, 0x20ff, 0x2100, 0x2101, 0x2102, 0x2103,
+ 0x2104, 0x2105, 0x2106, 0x2107, 0x2108, 0x2109, 0x210a, 0x210b,
+ 0x210c, 0x210d, 0x210e, 0x210f, 0x2110, 0x2111, 0x2112, 0x2113,
+ 0x2114, 0x2115, 0x2116, 0x2117, 0x2118, 0x2119, 0x211a, 0x211b,
+ 0x211c, 0x211d, 0x211e, 0x211f, 0x2120, 0x2121, 0x2122, 0x2123,
+ 0x2124, 0x2125, 0x2126, 0x2127, 0x2128, 0x2129, 0x212a, 0x212b,
+ 0x212c, 0x212d, 0x212e, 0x212f, 0x2130, 0x2131, 0x2132, 0x2133,
+ 0x2134, 0x2135, 0x2136, 0x2137, 0x2138, 0x2139, 0x213a, 0x213b,
+ 0x213c, 0x213d, 0x213e, 0x213f, 0x2140, 0x2141, 0x2142, 0x2143,
+ 0x2144, 0x2145, 0x2146, 0x2147, 0x2148, 0x2149, 0x214a, 0x214b,
+ 0x214c, 0x214d, 0x2132, 0x214f, 0x2150, 0x2151, 0x2152, 0x2153,
+ 0x2154, 0x2155, 0x2156, 0x2157, 0x2158, 0x2159, 0x215a, 0x215b,
+ 0x215c, 0x215d, 0x215e, 0x215f, 0x2160, 0x2161, 0x2162, 0x2163,
+ 0x2164, 0x2165, 0x2166, 0x2167, 0x2168, 0x2169, 0x216a, 0x216b,
+ 0x216c, 0x216d, 0x216e, 0x216f, 0x2160, 0x2161, 0x2162, 0x2163,
+ 0x2164, 0x2165, 0x2166, 0x2167, 0x2168, 0x2169, 0x216a, 0x216b,
+ 0x216c, 0x216d, 0x216e, 0x216f, 0x2180, 0x2181, 0x2182, 0x2183,
+ 0x2183, 0xffff, 0x034b, 0x24b6, 0x24b7, 0x24b8, 0x24b9, 0x24ba,
+ 0x24bb, 0x24bc, 0x24bd, 0x24be, 0x24bf, 0x24c0, 0x24c1, 0x24c2,
+ 0x24c3, 0x24c4, 0x24c5, 0x24c6, 0x24c7, 0x24c8, 0x24c9, 0x24ca,
+ 0x24cb, 0x24cc, 0x24cd, 0x24ce, 0x24cf, 0xffff, 0x0746, 0x2c00,
+ 0x2c01, 0x2c02, 0x2c03, 0x2c04, 0x2c05, 0x2c06, 0x2c07, 0x2c08,
+ 0x2c09, 0x2c0a, 0x2c0b, 0x2c0c, 0x2c0d, 0x2c0e, 0x2c0f, 0x2c10,
+ 0x2c11, 0x2c12, 0x2c13, 0x2c14, 0x2c15, 0x2c16, 0x2c17, 0x2c18,
+ 0x2c19, 0x2c1a, 0x2c1b, 0x2c1c, 0x2c1d, 0x2c1e, 0x2c1f, 0x2c20,
+ 0x2c21, 0x2c22, 0x2c23, 0x2c24, 0x2c25, 0x2c26, 0x2c27, 0x2c28,
+ 0x2c29, 0x2c2a, 0x2c2b, 0x2c2c, 0x2c2d, 0x2c2e, 0x2c5f, 0x2c60,
+ 0x2c60, 0x2c62, 0x2c63, 0x2c64, 0x2c65, 0x2c66, 0x2c67, 0x2c67,
+ 0x2c69, 0x2c69, 0x2c6b, 0x2c6b, 0x2c6d, 0x2c6e, 0x2c6f, 0x2c70,
+ 0x2c71, 0x2c72, 0x2c73, 0x2c74, 0x2c75, 0x2c75, 0x2c77, 0x2c78,
+ 0x2c79, 0x2c7a, 0x2c7b, 0x2c7c, 0x2c7d, 0x2c7e, 0x2c7f, 0x2c80,
+ 0x2c80, 0x2c82, 0x2c82, 0x2c84, 0x2c84, 0x2c86, 0x2c86, 0x2c88,
+ 0x2c88, 0x2c8a, 0x2c8a, 0x2c8c, 0x2c8c, 0x2c8e, 0x2c8e, 0x2c90,
+ 0x2c90, 0x2c92, 0x2c92, 0x2c94, 0x2c94, 0x2c96, 0x2c96, 0x2c98,
+ 0x2c98, 0x2c9a, 0x2c9a, 0x2c9c, 0x2c9c, 0x2c9e, 0x2c9e, 0x2ca0,
+ 0x2ca0, 0x2ca2, 0x2ca2, 0x2ca4, 0x2ca4, 0x2ca6, 0x2ca6, 0x2ca8,
+ 0x2ca8, 0x2caa, 0x2caa, 0x2cac, 0x2cac, 0x2cae, 0x2cae, 0x2cb0,
+ 0x2cb0, 0x2cb2, 0x2cb2, 0x2cb4, 0x2cb4, 0x2cb6, 0x2cb6, 0x2cb8,
+ 0x2cb8, 0x2cba, 0x2cba, 0x2cbc, 0x2cbc, 0x2cbe, 0x2cbe, 0x2cc0,
+ 0x2cc0, 0x2cc2, 0x2cc2, 0x2cc4, 0x2cc4, 0x2cc6, 0x2cc6, 0x2cc8,
+ 0x2cc8, 0x2cca, 0x2cca, 0x2ccc, 0x2ccc, 0x2cce, 0x2cce, 0x2cd0,
+ 0x2cd0, 0x2cd2, 0x2cd2, 0x2cd4, 0x2cd4, 0x2cd6, 0x2cd6, 0x2cd8,
+ 0x2cd8, 0x2cda, 0x2cda, 0x2cdc, 0x2cdc, 0x2cde, 0x2cde, 0x2ce0,
+ 0x2ce0, 0x2ce2, 0x2ce2, 0x2ce4, 0x2ce5, 0x2ce6, 0x2ce7, 0x2ce8,
+ 0x2ce9, 0x2cea, 0x2ceb, 0x2cec, 0x2ced, 0x2cee, 0x2cef, 0x2cf0,
+ 0x2cf1, 0x2cf2, 0x2cf3, 0x2cf4, 0x2cf5, 0x2cf6, 0x2cf7, 0x2cf8,
+ 0x2cf9, 0x2cfa, 0x2cfb, 0x2cfc, 0x2cfd, 0x2cfe, 0x2cff, 0x10a0,
+ 0x10a1, 0x10a2, 0x10a3, 0x10a4, 0x10a5, 0x10a6, 0x10a7, 0x10a8,
+ 0x10a9, 0x10aa, 0x10ab, 0x10ac, 0x10ad, 0x10ae, 0x10af, 0x10b0,
+ 0x10b1, 0x10b2, 0x10b3, 0x10b4, 0x10b5, 0x10b6, 0x10b7, 0x10b8,
+ 0x10b9, 0x10ba, 0x10bb, 0x10bc, 0x10bd, 0x10be, 0x10bf, 0x10c0,
+ 0x10c1, 0x10c2, 0x10c3, 0x10c4, 0x10c5, 0xffff, 0xd21b, 0xff21,
+ 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27, 0xff28, 0xff29,
+ 0xff2a, 0xff2b, 0xff2c, 0xff2d, 0xff2e, 0xff2f, 0xff30, 0xff31,
+ 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37, 0xff38, 0xff39,
+ 0xff3a, 0xff5b, 0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61,
+ 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67, 0xff68, 0xff69,
+ 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f, 0xff70, 0xff71,
+ 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77, 0xff78, 0xff79,
+ 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f, 0xff80, 0xff81,
+ 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87, 0xff88, 0xff89,
+ 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f, 0xff90, 0xff91,
+ 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97, 0xff98, 0xff99,
+ 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f, 0xffa0, 0xffa1,
+ 0xffa2, 0xffa3, 0xffa4, 0xffa5, 0xffa6, 0xffa7, 0xffa8, 0xffa9,
+ 0xffaa, 0xffab, 0xffac, 0xffad, 0xffae, 0xffaf, 0xffb0, 0xffb1,
+ 0xffb2, 0xffb3, 0xffb4, 0xffb5, 0xffb6, 0xffb7, 0xffb8, 0xffb9,
+ 0xffba, 0xffbb, 0xffbc, 0xffbd, 0xffbe, 0xffbf, 0xffc0, 0xffc1,
+ 0xffc2, 0xffc3, 0xffc4, 0xffc5, 0xffc6, 0xffc7, 0xffc8, 0xffc9,
+ 0xffca, 0xffcb, 0xffcc, 0xffcd, 0xffce, 0xffcf, 0xffd0, 0xffd1,
+ 0xffd2, 0xffd3, 0xffd4, 0xffd5, 0xffd6, 0xffd7, 0xffd8, 0xffd9,
+ 0xffda, 0xffdb, 0xffdc, 0xffdd, 0xffde, 0xffdf, 0xffe0, 0xffe1,
+ 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe7, 0xffe8, 0xffe9,
+ 0xffea, 0xffeb, 0xffec, 0xffed, 0xffee, 0xffef, 0xfff0, 0xfff1,
+ 0xfff2, 0xfff3, 0xfff4, 0xfff5, 0xfff6, 0xfff7, 0xfff8, 0xfff9,
+ 0xfffa, 0xfffb, 0xfffc, 0xfffd, 0xfffe, 0xffff,
+};
+
+/*
+ * Allow full-width illegal characters :
+ * "MS windows 7" supports full-width-invalid-name-characters.
+ * So we should check half-width-invalid-name-characters(ASCII) only
+ * for compatibility.
+ *
+ * " * / : < > ? \ |
+ */
+static unsigned short bad_uni_chars[] = {
+ 0x0022, 0x002A, 0x002F, 0x003A,
+ 0x003C, 0x003E, 0x003F, 0x005C, 0x007C,
+ 0
+};
+
+static int exfat_convert_char_to_ucs2(struct nls_table *nls,
+ const unsigned char *ch, int ch_len, unsigned short *ucs2,
+ int *lossy)
+{
+ int len;
+
+ *ucs2 = 0x0;
+
+ if (ch[0] < 0x80) {
+ *ucs2 = ch[0];
+ return 1;
+ }
+
+ len = nls->char2uni(ch, ch_len, ucs2);
+ if (len < 0) {
+ /* conversion failed */
+ if (lossy != NULL)
+ *lossy |= NLS_NAME_LOSSY;
+ *ucs2 = '_';
+ return 1;
+ }
+ return len;
+}
+
+static int exfat_convert_ucs2_to_char(struct nls_table *nls,
+ unsigned short ucs2, unsigned char *ch, int *lossy)
+{
+ int len;
+
+ ch[0] = 0x0;
+
+ if (ucs2 < 0x0080) {
+ ch[0] = ucs2;
+ return 1;
+ }
+
+ len = nls->uni2char(ucs2, ch, MAX_CHARSET_SIZE);
+ if (len < 0) {
+ /* conversion failed */
+ if (lossy != NULL)
+ *lossy |= NLS_NAME_LOSSY;
+ ch[0] = '_';
+ return 1;
+ }
+ return len;
+}
+
+unsigned short exfat_toupper(struct super_block *sb, unsigned short a)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ return sbi->vol_utbl[a] ? sbi->vol_utbl[a] : a;
+}
+
+static unsigned short *exfat_wstrchr(unsigned short *str, unsigned short wchar)
+{
+ while (*str) {
+ if (*(str++) == wchar)
+ return str;
+ }
+ return NULL;
+}
+
+int exfat_uniname_ncmp(struct super_block *sb, unsigned short *a,
+ unsigned short *b, unsigned int len)
+{
+ int i;
+
+ for (i = 0; i < len; i++, a++, b++)
+ if (exfat_toupper(sb, *a) != exfat_toupper(sb, *b))
+ return 1;
+ return 0;
+}
+
+static int exfat_utf16_to_utf8(struct super_block *sb,
+ struct exfat_uni_name *p_uniname, unsigned char *p_cstring,
+ int buflen)
+{
+ int len;
+ const unsigned short *uniname = p_uniname->name;
+
+ /* always len >= 0 */
+ len = utf16s_to_utf8s(uniname, MAX_NAME_LENGTH, UTF16_HOST_ENDIAN,
+ p_cstring, buflen);
+ p_cstring[len] = '\0';
+ return len;
+}
+
+static int exfat_utf8_to_utf16(struct super_block *sb,
+ const unsigned char *p_cstring, const int len,
+ struct exfat_uni_name *p_uniname, int *p_lossy)
+{
+ int i, unilen, lossy = NLS_NAME_NO_LOSSY;
+ unsigned short upname[MAX_NAME_LENGTH + 1];
+ unsigned short *uniname = p_uniname->name;
+
+ WARN_ON(!len);
+
+ unilen = utf8s_to_utf16s(p_cstring, len, UTF16_HOST_ENDIAN,
+ (wchar_t *)uniname, MAX_NAME_LENGTH + 2);
+ if (unilen < 0) {
+ exfat_msg(sb, KERN_ERR,
+ "failed to %s (err : %d) nls len : %d",
+ __func__, unilen, len);
+ return unilen;
+ }
+
+ if (unilen > MAX_NAME_LENGTH) {
+ exfat_msg(sb, KERN_ERR,
+ "failed to %s (estr:ENAMETOOLONG) nls len : %d, unilen : %d > %d",
+ __func__, len, unilen, MAX_NAME_LENGTH);
+ return -ENAMETOOLONG;
+ }
+
+ p_uniname->name_len = unilen & 0xFF;
+
+ for (i = 0; i < unilen; i++) {
+ if (*uniname < 0x0020 ||
+ exfat_wstrchr(bad_uni_chars, *uniname))
+ lossy |= NLS_NAME_LOSSY;
+
+ upname[i] = exfat_toupper(sb, *uniname);
+ uniname++;
+ }
+
+ *uniname = '\0';
+ p_uniname->name_len = unilen;
+ p_uniname->name_hash = exfat_calc_chksum_2byte(upname, unilen << 1, 0,
+ CS_DEFAULT);
+
+ if (p_lossy)
+ *p_lossy = lossy;
+ return unilen;
+}
+
+#define PLANE_SIZE 0x00010000
+#define SURROGATE_MASK 0xfffff800
+#define SURROGATE_PAIR 0x0000d800
+#define SURROGATE_LOW 0x00000400
+#define SURROGATE_BITS 0x000003ff
+
+unsigned short exfat_high_surrogate(unicode_t u)
+{
+ return ((u - PLANE_SIZE) >> 10) + SURROGATE_PAIR;
+}
+
+unsigned short exfat_low_surrogate(unicode_t u)
+{
+ return ((u - PLANE_SIZE) & SURROGATE_BITS) | SURROGATE_PAIR |
+ SURROGATE_LOW;
+}
+
+static int __exfat_utf16_to_nls(struct super_block *sb,
+ struct exfat_uni_name *p_uniname, unsigned char *p_cstring,
+ int buflen)
+{
+ int i, j, len, out_len = 0;
+ unsigned char buf[MAX_CHARSET_SIZE];
+ const unsigned short *uniname = p_uniname->name;
+ struct nls_table *nls = EXFAT_SB(sb)->nls_io;
+
+ i = 0;
+ while (i < MAX_NAME_LENGTH && out_len < (buflen - 1)) {
+ if (*uniname == '\0')
+ break;
+ if ((*uniname & SURROGATE_MASK) != SURROGATE_PAIR) {
+ len = exfat_convert_ucs2_to_char(nls, *uniname, buf,
+ NULL);
+ } else {
+ /* Process UTF-16 surrogate pair as one character */
+ if (!(*uniname & SURROGATE_LOW) &&
+ i+1 < MAX_NAME_LENGTH &&
+ (*(uniname+1) & SURROGATE_MASK) == SURROGATE_PAIR &&
+ (*(uniname+1) & SURROGATE_LOW)) {
+ uniname++;
+ i++;
+ }
+
+ /*
+ * UTF-16 surrogate pair encodes code points above
+ * U+FFFF. Code points above U+FFFF are not supported
+ * by kernel NLS framework therefore use replacement
+ * character
+ */
+ len = 1;
+ buf[0] = '_';
+ }
+
+ if (out_len + len >= buflen)
+ len = buflen - 1 - out_len;
+ out_len += len;
+
+ if (len > 1) {
+ for (j = 0; j < len; j++)
+ *p_cstring++ = buf[j];
+ } else { /* len == 1 */
+ *p_cstring++ = *buf;
+ }
+
+ uniname++;
+ i++;
+ }
+
+ *p_cstring = '\0';
+ return out_len;
+}
+
+static int exfat_nls_to_ucs2(struct super_block *sb,
+ const unsigned char *p_cstring, const int len,
+ struct exfat_uni_name *p_uniname, int *p_lossy)
+{
+ int i = 0, unilen = 0, lossy = NLS_NAME_NO_LOSSY;
+ unsigned short upname[MAX_NAME_LENGTH + 1];
+ unsigned short *uniname = p_uniname->name;
+ struct nls_table *nls = EXFAT_SB(sb)->nls_io;
+
+ WARN_ON(!len);
+
+ while (unilen < MAX_NAME_LENGTH && i < len) {
+ i += exfat_convert_char_to_ucs2(nls, p_cstring + i, len - i,
+ uniname, &lossy);
+
+ if (*uniname < 0x0020 ||
+ exfat_wstrchr(bad_uni_chars, *uniname))
+ lossy |= NLS_NAME_LOSSY;
+
+ upname[unilen] = exfat_toupper(sb, *uniname);
+ uniname++;
+ unilen++;
+ }
+
+ if (p_cstring[i] != '\0')
+ lossy |= NLS_NAME_OVERLEN;
+
+ *uniname = '\0';
+ p_uniname->name_len = unilen;
+ p_uniname->name_hash = exfat_calc_chksum_2byte(upname, unilen << 1, 0,
+ CS_DEFAULT);
+
+ if (p_lossy)
+ *p_lossy = lossy;
+ return unilen;
+}
+
+int exfat_utf16_to_nls(struct super_block *sb, struct exfat_uni_name *uniname,
+ unsigned char *p_cstring, int buflen)
+{
+ if (EXFAT_SB(sb)->options.utf8)
+ return exfat_utf16_to_utf8(sb, uniname, p_cstring,
+ buflen);
+ return __exfat_utf16_to_nls(sb, uniname, p_cstring, buflen);
+}
+
+int exfat_nls_to_utf16(struct super_block *sb, const unsigned char *p_cstring,
+ const int len, struct exfat_uni_name *uniname, int *p_lossy)
+{
+ if (EXFAT_SB(sb)->options.utf8)
+ return exfat_utf8_to_utf16(sb, p_cstring, len,
+ uniname, p_lossy);
+ return exfat_nls_to_ucs2(sb, p_cstring, len, uniname, p_lossy);
+}
+
+static int exfat_load_upcase_table(struct super_block *sb,
+ sector_t sector, unsigned long long num_sectors,
+ unsigned int utbl_checksum)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ unsigned int sect_size = sb->s_blocksize;
+ unsigned int i, index = 0, checksum = 0;
+ int ret;
+ unsigned char skip = false;
+ unsigned short *upcase_table;
+
+ upcase_table = kcalloc(UTBL_COUNT, sizeof(unsigned short), GFP_KERNEL);
+ if (!upcase_table)
+ return -ENOMEM;
+
+ sbi->vol_utbl = upcase_table;
+ num_sectors += sector;
+
+ while (sector < num_sectors) {
+ struct buffer_head *bh;
+
+ bh = sb_bread(sb, sector);
+ if (!bh) {
+ exfat_msg(sb, KERN_ERR,
+ "failed to read sector(0x%llx)\n",
+ (unsigned long long)sector);
+ ret = -EIO;
+ goto free_table;
+ }
+ sector++;
+ for (i = 0; i < sect_size && index <= 0xFFFF; i += 2) {
+ unsigned short uni = get_unaligned_le16(bh->b_data + i);
+
+ checksum = ((checksum & 1) ? 0x80000000 : 0) +
+ (checksum >> 1) +
+ *(((unsigned char *)bh->b_data) + i);
+ checksum = ((checksum & 1) ? 0x80000000 : 0) +
+ (checksum >> 1) +
+ *(((unsigned char *)bh->b_data) + (i + 1));
+
+ if (skip) {
+ index += uni;
+ skip = false;
+ } else if (uni == index) {
+ index++;
+ } else if (uni == 0xFFFF) {
+ skip = true;
+ } else { /* uni != index , uni != 0xFFFF */
+ upcase_table[index] = uni;
+ index++;
+ }
+ }
+ brelse(bh);
+ }
+
+ if (index >= 0xFFFF && utbl_checksum == checksum)
+ return 0;
+
+ exfat_msg(sb, KERN_ERR,
+ "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)\n",
+ index, checksum, utbl_checksum);
+ ret = -EINVAL;
+free_table:
+ exfat_free_upcase_table(sbi);
+ return ret;
+}
+
+static int exfat_load_default_upcase_table(struct super_block *sb)
+{
+ int i, ret = -EIO;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ unsigned char skip = false;
+ unsigned short uni = 0, *upcase_table;
+ unsigned int index = 0;
+
+ upcase_table = kcalloc(UTBL_COUNT, sizeof(unsigned short), GFP_KERNEL);
+ if (!upcase_table)
+ return -ENOMEM;
+
+ sbi->vol_utbl = upcase_table;
+
+ for (i = 0; index <= 0xFFFF && i < EXFAT_NUM_UPCASE; i++) {
+ uni = uni_def_upcase[i];
+ if (skip) {
+ index += uni;
+ skip = false;
+ } else if (uni == index) {
+ index++;
+ } else if (uni == 0xFFFF) {
+ skip = true;
+ } else {
+ upcase_table[index] = uni;
+ index++;
+ }
+ }
+
+ if (index >= 0xFFFF)
+ return 0;
+
+ /* FATAL error: default upcase table has error */
+ exfat_free_upcase_table(sbi);
+ return ret;
+}
+
+int exfat_create_upcase_table(struct super_block *sb)
+{
+ int i, ret;
+ unsigned int tbl_clu, type;
+ sector_t sector;
+ unsigned long long tbl_size, num_sectors;
+ unsigned char blksize_bits = sb->s_blocksize_bits;
+ struct exfat_chain clu;
+ struct exfat_dentry *ep;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct buffer_head *bh;
+
+ clu.dir = sbi->root_dir;
+ clu.flags = ALLOC_FAT_CHAIN;
+
+ while (clu.dir != EXFAT_EOF_CLUSTER) {
+ for (i = 0; i < sbi->dentries_per_clu; i++) {
+ ep = exfat_get_dentry(sb, &clu, i, &bh, NULL);
+ if (!ep)
+ return -EIO;
+
+ type = exfat_get_entry_type(ep);
+ if (type == TYPE_UNUSED) {
+ brelse(bh);
+ break;
+ }
+
+ if (type != TYPE_UPCASE) {
+ brelse(bh);
+ continue;
+ }
+
+ tbl_clu = le32_to_cpu(ep->dentry.upcase.start_clu);
+ tbl_size = le64_to_cpu(ep->dentry.upcase.size);
+
+ sector = exfat_cluster_to_sector(sbi, tbl_clu);
+ num_sectors = ((tbl_size - 1) >> blksize_bits) + 1;
+ ret = exfat_load_upcase_table(sb, sector, num_sectors,
+ le32_to_cpu(ep->dentry.upcase.checksum));
+
+ brelse(bh);
+ if (ret && ret != -EIO)
+ goto load_default;
+
+ /* load successfully */
+ return ret;
+ }
+
+ if (exfat_get_next_cluster(sb, &(clu.dir)))
+ return -EIO;
+ }
+
+load_default:
+ /* load default upcase table */
+ return exfat_load_default_upcase_table(sb);
+}
+
+void exfat_free_upcase_table(struct exfat_sb_info *sbi)
+{
+ kfree(sbi->vol_utbl);
+}
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
new file mode 100644
index 000000000000..16ed202ef527
--- /dev/null
+++ b/fs/exfat/super.c
@@ -0,0 +1,722 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/mount.h>
+#include <linux/cred.h>
+#include <linux/statfs.h>
+#include <linux/seq_file.h>
+#include <linux/blkdev.h>
+#include <linux/fs_struct.h>
+#include <linux/iversion.h>
+#include <linux/nls.h>
+#include <linux/buffer_head.h>
+
+#include "exfat_raw.h"
+#include "exfat_fs.h"
+
+static char exfat_default_iocharset[] = CONFIG_EXFAT_DEFAULT_IOCHARSET;
+static struct kmem_cache *exfat_inode_cachep;
+
+static void exfat_free_iocharset(struct exfat_sb_info *sbi)
+{
+ if (sbi->options.iocharset != exfat_default_iocharset)
+ kfree(sbi->options.iocharset);
+}
+
+static void exfat_delayed_free(struct rcu_head *p)
+{
+ struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu);
+
+ unload_nls(sbi->nls_io);
+ exfat_free_iocharset(sbi);
+ exfat_free_upcase_table(sbi);
+ kfree(sbi);
+}
+
+static void exfat_put_super(struct super_block *sb)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ mutex_lock(&sbi->s_lock);
+ if (test_and_clear_bit(EXFAT_SB_DIRTY, &sbi->s_state))
+ sync_blockdev(sb->s_bdev);
+ exfat_set_vol_flags(sb, VOL_CLEAN);
+ exfat_free_bitmap(sbi);
+ mutex_unlock(&sbi->s_lock);
+
+ call_rcu(&sbi->rcu, exfat_delayed_free);
+}
+
+static int exfat_sync_fs(struct super_block *sb, int wait)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ int err = 0;
+
+ /* If there are some dirty buffers in the bdev inode */
+ mutex_lock(&sbi->s_lock);
+ if (test_and_clear_bit(EXFAT_SB_DIRTY, &sbi->s_state)) {
+ sync_blockdev(sb->s_bdev);
+ if (exfat_set_vol_flags(sb, VOL_CLEAN))
+ err = -EIO;
+ }
+ mutex_unlock(&sbi->s_lock);
+ return err;
+}
+
+static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ unsigned long long id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+ if (sbi->used_clusters == EXFAT_CLUSTERS_UNTRACKED) {
+ mutex_lock(&sbi->s_lock);
+ if (exfat_count_used_clusters(sb, &sbi->used_clusters)) {
+ mutex_unlock(&sbi->s_lock);
+ return -EIO;
+ }
+ mutex_unlock(&sbi->s_lock);
+ }
+
+ buf->f_type = sb->s_magic;
+ buf->f_bsize = sbi->cluster_size;
+ buf->f_blocks = sbi->num_clusters - 2; /* clu 0 & 1 */
+ buf->f_bfree = buf->f_blocks - sbi->used_clusters;
+ buf->f_bavail = buf->f_bfree;
+ buf->f_fsid.val[0] = (unsigned int)id;
+ buf->f_fsid.val[1] = (unsigned int)(id >> 32);
+ /* Unicode utf16 255 characters */
+ buf->f_namelen = EXFAT_MAX_FILE_LEN * NLS_MAX_CHARSET_SIZE;
+ return 0;
+}
+
+int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct pbr64 *bpb;
+ bool sync = 0;
+
+ /* flags are not changed */
+ if (sbi->vol_flag == new_flag)
+ return 0;
+
+ sbi->vol_flag = new_flag;
+
+ /* skip updating volume dirty flag,
+ * if this volume has been mounted with read-only
+ */
+ if (sb_rdonly(sb))
+ return 0;
+
+ if (!sbi->pbr_bh) {
+ sbi->pbr_bh = sb_bread(sb, 0);
+ if (!sbi->pbr_bh) {
+ exfat_msg(sb, KERN_ERR, "failed to read boot sector");
+ return -ENOMEM;
+ }
+ }
+
+ bpb = (struct pbr64 *)sbi->pbr_bh->b_data;
+ bpb->bsx.vol_flags = cpu_to_le16(new_flag);
+
+ if (new_flag == VOL_DIRTY && !buffer_dirty(sbi->pbr_bh))
+ sync = true;
+ else
+ sync = false;
+
+ set_buffer_uptodate(sbi->pbr_bh);
+ mark_buffer_dirty(sbi->pbr_bh);
+
+ if (sync)
+ sync_dirty_buffer(sbi->pbr_bh);
+ return 0;
+}
+
+static int exfat_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct super_block *sb = root->d_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_mount_options *opts = &sbi->options;
+
+ /* Show partition info */
+ if (!uid_eq(opts->fs_uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, opts->fs_uid));
+ if (!gid_eq(opts->fs_gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, opts->fs_gid));
+ seq_printf(m, ",fmask=%04o,dmask=%04o", opts->fs_fmask, opts->fs_dmask);
+ if (opts->allow_utime)
+ seq_printf(m, ",allow_utime=%04o", opts->allow_utime);
+ if (opts->utf8)
+ seq_puts(m, ",iocharset=utf8");
+ else if (sbi->nls_io)
+ seq_printf(m, ",iocharset=%s", sbi->nls_io->charset);
+ seq_printf(m, ",bps=%ld", sb->s_blocksize);
+ if (opts->errors == EXFAT_ERRORS_CONT)
+ seq_puts(m, ",errors=continue");
+ else if (opts->errors == EXFAT_ERRORS_PANIC)
+ seq_puts(m, ",errors=panic");
+ else
+ seq_puts(m, ",errors=remount-ro");
+ if (opts->discard)
+ seq_puts(m, ",discard");
+ if (opts->time_offset)
+ seq_printf(m, ",time_offset=%d", opts->time_offset);
+ return 0;
+}
+
+static struct inode *exfat_alloc_inode(struct super_block *sb)
+{
+ struct exfat_inode_info *ei;
+
+ ei = kmem_cache_alloc(exfat_inode_cachep, GFP_NOFS);
+ if (!ei)
+ return NULL;
+
+ init_rwsem(&ei->truncate_lock);
+ return &ei->vfs_inode;
+}
+
+static void exfat_free_inode(struct inode *inode)
+{
+ kmem_cache_free(exfat_inode_cachep, EXFAT_I(inode));
+}
+
+static const struct super_operations exfat_sops = {
+ .alloc_inode = exfat_alloc_inode,
+ .free_inode = exfat_free_inode,
+ .write_inode = exfat_write_inode,
+ .evict_inode = exfat_evict_inode,
+ .put_super = exfat_put_super,
+ .sync_fs = exfat_sync_fs,
+ .statfs = exfat_statfs,
+ .show_options = exfat_show_options,
+};
+
+enum {
+ Opt_uid,
+ Opt_gid,
+ Opt_umask,
+ Opt_dmask,
+ Opt_fmask,
+ Opt_allow_utime,
+ Opt_charset,
+ Opt_errors,
+ Opt_discard,
+ Opt_time_offset,
+};
+
+static const struct constant_table exfat_param_enums[] = {
+ { "continue", EXFAT_ERRORS_CONT },
+ { "panic", EXFAT_ERRORS_PANIC },
+ { "remount-ro", EXFAT_ERRORS_RO },
+ {}
+};
+
+static const struct fs_parameter_spec exfat_parameters[] = {
+ fsparam_u32("uid", Opt_uid),
+ fsparam_u32("gid", Opt_gid),
+ fsparam_u32oct("umask", Opt_umask),
+ fsparam_u32oct("dmask", Opt_dmask),
+ fsparam_u32oct("fmask", Opt_fmask),
+ fsparam_u32oct("allow_utime", Opt_allow_utime),
+ fsparam_string("iocharset", Opt_charset),
+ fsparam_enum("errors", Opt_errors, exfat_param_enums),
+ fsparam_flag("discard", Opt_discard),
+ fsparam_s32("time_offset", Opt_time_offset),
+ {}
+};
+
+static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct exfat_sb_info *sbi = fc->s_fs_info;
+ struct exfat_mount_options *opts = &sbi->options;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, exfat_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ opts->fs_uid = make_kuid(current_user_ns(), result.uint_32);
+ break;
+ case Opt_gid:
+ opts->fs_gid = make_kgid(current_user_ns(), result.uint_32);
+ break;
+ case Opt_umask:
+ opts->fs_fmask = result.uint_32;
+ opts->fs_dmask = result.uint_32;
+ break;
+ case Opt_dmask:
+ opts->fs_dmask = result.uint_32;
+ break;
+ case Opt_fmask:
+ opts->fs_fmask = result.uint_32;
+ break;
+ case Opt_allow_utime:
+ opts->allow_utime = result.uint_32 & 0022;
+ break;
+ case Opt_charset:
+ exfat_free_iocharset(sbi);
+ opts->iocharset = kstrdup(param->string, GFP_KERNEL);
+ if (!opts->iocharset)
+ return -ENOMEM;
+ break;
+ case Opt_errors:
+ opts->errors = result.uint_32;
+ break;
+ case Opt_discard:
+ opts->discard = 1;
+ break;
+ case Opt_time_offset:
+ /*
+ * Make the limit 24 just in case someone invents something
+ * unusual.
+ */
+ if (result.int_32 < -24 * 60 || result.int_32 > 24 * 60)
+ return -EINVAL;
+ opts->time_offset = result.int_32;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void exfat_hash_init(struct super_block *sb)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ int i;
+
+ spin_lock_init(&sbi->inode_hash_lock);
+ for (i = 0; i < EXFAT_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&sbi->inode_hashtable[i]);
+}
+
+static int exfat_read_root(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct exfat_chain cdir;
+ int num_subdirs, num_clu = 0;
+
+ exfat_chain_set(&ei->dir, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
+ ei->entry = -1;
+ ei->start_clu = sbi->root_dir;
+ ei->flags = ALLOC_FAT_CHAIN;
+ ei->type = TYPE_DIR;
+ ei->version = 0;
+ ei->rwoffset = 0;
+ ei->hint_bmap.off = EXFAT_EOF_CLUSTER;
+ ei->hint_stat.eidx = 0;
+ ei->hint_stat.clu = sbi->root_dir;
+ ei->hint_femp.eidx = EXFAT_HINT_NONE;
+
+ exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
+ if (exfat_count_num_clusters(sb, &cdir, &num_clu))
+ return -EIO;
+ i_size_write(inode, num_clu << sbi->cluster_size_bits);
+
+ num_subdirs = exfat_count_dir_entries(sb, &cdir);
+ if (num_subdirs < 0)
+ return -EIO;
+ set_nlink(inode, num_subdirs + EXFAT_MIN_SUBDIR);
+
+ inode->i_uid = sbi->options.fs_uid;
+ inode->i_gid = sbi->options.fs_gid;
+ inode_inc_iversion(inode);
+ inode->i_generation = 0;
+ inode->i_mode = exfat_make_mode(sbi, ATTR_SUBDIR, 0777);
+ inode->i_op = &exfat_dir_inode_operations;
+ inode->i_fop = &exfat_dir_operations;
+
+ inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1))
+ & ~(sbi->cluster_size - 1)) >> inode->i_blkbits;
+ EXFAT_I(inode)->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff;
+ EXFAT_I(inode)->i_size_aligned = i_size_read(inode);
+ EXFAT_I(inode)->i_size_ondisk = i_size_read(inode);
+
+ exfat_save_attr(inode, ATTR_SUBDIR);
+ inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
+ current_time(inode);
+ exfat_cache_init_inode(inode);
+ return 0;
+}
+
+static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb,
+ struct buffer_head **prev_bh)
+{
+ struct pbr *p_pbr = (struct pbr *) (*prev_bh)->b_data;
+ unsigned short logical_sect = 0;
+
+ logical_sect = 1 << p_pbr->bsx.f64.sect_size_bits;
+
+ if (!is_power_of_2(logical_sect) ||
+ logical_sect < 512 || logical_sect > 4096) {
+ exfat_msg(sb, KERN_ERR, "bogus logical sector size %u",
+ logical_sect);
+ return NULL;
+ }
+
+ if (logical_sect < sb->s_blocksize) {
+ exfat_msg(sb, KERN_ERR,
+ "logical sector size too small for device (logical sector size = %u)",
+ logical_sect);
+ return NULL;
+ }
+
+ if (logical_sect > sb->s_blocksize) {
+ struct buffer_head *bh = NULL;
+
+ __brelse(*prev_bh);
+ *prev_bh = NULL;
+
+ if (!sb_set_blocksize(sb, logical_sect)) {
+ exfat_msg(sb, KERN_ERR,
+ "unable to set blocksize %u", logical_sect);
+ return NULL;
+ }
+ bh = sb_bread(sb, 0);
+ if (!bh) {
+ exfat_msg(sb, KERN_ERR,
+ "unable to read boot sector (logical sector size = %lu)",
+ sb->s_blocksize);
+ return NULL;
+ }
+
+ *prev_bh = bh;
+ p_pbr = (struct pbr *) bh->b_data;
+ }
+ return p_pbr;
+}
+
+/* mount the file system volume */
+static int __exfat_fill_super(struct super_block *sb)
+{
+ int ret;
+ struct pbr *p_pbr;
+ struct pbr64 *p_bpb;
+ struct buffer_head *bh;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ /* set block size to read super block */
+ sb_min_blocksize(sb, 512);
+
+ /* read boot sector */
+ bh = sb_bread(sb, 0);
+ if (!bh) {
+ exfat_msg(sb, KERN_ERR, "unable to read boot sector");
+ return -EIO;
+ }
+
+ /* PRB is read */
+ p_pbr = (struct pbr *)bh->b_data;
+
+ /* check the validity of PBR */
+ if (le16_to_cpu((p_pbr->signature)) != PBR_SIGNATURE) {
+ exfat_msg(sb, KERN_ERR, "invalid boot record signature");
+ ret = -EINVAL;
+ goto free_bh;
+ }
+
+
+ /* check logical sector size */
+ p_pbr = exfat_read_pbr_with_logical_sector(sb, &bh);
+ if (!p_pbr) {
+ ret = -EIO;
+ goto free_bh;
+ }
+
+ /*
+ * res_zero field must be filled with zero to prevent mounting
+ * from FAT volume.
+ */
+ if (memchr_inv(p_pbr->bpb.f64.res_zero, 0,
+ sizeof(p_pbr->bpb.f64.res_zero))) {
+ ret = -EINVAL;
+ goto free_bh;
+ }
+
+ p_bpb = (struct pbr64 *)p_pbr;
+ if (!p_bpb->bsx.num_fats) {
+ exfat_msg(sb, KERN_ERR, "bogus number of FAT structure");
+ ret = -EINVAL;
+ goto free_bh;
+ }
+
+ sbi->sect_per_clus = 1 << p_bpb->bsx.sect_per_clus_bits;
+ sbi->sect_per_clus_bits = p_bpb->bsx.sect_per_clus_bits;
+ sbi->cluster_size_bits = sbi->sect_per_clus_bits + sb->s_blocksize_bits;
+ sbi->cluster_size = 1 << sbi->cluster_size_bits;
+ sbi->num_FAT_sectors = le32_to_cpu(p_bpb->bsx.fat_length);
+ sbi->FAT1_start_sector = le32_to_cpu(p_bpb->bsx.fat_offset);
+ sbi->FAT2_start_sector = p_bpb->bsx.num_fats == 1 ?
+ sbi->FAT1_start_sector :
+ sbi->FAT1_start_sector + sbi->num_FAT_sectors;
+ sbi->data_start_sector = le32_to_cpu(p_bpb->bsx.clu_offset);
+ sbi->num_sectors = le64_to_cpu(p_bpb->bsx.vol_length);
+ /* because the cluster index starts with 2 */
+ sbi->num_clusters = le32_to_cpu(p_bpb->bsx.clu_count) +
+ EXFAT_RESERVED_CLUSTERS;
+
+ sbi->root_dir = le32_to_cpu(p_bpb->bsx.root_cluster);
+ sbi->dentries_per_clu = 1 <<
+ (sbi->cluster_size_bits - DENTRY_SIZE_BITS);
+
+ sbi->vol_flag = le16_to_cpu(p_bpb->bsx.vol_flags);
+ sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER;
+ sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED;
+
+ if (le16_to_cpu(p_bpb->bsx.vol_flags) & VOL_DIRTY) {
+ sbi->vol_flag |= VOL_DIRTY;
+ exfat_msg(sb, KERN_WARNING,
+ "Volume was not properly unmounted. Some data may be corrupt. Please run fsck.");
+ }
+
+ /* exFAT file size is limited by a disk volume size */
+ sb->s_maxbytes = (u64)(sbi->num_clusters - EXFAT_RESERVED_CLUSTERS) <<
+ sbi->cluster_size_bits;
+
+ ret = exfat_create_upcase_table(sb);
+ if (ret) {
+ exfat_msg(sb, KERN_ERR, "failed to load upcase table");
+ goto free_bh;
+ }
+
+ ret = exfat_load_bitmap(sb);
+ if (ret) {
+ exfat_msg(sb, KERN_ERR, "failed to load alloc-bitmap");
+ goto free_upcase_table;
+ }
+
+ ret = exfat_count_used_clusters(sb, &sbi->used_clusters);
+ if (ret) {
+ exfat_msg(sb, KERN_ERR, "failed to scan clusters");
+ goto free_alloc_bitmap;
+ }
+
+ return 0;
+
+free_alloc_bitmap:
+ exfat_free_bitmap(sbi);
+free_upcase_table:
+ exfat_free_upcase_table(sbi);
+free_bh:
+ brelse(bh);
+ return ret;
+}
+
+static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct exfat_sb_info *sbi = sb->s_fs_info;
+ struct exfat_mount_options *opts = &sbi->options;
+ struct inode *root_inode;
+ int err;
+
+ if (opts->allow_utime == (unsigned short)-1)
+ opts->allow_utime = ~opts->fs_dmask & 0022;
+
+ if (opts->discard) {
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+
+ if (!blk_queue_discard(q))
+ exfat_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but the device does not support discard");
+ opts->discard = 0;
+ }
+
+ sb->s_flags |= SB_NODIRATIME;
+ sb->s_magic = EXFAT_SUPER_MAGIC;
+ sb->s_op = &exfat_sops;
+
+ sb->s_time_gran = 1;
+ sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS;
+ sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS;
+
+ err = __exfat_fill_super(sb);
+ if (err) {
+ exfat_msg(sb, KERN_ERR, "failed to recognize exfat type");
+ goto check_nls_io;
+ }
+
+ /* set up enough so that it can read an inode */
+ exfat_hash_init(sb);
+
+ if (!strcmp(sbi->options.iocharset, "utf8"))
+ opts->utf8 = 1;
+ else {
+ sbi->nls_io = load_nls(sbi->options.iocharset);
+ if (!sbi->nls_io) {
+ exfat_msg(sb, KERN_ERR, "IO charset %s not found",
+ sbi->options.iocharset);
+ err = -EINVAL;
+ goto free_table;
+ }
+ }
+
+ if (sbi->options.utf8)
+ sb->s_d_op = &exfat_utf8_dentry_ops;
+ else
+ sb->s_d_op = &exfat_dentry_ops;
+
+ root_inode = new_inode(sb);
+ if (!root_inode) {
+ exfat_msg(sb, KERN_ERR, "failed to allocate root inode.");
+ err = -ENOMEM;
+ goto free_table;
+ }
+
+ root_inode->i_ino = EXFAT_ROOT_INO;
+ inode_set_iversion(root_inode, 1);
+ err = exfat_read_root(root_inode);
+ if (err) {
+ exfat_msg(sb, KERN_ERR, "failed to initialize root inode.");
+ goto put_inode;
+ }
+
+ exfat_hash_inode(root_inode, EXFAT_I(root_inode)->i_pos);
+ insert_inode_hash(root_inode);
+
+ sb->s_root = d_make_root(root_inode);
+ if (!sb->s_root) {
+ exfat_msg(sb, KERN_ERR, "failed to get the root dentry");
+ err = -ENOMEM;
+ goto put_inode;
+ }
+
+ return 0;
+
+put_inode:
+ iput(root_inode);
+ sb->s_root = NULL;
+
+free_table:
+ exfat_free_upcase_table(sbi);
+ exfat_free_bitmap(sbi);
+
+check_nls_io:
+ unload_nls(sbi->nls_io);
+ exfat_free_iocharset(sbi);
+ sb->s_fs_info = NULL;
+ kfree(sbi);
+ return err;
+}
+
+static int exfat_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, exfat_fill_super);
+}
+
+static void exfat_free(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations exfat_context_ops = {
+ .parse_param = exfat_parse_param,
+ .get_tree = exfat_get_tree,
+ .free = exfat_free,
+};
+
+static int exfat_init_fs_context(struct fs_context *fc)
+{
+ struct exfat_sb_info *sbi;
+
+ sbi = kzalloc(sizeof(struct exfat_sb_info), GFP_KERNEL);
+ if (!sbi)
+ return -ENOMEM;
+
+ mutex_init(&sbi->s_lock);
+ ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ sbi->options.fs_uid = current_uid();
+ sbi->options.fs_gid = current_gid();
+ sbi->options.fs_fmask = current->fs->umask;
+ sbi->options.fs_dmask = current->fs->umask;
+ sbi->options.allow_utime = -1;
+ sbi->options.iocharset = exfat_default_iocharset;
+ sbi->options.errors = EXFAT_ERRORS_RO;
+
+ fc->s_fs_info = sbi;
+ fc->ops = &exfat_context_ops;
+ return 0;
+}
+
+static struct file_system_type exfat_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "exfat",
+ .init_fs_context = exfat_init_fs_context,
+ .parameters = exfat_parameters,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+static void exfat_inode_init_once(void *foo)
+{
+ struct exfat_inode_info *ei = (struct exfat_inode_info *)foo;
+
+ INIT_HLIST_NODE(&ei->i_hash_fat);
+ inode_init_once(&ei->vfs_inode);
+}
+
+static int __init init_exfat_fs(void)
+{
+ int err;
+
+ err = exfat_cache_init();
+ if (err)
+ return err;
+
+ exfat_inode_cachep = kmem_cache_create("exfat_inode_cache",
+ sizeof(struct exfat_inode_info),
+ 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ exfat_inode_init_once);
+ if (!exfat_inode_cachep) {
+ err = -ENOMEM;
+ goto shutdown_cache;
+ }
+
+ err = register_filesystem(&exfat_fs_type);
+ if (err)
+ goto destroy_cache;
+
+ return 0;
+
+destroy_cache:
+ kmem_cache_destroy(exfat_inode_cachep);
+shutdown_cache:
+ exfat_cache_shutdown();
+ return err;
+}
+
+static void __exit exit_exfat_fs(void)
+{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
+ kmem_cache_destroy(exfat_inode_cachep);
+ unregister_filesystem(&exfat_fs_type);
+ exfat_cache_shutdown();
+}
+
+module_init(init_exfat_fs);
+module_exit(exit_exfat_fs);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("exFAT filesystem support");
+MODULE_AUTHOR("Samsung Electronics Co., Ltd.");
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 0456bc990b5e..943cc469f42f 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -56,6 +56,7 @@
#include <linux/buffer_head.h>
#include <linux/init.h>
+#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/mbcache.h>
#include <linux/quotaops.h>
@@ -84,8 +85,8 @@
printk("\n"); \
} while (0)
#else
-# define ea_idebug(f...)
-# define ea_bdebug(f...)
+# define ea_idebug(inode, f...) no_printk(f)
+# define ea_bdebug(bh, f...) no_printk(f)
#endif
static int ext2_xattr_set2(struct inode *, struct buffer_head *,
@@ -790,7 +791,15 @@ ext2_xattr_delete_inode(struct inode *inode)
struct buffer_head *bh = NULL;
struct ext2_sb_info *sbi = EXT2_SB(inode->i_sb);
- down_write(&EXT2_I(inode)->xattr_sem);
+ /*
+ * We are the only ones holding inode reference. The xattr_sem should
+ * better be unlocked! We could as well just not acquire xattr_sem at
+ * all but this makes the code more futureproof. OTOH we need trylock
+ * here to avoid false-positive warning from lockdep about reclaim
+ * circular dependency.
+ */
+ if (WARN_ON_ONCE(!down_write_trylock(&EXT2_I(inode)->xattr_sem)))
+ return;
if (!EXT2_I(inode)->i_file_acl)
goto cleanup;
@@ -864,8 +873,7 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
true);
if (error) {
if (error == -EBUSY) {
- ea_bdebug(bh, "already in cache (%d cache entries)",
- atomic_read(&ext2_xattr_cache->c_entry_count));
+ ea_bdebug(bh, "already in cache");
error = 0;
}
} else
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index cee888cdc235..16272e6ddcf4 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -39,7 +39,7 @@ struct ext2_xattr_entry {
__le32 e_value_block; /* disk block attribute is stored on (n/i) */
__le32 e_value_size; /* size of attribute value */
__le32 e_hash; /* hash value of name and value */
- char e_name[0]; /* attribute name */
+ char e_name[]; /* attribute name */
};
#define EXT2_XATTR_PAD_BITS 2
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 8fd0b3cdab4c..0e0a4d6209c7 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -516,10 +516,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
wait_on_buffer(bh);
ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
if (!buffer_uptodate(bh)) {
- ext4_set_errno(sb, EIO);
- ext4_error(sb, "Cannot read block bitmap - "
- "block_group = %u, block_bitmap = %llu",
- block_group, (unsigned long long) bh->b_blocknr);
+ ext4_error_err(sb, EIO, "Cannot read block bitmap - "
+ "block_group = %u, block_bitmap = %llu",
+ block_group, (unsigned long long) bh->b_blocknr);
ext4_mark_group_bitmap_corrupted(sb, block_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return -EIO;
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 0a734ffb4310..16e9b2fda03a 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -166,10 +166,8 @@ static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi,
if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
(start_blk + count < start_blk) ||
- (start_blk + count > ext4_blocks_count(sbi->s_es))) {
- sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+ (start_blk + count > ext4_blocks_count(sbi->s_es)))
return 0;
- }
if (system_blks == NULL)
return 1;
@@ -181,10 +179,8 @@ static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi,
n = n->rb_left;
else if (start_blk >= (entry->start_blk + entry->count))
n = n->rb_right;
- else {
- sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+ else
return 0;
- }
}
return 1;
}
@@ -220,10 +216,12 @@ static int ext4_protect_reserved_inode(struct super_block *sb,
} else {
if (!ext4_data_block_valid_rcu(sbi, system_blks,
map.m_pblk, n)) {
- ext4_error(sb, "blocks %llu-%llu from inode %u "
- "overlap system zone", map.m_pblk,
- map.m_pblk + map.m_len - 1, ino);
err = -EFSCORRUPTED;
+ __ext4_error(sb, __func__, __LINE__, -err,
+ map.m_pblk, "blocks %llu-%llu "
+ "from inode %u overlap system zone",
+ map.m_pblk,
+ map.m_pblk + map.m_len - 1, ino);
break;
}
err = add_system_zone(system_blks, map.m_pblk, n);
@@ -365,7 +363,6 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
int ext4_check_blockref(const char *function, unsigned int line,
struct inode *inode, __le32 *p, unsigned int max)
{
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
__le32 *bref = p;
unsigned int blk;
@@ -379,7 +376,6 @@ int ext4_check_blockref(const char *function, unsigned int line,
if (blk &&
unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
blk, 1))) {
- es->s_last_error_block = cpu_to_le64(blk);
ext4_error_inode(inode, function, line, blk,
"invalid block");
return -EFSCORRUPTED;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9aa1f75409b0..c654205f648d 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -392,7 +392,7 @@ struct fname {
__u32 inode;
__u8 name_len;
__u8 file_type;
- char name[0];
+ char name[];
};
/*
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 61b37a052052..91eb4381cae5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -414,7 +414,7 @@ struct flex_groups {
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
-#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
+/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */
@@ -487,7 +487,7 @@ enum {
EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
EXT4_INODE_VERITY = 20, /* Verity protected inode */
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
- EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
+/* 22 was formerly EXT4_INODE_EOFBLOCKS */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */
EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
@@ -533,7 +533,6 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(EXTENTS);
CHECK_FLAG_VALUE(VERITY);
CHECK_FLAG_VALUE(EA_INODE);
- CHECK_FLAG_VALUE(EOFBLOCKS);
CHECK_FLAG_VALUE(INLINE_DATA);
CHECK_FLAG_VALUE(PROJINHERIT);
CHECK_FLAG_VALUE(RESERVED);
@@ -2771,21 +2770,20 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno,
extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
ext4_group_t block_group,
unsigned int flags);
-extern void ext4_set_errno(struct super_block *sb, int err);
-extern __printf(4, 5)
-void __ext4_error(struct super_block *, const char *, unsigned int,
+extern __printf(6, 7)
+void __ext4_error(struct super_block *, const char *, unsigned int, int, __u64,
const char *, ...);
-extern __printf(5, 6)
-void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
- const char *, ...);
+extern __printf(6, 7)
+void __ext4_error_inode(struct inode *, const char *, unsigned int,
+ ext4_fsblk_t, int, const char *, ...);
extern __printf(5, 6)
void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
unsigned int, int);
-extern __printf(4, 5)
-void __ext4_abort(struct super_block *, const char *, unsigned int,
+extern __printf(5, 6)
+void __ext4_abort(struct super_block *, const char *, unsigned int, int,
const char *, ...);
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
@@ -2806,8 +2804,12 @@ void __ext4_grp_locked_error(const char *, unsigned int,
#define EXT4_ERROR_INODE(inode, fmt, a...) \
ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
-#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
- ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
+#define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...) \
+ __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a)
+
+#define ext4_error_inode_block(inode, block, err, fmt, a...) \
+ __ext4_error_inode((inode), __func__, __LINE__, (block), (err), \
+ (fmt), ## a)
#define EXT4_ERROR_FILE(file, block, fmt, a...) \
ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
@@ -2815,13 +2817,18 @@ void __ext4_grp_locked_error(const char *, unsigned int,
#ifdef CONFIG_PRINTK
#define ext4_error_inode(inode, func, line, block, fmt, ...) \
- __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+ __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__)
+#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \
+ __ext4_error_inode((inode), (func), (line), (block), \
+ (err), (fmt), ##__VA_ARGS__)
#define ext4_error_file(file, func, line, block, fmt, ...) \
__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
#define ext4_error(sb, fmt, ...) \
- __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
-#define ext4_abort(sb, fmt, ...) \
- __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+ __ext4_error((sb), __func__, __LINE__, 0, 0, (fmt), ##__VA_ARGS__)
+#define ext4_error_err(sb, err, fmt, ...) \
+ __ext4_error((sb), __func__, __LINE__, (err), 0, (fmt), ##__VA_ARGS__)
+#define ext4_abort(sb, err, fmt, ...) \
+ __ext4_abort((sb), __func__, __LINE__, (err), (fmt), ##__VA_ARGS__)
#define ext4_warning(sb, fmt, ...) \
__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_warning_inode(inode, fmt, ...) \
@@ -2839,7 +2846,12 @@ void __ext4_grp_locked_error(const char *, unsigned int,
#define ext4_error_inode(inode, func, line, block, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
- __ext4_error_inode(inode, "", 0, block, " "); \
+ __ext4_error_inode(inode, "", 0, block, 0, " "); \
+} while (0)
+#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_inode(inode, "", 0, block, err, " "); \
} while (0)
#define ext4_error_file(file, func, line, block, fmt, ...) \
do { \
@@ -2849,12 +2861,17 @@ do { \
#define ext4_error(sb, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
- __ext4_error(sb, "", 0, " "); \
+ __ext4_error(sb, "", 0, 0, 0, " "); \
+} while (0)
+#define ext4_error_err(sb, err, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error(sb, "", 0, err, 0, " "); \
} while (0)
-#define ext4_abort(sb, fmt, ...) \
+#define ext4_abort(sb, err, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
- __ext4_abort(sb, "", 0, " "); \
+ __ext4_abort(sb, "", 0, err, " "); \
} while (0)
#define ext4_warning(sb, fmt, ...) \
do { \
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 1f53d64e42a5..7f16e1af8d5c 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -80,8 +80,7 @@ static int ext4_journal_check_start(struct super_block *sb)
* take the FS itself readonly cleanly.
*/
if (journal && is_journal_aborted(journal)) {
- ext4_set_errno(sb, -journal->j_errno);
- ext4_abort(sb, "Detected aborted journal");
+ ext4_abort(sb, -journal->j_errno, "Detected aborted journal");
return -EROFS;
}
return 0;
@@ -272,8 +271,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
if (err) {
ext4_journal_abort_handle(where, line, __func__,
bh, handle, err);
- ext4_set_errno(inode->i_sb, -err);
- __ext4_abort(inode->i_sb, where, line,
+ __ext4_abort(inode->i_sb, where, line, -err,
"error %d when attempting revoke", err);
}
BUFFER_TRACE(bh, "exit");
@@ -332,6 +330,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
err);
}
} else {
+ set_buffer_uptodate(bh);
if (inode)
mark_buffer_dirty_inode(bh, inode);
else
@@ -342,11 +341,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
struct ext4_super_block *es;
es = EXT4_SB(inode->i_sb)->s_es;
- es->s_last_error_block =
- cpu_to_le64(bh->b_blocknr);
- ext4_set_errno(inode->i_sb, EIO);
- ext4_error_inode(inode, where, line,
- bh->b_blocknr,
+ ext4_error_inode_err(inode, where, line,
+ bh->b_blocknr, EIO,
"IO error syncing itable block");
err = -EIO;
}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 7ea4f6fa173b..4b9002f0e84c 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -512,6 +512,9 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
return 0;
if (ext4_should_journal_data(inode))
return 0;
+ /* temporary fix to prevent generic/422 test failures */
+ if (!test_opt(inode->i_sb, DELALLOC))
+ return 0;
return 1;
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 954013d6076b..031752cfb6f7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -28,6 +28,7 @@
#include <linux/uaccess.h>
#include <linux/fiemap.h>
#include <linux/backing-dev.h>
+#include <linux/iomap.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"
@@ -83,13 +84,6 @@ static void ext4_extent_block_csum_set(struct inode *inode,
et->et_checksum = ext4_extent_block_csum(inode, eh);
}
-static int ext4_split_extent(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path **ppath,
- struct ext4_map_blocks *map,
- int split_flag,
- int flags);
-
static int ext4_split_extent_at(handle_t *handle,
struct inode *inode,
struct ext4_ext_path **ppath,
@@ -97,9 +91,6 @@ static int ext4_split_extent_at(handle_t *handle,
int split_flag,
int flags);
-static int ext4_find_delayed_extent(struct inode *inode,
- struct extent_status *newes);
-
static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
/*
@@ -358,8 +349,8 @@ static int ext4_valid_extent_idx(struct inode *inode,
}
static int ext4_valid_extent_entries(struct inode *inode,
- struct ext4_extent_header *eh,
- int depth)
+ struct ext4_extent_header *eh,
+ ext4_fsblk_t *pblk, int depth)
{
unsigned short entries;
if (eh->eh_entries == 0)
@@ -370,8 +361,6 @@ static int ext4_valid_extent_entries(struct inode *inode,
if (depth == 0) {
/* leaf entries */
struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
- ext4_fsblk_t pblock = 0;
ext4_lblk_t lblock = 0;
ext4_lblk_t prev = 0;
int len = 0;
@@ -383,8 +372,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
lblock = le32_to_cpu(ext->ee_block);
len = ext4_ext_get_actual_len(ext);
if ((lblock <= prev) && prev) {
- pblock = ext4_ext_pblock(ext);
- es->s_last_error_block = cpu_to_le64(pblock);
+ *pblk = ext4_ext_pblock(ext);
return 0;
}
ext++;
@@ -431,7 +419,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
error_msg = "invalid eh_entries";
goto corrupted;
}
- if (!ext4_valid_extent_entries(inode, eh, depth)) {
+ if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) {
error_msg = "invalid extent entries";
goto corrupted;
}
@@ -449,14 +437,14 @@ static int __ext4_ext_check(const char *function, unsigned int line,
return 0;
corrupted:
- ext4_set_errno(inode->i_sb, -err);
- ext4_error_inode(inode, function, line, 0,
- "pblk %llu bad header/extent: %s - magic %x, "
- "entries %u, max %u(%u), depth %u(%u)",
- (unsigned long long) pblk, error_msg,
- le16_to_cpu(eh->eh_magic),
- le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
- max, le16_to_cpu(eh->eh_depth), depth);
+ ext4_error_inode_err(inode, function, line, 0, -err,
+ "pblk %llu bad header/extent: %s - magic %x, "
+ "entries %u, max %u(%u), depth %u(%u)",
+ (unsigned long long) pblk, error_msg,
+ le16_to_cpu(eh->eh_magic),
+ le16_to_cpu(eh->eh_entries),
+ le16_to_cpu(eh->eh_max),
+ max, le16_to_cpu(eh->eh_depth), depth);
return err;
}
@@ -556,6 +544,12 @@ int ext4_ext_precache(struct inode *inode)
down_read(&ei->i_data_sem);
depth = ext_depth(inode);
+ /* Don't cache anything if there are no external extent blocks */
+ if (!depth) {
+ up_read(&ei->i_data_sem);
+ return ret;
+ }
+
path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
GFP_NOFS);
if (path == NULL) {
@@ -563,9 +557,6 @@ int ext4_ext_precache(struct inode *inode)
return -ENOMEM;
}
- /* Don't cache anything if there are no external extent blocks */
- if (depth == 0)
- goto out;
path[0].p_hdr = ext_inode_hdr(inode);
ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
if (ret)
@@ -2134,155 +2125,6 @@ cleanup:
return err;
}
-static int ext4_fill_fiemap_extents(struct inode *inode,
- ext4_lblk_t block, ext4_lblk_t num,
- struct fiemap_extent_info *fieinfo)
-{
- struct ext4_ext_path *path = NULL;
- struct ext4_extent *ex;
- struct extent_status es;
- ext4_lblk_t next, next_del, start = 0, end = 0;
- ext4_lblk_t last = block + num;
- int exists, depth = 0, err = 0;
- unsigned int flags = 0;
- unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
-
- while (block < last && block != EXT_MAX_BLOCKS) {
- num = last - block;
- /* find extent for this block */
- down_read(&EXT4_I(inode)->i_data_sem);
-
- path = ext4_find_extent(inode, block, &path, 0);
- if (IS_ERR(path)) {
- up_read(&EXT4_I(inode)->i_data_sem);
- err = PTR_ERR(path);
- path = NULL;
- break;
- }
-
- depth = ext_depth(inode);
- if (unlikely(path[depth].p_hdr == NULL)) {
- up_read(&EXT4_I(inode)->i_data_sem);
- EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- err = -EFSCORRUPTED;
- break;
- }
- ex = path[depth].p_ext;
- next = ext4_ext_next_allocated_block(path);
-
- flags = 0;
- exists = 0;
- if (!ex) {
- /* there is no extent yet, so try to allocate
- * all requested space */
- start = block;
- end = block + num;
- } else if (le32_to_cpu(ex->ee_block) > block) {
- /* need to allocate space before found extent */
- start = block;
- end = le32_to_cpu(ex->ee_block);
- if (block + num < end)
- end = block + num;
- } else if (block >= le32_to_cpu(ex->ee_block)
- + ext4_ext_get_actual_len(ex)) {
- /* need to allocate space after found extent */
- start = block;
- end = block + num;
- if (end >= next)
- end = next;
- } else if (block >= le32_to_cpu(ex->ee_block)) {
- /*
- * some part of requested space is covered
- * by found extent
- */
- start = block;
- end = le32_to_cpu(ex->ee_block)
- + ext4_ext_get_actual_len(ex);
- if (block + num < end)
- end = block + num;
- exists = 1;
- } else {
- BUG();
- }
- BUG_ON(end <= start);
-
- if (!exists) {
- es.es_lblk = start;
- es.es_len = end - start;
- es.es_pblk = 0;
- } else {
- es.es_lblk = le32_to_cpu(ex->ee_block);
- es.es_len = ext4_ext_get_actual_len(ex);
- es.es_pblk = ext4_ext_pblock(ex);
- if (ext4_ext_is_unwritten(ex))
- flags |= FIEMAP_EXTENT_UNWRITTEN;
- }
-
- /*
- * Find delayed extent and update es accordingly. We call
- * it even in !exists case to find out whether es is the
- * last existing extent or not.
- */
- next_del = ext4_find_delayed_extent(inode, &es);
- if (!exists && next_del) {
- exists = 1;
- flags |= (FIEMAP_EXTENT_DELALLOC |
- FIEMAP_EXTENT_UNKNOWN);
- }
- up_read(&EXT4_I(inode)->i_data_sem);
-
- if (unlikely(es.es_len == 0)) {
- EXT4_ERROR_INODE(inode, "es.es_len == 0");
- err = -EFSCORRUPTED;
- break;
- }
-
- /*
- * This is possible iff next == next_del == EXT_MAX_BLOCKS.
- * we need to check next == EXT_MAX_BLOCKS because it is
- * possible that an extent is with unwritten and delayed
- * status due to when an extent is delayed allocated and
- * is allocated by fallocate status tree will track both of
- * them in a extent.
- *
- * So we could return a unwritten and delayed extent, and
- * its block is equal to 'next'.
- */
- if (next == next_del && next == EXT_MAX_BLOCKS) {
- flags |= FIEMAP_EXTENT_LAST;
- if (unlikely(next_del != EXT_MAX_BLOCKS ||
- next != EXT_MAX_BLOCKS)) {
- EXT4_ERROR_INODE(inode,
- "next extent == %u, next "
- "delalloc extent = %u",
- next, next_del);
- err = -EFSCORRUPTED;
- break;
- }
- }
-
- if (exists) {
- err = fiemap_fill_next_extent(fieinfo,
- (__u64)es.es_lblk << blksize_bits,
- (__u64)es.es_pblk << blksize_bits,
- (__u64)es.es_len << blksize_bits,
- flags);
- if (err < 0)
- break;
- if (err == 1) {
- err = 0;
- break;
- }
- }
-
- block = es.es_lblk + es.es_len;
- }
-
- ext4_ext_drop_refs(path);
- kfree(path);
- return err;
-}
-
static int ext4_fill_es_cache_info(struct inode *inode,
ext4_lblk_t block, ext4_lblk_t num,
struct fiemap_extent_info *fieinfo)
@@ -3874,64 +3716,11 @@ out:
return err;
}
-/*
- * Handle EOFBLOCKS_FL flag, clearing it if necessary
- */
-static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
- ext4_lblk_t lblk,
- struct ext4_ext_path *path,
- unsigned int len)
-{
- int i, depth;
- struct ext4_extent_header *eh;
- struct ext4_extent *last_ex;
-
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
- return 0;
-
- depth = ext_depth(inode);
- eh = path[depth].p_hdr;
-
- /*
- * We're going to remove EOFBLOCKS_FL entirely in future so we
- * do not care for this case anymore. Simply remove the flag
- * if there are no extents.
- */
- if (unlikely(!eh->eh_entries))
- goto out;
- last_ex = EXT_LAST_EXTENT(eh);
- /*
- * We should clear the EOFBLOCKS_FL flag if we are writing the
- * last block in the last extent in the file. We test this by
- * first checking to see if the caller to
- * ext4_ext_get_blocks() was interested in the last block (or
- * a block beyond the last block) in the current extent. If
- * this turns out to be false, we can bail out from this
- * function immediately.
- */
- if (lblk + len < le32_to_cpu(last_ex->ee_block) +
- ext4_ext_get_actual_len(last_ex))
- return 0;
- /*
- * If the caller does appear to be planning to write at or
- * beyond the end of the current extent, we then test to see
- * if the current extent is the last extent in the file, by
- * checking to make sure it was reached via the rightmost node
- * at each level of the tree.
- */
- for (i = depth-1; i >= 0; i--)
- if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
- return 0;
-out:
- ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
- return ext4_mark_inode_dirty(handle, inode);
-}
-
static int
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path **ppath,
- unsigned int allocated)
+ unsigned int *allocated)
{
struct ext4_ext_path *path = *ppath;
struct ext4_extent *ex;
@@ -3991,14 +3780,12 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
ext4_ext_show_leaf(inode, path);
ext4_update_inode_fsync_trans(handle, inode, 1);
- err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
- if (err)
- return err;
+
map->m_flags |= EXT4_MAP_UNWRITTEN;
- if (allocated > map->m_len)
- allocated = map->m_len;
- map->m_len = allocated;
- return allocated;
+ if (*allocated > map->m_len)
+ *allocated = map->m_len;
+ map->m_len = *allocated;
+ return 0;
}
static int
@@ -4007,7 +3794,9 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
struct ext4_ext_path **ppath, int flags,
unsigned int allocated, ext4_fsblk_t newblock)
{
+#ifdef EXT_DEBUG
struct ext4_ext_path *path = *ppath;
+#endif
int ret = 0;
int err = 0;
@@ -4047,11 +3836,9 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
}
ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
ppath);
- if (ret >= 0) {
+ if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
- err = check_eofblocks_fl(handle, inode, map->m_lblk,
- path, map->m_len);
- } else
+ else
err = ret;
map->m_flags |= EXT4_MAP_MAPPED;
map->m_pblk = newblock;
@@ -4100,12 +3887,6 @@ out:
map_out:
map->m_flags |= EXT4_MAP_MAPPED;
- if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
- err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
- map->m_len);
- if (err < 0)
- goto out2;
- }
out1:
if (allocated > map->m_len)
allocated = map->m_len;
@@ -4244,12 +4025,11 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent newex, *ex, *ex2;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_fsblk_t newblock = 0;
- int free_on_err = 0, err = 0, depth, ret;
+ int err = 0, depth, ret;
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
ext4_lblk_t cluster_offset;
- bool map_from_cluster = false;
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
@@ -4308,12 +4088,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
*/
if ((!ext4_ext_is_unwritten(ex)) &&
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
- allocated = convert_initialized_extent(
- handle, inode, map, &path,
- allocated);
+ err = convert_initialized_extent(handle,
+ inode, map, &path, &allocated);
goto out2;
- } else if (!ext4_ext_is_unwritten(ex))
+ } else if (!ext4_ext_is_unwritten(ex)) {
goto out;
+ }
ret = ext4_ext_handle_unwritten_extents(
handle, inode, map, &path, flags,
@@ -4364,7 +4144,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
- map_from_cluster = true;
goto got_allocated_blocks;
}
@@ -4385,7 +4164,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
- map_from_cluster = true;
goto got_allocated_blocks;
}
@@ -4442,7 +4220,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
goto out2;
ext_debug("allocate new block: goal %llu, found %llu/%u\n",
ar.goal, newblock, allocated);
- free_on_err = 1;
allocated_clusters = ar.len;
ar.len = EXT4_C2B(sbi, ar.len) - offset;
if (ar.len > allocated)
@@ -4453,28 +4230,28 @@ got_allocated_blocks:
ext4_ext_store_pblock(&newex, newblock + offset);
newex.ee_len = cpu_to_le16(ar.len);
/* Mark unwritten */
- if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
+ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
ext4_ext_mark_unwritten(&newex);
map->m_flags |= EXT4_MAP_UNWRITTEN;
}
- err = 0;
- if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
- err = check_eofblocks_fl(handle, inode, map->m_lblk,
- path, ar.len);
- if (!err)
- err = ext4_ext_insert_extent(handle, inode, &path,
- &newex, flags);
-
- if (err && free_on_err) {
- int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
- EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
- /* free data blocks we just allocated */
- /* not a good idea to call discard here directly,
- * but otherwise we'd need to call it every free() */
- ext4_discard_preallocations(inode);
- ext4_free_blocks(handle, inode, NULL, newblock,
- EXT4_C2B(sbi, allocated_clusters), fb_flags);
+ err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
+ if (err) {
+ if (allocated_clusters) {
+ int fb_flags = 0;
+
+ /*
+ * free data blocks we just allocated.
+ * not a good idea to call discard here directly,
+ * but otherwise we'd need to call it every free().
+ */
+ ext4_discard_preallocations(inode);
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
+ ext4_free_blocks(handle, inode, NULL, newblock,
+ EXT4_C2B(sbi, allocated_clusters),
+ fb_flags);
+ }
goto out2;
}
@@ -4491,7 +4268,7 @@ got_allocated_blocks:
* clusters discovered to be delayed allocated. Once allocated, a
* cluster is not included in the reserved count.
*/
- if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
+ if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
/*
* When allocating delayed allocated clusters, simply
@@ -4645,10 +4422,6 @@ retry:
epos = new_size;
if (ext4_update_inode_size(inode, epos) & 0x1)
inode->i_mtime = inode->i_ctime;
- } else {
- if (epos > inode->i_size)
- ext4_set_inode_flag(inode,
- EXT4_INODE_EOFBLOCKS);
}
ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -4802,16 +4575,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
inode->i_mtime = inode->i_ctime = current_time(inode);
- if (new_size) {
+ if (new_size)
ext4_update_inode_size(inode, new_size);
- } else {
- /*
- * Mark that we allocate beyond EOF so the subsequent truncate
- * can proceed even if the new size is the same as i_size.
- */
- if (offset + len > inode->i_size)
- ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
- }
ext4_mark_inode_dirty(handle, inode);
/* Zero out partial block at the edges of the range */
@@ -5009,64 +4774,13 @@ int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
return ret < 0 ? ret : err;
}
-/*
- * If newes is not existing extent (newes->ec_pblk equals zero) find
- * delayed extent at start of newes and update newes accordingly and
- * return start of the next delayed extent.
- *
- * If newes is existing extent (newes->ec_pblk is not equal zero)
- * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
- * extent found. Leave newes unmodified.
- */
-static int ext4_find_delayed_extent(struct inode *inode,
- struct extent_status *newes)
-{
- struct extent_status es;
- ext4_lblk_t block, next_del;
-
- if (newes->es_pblk == 0) {
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
- newes->es_lblk,
- newes->es_lblk + newes->es_len - 1,
- &es);
-
- /*
- * No extent in extent-tree contains block @newes->es_pblk,
- * then the block may stay in 1)a hole or 2)delayed-extent.
- */
- if (es.es_len == 0)
- /* A hole found. */
- return 0;
-
- if (es.es_lblk > newes->es_lblk) {
- /* A hole found. */
- newes->es_len = min(es.es_lblk - newes->es_lblk,
- newes->es_len);
- return 0;
- }
-
- newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
- }
-
- block = newes->es_lblk + newes->es_len;
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block,
- EXT_MAX_BLOCKS, &es);
- if (es.es_len == 0)
- next_del = EXT_MAX_BLOCKS;
- else
- next_del = es.es_lblk;
-
- return next_del;
-}
-
-static int ext4_xattr_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo)
+static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap)
{
__u64 physical = 0;
- __u64 length;
- __u32 flags = FIEMAP_EXTENT_LAST;
+ __u64 length = 0;
int blockbits = inode->i_sb->s_blocksize_bits;
int error = 0;
+ u16 iomap_type;
/* in-inode? */
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
@@ -5081,40 +4795,49 @@ static int ext4_xattr_fiemap(struct inode *inode,
EXT4_I(inode)->i_extra_isize;
physical += offset;
length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
- flags |= FIEMAP_EXTENT_DATA_INLINE;
brelse(iloc.bh);
- } else { /* external block */
+ iomap_type = IOMAP_INLINE;
+ } else if (EXT4_I(inode)->i_file_acl) { /* external block */
physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
length = inode->i_sb->s_blocksize;
+ iomap_type = IOMAP_MAPPED;
+ } else {
+ /* no in-inode or external block for xattr, so return -ENOENT */
+ error = -ENOENT;
+ goto out;
}
- if (physical)
- error = fiemap_fill_next_extent(fieinfo, 0, physical,
- length, flags);
- return (error < 0 ? error : 0);
+ iomap->addr = physical;
+ iomap->offset = 0;
+ iomap->length = length;
+ iomap->type = iomap_type;
+ iomap->flags = 0;
+out:
+ return error;
}
-static int _ext4_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len,
- int (*fill)(struct inode *, ext4_lblk_t,
- ext4_lblk_t,
- struct fiemap_extent_info *))
+static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned flags,
+ struct iomap *iomap, struct iomap *srcmap)
{
- ext4_lblk_t start_blk;
- u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR;
+ int error;
- int error = 0;
-
- if (ext4_has_inline_data(inode)) {
- int has_inline = 1;
+ error = ext4_iomap_xattr_fiemap(inode, iomap);
+ if (error == 0 && (offset >= iomap->length))
+ error = -ENOENT;
+ return error;
+}
- error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
- start, len);
+static const struct iomap_ops ext4_iomap_xattr_ops = {
+ .iomap_begin = ext4_iomap_xattr_begin,
+};
- if (has_inline)
- return error;
- }
+static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len, bool from_es_cache)
+{
+ ext4_lblk_t start_blk;
+ u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR;
+ int error = 0;
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
error = ext4_ext_precache(inode);
@@ -5123,19 +4846,19 @@ static int _ext4_fiemap(struct inode *inode,
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
}
- /* fallback to generic here if not in extents fmt */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
- fill == ext4_fill_fiemap_extents)
- return generic_block_fiemap(inode, fieinfo, start, len,
- ext4_get_block);
-
- if (fill == ext4_fill_es_cache_info)
+ if (from_es_cache)
ext4_fiemap_flags &= FIEMAP_FLAG_XATTR;
+
if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
return -EBADR;
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
- error = ext4_xattr_fiemap(inode, fieinfo);
+ fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
+ error = iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_xattr_ops);
+ } else if (!from_es_cache) {
+ error = iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_report_ops);
} else {
ext4_lblk_t len_blks;
__u64 last_blk;
@@ -5150,7 +4873,8 @@ static int _ext4_fiemap(struct inode *inode,
* Walk the extent tree gathering extent information
* and pushing extents back to the user.
*/
- error = fill(inode, start_blk, len_blks, fieinfo);
+ error = ext4_fill_es_cache_info(inode, start_blk, len_blks,
+ fieinfo);
}
return error;
}
@@ -5158,8 +4882,7 @@ static int _ext4_fiemap(struct inode *inode,
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
- return _ext4_fiemap(inode, fieinfo, start, len,
- ext4_fill_fiemap_extents);
+ return _ext4_fiemap(inode, fieinfo, start, len, false);
}
int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5175,8 +4898,7 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
return 0;
}
- return _ext4_fiemap(inode, fieinfo, start, len,
- ext4_fill_es_cache_info);
+ return _ext4_fiemap(inode, fieinfo, start, len, true);
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5f225881176b..0d624250a62b 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -872,6 +872,7 @@ const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
+ .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f95ee99091e4..b420c9dc444d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -196,10 +196,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
if (!buffer_uptodate(bh)) {
put_bh(bh);
- ext4_set_errno(sb, EIO);
- ext4_error(sb, "Cannot read inode bitmap - "
- "block_group = %u, inode_bitmap = %llu",
- block_group, bitmap_blk);
+ ext4_error_err(sb, EIO, "Cannot read inode bitmap - "
+ "block_group = %u, inode_bitmap = %llu",
+ block_group, bitmap_blk);
ext4_mark_group_bitmap_corrupted(sb, block_group,
EXT4_GROUP_INFO_IBITMAP_CORRUPT);
return ERR_PTR(-EIO);
@@ -712,21 +711,34 @@ out:
static int find_inode_bit(struct super_block *sb, ext4_group_t group,
struct buffer_head *bitmap, unsigned long *ino)
{
+ bool check_recently_deleted = EXT4_SB(sb)->s_journal == NULL;
+ unsigned long recently_deleted_ino = EXT4_INODES_PER_GROUP(sb);
+
next:
*ino = ext4_find_next_zero_bit((unsigned long *)
bitmap->b_data,
EXT4_INODES_PER_GROUP(sb), *ino);
if (*ino >= EXT4_INODES_PER_GROUP(sb))
- return 0;
+ goto not_found;
- if ((EXT4_SB(sb)->s_journal == NULL) &&
- recently_deleted(sb, group, *ino)) {
+ if (check_recently_deleted && recently_deleted(sb, group, *ino)) {
+ recently_deleted_ino = *ino;
*ino = *ino + 1;
if (*ino < EXT4_INODES_PER_GROUP(sb))
goto next;
- return 0;
+ goto not_found;
}
-
+ return 1;
+not_found:
+ if (recently_deleted_ino >= EXT4_INODES_PER_GROUP(sb))
+ return 0;
+ /*
+ * Not reusing recently deleted inodes is mostly a preference. We don't
+ * want to report ENOSPC or skew allocation patterns because of that.
+ * So return even recently deleted inode if we could find better in the
+ * given range.
+ */
+ *ino = recently_deleted_ino;
return 1;
}
@@ -1231,9 +1243,9 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- ext4_set_errno(sb, -err);
- ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
- ino, err);
+ ext4_error_err(sb, -err,
+ "couldn't read orphan inode %lu (err %d)",
+ ino, err);
return inode;
}
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 569fc68e8975..107f0043f67f 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1019,7 +1019,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
* (should be rare).
*/
if (!bh) {
- EXT4_ERROR_INODE_BLOCK(inode, nr,
+ ext4_error_inode_block(inode, nr, EIO,
"Read failure");
continue;
}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index fad82d08fca5..f35e289e17aa 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -98,10 +98,9 @@ int ext4_get_max_inline_size(struct inode *inode)
error = ext4_get_inode_loc(inode, &iloc);
if (error) {
- ext4_set_errno(inode->i_sb, -error);
- ext4_error_inode(inode, __func__, __LINE__, 0,
- "can't get inode location %lu",
- inode->i_ino);
+ ext4_error_inode_err(inode, __func__, __LINE__, 0, -error,
+ "can't get inode location %lu",
+ inode->i_ino);
return 0;
}
@@ -1762,9 +1761,9 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
err = ext4_get_inode_loc(dir, &iloc);
if (err) {
- ext4_set_errno(dir->i_sb, -err);
- EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
- err, dir->i_ino);
+ EXT4_ERROR_INODE_ERR(dir, -err,
+ "error %d getting inode %lu block",
+ err, dir->i_ino);
return true;
}
@@ -1857,47 +1856,6 @@ out:
return error;
}
-int ext4_inline_data_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo,
- int *has_inline, __u64 start, __u64 len)
-{
- __u64 physical = 0;
- __u64 inline_len;
- __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
- FIEMAP_EXTENT_LAST;
- int error = 0;
- struct ext4_iloc iloc;
-
- down_read(&EXT4_I(inode)->xattr_sem);
- if (!ext4_has_inline_data(inode)) {
- *has_inline = 0;
- goto out;
- }
- inline_len = min_t(size_t, ext4_get_inline_size(inode),
- i_size_read(inode));
- if (start >= inline_len)
- goto out;
- if (start + len < inline_len)
- inline_len = start + len;
- inline_len -= start;
-
- error = ext4_get_inode_loc(inode, &iloc);
- if (error)
- goto out;
-
- physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
- physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
- physical += offsetof(struct ext4_inode, i_block);
-
- brelse(iloc.bh);
-out:
- up_read(&EXT4_I(inode)->xattr_sem);
- if (physical)
- error = fiemap_fill_next_extent(fieinfo, start, physical,
- inline_len, flags);
- return (error < 0 ? error : 0);
-}
-
int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
{
handle_t *handle;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index fa0ff78dc033..e416096fc081 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -269,10 +269,9 @@ void ext4_evict_inode(struct inode *inode)
if (inode->i_blocks) {
err = ext4_truncate(inode);
if (err) {
- ext4_set_errno(inode->i_sb, -err);
- ext4_error(inode->i_sb,
- "couldn't truncate inode %lu (err %d)",
- inode->i_ino, err);
+ ext4_error_err(inode->i_sb, -err,
+ "couldn't truncate inode %lu (err %d)",
+ inode->i_ino, err);
goto stop_handle;
}
}
@@ -2478,10 +2477,9 @@ update_disksize:
up_write(&EXT4_I(inode)->i_data_sem);
err2 = ext4_mark_inode_dirty(handle, inode);
if (err2) {
- ext4_set_errno(inode->i_sb, -err2);
- ext4_error(inode->i_sb,
- "Failed to mark inode %lu dirty",
- inode->i_ino);
+ ext4_error_err(inode->i_sb, -err2,
+ "Failed to mark inode %lu dirty",
+ inode->i_ino);
}
if (!err)
err = err2;
@@ -3212,7 +3210,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
return 0;
}
- return generic_block_bmap(mapping, block, ext4_get_block);
+ return iomap_bmap(mapping, block, &ext4_iomap_ops);
}
static int ext4_readpage(struct file *file, struct page *page)
@@ -3333,6 +3331,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
iomap->offset = (u64) map->m_lblk << blkbits;
iomap->length = (u64) map->m_len << blkbits;
+ if ((map->m_flags & EXT4_MAP_MAPPED) &&
+ !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ iomap->flags |= IOMAP_F_MERGED;
+
/*
* Flags passed to ext4_map_blocks() for direct I/O writes can result
* in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
@@ -3542,12 +3544,28 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+ /*
+ * Fiemap callers may call for offset beyond s_bitmap_maxbytes.
+ * So handle it here itself instead of querying ext4_map_blocks().
+ * Since ext4_map_blocks() will warn about it and will return
+ * -EIO error.
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (offset >= sbi->s_bitmap_maxbytes) {
+ map.m_flags = 0;
+ goto set_iomap;
+ }
+ }
+
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret < 0)
return ret;
if (ret == 0)
delalloc = ext4_iomap_is_delalloc(inode, &map);
+set_iomap:
ext4_set_iomap(inode, iomap, &map, offset, length);
if (delalloc && iomap->type == IOMAP_HOLE)
iomap->type = IOMAP_DELALLOC;
@@ -4144,8 +4162,6 @@ int ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return 0;
- ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
-
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
@@ -4364,8 +4380,7 @@ make_io:
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
simulate_eio:
- ext4_set_errno(inode->i_sb, EIO);
- EXT4_ERROR_INODE_BLOCK(inode, block,
+ ext4_error_inode_block(inode, block, EIO,
"unable to read itable block");
brelse(bh);
return -EIO;
@@ -4517,7 +4532,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
(ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
if (flags & EXT4_IGET_HANDLE)
return ERR_PTR(-ESTALE);
- __ext4_error(sb, function, line,
+ __ext4_error(sb, function, line, EFSCORRUPTED, 0,
"inode #%lu: comm %s: iget: illegal inode #",
ino, current->comm);
return ERR_PTR(-EFSCORRUPTED);
@@ -4580,9 +4595,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (!ext4_inode_csum_verify(inode, raw_inode, ei) ||
ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) {
- ext4_set_errno(inode->i_sb, EFSBADCRC);
- ext4_error_inode(inode, function, line, 0,
- "iget: checksum invalid");
+ ext4_error_inode_err(inode, function, line, 0, EFSBADCRC,
+ "iget: checksum invalid");
ret = -EFSBADCRC;
goto bad_inode;
}
@@ -4812,7 +4826,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
struct ext4_inode_info *ei)
{
struct inode *inode = &(ei->vfs_inode);
- u64 i_blocks = inode->i_blocks;
+ u64 i_blocks = READ_ONCE(inode->i_blocks);
struct super_block *sb = inode->i_sb;
if (i_blocks <= ~0U) {
@@ -4982,7 +4996,7 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
+ if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) {
ext4_isize_set(raw_inode, ei->i_disksize);
need_datasync = 1;
}
@@ -5131,9 +5145,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
sync_dirty_buffer(iloc.bh);
if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
- ext4_set_errno(inode->i_sb, EIO);
- EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
- "IO error syncing inode");
+ ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
+ "IO error syncing inode");
err = -EIO;
}
brelse(iloc.bh);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a0ec750018dd..bfc1281fc4cb 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -327,18 +327,6 @@ static int ext4_ioctl_setflags(struct inode *inode,
if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
migrate = 1;
- if (flags & EXT4_EOFBLOCKS_FL) {
- /* we don't support adding EOFBLOCKS flag */
- if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
- err = -EOPNOTSUPP;
- goto flags_out;
- }
- } else if (oldflags & EXT4_EOFBLOCKS_FL) {
- err = ext4_truncate(inode);
- if (err)
- goto flags_out;
- }
-
if ((flags ^ oldflags) & EXT4_CASEFOLD_FL) {
if (!ext4_has_feature_casefold(sb)) {
err = -EOPNOTSUPP;
@@ -1210,6 +1198,11 @@ resizefs_out:
return -EOPNOTSUPP;
return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_NONCE:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_nonce(filp, (void __user *)arg);
+
case EXT4_IOC_CLEAR_ES_CACHE:
{
if (!inode_owner_or_capable(inode))
@@ -1370,6 +1363,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_REMOVE_ENCRYPTION_KEY:
case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ case FS_IOC_GET_ENCRYPTION_NONCE:
case EXT4_IOC_SHUTDOWN:
case FS_IOC_GETFSMAP:
case FS_IOC_ENABLE_VERITY:
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 51a78eb65f3c..87c85be4c12e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1901,8 +1901,15 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
BUG_ON(buddy == NULL);
k = mb_find_next_zero_bit(buddy, max, 0);
- BUG_ON(k >= max);
-
+ if (k >= max) {
+ ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
+ "%d free clusters of order %d. But found 0",
+ grp->bb_counters[i], i);
+ ext4_mark_group_bitmap_corrupted(ac->ac_sb,
+ e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+ break;
+ }
ac->ac_found++;
ac->ac_b_ex.fe_len = 1 << i;
@@ -3914,9 +3921,9 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
err = PTR_ERR(bitmap_bh);
- ext4_set_errno(sb, -err);
- ext4_error(sb, "Error %d reading block bitmap for %u",
- err, group);
+ ext4_error_err(sb, -err,
+ "Error %d reading block bitmap for %u",
+ err, group);
return 0;
}
@@ -4083,18 +4090,16 @@ repeat:
err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
GFP_NOFS|__GFP_NOFAIL);
if (err) {
- ext4_set_errno(sb, -err);
- ext4_error(sb, "Error %d loading buddy information for %u",
- err, group);
+ ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
+ err, group);
continue;
}
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
err = PTR_ERR(bitmap_bh);
- ext4_set_errno(sb, -err);
- ext4_error(sb, "Error %d reading block bitmap for %u",
- err, group);
+ ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
+ err, group);
ext4_mb_unload_buddy(&e4b);
continue;
}
@@ -4302,7 +4307,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
- pa_inode_list) {
+ pa_inode_list,
+ lockdep_is_held(&lg->lg_prealloc_lock)) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
/*
@@ -4347,9 +4353,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
GFP_NOFS|__GFP_NOFAIL);
if (err) {
- ext4_set_errno(sb, -err);
- ext4_error(sb, "Error %d loading buddy information for %u",
- err, group);
+ ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
+ err, group);
continue;
}
ext4_lock_group(sb, group);
@@ -4386,7 +4391,8 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
/* Add the prealloc space to lg */
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
- pa_inode_list) {
+ pa_inode_list,
+ lockdep_is_held(&lg->lg_prealloc_lock)) {
spin_lock(&tmp_pa->pa_lock);
if (tmp_pa->pa_deleted) {
spin_unlock(&tmp_pa->pa_lock);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 87f7551c5132..d34cb8c46655 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -175,8 +175,8 @@ static int kmmpd(void *data)
*/
if (retval) {
if ((failed_writes % 60) == 0) {
- ext4_set_errno(sb, -retval);
- ext4_error(sb, "Error writing to MMP block");
+ ext4_error_err(sb, -retval,
+ "Error writing to MMP block");
}
failed_writes++;
}
@@ -208,9 +208,9 @@ static int kmmpd(void *data)
retval = read_mmp_block(sb, &bh_check, mmp_block);
if (retval) {
- ext4_set_errno(sb, -retval);
- ext4_error(sb, "error reading MMP data: %d",
- retval);
+ ext4_error_err(sb, -retval,
+ "error reading MMP data: %d",
+ retval);
goto exit_thread;
}
@@ -222,8 +222,7 @@ static int kmmpd(void *data)
"Error while updating MMP info. "
"The filesystem seems to have been"
" multiply mounted.");
- ext4_set_errno(sb, EBUSY);
- ext4_error(sb, "abort");
+ ext4_error_err(sb, EBUSY, "abort");
put_bh(bh_check);
retval = -EBUSY;
goto exit_thread;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 30ce3dc69378..1ed86fb6c302 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -422,8 +422,8 @@ repair_branches:
block_len_in_page, 0, &err2);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
if (replaced_count != block_len_in_page) {
- EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
- "Unable to copy data block,"
+ ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset),
+ EIO, "Unable to copy data block,"
" data will be lost.");
*err = -EIO;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index b05ea72f38fd..a8aca4772aaa 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -160,9 +160,9 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
!ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
set_buffer_verified(bh);
else {
- ext4_set_errno(inode->i_sb, EFSBADCRC);
- ext4_error_inode(inode, func, line, block,
- "Directory index failed checksum");
+ ext4_error_inode_err(inode, func, line, block,
+ EFSBADCRC,
+ "Directory index failed checksum");
brelse(bh);
return ERR_PTR(-EFSBADCRC);
}
@@ -172,9 +172,9 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
!ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
set_buffer_verified(bh);
else {
- ext4_set_errno(inode->i_sb, EFSBADCRC);
- ext4_error_inode(inode, func, line, block,
- "Directory block failed checksum");
+ ext4_error_inode_err(inode, func, line, block,
+ EFSBADCRC,
+ "Directory block failed checksum");
brelse(bh);
return ERR_PTR(-EFSBADCRC);
}
@@ -233,13 +233,13 @@ struct dx_root
u8 unused_flags;
}
info;
- struct dx_entry entries[0];
+ struct dx_entry entries[];
};
struct dx_node
{
struct fake_dirent fake;
- struct dx_entry entries[0];
+ struct dx_entry entries[];
};
@@ -1532,9 +1532,9 @@ restart:
goto next;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- ext4_set_errno(sb, EIO);
- EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
- (unsigned long) block);
+ EXT4_ERROR_INODE_ERR(dir, EIO,
+ "reading directory lblock %lu",
+ (unsigned long) block);
brelse(bh);
ret = ERR_PTR(-EIO);
goto cleanup_and_exit;
@@ -1543,9 +1543,9 @@ restart:
!is_dx_internal_node(dir, block,
(struct ext4_dir_entry *)bh->b_data) &&
!ext4_dirblock_csum_verify(dir, bh)) {
- ext4_set_errno(sb, EFSBADCRC);
- EXT4_ERROR_INODE(dir, "checksumming directory "
- "block %lu", (unsigned long)block);
+ EXT4_ERROR_INODE_ERR(dir, EFSBADCRC,
+ "checksumming directory "
+ "block %lu", (unsigned long)block);
brelse(bh);
ret = ERR_PTR(-EFSBADCRC);
goto cleanup_and_exit;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 68b39e75446a..de6fe969f773 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -125,11 +125,10 @@ static void ext4_finish_bio(struct bio *bio)
}
bh = head = page_buffers(page);
/*
- * We check all buffers in the page under BH_Uptodate_Lock
+ * We check all buffers in the page under b_uptodate_lock
* to avoid races with other end io clearing async_write flags
*/
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+ spin_lock_irqsave(&head->b_uptodate_lock, flags);
do {
if (bh_offset(bh) < bio_start ||
bh_offset(bh) + bh->b_size > bio_end) {
@@ -141,8 +140,7 @@ static void ext4_finish_bio(struct bio *bio)
if (bio->bi_status)
buffer_io_error(bh);
} while ((bh = bh->b_this_page) != head);
- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
if (!under_io) {
fscrypt_free_bounce_page(bounce_page);
end_page_writeback(page);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0c7c4adb664e..9728e7b0e84f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -43,7 +43,7 @@
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
-
+#include <linux/part_stat.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -335,10 +335,12 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
#define ext4_get_tstamp(es, tstamp) \
__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
-static void __save_error_info(struct super_block *sb, const char *func,
- unsigned int line)
+static void __save_error_info(struct super_block *sb, int error,
+ __u32 ino, __u64 block,
+ const char *func, unsigned int line)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ int err;
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
if (bdev_read_only(sb->s_bdev))
@@ -347,8 +349,62 @@ static void __save_error_info(struct super_block *sb, const char *func,
ext4_update_tstamp(es, s_last_error_time);
strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
es->s_last_error_line = cpu_to_le32(line);
- if (es->s_last_error_errcode == 0)
- es->s_last_error_errcode = EXT4_ERR_EFSCORRUPTED;
+ es->s_last_error_ino = cpu_to_le32(ino);
+ es->s_last_error_block = cpu_to_le64(block);
+ switch (error) {
+ case EIO:
+ err = EXT4_ERR_EIO;
+ break;
+ case ENOMEM:
+ err = EXT4_ERR_ENOMEM;
+ break;
+ case EFSBADCRC:
+ err = EXT4_ERR_EFSBADCRC;
+ break;
+ case 0:
+ case EFSCORRUPTED:
+ err = EXT4_ERR_EFSCORRUPTED;
+ break;
+ case ENOSPC:
+ err = EXT4_ERR_ENOSPC;
+ break;
+ case ENOKEY:
+ err = EXT4_ERR_ENOKEY;
+ break;
+ case EROFS:
+ err = EXT4_ERR_EROFS;
+ break;
+ case EFBIG:
+ err = EXT4_ERR_EFBIG;
+ break;
+ case EEXIST:
+ err = EXT4_ERR_EEXIST;
+ break;
+ case ERANGE:
+ err = EXT4_ERR_ERANGE;
+ break;
+ case EOVERFLOW:
+ err = EXT4_ERR_EOVERFLOW;
+ break;
+ case EBUSY:
+ err = EXT4_ERR_EBUSY;
+ break;
+ case ENOTDIR:
+ err = EXT4_ERR_ENOTDIR;
+ break;
+ case ENOTEMPTY:
+ err = EXT4_ERR_ENOTEMPTY;
+ break;
+ case ESHUTDOWN:
+ err = EXT4_ERR_ESHUTDOWN;
+ break;
+ case EFAULT:
+ err = EXT4_ERR_EFAULT;
+ break;
+ default:
+ err = EXT4_ERR_UNKNOWN;
+ }
+ es->s_last_error_errcode = err;
if (!es->s_first_error_time) {
es->s_first_error_time = es->s_last_error_time;
es->s_first_error_time_hi = es->s_last_error_time_hi;
@@ -368,11 +424,13 @@ static void __save_error_info(struct super_block *sb, const char *func,
le32_add_cpu(&es->s_error_count, 1);
}
-static void save_error_info(struct super_block *sb, const char *func,
- unsigned int line)
+static void save_error_info(struct super_block *sb, int error,
+ __u32 ino, __u64 block,
+ const char *func, unsigned int line)
{
- __save_error_info(sb, func, line);
- ext4_commit_super(sb, 1);
+ __save_error_info(sb, error, ino, block, func, line);
+ if (!bdev_read_only(sb->s_bdev))
+ ext4_commit_super(sb, 1);
}
/*
@@ -477,7 +535,8 @@ static void ext4_handle_error(struct super_block *sb)
"EXT4-fs error")
void __ext4_error(struct super_block *sb, const char *function,
- unsigned int line, const char *fmt, ...)
+ unsigned int line, int error, __u64 block,
+ const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -495,24 +554,21 @@ void __ext4_error(struct super_block *sb, const char *function,
sb->s_id, function, line, current->comm, &vaf);
va_end(args);
}
- save_error_info(sb, function, line);
+ save_error_info(sb, error, 0, block, function, line);
ext4_handle_error(sb);
}
void __ext4_error_inode(struct inode *inode, const char *function,
- unsigned int line, ext4_fsblk_t block,
+ unsigned int line, ext4_fsblk_t block, int error,
const char *fmt, ...)
{
va_list args;
struct va_format vaf;
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return;
trace_ext4_error(inode->i_sb, function, line);
- es->s_last_error_ino = cpu_to_le32(inode->i_ino);
- es->s_last_error_block = cpu_to_le64(block);
if (ext4_error_ratelimit(inode->i_sb)) {
va_start(args, fmt);
vaf.fmt = fmt;
@@ -529,7 +585,8 @@ void __ext4_error_inode(struct inode *inode, const char *function,
current->comm, &vaf);
va_end(args);
}
- save_error_info(inode->i_sb, function, line);
+ save_error_info(inode->i_sb, error, inode->i_ino, block,
+ function, line);
ext4_handle_error(inode->i_sb);
}
@@ -548,7 +605,6 @@ void __ext4_error_file(struct file *file, const char *function,
trace_ext4_error(inode->i_sb, function, line);
es = EXT4_SB(inode->i_sb)->s_es;
- es->s_last_error_ino = cpu_to_le32(inode->i_ino);
if (ext4_error_ratelimit(inode->i_sb)) {
path = file_path(file, pathname, sizeof(pathname));
if (IS_ERR(path))
@@ -570,7 +626,8 @@ void __ext4_error_file(struct file *file, const char *function,
current->comm, path, &vaf);
va_end(args);
}
- save_error_info(inode->i_sb, function, line);
+ save_error_info(inode->i_sb, EFSCORRUPTED, inode->i_ino, block,
+ function, line);
ext4_handle_error(inode->i_sb);
}
@@ -614,66 +671,6 @@ const char *ext4_decode_error(struct super_block *sb, int errno,
return errstr;
}
-void ext4_set_errno(struct super_block *sb, int err)
-{
- if (err < 0)
- err = -err;
-
- switch (err) {
- case EIO:
- err = EXT4_ERR_EIO;
- break;
- case ENOMEM:
- err = EXT4_ERR_ENOMEM;
- break;
- case EFSBADCRC:
- err = EXT4_ERR_EFSBADCRC;
- break;
- case EFSCORRUPTED:
- err = EXT4_ERR_EFSCORRUPTED;
- break;
- case ENOSPC:
- err = EXT4_ERR_ENOSPC;
- break;
- case ENOKEY:
- err = EXT4_ERR_ENOKEY;
- break;
- case EROFS:
- err = EXT4_ERR_EROFS;
- break;
- case EFBIG:
- err = EXT4_ERR_EFBIG;
- break;
- case EEXIST:
- err = EXT4_ERR_EEXIST;
- break;
- case ERANGE:
- err = EXT4_ERR_ERANGE;
- break;
- case EOVERFLOW:
- err = EXT4_ERR_EOVERFLOW;
- break;
- case EBUSY:
- err = EXT4_ERR_EBUSY;
- break;
- case ENOTDIR:
- err = EXT4_ERR_ENOTDIR;
- break;
- case ENOTEMPTY:
- err = EXT4_ERR_ENOTEMPTY;
- break;
- case ESHUTDOWN:
- err = EXT4_ERR_ESHUTDOWN;
- break;
- case EFAULT:
- err = EXT4_ERR_EFAULT;
- break;
- default:
- err = EXT4_ERR_UNKNOWN;
- }
- EXT4_SB(sb)->s_es->s_last_error_errcode = err;
-}
-
/* __ext4_std_error decodes expected errors from journaling functions
* automatically and invokes the appropriate error response. */
@@ -698,8 +695,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
sb->s_id, function, line, errstr);
}
- ext4_set_errno(sb, -errno);
- save_error_info(sb, function, line);
+ save_error_info(sb, -errno, 0, 0, function, line);
ext4_handle_error(sb);
}
@@ -714,7 +710,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
*/
void __ext4_abort(struct super_block *sb, const char *function,
- unsigned int line, const char *fmt, ...)
+ unsigned int line, int error, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -722,7 +718,7 @@ void __ext4_abort(struct super_block *sb, const char *function,
if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
return;
- save_error_info(sb, function, line);
+ save_error_info(sb, error, 0, 0, function, line);
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
@@ -741,7 +737,6 @@ void __ext4_abort(struct super_block *sb, const char *function,
sb->s_flags |= SB_RDONLY;
if (EXT4_SB(sb)->s_journal)
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
- save_error_info(sb, function, line);
}
if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
if (EXT4_SB(sb)->s_journal &&
@@ -815,15 +810,12 @@ __acquires(bitlock)
{
struct va_format vaf;
va_list args;
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
return;
trace_ext4_error(sb, function, line);
- es->s_last_error_ino = cpu_to_le32(ino);
- es->s_last_error_block = cpu_to_le64(block);
- __save_error_info(sb, function, line);
+ __save_error_info(sb, EFSCORRUPTED, ino, block, function, line);
if (ext4_error_ratelimit(sb)) {
va_start(args, fmt);
@@ -927,7 +919,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
{
struct block_device *bdev;
- char b[BDEVNAME_SIZE];
bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
if (IS_ERR(bdev))
@@ -935,8 +926,9 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
return bdev;
fail:
- ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
- __bdevname(dev, b), PTR_ERR(bdev));
+ ext4_msg(sb, KERN_ERR,
+ "failed to open journal device unknown-block(%u,%u) %ld",
+ MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
return NULL;
}
@@ -1024,17 +1016,22 @@ static void ext4_put_super(struct super_block *sb)
destroy_workqueue(sbi->rsv_conversion_wq);
+ /*
+ * Unregister sysfs before destroying jbd2 journal.
+ * Since we could still access attr_journal_task attribute via sysfs
+ * path which could have sbi->s_journal->j_task as NULL
+ */
+ ext4_unregister_sysfs(sb);
+
if (sbi->s_journal) {
aborted = is_journal_aborted(sbi->s_journal);
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
if ((err < 0) && !aborted) {
- ext4_set_errno(sb, -err);
- ext4_abort(sb, "Couldn't clean up the journal");
+ ext4_abort(sb, -err, "Couldn't clean up the journal");
}
}
- ext4_unregister_sysfs(sb);
ext4_es_unregister_shrinker(sbi);
del_timer_sync(&sbi->s_err_report);
ext4_release_system_zone(sb);
@@ -2180,6 +2177,14 @@ static int parse_options(char *options, struct super_block *sb,
}
}
#endif
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ int blocksize =
+ BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
+ if (blocksize < PAGE_SIZE)
+ ext4_msg(sb, KERN_WARNING, "Warning: mounting with an "
+ "experimental mount option 'dioread_nolock' "
+ "for blocksize < PAGE_SIZE");
+ }
return 1;
}
@@ -3609,7 +3614,8 @@ int ext4_calculate_overhead(struct super_block *sb)
*/
if (sbi->s_journal && !sbi->journal_bdev)
overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
- else if (ext4_has_feature_journal(sb) && !sbi->s_journal) {
+ else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
+ /* j_inum for internal journal is non-zero */
j_inode = ext4_get_journal_inode(sb, j_inum);
if (j_inode) {
j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
@@ -3785,7 +3791,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_opt(sb, NO_UID32);
/* xattr user namespace & acls are now defaulted on */
set_opt(sb, XATTR_USER);
- set_opt(sb, DIOREAD_NOLOCK);
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
@@ -3835,6 +3840,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+
+ if (blocksize == PAGE_SIZE)
+ set_opt(sb, DIOREAD_NOLOCK);
+
if (blocksize < EXT4_MIN_BLOCK_SIZE ||
blocksize > EXT4_MAX_BLOCK_SIZE) {
ext4_msg(sb, KERN_ERR,
@@ -4157,7 +4166,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
sbi->s_inodes_per_group > blocksize * 8) {
ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
- sbi->s_blocks_per_group);
+ sbi->s_inodes_per_group);
goto failed_mount;
}
sbi->s_itb_per_group = sbi->s_inodes_per_group /
@@ -4286,9 +4295,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
EXT4_BLOCKS_PER_GROUP(sb) - 1);
do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
- ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
+ ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
"(block count %llu, first data block %u, "
- "blocks per group %lu)", sbi->s_groups_count,
+ "blocks per group %lu)", blocks_count,
ext4_blocks_count(es),
le32_to_cpu(es->s_first_data_block),
EXT4_BLOCKS_PER_GROUP(sb));
@@ -5433,7 +5442,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
- ext4_abort(sb, "Abort forced by user");
+ ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
@@ -5622,10 +5631,8 @@ static int ext4_statfs_project(struct super_block *sb,
return PTR_ERR(dquot);
spin_lock(&dquot->dq_dqb_lock);
- limit = dquot->dq_dqb.dqb_bsoftlimit;
- if (dquot->dq_dqb.dqb_bhardlimit &&
- (!limit || dquot->dq_dqb.dqb_bhardlimit < limit))
- limit = dquot->dq_dqb.dqb_bhardlimit;
+ limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
+ dquot->dq_dqb.dqb_bhardlimit);
limit >>= sb->s_blocksize_bits;
if (limit && buf->f_blocks > limit) {
@@ -5637,11 +5644,8 @@ static int ext4_statfs_project(struct super_block *sb,
(buf->f_blocks - curblock) : 0;
}
- limit = dquot->dq_dqb.dqb_isoftlimit;
- if (dquot->dq_dqb.dqb_ihardlimit &&
- (!limit || dquot->dq_dqb.dqb_ihardlimit < limit))
- limit = dquot->dq_dqb.dqb_ihardlimit;
-
+ limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
+ dquot->dq_dqb.dqb_ihardlimit);
if (limit && buf->f_files > limit) {
buf->f_files = limit;
buf->f_ffree =
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index d218ebdafa4a..04bfaf63752c 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -13,6 +13,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
+#include <linux/part_stat.h>
#include "ext4.h"
#include "ext4_jbd2.h"
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8cac7d95c3ad..21df43a25328 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -245,7 +245,7 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh,
bh->b_data);
errout:
if (error)
- __ext4_error_inode(inode, function, line, 0,
+ __ext4_error_inode(inode, function, line, 0, -error,
"corrupted xattr block %llu",
(unsigned long long) bh->b_blocknr);
else
@@ -269,7 +269,7 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header));
errout:
if (error)
- __ext4_error_inode(inode, function, line, 0,
+ __ext4_error_inode(inode, function, line, 0, -error,
"corrupted in-inode xattr");
return error;
}
@@ -2880,9 +2880,9 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
if (IS_ERR(bh)) {
error = PTR_ERR(bh);
if (error == -EIO) {
- ext4_set_errno(inode->i_sb, EIO);
- EXT4_ERROR_INODE(inode, "block %llu read error",
- EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE_ERR(inode, EIO,
+ "block %llu read error",
+ EXT4_I(inode)->i_file_acl);
}
bh = NULL;
goto cleanup;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index f39cad2abe2a..ffe21ac77f78 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -48,7 +48,7 @@ struct ext4_xattr_entry {
__le32 e_value_inum; /* inode in which the value is stored */
__le32 e_value_size; /* size of attribute value */
__le32 e_hash; /* hash value of name and value */
- char e_name[0]; /* attribute name */
+ char e_name[]; /* attribute name */
};
#define EXT4_XATTR_PAD_BITS 2
@@ -118,7 +118,7 @@ struct ext4_xattr_ibody_find {
struct ext4_xattr_inode_array {
unsigned int count; /* # of used items in the array */
- struct inode *inodes[0];
+ struct inode *inodes[];
};
extern const struct xattr_handler ext4_xattr_user_handler;
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index f0faada30f30..bb68d21e1f8c 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -118,3 +118,12 @@ config F2FS_FS_LZ4
default y
help
Support LZ4 compress algorithm, if unsure, say Y.
+
+config F2FS_FS_ZSTD
+ bool "ZSTD compression support"
+ depends on F2FS_FS_COMPRESSION
+ select ZSTD_COMPRESS
+ select ZSTD_DECOMPRESS
+ default y
+ help
+ Support ZSTD compress algorithm, if unsure, say Y.
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 44e84ac5c941..852890b72d6a 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -50,9 +50,6 @@ repeat:
return page;
}
-/*
- * We guarantee no failure on the returned page.
- */
static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
bool is_meta)
{
@@ -206,7 +203,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
}
/*
- * Readahead CP/NAT/SIT/SSA pages
+ * Readahead CP/NAT/SIT/SSA/POR pages
*/
int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync)
@@ -898,7 +895,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
return -ENOMEM;
/*
* Finding out valid cp block involves read both
- * sets( cp pack1 and cp pack 2)
+ * sets( cp pack 1 and cp pack 2)
*/
cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
@@ -1250,20 +1247,20 @@ static void unblock_operations(struct f2fs_sb_info *sbi)
f2fs_unlock_all(sbi);
}
-void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
+void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
{
DEFINE_WAIT(wait);
for (;;) {
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (!get_pages(sbi, F2FS_WB_CP_DATA))
+ if (!get_pages(sbi, type))
break;
if (unlikely(f2fs_cp_error(sbi)))
break;
- io_schedule_timeout(5*HZ);
+ io_schedule_timeout(DEFAULT_IO_TIMEOUT);
}
finish_wait(&sbi->cp_wait, &wait);
}
@@ -1301,10 +1298,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
else
__clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
- if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) ||
- is_sbi_flag_set(sbi, SBI_IS_RESIZEFS))
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
__set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+ if (is_sbi_flag_set(sbi, SBI_IS_RESIZEFS))
+ __set_ckpt_flags(ckpt, CP_RESIZEFS_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_RESIZEFS_FLAG);
+
if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
__set_ckpt_flags(ckpt, CP_DISABLED_FLAG);
else
@@ -1384,13 +1385,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Flush all the NAT/SIT pages */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
- f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
- !f2fs_cp_error(sbi));
- /*
- * modify checkpoint
- * version number is already updated
- */
+ /* start to update checkpoint, cp ver is already updated previously */
ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true));
ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
@@ -1493,11 +1489,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Here, we have one bio having CP pack except cp pack 2 page */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
- f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
- !f2fs_cp_error(sbi));
+ /* Wait for all dirty meta pages to be submitted for IO */
+ f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META);
/* wait for previous submitted meta pages writeback */
- f2fs_wait_on_all_pages_writeback(sbi);
+ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
/* flush all device cache */
err = f2fs_flush_device_cache(sbi);
@@ -1506,7 +1502,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* barrier and flush checkpoint cp pack 2 page if it can */
commit_checkpoint(sbi, ckpt, start_blk);
- f2fs_wait_on_all_pages_writeback(sbi);
+ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
/*
* invalidate intermediate page cache borrowed from meta inode which are
@@ -1543,9 +1539,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0;
}
-/*
- * We guarantee that this checkpoint procedure will not fail.
- */
int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1613,7 +1606,6 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_flush_sit_entries(sbi, cpc);
- /* unlock all the fs_lock[] in do_checkpoint() */
err = do_checkpoint(sbi, cpc);
if (err)
f2fs_release_discard_addrs(sbi);
@@ -1626,7 +1618,7 @@ stop:
if (cpc->reason & CP_RECOVERY)
f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver);
- /* do checkpoint periodically */
+ /* update CP_TIME to trigger checkpoint periodically */
f2fs_update_time(sbi, CP_TIME);
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index d8a64be90a50..df7b2d15eacd 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -11,6 +11,7 @@
#include <linux/backing-dev.h>
#include <linux/lzo.h>
#include <linux/lz4.h>
+#include <linux/zstd.h>
#include "f2fs.h"
#include "node.h"
@@ -20,6 +21,8 @@ struct f2fs_compress_ops {
int (*init_compress_ctx)(struct compress_ctx *cc);
void (*destroy_compress_ctx)(struct compress_ctx *cc);
int (*compress_pages)(struct compress_ctx *cc);
+ int (*init_decompress_ctx)(struct decompress_io_ctx *dic);
+ void (*destroy_decompress_ctx)(struct decompress_io_ctx *dic);
int (*decompress_pages)(struct decompress_io_ctx *dic);
};
@@ -52,7 +55,7 @@ bool f2fs_is_compressed_page(struct page *page)
}
static void f2fs_set_compressed_page(struct page *page,
- struct inode *inode, pgoff_t index, void *data, refcount_t *r)
+ struct inode *inode, pgoff_t index, void *data)
{
SetPagePrivate(page);
set_page_private(page, (unsigned long)data);
@@ -60,8 +63,6 @@ static void f2fs_set_compressed_page(struct page *page,
/* i_crypto_info and iv index */
page->index = index;
page->mapping = inode->i_mapping;
- if (r)
- refcount_inc(r);
}
static void f2fs_put_compressed_page(struct page *page)
@@ -291,6 +292,165 @@ static const struct f2fs_compress_ops f2fs_lz4_ops = {
};
#endif
+#ifdef CONFIG_F2FS_FS_ZSTD
+#define F2FS_ZSTD_DEFAULT_CLEVEL 1
+
+static int zstd_init_compress_ctx(struct compress_ctx *cc)
+{
+ ZSTD_parameters params;
+ ZSTD_CStream *stream;
+ void *workspace;
+ unsigned int workspace_size;
+
+ params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0);
+ workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams);
+
+ workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
+ workspace_size, GFP_NOFS);
+ if (!workspace)
+ return -ENOMEM;
+
+ stream = ZSTD_initCStream(params, 0, workspace, workspace_size);
+ if (!stream) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initCStream failed\n",
+ KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+ __func__);
+ kvfree(workspace);
+ return -EIO;
+ }
+
+ cc->private = workspace;
+ cc->private2 = stream;
+
+ cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE;
+ return 0;
+}
+
+static void zstd_destroy_compress_ctx(struct compress_ctx *cc)
+{
+ kvfree(cc->private);
+ cc->private = NULL;
+ cc->private2 = NULL;
+}
+
+static int zstd_compress_pages(struct compress_ctx *cc)
+{
+ ZSTD_CStream *stream = cc->private2;
+ ZSTD_inBuffer inbuf;
+ ZSTD_outBuffer outbuf;
+ int src_size = cc->rlen;
+ int dst_size = src_size - PAGE_SIZE - COMPRESS_HEADER_SIZE;
+ int ret;
+
+ inbuf.pos = 0;
+ inbuf.src = cc->rbuf;
+ inbuf.size = src_size;
+
+ outbuf.pos = 0;
+ outbuf.dst = cc->cbuf->cdata;
+ outbuf.size = dst_size;
+
+ ret = ZSTD_compressStream(stream, &outbuf, &inbuf);
+ if (ZSTD_isError(ret)) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_compressStream failed, ret: %d\n",
+ KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+ __func__, ZSTD_getErrorCode(ret));
+ return -EIO;
+ }
+
+ ret = ZSTD_endStream(stream, &outbuf);
+ if (ZSTD_isError(ret)) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_endStream returned %d\n",
+ KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+ __func__, ZSTD_getErrorCode(ret));
+ return -EIO;
+ }
+
+ cc->clen = outbuf.pos;
+ return 0;
+}
+
+static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic)
+{
+ ZSTD_DStream *stream;
+ void *workspace;
+ unsigned int workspace_size;
+
+ workspace_size = ZSTD_DStreamWorkspaceBound(MAX_COMPRESS_WINDOW_SIZE);
+
+ workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode),
+ workspace_size, GFP_NOFS);
+ if (!workspace)
+ return -ENOMEM;
+
+ stream = ZSTD_initDStream(MAX_COMPRESS_WINDOW_SIZE,
+ workspace, workspace_size);
+ if (!stream) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initDStream failed\n",
+ KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
+ __func__);
+ kvfree(workspace);
+ return -EIO;
+ }
+
+ dic->private = workspace;
+ dic->private2 = stream;
+
+ return 0;
+}
+
+static void zstd_destroy_decompress_ctx(struct decompress_io_ctx *dic)
+{
+ kvfree(dic->private);
+ dic->private = NULL;
+ dic->private2 = NULL;
+}
+
+static int zstd_decompress_pages(struct decompress_io_ctx *dic)
+{
+ ZSTD_DStream *stream = dic->private2;
+ ZSTD_inBuffer inbuf;
+ ZSTD_outBuffer outbuf;
+ int ret;
+
+ inbuf.pos = 0;
+ inbuf.src = dic->cbuf->cdata;
+ inbuf.size = dic->clen;
+
+ outbuf.pos = 0;
+ outbuf.dst = dic->rbuf;
+ outbuf.size = dic->rlen;
+
+ ret = ZSTD_decompressStream(stream, &outbuf, &inbuf);
+ if (ZSTD_isError(ret)) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_compressStream failed, ret: %d\n",
+ KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
+ __func__, ZSTD_getErrorCode(ret));
+ return -EIO;
+ }
+
+ if (dic->rlen != outbuf.pos) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD invalid rlen:%zu, "
+ "expected:%lu\n", KERN_ERR,
+ F2FS_I_SB(dic->inode)->sb->s_id,
+ __func__, dic->rlen,
+ PAGE_SIZE << dic->log_cluster_size);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static const struct f2fs_compress_ops f2fs_zstd_ops = {
+ .init_compress_ctx = zstd_init_compress_ctx,
+ .destroy_compress_ctx = zstd_destroy_compress_ctx,
+ .compress_pages = zstd_compress_pages,
+ .init_decompress_ctx = zstd_init_decompress_ctx,
+ .destroy_decompress_ctx = zstd_destroy_decompress_ctx,
+ .decompress_pages = zstd_decompress_pages,
+};
+#endif
+
static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = {
#ifdef CONFIG_F2FS_FS_LZO
&f2fs_lzo_ops,
@@ -302,6 +462,11 @@ static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = {
#else
NULL,
#endif
+#ifdef CONFIG_F2FS_FS_ZSTD
+ &f2fs_zstd_ops,
+#else
+ NULL,
+#endif
};
bool f2fs_is_compress_backend_ready(struct inode *inode)
@@ -334,9 +499,11 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx,
cc->cluster_size, fi->i_compress_algorithm);
- ret = cops->init_compress_ctx(cc);
- if (ret)
- goto out;
+ if (cops->init_compress_ctx) {
+ ret = cops->init_compress_ctx(cc);
+ if (ret)
+ goto out;
+ }
max_len = COMPRESS_HEADER_SIZE + cc->clen;
cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE);
@@ -380,21 +547,27 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
}
cc->cbuf->clen = cpu_to_le32(cc->clen);
- cc->cbuf->chksum = cpu_to_le32(0);
for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++)
cc->cbuf->reserved[i] = cpu_to_le32(0);
+ nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE);
+
+ /* zero out any unused part of the last page */
+ memset(&cc->cbuf->cdata[cc->clen], 0,
+ (nr_cpages * PAGE_SIZE) - (cc->clen + COMPRESS_HEADER_SIZE));
+
vunmap(cc->cbuf);
vunmap(cc->rbuf);
- nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE);
-
for (i = nr_cpages; i < cc->nr_cpages; i++) {
f2fs_put_compressed_page(cc->cpages[i]);
cc->cpages[i] = NULL;
}
+ if (cops->destroy_compress_ctx)
+ cops->destroy_compress_ctx(cc);
+
cc->nr_cpages = nr_cpages;
trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
@@ -413,7 +586,8 @@ out_free_cpages:
kfree(cc->cpages);
cc->cpages = NULL;
destroy_compress_ctx:
- cops->destroy_compress_ctx(cc);
+ if (cops->destroy_compress_ctx)
+ cops->destroy_compress_ctx(cc);
out:
trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
cc->clen, ret);
@@ -447,10 +621,16 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
goto out_free_dic;
}
+ if (cops->init_decompress_ctx) {
+ ret = cops->init_decompress_ctx(dic);
+ if (ret)
+ goto out_free_dic;
+ }
+
dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL);
if (!dic->rbuf) {
ret = -ENOMEM;
- goto out_free_dic;
+ goto destroy_decompress_ctx;
}
dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO);
@@ -473,7 +653,12 @@ out_vunmap_cbuf:
vunmap(dic->cbuf);
out_vunmap_rbuf:
vunmap(dic->rbuf);
+destroy_decompress_ctx:
+ if (cops->destroy_decompress_ctx)
+ cops->destroy_decompress_ctx(dic);
out_free_dic:
+ if (verity)
+ refcount_set(&dic->ref, dic->nr_cpages);
if (!verity)
f2fs_decompress_end_io(dic->rpages, dic->cluster_size,
ret, false);
@@ -532,8 +717,7 @@ static bool __cluster_may_compress(struct compress_ctx *cc)
return true;
}
-/* return # of compressed block addresses */
-static int f2fs_compressed_blocks(struct compress_ctx *cc)
+static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr)
{
struct dnode_of_data dn;
int ret;
@@ -554,10 +738,15 @@ static int f2fs_compressed_blocks(struct compress_ctx *cc)
for (i = 1; i < cc->cluster_size; i++) {
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode,
+ blkaddr = data_blkaddr(dn.inode,
dn.node_page, dn.ofs_in_node + i);
- if (blkaddr != NULL_ADDR)
- ret++;
+ if (compr) {
+ if (__is_valid_data_blkaddr(blkaddr))
+ ret++;
+ } else {
+ if (blkaddr != NULL_ADDR)
+ ret++;
+ }
}
}
fail:
@@ -565,6 +754,18 @@ fail:
return ret;
}
+/* return # of compressed blocks in compressed cluster */
+static int f2fs_compressed_blocks(struct compress_ctx *cc)
+{
+ return __f2fs_cluster_blocks(cc, true);
+}
+
+/* return # of valid blocks in compressed cluster */
+static int f2fs_cluster_blocks(struct compress_ctx *cc, bool compr)
+{
+ return __f2fs_cluster_blocks(cc, false);
+}
+
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
{
struct compress_ctx cc = {
@@ -574,7 +775,7 @@ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
.cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size,
};
- return f2fs_compressed_blocks(&cc);
+ return f2fs_cluster_blocks(&cc, false);
}
static bool cluster_may_compress(struct compress_ctx *cc)
@@ -623,7 +824,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
bool prealloc;
retry:
- ret = f2fs_compressed_blocks(cc);
+ ret = f2fs_cluster_blocks(cc, false);
if (ret <= 0)
return ret;
@@ -653,7 +854,7 @@ retry:
struct bio *bio = NULL;
ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size,
- &last_block_in_bio, false);
+ &last_block_in_bio, false, true);
f2fs_destroy_compress_ctx(cc);
if (ret)
goto release_pages;
@@ -772,7 +973,6 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
.encrypted_page = NULL,
.compressed_page = NULL,
.submitted = false,
- .need_lock = LOCK_RETRY,
.io_type = io_type,
.io_wbc = wbc,
.encrypted = f2fs_encrypted_file(cc->inode),
@@ -785,16 +985,17 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
loff_t psize;
int i, err;
- set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
+ if (!f2fs_trylock_op(sbi))
+ return -EAGAIN;
- f2fs_lock_op(sbi);
+ set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
if (err)
goto out_unlock_op;
for (i = 0; i < cc->cluster_size; i++) {
- if (datablock_addr(dn.inode, dn.node_page,
+ if (data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i) == NULL_ADDR)
goto out_put_dnode;
}
@@ -813,7 +1014,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
cic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
cic->inode = inode;
- refcount_set(&cic->ref, 1);
+ refcount_set(&cic->ref, cc->nr_cpages);
cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
cc->log_cluster_size, GFP_NOFS);
if (!cic->rpages)
@@ -823,8 +1024,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
for (i = 0; i < cc->nr_cpages; i++) {
f2fs_set_compressed_page(cc->cpages[i], inode,
- cc->rpages[i + 1]->index,
- cic, i ? &cic->ref : NULL);
+ cc->rpages[i + 1]->index, cic);
fio.compressed_page = cc->cpages[i];
if (fio.encrypted) {
fio.page = cc->rpages[i + 1];
@@ -843,9 +1043,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) {
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode, dn.node_page,
- dn.ofs_in_node);
- fio.page = cic->rpages[i];
+ blkaddr = f2fs_data_blkaddr(&dn);
+ fio.page = cc->rpages[i];
fio.old_blkaddr = blkaddr;
/* cluster header */
@@ -895,10 +1094,10 @@ unlock_continue:
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
- down_write(&fi->i_sem);
+ spin_lock(&fi->i_size_lock);
if (fi->last_disk_size < psize)
fi->last_disk_size = psize;
- up_write(&fi->i_sem);
+ spin_unlock(&fi->i_size_lock);
f2fs_put_rpages(cc);
f2fs_destroy_compress_ctx(cc);
@@ -984,24 +1183,30 @@ retry_write:
unlock_page(cc->rpages[i]);
ret = 0;
} else if (ret == -EAGAIN) {
+ /*
+ * for quota file, just redirty left pages to
+ * avoid deadlock caused by cluster update race
+ * from foreground operation.
+ */
+ if (IS_NOQUOTA(cc->inode)) {
+ err = 0;
+ goto out_err;
+ }
ret = 0;
cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
lock_page(cc->rpages[i]);
clear_page_dirty_for_io(cc->rpages[i]);
goto retry_write;
}
err = ret;
- goto out_fail;
+ goto out_err;
}
*submitted += _submitted;
}
return 0;
-
-out_fail:
- /* TODO: revoke partially updated block addresses */
- BUG_ON(compr_blocks);
out_err:
for (++i; i < cc->cluster_size; i++) {
if (!cc->rpages[i])
@@ -1069,7 +1274,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
dic->inode = cc->inode;
- refcount_set(&dic->ref, 1);
+ refcount_set(&dic->ref, cc->nr_cpages);
dic->cluster_idx = cc->cluster_idx;
dic->cluster_size = cc->cluster_size;
dic->log_cluster_size = cc->log_cluster_size;
@@ -1093,8 +1298,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
goto out_free;
f2fs_set_compressed_page(page, cc->inode,
- start_idx + i + 1,
- dic, i ? &dic->ref : NULL);
+ start_idx + i + 1, dic);
dic->cpages[i] = page;
}
@@ -1104,20 +1308,16 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
goto out_free;
for (i = 0; i < dic->cluster_size; i++) {
- if (cc->rpages[i])
+ if (cc->rpages[i]) {
+ dic->tpages[i] = cc->rpages[i];
continue;
+ }
dic->tpages[i] = f2fs_grab_page();
if (!dic->tpages[i])
goto out_free;
}
- for (i = 0; i < dic->cluster_size; i++) {
- if (dic->tpages[i])
- continue;
- dic->tpages[i] = cc->rpages[i];
- }
-
return dic;
out_free:
@@ -1133,7 +1333,10 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
for (i = 0; i < dic->cluster_size; i++) {
if (dic->rpages[i])
continue;
- f2fs_put_page(dic->tpages[i], 1);
+ if (!dic->tpages[i])
+ continue;
+ unlock_page(dic->tpages[i]);
+ put_page(dic->tpages[i]);
}
kfree(dic->tpages);
}
@@ -1162,15 +1365,17 @@ void f2fs_decompress_end_io(struct page **rpages,
if (!rpage)
continue;
- if (err || PageError(rpage)) {
- ClearPageUptodate(rpage);
- ClearPageError(rpage);
- } else {
- if (!verity || fsverity_verify_page(rpage))
- SetPageUptodate(rpage);
- else
- SetPageError(rpage);
+ if (err || PageError(rpage))
+ goto clear_uptodate;
+
+ if (!verity || fsverity_verify_page(rpage)) {
+ SetPageUptodate(rpage);
+ goto unlock;
}
+clear_uptodate:
+ ClearPageUptodate(rpage);
+ ClearPageError(rpage);
+unlock:
unlock_page(rpage);
}
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index b27b72107911..cdf2f626bea7 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -54,17 +54,13 @@ static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask,
return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset);
}
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail)
+struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio)
{
- struct bio *bio;
-
- if (no_fail) {
+ if (noio) {
/* No failure on bio allocation */
- bio = __f2fs_bio_alloc(GFP_NOIO, npages);
- if (!bio)
- bio = __f2fs_bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
- return bio;
+ return __f2fs_bio_alloc(GFP_NOIO, npages);
}
+
if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO);
return NULL;
@@ -143,6 +139,8 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity)
f2fs_decompress_pages(bio, page, verity);
continue;
}
+ if (verity)
+ continue;
#endif
/* PG_error was set if any post_read step failed */
@@ -191,12 +189,38 @@ static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size)
static void f2fs_verify_bio(struct bio *bio)
{
- struct page *page = bio_first_page_all(bio);
- struct decompress_io_ctx *dic =
- (struct decompress_io_ctx *)page_private(page);
+ struct bio_vec *bv;
+ struct bvec_iter_all iter_all;
- f2fs_verify_pages(dic->rpages, dic->cluster_size);
- f2fs_free_dic(dic);
+ bio_for_each_segment_all(bv, bio, iter_all) {
+ struct page *page = bv->bv_page;
+ struct decompress_io_ctx *dic;
+
+ dic = (struct decompress_io_ctx *)page_private(page);
+
+ if (dic) {
+ if (refcount_dec_not_one(&dic->ref))
+ continue;
+ f2fs_verify_pages(dic->rpages,
+ dic->cluster_size);
+ f2fs_free_dic(dic);
+ continue;
+ }
+
+ if (bio->bi_status || PageError(page))
+ goto clear_uptodate;
+
+ if (fsverity_verify_page(page)) {
+ SetPageUptodate(page);
+ goto unlock;
+ }
+clear_uptodate:
+ ClearPageUptodate(page);
+ ClearPageError(page);
+unlock:
+ dec_page_count(F2FS_P_SB(page), __read_io_type(page));
+ unlock_page(page);
+ }
}
#endif
@@ -364,9 +388,6 @@ static void f2fs_write_end_io(struct bio *bio)
bio_put(bio);
}
-/*
- * Return true, if pre_bio's bdev is same as its target device.
- */
struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
block_t blk_addr, struct bio *bio)
{
@@ -403,6 +424,9 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
return 0;
}
+/*
+ * Return true, if pre_bio's bdev is same as its target device.
+ */
static bool __same_bdev(struct f2fs_sb_info *sbi,
block_t blk_addr, struct bio *bio)
{
@@ -410,9 +434,6 @@ static bool __same_bdev(struct f2fs_sb_info *sbi,
return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno;
}
-/*
- * Low-level block read/write IO operations.
- */
static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
{
struct f2fs_sb_info *sbi = fio->sbi;
@@ -445,7 +466,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
if (type != DATA && type != NODE)
goto submit_io;
- if (test_opt(sbi, LFS) && current->plug)
+ if (f2fs_lfs_mode(sbi) && current->plug)
blk_finish_plug(current->plug);
if (F2FS_IO_ALIGNED(sbi))
@@ -928,14 +949,15 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
unsigned nr_pages, unsigned op_flag,
- pgoff_t first_idx)
+ pgoff_t first_idx, bool for_write)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
struct bio_post_read_ctx *ctx;
unsigned int post_read_steps = 0;
- bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false);
+ bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES),
+ for_write);
if (!bio)
return ERR_PTR(-ENOMEM);
f2fs_target_device(sbi, blkaddr, bio);
@@ -970,12 +992,12 @@ static void f2fs_release_read_bio(struct bio *bio)
/* This can handle encryption stuffs */
static int f2fs_submit_page_read(struct inode *inode, struct page *page,
- block_t blkaddr)
+ block_t blkaddr, bool for_write)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
- bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index);
+ bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index, for_write);
if (IS_ERR(bio))
return PTR_ERR(bio);
@@ -1047,8 +1069,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
for (; count > 0; dn->ofs_in_node++) {
- block_t blkaddr = datablock_addr(dn->inode,
- dn->node_page, dn->ofs_in_node);
+ block_t blkaddr = f2fs_data_blkaddr(dn);
if (blkaddr == NULL_ADDR) {
dn->data_blkaddr = NEW_ADDR;
__set_data_blkaddr(dn);
@@ -1162,7 +1183,7 @@ got_it:
return page;
}
- err = f2fs_submit_page_read(inode, page, dn.data_blkaddr);
+ err = f2fs_submit_page_read(inode, page, dn.data_blkaddr, for_write);
if (err)
goto put_err;
return page;
@@ -1300,8 +1321,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
if (err)
return err;
- dn->data_blkaddr = datablock_addr(dn->inode,
- dn->node_page, dn->ofs_in_node);
+ dn->data_blkaddr = f2fs_data_blkaddr(dn);
if (dn->data_blkaddr != NULL_ADDR)
goto alloc;
@@ -1388,13 +1408,9 @@ void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
}
/*
- * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with
- * f2fs_map_blocks structure.
- * If original data blocks are allocated, then give them to blockdev.
- * Otherwise,
- * a. preallocate requested block addresses
- * b. do not use extent cache for better performance
- * c. give the block addresses to blockdev
+ * f2fs_map_blocks() tries to find or build mapping relationship which
+ * maps continuous logical blocks to physical blocks, and return such
+ * info via f2fs_map_blocks structure.
*/
int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int create, int flag)
@@ -1422,7 +1438,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
end = pgofs + maxblocks;
if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
- if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
+ if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO &&
map->m_may_create)
goto next_dnode;
@@ -1467,7 +1483,7 @@ next_dnode:
end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
next_block:
- blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
+ blkaddr = f2fs_data_blkaddr(&dn);
if (__is_valid_data_blkaddr(blkaddr) &&
!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
@@ -1477,7 +1493,7 @@ next_block:
if (__is_valid_data_blkaddr(blkaddr)) {
/* use out-place-update for driect IO under LFS mode */
- if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
+ if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO &&
map->m_may_create) {
err = __allocate_data_block(&dn, map->m_seg_type);
if (err)
@@ -1980,7 +1996,8 @@ submit_and_realloc:
}
if (bio == NULL) {
bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
- is_readahead ? REQ_RAHEAD : 0, page->index);
+ is_readahead ? REQ_RAHEAD : 0, page->index,
+ false);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
bio = NULL;
@@ -2015,7 +2032,7 @@ out:
#ifdef CONFIG_F2FS_FS_COMPRESSION
int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
- bool is_readahead)
+ bool is_readahead, bool for_write)
{
struct dnode_of_data dn;
struct inode *inode = cc->inode;
@@ -2031,7 +2048,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc));
- last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+ last_block_in_file = (f2fs_readpage_limit(inode) +
+ blocksize - 1) >> blkbits;
/* get rid of pages beyond EOF */
for (i = 0; i < cc->cluster_size; i++) {
@@ -2067,7 +2085,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
for (i = 1; i < cc->cluster_size; i++) {
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode, dn.node_page,
+ blkaddr = data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i);
if (!__is_valid_data_blkaddr(blkaddr))
@@ -2096,7 +2114,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
struct page *page = dic->cpages[i];
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode, dn.node_page,
+ blkaddr = data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i + 1);
if (bio && !page_is_mergeable(sbi, bio,
@@ -2109,7 +2127,7 @@ submit_and_realloc:
if (!bio) {
bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages,
is_readahead ? REQ_RAHEAD : 0,
- page->index);
+ page->index, for_write);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
bio = NULL;
@@ -2210,7 +2228,7 @@ int f2fs_mpage_readpages(struct address_space *mapping,
ret = f2fs_read_multi_pages(&cc, &bio,
max_nr_pages,
&last_block_in_bio,
- is_readahead);
+ is_readahead, false);
f2fs_destroy_compress_ctx(&cc);
if (ret)
goto set_error_page;
@@ -2253,7 +2271,7 @@ next_page:
ret = f2fs_read_multi_pages(&cc, &bio,
max_nr_pages,
&last_block_in_bio,
- is_readahead);
+ is_readahead, false);
f2fs_destroy_compress_ctx(&cc);
}
}
@@ -2326,7 +2344,7 @@ retry_encrypt:
/* flush pending IOs and wait for a while in the ENOMEM case */
if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
f2fs_flush_merged_writes(fio->sbi);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
gfp_flags |= __GFP_NOFAIL;
goto retry_encrypt;
}
@@ -2397,7 +2415,7 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- if (test_opt(sbi, LFS))
+ if (f2fs_lfs_mode(sbi))
return true;
if (S_ISDIR(inode->i_mode))
return true;
@@ -2647,10 +2665,10 @@ write:
if (err) {
file_set_keep_isize(inode);
} else {
- down_write(&F2FS_I(inode)->i_sem);
+ spin_lock(&F2FS_I(inode)->i_size_lock);
if (F2FS_I(inode)->last_disk_size < psize)
F2FS_I(inode)->last_disk_size = psize;
- up_write(&F2FS_I(inode)->i_sem);
+ spin_unlock(&F2FS_I(inode)->i_size_lock);
}
done:
@@ -2917,7 +2935,7 @@ result:
if (wbc->sync_mode == WB_SYNC_ALL) {
cond_resched();
congestion_wait(BLK_RW_ASYNC,
- HZ/50);
+ DEFAULT_IO_TIMEOUT);
goto retry_write;
}
goto next;
@@ -2973,15 +2991,17 @@ next:
static inline bool __should_serialize_io(struct inode *inode,
struct writeback_control *wbc)
{
+ /* to avoid deadlock in path of data flush */
+ if (F2FS_I(inode)->cp_task)
+ return false;
+
if (!S_ISREG(inode->i_mode))
return false;
- if (f2fs_compressed_file(inode))
- return true;
if (IS_NOQUOTA(inode))
return false;
- /* to avoid deadlock in path of data flush */
- if (F2FS_I(inode)->cp_task)
- return false;
+
+ if (f2fs_compressed_file(inode))
+ return true;
if (wbc->sync_mode != WB_SYNC_ALL)
return true;
if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks)
@@ -3283,7 +3303,7 @@ repeat:
err = -EFSCORRUPTED;
goto fail;
}
- err = f2fs_submit_page_read(inode, page, blkaddr);
+ err = f2fs_submit_page_read(inode, page, blkaddr, true);
if (err)
goto fail;
@@ -3464,7 +3484,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
iter, rw == WRITE ? get_data_block_dio_write :
get_data_block_dio, NULL, f2fs_dio_submit_bio,
- DIO_LOCKING | DIO_SKIP_HOLES);
+ rw == WRITE ? DIO_LOCKING | DIO_SKIP_HOLES :
+ DIO_SKIP_HOLES);
if (do_opu)
up_read(&fi->i_gc_rwsem[READ]);
@@ -3861,7 +3882,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi)
int __init f2fs_init_bio_entry_cache(void)
{
- bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab",
+ bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
sizeof(struct bio_entry));
if (!bio_entry_slab)
return -ENOMEM;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 6b89eae5e4ca..0dbcb0f9c019 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -301,6 +301,9 @@ static int stat_show(struct seq_file *s, void *v)
si->ssa_area_segs, si->main_area_segs);
seq_printf(s, "(OverProv:%d Resv:%d)]\n\n",
si->overp_segs, si->rsvd_segs);
+ seq_printf(s, "Current Time Sec: %llu / Mounted Time Sec: %llu\n\n",
+ ktime_get_boottime_seconds(),
+ SIT_I(si->sbi)->mounted_time);
if (test_opt(si->sbi, DISCARD))
seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n",
si->utilization, si->valid_count, si->discard_blks);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 27d0dd7a16d6..44bfc464df78 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -471,7 +471,6 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
struct page *dpage)
{
struct page *page;
- int dummy_encrypt = DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(dir));
int err;
if (is_inode_flag_set(inode, FI_NEW_INODE)) {
@@ -498,8 +497,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
if (err)
goto put_error;
- if ((IS_ENCRYPTED(dir) || dummy_encrypt) &&
- f2fs_may_encrypt(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = fscrypt_inherit_context(dir, inode, page, false);
if (err)
goto put_error;
@@ -850,12 +848,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
0);
set_page_dirty(page);
- dir->i_ctime = dir->i_mtime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir, false);
-
- if (inode)
- f2fs_drop_nlink(dir, inode);
-
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
f2fs_clear_page_cache_dirty_tag(page);
@@ -867,6 +859,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
f2fs_remove_dirty_inode(dir);
}
f2fs_put_page(page, 1);
+
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
+
+ if (inode)
+ f2fs_drop_nlink(dir, inode);
}
bool f2fs_empty_dir(struct inode *dir)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 5355be6b6755..ba470d5687fe 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -22,6 +22,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/quotaops.h>
+#include <linux/part_stat.h>
#include <crypto/hash.h>
#include <linux/fscrypt.h>
@@ -74,7 +75,6 @@ extern const char *f2fs_fault_name[FAULT_MAX];
/*
* For mount options
*/
-#define F2FS_MOUNT_BG_GC 0x00000001
#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002
#define F2FS_MOUNT_DISCARD 0x00000004
#define F2FS_MOUNT_NOHEAP 0x00000008
@@ -88,11 +88,8 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_NOBARRIER 0x00000800
#define F2FS_MOUNT_FASTBOOT 0x00001000
#define F2FS_MOUNT_EXTENT_CACHE 0x00002000
-#define F2FS_MOUNT_FORCE_FG_GC 0x00004000
#define F2FS_MOUNT_DATA_FLUSH 0x00008000
#define F2FS_MOUNT_FAULT_INJECTION 0x00010000
-#define F2FS_MOUNT_ADAPTIVE 0x00020000
-#define F2FS_MOUNT_LFS 0x00040000
#define F2FS_MOUNT_USRQUOTA 0x00080000
#define F2FS_MOUNT_GRPQUOTA 0x00100000
#define F2FS_MOUNT_PRJQUOTA 0x00200000
@@ -100,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000
#define F2FS_MOUNT_RESERVE_ROOT 0x01000000
#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000
+#define F2FS_MOUNT_NORECOVERY 0x04000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -138,6 +136,8 @@ struct f2fs_mount_info {
int whint_mode;
int alloc_mode; /* segment allocation policy */
int fsync_mode; /* fsync policy */
+ int fs_mode; /* fs mode: LFS or ADAPTIVE */
+ int bggc_mode; /* bggc mode: off, on or sync */
bool test_dummy_encryption; /* test dummy encryption */
block_t unusable_cap; /* Amount of space allowed to be
* unusable when disabling checkpoint
@@ -331,8 +331,8 @@ struct discard_policy {
bool io_aware; /* issue discard in idle time */
bool sync; /* submit discard with REQ_SYNC flag */
bool ordered; /* issue discard by lba order */
+ bool timeout; /* discard timeout for put_super */
unsigned int granularity; /* discard granularity */
- int timeout; /* discard timeout for put_super */
};
struct discard_cmd_control {
@@ -427,6 +427,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32)
#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15)
#define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64)
+#define F2FS_IOC_GET_COMPRESS_BLOCKS _IOR(F2FS_IOCTL_MAGIC, 17, __u64)
#define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL
#define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL
@@ -559,6 +560,9 @@ enum {
#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */
+/* congestion wait timeout value, default: 20ms */
+#define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20))
+
/* maximum retry quota flush count */
#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8
@@ -675,6 +679,44 @@ enum {
MAX_GC_FAILURE
};
+/* used for f2fs_inode_info->flags */
+enum {
+ FI_NEW_INODE, /* indicate newly allocated inode */
+ FI_DIRTY_INODE, /* indicate inode is dirty or not */
+ FI_AUTO_RECOVER, /* indicate inode is recoverable */
+ FI_DIRTY_DIR, /* indicate directory has dirty pages */
+ FI_INC_LINK, /* need to increment i_nlink */
+ FI_ACL_MODE, /* indicate acl mode */
+ FI_NO_ALLOC, /* should not allocate any blocks */
+ FI_FREE_NID, /* free allocated nide */
+ FI_NO_EXTENT, /* not to use the extent cache */
+ FI_INLINE_XATTR, /* used for inline xattr */
+ FI_INLINE_DATA, /* used for inline data*/
+ FI_INLINE_DENTRY, /* used for inline dentry */
+ FI_APPEND_WRITE, /* inode has appended data */
+ FI_UPDATE_WRITE, /* inode has in-place-update data */
+ FI_NEED_IPU, /* used for ipu per file */
+ FI_ATOMIC_FILE, /* indicate atomic file */
+ FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */
+ FI_VOLATILE_FILE, /* indicate volatile file */
+ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
+ FI_DROP_CACHE, /* drop dirty page cache */
+ FI_DATA_EXIST, /* indicate data exists */
+ FI_INLINE_DOTS, /* indicate inline dot dentries */
+ FI_DO_DEFRAG, /* indicate defragment is running */
+ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
+ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */
+ FI_HOT_DATA, /* indicate file is hot */
+ FI_EXTRA_ATTR, /* indicate file has extra attribute */
+ FI_PROJ_INHERIT, /* indicate file inherits projectid */
+ FI_PIN_FILE, /* indicate file should not be gced */
+ FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
+ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
+ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */
+ FI_MMAP_FILE, /* indicate file was mmapped */
+ FI_MAX, /* max flag, never be used */
+};
+
struct f2fs_inode_info {
struct inode vfs_inode; /* serve a vfs inode */
unsigned long i_flags; /* keep an inode flags for ioctl */
@@ -687,7 +729,7 @@ struct f2fs_inode_info {
umode_t i_acl_mode; /* keep file acl mode temporarily */
/* Use below internally in f2fs*/
- unsigned long flags; /* use to pass per-file flags */
+ unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */
struct rw_semaphore i_sem; /* protect fi info */
atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
@@ -696,6 +738,7 @@ struct f2fs_inode_info {
struct task_struct *cp_task; /* separate cp/wb IO stats*/
nid_t i_xattr_nid; /* node id that contains xattrs */
loff_t last_disk_size; /* lastly written file size */
+ spinlock_t i_size_lock; /* protect last_disk_size */
#ifdef CONFIG_QUOTA
struct dquot *i_dquot[MAXQUOTAS];
@@ -1172,6 +1215,20 @@ enum {
};
enum {
+ BGGC_MODE_ON, /* background gc is on */
+ BGGC_MODE_OFF, /* background gc is off */
+ BGGC_MODE_SYNC, /*
+ * background gc is on, migrating blocks
+ * like foreground gc
+ */
+};
+
+enum {
+ FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */
+ FS_MODE_LFS, /* use lfs allocation only */
+};
+
+enum {
WHINT_MODE_OFF, /* not pass down write hints */
WHINT_MODE_USER, /* try to pass down hints given by users */
WHINT_MODE_FS, /* pass down hints with F2FS policy */
@@ -1211,13 +1268,13 @@ enum fsync_mode {
enum compress_algorithm_type {
COMPRESS_LZO,
COMPRESS_LZ4,
+ COMPRESS_ZSTD,
COMPRESS_MAX,
};
-#define COMPRESS_DATA_RESERVED_SIZE 4
+#define COMPRESS_DATA_RESERVED_SIZE 5
struct compress_data {
__le32 clen; /* compressed data size */
- __le32 chksum; /* checksum of compressed data */
__le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */
u8 cdata[]; /* compressed data */
};
@@ -1241,6 +1298,7 @@ struct compress_ctx {
size_t rlen; /* valid data length in rbuf */
size_t clen; /* valid data length in cbuf */
void *private; /* payload buffer for specified compression algorithm */
+ void *private2; /* extra payload buffer */
};
/* compress context for write IO path */
@@ -1270,11 +1328,14 @@ struct decompress_io_ctx {
size_t clen; /* valid data length in cbuf */
refcount_t ref; /* referrence count of compressed page */
bool failed; /* indicate IO error during decompression */
+ void *private; /* payload buffer for specified decompression algorithm */
+ void *private2; /* extra payload buffer */
};
#define NULL_CLUSTER ((unsigned int)(~0))
#define MIN_COMPRESS_LOG_SIZE 2
#define MAX_COMPRESS_LOG_SIZE 8
+#define MAX_COMPRESS_WINDOW_SIZE ((PAGE_SIZE) << MAX_COMPRESS_LOG_SIZE)
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
@@ -1470,6 +1531,9 @@ struct f2fs_sb_info {
__u32 s_chksum_seed;
struct workqueue_struct *post_read_wq; /* post read workqueue */
+
+ struct kmem_cache *inline_xattr_slab; /* inline xattr entry */
+ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */
};
struct f2fs_private_dio {
@@ -2210,7 +2274,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
dquot_free_inode(inode);
} else {
if (unlikely(inode->i_blocks == 0)) {
- f2fs_warn(sbi, "Inconsistent i_blocks, ino:%lu, iblocks:%llu",
+ f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%lu, iblocks:%llu",
inode->i_ino,
(unsigned long long)inode->i_blocks);
set_sbi_flag(sbi, SBI_NEED_FSCK);
@@ -2378,7 +2442,7 @@ static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
}
static inline int f2fs_has_extra_attr(struct inode *inode);
-static inline block_t datablock_addr(struct inode *inode,
+static inline block_t data_blkaddr(struct inode *inode,
struct page *node_page, unsigned int offset)
{
struct f2fs_node *raw_node;
@@ -2388,9 +2452,9 @@ static inline block_t datablock_addr(struct inode *inode,
raw_node = F2FS_NODE(node_page);
- /* from GC path only */
if (is_inode) {
if (!inode)
+ /* from GC path only */
base = offset_in_addr(&raw_node->i);
else if (f2fs_has_extra_attr(inode))
base = get_extra_isize(inode);
@@ -2400,6 +2464,11 @@ static inline block_t datablock_addr(struct inode *inode,
return le32_to_cpu(addr_array[base + offset]);
}
+static inline block_t f2fs_data_blkaddr(struct dnode_of_data *dn)
+{
+ return data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node);
+}
+
static inline int f2fs_test_bit(unsigned int nr, char *addr)
{
int mask;
@@ -2497,43 +2566,6 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
return flags & F2FS_OTHER_FLMASK;
}
-/* used for f2fs_inode_info->flags */
-enum {
- FI_NEW_INODE, /* indicate newly allocated inode */
- FI_DIRTY_INODE, /* indicate inode is dirty or not */
- FI_AUTO_RECOVER, /* indicate inode is recoverable */
- FI_DIRTY_DIR, /* indicate directory has dirty pages */
- FI_INC_LINK, /* need to increment i_nlink */
- FI_ACL_MODE, /* indicate acl mode */
- FI_NO_ALLOC, /* should not allocate any blocks */
- FI_FREE_NID, /* free allocated nide */
- FI_NO_EXTENT, /* not to use the extent cache */
- FI_INLINE_XATTR, /* used for inline xattr */
- FI_INLINE_DATA, /* used for inline data*/
- FI_INLINE_DENTRY, /* used for inline dentry */
- FI_APPEND_WRITE, /* inode has appended data */
- FI_UPDATE_WRITE, /* inode has in-place-update data */
- FI_NEED_IPU, /* used for ipu per file */
- FI_ATOMIC_FILE, /* indicate atomic file */
- FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */
- FI_VOLATILE_FILE, /* indicate volatile file */
- FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
- FI_DROP_CACHE, /* drop dirty page cache */
- FI_DATA_EXIST, /* indicate data exists */
- FI_INLINE_DOTS, /* indicate inline dot dentries */
- FI_DO_DEFRAG, /* indicate defragment is running */
- FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
- FI_NO_PREALLOC, /* indicate skipped preallocated blocks */
- FI_HOT_DATA, /* indicate file is hot */
- FI_EXTRA_ATTR, /* indicate file has extra attribute */
- FI_PROJ_INHERIT, /* indicate file inherits projectid */
- FI_PIN_FILE, /* indicate file should not be gced */
- FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
- FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
- FI_COMPRESSED_FILE, /* indicate file's data can be compressed */
- FI_MMAP_FILE, /* indicate file was mmapped */
-};
-
static inline void __mark_inode_dirty_flag(struct inode *inode,
int flag, bool set)
{
@@ -2548,27 +2580,24 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
case FI_DATA_EXIST:
case FI_INLINE_DOTS:
case FI_PIN_FILE:
- case FI_COMPRESSED_FILE:
f2fs_mark_inode_dirty_sync(inode, true);
}
}
static inline void set_inode_flag(struct inode *inode, int flag)
{
- if (!test_bit(flag, &F2FS_I(inode)->flags))
- set_bit(flag, &F2FS_I(inode)->flags);
+ test_and_set_bit(flag, F2FS_I(inode)->flags);
__mark_inode_dirty_flag(inode, flag, true);
}
static inline int is_inode_flag_set(struct inode *inode, int flag)
{
- return test_bit(flag, &F2FS_I(inode)->flags);
+ return test_bit(flag, F2FS_I(inode)->flags);
}
static inline void clear_inode_flag(struct inode *inode, int flag)
{
- if (test_bit(flag, &F2FS_I(inode)->flags))
- clear_bit(flag, &F2FS_I(inode)->flags);
+ test_and_clear_bit(flag, F2FS_I(inode)->flags);
__mark_inode_dirty_flag(inode, flag, false);
}
@@ -2659,19 +2688,19 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
struct f2fs_inode_info *fi = F2FS_I(inode);
if (ri->i_inline & F2FS_INLINE_XATTR)
- set_bit(FI_INLINE_XATTR, &fi->flags);
+ set_bit(FI_INLINE_XATTR, fi->flags);
if (ri->i_inline & F2FS_INLINE_DATA)
- set_bit(FI_INLINE_DATA, &fi->flags);
+ set_bit(FI_INLINE_DATA, fi->flags);
if (ri->i_inline & F2FS_INLINE_DENTRY)
- set_bit(FI_INLINE_DENTRY, &fi->flags);
+ set_bit(FI_INLINE_DENTRY, fi->flags);
if (ri->i_inline & F2FS_DATA_EXIST)
- set_bit(FI_DATA_EXIST, &fi->flags);
+ set_bit(FI_DATA_EXIST, fi->flags);
if (ri->i_inline & F2FS_INLINE_DOTS)
- set_bit(FI_INLINE_DOTS, &fi->flags);
+ set_bit(FI_INLINE_DOTS, fi->flags);
if (ri->i_inline & F2FS_EXTRA_ATTR)
- set_bit(FI_EXTRA_ATTR, &fi->flags);
+ set_bit(FI_EXTRA_ATTR, fi->flags);
if (ri->i_inline & F2FS_PIN_FILE)
- set_bit(FI_PIN_FILE, &fi->flags);
+ set_bit(FI_PIN_FILE, fi->flags);
}
static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
@@ -2856,9 +2885,9 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
if (!f2fs_is_time_consistent(inode))
return false;
- down_read(&F2FS_I(inode)->i_sem);
+ spin_lock(&F2FS_I(inode)->i_size_lock);
ret = F2FS_I(inode)->last_disk_size == i_size_read(inode);
- up_read(&F2FS_I(inode)->i_sem);
+ spin_unlock(&F2FS_I(inode)->i_size_lock);
return ret;
}
@@ -3212,7 +3241,7 @@ void f2fs_drop_inmem_pages(struct inode *inode);
void f2fs_drop_inmem_page(struct inode *inode, struct page *page);
int f2fs_commit_inmem_pages(struct inode *inode);
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need);
-void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi);
+void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg);
int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino);
int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi);
int f2fs_flush_device_cache(struct f2fs_sb_info *sbi);
@@ -3308,7 +3337,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi);
void f2fs_update_dirty_page(struct inode *inode, struct page *page);
void f2fs_remove_dirty_inode(struct inode *inode);
int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type);
-void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi);
+void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type);
int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc);
void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi);
int __init f2fs_create_checkpoint_caches(void);
@@ -3319,7 +3348,7 @@ void f2fs_destroy_checkpoint_caches(void);
*/
int __init f2fs_init_bioset(void);
void f2fs_destroy_bioset(void);
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail);
+struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio);
int f2fs_init_bio_entry_cache(void);
void f2fs_destroy_bio_entry_cache(void);
void f2fs_submit_bio(struct f2fs_sb_info *sbi,
@@ -3775,7 +3804,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index);
int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
- bool is_readahead);
+ bool is_readahead, bool for_write);
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
void f2fs_free_dic(struct decompress_io_ctx *dic);
void f2fs_decompress_end_io(struct page **rpages,
@@ -3812,6 +3841,7 @@ static inline void set_compress_context(struct inode *inode)
F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
set_inode_flag(inode, FI_COMPRESSED_FILE);
stat_inc_compr_inode(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline u64 f2fs_disable_compressed_file(struct inode *inode)
@@ -3820,12 +3850,17 @@ static inline u64 f2fs_disable_compressed_file(struct inode *inode)
if (!f2fs_compressed_file(inode))
return 0;
- if (fi->i_compr_blocks)
- return fi->i_compr_blocks;
+ if (S_ISREG(inode->i_mode)) {
+ if (get_dirty_pages(inode))
+ return 1;
+ if (fi->i_compr_blocks)
+ return fi->i_compr_blocks;
+ }
fi->i_flags &= ~F2FS_COMPR_FL;
- clear_inode_flag(inode, FI_COMPRESSED_FILE);
stat_dec_compr_inode(inode);
+ clear_inode_flag(inode, FI_COMPRESSED_FILE);
+ f2fs_mark_inode_dirty_sync(inode, true);
return 0;
}
@@ -3902,31 +3937,25 @@ static inline bool f2fs_hw_is_readonly(struct f2fs_sb_info *sbi)
return false;
}
-
-static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
+static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
{
- clear_opt(sbi, ADAPTIVE);
- clear_opt(sbi, LFS);
-
- switch (mt) {
- case F2FS_MOUNT_ADAPTIVE:
- set_opt(sbi, ADAPTIVE);
- break;
- case F2FS_MOUNT_LFS:
- set_opt(sbi, LFS);
- break;
- }
+ return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
}
-static inline bool f2fs_may_encrypt(struct inode *inode)
+static inline bool f2fs_may_encrypt(struct inode *dir, struct inode *inode)
{
#ifdef CONFIG_FS_ENCRYPTION
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
umode_t mode = inode->i_mode;
- return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
-#else
- return false;
+ /*
+ * If the directory encrypted or dummy encryption enabled,
+ * then we should encrypt the inode.
+ */
+ if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi))
+ return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
#endif
+ return false;
}
static inline bool f2fs_may_compress(struct inode *inode)
@@ -3970,7 +3999,7 @@ static inline int allow_outplace_dio(struct inode *inode,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int rw = iov_iter_rw(iter);
- return (test_opt(sbi, LFS) && (rw == WRITE) &&
+ return (f2fs_lfs_mode(sbi) && (rw == WRITE) &&
!block_unaligned_IO(inode, iocb, iter));
}
@@ -3992,7 +4021,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
*/
if (f2fs_sb_has_blkzoned(sbi))
return true;
- if (test_opt(sbi, LFS) && (rw == WRITE)) {
+ if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
if (block_unaligned_IO(inode, iocb, iter))
return true;
if (F2FS_IO_ALIGNED(sbi))
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 0d4da644df3b..6ab8f621a3c5 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -106,13 +106,20 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
err = f2fs_get_block(&dn, page->index);
f2fs_put_dnode(&dn);
__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
- if (err) {
- unlock_page(page);
- goto out_sem;
- }
}
- /* fill the page */
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (!need_alloc) {
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
+ f2fs_put_dnode(&dn);
+ }
+#endif
+ if (err) {
+ unlock_page(page);
+ goto out_sem;
+ }
+
f2fs_wait_on_page_writeback(page, DATA, false, true);
/* wait for GCed page writeback via META_MAPPING */
@@ -448,8 +455,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode,
- dn.node_page, dn.ofs_in_node);
+ blkaddr = f2fs_data_blkaddr(&dn);
if (__is_valid_data_blkaddr(blkaddr) &&
!f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
@@ -793,6 +799,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
}
flags = fi->i_flags;
+ if (flags & F2FS_COMPR_FL)
+ stat->attributes |= STATX_ATTR_COMPRESSED;
if (flags & F2FS_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
if (IS_ENCRYPTED(inode))
@@ -804,7 +812,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
if (IS_VERITY(inode))
stat->attributes |= STATX_ATTR_VERITY;
- stat->attributes_mask |= (STATX_ATTR_APPEND |
+ stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
+ STATX_ATTR_APPEND |
STATX_ATTR_ENCRYPTED |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP |
@@ -929,10 +938,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
return err;
- down_write(&F2FS_I(inode)->i_sem);
+ spin_lock(&F2FS_I(inode)->i_size_lock);
inode->i_mtime = inode->i_ctime = current_time(inode);
F2FS_I(inode)->last_disk_size = i_size_read(inode);
- up_write(&F2FS_I(inode)->i_sem);
+ spin_unlock(&F2FS_I(inode)->i_size_lock);
}
__setattr_copy(inode, attr);
@@ -1109,8 +1118,7 @@ next_dnode:
done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) -
dn.ofs_in_node, len);
for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) {
- *blkaddr = datablock_addr(dn.inode,
- dn.node_page, dn.ofs_in_node);
+ *blkaddr = f2fs_data_blkaddr(&dn);
if (__is_valid_data_blkaddr(*blkaddr) &&
!f2fs_is_valid_blkaddr(sbi, *blkaddr,
@@ -1121,7 +1129,7 @@ next_dnode:
if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) {
- if (test_opt(sbi, LFS)) {
+ if (f2fs_lfs_mode(sbi)) {
f2fs_put_dnode(&dn);
return -EOPNOTSUPP;
}
@@ -1199,8 +1207,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
ADDRS_PER_PAGE(dn.node_page, dst_inode) -
dn.ofs_in_node, len - i);
do {
- dn.data_blkaddr = datablock_addr(dn.inode,
- dn.node_page, dn.ofs_in_node);
+ dn.data_blkaddr = f2fs_data_blkaddr(&dn);
f2fs_truncate_data_blocks_range(&dn, 1);
if (do_replace[i]) {
@@ -1376,8 +1383,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
int ret;
for (; index < end; index++, dn->ofs_in_node++) {
- if (datablock_addr(dn->inode, dn->node_page,
- dn->ofs_in_node) == NULL_ADDR)
+ if (f2fs_data_blkaddr(dn) == NULL_ADDR)
count++;
}
@@ -1388,8 +1394,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
dn->ofs_in_node = ofs_in_node;
for (index = start; index < end; index++, dn->ofs_in_node++) {
- dn->data_blkaddr = datablock_addr(dn->inode,
- dn->node_page, dn->ofs_in_node);
+ dn->data_blkaddr = f2fs_data_blkaddr(dn);
/*
* f2fs_reserve_new_blocks will not guarantee entire block
* allocation.
@@ -1787,12 +1792,15 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id)
static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
+ u32 masked_flags = fi->i_flags & mask;
+
+ f2fs_bug_on(F2FS_I_SB(inode), (iflags & ~mask));
/* Is it quota file? Do not allow user to mess with it */
if (IS_NOQUOTA(inode))
return -EPERM;
- if ((iflags ^ fi->i_flags) & F2FS_CASEFOLD_FL) {
+ if ((iflags ^ masked_flags) & F2FS_CASEFOLD_FL) {
if (!f2fs_sb_has_casefold(F2FS_I_SB(inode)))
return -EOPNOTSUPP;
if (!f2fs_empty_dir(inode))
@@ -1806,27 +1814,22 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
return -EINVAL;
}
- if ((iflags ^ fi->i_flags) & F2FS_COMPR_FL) {
- if (S_ISREG(inode->i_mode) &&
- (fi->i_flags & F2FS_COMPR_FL || i_size_read(inode) ||
- F2FS_HAS_BLOCKS(inode)))
- return -EINVAL;
+ if ((iflags ^ masked_flags) & F2FS_COMPR_FL) {
+ if (masked_flags & F2FS_COMPR_FL) {
+ if (f2fs_disable_compressed_file(inode))
+ return -EINVAL;
+ }
if (iflags & F2FS_NOCOMP_FL)
return -EINVAL;
if (iflags & F2FS_COMPR_FL) {
- int err = f2fs_convert_inline_inode(inode);
-
- if (err)
- return err;
-
if (!f2fs_may_compress(inode))
return -EINVAL;
set_compress_context(inode);
}
}
- if ((iflags ^ fi->i_flags) & F2FS_NOCOMP_FL) {
- if (fi->i_flags & F2FS_COMPR_FL)
+ if ((iflags ^ masked_flags) & F2FS_NOCOMP_FL) {
+ if (masked_flags & F2FS_COMPR_FL)
return -EINVAL;
}
@@ -2423,6 +2426,14 @@ static int f2fs_ioc_get_encryption_key_status(struct file *filp,
return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
}
+static int f2fs_ioc_get_encryption_nonce(struct file *filp, unsigned long arg)
+{
+ if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
+ return -EOPNOTSUPP;
+
+ return fscrypt_ioctl_get_nonce(filp, (void __user *)arg);
+}
+
static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -3393,6 +3404,21 @@ out:
return err;
}
+static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ __u64 blocks;
+
+ if (!f2fs_sb_has_compression(F2FS_I_SB(inode)))
+ return -EOPNOTSUPP;
+
+ if (!f2fs_compressed_file(inode))
+ return -EINVAL;
+
+ blocks = F2FS_I(inode)->i_compr_blocks;
+ return put_user(blocks, (u64 __user *)arg);
+}
+
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
@@ -3437,6 +3463,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_remove_encryption_key_all_users(filp, arg);
case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
return f2fs_ioc_get_encryption_key_status(filp, arg);
+ case FS_IOC_GET_ENCRYPTION_NONCE:
+ return f2fs_ioc_get_encryption_nonce(filp, arg);
case F2FS_IOC_GARBAGE_COLLECT:
return f2fs_ioc_gc(filp, arg);
case F2FS_IOC_GARBAGE_COLLECT_RANGE:
@@ -3471,6 +3499,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_get_volume_name(filp, arg);
case F2FS_IOC_SET_VOLUME_NAME:
return f2fs_set_volume_name(filp, arg);
+ case F2FS_IOC_GET_COMPRESS_BLOCKS:
+ return f2fs_get_compress_blocks(filp, arg);
default:
return -ENOTTY;
}
@@ -3498,8 +3528,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
- if (!f2fs_is_compress_backend_ready(inode))
- return -EOPNOTSUPP;
+ if (!f2fs_is_compress_backend_ready(inode)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock(inode)) {
@@ -3611,6 +3643,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_REMOVE_ENCRYPTION_KEY:
case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ case FS_IOC_GET_ENCRYPTION_NONCE:
case F2FS_IOC_GARBAGE_COLLECT:
case F2FS_IOC_GARBAGE_COLLECT_RANGE:
case F2FS_IOC_WRITE_CHECKPOINT:
@@ -3628,6 +3661,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_MEASURE_VERITY:
case F2FS_IOC_GET_VOLUME_NAME:
case F2FS_IOC_SET_VOLUME_NAME:
+ case F2FS_IOC_GET_COMPRESS_BLOCKS:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index db8725d473b5..26248c8936db 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -31,6 +31,8 @@ static int gc_thread_func(void *data)
set_freezable();
do {
+ bool sync_mode;
+
wait_event_interruptible_timeout(*wq,
kthread_should_stop() || freezing(current) ||
gc_th->gc_wake,
@@ -101,15 +103,17 @@ static int gc_thread_func(void *data)
do_gc:
stat_inc_bggc_count(sbi->stat_info);
+ sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC;
+
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
+ if (f2fs_gc(sbi, sync_mode, true, NULL_SEGNO))
wait_ms = gc_th->no_gc_sleep_time;
trace_f2fs_background_gc(sbi->sb, wait_ms,
prefree_segments(sbi), free_segments(sbi));
/* balancing f2fs's metadata periodically */
- f2fs_balance_fs_bg(sbi);
+ f2fs_balance_fs_bg(sbi, true);
next:
sb_end_write(sbi->sb);
@@ -192,7 +196,10 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
p->ofs_unit = sbi->segs_per_sec;
}
- /* we need to check every dirty segments in the FG_GC case */
+ /*
+ * adjust candidates range, should select all dirty segments for
+ * foreground GC and urgent GC cases.
+ */
if (gc_type != FG_GC &&
(sbi->gc_mode != GC_URGENT) &&
p->max_search > sbi->max_victim_search)
@@ -634,7 +641,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
}
*nofs = ofs_of_node(node_page);
- source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
+ source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
f2fs_put_page(node_page, 1);
if (source_blkaddr != blkaddr) {
@@ -762,7 +769,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
struct page *page, *mpage;
block_t newaddr;
int err = 0;
- bool lfs_mode = test_opt(fio.sbi, LFS);
+ bool lfs_mode = f2fs_lfs_mode(fio.sbi);
/* do not read out */
page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
@@ -970,7 +977,8 @@ retry:
if (err) {
clear_cold_data(page);
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
goto retry;
}
if (is_dirty)
@@ -1018,8 +1026,8 @@ next_step:
* race condition along with SSR block allocation.
*/
if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
- get_valid_blocks(sbi, segno, false) ==
- sbi->blocks_per_seg)
+ get_valid_blocks(sbi, segno, true) ==
+ BLKS_PER_SEC(sbi))
return submitted;
if (check_valid_map(sbi, segno, off) == 0)
@@ -1203,7 +1211,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
if (get_valid_blocks(sbi, segno, false) == 0)
goto freed;
- if (__is_large_section(sbi) &&
+ if (gc_type == BG_GC && __is_large_section(sbi) &&
migrated >= sbi->migration_granularity)
goto skip;
if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))
@@ -1233,12 +1241,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
segno, gc_type);
stat_inc_seg_count(sbi, type, gc_type);
+ migrated++;
freed:
if (gc_type == FG_GC &&
get_valid_blocks(sbi, segno, false) == 0)
seg_freed++;
- migrated++;
if (__is_large_section(sbi) && segno + 1 < end_segno)
sbi->next_victim_seg[gc_type] = segno + 1;
@@ -1434,12 +1442,19 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start,
static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
{
struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi);
- int section_count = le32_to_cpu(raw_sb->section_count);
- int segment_count = le32_to_cpu(raw_sb->segment_count);
- int segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
- long long block_count = le64_to_cpu(raw_sb->block_count);
+ int section_count;
+ int segment_count;
+ int segment_count_main;
+ long long block_count;
int segs = secs * sbi->segs_per_sec;
+ down_write(&sbi->sb_lock);
+
+ section_count = le32_to_cpu(raw_sb->section_count);
+ segment_count = le32_to_cpu(raw_sb->segment_count);
+ segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
+ block_count = le64_to_cpu(raw_sb->block_count);
+
raw_sb->section_count = cpu_to_le32(section_count + secs);
raw_sb->segment_count = cpu_to_le32(segment_count + segs);
raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
@@ -1453,6 +1468,8 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
raw_sb->devs[last_dev].total_segments =
cpu_to_le32(dev_segs + segs);
}
+
+ up_write(&sbi->sb_lock);
}
static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
@@ -1570,11 +1587,17 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
goto out;
}
+ mutex_lock(&sbi->cp_mutex);
update_fs_metadata(sbi, -secs);
clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
+ mutex_unlock(&sbi->cp_mutex);
+
err = f2fs_sync_fs(sbi->sb, 1);
if (err) {
+ mutex_lock(&sbi->cp_mutex);
update_fs_metadata(sbi, secs);
+ mutex_unlock(&sbi->cp_mutex);
update_sb_metadata(sbi, secs);
f2fs_commit_super(sbi, false);
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 78c3f1d70f1d..44582a4db513 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -291,13 +291,30 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
fi->i_flags & F2FS_COMPR_FL &&
F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
i_log_cluster_size)) {
- if (ri->i_compress_algorithm >= COMPRESS_MAX)
+ if (ri->i_compress_algorithm >= COMPRESS_MAX) {
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported "
+ "compress algorithm: %u, run fsck to fix",
+ __func__, inode->i_ino,
+ ri->i_compress_algorithm);
return false;
- if (le64_to_cpu(ri->i_compr_blocks) > inode->i_blocks)
+ }
+ if (le64_to_cpu(ri->i_compr_blocks) >
+ SECTOR_TO_BLOCK(inode->i_blocks)) {
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent "
+ "i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix",
+ __func__, inode->i_ino,
+ le64_to_cpu(ri->i_compr_blocks),
+ SECTOR_TO_BLOCK(inode->i_blocks));
return false;
+ }
if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
- ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE)
+ ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) {
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported "
+ "log cluster size: %u, run fsck to fix",
+ __func__, inode->i_ino,
+ ri->i_log_cluster_size);
return false;
+ }
}
return true;
@@ -345,7 +362,7 @@ static int do_read_inode(struct inode *inode)
fi->i_flags = le32_to_cpu(ri->i_flags);
if (S_ISREG(inode->i_mode))
fi->i_flags &= ~F2FS_PROJINHERIT_FL;
- fi->flags = 0;
+ bitmap_zero(fi->flags, FI_MAX);
fi->i_advise = ri->i_advise;
fi->i_pino = le32_to_cpu(ri->i_pino);
fi->i_dir_level = ri->i_dir_level;
@@ -518,7 +535,7 @@ retry:
inode = f2fs_iget(sb, ino);
if (IS_ERR(inode)) {
if (PTR_ERR(inode) == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
goto retry;
}
}
@@ -759,7 +776,7 @@ no_delete:
else
f2fs_inode_synced(inode);
- /* ino == 0, if f2fs_new_inode() was failed t*/
+ /* for the case f2fs_new_inode() was failed, .i_ino is zero, skip it */
if (inode->i_ino)
invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino,
inode->i_ino);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 2aa035422c0f..f54119da2217 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -75,9 +75,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
set_inode_flag(inode, FI_NEW_INODE);
- /* If the directory encrypted, then we should encrypt the inode. */
- if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
- f2fs_may_encrypt(inode))
+ if (f2fs_may_encrypt(dir, inode))
f2fs_set_encrypted_inode(inode);
if (f2fs_sb_has_extra_attr(sbi)) {
@@ -177,7 +175,7 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub)
}
/*
- * Set multimedia files as cold files for hot/cold data separation
+ * Set file's temperature for hot/cold data separation
*/
static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode,
const unsigned char *name)
@@ -876,12 +874,6 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) {
- int err = fscrypt_get_encryption_info(dir);
- if (err)
- return err;
- }
-
return __f2fs_tmpfile(dir, dentry, mode, NULL);
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 9d02cdcdbb07..ecbd6bd14a49 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -510,9 +510,6 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
return nr - nr_shrink;
}
-/*
- * This function always returns success
- */
int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
struct node_info *ni)
{
@@ -716,8 +713,7 @@ got:
/*
* Caller should call f2fs_put_dnode(dn).
* Also, it should grab and release a rwsem by calling f2fs_lock_op() and
- * f2fs_unlock_op() only if ro is not set RDONLY_NODE.
- * In the case of RDONLY_NODE, we don't need to care about mutex.
+ * f2fs_unlock_op() only if mode is set with ALLOC_NODE.
*/
int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
{
@@ -809,8 +805,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
dn->nid = nids[level];
dn->ofs_in_node = offset[level];
dn->node_page = npage[level];
- dn->data_blkaddr = datablock_addr(dn->inode,
- dn->node_page, dn->ofs_in_node);
+ dn->data_blkaddr = f2fs_data_blkaddr(dn);
return 0;
release_pages:
@@ -1188,8 +1183,9 @@ int f2fs_remove_inode_page(struct inode *inode)
}
if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) {
- f2fs_warn(F2FS_I_SB(inode), "Inconsistent i_blocks, ino:%lu, iblocks:%llu",
- inode->i_ino, (unsigned long long)inode->i_blocks);
+ f2fs_warn(F2FS_I_SB(inode),
+ "f2fs_remove_inode_page: inconsistent i_blocks, ino:%lu, iblocks:%llu",
+ inode->i_ino, (unsigned long long)inode->i_blocks);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
}
@@ -1562,15 +1558,16 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
if (atomic && !test_opt(sbi, NOBARRIER))
fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
- set_page_writeback(page);
- ClearPageError(page);
-
+ /* should add to global list before clearing PAGECACHE status */
if (f2fs_in_warm_node_list(sbi, page)) {
seq = f2fs_add_fsync_node_entry(sbi, page);
if (seq_id)
*seq_id = seq;
}
+ set_page_writeback(page);
+ ClearPageError(page);
+
fio.old_blkaddr = ni.blk_addr;
f2fs_do_write_node_page(nid, &fio);
set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
@@ -1979,7 +1976,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
goto skip_write;
/* balancing f2fs's metadata in background */
- f2fs_balance_fs_bg(sbi);
+ f2fs_balance_fs_bg(sbi, true);
/* collect a number of dirty node pages and write together */
if (wbc->sync_mode != WB_SYNC_ALL &&
@@ -2602,7 +2599,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
retry:
ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
if (!ipage) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
goto retry;
}
@@ -3193,22 +3190,22 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
int __init f2fs_create_node_manager_caches(void)
{
- nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
+ nat_entry_slab = f2fs_kmem_cache_create("f2fs_nat_entry",
sizeof(struct nat_entry));
if (!nat_entry_slab)
goto fail;
- free_nid_slab = f2fs_kmem_cache_create("free_nid",
+ free_nid_slab = f2fs_kmem_cache_create("f2fs_free_nid",
sizeof(struct free_nid));
if (!free_nid_slab)
goto destroy_nat_entry;
- nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
+ nat_entry_set_slab = f2fs_kmem_cache_create("f2fs_nat_entry_set",
sizeof(struct nat_entry_set));
if (!nat_entry_set_slab)
goto destroy_free_nid;
- fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry",
+ fsync_node_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_node_entry",
sizeof(struct fsync_node_entry));
if (!fsync_node_entry_slab)
goto destroy_nat_entry_set;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 763d5c0951d1..dd804c07eeb0 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -496,8 +496,7 @@ out:
return 0;
truncate_out:
- if (datablock_addr(tdn.inode, tdn.node_page,
- tdn.ofs_in_node) == blkaddr)
+ if (f2fs_data_blkaddr(&tdn) == blkaddr)
f2fs_truncate_data_blocks_range(&tdn, 1);
if (dn->inode->i_ino == nid && !dn->inode_page_locked)
unlock_page(dn->inode_page);
@@ -535,7 +534,7 @@ retry_dn:
err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
goto retry_dn;
}
goto out;
@@ -560,8 +559,8 @@ retry_dn:
for (; start < end; start++, dn.ofs_in_node++) {
block_t src, dest;
- src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
- dest = datablock_addr(dn.inode, page, dn.ofs_in_node);
+ src = f2fs_data_blkaddr(&dn);
+ dest = data_blkaddr(dn.inode, page, dn.ofs_in_node);
if (__is_valid_data_blkaddr(src) &&
!f2fs_is_valid_blkaddr(sbi, src, META_POR)) {
@@ -618,7 +617,8 @@ retry_prev:
err = check_index_in_prev_nodes(sbi, dest, &dn);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
goto retry_prev;
}
goto err;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index cf0eb002cfd4..b7a9421472a7 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -172,7 +172,7 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi)
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
- if (test_opt(sbi, LFS))
+ if (f2fs_lfs_mode(sbi))
return false;
if (sbi->gc_mode == GC_URGENT)
return true;
@@ -245,7 +245,8 @@ retry:
LOOKUP_NODE);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
cond_resched();
goto retry;
}
@@ -312,7 +313,7 @@ next:
skip:
iput(inode);
}
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
cond_resched();
if (gc_failure) {
if (++looped >= count)
@@ -415,7 +416,8 @@ retry:
err = f2fs_do_write_data_page(&fio);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
cond_resched();
goto retry;
}
@@ -494,7 +496,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
/* balance_fs_bg is able to be pending */
if (need && excess_cached_nats(sbi))
- f2fs_balance_fs_bg(sbi);
+ f2fs_balance_fs_bg(sbi, false);
if (!f2fs_is_checkpoint_ready(sbi))
return;
@@ -509,7 +511,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
}
}
-void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
+void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
{
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return;
@@ -538,7 +540,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
excess_dirty_nats(sbi) ||
excess_dirty_nodes(sbi) ||
f2fs_time_over(sbi, CP_TIME)) {
- if (test_opt(sbi, DATA_FLUSH)) {
+ if (test_opt(sbi, DATA_FLUSH) && from_bg) {
struct blk_plug plug;
mutex_lock(&sbi->flush_lock);
@@ -1078,7 +1080,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
dpolicy->io_aware_gran = MAX_PLIST_NUM;
- dpolicy->timeout = 0;
+ dpolicy->timeout = false;
if (discard_type == DPOLICY_BG) {
dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
@@ -1103,6 +1105,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->io_aware = false;
/* we need to issue all to keep CP_TRIMMED_FLAG */
dpolicy->granularity = 1;
+ dpolicy->timeout = true;
}
}
@@ -1471,12 +1474,12 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
int i, issued = 0;
bool io_interrupted = false;
- if (dpolicy->timeout != 0)
- f2fs_update_time(sbi, dpolicy->timeout);
+ if (dpolicy->timeout)
+ f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT);
for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
- if (dpolicy->timeout != 0 &&
- f2fs_time_over(sbi, dpolicy->timeout))
+ if (dpolicy->timeout &&
+ f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT))
break;
if (i + 1 < dpolicy->granularity)
@@ -1497,8 +1500,8 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
list_for_each_entry_safe(dc, tmp, pend_list, list) {
f2fs_bug_on(sbi, dc->state != D_PREP);
- if (dpolicy->timeout != 0 &&
- f2fs_time_over(sbi, dpolicy->timeout))
+ if (dpolicy->timeout &&
+ f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT))
break;
if (dpolicy->io_aware && i < dpolicy->io_aware_gran &&
@@ -1677,7 +1680,6 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
__init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
dcc->discard_granularity);
- dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT;
__issue_discard_cmd(sbi, &dpolicy);
dropped = __drop_discard_cmd(sbi);
@@ -1940,7 +1942,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
unsigned int start = 0, end = -1;
unsigned int secno, start_segno;
bool force = (cpc->reason & CP_DISCARD);
- bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);
+ bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi);
mutex_lock(&dirty_i->seglist_lock);
@@ -1972,7 +1974,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
(end - 1) <= cpc->trim_end)
continue;
- if (!test_opt(sbi, LFS) || !__is_large_section(sbi)) {
+ if (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi)) {
f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
(end - start) << sbi->log_blocks_per_seg);
continue;
@@ -2801,7 +2803,7 @@ next:
blk_finish_plug(&plug);
mutex_unlock(&dcc->cmd_lock);
trimmed += __wait_all_discard_cmd(sbi, NULL);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
goto next;
}
skip:
@@ -2830,7 +2832,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
struct discard_policy dpolicy;
unsigned long long trimmed = 0;
int err = 0;
- bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);
+ bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi);
if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
return -EINVAL;
@@ -3193,7 +3195,7 @@ static void update_device_state(struct f2fs_io_info *fio)
static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
{
int type = __get_segment_type(fio);
- bool keep_order = (test_opt(fio->sbi, LFS) && type == CURSEG_COLD_DATA);
+ bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA);
if (keep_order)
down_read(&fio->sbi->io_order_lock);
@@ -4071,7 +4073,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
sit_i->dirty_sentries = 0;
sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
- sit_i->mounted_time = ktime_get_real_seconds();
+ sit_i->mounted_time = ktime_get_boottime_seconds();
init_rwsem(&sit_i->sentry_lock);
return 0;
}
@@ -4678,7 +4680,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS)
sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS;
- if (!test_opt(sbi, LFS))
+ if (!f2fs_lfs_mode(sbi))
sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
@@ -4830,22 +4832,22 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi)
int __init f2fs_create_segment_manager_caches(void)
{
- discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
+ discard_entry_slab = f2fs_kmem_cache_create("f2fs_discard_entry",
sizeof(struct discard_entry));
if (!discard_entry_slab)
goto fail;
- discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd",
+ discard_cmd_slab = f2fs_kmem_cache_create("f2fs_discard_cmd",
sizeof(struct discard_cmd));
if (!discard_cmd_slab)
goto destroy_discard_entry;
- sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
+ sit_entry_set_slab = f2fs_kmem_cache_create("f2fs_sit_entry_set",
sizeof(struct sit_entry_set));
if (!sit_entry_set_slab)
goto destroy_discard_cmd;
- inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
+ inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry",
sizeof(struct inmem_pages));
if (!inmem_entry_slab)
goto destroy_sit_entry_set;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 459dc3901a57..7a83bd530812 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -756,7 +756,7 @@ static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi,
bool base_time)
{
struct sit_info *sit_i = SIT_I(sbi);
- time64_t diff, now = ktime_get_real_seconds();
+ time64_t diff, now = ktime_get_boottime_seconds();
if (now >= sit_i->mounted_time)
return sit_i->elapsed_time + now - sit_i->mounted_time;
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index a467aca29cfe..d66de5999a26 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -58,7 +58,7 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink,
/* count extent cache entries */
count += __count_extent_cache(sbi);
- /* shrink clean nat cache entries */
+ /* count clean nat cache entries */
count += __count_nat_entries(sbi);
/* count free nids cache entries */
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 65a7a432dfee..f2dfc21c6abb 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -24,6 +24,7 @@
#include <linux/sysfs.h>
#include <linux/quota.h>
#include <linux/unicode.h>
+#include <linux/part_stat.h>
#include "f2fs.h"
#include "node.h"
@@ -427,14 +428,11 @@ static int parse_options(struct super_block *sb, char *options)
if (!name)
return -ENOMEM;
if (strlen(name) == 2 && !strncmp(name, "on", 2)) {
- set_opt(sbi, BG_GC);
- clear_opt(sbi, FORCE_FG_GC);
+ F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
} else if (strlen(name) == 3 && !strncmp(name, "off", 3)) {
- clear_opt(sbi, BG_GC);
- clear_opt(sbi, FORCE_FG_GC);
+ F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF;
} else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) {
- set_opt(sbi, BG_GC);
- set_opt(sbi, FORCE_FG_GC);
+ F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC;
} else {
kvfree(name);
return -EINVAL;
@@ -446,7 +444,7 @@ static int parse_options(struct super_block *sb, char *options)
break;
case Opt_norecovery:
/* this option mounts f2fs with ro */
- set_opt(sbi, DISABLE_ROLL_FORWARD);
+ set_opt(sbi, NORECOVERY);
if (!f2fs_readonly(sb))
return -EINVAL;
break;
@@ -600,10 +598,10 @@ static int parse_options(struct super_block *sb, char *options)
kvfree(name);
return -EINVAL;
}
- set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
} else if (strlen(name) == 3 &&
!strncmp(name, "lfs", 3)) {
- set_opt_mode(sbi, F2FS_MOUNT_LFS);
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
} else {
kvfree(name);
return -EINVAL;
@@ -832,6 +830,10 @@ static int parse_options(struct super_block *sb, char *options)
!strcmp(name, "lz4")) {
F2FS_OPTION(sbi).compress_algorithm =
COMPRESS_LZ4;
+ } else if (strlen(name) == 4 &&
+ !strcmp(name, "zstd")) {
+ F2FS_OPTION(sbi).compress_algorithm =
+ COMPRESS_ZSTD;
} else {
kfree(name);
return -EINVAL;
@@ -904,7 +906,7 @@ static int parse_options(struct super_block *sb, char *options)
}
#endif
- if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) {
+ if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) {
f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO",
F2FS_IO_SIZE_KB(sbi));
return -EINVAL;
@@ -934,7 +936,7 @@ static int parse_options(struct super_block *sb, char *options)
}
}
- if (test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, LFS)) {
+ if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) {
f2fs_err(sbi, "LFS not compatible with checkpoint=disable\n");
return -EINVAL;
}
@@ -960,6 +962,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Initialize f2fs-specific inode info */
atomic_set(&fi->dirty_pages, 0);
init_rwsem(&fi->i_sem);
+ spin_lock_init(&fi->i_size_lock);
INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->gdirty_list);
INIT_LIST_HEAD(&fi->inmem_ilist);
@@ -1172,7 +1175,7 @@ static void f2fs_put_super(struct super_block *sb)
/* our cp_error case, we can wait for any writeback page */
f2fs_flush_merged_writes(sbi);
- f2fs_wait_on_all_pages_writeback(sbi);
+ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
f2fs_bug_on(sbi, sbi->fsync_node_num);
@@ -1204,6 +1207,7 @@ static void f2fs_put_super(struct super_block *sb)
kvfree(sbi->raw_super);
destroy_device_list(sbi);
+ f2fs_destroy_xattr_caches(sbi);
mempool_destroy(sbi->write_io_dummy);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
@@ -1420,6 +1424,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq,
case COMPRESS_LZ4:
algtype = "lz4";
break;
+ case COMPRESS_ZSTD:
+ algtype = "zstd";
+ break;
}
seq_printf(seq, ",compress_algorithm=%s", algtype);
@@ -1436,16 +1443,17 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
- if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) {
- if (test_opt(sbi, FORCE_FG_GC))
- seq_printf(seq, ",background_gc=%s", "sync");
- else
- seq_printf(seq, ",background_gc=%s", "on");
- } else {
+ if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC)
+ seq_printf(seq, ",background_gc=%s", "sync");
+ else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_ON)
+ seq_printf(seq, ",background_gc=%s", "on");
+ else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF)
seq_printf(seq, ",background_gc=%s", "off");
- }
+
if (test_opt(sbi, DISABLE_ROLL_FORWARD))
seq_puts(seq, ",disable_roll_forward");
+ if (test_opt(sbi, NORECOVERY))
+ seq_puts(seq, ",norecovery");
if (test_opt(sbi, DISCARD))
seq_puts(seq, ",discard");
else
@@ -1497,9 +1505,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",data_flush");
seq_puts(seq, ",mode=");
- if (test_opt(sbi, ADAPTIVE))
+ if (F2FS_OPTION(sbi).fs_mode == FS_MODE_ADAPTIVE)
seq_puts(seq, "adaptive");
- else if (test_opt(sbi, LFS))
+ else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS)
seq_puts(seq, "lfs");
seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs);
if (test_opt(sbi, RESERVE_ROOT))
@@ -1570,11 +1578,11 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).test_dummy_encryption = false;
F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID);
- F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO;
+ F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4;
F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE;
F2FS_OPTION(sbi).compress_ext_cnt = 0;
+ F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
- set_opt(sbi, BG_GC);
set_opt(sbi, INLINE_XATTR);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
@@ -1586,9 +1594,9 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, FLUSH_MERGE);
set_opt(sbi, DISCARD);
if (f2fs_sb_has_blkzoned(sbi))
- set_opt_mode(sbi, F2FS_MOUNT_LFS);
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
else
- set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
#ifdef CONFIG_F2FS_FS_XATTR
set_opt(sbi, XATTR_USER);
@@ -1657,7 +1665,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
out_unlock:
up_write(&sbi->gc_lock);
restore_flag:
- sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+ sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
return err;
}
@@ -1780,7 +1788,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* or if background_gc = off is passed in mount
* option. Also sync the filesystem.
*/
- if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) {
+ if ((*flags & SB_RDONLY) ||
+ F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) {
if (sbi->gc_thread) {
f2fs_stop_gc_thread(sbi);
need_restart_gc = true;
@@ -1885,7 +1894,8 @@ repeat:
page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS);
if (IS_ERR(page)) {
if (PTR_ERR(page) == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
goto repeat;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
@@ -1927,6 +1937,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
int offset = off & (sb->s_blocksize - 1);
size_t towrite = len;
struct page *page;
+ void *fsdata = NULL;
char *kaddr;
int err = 0;
int tocopy;
@@ -1936,10 +1947,11 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
towrite);
retry:
err = a_ops->write_begin(NULL, mapping, off, tocopy, 0,
- &page, NULL);
+ &page, &fsdata);
if (unlikely(err)) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
goto retry;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
@@ -1952,7 +1964,7 @@ retry:
flush_dcache_page(page);
a_ops->write_end(NULL, mapping, off, tocopy, tocopy,
- page, NULL);
+ page, fsdata);
offset = 0;
towrite -= tocopy;
off += tocopy;
@@ -3456,12 +3468,17 @@ try_onemore:
}
}
+ /* init per sbi slab cache */
+ err = f2fs_init_xattr_caches(sbi);
+ if (err)
+ goto free_io_dummy;
+
/* get an inode for meta space */
sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
if (IS_ERR(sbi->meta_inode)) {
f2fs_err(sbi, "Failed to read F2FS meta data inode");
err = PTR_ERR(sbi->meta_inode);
- goto free_io_dummy;
+ goto free_xattr_cache;
}
err = f2fs_get_valid_checkpoint(sbi);
@@ -3589,7 +3606,7 @@ try_onemore:
f2fs_err(sbi, "Cannot turn on quotas: error %d", err);
}
#endif
- /* if there are nt orphan nodes free them */
+ /* if there are any orphan inodes, free them */
err = f2fs_recover_orphan_inodes(sbi);
if (err)
goto free_meta;
@@ -3598,7 +3615,8 @@ try_onemore:
goto reset_checkpoint;
/* recover fsynced data */
- if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
+ if (!test_opt(sbi, DISABLE_ROLL_FORWARD) &&
+ !test_opt(sbi, NORECOVERY)) {
/*
* mount should be failed, when device has readonly mode, and
* previous checkpoint was not done by clean system shutdown.
@@ -3664,7 +3682,7 @@ reset_checkpoint:
* If filesystem is not mounted as read-only then
* do start the gc_thread.
*/
- if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) {
+ if (F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF && !f2fs_readonly(sb)) {
/* After POR, we can run background GC thread.*/
err = f2fs_start_gc_thread(sbi);
if (err)
@@ -3733,6 +3751,8 @@ free_meta_inode:
make_bad_inode(sbi->meta_inode);
iput(sbi->meta_inode);
sbi->meta_inode = NULL;
+free_xattr_cache:
+ f2fs_destroy_xattr_caches(sbi);
free_io_dummy:
mempool_destroy(sbi->write_io_dummy);
free_percpu:
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 91d649790b1b..e3bbbef9b4f0 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -109,47 +109,47 @@ static ssize_t features_show(struct f2fs_attr *a,
return sprintf(buf, "0\n");
if (f2fs_sb_has_encrypt(sbi))
- len += snprintf(buf, PAGE_SIZE - len, "%s",
+ len += scnprintf(buf, PAGE_SIZE - len, "%s",
"encryption");
if (f2fs_sb_has_blkzoned(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "blkzoned");
if (f2fs_sb_has_extra_attr(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "extra_attr");
if (f2fs_sb_has_project_quota(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "projquota");
if (f2fs_sb_has_inode_chksum(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "inode_checksum");
if (f2fs_sb_has_flexible_inline_xattr(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "flexible_inline_xattr");
if (f2fs_sb_has_quota_ino(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "quota_ino");
if (f2fs_sb_has_inode_crtime(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "inode_crtime");
if (f2fs_sb_has_lost_found(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "lost_found");
if (f2fs_sb_has_verity(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "verity");
if (f2fs_sb_has_sb_chksum(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "sb_checksum");
if (f2fs_sb_has_casefold(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "casefold");
if (f2fs_sb_has_compression(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "compression");
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "pin_file");
- len += snprintf(buf + len, PAGE_SIZE - len, "\n");
+ len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
return len;
}
@@ -185,6 +185,12 @@ static ssize_t encoding_show(struct f2fs_attr *a,
return sprintf(buf, "(none)");
}
+static ssize_t mounted_time_sec_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time);
+}
+
#ifdef CONFIG_F2FS_STAT_FS
static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
@@ -233,16 +239,16 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
int hot_count = sbi->raw_super->hot_ext_count;
int len = 0, i;
- len += snprintf(buf + len, PAGE_SIZE - len,
+ len += scnprintf(buf + len, PAGE_SIZE - len,
"cold file extension:\n");
for (i = 0; i < cold_count; i++)
- len += snprintf(buf + len, PAGE_SIZE - len, "%s\n",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
extlist[i]);
- len += snprintf(buf + len, PAGE_SIZE - len,
+ len += scnprintf(buf + len, PAGE_SIZE - len,
"hot file extension:\n");
for (i = cold_count; i < cold_count + hot_count; i++)
- len += snprintf(buf + len, PAGE_SIZE - len, "%s\n",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
extlist[i]);
return len;
}
@@ -544,6 +550,7 @@ F2FS_GENERAL_RO_ATTR(features);
F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
F2FS_GENERAL_RO_ATTR(unusable);
F2FS_GENERAL_RO_ATTR(encoding);
+F2FS_GENERAL_RO_ATTR(mounted_time_sec);
#ifdef CONFIG_F2FS_STAT_FS
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count);
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count);
@@ -573,7 +580,9 @@ F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY);
#endif
F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION);
+#endif
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -621,6 +630,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(reserved_blocks),
ATTR_LIST(current_reserved_blocks),
ATTR_LIST(encoding),
+ ATTR_LIST(mounted_time_sec),
#ifdef CONFIG_F2FS_STAT_FS
ATTR_LIST(cp_foreground_calls),
ATTR_LIST(cp_background_calls),
@@ -654,7 +664,9 @@ static struct attribute *f2fs_feat_attrs[] = {
#endif
ATTR_LIST(sb_checksum),
ATTR_LIST(casefold),
+#ifdef CONFIG_F2FS_FS_COMPRESSION
ATTR_LIST(compression),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(f2fs_feat);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 296b3189448a..4f6582ef7ee3 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -23,6 +23,25 @@
#include "xattr.h"
#include "segment.h"
+static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline)
+{
+ if (likely(size == sbi->inline_xattr_slab_size)) {
+ *is_inline = true;
+ return kmem_cache_zalloc(sbi->inline_xattr_slab, GFP_NOFS);
+ }
+ *is_inline = false;
+ return f2fs_kzalloc(sbi, size, GFP_NOFS);
+}
+
+static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr,
+ bool is_inline)
+{
+ if (is_inline)
+ kmem_cache_free(sbi->inline_xattr_slab, xattr_addr);
+ else
+ kvfree(xattr_addr);
+}
+
static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
struct dentry *unused, struct inode *inode,
const char *name, void *buffer, size_t size)
@@ -301,7 +320,8 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr)
static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
unsigned int index, unsigned int len,
const char *name, struct f2fs_xattr_entry **xe,
- void **base_addr, int *base_size)
+ void **base_addr, int *base_size,
+ bool *is_inline)
{
void *cur_addr, *txattr_addr, *last_txattr_addr;
void *last_addr = NULL;
@@ -312,12 +332,12 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
if (!xnid && !inline_size)
return -ENODATA;
- *base_size = XATTR_SIZE(xnid, inode) + XATTR_PADDING_SIZE;
- txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS);
+ *base_size = XATTR_SIZE(inode) + XATTR_PADDING_SIZE;
+ txattr_addr = xattr_alloc(F2FS_I_SB(inode), *base_size, is_inline);
if (!txattr_addr)
return -ENOMEM;
- last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(xnid, inode);
+ last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(inode);
/* read from inline xattr */
if (inline_size) {
@@ -362,7 +382,7 @@ check:
*base_addr = txattr_addr;
return 0;
out:
- kvfree(txattr_addr);
+ xattr_free(F2FS_I_SB(inode), txattr_addr, *is_inline);
return err;
}
@@ -499,6 +519,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
unsigned int size, len;
void *base_addr = NULL;
int base_size;
+ bool is_inline;
if (name == NULL)
return -EINVAL;
@@ -509,7 +530,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
down_read(&F2FS_I(inode)->i_xattr_sem);
error = lookup_all_xattrs(inode, ipage, index, len, name,
- &entry, &base_addr, &base_size);
+ &entry, &base_addr, &base_size, &is_inline);
up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -532,14 +553,13 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
}
error = size;
out:
- kvfree(base_addr);
+ xattr_free(F2FS_I_SB(inode), base_addr, is_inline);
return error;
}
ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
struct inode *inode = d_inode(dentry);
- nid_t xnid = F2FS_I(inode)->i_xattr_nid;
struct f2fs_xattr_entry *entry;
void *base_addr, *last_base_addr;
int error = 0;
@@ -551,7 +571,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if (error)
return error;
- last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode);
+ last_base_addr = (void *)base_addr + XATTR_SIZE(inode);
list_for_each_xattr(entry, base_addr) {
const struct xattr_handler *handler =
@@ -609,7 +629,6 @@ static int __f2fs_setxattr(struct inode *inode, int index,
{
struct f2fs_xattr_entry *here, *last;
void *base_addr, *last_base_addr;
- nid_t xnid = F2FS_I(inode)->i_xattr_nid;
int found, newsize;
size_t len;
__u32 new_hsize;
@@ -633,7 +652,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (error)
return error;
- last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode);
+ last_base_addr = (void *)base_addr + XATTR_SIZE(inode);
/* find entry with wanted name. */
here = __find_xattr(base_addr, last_base_addr, index, len, name);
@@ -758,14 +777,34 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
- /* protect xattr_ver */
- down_write(&F2FS_I(inode)->i_sem);
down_write(&F2FS_I(inode)->i_xattr_sem);
err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
up_write(&F2FS_I(inode)->i_xattr_sem);
- up_write(&F2FS_I(inode)->i_sem);
f2fs_unlock_op(sbi);
f2fs_update_time(sbi, REQ_TIME);
return err;
}
+
+int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi)
+{
+ dev_t dev = sbi->sb->s_bdev->bd_dev;
+ char slab_name[32];
+
+ sprintf(slab_name, "f2fs_xattr_entry-%u:%u", MAJOR(dev), MINOR(dev));
+
+ sbi->inline_xattr_slab_size = F2FS_OPTION(sbi).inline_xattr_size *
+ sizeof(__le32) + XATTR_PADDING_SIZE;
+
+ sbi->inline_xattr_slab = f2fs_kmem_cache_create(slab_name,
+ sbi->inline_xattr_slab_size);
+ if (!sbi->inline_xattr_slab)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi)
+{
+ kmem_cache_destroy(sbi->inline_xattr_slab);
+}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index de0c600b9cab..938fcd20565d 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -49,7 +49,7 @@ struct f2fs_xattr_entry {
__u8 e_name_index;
__u8 e_name_len;
__le16 e_value_size; /* size of attribute value */
- char e_name[0]; /* attribute name */
+ char e_name[]; /* attribute name */
};
#define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr))
@@ -73,7 +73,8 @@ struct f2fs_xattr_entry {
entry = XATTR_NEXT_ENTRY(entry))
#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer))
#define XATTR_PADDING_SIZE (sizeof(__u32))
-#define XATTR_SIZE(x,i) (((x) ? VALID_XATTR_BLOCK_SIZE : 0) + \
+#define XATTR_SIZE(i) ((F2FS_I(i)->i_xattr_nid ? \
+ VALID_XATTR_BLOCK_SIZE : 0) + \
(inline_xattr_size(i)))
#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \
VALID_XATTR_BLOCK_SIZE)
@@ -130,6 +131,8 @@ extern int f2fs_setxattr(struct inode *, int, const char *,
extern int f2fs_getxattr(struct inode *, int, const char *, void *,
size_t, struct page *);
extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
+extern int f2fs_init_xattr_caches(struct f2fs_sb_info *);
+extern void f2fs_destroy_xattr_caches(struct f2fs_sb_info *);
#else
#define f2fs_xattr_handlers NULL
@@ -150,6 +153,8 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
{
return -EOPNOTSUPP;
}
+static inline int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) { return 0; }
+static inline void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) { }
#endif
#ifdef CONFIG_F2FS_FS_SECURITY
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 77bf5f95362d..90b8d879fbaf 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -272,7 +272,9 @@ struct file_system_type *get_fs_type(const char *name)
fs = __get_fs_type(name, len);
if (!fs && (request_module("fs-%.*s", len, name) == 0)) {
fs = __get_fs_type(name, len);
- WARN_ONCE(!fs, "request_module fs-%.*s succeeded, but still no fs?\n", len, name);
+ if (!fs)
+ pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n",
+ len, name);
}
if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index 7e6fb43f9541..ab53e42a874a 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -368,8 +368,6 @@ bool fs_validate_description(const char *name,
const struct fs_parameter_spec *param, *p2;
bool good = true;
- pr_notice("*** VALIDATE %s ***\n", name);
-
for (param = desc; param->name; param++) {
/* Check for duplicate parameter names */
for (p2 = desc; p2 < param; p2++) {
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 09e6be8aa036..2e939f5fe751 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -21,6 +21,7 @@
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
+#include "quota.h"
#include "rgrp.h"
#include "trans.h"
#include "util.h"
@@ -116,14 +117,14 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
return -E2BIG;
- ret = gfs2_rsqa_alloc(ip);
+ ret = gfs2_qa_get(ip);
if (ret)
return ret;
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
if (ret)
- return ret;
+ goto out;
need_unlock = true;
}
@@ -143,5 +144,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
unlock:
if (need_unlock)
gfs2_glock_dq_uninit(&gh);
+out:
+ gfs2_qa_put(ip);
return ret;
}
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index ba83b49ce18c..786c1ce8f030 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -805,11 +805,16 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
bd = bh->b_private;
if (bd) {
gfs2_assert_warn(sdp, bd->bd_bh == bh);
- if (!list_empty(&bd->bd_list))
- list_del_init(&bd->bd_list);
bd->bd_bh = NULL;
bh->b_private = NULL;
- kmem_cache_free(gfs2_bufdata_cachep, bd);
+ /*
+ * The bd may still be queued as a revoke, in which
+ * case we must not dequeue nor free it.
+ */
+ if (!bd->bd_blkno && !list_empty(&bd->bd_list))
+ list_del_init(&bd->bd_list);
+ if (list_empty(&bd->bd_list))
+ kmem_cache_free(gfs2_bufdata_cachep, bd);
}
bh = bh->b_this_page;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 08f6fbb3655e..936a8ec6b48e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -2183,7 +2183,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
inode_dio_wait(inode);
- ret = gfs2_rsqa_alloc(ip);
+ ret = gfs2_qa_get(ip);
if (ret)
goto out;
@@ -2194,7 +2194,8 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
ret = do_shrink(inode, newsize);
out:
- gfs2_rsqa_delete(ip, NULL);
+ gfs2_rs_delete(ip, NULL);
+ gfs2_qa_put(ip);
return ret;
}
@@ -2223,7 +2224,7 @@ void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
struct gfs2_journal_extent *jext;
while(!list_empty(&jd->extent_list)) {
- jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
+ jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list);
list_del(&jext->list);
kfree(jext);
}
@@ -2244,7 +2245,7 @@ static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 b
struct gfs2_journal_extent *jext;
if (!list_empty(&jd->extent_list)) {
- jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
+ jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list);
if ((jext->dblock + jext->blocks) == dblock) {
jext->blocks += blocks;
return 0;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c8b62577e2f2..c3f7732415be 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -2028,7 +2028,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
error = gfs2_trans_begin(sdp,
rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
- RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
+ RES_DINODE + RES_STATFS + RES_QUOTA, RES_DINODE +
+ l_blocks);
if (error)
goto out_rg_gunlock;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index cb26be6f4351..fe305e4bfd37 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -458,10 +458,6 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
- ret = gfs2_rsqa_alloc(ip);
- if (ret)
- goto out;
-
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
if (ret)
@@ -558,7 +554,6 @@ out_uninit:
set_page_dirty(page);
wait_for_stable_page(page);
}
-out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(ret);
}
@@ -635,7 +630,17 @@ int gfs2_open_common(struct inode *inode, struct file *file)
gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
file->private_data = fp;
+ if (file->f_mode & FMODE_WRITE) {
+ ret = gfs2_qa_get(GFS2_I(inode));
+ if (ret)
+ goto fail;
+ }
return 0;
+
+fail:
+ kfree(file->private_data);
+ file->private_data = NULL;
+ return ret;
}
/**
@@ -690,10 +695,10 @@ static int gfs2_release(struct inode *inode, struct file *file)
kfree(file->private_data);
file->private_data = NULL;
- if (!(file->f_mode & FMODE_WRITE))
- return 0;
-
- gfs2_rsqa_delete(ip, &inode->i_writecount);
+ if (file->f_mode & FMODE_WRITE) {
+ gfs2_rs_delete(ip, &inode->i_writecount);
+ gfs2_qa_put(ip);
+ }
return 0;
}
@@ -849,10 +854,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct gfs2_inode *ip = GFS2_I(inode);
ssize_t ret;
- ret = gfs2_rsqa_alloc(ip);
- if (ret)
- return ret;
-
gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from));
if (iocb->ki_flags & IOCB_APPEND) {
@@ -1149,17 +1150,11 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le
if (mode & FALLOC_FL_PUNCH_HOLE) {
ret = __gfs2_punch_hole(file, offset, len);
} else {
- ret = gfs2_rsqa_alloc(ip);
- if (ret)
- goto out_putw;
-
ret = __gfs2_fallocate(file, mode, offset, len);
-
if (ret)
gfs2_rs_deltree(&ip->i_res);
}
-out_putw:
put_write_access(inode);
out_unlock:
gfs2_glock_dq(&gh);
@@ -1173,16 +1168,12 @@ static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,
struct file *out, loff_t *ppos,
size_t len, unsigned int flags)
{
- int error;
- struct gfs2_inode *ip = GFS2_I(out->f_mapping->host);
-
- error = gfs2_rsqa_alloc(ip);
- if (error)
- return (ssize_t)error;
+ ssize_t ret;
gfs2_size_hint(out, *ppos, len);
- return iter_file_splice_write(pipe, out, ppos, len, flags);
+ ret = iter_file_splice_write(pipe, out, ppos, len, flags);
+ return ret;
}
#ifdef CONFIG_GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d0eceaff3cea..29f9b6684b74 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -133,6 +133,33 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu)
}
}
+/**
+ * glock_blocked_by_withdraw - determine if we can still use a glock
+ * @gl: the glock
+ *
+ * We need to allow some glocks to be enqueued, dequeued, promoted, and demoted
+ * when we're withdrawn. For example, to maintain metadata integrity, we should
+ * disallow the use of inode and rgrp glocks when withdrawn. Other glocks, like
+ * iopen or the transaction glocks may be safely used because none of their
+ * metadata goes through the journal. So in general, we should disallow all
+ * glocks that are journaled, and allow all the others. One exception is:
+ * we need to allow our active journal to be promoted and demoted so others
+ * may recover it and we can reacquire it when they're done.
+ */
+static bool glock_blocked_by_withdraw(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+
+ if (likely(!gfs2_withdrawn(sdp)))
+ return false;
+ if (gl->gl_ops->go_flags & GLOF_NONDISK)
+ return false;
+ if (!sdp->sd_jdesc ||
+ gl->gl_name.ln_number == sdp->sd_jdesc->jd_no_addr)
+ return false;
+ return true;
+}
+
void gfs2_glock_free(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
@@ -244,7 +271,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
gfs2_glock_remove_from_lru(gl);
spin_unlock(&gl->gl_lockref.lock);
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
- GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
+ GLOCK_BUG_ON(gl, mapping && mapping->nrpages && !gfs2_withdrawn(sdp));
trace_gfs2_glock_put(gl);
sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
}
@@ -281,7 +308,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
{
- const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list);
+ const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list);
if ((gh->gh_state == LM_ST_EXCLUSIVE ||
gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head)
return 0;
@@ -549,8 +576,8 @@ __acquires(&gl->gl_lockref.lock)
unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
int ret;
- if (unlikely(gfs2_withdrawn(sdp)) &&
- target != LM_ST_UNLOCKED)
+ if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) &&
+ gh && !(gh->gh_flags & LM_FLAG_NOEXP))
return;
lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
LM_FLAG_PRIORITY);
@@ -575,13 +602,64 @@ __acquires(&gl->gl_lockref.lock)
(lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
clear_bit(GLF_BLOCKING, &gl->gl_flags);
spin_unlock(&gl->gl_lockref.lock);
- if (glops->go_sync)
- glops->go_sync(gl);
- if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+ if (glops->go_sync) {
+ ret = glops->go_sync(gl);
+ /* If we had a problem syncing (due to io errors or whatever,
+ * we should not invalidate the metadata or tell dlm to
+ * release the glock to other nodes.
+ */
+ if (ret) {
+ if (cmpxchg(&sdp->sd_log_error, 0, ret)) {
+ fs_err(sdp, "Error %d syncing glock \n", ret);
+ gfs2_dump_glock(NULL, gl, true);
+ }
+ return;
+ }
+ }
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) {
+ /*
+ * The call to go_sync should have cleared out the ail list.
+ * If there are still items, we have a problem. We ought to
+ * withdraw, but we can't because the withdraw code also uses
+ * glocks. Warn about the error, dump the glock, then fall
+ * through and wait for logd to do the withdraw for us.
+ */
+ if ((atomic_read(&gl->gl_ail_count) != 0) &&
+ (!cmpxchg(&sdp->sd_log_error, 0, -EIO))) {
+ gfs2_assert_warn(sdp, !atomic_read(&gl->gl_ail_count));
+ gfs2_dump_glock(NULL, gl, true);
+ }
glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
- clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+ clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+ }
gfs2_glock_hold(gl);
+ /*
+ * Check for an error encountered since we called go_sync and go_inval.
+ * If so, we can't withdraw from the glock code because the withdraw
+ * code itself uses glocks (see function signal_our_withdraw) to
+ * change the mount to read-only. Most importantly, we must not call
+ * dlm to unlock the glock until the journal is in a known good state
+ * (after journal replay) otherwise other nodes may use the object
+ * (rgrp or dinode) and then later, journal replay will corrupt the
+ * file system. The best we can do here is wait for the logd daemon
+ * to see sd_log_error and withdraw, and in the meantime, requeue the
+ * work for later.
+ *
+ * However, if we're just unlocking the lock (say, for unmount, when
+ * gfs2_gl_hash_clear calls clear_glock) and recovery is complete
+ * then it's okay to tell dlm to unlock it.
+ */
+ if (unlikely(sdp->sd_log_error && !gfs2_withdrawn(sdp)))
+ gfs2_withdraw_delayed(sdp);
+ if (glock_blocked_by_withdraw(gl)) {
+ if (target != LM_ST_UNLOCKED ||
+ test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags)) {
+ gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD);
+ goto out;
+ }
+ }
+
if (sdp->sd_lockstruct.ls_ops->lm_lock) {
/* lock_dlm */
ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
@@ -590,8 +668,7 @@ __acquires(&gl->gl_lockref.lock)
test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags)) {
finish_xmote(gl, target);
gfs2_glock_queue_work(gl, 0);
- }
- else if (ret) {
+ } else if (ret) {
fs_err(sdp, "lm_lock ret %d\n", ret);
GLOCK_BUG_ON(gl, !gfs2_withdrawn(sdp));
}
@@ -599,7 +676,7 @@ __acquires(&gl->gl_lockref.lock)
finish_xmote(gl, target);
gfs2_glock_queue_work(gl, 0);
}
-
+out:
spin_lock(&gl->gl_lockref.lock);
}
@@ -613,7 +690,7 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
struct gfs2_holder *gh;
if (!list_empty(&gl->gl_holders)) {
- gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+ gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
return gh;
}
@@ -645,6 +722,9 @@ __acquires(&gl->gl_lockref.lock)
goto out_unlock;
if (nonblock)
goto out_sched;
+ smp_mb();
+ if (atomic_read(&gl->gl_revokes) != 0)
+ goto out_sched;
set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
gl->gl_target = gl->gl_demote_state;
@@ -1160,7 +1240,7 @@ fail:
}
list_add_tail(&gh->gh_list, insert_pt);
do_cancel:
- gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+ gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
spin_unlock(&gl->gl_lockref.lock);
if (sdp->sd_lockstruct.ls_ops->lm_cancel)
@@ -1194,10 +1274,9 @@ trap_recursive:
int gfs2_glock_nq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
int error = 0;
- if (unlikely(gfs2_withdrawn(sdp)))
+ if (glock_blocked_by_withdraw(gl) && !(gh->gh_flags & LM_FLAG_NOEXP))
return -EIO;
if (test_bit(GLF_LRU, &gl->gl_flags))
@@ -1241,24 +1320,32 @@ int gfs2_glock_poll(struct gfs2_holder *gh)
void gfs2_glock_dq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
unsigned delay = 0;
int fast_path = 0;
spin_lock(&gl->gl_lockref.lock);
+ /*
+ * If we're in the process of file system withdraw, we cannot just
+ * dequeue any glocks until our journal is recovered, lest we
+ * introduce file system corruption. We need two exceptions to this
+ * rule: We need to allow unlocking of nondisk glocks and the glock
+ * for our own journal that needs recovery.
+ */
+ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
+ glock_blocked_by_withdraw(gl) &&
+ gh->gh_gl != sdp->sd_jinode_gl) {
+ sdp->sd_glock_dqs_held++;
+ might_sleep();
+ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
+ TASK_UNINTERRUPTIBLE);
+ }
if (gh->gh_flags & GL_NOCACHE)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
list_del_init(&gh->gh_list);
clear_bit(HIF_HOLDER, &gh->gh_iflags);
if (find_first_holder(gl) == NULL) {
- if (glops->go_unlock) {
- GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
- spin_unlock(&gl->gl_lockref.lock);
- glops->go_unlock(gh);
- spin_lock(&gl->gl_lockref.lock);
- clear_bit(GLF_LOCK, &gl->gl_flags);
- }
if (list_empty(&gl->gl_holders) &&
!test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
!test_bit(GLF_DEMOTE, &gl->gl_flags))
@@ -1555,7 +1642,7 @@ __acquires(&lru_lock)
list_sort(NULL, list, glock_cmp);
while(!list_empty(list)) {
- gl = list_entry(list->next, struct gfs2_glock, gl_lru);
+ gl = list_first_entry(list, struct gfs2_glock, gl_lru);
list_del_init(&gl->gl_lru);
if (!spin_trylock(&gl->gl_lockref.lock)) {
add_back_to_lru:
@@ -1596,7 +1683,7 @@ static long gfs2_scan_glock_lru(int nr)
spin_lock(&lru_lock);
while ((nr-- >= 0) && !list_empty(&lru_list)) {
- gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
+ gl = list_first_entry(&lru_list, struct gfs2_glock, gl_lru);
/* Test for being demotable */
if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 061d22e1ceb6..9e9c7a4b8c66 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -29,6 +29,8 @@
struct workqueue_struct *gfs2_freeze_wq;
+extern struct workqueue_struct *gfs2_control_wq;
+
static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
{
fs_err(gl->gl_name.ln_sbd,
@@ -39,7 +41,8 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
fs_err(gl->gl_name.ln_sbd, "AIL glock %u:%llu mapping %p\n",
gl->gl_name.ln_type, gl->gl_name.ln_number,
gfs2_glock2aspace(gl));
- gfs2_lm_withdraw(gl->gl_name.ln_sbd, "AIL error\n");
+ gfs2_lm(gl->gl_name.ln_sbd, "AIL error\n");
+ gfs2_withdraw(gl->gl_name.ln_sbd);
}
/**
@@ -79,34 +82,62 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
}
-static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
+static int gfs2_ail_empty_gl(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_trans tr;
+ int ret;
memset(&tr, 0, sizeof(tr));
INIT_LIST_HEAD(&tr.tr_buf);
INIT_LIST_HEAD(&tr.tr_databuf);
tr.tr_revokes = atomic_read(&gl->gl_ail_count);
- if (!tr.tr_revokes)
- return;
+ if (!tr.tr_revokes) {
+ bool have_revokes;
+ bool log_in_flight;
+
+ /*
+ * We have nothing on the ail, but there could be revokes on
+ * the sdp revoke queue, in which case, we still want to flush
+ * the log and wait for it to finish.
+ *
+ * If the sdp revoke list is empty too, we might still have an
+ * io outstanding for writing revokes, so we should wait for
+ * it before returning.
+ *
+ * If none of these conditions are true, our revokes are all
+ * flushed and we can return.
+ */
+ gfs2_log_lock(sdp);
+ have_revokes = !list_empty(&sdp->sd_log_revokes);
+ log_in_flight = atomic_read(&sdp->sd_log_in_flight);
+ gfs2_log_unlock(sdp);
+ if (have_revokes)
+ goto flush;
+ if (log_in_flight)
+ log_flush_wait(sdp);
+ return 0;
+ }
/* A shortened, inline version of gfs2_trans_begin()
* tr->alloced is not set since the transaction structure is
* on the stack */
tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes);
tr.tr_ip = _RET_IP_;
- if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0)
- return;
+ ret = gfs2_log_reserve(sdp, tr.tr_reserved);
+ if (ret < 0)
+ return ret;
WARN_ON_ONCE(current->journal_info);
current->journal_info = &tr;
__gfs2_ail_flush(gl, 0, tr.tr_revokes);
gfs2_trans_end(sdp);
+flush:
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_AIL_EMPTY_GL);
+ return 0;
}
void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
@@ -140,35 +171,32 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
* return to caller to demote/unlock the glock until I/O is complete.
*/
-static void rgrp_go_sync(struct gfs2_glock *gl)
+static int rgrp_go_sync(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = &sdp->sd_aspace;
- struct gfs2_rgrpd *rgd;
+ struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
int error;
- spin_lock(&gl->gl_lockref.lock);
- rgd = gl->gl_object;
- if (rgd)
- gfs2_rgrp_brelse(rgd);
- spin_unlock(&gl->gl_lockref.lock);
-
if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
- return;
+ return 0;
GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_RGRP_GO_SYNC);
filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
+ WARN_ON_ONCE(error);
mapping_set_error(mapping, error);
- gfs2_ail_empty_gl(gl);
+ if (!error)
+ error = gfs2_ail_empty_gl(gl);
spin_lock(&gl->gl_lockref.lock);
rgd = gl->gl_object;
if (rgd)
gfs2_free_clones(rgd);
spin_unlock(&gl->gl_lockref.lock);
+ return error;
}
/**
@@ -191,7 +219,6 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
gfs2_rgrp_brelse(rgd);
WARN_ON_ONCE(!(flags & DIO_METADATA));
- gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
if (rgd)
@@ -236,12 +263,12 @@ static void gfs2_clear_glop_pending(struct gfs2_inode *ip)
*
*/
-static void inode_go_sync(struct gfs2_glock *gl)
+static int inode_go_sync(struct gfs2_glock *gl)
{
struct gfs2_inode *ip = gfs2_glock2inode(gl);
int isreg = ip && S_ISREG(ip->i_inode.i_mode);
struct address_space *metamapping = gfs2_glock2aspace(gl);
- int error;
+ int error = 0;
if (isreg) {
if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
@@ -274,6 +301,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
out:
gfs2_clear_glop_pending(ip);
+ return error;
}
/**
@@ -291,8 +319,6 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
{
struct gfs2_inode *ip = gfs2_glock2inode(gl);
- gfs2_assert_withdraw(gl->gl_name.ln_sbd, !atomic_read(&gl->gl_ail_count));
-
if (flags & DIO_METADATA) {
struct address_space *mapping = gfs2_glock2aspace(gl);
truncate_inode_pages(mapping, 0);
@@ -496,24 +522,29 @@ static void inode_go_dump(struct seq_file *seq, struct gfs2_glock *gl,
*
*/
-static void freeze_go_sync(struct gfs2_glock *gl)
+static int freeze_go_sync(struct gfs2_glock *gl)
{
int error = 0;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- if (gl->gl_state == LM_ST_SHARED &&
+ if (gl->gl_state == LM_ST_SHARED && !gfs2_withdrawn(sdp) &&
test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);
error = freeze_super(sdp->sd_vfs);
if (error) {
fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
error);
+ if (gfs2_withdrawn(sdp)) {
+ atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
+ return 0;
+ }
gfs2_assert_withdraw(sdp, 0);
}
queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work);
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
GFS2_LFC_FREEZE_GO_SYNC);
}
+ return 0;
}
/**
@@ -582,8 +613,76 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
}
}
+/**
+ * inode_go_free - wake up anyone waiting for dlm's unlock ast to free it
+ * @gl: glock being freed
+ *
+ * For now, this is only used for the journal inode glock. In withdraw
+ * situations, we need to wait for the glock to be freed so that we know
+ * other nodes may proceed with recovery / journal replay.
+ */
+static void inode_go_free(struct gfs2_glock *gl)
+{
+ /* Note that we cannot reference gl_object because it's already set
+ * to NULL by this point in its lifecycle. */
+ if (!test_bit(GLF_FREEING, &gl->gl_flags))
+ return;
+ clear_bit_unlock(GLF_FREEING, &gl->gl_flags);
+ wake_up_bit(&gl->gl_flags, GLF_FREEING);
+}
+
+/**
+ * nondisk_go_callback - used to signal when a node did a withdraw
+ * @gl: the nondisk glock
+ * @remote: true if this came from a different cluster node
+ *
+ */
+static void nondisk_go_callback(struct gfs2_glock *gl, bool remote)
+{
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+
+ /* Ignore the callback unless it's from another node, and it's the
+ live lock. */
+ if (!remote || gl->gl_name.ln_number != GFS2_LIVE_LOCK)
+ return;
+
+ /* First order of business is to cancel the demote request. We don't
+ * really want to demote a nondisk glock. At best it's just to inform
+ * us of another node's withdraw. We'll keep it in SH mode. */
+ clear_bit(GLF_DEMOTE, &gl->gl_flags);
+ clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags);
+
+ /* Ignore the unlock if we're withdrawn, unmounting, or in recovery. */
+ if (test_bit(SDF_NORECOVERY, &sdp->sd_flags) ||
+ test_bit(SDF_WITHDRAWN, &sdp->sd_flags) ||
+ test_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags))
+ return;
+
+ /* We only care when a node wants us to unlock, because that means
+ * they want a journal recovered. */
+ if (gl->gl_demote_state != LM_ST_UNLOCKED)
+ return;
+
+ if (sdp->sd_args.ar_spectator) {
+ fs_warn(sdp, "Spectator node cannot recover journals.\n");
+ return;
+ }
+
+ fs_warn(sdp, "Some node has withdrawn; checking for recovery.\n");
+ set_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags);
+ /*
+ * We can't call remote_withdraw directly here or gfs2_recover_journal
+ * because this is called from the glock unlock function and the
+ * remote_withdraw needs to enqueue and dequeue the same "live" glock
+ * we were called from. So we queue it to the control work queue in
+ * lock_dlm.
+ */
+ queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0);
+}
+
const struct gfs2_glock_operations gfs2_meta_glops = {
.go_type = LM_TYPE_META,
+ .go_flags = GLOF_NONDISK,
};
const struct gfs2_glock_operations gfs2_inode_glops = {
@@ -594,13 +693,13 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
.go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
.go_flags = GLOF_ASPACE | GLOF_LRU,
+ .go_free = inode_go_free,
};
const struct gfs2_glock_operations gfs2_rgrp_glops = {
.go_sync = rgrp_go_sync,
.go_inval = rgrp_go_inval,
.go_lock = gfs2_rgrp_go_lock,
- .go_unlock = gfs2_rgrp_go_unlock,
.go_dump = gfs2_rgrp_dump,
.go_type = LM_TYPE_RGRP,
.go_flags = GLOF_LVB,
@@ -611,30 +710,34 @@ const struct gfs2_glock_operations gfs2_freeze_glops = {
.go_xmote_bh = freeze_go_xmote_bh,
.go_demote_ok = freeze_go_demote_ok,
.go_type = LM_TYPE_NONDISK,
+ .go_flags = GLOF_NONDISK,
};
const struct gfs2_glock_operations gfs2_iopen_glops = {
.go_type = LM_TYPE_IOPEN,
.go_callback = iopen_go_callback,
- .go_flags = GLOF_LRU,
+ .go_flags = GLOF_LRU | GLOF_NONDISK,
};
const struct gfs2_glock_operations gfs2_flock_glops = {
.go_type = LM_TYPE_FLOCK,
- .go_flags = GLOF_LRU,
+ .go_flags = GLOF_LRU | GLOF_NONDISK,
};
const struct gfs2_glock_operations gfs2_nondisk_glops = {
.go_type = LM_TYPE_NONDISK,
+ .go_flags = GLOF_NONDISK,
+ .go_callback = nondisk_go_callback,
};
const struct gfs2_glock_operations gfs2_quota_glops = {
.go_type = LM_TYPE_QUOTA,
- .go_flags = GLOF_LVB | GLOF_LRU,
+ .go_flags = GLOF_LVB | GLOF_LRU | GLOF_NONDISK,
};
const struct gfs2_glock_operations gfs2_journal_glops = {
.go_type = LM_TYPE_JOURNAL,
+ .go_flags = GLOF_NONDISK,
};
const struct gfs2_glock_operations *gfs2_glops_list[] = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 9fd88ed18807..84a824293a78 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -234,20 +234,21 @@ struct lm_lockname {
struct gfs2_glock_operations {
- void (*go_sync) (struct gfs2_glock *gl);
+ int (*go_sync) (struct gfs2_glock *gl);
int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
void (*go_inval) (struct gfs2_glock *gl, int flags);
int (*go_demote_ok) (const struct gfs2_glock *gl);
int (*go_lock) (struct gfs2_holder *gh);
- void (*go_unlock) (struct gfs2_holder *gh);
void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl,
const char *fs_id_buf);
void (*go_callback)(struct gfs2_glock *gl, bool remote);
+ void (*go_free)(struct gfs2_glock *gl);
const int go_type;
const unsigned long go_flags;
-#define GLOF_ASPACE 1
-#define GLOF_LVB 2
-#define GLOF_LRU 4
+#define GLOF_ASPACE 1 /* address space attached */
+#define GLOF_LVB 2 /* Lock Value Block attached */
+#define GLOF_LRU 4 /* LRU managed */
+#define GLOF_NONDISK 8 /* not I/O related */
};
enum {
@@ -294,6 +295,7 @@ struct gfs2_qadata { /* quota allocation data */
struct gfs2_quota_data *qa_qd[2 * GFS2_MAXQUOTAS];
struct gfs2_holder qa_qd_ghs[2 * GFS2_MAXQUOTAS];
unsigned int qa_qd_num;
+ int qa_ref;
};
/* Resource group multi-block reservation, in order of appearance:
@@ -343,6 +345,7 @@ enum {
GLF_OBJECT = 14, /* Used only for tracing */
GLF_BLOCKING = 15,
GLF_INODE_CREATING = 16, /* Inode creation occurring */
+ GLF_FREEING = 18, /* Wait for glock to be freed */
};
struct gfs2_glock {
@@ -542,6 +545,7 @@ struct gfs2_jdesc {
struct list_head jd_revoke_list;
unsigned int jd_replay_tail;
+ u64 jd_no_addr;
};
struct gfs2_statfs_change_host {
@@ -616,8 +620,12 @@ enum {
SDF_RORECOVERY = 7, /* read only recovery */
SDF_SKIP_DLM_UNLOCK = 8,
SDF_FORCE_AIL_FLUSH = 9,
- SDF_AIL1_IO_ERROR = 10,
- SDF_FS_FROZEN = 11,
+ SDF_FS_FROZEN = 10,
+ SDF_WITHDRAWING = 11, /* Will withdraw eventually */
+ SDF_WITHDRAW_IN_PROG = 12, /* Withdraw is in progress */
+ SDF_REMOTE_WITHDRAW = 13, /* Performing remote recovery */
+ SDF_WITHDRAW_RECOVERY = 14, /* Wait for journal recovery when we are
+ withdrawing */
};
enum gfs2_freeze_state {
@@ -768,6 +776,7 @@ struct gfs2_sbd {
struct gfs2_jdesc *sd_jdesc;
struct gfs2_holder sd_journal_gh;
struct gfs2_holder sd_jinode_gh;
+ struct gfs2_glock *sd_jinode_gl;
struct gfs2_holder sd_sc_gh;
struct gfs2_holder sd_qc_gh;
@@ -828,7 +837,8 @@ struct gfs2_sbd {
atomic_t sd_log_in_flight;
struct bio *sd_log_bio;
wait_queue_head_t sd_log_flush_wait;
- int sd_log_error;
+ int sd_log_error; /* First log error */
+ wait_queue_head_t sd_withdraw_wait;
atomic_t sd_reserving_log;
wait_queue_head_t sd_reserving_log_wait;
@@ -852,6 +862,7 @@ struct gfs2_sbd {
unsigned long sd_last_warning;
struct dentry *debugfs_dir; /* debugfs directory */
+ unsigned long sd_glock_dqs_held;
};
static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which)
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8294851a9dd9..70b2d3a1e866 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -144,7 +144,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
if (unlikely(error))
- goto fail_put;
+ goto fail;
if (type == DT_UNKNOWN || blktype != GFS2_BLKST_FREE) {
/*
@@ -155,13 +155,13 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE,
GL_SKIP, &i_gh);
if (error)
- goto fail_put;
+ goto fail;
if (blktype != GFS2_BLKST_FREE) {
error = gfs2_check_blk_type(sdp, no_addr,
blktype);
if (error)
- goto fail_put;
+ goto fail;
}
}
@@ -169,7 +169,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
set_bit(GIF_INVALID, &ip->i_flags);
error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
if (unlikely(error))
- goto fail_put;
+ goto fail;
glock_set_object(ip->i_iopen_gh.gh_gl, ip);
gfs2_glock_put(io_gl);
io_gl = NULL;
@@ -182,7 +182,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
/* Inode glock must be locked already */
error = gfs2_inode_refresh(GFS2_I(inode));
if (error)
- goto fail_refresh;
+ goto fail;
} else {
ip->i_no_formal_ino = no_formal_ino;
inode->i_mode = DT2IF(type);
@@ -197,17 +197,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
gfs2_glock_dq_uninit(&i_gh);
return inode;
-fail_refresh:
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- glock_clear_object(ip->i_iopen_gh.gh_gl, ip);
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
-fail_put:
+fail:
if (io_gl)
gfs2_glock_put(io_gl);
- glock_clear_object(ip->i_gl, ip);
if (gfs2_holder_initialized(&i_gh))
gfs2_glock_dq_uninit(&i_gh);
-fail:
iget_failed(inode);
return ERR_PTR(error);
}
@@ -594,13 +588,13 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (!name->len || name->len > GFS2_FNAMESIZE)
return -ENAMETOOLONG;
- error = gfs2_rsqa_alloc(dip);
+ error = gfs2_qa_get(dip);
if (error)
return error;
error = gfs2_rindex_update(sdp);
if (error)
- return error;
+ goto fail;
error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
if (error)
@@ -647,7 +641,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
goto fail_gunlock;
ip = GFS2_I(inode);
- error = gfs2_rsqa_alloc(ip);
+ error = gfs2_qa_get(ip);
if (error)
goto fail_free_acls;
@@ -782,11 +776,13 @@ fail_gunlock2:
clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags);
gfs2_glock_put(io_gl);
fail_free_inode:
+ gfs2_qa_put(ip);
if (ip->i_gl) {
glock_clear_object(ip->i_gl, ip);
gfs2_glock_put(ip->i_gl);
}
- gfs2_rsqa_delete(ip, NULL);
+ gfs2_rs_delete(ip, NULL);
+ gfs2_qa_put(ip);
fail_free_acls:
posix_acl_release(default_acl);
posix_acl_release(acl);
@@ -804,6 +800,7 @@ fail_gunlock:
if (gfs2_holder_initialized(ghs + 1))
gfs2_glock_dq_uninit(ghs + 1);
fail:
+ gfs2_qa_put(dip);
return error;
}
@@ -905,7 +902,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (S_ISDIR(inode->i_mode))
return -EPERM;
- error = gfs2_rsqa_alloc(dip);
+ error = gfs2_qa_get(dip);
if (error)
return error;
@@ -1008,6 +1005,7 @@ out_gunlock:
out_child:
gfs2_glock_dq(ghs);
out_parent:
+ gfs2_qa_put(ip);
gfs2_holder_uninit(ghs);
gfs2_holder_uninit(ghs + 1);
return error;
@@ -1368,7 +1366,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
if (error)
return error;
- error = gfs2_rsqa_alloc(ndip);
+ error = gfs2_qa_get(ndip);
if (error)
return error;
@@ -1568,6 +1566,7 @@ out_gunlock_r:
if (gfs2_holder_initialized(&r_gh))
gfs2_glock_dq_uninit(&r_gh);
out:
+ gfs2_qa_put(ndip);
return error;
}
@@ -1879,10 +1878,9 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
ouid = nuid = NO_UID_QUOTA_CHANGE;
if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
ogid = ngid = NO_GID_QUOTA_CHANGE;
-
- error = gfs2_rsqa_alloc(ip);
+ error = gfs2_qa_get(ip);
if (error)
- goto out;
+ return error;
error = gfs2_rindex_update(sdp);
if (error)
@@ -1920,6 +1918,7 @@ out_end_trans:
out_gunlock_q:
gfs2_quota_unlock(ip);
out:
+ gfs2_qa_put(ip);
return error;
}
@@ -1941,21 +1940,21 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
struct gfs2_holder i_gh;
int error;
- error = gfs2_rsqa_alloc(ip);
+ error = gfs2_qa_get(ip);
if (error)
return error;
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
if (error)
- return error;
+ goto out;
error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto out;
+ goto error;
error = setattr_prepare(dentry, attr);
if (error)
- goto out;
+ goto error;
if (attr->ia_valid & ATTR_SIZE)
error = gfs2_setattr_size(inode, attr->ia_size);
@@ -1967,10 +1966,12 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
error = posix_acl_chmod(inode, inode->i_mode);
}
-out:
+error:
if (!error)
mark_inode_dirty(inode);
gfs2_glock_dq_uninit(&i_gh);
+out:
+ gfs2_qa_put(ip);
return error;
}
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 7c7197343ee2..9f2b5609f225 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -16,6 +16,8 @@
#include "incore.h"
#include "glock.h"
+#include "glops.h"
+#include "recovery.h"
#include "util.h"
#include "sys.h"
#include "trace_gfs2.h"
@@ -124,6 +126,8 @@ static void gdlm_ast(void *arg)
switch (gl->gl_lksb.sb_status) {
case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
+ if (gl->gl_ops->go_free)
+ gl->gl_ops->go_free(gl);
gfs2_glock_free(gl);
return;
case -DLM_ECANCEL: /* Cancel while getting lock */
@@ -323,6 +327,7 @@ static void gdlm_cancel(struct gfs2_glock *gl)
/*
* dlm/gfs2 recovery coordination using dlm_recover callbacks
*
+ * 0. gfs2 checks for another cluster node withdraw, needing journal replay
* 1. dlm_controld sees lockspace members change
* 2. dlm_controld blocks dlm-kernel locking activity
* 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep)
@@ -571,6 +576,28 @@ static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
&ls->ls_control_lksb, "control_lock");
}
+/**
+ * remote_withdraw - react to a node withdrawing from the file system
+ * @sdp: The superblock
+ */
+static void remote_withdraw(struct gfs2_sbd *sdp)
+{
+ struct gfs2_jdesc *jd;
+ int ret = 0, count = 0;
+
+ list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+ if (jd->jd_jid == sdp->sd_lockstruct.ls_jid)
+ continue;
+ ret = gfs2_recover_journal(jd, true);
+ if (ret)
+ break;
+ count++;
+ }
+
+ /* Now drop the additional reference we acquired */
+ fs_err(sdp, "Journals checked: %d, ret = %d.\n", count, ret);
+}
+
static void gfs2_control_func(struct work_struct *work)
{
struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work);
@@ -581,6 +608,13 @@ static void gfs2_control_func(struct work_struct *work)
int recover_size;
int i, error;
+ /* First check for other nodes that may have done a withdraw. */
+ if (test_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags)) {
+ remote_withdraw(sdp);
+ clear_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags);
+ return;
+ }
+
spin_lock(&ls->ls_recover_spin);
/*
* No MOUNT_DONE means we're still mounting; control_mount()
@@ -1079,6 +1113,10 @@ static void gdlm_recover_prep(void *arg)
struct gfs2_sbd *sdp = arg;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ if (gfs2_withdrawn(sdp)) {
+ fs_err(sdp, "recover_prep ignored due to withdraw.\n");
+ return;
+ }
spin_lock(&ls->ls_recover_spin);
ls->ls_recover_block = ls->ls_recover_start;
set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
@@ -1101,6 +1139,11 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
int jid = slot->slot - 1;
+ if (gfs2_withdrawn(sdp)) {
+ fs_err(sdp, "recover_slot jid %d ignored due to withdraw.\n",
+ jid);
+ return;
+ }
spin_lock(&ls->ls_recover_spin);
if (ls->ls_recover_size < jid + 1) {
fs_err(sdp, "recover_slot jid %d gen %u short size %d\n",
@@ -1125,6 +1168,10 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
struct gfs2_sbd *sdp = arg;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ if (gfs2_withdrawn(sdp)) {
+ fs_err(sdp, "recover_done ignored due to withdraw.\n");
+ return;
+ }
/* ensure the ls jid arrays are large enough */
set_recover_size(sdp, slots, num_slots);
@@ -1152,6 +1199,11 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid,
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ if (gfs2_withdrawn(sdp)) {
+ fs_err(sdp, "recovery_result jid %d ignored due to withdraw.\n",
+ jid);
+ return;
+ }
if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
return;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 00a2e721a374..3a75843ae580 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -88,8 +88,7 @@ static void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
static int gfs2_ail1_start_one(struct gfs2_sbd *sdp,
struct writeback_control *wbc,
- struct gfs2_trans *tr,
- bool *withdraw)
+ struct gfs2_trans *tr)
__releases(&sdp->sd_ail_lock)
__acquires(&sdp->sd_ail_lock)
{
@@ -97,6 +96,7 @@ __acquires(&sdp->sd_ail_lock)
struct address_space *mapping;
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
+ int ret = 0;
list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) {
bh = bd->bd_bh;
@@ -104,16 +104,21 @@ __acquires(&sdp->sd_ail_lock)
gfs2_assert(sdp, bd->bd_tr == tr);
if (!buffer_busy(bh)) {
- if (!buffer_uptodate(bh) &&
- !test_and_set_bit(SDF_AIL1_IO_ERROR,
- &sdp->sd_flags)) {
+ if (buffer_uptodate(bh)) {
+ list_move(&bd->bd_ail_st_list,
+ &tr->tr_ail2_list);
+ continue;
+ }
+ if (!cmpxchg(&sdp->sd_log_error, 0, -EIO)) {
gfs2_io_error_bh(sdp, bh);
- *withdraw = true;
+ gfs2_withdraw_delayed(sdp);
}
- list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list);
- continue;
}
+ if (gfs2_withdrawn(sdp)) {
+ gfs2_remove_from_ail(bd);
+ continue;
+ }
if (!buffer_dirty(bh))
continue;
if (gl == bd->bd_gl)
@@ -124,16 +129,50 @@ __acquires(&sdp->sd_ail_lock)
if (!mapping)
continue;
spin_unlock(&sdp->sd_ail_lock);
- generic_writepages(mapping, wbc);
+ ret = generic_writepages(mapping, wbc);
spin_lock(&sdp->sd_ail_lock);
- if (wbc->nr_to_write <= 0)
+ if (ret || wbc->nr_to_write <= 0)
break;
- return 1;
+ return -EBUSY;
}
- return 0;
+ return ret;
}
+static void dump_ail_list(struct gfs2_sbd *sdp)
+{
+ struct gfs2_trans *tr;
+ struct gfs2_bufdata *bd;
+ struct buffer_head *bh;
+
+ fs_err(sdp, "Error: In gfs2_ail1_flush for ten minutes! t=%d\n",
+ current->journal_info ? 1 : 0);
+
+ list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
+ list_for_each_entry_reverse(bd, &tr->tr_ail1_list,
+ bd_ail_st_list) {
+ bh = bd->bd_bh;
+ fs_err(sdp, "bd %p: blk:0x%llx bh=%p ", bd,
+ (unsigned long long)bd->bd_blkno, bh);
+ if (!bh) {
+ fs_err(sdp, "\n");
+ continue;
+ }
+ fs_err(sdp, "0x%llx up2:%d dirt:%d lkd:%d req:%d "
+ "map:%d new:%d ar:%d aw:%d delay:%d "
+ "io err:%d unwritten:%d dfr:%d pin:%d esc:%d\n",
+ (unsigned long long)bh->b_blocknr,
+ buffer_uptodate(bh), buffer_dirty(bh),
+ buffer_locked(bh), buffer_req(bh),
+ buffer_mapped(bh), buffer_new(bh),
+ buffer_async_read(bh), buffer_async_write(bh),
+ buffer_delay(bh), buffer_write_io_error(bh),
+ buffer_unwritten(bh),
+ buffer_defer_completion(bh),
+ buffer_pinned(bh), buffer_escaped(bh));
+ }
+ }
+}
/**
* gfs2_ail1_flush - start writeback of some ail1 entries
@@ -149,23 +188,36 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
struct list_head *head = &sdp->sd_ail1_list;
struct gfs2_trans *tr;
struct blk_plug plug;
- bool withdraw = false;
+ int ret;
+ unsigned long flush_start = jiffies;
trace_gfs2_ail_flush(sdp, wbc, 1);
blk_start_plug(&plug);
spin_lock(&sdp->sd_ail_lock);
restart:
+ ret = 0;
+ if (time_after(jiffies, flush_start + (HZ * 600))) {
+ dump_ail_list(sdp);
+ goto out;
+ }
list_for_each_entry_reverse(tr, head, tr_list) {
if (wbc->nr_to_write <= 0)
break;
- if (gfs2_ail1_start_one(sdp, wbc, tr, &withdraw) &&
- !gfs2_withdrawn(sdp))
- goto restart;
+ ret = gfs2_ail1_start_one(sdp, wbc, tr);
+ if (ret) {
+ if (ret == -EBUSY)
+ goto restart;
+ break;
+ }
}
+out:
spin_unlock(&sdp->sd_ail_lock);
blk_finish_plug(&plug);
- if (withdraw)
- gfs2_lm_withdraw(sdp, NULL);
+ if (ret) {
+ gfs2_lm(sdp, "gfs2_ail1_start_one (generic_writepages) "
+ "returned: %d\n", ret);
+ gfs2_withdraw(sdp);
+ }
trace_gfs2_ail_flush(sdp, wbc, 0);
}
@@ -189,12 +241,13 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
/**
* gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
* @sdp: the filesystem
- * @ai: the AIL entry
+ * @tr: the transaction
+ * @max_revokes: If nonzero, issue revokes for the bd items for written buffers
*
*/
static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
- bool *withdraw)
+ int *max_revokes)
{
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
@@ -203,12 +256,32 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
bd_ail_st_list) {
bh = bd->bd_bh;
gfs2_assert(sdp, bd->bd_tr == tr);
- if (buffer_busy(bh))
+ /*
+ * If another process flagged an io error, e.g. writing to the
+ * journal, error all other bhs and move them off the ail1 to
+ * prevent a tight loop when unmount tries to flush ail1,
+ * regardless of whether they're still busy. If no outside
+ * errors were found and the buffer is busy, move to the next.
+ * If the ail buffer is not busy and caught an error, flag it
+ * for others.
+ */
+ if (!sdp->sd_log_error && buffer_busy(bh))
continue;
if (!buffer_uptodate(bh) &&
- !test_and_set_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) {
+ !cmpxchg(&sdp->sd_log_error, 0, -EIO)) {
gfs2_io_error_bh(sdp, bh);
- *withdraw = true;
+ gfs2_withdraw_delayed(sdp);
+ }
+ /*
+ * If we have space for revokes and the bd is no longer on any
+ * buf list, we can just add a revoke for it immediately and
+ * avoid having to put it on the ail2 list, where it would need
+ * to be revoked later.
+ */
+ if (*max_revokes && list_empty(&bd->bd_list)) {
+ gfs2_add_revoke(sdp, bd);
+ (*max_revokes)--;
+ continue;
}
list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list);
}
@@ -217,20 +290,20 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
/**
* gfs2_ail1_empty - Try to empty the ail1 lists
* @sdp: The superblock
+ * @max_revokes: If non-zero, add revokes where appropriate
*
* Tries to empty the ail1 lists, starting with the oldest first
*/
-static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
+static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes)
{
struct gfs2_trans *tr, *s;
int oldest_tr = 1;
int ret;
- bool withdraw = false;
spin_lock(&sdp->sd_ail_lock);
list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
- gfs2_ail1_empty_one(sdp, tr, &withdraw);
+ gfs2_ail1_empty_one(sdp, tr, &max_revokes);
if (list_empty(&tr->tr_ail1_list) && oldest_tr)
list_move(&tr->tr_list, &sdp->sd_ail2_list);
else
@@ -239,8 +312,10 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
ret = list_empty(&sdp->sd_ail1_list);
spin_unlock(&sdp->sd_ail_lock);
- if (withdraw)
- gfs2_lm_withdraw(sdp, "fatal: I/O error(s)\n");
+ if (test_bit(SDF_WITHDRAWING, &sdp->sd_flags)) {
+ gfs2_lm(sdp, "fatal: I/O error(s)\n");
+ gfs2_withdraw(sdp);
+ }
return ret;
}
@@ -268,20 +343,17 @@ static void gfs2_ail1_wait(struct gfs2_sbd *sdp)
}
/**
- * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced
- * @sdp: the filesystem
- * @ai: the AIL entry
- *
+ * gfs2_ail_empty_tr - empty one of the ail lists for a transaction
*/
-static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+static void gfs2_ail_empty_tr(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
+ struct list_head *head)
{
- struct list_head *head = &tr->tr_ail2_list;
struct gfs2_bufdata *bd;
while (!list_empty(head)) {
- bd = list_entry(head->prev, struct gfs2_bufdata,
- bd_ail_st_list);
+ bd = list_first_entry(head, struct gfs2_bufdata,
+ bd_ail_st_list);
gfs2_assert(sdp, bd->bd_tr == tr);
gfs2_remove_from_ail(bd);
}
@@ -303,7 +375,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
if (!rm)
continue;
- gfs2_ail2_empty_one(sdp, tr);
+ gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list);
list_del(&tr->tr_list);
gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list));
gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list));
@@ -487,7 +559,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
if (list_empty(&sdp->sd_ail1_list)) {
tail = sdp->sd_log_head;
} else {
- tr = list_entry(sdp->sd_ail1_list.prev, struct gfs2_trans,
+ tr = list_last_entry(&sdp->sd_ail1_list, struct gfs2_trans,
tr_list);
tail = tr->tr_first;
}
@@ -512,7 +584,7 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
}
-static void log_flush_wait(struct gfs2_sbd *sdp)
+void log_flush_wait(struct gfs2_sbd *sdp)
{
DEFINE_WAIT(wait);
@@ -549,7 +621,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
spin_lock(&sdp->sd_ordered_lock);
list_sort(NULL, &sdp->sd_log_ordered, &ip_cmp);
while (!list_empty(&sdp->sd_log_ordered)) {
- ip = list_entry(sdp->sd_log_ordered.next, struct gfs2_inode, i_ordered);
+ ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered);
if (ip->i_inode.i_mapping->nrpages == 0) {
test_and_clear_bit(GIF_ORDERED, &ip->i_flags);
list_del(&ip->i_ordered);
@@ -570,7 +642,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
spin_lock(&sdp->sd_ordered_lock);
while (!list_empty(&sdp->sd_log_ordered)) {
- ip = list_entry(sdp->sd_log_ordered.next, struct gfs2_inode, i_ordered);
+ ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered);
list_del(&ip->i_ordered);
WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags));
if (ip->i_inode.i_mapping->nrpages == 0)
@@ -616,27 +688,24 @@ void gfs2_glock_remove_revoke(struct gfs2_glock *gl)
}
}
+/**
+ * gfs2_write_revokes - Add as many revokes to the system transaction as we can
+ * @sdp: The GFS2 superblock
+ *
+ * Our usual strategy is to defer writing revokes as much as we can in the hope
+ * that we'll eventually overwrite the journal, which will make those revokes
+ * go away. This changes when we flush the log: at that point, there will
+ * likely be some left-over space in the last revoke block of that transaction.
+ * We can fill that space with additional revokes for blocks that have already
+ * been written back. This will basically come at no cost now, and will save
+ * us from having to keep track of those blocks on the AIL2 list later.
+ */
void gfs2_write_revokes(struct gfs2_sbd *sdp)
{
- struct gfs2_trans *tr;
- struct gfs2_bufdata *bd, *tmp;
- int have_revokes = 0;
+ /* number of revokes we still have room for */
int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
- gfs2_ail1_empty(sdp);
- spin_lock(&sdp->sd_ail_lock);
- list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
- list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) {
- if (list_empty(&bd->bd_list)) {
- have_revokes = 1;
- goto done;
- }
- }
- }
-done:
- spin_unlock(&sdp->sd_ail_lock);
- if (have_revokes == 0)
- return;
+ gfs2_log_lock(sdp);
while (sdp->sd_log_num_revoke > max_revokes)
max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
max_revokes -= sdp->sd_log_num_revoke;
@@ -647,20 +716,7 @@ done:
if (!sdp->sd_log_blks_reserved)
atomic_dec(&sdp->sd_log_blks_free);
}
- gfs2_log_lock(sdp);
- spin_lock(&sdp->sd_ail_lock);
- list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
- list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) {
- if (max_revokes == 0)
- goto out_of_blocks;
- if (!list_empty(&bd->bd_list))
- continue;
- gfs2_add_revoke(sdp, bd);
- max_revokes--;
- }
- }
-out_of_blocks:
- spin_unlock(&sdp->sd_ail_lock);
+ gfs2_ail1_empty(sdp, max_revokes);
gfs2_log_unlock(sdp);
if (!sdp->sd_log_num_revoke) {
@@ -787,6 +843,40 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
}
/**
+ * ail_drain - drain the ail lists after a withdraw
+ * @sdp: Pointer to GFS2 superblock
+ */
+static void ail_drain(struct gfs2_sbd *sdp)
+{
+ struct gfs2_trans *tr;
+
+ spin_lock(&sdp->sd_ail_lock);
+ /*
+ * For transactions on the sd_ail1_list we need to drain both the
+ * ail1 and ail2 lists. That's because function gfs2_ail1_start_one
+ * (temporarily) moves items from its tr_ail1 list to tr_ail2 list
+ * before revokes are sent for that block. Items on the sd_ail2_list
+ * should have already gotten beyond that point, so no need.
+ */
+ while (!list_empty(&sdp->sd_ail1_list)) {
+ tr = list_first_entry(&sdp->sd_ail1_list, struct gfs2_trans,
+ tr_list);
+ gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail1_list);
+ gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list);
+ list_del(&tr->tr_list);
+ kfree(tr);
+ }
+ while (!list_empty(&sdp->sd_ail2_list)) {
+ tr = list_first_entry(&sdp->sd_ail2_list, struct gfs2_trans,
+ tr_list);
+ gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list);
+ list_del(&tr->tr_list);
+ kfree(tr);
+ }
+ spin_unlock(&sdp->sd_ail_lock);
+}
+
+/**
* gfs2_log_flush - flush incore transaction(s)
* @sdp: the filesystem
* @gl: The glock structure to flush. If NULL, flush the whole incore log
@@ -796,11 +886,18 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
{
- struct gfs2_trans *tr;
+ struct gfs2_trans *tr = NULL;
enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
down_write(&sdp->sd_log_flush_lock);
+ /*
+ * Do this check while holding the log_flush_lock to prevent new
+ * buffers from being added to the ail via gfs2_pin()
+ */
+ if (gfs2_withdrawn(sdp))
+ goto out;
+
/* Log might have been flushed while we waited for the flush lock */
if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) {
up_write(&sdp->sd_log_flush_lock);
@@ -819,17 +916,27 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
INIT_LIST_HEAD(&tr->tr_ail2_list);
tr->tr_first = sdp->sd_log_flush_head;
if (unlikely (state == SFS_FROZEN))
- gfs2_assert_withdraw(sdp, !tr->tr_num_buf_new && !tr->tr_num_databuf_new);
+ if (gfs2_assert_withdraw_delayed(sdp,
+ !tr->tr_num_buf_new && !tr->tr_num_databuf_new))
+ goto out;
}
if (unlikely(state == SFS_FROZEN))
- gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
- gfs2_assert_withdraw(sdp,
- sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke);
+ if (gfs2_assert_withdraw_delayed(sdp, !sdp->sd_log_num_revoke))
+ goto out;
+ if (gfs2_assert_withdraw_delayed(sdp,
+ sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke))
+ goto out;
gfs2_ordered_write(sdp);
+ if (gfs2_withdrawn(sdp))
+ goto out;
lops_before_commit(sdp, tr);
+ if (gfs2_withdrawn(sdp))
+ goto out;
gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE);
+ if (gfs2_withdrawn(sdp))
+ goto out;
if (sdp->sd_log_head != sdp->sd_log_flush_head) {
log_flush_wait(sdp);
@@ -839,6 +946,8 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
trace_gfs2_log_blocks(sdp, -1);
log_write_header(sdp, flags);
}
+ if (gfs2_withdrawn(sdp))
+ goto out;
lops_after_commit(sdp, tr);
gfs2_log_lock(sdp);
@@ -859,9 +968,11 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
for (;;) {
gfs2_ail1_start(sdp);
gfs2_ail1_wait(sdp);
- if (gfs2_ail1_empty(sdp))
+ if (gfs2_ail1_empty(sdp, 0))
break;
}
+ if (gfs2_withdrawn(sdp))
+ goto out;
atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
trace_gfs2_log_blocks(sdp, -1);
log_write_header(sdp, flags);
@@ -874,6 +985,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
}
+out:
+ if (gfs2_withdrawn(sdp)) {
+ ail_drain(sdp); /* frees all transactions */
+ tr = NULL;
+ }
+
trace_gfs2_log_flush(sdp, 0, flags);
up_write(&sdp->sd_log_flush_lock);
@@ -1016,16 +1133,17 @@ int gfs2_logd(void *data)
/* Check for errors writing to the journal */
if (sdp->sd_log_error) {
- gfs2_lm_withdraw(sdp,
- "GFS2: fsid=%s: error %d: "
- "withdrawing the file system to "
- "prevent further damage.\n",
- sdp->sd_fsname, sdp->sd_log_error);
+ gfs2_lm(sdp,
+ "GFS2: fsid=%s: error %d: "
+ "withdrawing the file system to "
+ "prevent further damage.\n",
+ sdp->sd_fsname, sdp->sd_log_error);
+ gfs2_withdraw(sdp);
}
did_flush = false;
if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
- gfs2_ail1_empty(sdp);
+ gfs2_ail1_empty(sdp, 0);
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_LOGD_JFLUSH_REQD);
did_flush = true;
@@ -1034,7 +1152,7 @@ int gfs2_logd(void *data)
if (gfs2_ail_flush_reqd(sdp)) {
gfs2_ail1_start(sdp);
gfs2_ail1_wait(sdp);
- gfs2_ail1_empty(sdp);
+ gfs2_ail1_empty(sdp, 0);
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_LOGD_AIL_FLUSH_REQD);
did_flush = true;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index c0a65e5a126b..c1cd6ae17659 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -73,6 +73,7 @@ extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
u32 type);
extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
+extern void log_flush_wait(struct gfs2_sbd *sdp);
extern int gfs2_logd(void *data);
extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index c090d5ad3f22..5ea96757afc4 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -203,8 +203,12 @@ static void gfs2_end_log_write(struct bio *bio)
struct bvec_iter_all iter_all;
if (bio->bi_status) {
- fs_err(sdp, "Error %d writing to journal, jid=%u\n",
- bio->bi_status, sdp->sd_jdesc->jd_jid);
+ if (!cmpxchg(&sdp->sd_log_error, 0, (int)bio->bi_status))
+ fs_err(sdp, "Error %d writing to journal, jid=%u\n",
+ bio->bi_status, sdp->sd_jdesc->jd_jid);
+ gfs2_withdraw_delayed(sdp);
+ /* prevent more writes to the journal */
+ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
wake_up(&sdp->sd_logd_waitq);
}
@@ -730,7 +734,7 @@ static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
head = &tr->tr_buf;
while (!list_empty(head)) {
- bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
+ bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
list_del_init(&bd->bd_list);
gfs2_unpin(sdp, bd->bd_bh, tr);
}
@@ -900,7 +904,7 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
struct gfs2_glock *gl;
while (!list_empty(head)) {
- bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
+ bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
list_del_init(&bd->bd_list);
gl = bd->bd_gl;
gfs2_glock_remove_revoke(gl);
@@ -1079,7 +1083,7 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
head = &tr->tr_databuf;
while (!list_empty(head)) {
- bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
+ bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
list_del_init(&bd->bd_list);
gfs2_unpin(sdp, bd->bd_bh, tr);
}
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0c3772974030..4b72abcf83b2 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -251,7 +251,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
struct buffer_head *bh, *bhs[2];
int num = 0;
- if (unlikely(gfs2_withdrawn(sdp))) {
+ if (unlikely(gfs2_withdrawn(sdp)) &&
+ (!sdp->sd_jdesc || (blkno != sdp->sd_jdesc->jd_no_addr))) {
*bhp = NULL;
return -EIO;
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a1a8ef7ed3fd..e2b69ffcc6a8 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -552,6 +552,8 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
mutex_lock(&sdp->sd_jindex_mutex);
for (;;) {
+ struct gfs2_inode *jip;
+
error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
if (error)
break;
@@ -591,6 +593,8 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
spin_lock(&sdp->sd_jindex_spin);
jd->jd_jid = sdp->sd_journals++;
+ jip = GFS2_I(jd->jd_inode);
+ jd->jd_no_addr = jip->i_no_addr;
list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
spin_unlock(&sdp->sd_jindex_spin);
}
@@ -600,48 +604,6 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
return error;
}
-/**
- * check_journal_clean - Make sure a journal is clean for a spectator mount
- * @sdp: The GFS2 superblock
- * @jd: The journal descriptor
- *
- * Returns: 0 if the journal is clean or locked, else an error
- */
-static int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
-{
- int error;
- struct gfs2_holder j_gh;
- struct gfs2_log_header_host head;
- struct gfs2_inode *ip;
-
- ip = GFS2_I(jd->jd_inode);
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP |
- GL_EXACT | GL_NOCACHE, &j_gh);
- if (error) {
- fs_err(sdp, "Error locking journal for spectator mount.\n");
- return -EPERM;
- }
- error = gfs2_jdesc_check(jd);
- if (error) {
- fs_err(sdp, "Error checking journal for spectator mount.\n");
- goto out_unlock;
- }
- error = gfs2_find_jhead(jd, &head, false);
- if (error) {
- fs_err(sdp, "Error parsing journal for spectator mount.\n");
- goto out_unlock;
- }
- if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
- error = -EPERM;
- fs_err(sdp, "jid=%u: Journal is dirty, so the first mounter "
- "must not be a spectator.\n", jd->jd_jid);
- }
-
-out_unlock:
- gfs2_glock_dq_uninit(&j_gh);
- return error;
-}
-
static int init_journal(struct gfs2_sbd *sdp, int undo)
{
struct inode *master = d_inode(sdp->sd_master_dir);
@@ -694,7 +656,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid,
&gfs2_journal_glops,
- LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
+ LM_ST_EXCLUSIVE,
+ LM_FLAG_NOEXP | GL_NOCACHE,
&sdp->sd_journal_gh);
if (error) {
fs_err(sdp, "can't acquire journal glock: %d\n", error);
@@ -702,6 +665,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
}
ip = GFS2_I(sdp->sd_jdesc->jd_inode);
+ sdp->sd_jinode_gl = ip->i_gl;
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE,
&sdp->sd_jinode_gh);
@@ -732,7 +696,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
struct gfs2_jdesc *jd = gfs2_jdesc_find(sdp, x);
if (sdp->sd_args.ar_spectator) {
- error = check_journal_clean(sdp, jd);
+ error = check_journal_clean(sdp, jd, true);
if (error)
goto fail_jinode_gh;
continue;
@@ -762,10 +726,13 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
return 0;
fail_jinode_gh:
- if (!sdp->sd_args.ar_spectator)
+ /* A withdraw may have done dq/uninit so now we need to check it */
+ if (!sdp->sd_args.ar_spectator &&
+ gfs2_holder_initialized(&sdp->sd_jinode_gh))
gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
fail_journal_gh:
- if (!sdp->sd_args.ar_spectator)
+ if (!sdp->sd_args.ar_spectator &&
+ gfs2_holder_initialized(&sdp->sd_journal_gh))
gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
fail_jindex:
gfs2_jindex_free(sdp);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e9f93045eb01..cc0c4b5800be 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -115,7 +115,7 @@ static void gfs2_qd_dispose(struct list_head *list)
struct gfs2_sbd *sdp;
while (!list_empty(list)) {
- qd = list_entry(list->next, struct gfs2_quota_data, qd_lru);
+ qd = list_first_entry(list, struct gfs2_quota_data, qd_lru);
sdp = qd->qd_gl->gl_name.ln_sbd;
list_del(&qd->qd_lru);
@@ -525,11 +525,11 @@ static void qdsb_put(struct gfs2_quota_data *qd)
}
/**
- * gfs2_qa_alloc - make sure we have a quota allocations data structure,
- * if necessary
+ * gfs2_qa_get - make sure we have a quota allocations data structure,
+ * if necessary
* @ip: the inode for this reservation
*/
-int gfs2_qa_alloc(struct gfs2_inode *ip)
+int gfs2_qa_get(struct gfs2_inode *ip)
{
int error = 0;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
@@ -540,17 +540,21 @@ int gfs2_qa_alloc(struct gfs2_inode *ip)
down_write(&ip->i_rw_mutex);
if (ip->i_qadata == NULL) {
ip->i_qadata = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS);
- if (!ip->i_qadata)
+ if (!ip->i_qadata) {
error = -ENOMEM;
+ goto out;
+ }
}
+ ip->i_qadata->qa_ref++;
+out:
up_write(&ip->i_rw_mutex);
return error;
}
-void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount)
+void gfs2_qa_put(struct gfs2_inode *ip)
{
down_write(&ip->i_rw_mutex);
- if (ip->i_qadata && ((wcount == NULL) || (atomic_read(wcount) <= 1))) {
+ if (ip->i_qadata && --ip->i_qadata->qa_ref == 0) {
kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata);
ip->i_qadata = NULL;
}
@@ -566,27 +570,27 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
- if (ip->i_qadata == NULL) {
- error = gfs2_rsqa_alloc(ip);
- if (error)
- return error;
- }
+ error = gfs2_qa_get(ip);
+ if (error)
+ return error;
qd = ip->i_qadata->qa_qd;
if (gfs2_assert_warn(sdp, !ip->i_qadata->qa_qd_num) ||
- gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
- return -EIO;
+ gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) {
+ error = -EIO;
+ goto out;
+ }
error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd);
if (error)
- goto out;
+ goto out_unhold;
ip->i_qadata->qa_qd_num++;
qd++;
error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd);
if (error)
- goto out;
+ goto out_unhold;
ip->i_qadata->qa_qd_num++;
qd++;
@@ -594,7 +598,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
!uid_eq(uid, ip->i_inode.i_uid)) {
error = qdsb_get(sdp, make_kqid_uid(uid), qd);
if (error)
- goto out;
+ goto out_unhold;
ip->i_qadata->qa_qd_num++;
qd++;
}
@@ -603,14 +607,15 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
!gid_eq(gid, ip->i_inode.i_gid)) {
error = qdsb_get(sdp, make_kqid_gid(gid), qd);
if (error)
- goto out;
+ goto out_unhold;
ip->i_qadata->qa_qd_num++;
qd++;
}
-out:
+out_unhold:
if (error)
gfs2_quota_unhold(ip);
+out:
return error;
}
@@ -621,6 +626,7 @@ void gfs2_quota_unhold(struct gfs2_inode *ip)
if (ip->i_qadata == NULL)
return;
+
gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
@@ -628,6 +634,7 @@ void gfs2_quota_unhold(struct gfs2_inode *ip)
ip->i_qadata->qa_qd[x] = NULL;
}
ip->i_qadata->qa_qd_num = 0;
+ gfs2_qa_put(ip);
}
static int sort_qd(const void *a, const void *b)
@@ -876,7 +883,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
unsigned int nalloc = 0, blocks;
int error;
- error = gfs2_rsqa_alloc(ip);
+ error = gfs2_qa_get(ip);
if (error)
return error;
@@ -884,8 +891,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
&data_blocks, &ind_blocks);
ghs = kmalloc_array(num_qd, sizeof(struct gfs2_holder), GFP_NOFS);
- if (!ghs)
- return -ENOMEM;
+ if (!ghs) {
+ error = -ENOMEM;
+ goto out;
+ }
sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
inode_lock(&ip->i_inode);
@@ -893,12 +902,12 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
GL_NOCACHE, &ghs[qx]);
if (error)
- goto out;
+ goto out_dq;
}
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
if (error)
- goto out;
+ goto out_dq;
for (x = 0; x < num_qd; x++) {
offset = qd2offset(qda[x]);
@@ -950,13 +959,15 @@ out_ipres:
gfs2_inplace_release(ip);
out_alloc:
gfs2_glock_dq_uninit(&i_gh);
-out:
+out_dq:
while (qx--)
gfs2_glock_dq_uninit(&ghs[qx]);
inode_unlock(&ip->i_inode);
kfree(ghs);
gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl,
GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_DO_SYNC);
+out:
+ gfs2_qa_put(ip);
return error;
}
@@ -1259,6 +1270,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
if (ip->i_diskflags & GFS2_DIF_SYSTEM)
return;
+ BUG_ON(ip->i_qadata->qa_ref <= 0);
for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
qd = ip->i_qadata->qa_qd[x];
@@ -1441,7 +1453,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
spin_lock(&qd_lock);
while (!list_empty(head)) {
- qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
+ qd = list_last_entry(head, struct gfs2_quota_data, qd_list);
list_del(&qd->qd_list);
@@ -1476,8 +1488,8 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
if (error == 0 || error == -EROFS)
return;
if (!gfs2_withdrawn(sdp)) {
- fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
- sdp->sd_log_error = error;
+ if (!cmpxchg(&sdp->sd_log_error, 0, error))
+ fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
wake_up(&sdp->sd_logd_waitq);
}
}
@@ -1504,7 +1516,7 @@ static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
ip = NULL;
spin_lock(&sdp->sd_trunc_lock);
if (!list_empty(&sdp->sd_trunc_list)) {
- ip = list_entry(sdp->sd_trunc_list.next,
+ ip = list_first_entry(&sdp->sd_trunc_list,
struct gfs2_inode, i_trunc_list);
list_del_init(&ip->i_trunc_list);
}
@@ -1541,6 +1553,8 @@ int gfs2_quotad(void *data)
while (!kthread_should_stop()) {
+ if (gfs2_withdrawn(sdp))
+ goto bypass;
/* Update the master statfs file */
if (sdp->sd_statfs_force_sync) {
int error = gfs2_statfs_sync(sdp->sd_vfs, 0);
@@ -1561,6 +1575,7 @@ int gfs2_quotad(void *data)
try_to_freeze();
+bypass:
t = min(quotad_timeo, statfs_timeo);
prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
@@ -1674,7 +1689,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
if (error)
return error;
- error = gfs2_rsqa_alloc(ip);
+ error = gfs2_qa_get(ip);
if (error)
goto out_put;
@@ -1743,6 +1758,7 @@ out_i:
out_q:
gfs2_glock_dq_uninit(&q_gh);
out_unlockput:
+ gfs2_qa_put(ip);
inode_unlock(&ip->i_inode);
out_put:
qd_put(qd);
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 765627d9a91e..7f9ca8ef40fc 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -15,8 +15,8 @@ struct gfs2_sbd;
#define NO_UID_QUOTA_CHANGE INVALID_UID
#define NO_GID_QUOTA_CHANGE INVALID_GID
-extern int gfs2_qa_alloc(struct gfs2_inode *ip);
-extern void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount);
+extern int gfs2_qa_get(struct gfs2_inode *ip);
+extern void gfs2_qa_put(struct gfs2_inode *ip);
extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
extern void gfs2_quota_unhold(struct gfs2_inode *ip);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 85f830e56945..96c345f49273 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -111,7 +111,7 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd)
struct gfs2_revoke_replay *rr;
while (!list_empty(head)) {
- rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list);
+ rr = list_first_entry(head, struct gfs2_revoke_replay, rr_list);
list_del(&rr->rr_list);
kfree(rr);
}
@@ -305,6 +305,11 @@ void gfs2_recover_func(struct work_struct *work)
int error = 0;
int jlocked = 0;
+ if (gfs2_withdrawn(sdp)) {
+ fs_err(sdp, "jid=%u: Recovery not attempted due to withdraw.\n",
+ jd->jd_jid);
+ goto fail;
+ }
t_start = ktime_get();
if (sdp->sd_args.ar_spectator)
goto fail;
@@ -393,6 +398,10 @@ void gfs2_recover_func(struct work_struct *work)
fs_info(sdp, "jid=%u: Replaying journal...0x%x to 0x%x\n",
jd->jd_jid, head.lh_tail, head.lh_blkno);
+ /* We take the sd_log_flush_lock here primarily to prevent log
+ * flushes and simultaneous journal replays from stomping on
+ * each other wrt sd_log_bio. */
+ down_read(&sdp->sd_log_flush_lock);
for (pass = 0; pass < 2; pass++) {
lops_before_scan(jd, &head, pass);
error = foreach_descriptor(jd, head.lh_tail,
@@ -403,6 +412,7 @@ void gfs2_recover_func(struct work_struct *work)
}
clean_journal(jd, &head);
+ up_read(&sdp->sd_log_flush_lock);
gfs2_glock_dq_uninit(&thaw_gh);
t_rep = ktime_get();
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index e7bf91ec231c..a321c34e3d6e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -457,24 +457,24 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
}
if (count[0] != rgd->rd_free) {
- if (gfs2_consist_rgrpd(rgd))
- fs_err(sdp, "free data mismatch: %u != %u\n",
- count[0], rgd->rd_free);
+ gfs2_lm(sdp, "free data mismatch: %u != %u\n",
+ count[0], rgd->rd_free);
+ gfs2_consist_rgrpd(rgd);
return;
}
tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
if (count[1] != tmp) {
- if (gfs2_consist_rgrpd(rgd))
- fs_err(sdp, "used data mismatch: %u != %u\n",
- count[1], tmp);
+ gfs2_lm(sdp, "used data mismatch: %u != %u\n",
+ count[1], tmp);
+ gfs2_consist_rgrpd(rgd);
return;
}
if (count[2] + count[3] != rgd->rd_dinodes) {
- if (gfs2_consist_rgrpd(rgd))
- fs_err(sdp, "used metadata mismatch: %u != %u\n",
- count[2] + count[3], rgd->rd_dinodes);
+ gfs2_lm(sdp, "used metadata mismatch: %u != %u\n",
+ count[2] + count[3], rgd->rd_dinodes);
+ gfs2_consist_rgrpd(rgd);
return;
}
}
@@ -590,16 +590,6 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
}
}
-/**
- * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode
- * plus a quota allocations data structure, if necessary
- * @ip: the inode for this reservation
- */
-int gfs2_rsqa_alloc(struct gfs2_inode *ip)
-{
- return gfs2_qa_alloc(ip);
-}
-
static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
const char *fs_id_buf)
{
@@ -672,18 +662,17 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
}
/**
- * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
+ * gfs2_rs_delete - delete a multi-block reservation
* @ip: The inode for this reservation
* @wcount: The inode's write count, or NULL
*
*/
-void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
+void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
{
down_write(&ip->i_rw_mutex);
if ((wcount == NULL) || (atomic_read(wcount) <= 1))
gfs2_rs_deltree(&ip->i_res);
up_write(&ip->i_rw_mutex);
- gfs2_qa_delete(ip, wcount);
}
/**
@@ -720,8 +709,12 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
rb_erase(n, &sdp->sd_rindex_tree);
if (gl) {
- glock_clear_object(gl, rgd);
+ if (gl->gl_state != LM_ST_UNLOCKED) {
+ gfs2_glock_cb(gl, LM_ST_UNLOCKED);
+ flush_delayed_work(&gl->gl_work);
+ }
gfs2_rgrp_brelse(rgd);
+ glock_clear_object(gl, rgd);
gfs2_glock_put(gl);
}
@@ -733,17 +726,6 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
}
}
-static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
-{
- struct gfs2_sbd *sdp = rgd->rd_sbd;
-
- fs_info(sdp, "ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
- fs_info(sdp, "ri_length = %u\n", rgd->rd_length);
- fs_info(sdp, "ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
- fs_info(sdp, "ri_data = %u\n", rgd->rd_data);
- fs_info(sdp, "ri_bitbytes = %u\n", rgd->rd_bitbytes);
-}
-
/**
* gfs2_compute_bitstructs - Compute the bitmap sizes
* @rgd: The resource group descriptor
@@ -814,11 +796,20 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
}
bi = rgd->rd_bits + (length - 1);
if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
- if (gfs2_consist_rgrpd(rgd)) {
- gfs2_rindex_print(rgd);
- fs_err(sdp, "start=%u len=%u offset=%u\n",
- bi->bi_start, bi->bi_bytes, bi->bi_offset);
- }
+ gfs2_lm(sdp,
+ "ri_addr = %llu\n"
+ "ri_length = %u\n"
+ "ri_data0 = %llu\n"
+ "ri_data = %u\n"
+ "ri_bitbytes = %u\n"
+ "start=%u len=%u offset=%u\n",
+ (unsigned long long)rgd->rd_addr,
+ rgd->rd_length,
+ (unsigned long long)rgd->rd_data0,
+ rgd->rd_data,
+ rgd->rd_bitbytes,
+ bi->bi_start, bi->bi_bytes, bi->bi_offset);
+ gfs2_consist_rgrpd(rgd);
return -EIO;
}
@@ -1286,23 +1277,6 @@ void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
bi->bi_bh = NULL;
}
}
-
-}
-
-/**
- * gfs2_rgrp_go_unlock - Unlock a rgrp glock
- * @gh: The glock holder for the resource group
- *
- */
-
-void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
-{
- struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
- int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) |
- test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags);
-
- if (rgd && demote_requested)
- gfs2_rgrp_brelse(rgd);
}
int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
@@ -1832,10 +1806,8 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
while (1) {
- down_write(&sdp->sd_log_flush_lock);
error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
true);
- up_write(&sdp->sd_log_flush_lock);
if (error == -ENOSPC)
break;
if (WARN_ON_ONCE(error))
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index c14a673ae36f..a1d7e14fc55b 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -33,7 +33,6 @@ extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
-extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
@@ -45,9 +44,8 @@ extern void gfs2_inplace_release(struct gfs2_inode *ip);
extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
bool dinode, u64 *generation);
-extern int gfs2_rsqa_alloc(struct gfs2_inode *ip);
extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount);
+extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount);
extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
u64 bstart, u32 blen, int meta);
extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 68cc7c291a81..37fc41632aa2 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -61,11 +61,13 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp)
sdp->sd_journals = 0;
spin_unlock(&sdp->sd_jindex_spin);
+ sdp->sd_jdesc = NULL;
while (!list_empty(&list)) {
- jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
+ jd = list_first_entry(&list, struct gfs2_jdesc, jd_list);
gfs2_free_journal_extents(jd);
list_del(&jd->jd_list);
iput(jd->jd_inode);
+ jd->jd_inode = NULL;
kfree(jd);
}
}
@@ -171,9 +173,13 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
goto fail_threads;
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
+ if (gfs2_withdrawn(sdp)) {
+ error = -EIO;
+ goto fail;
+ }
error = gfs2_find_jhead(sdp->sd_jdesc, &head, false);
- if (error)
+ if (error || gfs2_withdrawn(sdp))
goto fail;
if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
@@ -187,7 +193,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
gfs2_log_pointers_init(sdp, head.lh_blkno);
error = gfs2_quota_init(sdp);
- if (error)
+ if (error || gfs2_withdrawn(sdp))
goto fail;
set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
@@ -446,7 +452,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp)
out:
while (!list_empty(&list)) {
- lfcc = list_entry(list.next, struct lfcc, list);
+ lfcc = list_first_entry(&list, struct lfcc, list);
list_del(&lfcc->list);
gfs2_glock_dq_uninit(&lfcc->gh);
kfree(lfcc);
@@ -599,34 +605,63 @@ out:
int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
{
struct gfs2_holder freeze_gh;
- int error;
-
- error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE,
- &freeze_gh);
- if (error && !gfs2_withdrawn(sdp))
- return error;
+ int error = 0;
+ int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
+ gfs2_holder_mark_uninitialized(&freeze_gh);
+ if (sdp->sd_freeze_gl &&
+ !gfs2_glock_is_locked_by_me(sdp->sd_freeze_gl)) {
+ if (!log_write_allowed) {
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl,
+ LM_ST_SHARED, GL_NOCACHE |
+ LM_FLAG_TRY, &freeze_gh);
+ if (error == GLR_TRYFAILED)
+ error = 0;
+ } else {
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl,
+ LM_ST_SHARED, GL_NOCACHE,
+ &freeze_gh);
+ if (error && !gfs2_withdrawn(sdp))
+ return error;
+ }
+ }
flush_workqueue(gfs2_delete_workqueue);
- if (sdp->sd_quotad_process)
+ if (!log_write_allowed && current == sdp->sd_quotad_process)
+ fs_warn(sdp, "The quotad daemon is withdrawing.\n");
+ else if (sdp->sd_quotad_process)
kthread_stop(sdp->sd_quotad_process);
sdp->sd_quotad_process = NULL;
- if (sdp->sd_logd_process)
+
+ if (!log_write_allowed && current == sdp->sd_logd_process)
+ fs_warn(sdp, "The logd daemon is withdrawing.\n");
+ else if (sdp->sd_logd_process)
kthread_stop(sdp->sd_logd_process);
sdp->sd_logd_process = NULL;
- gfs2_quota_sync(sdp->sd_vfs, 0);
- gfs2_statfs_sync(sdp->sd_vfs, 0);
-
- gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
- GFS2_LFC_MAKE_FS_RO);
- wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0);
- gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
+ if (log_write_allowed) {
+ gfs2_quota_sync(sdp->sd_vfs, 0);
+ gfs2_statfs_sync(sdp->sd_vfs, 0);
+ gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
+ GFS2_LFC_MAKE_FS_RO);
+ wait_event(sdp->sd_reserving_log_wait,
+ atomic_read(&sdp->sd_reserving_log) == 0);
+ gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) ==
+ sdp->sd_jdesc->jd_blocks);
+ } else {
+ wait_event_timeout(sdp->sd_reserving_log_wait,
+ atomic_read(&sdp->sd_reserving_log) == 0,
+ HZ * 5);
+ }
if (gfs2_holder_initialized(&freeze_gh))
gfs2_glock_dq_uninit(&freeze_gh);
gfs2_quota_cleanup(sdp);
+ if (!log_write_allowed)
+ sdp->sd_vfs->s_flags |= SB_RDONLY;
+
return error;
}
@@ -677,8 +712,10 @@ restart:
gfs2_glock_put(sdp->sd_freeze_gl);
if (!sdp->sd_args.ar_spectator) {
- gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
- gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+ if (gfs2_holder_initialized(&sdp->sd_journal_gh))
+ gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+ if (gfs2_holder_initialized(&sdp->sd_jinode_gh))
+ gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
iput(sdp->sd_sc_inode);
@@ -1356,14 +1393,6 @@ out_unlock:
if (gfs2_rs_active(&ip->i_res))
gfs2_rs_deltree(&ip->i_res);
- if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
- glock_clear_object(ip->i_iopen_gh.gh_gl, ip);
- if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq(&ip->i_iopen_gh);
- }
- gfs2_holder_uninit(&ip->i_iopen_gh);
- }
if (gfs2_holder_initialized(&gh)) {
glock_clear_object(ip->i_gl, ip);
gfs2_glock_dq_uninit(&gh);
@@ -1372,22 +1401,30 @@ out_unlock:
fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
out:
truncate_inode_pages_final(&inode->i_data);
- gfs2_rsqa_delete(ip, NULL);
+ if (ip->i_qadata)
+ gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0);
+ gfs2_rs_delete(ip, NULL);
+ gfs2_qa_put(ip);
gfs2_ordered_del_inode(ip);
clear_inode(inode);
gfs2_dir_hash_inval(ip);
- glock_clear_object(ip->i_gl, ip);
- wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
- gfs2_glock_add_to_lru(ip->i_gl);
- gfs2_glock_put_eventually(ip->i_gl);
- ip->i_gl = NULL;
+ if (ip->i_gl) {
+ glock_clear_object(ip->i_gl, ip);
+ wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
+ gfs2_glock_add_to_lru(ip->i_gl);
+ gfs2_glock_put_eventually(ip->i_gl);
+ ip->i_gl = NULL;
+ }
if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
glock_clear_object(gl, ip);
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+ if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq(&ip->i_iopen_gh);
+ }
gfs2_glock_hold(gl);
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_holder_uninit(&ip->i_iopen_gh);
gfs2_glock_put_eventually(gl);
}
}
@@ -1401,6 +1438,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
return NULL;
ip->i_flags = 0;
ip->i_gl = NULL;
+ gfs2_holder_mark_uninitialized(&ip->i_iopen_gh);
memset(&ip->i_res, 0, sizeof(ip->i_res));
RB_CLEAR_NODE(&ip->i_res.rs_node);
ip->i_rahead = 0;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index b8bf811a1305..51900554ed81 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -26,7 +26,6 @@ extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
-
extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
struct gfs2_inode **ipp);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 8ccb68f4ed16..d28c41bd69b0 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -136,7 +136,8 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
if (val != 1)
return -EINVAL;
- gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n");
+ gfs2_lm(sdp, "withdrawing from cluster at user's request\n");
+ gfs2_withdraw(sdp);
return len;
}
@@ -434,6 +435,8 @@ int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid)
* never clear the DFL_BLOCK_LOCKS flag, so all our locks would
* permanently stop working.
*/
+ if (!sdp->sd_jdesc)
+ goto out;
if (sdp->sd_jdesc->jd_jid == jid && !sdp->sd_args.ar_spectator)
goto out;
rv = -ENOENT;
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index a685637a5b55..ffe840505082 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -228,6 +228,10 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
fs_info(sdp, "GFS2:adding buf while frozen\n");
gfs2_assert_withdraw(sdp, 0);
}
+ if (unlikely(gfs2_withdrawn(sdp))) {
+ fs_info(sdp, "GFS2:adding buf while withdrawn! 0x%llx\n",
+ (unsigned long long)bd->bd_bh->b_blocknr);
+ }
gfs2_pin(sdp, bd->bd_bh);
mh->__pad0 = cpu_to_be64(0);
mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index ec600b487498..9b64d40ab379 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -11,12 +11,18 @@
#include <linux/buffer_head.h>
#include <linux/crc32.h>
#include <linux/gfs2_ondisk.h>
+#include <linux/delay.h>
#include <linux/uaccess.h>
#include "gfs2.h"
#include "incore.h"
#include "glock.h"
+#include "glops.h"
+#include "log.h"
+#include "lops.h"
+#include "recovery.h"
#include "rgrp.h"
+#include "super.h"
#include "util.h"
struct kmem_cache *gfs2_glock_cachep __read_mostly;
@@ -33,32 +39,257 @@ void gfs2_assert_i(struct gfs2_sbd *sdp)
fs_emerg(sdp, "fatal assertion failed\n");
}
-int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
+/**
+ * check_journal_clean - Make sure a journal is clean for a spectator mount
+ * @sdp: The GFS2 superblock
+ * @jd: The journal descriptor
+ *
+ * Returns: 0 if the journal is clean or locked, else an error
+ */
+int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+ bool verbose)
+{
+ int error;
+ struct gfs2_holder j_gh;
+ struct gfs2_log_header_host head;
+ struct gfs2_inode *ip;
+
+ ip = GFS2_I(jd->jd_inode);
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP |
+ GL_EXACT | GL_NOCACHE, &j_gh);
+ if (error) {
+ if (verbose)
+ fs_err(sdp, "Error %d locking journal for spectator "
+ "mount.\n", error);
+ return -EPERM;
+ }
+ error = gfs2_jdesc_check(jd);
+ if (error) {
+ if (verbose)
+ fs_err(sdp, "Error checking journal for spectator "
+ "mount.\n");
+ goto out_unlock;
+ }
+ error = gfs2_find_jhead(jd, &head, false);
+ if (error) {
+ if (verbose)
+ fs_err(sdp, "Error parsing journal for spectator "
+ "mount.\n");
+ goto out_unlock;
+ }
+ if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+ error = -EPERM;
+ if (verbose)
+ fs_err(sdp, "jid=%u: Journal is dirty, so the first "
+ "mounter must not be a spectator.\n",
+ jd->jd_jid);
+ }
+
+out_unlock:
+ gfs2_glock_dq_uninit(&j_gh);
+ return error;
+}
+
+static void signal_our_withdraw(struct gfs2_sbd *sdp)
+{
+ struct gfs2_glock *gl = sdp->sd_live_gh.gh_gl;
+ struct inode *inode = sdp->sd_jdesc->jd_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ u64 no_formal_ino = ip->i_no_formal_ino;
+ int ret = 0;
+ int tries;
+
+ if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
+ return;
+
+ /* Prevent any glock dq until withdraw recovery is complete */
+ set_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
+ /*
+ * Don't tell dlm we're bailing until we have no more buffers in the
+ * wind. If journal had an IO error, the log code should just purge
+ * the outstanding buffers rather than submitting new IO. Making the
+ * file system read-only will flush the journal, etc.
+ *
+ * During a normal unmount, gfs2_make_fs_ro calls gfs2_log_shutdown
+ * which clears SDF_JOURNAL_LIVE. In a withdraw, we must not write
+ * any UNMOUNT log header, so we can't call gfs2_log_shutdown, and
+ * therefore we need to clear SDF_JOURNAL_LIVE manually.
+ */
+ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+ if (!sb_rdonly(sdp->sd_vfs))
+ ret = gfs2_make_fs_ro(sdp);
+
+ /*
+ * Drop the glock for our journal so another node can recover it.
+ */
+ if (gfs2_holder_initialized(&sdp->sd_journal_gh)) {
+ gfs2_glock_dq_wait(&sdp->sd_journal_gh);
+ gfs2_holder_uninit(&sdp->sd_journal_gh);
+ }
+ sdp->sd_jinode_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq(&sdp->sd_jinode_gh);
+ if (test_bit(SDF_FS_FROZEN, &sdp->sd_flags)) {
+ /* Make sure gfs2_unfreeze works if partially-frozen */
+ flush_workqueue(gfs2_freeze_wq);
+ atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
+ thaw_super(sdp->sd_vfs);
+ } else {
+ wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE);
+ }
+
+ /*
+ * holder_uninit to force glock_put, to force dlm to let go
+ */
+ gfs2_holder_uninit(&sdp->sd_jinode_gh);
+
+ /*
+ * Note: We need to be careful here:
+ * Our iput of jd_inode will evict it. The evict will dequeue its
+ * glock, but the glock dq will wait for the withdraw unless we have
+ * exception code in glock_dq.
+ */
+ iput(inode);
+ /*
+ * Wait until the journal inode's glock is freed. This allows try locks
+ * on other nodes to be successful, otherwise we remain the owner of
+ * the glock as far as dlm is concerned.
+ */
+ if (gl->gl_ops->go_free) {
+ set_bit(GLF_FREEING, &gl->gl_flags);
+ wait_on_bit(&gl->gl_flags, GLF_FREEING, TASK_UNINTERRUPTIBLE);
+ }
+
+ if (sdp->sd_lockstruct.ls_ops->lm_lock == NULL) { /* lock_nolock */
+ clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
+ goto skip_recovery;
+ }
+ /*
+ * Dequeue the "live" glock, but keep a reference so it's never freed.
+ */
+ gfs2_glock_hold(gl);
+ gfs2_glock_dq_wait(&sdp->sd_live_gh);
+ /*
+ * We enqueue the "live" glock in EX so that all other nodes
+ * get a demote request and act on it. We don't really want the
+ * lock in EX, so we send a "try" lock with 1CB to produce a callback.
+ */
+ fs_warn(sdp, "Requesting recovery of jid %d.\n",
+ sdp->sd_lockstruct.ls_jid);
+ gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | LM_FLAG_NOEXP,
+ &sdp->sd_live_gh);
+ msleep(GL_GLOCK_MAX_HOLD);
+ /*
+ * This will likely fail in a cluster, but succeed standalone:
+ */
+ ret = gfs2_glock_nq(&sdp->sd_live_gh);
+
+ /*
+ * If we actually got the "live" lock in EX mode, there are no other
+ * nodes available to replay our journal. So we try to replay it
+ * ourselves. We hold the "live" glock to prevent other mounters
+ * during recovery, then just dequeue it and reacquire it in our
+ * normal SH mode. Just in case the problem that caused us to
+ * withdraw prevents us from recovering our journal (e.g. io errors
+ * and such) we still check if the journal is clean before proceeding
+ * but we may wait forever until another mounter does the recovery.
+ */
+ if (ret == 0) {
+ fs_warn(sdp, "No other mounters found. Trying to recover our "
+ "own journal jid %d.\n", sdp->sd_lockstruct.ls_jid);
+ if (gfs2_recover_journal(sdp->sd_jdesc, 1))
+ fs_warn(sdp, "Unable to recover our journal jid %d.\n",
+ sdp->sd_lockstruct.ls_jid);
+ gfs2_glock_dq_wait(&sdp->sd_live_gh);
+ gfs2_holder_reinit(LM_ST_SHARED, LM_FLAG_NOEXP | GL_EXACT,
+ &sdp->sd_live_gh);
+ gfs2_glock_nq(&sdp->sd_live_gh);
+ }
+
+ gfs2_glock_queue_put(gl); /* drop the extra reference we acquired */
+ clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
+
+ /*
+ * At this point our journal is evicted, so we need to get a new inode
+ * for it. Once done, we need to call gfs2_find_jhead which
+ * calls gfs2_map_journal_extents to map it for us again.
+ *
+ * Note that we don't really want it to look up a FREE block. The
+ * GFS2_BLKST_FREE simply overrides a block check in gfs2_inode_lookup
+ * which would otherwise fail because it requires grabbing an rgrp
+ * glock, which would fail with -EIO because we're withdrawing.
+ */
+ inode = gfs2_inode_lookup(sdp->sd_vfs, DT_UNKNOWN,
+ sdp->sd_jdesc->jd_no_addr, no_formal_ino,
+ GFS2_BLKST_FREE);
+ if (IS_ERR(inode)) {
+ fs_warn(sdp, "Reprocessing of jid %d failed with %ld.\n",
+ sdp->sd_lockstruct.ls_jid, PTR_ERR(inode));
+ goto skip_recovery;
+ }
+ sdp->sd_jdesc->jd_inode = inode;
+
+ /*
+ * Now wait until recovery is complete.
+ */
+ for (tries = 0; tries < 10; tries++) {
+ ret = check_journal_clean(sdp, sdp->sd_jdesc, false);
+ if (!ret)
+ break;
+ msleep(HZ);
+ fs_warn(sdp, "Waiting for journal recovery jid %d.\n",
+ sdp->sd_lockstruct.ls_jid);
+ }
+skip_recovery:
+ if (!ret)
+ fs_warn(sdp, "Journal recovery complete for jid %d.\n",
+ sdp->sd_lockstruct.ls_jid);
+ else
+ fs_warn(sdp, "Journal recovery skipped for %d until next "
+ "mount.\n", sdp->sd_lockstruct.ls_jid);
+ fs_warn(sdp, "Glock dequeues delayed: %lu\n", sdp->sd_glock_dqs_held);
+ sdp->sd_glock_dqs_held = 0;
+ wake_up_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY);
+}
+
+void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...)
{
- struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- const struct lm_lockops *lm = ls->ls_ops;
- va_list args;
struct va_format vaf;
+ va_list args;
if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
- test_and_set_bit(SDF_WITHDRAWN, &sdp->sd_flags))
- return 0;
-
- if (fmt) {
- va_start(args, fmt);
+ test_bit(SDF_WITHDRAWN, &sdp->sd_flags))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ fs_err(sdp, "%pV", &vaf);
+ va_end(args);
+}
- vaf.fmt = fmt;
- vaf.va = &args;
+int gfs2_withdraw(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ const struct lm_lockops *lm = ls->ls_ops;
- fs_err(sdp, "%pV", &vaf);
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
+ test_and_set_bit(SDF_WITHDRAWN, &sdp->sd_flags)) {
+ if (!test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags))
+ return -1;
- va_end(args);
+ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_IN_PROG,
+ TASK_UNINTERRUPTIBLE);
+ return -1;
}
+ set_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags);
+
if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
fs_err(sdp, "about to withdraw this file system\n");
BUG_ON(sdp->sd_args.ar_debug);
+ signal_our_withdraw(sdp);
+
kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
@@ -69,8 +300,11 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
lm->lm_unmount(sdp);
}
set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
- fs_err(sdp, "withdrawn\n");
+ fs_err(sdp, "File system withdrawn\n");
dump_stack();
+ clear_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&sdp->sd_flags, SDF_WITHDRAW_IN_PROG);
}
if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
@@ -81,35 +315,45 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
/**
* gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
- * Returns: -1 if this call withdrew the machine,
- * -2 if it was already withdrawn
*/
-int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
- const char *function, char *file, unsigned int line)
+void gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
+ const char *function, char *file, unsigned int line,
+ bool delayed)
{
- int me;
- me = gfs2_lm_withdraw(sdp,
- "fatal: assertion \"%s\" failed\n"
- " function = %s, file = %s, line = %u\n",
- assertion, function, file, line);
+ if (gfs2_withdrawn(sdp))
+ return;
+
+ fs_err(sdp,
+ "fatal: assertion \"%s\" failed\n"
+ " function = %s, file = %s, line = %u\n",
+ assertion, function, file, line);
+
+ /*
+ * If errors=panic was specified on mount, it won't help to delay the
+ * withdraw.
+ */
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+ delayed = false;
+
+ if (delayed)
+ gfs2_withdraw_delayed(sdp);
+ else
+ gfs2_withdraw(sdp);
dump_stack();
- return (me) ? -1 : -2;
}
/**
* gfs2_assert_warn_i - Print a message to the console if @assertion is false
- * Returns: -1 if we printed something
- * -2 if we didn't
*/
-int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
- const char *function, char *file, unsigned int line)
+void gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
+ const char *function, char *file, unsigned int line)
{
if (time_before(jiffies,
sdp->sd_last_warning +
gfs2_tune_get(sdp, gt_complain_secs) * HZ))
- return -2;
+ return;
if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n",
@@ -127,69 +371,59 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
sdp->sd_fsname, function, file, line);
sdp->sd_last_warning = jiffies;
-
- return -1;
}
/**
* gfs2_consist_i - Flag a filesystem consistency error and withdraw
- * Returns: -1 if this call withdrew the machine,
- * 0 if it was already withdrawn
*/
-int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
- char *file, unsigned int line)
+void gfs2_consist_i(struct gfs2_sbd *sdp, const char *function,
+ char *file, unsigned int line)
{
- int rv;
- rv = gfs2_lm_withdraw(sdp,
- "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n",
- function, file, line);
- return rv;
+ gfs2_lm(sdp,
+ "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n",
+ function, file, line);
+ gfs2_withdraw(sdp);
}
/**
* gfs2_consist_inode_i - Flag an inode consistency error and withdraw
- * Returns: -1 if this call withdrew the machine,
- * 0 if it was already withdrawn
*/
-int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
- const char *function, char *file, unsigned int line)
+void gfs2_consist_inode_i(struct gfs2_inode *ip,
+ const char *function, char *file, unsigned int line)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- int rv;
- rv = gfs2_lm_withdraw(sdp,
- "fatal: filesystem consistency error\n"
- " inode = %llu %llu\n"
- " function = %s, file = %s, line = %u\n",
- (unsigned long long)ip->i_no_formal_ino,
- (unsigned long long)ip->i_no_addr,
- function, file, line);
- return rv;
+
+ gfs2_lm(sdp,
+ "fatal: filesystem consistency error\n"
+ " inode = %llu %llu\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)ip->i_no_formal_ino,
+ (unsigned long long)ip->i_no_addr,
+ function, file, line);
+ gfs2_withdraw(sdp);
}
/**
* gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
- * Returns: -1 if this call withdrew the machine,
- * 0 if it was already withdrawn
*/
-int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
- const char *function, char *file, unsigned int line)
+void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd,
+ const char *function, char *file, unsigned int line)
{
struct gfs2_sbd *sdp = rgd->rd_sbd;
char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
- int rv;
sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf);
- rv = gfs2_lm_withdraw(sdp,
- "fatal: filesystem consistency error\n"
- " RG = %llu\n"
- " function = %s, file = %s, line = %u\n",
- (unsigned long long)rgd->rd_addr,
- function, file, line);
- return rv;
+ gfs2_lm(sdp,
+ "fatal: filesystem consistency error\n"
+ " RG = %llu\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)rgd->rd_addr,
+ function, file, line);
+ gfs2_withdraw(sdp);
}
/**
@@ -203,12 +437,14 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
unsigned int line)
{
int me;
- me = gfs2_lm_withdraw(sdp,
- "fatal: invalid metadata block\n"
- " bh = %llu (%s)\n"
- " function = %s, file = %s, line = %u\n",
- (unsigned long long)bh->b_blocknr, type,
- function, file, line);
+
+ gfs2_lm(sdp,
+ "fatal: invalid metadata block\n"
+ " bh = %llu (%s)\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)bh->b_blocknr, type,
+ function, file, line);
+ me = gfs2_withdraw(sdp);
return (me) ? -1 : -2;
}
@@ -223,12 +459,14 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
char *file, unsigned int line)
{
int me;
- me = gfs2_lm_withdraw(sdp,
- "fatal: invalid metadata block\n"
- " bh = %llu (type: exp=%u, found=%u)\n"
- " function = %s, file = %s, line = %u\n",
- (unsigned long long)bh->b_blocknr, type, t,
- function, file, line);
+
+ gfs2_lm(sdp,
+ "fatal: invalid metadata block\n"
+ " bh = %llu (type: exp=%u, found=%u)\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)bh->b_blocknr, type, t,
+ function, file, line);
+ me = gfs2_withdraw(sdp);
return (me) ? -1 : -2;
}
@@ -241,12 +479,11 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
unsigned int line)
{
- int rv;
- rv = gfs2_lm_withdraw(sdp,
- "fatal: I/O error\n"
- " function = %s, file = %s, line = %u\n",
- function, file, line);
- return rv;
+ gfs2_lm(sdp,
+ "fatal: I/O error\n"
+ " function = %s, file = %s, line = %u\n",
+ function, file, line);
+ return gfs2_withdraw(sdp);
}
/**
@@ -258,14 +495,14 @@ void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
const char *function, char *file, unsigned int line,
bool withdraw)
{
- if (!gfs2_withdrawn(sdp))
- fs_err(sdp,
- "fatal: I/O error\n"
- " block = %llu\n"
- " function = %s, file = %s, line = %u\n",
- (unsigned long long)bh->b_blocknr,
- function, file, line);
+ if (gfs2_withdrawn(sdp))
+ return;
+
+ fs_err(sdp, "fatal: I/O error\n"
+ " block = %llu\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)bh->b_blocknr, function, file, line);
if (withdraw)
- gfs2_lm_withdraw(sdp, NULL);
+ gfs2_withdraw(sdp);
}
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index f2702bc9837c..a3542560da6f 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -36,41 +36,59 @@ do { \
} while (0)
-int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
- const char *function, char *file, unsigned int line);
+void gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
+ const char *function, char *file, unsigned int line,
+ bool delayed);
#define gfs2_assert_withdraw(sdp, assertion) \
-((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
- __func__, __FILE__, __LINE__))
-
-
-int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
- const char *function, char *file, unsigned int line);
+ ({ \
+ bool _bool = (assertion); \
+ if (unlikely(!_bool)) \
+ gfs2_assert_withdraw_i((sdp), #assertion, \
+ __func__, __FILE__, __LINE__, false); \
+ !_bool; \
+ })
+
+#define gfs2_assert_withdraw_delayed(sdp, assertion) \
+ ({ \
+ bool _bool = (assertion); \
+ if (unlikely(!_bool)) \
+ gfs2_assert_withdraw_i((sdp), #assertion, \
+ __func__, __FILE__, __LINE__, true); \
+ !_bool; \
+ })
+
+void gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
+ const char *function, char *file, unsigned int line);
#define gfs2_assert_warn(sdp, assertion) \
-((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
- __func__, __FILE__, __LINE__))
-
+ ({ \
+ bool _bool = (assertion); \
+ if (unlikely(!_bool)) \
+ gfs2_assert_warn_i((sdp), #assertion, \
+ __func__, __FILE__, __LINE__); \
+ !_bool; \
+ })
-int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
- const char *function, char *file, unsigned int line);
+void gfs2_consist_i(struct gfs2_sbd *sdp,
+ const char *function, char *file, unsigned int line);
#define gfs2_consist(sdp) \
-gfs2_consist_i((sdp), 0, __func__, __FILE__, __LINE__)
+gfs2_consist_i((sdp), __func__, __FILE__, __LINE__)
-int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
- const char *function, char *file, unsigned int line);
+void gfs2_consist_inode_i(struct gfs2_inode *ip,
+ const char *function, char *file, unsigned int line);
#define gfs2_consist_inode(ip) \
-gfs2_consist_inode_i((ip), 0, __func__, __FILE__, __LINE__)
+gfs2_consist_inode_i((ip), __func__, __FILE__, __LINE__)
-int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
- const char *function, char *file, unsigned int line);
+void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd,
+ const char *function, char *file, unsigned int line);
#define gfs2_consist_rgrpd(rgd) \
-gfs2_consist_rgrpd_i((rgd), 0, __func__, __FILE__, __LINE__)
+gfs2_consist_rgrpd_i((rgd), __func__, __FILE__, __LINE__)
int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
@@ -129,6 +147,9 @@ static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
char *file, unsigned int line);
+extern int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+ bool verbose);
+
#define gfs2_io_error(sdp) \
gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__);
@@ -165,18 +186,29 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
}
/**
+ * gfs2_withdraw_delayed - withdraw as soon as possible without deadlocks
+ * @sdp: the superblock
+ */
+static inline void gfs2_withdraw_delayed(struct gfs2_sbd *sdp)
+{
+ set_bit(SDF_WITHDRAWING, &sdp->sd_flags);
+}
+
+/**
* gfs2_withdrawn - test whether the file system is withdrawing or withdrawn
* @sdp: the superblock
*/
static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp)
{
- return test_bit(SDF_WITHDRAWN, &sdp->sd_flags);
+ return test_bit(SDF_WITHDRAWN, &sdp->sd_flags) ||
+ test_bit(SDF_WITHDRAWING, &sdp->sd_flags);
}
#define gfs2_tune_get(sdp, field) \
gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
__printf(2, 3)
-int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...);
+void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...);
+int gfs2_withdraw(struct gfs2_sbd *sdp);
#endif /* __UTIL_DOT_H__ */
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index bbe593d16bea..9d7667bc4292 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1222,7 +1222,7 @@ static int gfs2_xattr_set(const struct xattr_handler *handler,
struct gfs2_holder gh;
int ret;
- ret = gfs2_rsqa_alloc(ip);
+ ret = gfs2_qa_get(ip);
if (ret)
return ret;
@@ -1231,15 +1231,19 @@ static int gfs2_xattr_set(const struct xattr_handler *handler,
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
if (ret)
- return ret;
+ goto out;
} else {
- if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE))
- return -EIO;
+ if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE)) {
+ ret = -EIO;
+ goto out;
+ }
gfs2_holder_mark_uninitialized(&gh);
}
ret = __gfs2_xattr_set(inode, name, value, size, flags, handler->flags);
if (gfs2_holder_initialized(&gh))
gfs2_glock_dq_uninit(&gh);
+out:
+ gfs2_qa_put(ip);
return ret;
}
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index e6d554476db4..eeebe80c6be4 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -292,6 +292,10 @@ static int __hfsplus_delete_attr(struct inode *inode, u32 cnid,
return -ENOENT;
}
+ /* Avoid btree corruption */
+ hfs_bnode_read(fd->bnode, fd->search_key,
+ fd->keyoffset, fd->keylength);
+
err = hfs_brec_remove(fd);
if (err)
return err;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index e6b8c49076bb..c070c0d8e3e9 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -139,8 +139,8 @@ static char *inode_name(struct inode *ino)
static char *follow_link(char *link)
{
- int len, n;
char *name, *resolved, *end;
+ int n;
name = __getname();
if (!name) {
@@ -164,15 +164,13 @@ static char *follow_link(char *link)
return name;
*(end + 1) = '\0';
- len = strlen(link) + strlen(name) + 1;
- resolved = kmalloc(len, GFP_KERNEL);
+ resolved = kasprintf(GFP_KERNEL, "%s%s", link, name);
if (resolved == NULL) {
n = -ENOMEM;
goto out_free;
}
- sprintf(resolved, "%s%s", link, name);
__putname(name);
kfree(link);
return resolved;
@@ -921,18 +919,16 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
sb->s_d_op = &simple_dentry_operations;
sb->s_maxbytes = MAX_LFS_FILESIZE;
- /* NULL is printed as <NULL> by sprintf: avoid that. */
+ /* NULL is printed as '(null)' by printf(): avoid that. */
if (req_root == NULL)
req_root = "";
err = -ENOMEM;
sb->s_fs_info = host_root_path =
- kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL);
+ kasprintf(GFP_KERNEL, "%s/%s", root_ino, req_root);
if (host_root_path == NULL)
goto out;
- sprintf(host_root_path, "%s/%s", root_ino, req_root);
-
root_inode = new_inode(sb);
if (!root_inode)
goto out;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index aff8642f0c2e..991c60c7ffe0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -393,10 +393,9 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
* In this case, we first scan the range and release found pages.
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
* maps and global counts. Page faults can not race with truncation
- * in this routine. hugetlb_no_page() prevents page faults in the
- * truncated range. It checks i_size before allocation, and again after
- * with the page table lock for the page held. The same lock must be
- * acquired to unmap a page.
+ * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents
+ * page faults in the truncated range by checking i_size. i_size is
+ * modified while holding i_mmap_rwsem.
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
* Only when releasing a page is the associated region/reserv map
@@ -436,7 +435,15 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
index = page->index;
hash = hugetlb_fault_mutex_hash(mapping, index);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ if (!truncate_op) {
+ /*
+ * Only need to hold the fault mutex in the
+ * hole punch case. This prevents races with
+ * page faults. Races are not possible in the
+ * case of truncation.
+ */
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ }
/*
* If page is mapped, it was faulted in after being
@@ -450,7 +457,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
if (unlikely(page_mapped(page))) {
BUG_ON(truncate_op);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_lock_write(mapping);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vmdelete_list(&mapping->i_mmap,
index * pages_per_huge_page(h),
(index + 1) * pages_per_huge_page(h));
@@ -477,7 +486,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
}
unlock_page(page);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ if (!truncate_op)
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
huge_pagevec_release(&pvec);
cond_resched();
@@ -515,8 +525,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
BUG_ON(offset & ~huge_page_mask(h));
pgoff = offset >> PAGE_SHIFT;
- i_size_write(inode, offset);
i_mmap_lock_write(mapping);
+ i_size_write(inode, offset);
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
i_mmap_unlock_write(mapping);
@@ -638,7 +648,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
/* addr is the offset within the file (zero based) */
addr = index * hpage_size;
- /* mutex taken here, fault path and hole punch */
+ /*
+ * fault mutex taken here, protects against fault path
+ * and hole punch. inode_lock previously taken protects
+ * against truncation.
+ */
hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
diff --git a/fs/internal.h b/fs/internal.h
index f3f280b952a3..aa5d45524e87 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -38,7 +38,6 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
/*
* buffer.c
*/
-extern void guard_bio_eod(struct bio *bio);
extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block, struct iomap *iomap);
@@ -61,7 +60,6 @@ extern int finish_clean_context(struct fs_context *fc);
*/
extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
struct path *path, struct path *root);
-extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
const char *, unsigned int, struct path *);
long do_mknodat(int dfd, const char __user *filename, umode_t mode,
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 5cef075c0b37..4023c9846860 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -17,6 +17,7 @@
#include <linux/kthread.h>
#include <linux/rculist_nulls.h>
#include <linux/fs_struct.h>
+#include <linux/task_work.h>
#include "io-wq.h"
@@ -69,6 +70,8 @@ struct io_worker {
#define IO_WQ_HASH_ORDER 5
#endif
+#define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER)
+
struct io_wqe_acct {
unsigned nr_workers;
unsigned max_workers;
@@ -98,6 +101,7 @@ struct io_wqe {
struct list_head all_list;
struct io_wq *wq;
+ struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
};
/*
@@ -107,8 +111,7 @@ struct io_wq {
struct io_wqe **wqes;
unsigned long state;
- get_work_fn *get_work;
- put_work_fn *put_work;
+ free_work_fn *free_work;
struct task_struct *manager;
struct user_struct *user;
@@ -376,26 +379,35 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
return __io_worker_unuse(wqe, worker);
}
-static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
+static inline unsigned int io_get_work_hash(struct io_wq_work *work)
+{
+ return work->flags >> IO_WQ_HASH_SHIFT;
+}
+
+static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
__must_hold(wqe->lock)
{
struct io_wq_work_node *node, *prev;
- struct io_wq_work *work;
+ struct io_wq_work *work, *tail;
+ unsigned int hash;
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
/* not hashed, can run anytime */
- if (!(work->flags & IO_WQ_WORK_HASHED)) {
- wq_node_del(&wqe->work_list, node, prev);
+ if (!io_wq_is_hashed(work)) {
+ wq_list_del(&wqe->work_list, node, prev);
return work;
}
/* hashed, can run if not already running */
- *hash = work->flags >> IO_WQ_HASH_SHIFT;
- if (!(wqe->hash_map & BIT_ULL(*hash))) {
- wqe->hash_map |= BIT_ULL(*hash);
- wq_node_del(&wqe->work_list, node, prev);
+ hash = io_get_work_hash(work);
+ if (!(wqe->hash_map & BIT(hash))) {
+ wqe->hash_map |= BIT(hash);
+ /* all items with this hash lie in [work, tail] */
+ tail = wqe->hash_tail[hash];
+ wqe->hash_tail[hash] = NULL;
+ wq_list_cut(&wqe->work_list, &tail->list, prev);
return work;
}
}
@@ -440,16 +452,49 @@ static void io_wq_switch_creds(struct io_worker *worker,
worker->saved_creds = old_creds;
}
+static void io_impersonate_work(struct io_worker *worker,
+ struct io_wq_work *work)
+{
+ if (work->files && current->files != work->files) {
+ task_lock(current);
+ current->files = work->files;
+ task_unlock(current);
+ }
+ if (work->fs && current->fs != work->fs)
+ current->fs = work->fs;
+ if (work->mm != worker->mm)
+ io_wq_switch_mm(worker, work);
+ if (worker->cur_creds != work->creds)
+ io_wq_switch_creds(worker, work);
+}
+
+static void io_assign_current_work(struct io_worker *worker,
+ struct io_wq_work *work)
+{
+ if (work) {
+ /* flush pending signals before assigning new work */
+ if (signal_pending(current))
+ flush_signals(current);
+ cond_resched();
+ }
+
+ spin_lock_irq(&worker->lock);
+ worker->cur_work = work;
+ spin_unlock_irq(&worker->lock);
+}
+
+static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
+
static void io_worker_handle_work(struct io_worker *worker)
__releases(wqe->lock)
{
- struct io_wq_work *work, *old_work = NULL, *put_work = NULL;
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
do {
- unsigned hash = -1U;
-
+ struct io_wq_work *work;
+ unsigned int hash;
+get_next:
/*
* If we got some work, mark us as busy. If we didn't, but
* the list isn't empty, it means we stalled on hashed work.
@@ -457,81 +502,60 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
- work = io_get_next_work(wqe, &hash);
+ work = io_get_next_work(wqe);
if (work)
__io_worker_busy(wqe, worker, work);
else if (!wq_list_empty(&wqe->work_list))
wqe->flags |= IO_WQE_FLAG_STALLED;
spin_unlock_irq(&wqe->lock);
- if (put_work && wq->put_work)
- wq->put_work(old_work);
if (!work)
break;
-next:
- /* flush any pending signals before assigning new work */
- if (signal_pending(current))
- flush_signals(current);
-
- cond_resched();
-
- spin_lock_irq(&worker->lock);
- worker->cur_work = work;
- spin_unlock_irq(&worker->lock);
-
- if (work->flags & IO_WQ_WORK_CB)
- work->func(&work);
-
- if (work->files && current->files != work->files) {
- task_lock(current);
- current->files = work->files;
- task_unlock(current);
- }
- if (work->fs && current->fs != work->fs)
- current->fs = work->fs;
- if (work->mm != worker->mm)
- io_wq_switch_mm(worker, work);
- if (worker->cur_creds != work->creds)
- io_wq_switch_creds(worker, work);
- /*
- * OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
- * the worker function will do the right thing.
- */
- if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
- work->flags |= IO_WQ_WORK_CANCEL;
- if (worker->mm)
- work->flags |= IO_WQ_WORK_HAS_MM;
-
- if (wq->get_work) {
- put_work = work;
- wq->get_work(work);
- }
-
- old_work = work;
- work->func(&work);
-
- spin_lock_irq(&worker->lock);
- worker->cur_work = NULL;
- spin_unlock_irq(&worker->lock);
-
- spin_lock_irq(&wqe->lock);
-
- if (hash != -1U) {
- wqe->hash_map &= ~BIT_ULL(hash);
- wqe->flags &= ~IO_WQE_FLAG_STALLED;
- }
- if (work && work != old_work) {
- spin_unlock_irq(&wqe->lock);
-
- if (put_work && wq->put_work) {
- wq->put_work(put_work);
- put_work = NULL;
+ io_assign_current_work(worker, work);
+
+ /* handle a whole dependent link */
+ do {
+ struct io_wq_work *old_work, *next_hashed, *linked;
+
+ next_hashed = wq_next_work(work);
+ io_impersonate_work(worker, work);
+ /*
+ * OK to set IO_WQ_WORK_CANCEL even for uncancellable
+ * work, the worker function will do the right thing.
+ */
+ if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
+ work->flags |= IO_WQ_WORK_CANCEL;
+
+ hash = io_get_work_hash(work);
+ linked = old_work = work;
+ linked->func(&linked);
+ linked = (old_work == linked) ? NULL : linked;
+
+ work = next_hashed;
+ if (!work && linked && !io_wq_is_hashed(linked)) {
+ work = linked;
+ linked = NULL;
+ }
+ io_assign_current_work(worker, work);
+ wq->free_work(old_work);
+
+ if (linked)
+ io_wqe_enqueue(wqe, linked);
+
+ if (hash != -1U && !next_hashed) {
+ spin_lock_irq(&wqe->lock);
+ wqe->hash_map &= ~BIT_ULL(hash);
+ wqe->flags &= ~IO_WQE_FLAG_STALLED;
+ /* dependent work is not hashed */
+ hash = -1U;
+ /* skip unnecessary unlock-lock wqe->lock */
+ if (!work)
+ goto get_next;
+ spin_unlock_irq(&wqe->lock);
}
+ } while (work);
- /* dependent work not hashed */
- hash = -1U;
- goto next;
- }
+ spin_lock_irq(&wqe->lock);
} while (1);
}
@@ -693,6 +717,9 @@ static int io_wq_manager(void *data)
complete(&wq->done);
while (!kthread_should_stop()) {
+ if (current->task_works)
+ task_work_run();
+
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
bool fork_worker[2] = { false, false };
@@ -715,6 +742,9 @@ static int io_wq_manager(void *data)
schedule_timeout(HZ);
}
+ if (current->task_works)
+ task_work_run();
+
return 0;
err:
set_bit(IO_WQ_BIT_ERROR, &wq->state);
@@ -747,17 +777,40 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
return true;
}
-static void io_run_cancel(struct io_wq_work *work)
+static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
{
+ struct io_wq *wq = wqe->wq;
+
do {
struct io_wq_work *old_work = work;
work->flags |= IO_WQ_WORK_CANCEL;
work->func(&work);
work = (work == old_work) ? NULL : work;
+ wq->free_work(old_work);
} while (work);
}
+static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
+{
+ unsigned int hash;
+ struct io_wq_work *tail;
+
+ if (!io_wq_is_hashed(work)) {
+append:
+ wq_list_add_tail(&work->list, &wqe->work_list);
+ return;
+ }
+
+ hash = io_get_work_hash(work);
+ tail = wqe->hash_tail[hash];
+ wqe->hash_tail[hash] = work;
+ if (!tail)
+ goto append;
+
+ wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
+}
+
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
@@ -771,13 +824,13 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
* It's close enough to not be an issue, fork() has the same delay.
*/
if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
- io_run_cancel(work);
+ io_run_cancel(work, wqe);
return;
}
work_flags = work->flags;
spin_lock_irqsave(&wqe->lock, flags);
- wq_list_add_tail(&work->list, &wqe->work_list);
+ io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
spin_unlock_irqrestore(&wqe->lock, flags);
@@ -794,19 +847,15 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
}
/*
- * Enqueue work, hashed by some key. Work items that hash to the same value
- * will not be done in parallel. Used to limit concurrent writes, generally
- * hashed by inode.
+ * Work items that hash to the same value will not be done in parallel.
+ * Used to limit concurrent writes, generally hashed by inode.
*/
-void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val)
+void io_wq_hash_work(struct io_wq_work *work, void *val)
{
- struct io_wqe *wqe = wq->wqes[numa_node_id()];
- unsigned bit;
-
+ unsigned int bit;
bit = hash_ptr(val, IO_WQ_HASH_ORDER);
work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
- io_wqe_enqueue(wqe, work);
}
static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
@@ -856,14 +905,13 @@ void io_wq_cancel_all(struct io_wq *wq)
}
struct io_cb_cancel_data {
- struct io_wqe *wqe;
- work_cancel_fn *cancel;
- void *caller_data;
+ work_cancel_fn *fn;
+ void *data;
};
-static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
+static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
- struct io_cb_cancel_data *data = cancel_data;
+ struct io_cb_cancel_data *match = data;
unsigned long flags;
bool ret = false;
@@ -874,83 +922,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
- data->cancel(worker->cur_work, data->caller_data)) {
- send_sig(SIGINT, worker->task, 1);
- ret = true;
- }
- spin_unlock_irqrestore(&worker->lock, flags);
-
- return ret;
-}
-
-static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
- work_cancel_fn *cancel,
- void *cancel_data)
-{
- struct io_cb_cancel_data data = {
- .wqe = wqe,
- .cancel = cancel,
- .caller_data = cancel_data,
- };
- struct io_wq_work_node *node, *prev;
- struct io_wq_work *work;
- unsigned long flags;
- bool found = false;
-
- spin_lock_irqsave(&wqe->lock, flags);
- wq_list_for_each(node, prev, &wqe->work_list) {
- work = container_of(node, struct io_wq_work, list);
-
- if (cancel(work, cancel_data)) {
- wq_node_del(&wqe->work_list, node, prev);
- found = true;
- break;
- }
- }
- spin_unlock_irqrestore(&wqe->lock, flags);
-
- if (found) {
- io_run_cancel(work);
- return IO_WQ_CANCEL_OK;
- }
-
- rcu_read_lock();
- found = io_wq_for_each_worker(wqe, io_work_cancel, &data);
- rcu_read_unlock();
- return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
-}
-
-enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
- void *data)
-{
- enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
- int node;
-
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
-
- ret = io_wqe_cancel_cb_work(wqe, cancel, data);
- if (ret != IO_WQ_CANCEL_NOTFOUND)
- break;
- }
-
- return ret;
-}
-
-struct work_match {
- bool (*fn)(struct io_wq_work *, void *data);
- void *data;
-};
-
-static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
-{
- struct work_match *match = data;
- unsigned long flags;
- bool ret = false;
-
- spin_lock_irqsave(&worker->lock, flags);
- if (match->fn(worker->cur_work, match->data) &&
- !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
+ match->fn(worker->cur_work, match->data)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
}
@@ -960,7 +932,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
}
static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
- struct work_match *match)
+ struct io_cb_cancel_data *match)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
@@ -977,7 +949,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
work = container_of(node, struct io_wq_work, list);
if (match->fn(work, match->data)) {
- wq_node_del(&wqe->work_list, node, prev);
+ wq_list_del(&wqe->work_list, node, prev);
found = true;
break;
}
@@ -985,7 +957,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
spin_unlock_irqrestore(&wqe->lock, flags);
if (found) {
- io_run_cancel(work);
+ io_run_cancel(work, wqe);
return IO_WQ_CANCEL_OK;
}
@@ -1001,22 +973,16 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
}
-static bool io_wq_work_match(struct io_wq_work *work, void *data)
-{
- return work == data;
-}
-
-enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
+enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
+ void *data)
{
- struct work_match match = {
- .fn = io_wq_work_match,
- .data = cwork
+ struct io_cb_cancel_data match = {
+ .fn = cancel,
+ .data = data,
};
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
- cwork->flags |= IO_WQ_WORK_CANCEL;
-
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
@@ -1028,33 +994,28 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
return ret;
}
+static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
+{
+ return work == data;
+}
+
+enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
+{
+ return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork);
+}
+
static bool io_wq_pid_match(struct io_wq_work *work, void *data)
{
pid_t pid = (pid_t) (unsigned long) data;
- if (work)
- return work->task_pid == pid;
- return false;
+ return work->task_pid == pid;
}
enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
{
- struct work_match match = {
- .fn = io_wq_pid_match,
- .data = (void *) (unsigned long) pid
- };
- enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
- int node;
+ void *data = (void *) (unsigned long) pid;
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
-
- ret = io_wqe_cancel_work(wqe, &match);
- if (ret != IO_WQ_CANCEL_NOTFOUND)
- break;
- }
-
- return ret;
+ return io_wq_cancel_cb(wq, io_wq_pid_match, data);
}
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
@@ -1062,6 +1023,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
int ret = -ENOMEM, node;
struct io_wq *wq;
+ if (WARN_ON_ONCE(!data->free_work))
+ return ERR_PTR(-EINVAL);
+
wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
return ERR_PTR(-ENOMEM);
@@ -1072,8 +1036,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
return ERR_PTR(-ENOMEM);
}
- wq->get_work = data->get_work;
- wq->put_work = data->put_work;
+ wq->free_work = data->free_work;
/* caller must already hold a reference to this */
wq->user = data->user;
@@ -1130,7 +1093,7 @@ err:
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
{
- if (data->get_work != wq->get_work || data->put_work != wq->put_work)
+ if (data->free_work != wq->free_work)
return false;
return refcount_inc_not_zero(&wq->use_refs);
@@ -1168,3 +1131,8 @@ void io_wq_destroy(struct io_wq *wq)
if (refcount_dec_and_test(&wq->use_refs))
__io_wq_destroy(wq);
}
+
+struct task_struct *io_wq_get_task(struct io_wq *wq)
+{
+ return wq->manager;
+}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index e5e15f2c93ec..5ba12de7572f 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -5,10 +5,8 @@ struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
- IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4,
IO_WQ_WORK_UNBOUND = 32,
- IO_WQ_WORK_CB = 128,
IO_WQ_WORK_NO_CANCEL = 256,
IO_WQ_WORK_CONCURRENT = 512,
@@ -30,6 +28,18 @@ struct io_wq_work_list {
struct io_wq_work_node *last;
};
+static inline void wq_list_add_after(struct io_wq_work_node *node,
+ struct io_wq_work_node *pos,
+ struct io_wq_work_list *list)
+{
+ struct io_wq_work_node *next = pos->next;
+
+ pos->next = node;
+ node->next = next;
+ if (!next)
+ list->last = node;
+}
+
static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list)
{
@@ -42,17 +52,26 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
}
}
-static inline void wq_node_del(struct io_wq_work_list *list,
- struct io_wq_work_node *node,
+static inline void wq_list_cut(struct io_wq_work_list *list,
+ struct io_wq_work_node *last,
struct io_wq_work_node *prev)
{
- if (node == list->first)
- WRITE_ONCE(list->first, node->next);
- if (node == list->last)
+ /* first in the list, if prev==NULL */
+ if (!prev)
+ WRITE_ONCE(list->first, last->next);
+ else
+ prev->next = last->next;
+
+ if (last == list->last)
list->last = prev;
- if (prev)
- prev->next = node->next;
- node->next = NULL;
+ last->next = NULL;
+}
+
+static inline void wq_list_del(struct io_wq_work_list *list,
+ struct io_wq_work_node *node,
+ struct io_wq_work_node *prev)
+{
+ wq_list_cut(list, node, prev);
}
#define wq_list_for_each(pos, prv, head) \
@@ -65,10 +84,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
} while (0)
struct io_wq_work {
- union {
- struct io_wq_work_node list;
- void *data;
- };
+ struct io_wq_work_node list;
void (*func)(struct io_wq_work **);
struct files_struct *files;
struct mm_struct *mm;
@@ -83,14 +99,20 @@ struct io_wq_work {
*(work) = (struct io_wq_work){ .func = _func }; \
} while (0) \
-typedef void (get_work_fn)(struct io_wq_work *);
-typedef void (put_work_fn)(struct io_wq_work *);
+static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
+{
+ if (!work->list.next)
+ return NULL;
+
+ return container_of(work->list.next, struct io_wq_work, list);
+}
+
+typedef void (free_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct user_struct *user;
- get_work_fn *get_work;
- put_work_fn *put_work;
+ free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
@@ -98,7 +120,12 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
-void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
+void io_wq_hash_work(struct io_wq_work *work, void *val);
+
+static inline bool io_wq_is_hashed(struct io_wq_work *work)
+{
+ return work->flags & IO_WQ_WORK_HASHED;
+}
void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
@@ -109,6 +136,8 @@ typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data);
+struct task_struct *io_wq_get_task(struct io_wq *wq);
+
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3affd96a98ba..5190bfb6a665 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -44,6 +44,7 @@
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
+#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>
@@ -76,6 +77,8 @@
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/fs_struct.h>
+#include <linux/splice.h>
+#include <linux/task_work.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -183,14 +186,30 @@ struct fixed_file_table {
struct file **files;
};
+struct fixed_file_ref_node {
+ struct percpu_ref refs;
+ struct list_head node;
+ struct list_head file_list;
+ struct fixed_file_data *file_data;
+ struct work_struct work;
+};
+
struct fixed_file_data {
struct fixed_file_table *table;
struct io_ring_ctx *ctx;
+ struct percpu_ref *cur_refs;
struct percpu_ref refs;
- struct llist_head put_llist;
- struct work_struct ref_work;
struct completion done;
+ struct list_head ref_list;
+ spinlock_t lock;
+};
+
+struct io_buffer {
+ struct list_head list;
+ __u64 addr;
+ __s32 len;
+ __u16 bid;
};
struct io_ring_ctx {
@@ -270,6 +289,8 @@ struct io_ring_ctx {
struct socket *ring_sock;
#endif
+ struct idr io_buffer_idr;
+
struct idr personality_idr;
struct {
@@ -290,7 +311,6 @@ struct io_ring_ctx {
struct {
spinlock_t completion_lock;
- struct llist_head poll_llist;
/*
* ->poll_list is protected by the ctx->uring_lock for
@@ -306,6 +326,8 @@ struct io_ring_ctx {
spinlock_t inflight_lock;
struct list_head inflight_list;
} ____cacheline_aligned_in_smp;
+
+ struct work_struct exit_work;
};
/*
@@ -386,7 +408,9 @@ struct io_sr_msg {
void __user *buf;
};
int msg_flags;
+ int bgid;
size_t len;
+ struct io_buffer *kbuf;
};
struct io_open {
@@ -430,6 +454,24 @@ struct io_epoll {
struct epoll_event event;
};
+struct io_splice {
+ struct file *file_out;
+ struct file *file_in;
+ loff_t off_out;
+ loff_t off_in;
+ u64 len;
+ unsigned int flags;
+};
+
+struct io_provide_buf {
+ struct file *file;
+ __u64 addr;
+ __s32 len;
+ __u32 bgid;
+ __u16 nbufs;
+ __u16 bid;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -464,6 +506,7 @@ enum {
REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
+ REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
REQ_F_LINK_NEXT_BIT,
REQ_F_FAIL_LINK_BIT,
@@ -479,6 +522,11 @@ enum {
REQ_F_COMP_LOCKED_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_OVERFLOW_BIT,
+ REQ_F_POLLED_BIT,
+ REQ_F_BUFFER_SELECTED_BIT,
+
+ /* not a real bit, just to check we're not overflowing the space */
+ __REQ_F_LAST_BIT,
};
enum {
@@ -492,6 +540,8 @@ enum {
REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
/* IOSQE_ASYNC */
REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
+ /* IOSQE_BUFFER_SELECT */
+ REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
/* already grabbed next link */
REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
@@ -521,6 +571,15 @@ enum {
REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
/* in overflow list */
REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT),
+ /* already went through poll handler */
+ REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
+ /* buffer already selected */
+ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
+};
+
+struct async_poll {
+ struct io_poll_iocb poll;
+ struct io_wq_work work;
};
/*
@@ -546,32 +605,45 @@ struct io_kiocb {
struct io_fadvise fadvise;
struct io_madvise madvise;
struct io_epoll epoll;
+ struct io_splice splice;
+ struct io_provide_buf pbuf;
};
struct io_async_ctx *io;
- /*
- * llist_node is only used for poll deferred completions
- */
- struct llist_node llist_node;
- bool in_async;
+ int cflags;
bool needs_fixed_file;
u8 opcode;
struct io_ring_ctx *ctx;
- union {
- struct list_head list;
- struct hlist_node hash_node;
- };
- struct list_head link_list;
+ struct list_head list;
unsigned int flags;
refcount_t refs;
+ struct task_struct *task;
+ unsigned long fsize;
u64 user_data;
u32 result;
u32 sequence;
+ struct list_head link_list;
+
struct list_head inflight_entry;
- struct io_wq_work work;
+ struct percpu_ref *fixed_file_refs;
+
+ union {
+ /*
+ * Only commands that never go async can use the below fields,
+ * obviously. Right now only IORING_OP_POLL_ADD uses them, and
+ * async armed poll handlers for regular commands. The latter
+ * restore the work, if needed.
+ */
+ struct {
+ struct callback_head task_work;
+ struct hlist_node hash_node;
+ struct async_poll *apoll;
+ };
+ struct io_wq_work work;
+ };
};
#define IO_PLUG_THRESHOLD 2
@@ -615,6 +687,11 @@ struct io_op_def {
unsigned file_table : 1;
/* needs ->fs */
unsigned needs_fs : 1;
+ /* set if opcode supports polled "wait" */
+ unsigned pollin : 1;
+ unsigned pollout : 1;
+ /* op supports buffer selection */
+ unsigned buffer_select : 1;
};
static const struct io_op_def io_op_defs[] = {
@@ -624,6 +701,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollin = 1,
+ .buffer_select = 1,
},
[IORING_OP_WRITEV] = {
.async_ctx = 1,
@@ -631,6 +710,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
@@ -638,11 +718,13 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollin = 1,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
@@ -658,6 +740,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.needs_fs = 1,
+ .pollout = 1,
},
[IORING_OP_RECVMSG] = {
.async_ctx = 1,
@@ -665,6 +748,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.needs_fs = 1,
+ .pollin = 1,
+ .buffer_select = 1,
},
[IORING_OP_TIMEOUT] = {
.async_ctx = 1,
@@ -676,6 +761,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.file_table = 1,
+ .pollin = 1,
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
@@ -687,6 +773,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
@@ -715,11 +802,14 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollin = 1,
+ .buffer_select = 1,
},
[IORING_OP_WRITE] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
@@ -731,11 +821,14 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_RECV] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollin = 1,
+ .buffer_select = 1,
},
[IORING_OP_OPENAT2] = {
.needs_file = 1,
@@ -747,6 +840,13 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.file_table = 1,
},
+ [IORING_OP_SPLICE] = {
+ .needs_file = 1,
+ .hash_reg_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_PROVIDE_BUFFERS] = {},
+ [IORING_OP_REMOVE_BUFFERS] = {},
};
static void io_wq_submit_work(struct io_wq_work **workptr);
@@ -759,8 +859,11 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
-static void io_ring_file_ref_flush(struct fixed_file_data *data);
static void io_cleanup_req(struct io_kiocb *req);
+static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
+ int fd, struct file **out_file, bool fixed);
+static void __io_queue_sqe(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe);
static struct kmem_cache *req_cachep;
@@ -827,11 +930,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->completions[0]);
init_completion(&ctx->completions[1]);
+ idr_init(&ctx->io_buffer_idr);
idr_init(&ctx->personality_idr);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
- init_llist_head(&ctx->poll_llist);
INIT_LIST_HEAD(&ctx->poll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
@@ -952,15 +1055,14 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
}
}
-static inline bool io_prep_async_work(struct io_kiocb *req,
+static inline void io_prep_async_work(struct io_kiocb *req,
struct io_kiocb **link)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
- bool do_hashed = false;
if (req->flags & REQ_F_ISREG) {
if (def->hash_reg_file)
- do_hashed = true;
+ io_wq_hash_work(&req->work, file_inode(req->file));
} else {
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
@@ -969,25 +1071,18 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
io_req_work_grab_env(req, def);
*link = io_prep_linked_timeout(req);
- return do_hashed;
}
static inline void io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link;
- bool do_hashed;
- do_hashed = io_prep_async_work(req, &link);
+ io_prep_async_work(req, &link);
- trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
- req->flags);
- if (!do_hashed) {
- io_wq_enqueue(ctx->io_wq, &req->work);
- } else {
- io_wq_enqueue_hashed(ctx->io_wq, &req->work,
- file_inode(req->file));
- }
+ trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
+ &req->work, req->flags);
+ io_wq_enqueue(ctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
@@ -1054,24 +1149,19 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
return false;
if (!ctx->eventfd_async)
return true;
- return io_wq_current_is_worker() || in_interrupt();
+ return io_wq_current_is_worker();
}
-static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev)
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
if (waitqueue_active(&ctx->sqo_wait))
wake_up(&ctx->sqo_wait);
- if (trigger_ev)
+ if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
}
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
-{
- __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx));
-}
-
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
@@ -1108,7 +1198,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
if (cqe) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, req->result);
- WRITE_ONCE(cqe->flags, 0);
+ WRITE_ONCE(cqe->flags, req->cflags);
} else {
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1132,7 +1222,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
return cqe != NULL;
}
-static void io_cqring_fill_event(struct io_kiocb *req, long res)
+static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_uring_cqe *cqe;
@@ -1148,7 +1238,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
if (likely(cqe)) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
- WRITE_ONCE(cqe->flags, 0);
+ WRITE_ONCE(cqe->flags, cflags);
} else if (ctx->cq_overflow_flushed) {
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1160,23 +1250,34 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
req->flags |= REQ_F_OVERFLOW;
refcount_inc(&req->refs);
req->result = res;
+ req->cflags = cflags;
list_add_tail(&req->list, &ctx->cq_overflow_list);
}
}
-static void io_cqring_add_event(struct io_kiocb *req, long res)
+static void io_cqring_fill_event(struct io_kiocb *req, long res)
+{
+ __io_cqring_fill_event(req, res, 0);
+}
+
+static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- io_cqring_fill_event(req, res);
+ __io_cqring_fill_event(req, res, cflags);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
}
+static void io_cqring_add_event(struct io_kiocb *req, long res)
+{
+ __io_cqring_add_event(req, res, 0);
+}
+
static inline bool io_is_fallback_req(struct io_kiocb *req)
{
return req == (struct io_kiocb *)
@@ -1194,8 +1295,8 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
return NULL;
}
-static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
- struct io_submit_state *state)
+static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
+ struct io_submit_state *state)
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;
@@ -1228,46 +1329,30 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
req = state->reqs[state->free_reqs];
}
-got_it:
- req->io = NULL;
- req->file = NULL;
- req->ctx = ctx;
- req->flags = 0;
- /* one is dropped after submission, the other at completion */
- refcount_set(&req->refs, 2);
- req->result = 0;
- INIT_IO_WORK(&req->work, io_wq_submit_work);
return req;
fallback:
- req = io_get_fallback_req(ctx);
- if (req)
- goto got_it;
- percpu_ref_put(&ctx->refs);
- return NULL;
+ return io_get_fallback_req(ctx);
}
-static void __io_req_do_free(struct io_kiocb *req)
+static inline void io_put_file(struct io_kiocb *req, struct file *file,
+ bool fixed)
{
- if (likely(!io_is_fallback_req(req)))
- kmem_cache_free(req_cachep, req);
+ if (fixed)
+ percpu_ref_put(req->fixed_file_refs);
else
- clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
+ fput(file);
}
static void __io_req_aux_free(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
-
if (req->flags & REQ_F_NEED_CLEANUP)
io_cleanup_req(req);
kfree(req->io);
- if (req->file) {
- if (req->flags & REQ_F_FIXED_FILE)
- percpu_ref_put(&ctx->file_data->refs);
- else
- fput(req->file);
- }
+ if (req->file)
+ io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
+ if (req->task)
+ put_task_struct(req->task);
io_req_work_drop_env(req);
}
@@ -1288,7 +1373,10 @@ static void __io_free_req(struct io_kiocb *req)
}
percpu_ref_put(&req->ctx->refs);
- __io_req_do_free(req);
+ if (likely(!io_is_fallback_req(req)))
+ kmem_cache_free(req_cachep, req);
+ else
+ clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
}
struct req_batch {
@@ -1299,21 +1387,18 @@ struct req_batch {
static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
{
- int fixed_refs = rb->to_free;
-
if (!rb->to_free)
return;
if (rb->need_iter) {
int i, inflight = 0;
unsigned long flags;
- fixed_refs = 0;
for (i = 0; i < rb->to_free; i++) {
struct io_kiocb *req = rb->reqs[i];
if (req->flags & REQ_F_FIXED_FILE) {
req->file = NULL;
- fixed_refs++;
+ percpu_ref_put(req->fixed_file_refs);
}
if (req->flags & REQ_F_INFLIGHT)
inflight++;
@@ -1339,8 +1424,6 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
}
do_free:
kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
- if (fixed_refs)
- percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
percpu_ref_put_many(&ctx->refs, rb->to_free);
rb->to_free = rb->need_iter = 0;
}
@@ -1474,6 +1557,30 @@ static void io_free_req(struct io_kiocb *req)
io_queue_async_work(nxt);
}
+static void io_link_work_cb(struct io_wq_work **workptr)
+{
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+ struct io_kiocb *link;
+
+ link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+ io_queue_linked_timeout(link);
+ io_wq_submit_work(workptr);
+}
+
+static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
+{
+ struct io_kiocb *link;
+ const struct io_op_def *def = &io_op_defs[nxt->opcode];
+
+ if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file)
+ io_wq_hash_work(&nxt->work, file_inode(nxt->file));
+
+ *workptr = &nxt->work;
+ link = io_prep_linked_timeout(nxt);
+ if (link)
+ nxt->work.func = io_link_work_cb;
+}
+
/*
* Drop reference to request, return next in chain (if there is one) if this
* was the last reference to this request.
@@ -1493,6 +1600,26 @@ static void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
+static void io_steal_work(struct io_kiocb *req,
+ struct io_wq_work **workptr)
+{
+ /*
+ * It's in an io-wq worker, so there always should be at least
+ * one reference, which will be dropped in io_put_work() just
+ * after the current handler returns.
+ *
+ * It also means, that if the counter dropped to 1, then there is
+ * no asynchronous users left, so it's safe to steal the next work.
+ */
+ if (refcount_read(&req->refs) == 1) {
+ struct io_kiocb *nxt = NULL;
+
+ io_req_find_next(req, &nxt);
+ if (nxt)
+ io_wq_assign_next(workptr, nxt);
+ }
+}
+
/*
* Must only be used if we don't need to care about links, usually from
* within the completion handling itself.
@@ -1554,6 +1681,19 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
return true;
}
+static int io_put_kbuf(struct io_kiocb *req)
+{
+ struct io_buffer *kbuf;
+ int cflags;
+
+ kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
+ cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
+ cflags |= IORING_CQE_F_BUFFER;
+ req->rw.addr = 0;
+ kfree(kbuf);
+ return cflags;
+}
+
/*
* Find and free completed poll iocbs
*/
@@ -1565,10 +1705,15 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
rb.to_free = rb.need_iter = 0;
while (!list_empty(done)) {
+ int cflags = 0;
+
req = list_first_entry(done, struct io_kiocb, list);
list_del(&req->list);
- io_cqring_fill_event(req, req->result);
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ cflags = io_put_kbuf(req);
+
+ __io_cqring_fill_event(req, req->result, cflags);
(*nr_events)++;
if (refcount_dec_and_test(&req->refs) &&
@@ -1577,14 +1722,29 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
}
io_commit_cqring(ctx);
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ io_cqring_ev_posted(ctx);
io_free_req_many(ctx, &rb);
}
+static void io_iopoll_queue(struct list_head *again)
+{
+ struct io_kiocb *req;
+
+ do {
+ req = list_first_entry(again, struct io_kiocb, list);
+ list_del(&req->list);
+ refcount_inc(&req->refs);
+ io_queue_async_work(req);
+ } while (!list_empty(again));
+}
+
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
long min)
{
struct io_kiocb *req, *tmp;
LIST_HEAD(done);
+ LIST_HEAD(again);
bool spin;
int ret;
@@ -1599,9 +1759,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
struct kiocb *kiocb = &req->rw.kiocb;
/*
- * Move completed entries to our local list. If we find a
- * request that requires polling, break out and complete
- * the done list first, if we have entries there.
+ * Move completed and retryable entries to our local lists.
+ * If we find a request that requires polling, break out
+ * and complete those lists first, if we have entries there.
*/
if (req->flags & REQ_F_IOPOLL_COMPLETED) {
list_move_tail(&req->list, &done);
@@ -1610,6 +1770,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
break;
+ if (req->result == -EAGAIN) {
+ list_move_tail(&req->list, &again);
+ continue;
+ }
+ if (!list_empty(&again))
+ break;
+
ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
if (ret < 0)
break;
@@ -1622,6 +1789,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
io_iopoll_complete(ctx, nr_events, &done);
+ if (!list_empty(&again))
+ io_iopoll_queue(&again);
+
return ret;
}
@@ -1743,13 +1913,16 @@ static inline void req_set_fail_links(struct io_kiocb *req)
static void io_complete_rw_common(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+ int cflags = 0;
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
if (res != req->result)
req_set_fail_links(req);
- io_cqring_add_event(req, res);
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ cflags = io_put_kbuf(req);
+ __io_cqring_add_event(req, res, cflags);
}
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
@@ -1760,17 +1933,6 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
io_put_req(req);
}
-static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
-{
- struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
- struct io_kiocb *nxt = NULL;
-
- io_complete_rw_common(kiocb, res);
- io_put_req_find_next(req, &nxt);
-
- return nxt;
-}
-
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
@@ -1841,7 +2003,7 @@ static void io_file_put(struct io_submit_state *state)
* assuming most submissions are for one file, or at least that each file
* has more than one submission.
*/
-static struct file *io_file_get(struct io_submit_state *state, int fd)
+static struct file *__io_file_get(struct io_submit_state *state, int fd)
{
if (!state)
return fget(fd);
@@ -1938,7 +2100,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
- /* we own ->private, reuse it for the buffer index */
+ /* we own ->private, reuse it for the buffer index / buffer ID */
req->rw.kiocb.private = (void *) (unsigned long)
READ_ONCE(sqe->buf_index);
return 0;
@@ -1965,15 +2127,14 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
}
}
-static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
- bool in_async)
+static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
- if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
- *nxt = __io_complete_rw(kiocb, ret);
+ if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
+ io_complete_rw(kiocb, ret, 0);
else
io_rw_done(kiocb, ret);
}
@@ -2052,11 +2213,147 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
return len;
}
+static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
+{
+ if (needs_lock)
+ mutex_unlock(&ctx->uring_lock);
+}
+
+static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
+{
+ /*
+ * "Normal" inline submissions always hold the uring_lock, since we
+ * grab it from the system call. Same is true for the SQPOLL offload.
+ * The only exception is when we've detached the request and issue it
+ * from an async worker thread, grab the lock for that case.
+ */
+ if (needs_lock)
+ mutex_lock(&ctx->uring_lock);
+}
+
+static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
+ int bgid, struct io_buffer *kbuf,
+ bool needs_lock)
+{
+ struct io_buffer *head;
+
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ return kbuf;
+
+ io_ring_submit_lock(req->ctx, needs_lock);
+
+ lockdep_assert_held(&req->ctx->uring_lock);
+
+ head = idr_find(&req->ctx->io_buffer_idr, bgid);
+ if (head) {
+ if (!list_empty(&head->list)) {
+ kbuf = list_last_entry(&head->list, struct io_buffer,
+ list);
+ list_del(&kbuf->list);
+ } else {
+ kbuf = head;
+ idr_remove(&req->ctx->io_buffer_idr, bgid);
+ }
+ if (*len > kbuf->len)
+ *len = kbuf->len;
+ } else {
+ kbuf = ERR_PTR(-ENOBUFS);
+ }
+
+ io_ring_submit_unlock(req->ctx, needs_lock);
+
+ return kbuf;
+}
+
+static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
+ bool needs_lock)
+{
+ struct io_buffer *kbuf;
+ int bgid;
+
+ kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
+ bgid = (int) (unsigned long) req->rw.kiocb.private;
+ kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
+ if (IS_ERR(kbuf))
+ return kbuf;
+ req->rw.addr = (u64) (unsigned long) kbuf;
+ req->flags |= REQ_F_BUFFER_SELECTED;
+ return u64_to_user_ptr(kbuf->addr);
+}
+
+#ifdef CONFIG_COMPAT
+static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
+ bool needs_lock)
+{
+ struct compat_iovec __user *uiov;
+ compat_ssize_t clen;
+ void __user *buf;
+ ssize_t len;
+
+ uiov = u64_to_user_ptr(req->rw.addr);
+ if (!access_ok(uiov, sizeof(*uiov)))
+ return -EFAULT;
+ if (__get_user(clen, &uiov->iov_len))
+ return -EFAULT;
+ if (clen < 0)
+ return -EINVAL;
+
+ len = clen;
+ buf = io_rw_buffer_select(req, &len, needs_lock);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+ iov[0].iov_base = buf;
+ iov[0].iov_len = (compat_size_t) len;
+ return 0;
+}
+#endif
+
+static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
+ bool needs_lock)
+{
+ struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
+ void __user *buf;
+ ssize_t len;
+
+ if (copy_from_user(iov, uiov, sizeof(*uiov)))
+ return -EFAULT;
+
+ len = iov[0].iov_len;
+ if (len < 0)
+ return -EINVAL;
+ buf = io_rw_buffer_select(req, &len, needs_lock);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+ iov[0].iov_base = buf;
+ iov[0].iov_len = len;
+ return 0;
+}
+
+static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
+ bool needs_lock)
+{
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ return 0;
+ if (!req->rw.len)
+ return 0;
+ else if (req->rw.len > 1)
+ return -EINVAL;
+
+#ifdef CONFIG_COMPAT
+ if (req->ctx->compat)
+ return io_compat_import(req, iov, needs_lock);
+#endif
+
+ return __io_iov_buffer_select(req, iov, needs_lock);
+}
+
static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
- struct iovec **iovec, struct iov_iter *iter)
+ struct iovec **iovec, struct iov_iter *iter,
+ bool needs_lock)
{
void __user *buf = u64_to_user_ptr(req->rw.addr);
size_t sqe_len = req->rw.len;
+ ssize_t ret;
u8 opcode;
opcode = req->opcode;
@@ -2065,12 +2362,20 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
return io_import_fixed(req, rw, iter);
}
- /* buffer index only valid with fixed read/write */
- if (req->rw.kiocb.private)
+ /* buffer index only valid with fixed read/write, or buffer select */
+ if (req->rw.kiocb.private && !(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
- ssize_t ret;
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
+ if (IS_ERR(buf)) {
+ *iovec = NULL;
+ return PTR_ERR(buf);
+ }
+ req->rw.len = sqe_len;
+ }
+
ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
*iovec = NULL;
return ret < 0 ? ret : sqe_len;
@@ -2086,6 +2391,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
return iorw->size;
}
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ ret = io_iov_buffer_select(req, *iovec, needs_lock);
+ if (!ret) {
+ ret = (*iovec)->iov_len;
+ iov_iter_init(iter, rw, *iovec, 1, ret);
+ }
+ *iovec = NULL;
+ return ret;
+ }
+
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
@@ -2162,19 +2477,26 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
req->io->rw.iov = iovec;
if (!req->io->rw.iov) {
req->io->rw.iov = req->io->rw.fast_iov;
- memcpy(req->io->rw.iov, fast_iov,
- sizeof(struct iovec) * iter->nr_segs);
+ if (req->io->rw.iov != fast_iov)
+ memcpy(req->io->rw.iov, fast_iov,
+ sizeof(struct iovec) * iter->nr_segs);
} else {
req->flags |= REQ_F_NEED_CLEANUP;
}
}
+static inline int __io_alloc_async_ctx(struct io_kiocb *req)
+{
+ req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
+ return req->io == NULL;
+}
+
static int io_alloc_async_ctx(struct io_kiocb *req)
{
if (!io_op_defs[req->opcode].async_ctx)
return 0;
- req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
- return req->io == NULL;
+
+ return __io_alloc_async_ctx(req);
}
static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
@@ -2184,7 +2506,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
if (!io_op_defs[req->opcode].async_ctx)
return 0;
if (!req->io) {
- if (io_alloc_async_ctx(req))
+ if (__io_alloc_async_ctx(req))
return -ENOMEM;
io_req_map_rw(req, io_size, iovec, fast_iov, iter);
@@ -2213,7 +2535,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
io = req->io;
io->rw.iov = io->rw.fast_iov;
req->io = NULL;
- ret = io_import_iovec(READ, req, &io->rw.iov, &iter);
+ ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
req->io = io;
if (ret < 0)
return ret;
@@ -2222,8 +2544,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return 0;
}
-static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_read(struct io_kiocb *req, bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
@@ -2231,13 +2552,13 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
size_t iov_count;
ssize_t io_size, ret;
- ret = io_import_iovec(READ, req, &iovec, &iter);
+ ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
if (ret < 0)
return ret;
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
- req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
req->result = 0;
io_size = ret;
@@ -2248,10 +2569,8 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
* If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
* we know to async punt it even if it was opened O_NONBLOCK
*/
- if (force_nonblock && !io_file_supports_async(req->file)) {
- req->flags |= REQ_F_MUST_PUNT;
+ if (force_nonblock && !io_file_supports_async(req->file))
goto copy_iov;
- }
iov_count = iov_iter_count(&iter);
ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
@@ -2265,13 +2584,16 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
/* Catch -EAGAIN return for forced non-blocking submission */
if (!force_nonblock || ret2 != -EAGAIN) {
- kiocb_done(kiocb, ret2, nxt, req->in_async);
+ kiocb_done(kiocb, ret2);
} else {
copy_iov:
ret = io_setup_async_rw(req, io_size, iovec,
inline_vecs, &iter);
if (ret)
goto out_free;
+ /* any defer here is final, must blocking retry */
+ if (!(req->flags & REQ_F_NOWAIT))
+ req->flags |= REQ_F_MUST_PUNT;
return -EAGAIN;
}
}
@@ -2295,6 +2617,8 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
+ req->fsize = rlimit(RLIMIT_FSIZE);
+
/* either don't need iovec imported or already have it */
if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
return 0;
@@ -2302,7 +2626,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
io = req->io;
io->rw.iov = io->rw.fast_iov;
req->io = NULL;
- ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter);
+ ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
req->io = io;
if (ret < 0)
return ret;
@@ -2311,8 +2635,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return 0;
}
-static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_write(struct io_kiocb *req, bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
@@ -2320,7 +2643,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
size_t iov_count;
ssize_t ret, io_size;
- ret = io_import_iovec(WRITE, req, &iovec, &iter);
+ ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
if (ret < 0)
return ret;
@@ -2337,10 +2660,8 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
* If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
* we know to async punt it even if it was opened O_NONBLOCK
*/
- if (force_nonblock && !io_file_supports_async(req->file)) {
- req->flags |= REQ_F_MUST_PUNT;
+ if (force_nonblock && !io_file_supports_async(req->file))
goto copy_iov;
- }
/* file path doesn't support NOWAIT for non-direct_IO */
if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
@@ -2367,24 +2688,33 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
}
kiocb->ki_flags |= IOCB_WRITE;
+ if (!force_nonblock)
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
+
if (req->file->f_op->write_iter)
ret2 = call_write_iter(req->file, kiocb, &iter);
else
ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
+
+ if (!force_nonblock)
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+
/*
- * Raw bdev writes will -EOPNOTSUPP for IOCB_NOWAIT. Just
+ * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
* retry them without IOCB_NOWAIT.
*/
if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
ret2 = -EAGAIN;
if (!force_nonblock || ret2 != -EAGAIN) {
- kiocb_done(kiocb, ret2, nxt, req->in_async);
+ kiocb_done(kiocb, ret2);
} else {
copy_iov:
ret = io_setup_async_rw(req, io_size, iovec,
inline_vecs, &iter);
if (ret)
goto out_free;
+ /* any defer here is final, must blocking retry */
+ req->flags |= REQ_F_MUST_PUNT;
return -EAGAIN;
}
}
@@ -2394,6 +2724,76 @@ out_free:
return ret;
}
+static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_splice* sp = &req->splice;
+ unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
+ int ret;
+
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ return 0;
+
+ sp->file_in = NULL;
+ sp->off_in = READ_ONCE(sqe->splice_off_in);
+ sp->off_out = READ_ONCE(sqe->off);
+ sp->len = READ_ONCE(sqe->len);
+ sp->flags = READ_ONCE(sqe->splice_flags);
+
+ if (unlikely(sp->flags & ~valid_flags))
+ return -EINVAL;
+
+ ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
+ (sp->flags & SPLICE_F_FD_IN_FIXED));
+ if (ret)
+ return ret;
+ req->flags |= REQ_F_NEED_CLEANUP;
+
+ if (!S_ISREG(file_inode(sp->file_in)->i_mode))
+ req->work.flags |= IO_WQ_WORK_UNBOUND;
+
+ return 0;
+}
+
+static bool io_splice_punt(struct file *file)
+{
+ if (get_pipe_info(file))
+ return false;
+ if (!io_file_supports_async(file))
+ return true;
+ return !(file->f_mode & O_NONBLOCK);
+}
+
+static int io_splice(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_splice *sp = &req->splice;
+ struct file *in = sp->file_in;
+ struct file *out = sp->file_out;
+ unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
+ loff_t *poff_in, *poff_out;
+ long ret;
+
+ if (force_nonblock) {
+ if (io_splice_punt(in) || io_splice_punt(out))
+ return -EAGAIN;
+ flags |= SPLICE_F_NONBLOCK;
+ }
+
+ poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
+ poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
+ ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
+ if (force_nonblock && ret == -EAGAIN)
+ return -EAGAIN;
+
+ io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+
+ io_cqring_add_event(req, ret);
+ if (ret != sp->len)
+ req_set_fail_links(req);
+ io_put_req(req);
+ return 0;
+}
+
/*
* IORING_OP_NOP just posts a completion event, nothing else.
*/
@@ -2442,85 +2842,63 @@ static bool io_req_cancelled(struct io_kiocb *req)
return false;
}
-static void io_link_work_cb(struct io_wq_work **workptr)
+static void __io_fsync(struct io_kiocb *req)
{
- struct io_wq_work *work = *workptr;
- struct io_kiocb *link = work->data;
-
- io_queue_linked_timeout(link);
- work->func = io_wq_submit_work;
-}
-
-static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
-{
- struct io_kiocb *link;
-
- io_prep_async_work(nxt, &link);
- *workptr = &nxt->work;
- if (link) {
- nxt->work.flags |= IO_WQ_WORK_CB;
- nxt->work.func = io_link_work_cb;
- nxt->work.data = link;
- }
-}
-
-static void io_fsync_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
loff_t end = req->sync.off + req->sync.len;
- struct io_kiocb *nxt = NULL;
int ret;
- if (io_req_cancelled(req))
- return;
-
ret = vfs_fsync_range(req->file, req->sync.off,
end > 0 ? end : LLONG_MAX,
req->sync.flags & IORING_FSYNC_DATASYNC);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, &nxt);
- if (nxt)
- io_wq_assign_next(workptr, nxt);
+ io_put_req(req);
}
-static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static void io_fsync_finish(struct io_wq_work **workptr)
{
- struct io_wq_work *work, *old_work;
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+ if (io_req_cancelled(req))
+ return;
+ __io_fsync(req);
+ io_steal_work(req, workptr);
+}
+
+static int io_fsync(struct io_kiocb *req, bool force_nonblock)
+{
/* fsync always requires a blocking context */
if (force_nonblock) {
- io_put_req(req);
req->work.func = io_fsync_finish;
return -EAGAIN;
}
-
- work = old_work = &req->work;
- io_fsync_finish(&work);
- if (work && work != old_work)
- *nxt = container_of(work, struct io_kiocb, work);
+ __io_fsync(req);
return 0;
}
-static void io_fallocate_finish(struct io_wq_work **workptr)
+static void __io_fallocate(struct io_kiocb *req)
{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *nxt = NULL;
int ret;
- if (io_req_cancelled(req))
- return;
-
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
req->sync.len);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, &nxt);
- if (nxt)
- io_wq_assign_next(workptr, nxt);
+ io_put_req(req);
+}
+
+static void io_fallocate_finish(struct io_wq_work **workptr)
+{
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+
+ if (io_req_cancelled(req))
+ return;
+ __io_fallocate(req);
+ io_steal_work(req, workptr);
}
static int io_fallocate_prep(struct io_kiocb *req,
@@ -2532,26 +2910,19 @@ static int io_fallocate_prep(struct io_kiocb *req,
req->sync.off = READ_ONCE(sqe->off);
req->sync.len = READ_ONCE(sqe->addr);
req->sync.mode = READ_ONCE(sqe->len);
+ req->fsize = rlimit(RLIMIT_FSIZE);
return 0;
}
-static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
{
- struct io_wq_work *work, *old_work;
-
/* fallocate always requiring blocking context */
if (force_nonblock) {
- io_put_req(req);
req->work.func = io_fallocate_finish;
return -EAGAIN;
}
- work = old_work = &req->work;
- io_fallocate_finish(&work);
- if (work && work != old_work)
- *nxt = container_of(work, struct io_kiocb, work);
-
+ __io_fallocate(req);
return 0;
}
@@ -2562,7 +2933,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
@@ -2571,6 +2942,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->open.how.mode = READ_ONCE(sqe->len);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->open.how.flags = READ_ONCE(sqe->open_flags);
+ if (force_o_largefile())
+ req->open.how.flags |= O_LARGEFILE;
req->open.filename = getname(fname);
if (IS_ERR(req->open.filename)) {
@@ -2593,7 +2966,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
@@ -2626,8 +2999,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_openat2(struct io_kiocb *req, bool force_nonblock)
{
struct open_flags op;
struct file *file;
@@ -2658,15 +3030,171 @@ err:
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
}
-static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_openat(struct io_kiocb *req, bool force_nonblock)
{
req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
- return io_openat2(req, nxt, force_nonblock);
+ return io_openat2(req, force_nonblock);
+}
+
+static int io_remove_buffers_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_provide_buf *p = &req->pbuf;
+ u64 tmp;
+
+ if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
+ return -EINVAL;
+
+ tmp = READ_ONCE(sqe->fd);
+ if (!tmp || tmp > USHRT_MAX)
+ return -EINVAL;
+
+ memset(p, 0, sizeof(*p));
+ p->nbufs = tmp;
+ p->bgid = READ_ONCE(sqe->buf_group);
+ return 0;
+}
+
+static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
+ int bgid, unsigned nbufs)
+{
+ unsigned i = 0;
+
+ /* shouldn't happen */
+ if (!nbufs)
+ return 0;
+
+ /* the head kbuf is the list itself */
+ while (!list_empty(&buf->list)) {
+ struct io_buffer *nxt;
+
+ nxt = list_first_entry(&buf->list, struct io_buffer, list);
+ list_del(&nxt->list);
+ kfree(nxt);
+ if (++i == nbufs)
+ return i;
+ }
+ i++;
+ kfree(buf);
+ idr_remove(&ctx->io_buffer_idr, bgid);
+
+ return i;
+}
+
+static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_provide_buf *p = &req->pbuf;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer *head;
+ int ret = 0;
+
+ io_ring_submit_lock(ctx, !force_nonblock);
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ ret = -ENOENT;
+ head = idr_find(&ctx->io_buffer_idr, p->bgid);
+ if (head)
+ ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
+
+ io_ring_submit_lock(ctx, !force_nonblock);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req(req);
+ return 0;
+}
+
+static int io_provide_buffers_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_provide_buf *p = &req->pbuf;
+ u64 tmp;
+
+ if (sqe->ioprio || sqe->rw_flags)
+ return -EINVAL;
+
+ tmp = READ_ONCE(sqe->fd);
+ if (!tmp || tmp > USHRT_MAX)
+ return -E2BIG;
+ p->nbufs = tmp;
+ p->addr = READ_ONCE(sqe->addr);
+ p->len = READ_ONCE(sqe->len);
+
+ if (!access_ok(u64_to_user_ptr(p->addr), p->len))
+ return -EFAULT;
+
+ p->bgid = READ_ONCE(sqe->buf_group);
+ tmp = READ_ONCE(sqe->off);
+ if (tmp > USHRT_MAX)
+ return -E2BIG;
+ p->bid = tmp;
+ return 0;
+}
+
+static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
+{
+ struct io_buffer *buf;
+ u64 addr = pbuf->addr;
+ int i, bid = pbuf->bid;
+
+ for (i = 0; i < pbuf->nbufs; i++) {
+ buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+ if (!buf)
+ break;
+
+ buf->addr = addr;
+ buf->len = pbuf->len;
+ buf->bid = bid;
+ addr += pbuf->len;
+ bid++;
+ if (!*head) {
+ INIT_LIST_HEAD(&buf->list);
+ *head = buf;
+ } else {
+ list_add_tail(&buf->list, &(*head)->list);
+ }
+ }
+
+ return i ? i : -ENOMEM;
+}
+
+static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_provide_buf *p = &req->pbuf;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer *head, *list;
+ int ret = 0;
+
+ io_ring_submit_lock(ctx, !force_nonblock);
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
+
+ ret = io_add_buffers(p, &head);
+ if (ret < 0)
+ goto out;
+
+ if (!list) {
+ ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
+ GFP_KERNEL);
+ if (ret < 0) {
+ __io_remove_buffers(ctx, head, p->bgid, -1U);
+ goto out;
+ }
+ }
+out:
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req(req);
+ return 0;
}
static int io_epoll_ctl_prep(struct io_kiocb *req,
@@ -2694,8 +3222,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
#endif
}
-static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
{
#if defined(CONFIG_EPOLL)
struct io_epoll *ie = &req->epoll;
@@ -2708,7 +3235,7 @@ static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
#else
return -EOPNOTSUPP;
@@ -2730,8 +3257,7 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
#endif
}
-static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_madvise(struct io_kiocb *req, bool force_nonblock)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = &req->madvise;
@@ -2744,7 +3270,7 @@ static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
#else
return -EOPNOTSUPP;
@@ -2762,8 +3288,7 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
{
struct io_fadvise *fa = &req->fadvise;
int ret;
@@ -2783,7 +3308,7 @@ static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
}
@@ -2795,7 +3320,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
@@ -2820,8 +3345,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_statx(struct io_kiocb *req, bool force_nonblock)
{
struct io_open *ctx = &req->open;
unsigned lookup_flags;
@@ -2858,7 +3382,7 @@ err:
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
}
@@ -2873,7 +3397,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
sqe->rw_flags || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
@@ -2885,7 +3409,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
/* only called when __close_fd_get_file() is done */
-static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt)
+static void __io_close_finish(struct io_kiocb *req)
{
int ret;
@@ -2894,22 +3418,19 @@ static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
fput(req->close.put_file);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
}
static void io_close_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *nxt = NULL;
/* not cancellable, don't do io_req_cancelled() */
- __io_close_finish(req, &nxt);
- if (nxt)
- io_wq_assign_next(workptr, nxt);
+ __io_close_finish(req);
+ io_steal_work(req, workptr);
}
-static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_close(struct io_kiocb *req, bool force_nonblock)
{
int ret;
@@ -2919,23 +3440,25 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
return ret;
/* if the file has a flush method, be safe and punt to async */
- if (req->close.put_file->f_op->flush && !io_wq_current_is_worker())
- goto eagain;
+ if (req->close.put_file->f_op->flush && force_nonblock) {
+ /* submission ref will be dropped, take it for async */
+ refcount_inc(&req->refs);
+
+ req->work.func = io_close_finish;
+ /*
+ * Do manual async queue here to avoid grabbing files - we don't
+ * need the files, and it'll cause io_close_finish() to close
+ * the file again and cause a double CQE entry for this request
+ */
+ io_queue_async_work(req);
+ return 0;
+ }
/*
* No ->flush(), safely close from here and just punt the
* fput() to async context.
*/
- __io_close_finish(req, nxt);
- return 0;
-eagain:
- req->work.func = io_close_finish;
- /*
- * Do manual async queue here to avoid grabbing files - we don't
- * need the files, and it'll cause io_close_finish() to close
- * the file again and cause a double CQE entry for this request
- */
- io_queue_async_work(req);
+ __io_close_finish(req);
return 0;
}
@@ -2957,47 +3480,59 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static void io_sync_file_range_finish(struct io_wq_work **workptr)
+static void __io_sync_file_range(struct io_kiocb *req)
{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *nxt = NULL;
int ret;
- if (io_req_cancelled(req))
- return;
-
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, &nxt);
- if (nxt)
- io_wq_assign_next(workptr, nxt);
+ io_put_req(req);
}
-static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+
+static void io_sync_file_range_finish(struct io_wq_work **workptr)
{
- struct io_wq_work *work, *old_work;
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+
+ if (io_req_cancelled(req))
+ return;
+ __io_sync_file_range(req);
+ io_put_req(req); /* put submission ref */
+}
+static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
+{
/* sync_file_range always requires a blocking context */
if (force_nonblock) {
- io_put_req(req);
req->work.func = io_sync_file_range_finish;
return -EAGAIN;
}
- work = old_work = &req->work;
- io_sync_file_range_finish(&work);
- if (work && work != old_work)
- *nxt = container_of(work, struct io_kiocb, work);
+ __io_sync_file_range(req);
return 0;
}
+#if defined(CONFIG_NET)
+static int io_setup_async_msg(struct io_kiocb *req,
+ struct io_async_msghdr *kmsg)
+{
+ if (req->io)
+ return -EAGAIN;
+ if (io_alloc_async_ctx(req)) {
+ if (kmsg->iov != kmsg->fast_iov)
+ kfree(kmsg->iov);
+ return -ENOMEM;
+ }
+ req->flags |= REQ_F_NEED_CLEANUP;
+ memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
+ return -EAGAIN;
+}
+
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
-#if defined(CONFIG_NET)
struct io_sr_msg *sr = &req->sr_msg;
struct io_async_ctx *io = req->io;
int ret;
@@ -3023,15 +3558,10 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
-#else
- return -EOPNOTSUPP;
-#endif
}
-static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
{
-#if defined(CONFIG_NET)
struct io_async_msghdr *kmsg = NULL;
struct socket *sock;
int ret;
@@ -3071,18 +3601,8 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
flags |= MSG_DONTWAIT;
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
- if (force_nonblock && ret == -EAGAIN) {
- if (req->io)
- return -EAGAIN;
- if (io_alloc_async_ctx(req)) {
- if (kmsg->iov != kmsg->fast_iov)
- kfree(kmsg->iov);
- return -ENOMEM;
- }
- req->flags |= REQ_F_NEED_CLEANUP;
- memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
- return -EAGAIN;
- }
+ if (force_nonblock && ret == -EAGAIN)
+ return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
@@ -3093,17 +3613,12 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
-#else
- return -EOPNOTSUPP;
-#endif
}
-static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_send(struct io_kiocb *req, bool force_nonblock)
{
-#if defined(CONFIG_NET)
struct socket *sock;
int ret;
@@ -3144,17 +3659,120 @@ static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
-#else
- return -EOPNOTSUPP;
+}
+
+static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
+{
+ struct io_sr_msg *sr = &req->sr_msg;
+ struct iovec __user *uiov;
+ size_t iov_len;
+ int ret;
+
+ ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr,
+ &uiov, &iov_len);
+ if (ret)
+ return ret;
+
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ if (iov_len > 1)
+ return -EINVAL;
+ if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov)))
+ return -EFAULT;
+ sr->len = io->msg.iov[0].iov_len;
+ iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1,
+ sr->len);
+ io->msg.iov = NULL;
+ } else {
+ ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
+ &io->msg.iov, &io->msg.msg.msg_iter);
+ if (ret > 0)
+ ret = 0;
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
+ struct io_async_ctx *io)
+{
+ struct compat_msghdr __user *msg_compat;
+ struct io_sr_msg *sr = &req->sr_msg;
+ struct compat_iovec __user *uiov;
+ compat_uptr_t ptr;
+ compat_size_t len;
+ int ret;
+
+ msg_compat = (struct compat_msghdr __user *) sr->msg;
+ ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
+ &ptr, &len);
+ if (ret)
+ return ret;
+
+ uiov = compat_ptr(ptr);
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ compat_ssize_t clen;
+
+ if (len > 1)
+ return -EINVAL;
+ if (!access_ok(uiov, sizeof(*uiov)))
+ return -EFAULT;
+ if (__get_user(clen, &uiov->iov_len))
+ return -EFAULT;
+ if (clen < 0)
+ return -EINVAL;
+ sr->len = io->msg.iov[0].iov_len;
+ io->msg.iov = NULL;
+ } else {
+ ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
+ &io->msg.iov,
+ &io->msg.msg.msg_iter);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
#endif
+
+static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
+{
+ io->msg.iov = io->msg.fast_iov;
+
+#ifdef CONFIG_COMPAT
+ if (req->ctx->compat)
+ return __io_compat_recvmsg_copy_hdr(req, io);
+#endif
+
+ return __io_recvmsg_copy_hdr(req, io);
+}
+
+static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
+ int *cflags, bool needs_lock)
+{
+ struct io_sr_msg *sr = &req->sr_msg;
+ struct io_buffer *kbuf;
+
+ if (!(req->flags & REQ_F_BUFFER_SELECT))
+ return NULL;
+
+ kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
+ if (IS_ERR(kbuf))
+ return kbuf;
+
+ sr->kbuf = kbuf;
+ req->flags |= REQ_F_BUFFER_SELECTED;
+
+ *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
+ *cflags |= IORING_CQE_F_BUFFER;
+ return kbuf;
}
static int io_recvmsg_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
-#if defined(CONFIG_NET)
struct io_sr_msg *sr = &req->sr_msg;
struct io_async_ctx *io = req->io;
int ret;
@@ -3162,6 +3780,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
+ sr->bgid = READ_ONCE(sqe->buf_group);
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
@@ -3174,30 +3793,24 @@ static int io_recvmsg_prep(struct io_kiocb *req,
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
- io->msg.iov = io->msg.fast_iov;
- ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
- &io->msg.uaddr, &io->msg.iov);
+ ret = io_recvmsg_copy_hdr(req, io);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
-#else
- return -EOPNOTSUPP;
-#endif
}
-static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
{
-#if defined(CONFIG_NET)
struct io_async_msghdr *kmsg = NULL;
struct socket *sock;
- int ret;
+ int ret, cflags = 0;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
sock = sock_from_file(req->file, &ret);
if (sock) {
+ struct io_buffer *kbuf;
struct io_async_ctx io;
unsigned flags;
@@ -3209,19 +3822,23 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
kmsg->iov = kmsg->fast_iov;
kmsg->msg.msg_iter.iov = kmsg->iov;
} else {
- struct io_sr_msg *sr = &req->sr_msg;
-
kmsg = &io.msg;
kmsg->msg.msg_name = &io.msg.addr;
- io.msg.iov = io.msg.fast_iov;
- ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
- sr->msg_flags, &io.msg.uaddr,
- &io.msg.iov);
+ ret = io_recvmsg_copy_hdr(req, &io);
if (ret)
return ret;
}
+ kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
+ if (IS_ERR(kbuf)) {
+ return PTR_ERR(kbuf);
+ } else if (kbuf) {
+ kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
+ iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
+ 1, req->sr_msg.len);
+ }
+
flags = req->sr_msg.msg_flags;
if (flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
@@ -3230,18 +3847,8 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
kmsg->uaddr, flags);
- if (force_nonblock && ret == -EAGAIN) {
- if (req->io)
- return -EAGAIN;
- if (io_alloc_async_ctx(req)) {
- if (kmsg->iov != kmsg->fast_iov)
- kfree(kmsg->iov);
- return -ENOMEM;
- }
- memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
- req->flags |= REQ_F_NEED_CLEANUP;
- return -EAGAIN;
- }
+ if (force_nonblock && ret == -EAGAIN)
+ return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
@@ -3249,22 +3856,18 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
if (kmsg && kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
- io_cqring_add_event(req, ret);
+ __io_cqring_add_event(req, ret, cflags);
if (ret < 0)
req_set_fail_links(req);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
-#else
- return -EOPNOTSUPP;
-#endif
}
-static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_recv(struct io_kiocb *req, bool force_nonblock)
{
-#if defined(CONFIG_NET)
+ struct io_buffer *kbuf = NULL;
struct socket *sock;
- int ret;
+ int ret, cflags = 0;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -3272,15 +3875,25 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_sr_msg *sr = &req->sr_msg;
+ void __user *buf = sr->buf;
struct msghdr msg;
struct iovec iov;
unsigned flags;
- ret = import_single_range(READ, sr->buf, sr->len, &iov,
+ kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+ else if (kbuf)
+ buf = u64_to_user_ptr(kbuf->addr);
+
+ ret = import_single_range(READ, buf, sr->len, &iov,
&msg.msg_iter);
- if (ret)
+ if (ret) {
+ kfree(kbuf);
return ret;
+ }
+ req->flags |= REQ_F_NEED_CLEANUP;
msg.msg_name = NULL;
msg.msg_control = NULL;
msg.msg_controllen = 0;
@@ -3301,20 +3914,17 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
ret = -EINTR;
}
- io_cqring_add_event(req, ret);
+ kfree(kbuf);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ __io_cqring_add_event(req, ret, cflags);
if (ret < 0)
req_set_fail_links(req);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
-#else
- return -EOPNOTSUPP;
-#endif
}
-
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
-#if defined(CONFIG_NET)
struct io_accept *accept = &req->accept;
if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
@@ -3327,14 +3937,9 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
accept->flags = READ_ONCE(sqe->accept_flags);
accept->nofile = rlimit(RLIMIT_NOFILE);
return 0;
-#else
- return -EOPNOTSUPP;
-#endif
}
-#if defined(CONFIG_NET)
-static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int __io_accept(struct io_kiocb *req, bool force_nonblock)
{
struct io_accept *accept = &req->accept;
unsigned file_flags;
@@ -3351,44 +3956,34 @@ static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
}
static void io_accept_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *nxt = NULL;
if (io_req_cancelled(req))
return;
- __io_accept(req, &nxt, false);
- if (nxt)
- io_wq_assign_next(workptr, nxt);
+ __io_accept(req, false);
+ io_steal_work(req, workptr);
}
-#endif
-static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock)
{
-#if defined(CONFIG_NET)
int ret;
- ret = __io_accept(req, nxt, force_nonblock);
+ ret = __io_accept(req, force_nonblock);
if (ret == -EAGAIN && force_nonblock) {
req->work.func = io_accept_finish;
- io_put_req(req);
return -EAGAIN;
}
return 0;
-#else
- return -EOPNOTSUPP;
-#endif
}
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
-#if defined(CONFIG_NET)
struct io_connect *conn = &req->connect;
struct io_async_ctx *io = req->io;
@@ -3405,15 +4000,10 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return move_addr_to_kernel(conn->addr, conn->addr_len,
&io->connect.address);
-#else
- return -EOPNOTSUPP;
-#endif
}
-static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_connect(struct io_kiocb *req, bool force_nonblock)
{
-#if defined(CONFIG_NET)
struct io_async_ctx __io, *io;
unsigned file_flags;
int ret;
@@ -3449,25 +4039,303 @@ out:
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
-#else
+}
+#else /* !CONFIG_NET */
+static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
return -EOPNOTSUPP;
-#endif
}
-static void io_poll_remove_one(struct io_kiocb *req)
+static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
{
- struct io_poll_iocb *poll = &req->poll;
+ return -EOPNOTSUPP;
+}
+
+static int io_send(struct io_kiocb *req, bool force_nonblock)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_recvmsg_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_recv(struct io_kiocb *req, bool force_nonblock)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_accept(struct io_kiocb *req, bool force_nonblock)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_connect(struct io_kiocb *req, bool force_nonblock)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_NET */
+
+struct io_poll_table {
+ struct poll_table_struct pt;
+ struct io_kiocb *req;
+ int error;
+};
+
+static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
+ struct wait_queue_head *head)
+{
+ if (unlikely(poll->head)) {
+ pt->error = -EINVAL;
+ return;
+ }
+
+ pt->error = 0;
+ poll->head = head;
+ add_wait_queue(head, &poll->wait);
+}
+
+static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
+ struct poll_table_struct *p)
+{
+ struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+
+ __io_queue_proc(&pt->req->apoll->poll, pt, head);
+}
+
+static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
+ __poll_t mask, task_work_func_t func)
+{
+ struct task_struct *tsk;
+ int ret;
+
+ /* for instances that support it check for an event match first: */
+ if (mask && !(mask & poll->events))
+ return 0;
+
+ trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
+
+ list_del_init(&poll->wait.entry);
+
+ tsk = req->task;
+ req->result = mask;
+ init_task_work(&req->task_work, func);
+ /*
+ * If this fails, then the task is exiting. Punt to one of the io-wq
+ * threads to ensure the work gets run, we can't always rely on exit
+ * cancelation taking care of this.
+ */
+ ret = task_work_add(tsk, &req->task_work, true);
+ if (unlikely(ret)) {
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &req->task_work, true);
+ }
+ wake_up_process(tsk);
+ return 1;
+}
+
+static void io_async_task_func(struct callback_head *cb)
+{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct async_poll *apoll = req->apoll;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
+
+ WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry));
+
+ if (hash_hashed(&req->hash_node)) {
+ spin_lock_irq(&ctx->completion_lock);
+ hash_del(&req->hash_node);
+ spin_unlock_irq(&ctx->completion_lock);
+ }
+
+ /* restore ->work in case we need to retry again */
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
+
+ __set_current_state(TASK_RUNNING);
+ mutex_lock(&ctx->uring_lock);
+ __io_queue_sqe(req, NULL);
+ mutex_unlock(&ctx->uring_lock);
+
+ kfree(apoll);
+}
+
+static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+ void *key)
+{
+ struct io_kiocb *req = wait->private;
+ struct io_poll_iocb *poll = &req->apoll->poll;
+
+ trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
+ key_to_poll(key));
+
+ return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
+}
+
+static void io_poll_req_insert(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct hlist_head *list;
+
+ list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
+ hlist_add_head(&req->hash_node, list);
+}
+
+static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
+ struct io_poll_iocb *poll,
+ struct io_poll_table *ipt, __poll_t mask,
+ wait_queue_func_t wake_func)
+ __acquires(&ctx->completion_lock)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ bool cancel = false;
+
+ poll->file = req->file;
+ poll->head = NULL;
+ poll->done = poll->canceled = false;
+ poll->events = mask;
+
+ ipt->pt._key = mask;
+ ipt->req = req;
+ ipt->error = -EINVAL;
+
+ INIT_LIST_HEAD(&poll->wait.entry);
+ init_waitqueue_func_entry(&poll->wait, wake_func);
+ poll->wait.private = req;
+
+ mask = vfs_poll(req->file, &ipt->pt) & poll->events;
+
+ spin_lock_irq(&ctx->completion_lock);
+ if (likely(poll->head)) {
+ spin_lock(&poll->head->lock);
+ if (unlikely(list_empty(&poll->wait.entry))) {
+ if (ipt->error)
+ cancel = true;
+ ipt->error = 0;
+ mask = 0;
+ }
+ if (mask || ipt->error)
+ list_del_init(&poll->wait.entry);
+ else if (cancel)
+ WRITE_ONCE(poll->canceled, true);
+ else if (!poll->done) /* actually waiting for an event */
+ io_poll_req_insert(req);
+ spin_unlock(&poll->head->lock);
+ }
+
+ return mask;
+}
+
+static bool io_arm_poll_handler(struct io_kiocb *req)
+{
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ struct io_ring_ctx *ctx = req->ctx;
+ struct async_poll *apoll;
+ struct io_poll_table ipt;
+ __poll_t mask, ret;
+
+ if (!req->file || !file_can_poll(req->file))
+ return false;
+ if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
+ return false;
+ if (!def->pollin && !def->pollout)
+ return false;
+
+ apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+ if (unlikely(!apoll))
+ return false;
+
+ req->flags |= REQ_F_POLLED;
+ memcpy(&apoll->work, &req->work, sizeof(req->work));
+
+ get_task_struct(current);
+ req->task = current;
+ req->apoll = apoll;
+ INIT_HLIST_NODE(&req->hash_node);
+
+ mask = 0;
+ if (def->pollin)
+ mask |= POLLIN | POLLRDNORM;
+ if (def->pollout)
+ mask |= POLLOUT | POLLWRNORM;
+ mask |= POLLERR | POLLPRI;
+
+ ipt.pt._qproc = io_async_queue_proc;
+
+ ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
+ io_async_wake);
+ if (ret) {
+ ipt.error = 0;
+ apoll->poll.done = true;
+ spin_unlock_irq(&ctx->completion_lock);
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
+ kfree(apoll);
+ return false;
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+ trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
+ apoll->poll.events);
+ return true;
+}
+
+static bool __io_poll_remove_one(struct io_kiocb *req,
+ struct io_poll_iocb *poll)
+{
+ bool do_complete = false;
spin_lock(&poll->head->lock);
WRITE_ONCE(poll->canceled, true);
if (!list_empty(&poll->wait.entry)) {
list_del_init(&poll->wait.entry);
- io_queue_async_work(req);
+ do_complete = true;
}
spin_unlock(&poll->head->lock);
+ return do_complete;
+}
+
+static bool io_poll_remove_one(struct io_kiocb *req)
+{
+ bool do_complete;
+
+ if (req->opcode == IORING_OP_POLL_ADD) {
+ do_complete = __io_poll_remove_one(req, &req->poll);
+ } else {
+ /* non-poll requests have submit ref still */
+ do_complete = __io_poll_remove_one(req, &req->apoll->poll);
+ if (do_complete)
+ io_put_req(req);
+ }
+
hash_del(&req->hash_node);
+
+ if (do_complete) {
+ io_cqring_fill_event(req, -ECANCELED);
+ io_commit_cqring(req->ctx);
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_put_req(req);
+ }
+
+ return do_complete;
}
static void io_poll_remove_all(struct io_ring_ctx *ctx)
@@ -3485,6 +4353,8 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
io_poll_remove_one(req);
}
spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_ev_posted(ctx);
}
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
@@ -3494,10 +4364,11 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
hlist_for_each_entry(req, list, hash_node) {
- if (sqe_addr == req->user_data) {
- io_poll_remove_one(req);
+ if (sqe_addr != req->user_data)
+ continue;
+ if (io_poll_remove_one(req))
return 0;
- }
+ return -EALREADY;
}
return -ENOENT;
@@ -3543,186 +4414,66 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
struct io_ring_ctx *ctx = req->ctx;
req->poll.done = true;
- if (error)
- io_cqring_fill_event(req, error);
- else
- io_cqring_fill_event(req, mangle_poll(mask));
+ io_cqring_fill_event(req, error ? error : mangle_poll(mask));
io_commit_cqring(ctx);
}
-static void io_poll_complete_work(struct io_wq_work **workptr)
+static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
{
- struct io_wq_work *work = *workptr;
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct io_poll_iocb *poll = &req->poll;
- struct poll_table_struct pt = { ._key = poll->events };
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *nxt = NULL;
- __poll_t mask = 0;
- int ret = 0;
+ struct io_poll_iocb *poll = &req->poll;
- if (work->flags & IO_WQ_WORK_CANCEL) {
- WRITE_ONCE(poll->canceled, true);
- ret = -ECANCELED;
- } else if (READ_ONCE(poll->canceled)) {
- ret = -ECANCELED;
- }
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ struct poll_table_struct pt = { ._key = poll->events };
- if (ret != -ECANCELED)
- mask = vfs_poll(poll->file, &pt) & poll->events;
+ req->result = vfs_poll(req->file, &pt) & poll->events;
+ }
- /*
- * Note that ->ki_cancel callers also delete iocb from active_reqs after
- * calling ->ki_cancel. We need the ctx_lock roundtrip here to
- * synchronize with them. In the cancellation case the list_del_init
- * itself is not actually needed, but harmless so we keep it in to
- * avoid further branches in the fast path.
- */
spin_lock_irq(&ctx->completion_lock);
- if (!mask && ret != -ECANCELED) {
+ if (!req->result && !READ_ONCE(poll->canceled)) {
add_wait_queue(poll->head, &poll->wait);
spin_unlock_irq(&ctx->completion_lock);
return;
}
hash_del(&req->hash_node);
- io_poll_complete(req, mask, ret);
+ io_poll_complete(req, req->result, 0);
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_put_req_find_next(req, nxt);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
-
- if (ret < 0)
- req_set_fail_links(req);
- io_put_req_find_next(req, &nxt);
- if (nxt)
- io_wq_assign_next(workptr, nxt);
}
-static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
+static void io_poll_task_func(struct callback_head *cb)
{
- struct io_kiocb *req, *tmp;
- struct req_batch rb;
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct io_kiocb *nxt = NULL;
- rb.to_free = rb.need_iter = 0;
- spin_lock_irq(&ctx->completion_lock);
- llist_for_each_entry_safe(req, tmp, nodes, llist_node) {
- hash_del(&req->hash_node);
- io_poll_complete(req, req->result, 0);
+ io_poll_task_handler(req, &nxt);
+ if (nxt) {
+ struct io_ring_ctx *ctx = nxt->ctx;
- if (refcount_dec_and_test(&req->refs) &&
- !io_req_multi_free(&rb, req)) {
- req->flags |= REQ_F_COMP_LOCKED;
- io_free_req(req);
- }
+ mutex_lock(&ctx->uring_lock);
+ __io_queue_sqe(nxt, NULL);
+ mutex_unlock(&ctx->uring_lock);
}
- spin_unlock_irq(&ctx->completion_lock);
-
- io_cqring_ev_posted(ctx);
- io_free_req_many(ctx, &rb);
-}
-
-static void io_poll_flush(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct llist_node *nodes;
-
- nodes = llist_del_all(&req->ctx->poll_llist);
- if (nodes)
- __io_poll_flush(req->ctx, nodes);
-}
-
-static void io_poll_trigger_evfd(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- eventfd_signal(req->ctx->cq_ev_fd, 1);
- io_put_req(req);
}
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
- struct io_poll_iocb *poll = wait->private;
- struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
- struct io_ring_ctx *ctx = req->ctx;
- __poll_t mask = key_to_poll(key);
-
- /* for instances that support it check for an event match first: */
- if (mask && !(mask & poll->events))
- return 0;
-
- list_del_init(&poll->wait.entry);
-
- /*
- * Run completion inline if we can. We're using trylock here because
- * we are violating the completion_lock -> poll wq lock ordering.
- * If we have a link timeout we're going to need the completion_lock
- * for finalizing the request, mark us as having grabbed that already.
- */
- if (mask) {
- unsigned long flags;
-
- if (llist_empty(&ctx->poll_llist) &&
- spin_trylock_irqsave(&ctx->completion_lock, flags)) {
- bool trigger_ev;
-
- hash_del(&req->hash_node);
- io_poll_complete(req, mask, 0);
-
- trigger_ev = io_should_trigger_evfd(ctx);
- if (trigger_ev && eventfd_signal_count()) {
- trigger_ev = false;
- req->work.func = io_poll_trigger_evfd;
- } else {
- req->flags |= REQ_F_COMP_LOCKED;
- io_put_req(req);
- req = NULL;
- }
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- __io_cqring_ev_posted(ctx, trigger_ev);
- } else {
- req->result = mask;
- req->llist_node.next = NULL;
- /* if the list wasn't empty, we're done */
- if (!llist_add(&req->llist_node, &ctx->poll_llist))
- req = NULL;
- else
- req->work.func = io_poll_flush;
- }
- }
- if (req)
- io_queue_async_work(req);
+ struct io_kiocb *req = wait->private;
+ struct io_poll_iocb *poll = &req->poll;
- return 1;
+ return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
}
-struct io_poll_table {
- struct poll_table_struct pt;
- struct io_kiocb *req;
- int error;
-};
-
static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
struct poll_table_struct *p)
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
- if (unlikely(pt->req->poll.head)) {
- pt->error = -EINVAL;
- return;
- }
-
- pt->error = 0;
- pt->req->poll.head = head;
- add_wait_queue(head, &pt->req->poll.wait);
-}
-
-static void io_poll_req_insert(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct hlist_head *list;
-
- list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
- hlist_add_head(&req->hash_node, list);
+ __io_queue_proc(&pt->req->poll, pt, head);
}
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -3739,55 +4490,26 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+
+ get_task_struct(current);
+ req->task = current;
return 0;
}
-static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
+static int io_poll_add(struct io_kiocb *req)
{
struct io_poll_iocb *poll = &req->poll;
struct io_ring_ctx *ctx = req->ctx;
struct io_poll_table ipt;
- bool cancel = false;
__poll_t mask;
- INIT_IO_WORK(&req->work, io_poll_complete_work);
INIT_HLIST_NODE(&req->hash_node);
-
- poll->head = NULL;
- poll->done = false;
- poll->canceled = false;
-
- ipt.pt._qproc = io_poll_queue_proc;
- ipt.pt._key = poll->events;
- ipt.req = req;
- ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
-
- /* initialized the list so that we can do list_empty checks */
- INIT_LIST_HEAD(&poll->wait.entry);
- init_waitqueue_func_entry(&poll->wait, io_poll_wake);
- poll->wait.private = poll;
-
INIT_LIST_HEAD(&req->list);
+ ipt.pt._qproc = io_poll_queue_proc;
- mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
+ mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
+ io_poll_wake);
- spin_lock_irq(&ctx->completion_lock);
- if (likely(poll->head)) {
- spin_lock(&poll->head->lock);
- if (unlikely(list_empty(&poll->wait.entry))) {
- if (ipt.error)
- cancel = true;
- ipt.error = 0;
- mask = 0;
- }
- if (mask || ipt.error)
- list_del_init(&poll->wait.entry);
- else if (cancel)
- WRITE_ONCE(poll->canceled, true);
- else if (!poll->done) /* actually waiting for an event */
- io_poll_req_insert(req);
- spin_unlock(&poll->head->lock);
- }
if (mask) { /* no async, we'd stolen it */
ipt.error = 0;
io_poll_complete(req, mask, 0);
@@ -3796,7 +4518,7 @@ static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
if (mask) {
io_cqring_ev_posted(ctx);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
}
return ipt.error;
}
@@ -4045,7 +4767,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
struct io_kiocb *req, __u64 sqe_addr,
- struct io_kiocb **nxt, int success_ret)
+ int success_ret)
{
unsigned long flags;
int ret;
@@ -4071,7 +4793,7 @@ done:
if (ret < 0)
req_set_fail_links(req);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
}
static int io_async_cancel_prep(struct io_kiocb *req,
@@ -4087,11 +4809,11 @@ static int io_async_cancel_prep(struct io_kiocb *req,
return 0;
}
-static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
+static int io_async_cancel(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
+ io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
return 0;
}
@@ -4226,6 +4948,15 @@ static int io_req_defer_prep(struct io_kiocb *req,
case IORING_OP_EPOLL_CTL:
ret = io_epoll_ctl_prep(req, sqe);
break;
+ case IORING_OP_SPLICE:
+ ret = io_splice_prep(req, sqe);
+ break;
+ case IORING_OP_PROVIDE_BUFFERS:
+ ret = io_provide_buffers_prep(req, sqe);
+ break;
+ case IORING_OP_REMOVE_BUFFERS:
+ ret = io_remove_buffers_prep(req, sqe);
+ break;
default:
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
req->opcode);
@@ -4272,29 +5003,43 @@ static void io_cleanup_req(struct io_kiocb *req)
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ kfree((void *)(unsigned long)req->rw.addr);
+ /* fallthrough */
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
if (io->rw.iov != io->rw.fast_iov)
kfree(io->rw.iov);
break;
- case IORING_OP_SENDMSG:
case IORING_OP_RECVMSG:
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ kfree(req->sr_msg.kbuf);
+ /* fallthrough */
+ case IORING_OP_SENDMSG:
if (io->msg.iov != io->msg.fast_iov)
kfree(io->msg.iov);
break;
+ case IORING_OP_RECV:
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ kfree(req->sr_msg.kbuf);
+ break;
case IORING_OP_OPENAT:
case IORING_OP_OPENAT2:
case IORING_OP_STATX:
putname(req->open.filename);
break;
+ case IORING_OP_SPLICE:
+ io_put_file(req, req->splice.file_in,
+ (req->splice.flags & SPLICE_F_FD_IN_FIXED));
+ break;
}
req->flags &= ~REQ_F_NEED_CLEANUP;
}
static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **nxt, bool force_nonblock)
+ bool force_nonblock)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
@@ -4311,7 +5056,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
- ret = io_read(req, nxt, force_nonblock);
+ ret = io_read(req, force_nonblock);
break;
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
@@ -4321,7 +5066,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
- ret = io_write(req, nxt, force_nonblock);
+ ret = io_write(req, force_nonblock);
break;
case IORING_OP_FSYNC:
if (sqe) {
@@ -4329,7 +5074,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
- ret = io_fsync(req, nxt, force_nonblock);
+ ret = io_fsync(req, force_nonblock);
break;
case IORING_OP_POLL_ADD:
if (sqe) {
@@ -4337,7 +5082,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_poll_add(req, nxt);
+ ret = io_poll_add(req);
break;
case IORING_OP_POLL_REMOVE:
if (sqe) {
@@ -4353,7 +5098,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
- ret = io_sync_file_range(req, nxt, force_nonblock);
+ ret = io_sync_file_range(req, force_nonblock);
break;
case IORING_OP_SENDMSG:
case IORING_OP_SEND:
@@ -4363,9 +5108,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
}
if (req->opcode == IORING_OP_SENDMSG)
- ret = io_sendmsg(req, nxt, force_nonblock);
+ ret = io_sendmsg(req, force_nonblock);
else
- ret = io_send(req, nxt, force_nonblock);
+ ret = io_send(req, force_nonblock);
break;
case IORING_OP_RECVMSG:
case IORING_OP_RECV:
@@ -4375,9 +5120,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
}
if (req->opcode == IORING_OP_RECVMSG)
- ret = io_recvmsg(req, nxt, force_nonblock);
+ ret = io_recvmsg(req, force_nonblock);
else
- ret = io_recv(req, nxt, force_nonblock);
+ ret = io_recv(req, force_nonblock);
break;
case IORING_OP_TIMEOUT:
if (sqe) {
@@ -4401,7 +5146,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_accept(req, nxt, force_nonblock);
+ ret = io_accept(req, force_nonblock);
break;
case IORING_OP_CONNECT:
if (sqe) {
@@ -4409,7 +5154,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_connect(req, nxt, force_nonblock);
+ ret = io_connect(req, force_nonblock);
break;
case IORING_OP_ASYNC_CANCEL:
if (sqe) {
@@ -4417,7 +5162,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_async_cancel(req, nxt);
+ ret = io_async_cancel(req);
break;
case IORING_OP_FALLOCATE:
if (sqe) {
@@ -4425,7 +5170,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_fallocate(req, nxt, force_nonblock);
+ ret = io_fallocate(req, force_nonblock);
break;
case IORING_OP_OPENAT:
if (sqe) {
@@ -4433,7 +5178,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_openat(req, nxt, force_nonblock);
+ ret = io_openat(req, force_nonblock);
break;
case IORING_OP_CLOSE:
if (sqe) {
@@ -4441,7 +5186,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_close(req, nxt, force_nonblock);
+ ret = io_close(req, force_nonblock);
break;
case IORING_OP_FILES_UPDATE:
if (sqe) {
@@ -4457,7 +5202,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_statx(req, nxt, force_nonblock);
+ ret = io_statx(req, force_nonblock);
break;
case IORING_OP_FADVISE:
if (sqe) {
@@ -4465,7 +5210,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_fadvise(req, nxt, force_nonblock);
+ ret = io_fadvise(req, force_nonblock);
break;
case IORING_OP_MADVISE:
if (sqe) {
@@ -4473,7 +5218,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_madvise(req, nxt, force_nonblock);
+ ret = io_madvise(req, force_nonblock);
break;
case IORING_OP_OPENAT2:
if (sqe) {
@@ -4481,7 +5226,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_openat2(req, nxt, force_nonblock);
+ ret = io_openat2(req, force_nonblock);
break;
case IORING_OP_EPOLL_CTL:
if (sqe) {
@@ -4489,7 +5234,31 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_epoll_ctl(req, nxt, force_nonblock);
+ ret = io_epoll_ctl(req, force_nonblock);
+ break;
+ case IORING_OP_SPLICE:
+ if (sqe) {
+ ret = io_splice_prep(req, sqe);
+ if (ret < 0)
+ break;
+ }
+ ret = io_splice(req, force_nonblock);
+ break;
+ case IORING_OP_PROVIDE_BUFFERS:
+ if (sqe) {
+ ret = io_provide_buffers_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_provide_buffers(req, force_nonblock);
+ break;
+ case IORING_OP_REMOVE_BUFFERS:
+ if (sqe) {
+ ret = io_remove_buffers_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_remove_buffers(req, force_nonblock);
break;
default:
ret = -EINVAL;
@@ -4522,7 +5291,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
{
struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct io_kiocb *nxt = NULL;
int ret = 0;
/* if NO_CANCEL is set, we must still run the work */
@@ -4532,9 +5300,8 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
}
if (!ret) {
- req->in_async = true;
do {
- ret = io_issue_sqe(req, NULL, &nxt, false);
+ ret = io_issue_sqe(req, NULL, false);
/*
* We can get EAGAIN for polled IO even though we're
* forcing a sync submission from here, since we can't
@@ -4546,18 +5313,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
} while (1);
}
- /* drop submission reference */
- io_put_req(req);
-
if (ret) {
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
}
- /* if a dependent link is ready, pass it back */
- if (!ret && nxt)
- io_wq_assign_next(workptr, nxt);
+ io_steal_work(req, workptr);
}
static int io_req_needs_file(struct io_kiocb *req, int fd)
@@ -4578,41 +5340,48 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
return table->files[index & IORING_FILE_TABLE_MASK];;
}
-static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
+ int fd, struct file **out_file, bool fixed)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned flags;
- int fd;
-
- flags = READ_ONCE(sqe->flags);
- fd = READ_ONCE(sqe->fd);
-
- if (!io_req_needs_file(req, fd))
- return 0;
+ struct file *file;
- if (flags & IOSQE_FIXED_FILE) {
+ if (fixed) {
if (unlikely(!ctx->file_data ||
(unsigned) fd >= ctx->nr_user_files))
return -EBADF;
fd = array_index_nospec(fd, ctx->nr_user_files);
- req->file = io_file_from_index(ctx, fd);
- if (!req->file)
+ file = io_file_from_index(ctx, fd);
+ if (!file)
return -EBADF;
- req->flags |= REQ_F_FIXED_FILE;
- percpu_ref_get(&ctx->file_data->refs);
+ req->fixed_file_refs = ctx->file_data->cur_refs;
+ percpu_ref_get(req->fixed_file_refs);
} else {
- if (req->needs_fixed_file)
- return -EBADF;
trace_io_uring_file_get(ctx, fd);
- req->file = io_file_get(state, fd);
- if (unlikely(!req->file))
+ file = __io_file_get(state, fd);
+ if (unlikely(!file))
return -EBADF;
}
+ *out_file = file;
return 0;
}
+static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
+ int fd, unsigned int flags)
+{
+ bool fixed;
+
+ if (!io_req_needs_file(req, fd))
+ return 0;
+
+ fixed = (flags & IOSQE_FIXED_FILE);
+ if (unlikely(!fixed && req->needs_fixed_file))
+ return -EBADF;
+
+ return io_file_get(state, req, fd, &req->file, fixed);
+}
+
static int io_grab_files(struct io_kiocb *req)
{
int ret = -EBADF;
@@ -4672,8 +5441,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
if (prev) {
req_set_fail_links(prev);
- io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
- -ETIME);
+ io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
io_put_req(prev);
} else {
io_cqring_add_event(req, -ETIME);
@@ -4710,6 +5478,9 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
if (!(req->flags & REQ_F_LINK))
return NULL;
+ /* for polled retry, if flag is set, we already went through here */
+ if (req->flags & REQ_F_POLLED)
+ return NULL;
nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
link_list);
@@ -4723,7 +5494,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_kiocb *linked_timeout;
- struct io_kiocb *nxt = NULL;
+ struct io_kiocb *nxt;
const struct cred *old_creds = NULL;
int ret;
@@ -4739,7 +5510,7 @@ again:
old_creds = override_creds(req->work.creds);
}
- ret = io_issue_sqe(req, sqe, &nxt, true);
+ ret = io_issue_sqe(req, sqe, true);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -4747,6 +5518,11 @@ again:
*/
if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
(req->flags & REQ_F_MUST_PUNT))) {
+ if (io_arm_poll_handler(req)) {
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
+ goto exit;
+ }
punt:
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
@@ -4759,10 +5535,11 @@ punt:
* submit reference when the iocb is actually submitted.
*/
io_queue_async_work(req);
- goto done_req;
+ goto exit;
}
err:
+ nxt = NULL;
/* drop submission reference */
io_put_req_find_next(req, &nxt);
@@ -4779,15 +5556,14 @@ err:
req_set_fail_links(req);
io_put_req(req);
}
-done_req:
if (nxt) {
req = nxt;
- nxt = NULL;
if (req->flags & REQ_F_FORCE_ASYNC)
goto punt;
goto again;
}
+exit:
if (old_creds)
revert_creds(old_creds);
}
@@ -4829,14 +5605,15 @@ static inline void io_queue_link_head(struct io_kiocb *req)
}
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC)
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+ IOSQE_BUFFER_SELECT)
static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_submit_state *state, struct io_kiocb **link)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned int sqe_flags;
- int ret, id;
+ int ret, id, fd;
sqe_flags = READ_ONCE(sqe->flags);
@@ -4846,6 +5623,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
goto err_req;
}
+ if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+ !io_op_defs[req->opcode].buffer_select) {
+ ret = -EOPNOTSUPP;
+ goto err_req;
+ }
+
id = READ_ONCE(sqe->personality);
if (id) {
req->work.creds = idr_find(&ctx->personality_idr, id);
@@ -4857,10 +5640,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
}
/* same numerical values with corresponding REQ_F_*, safe to copy */
- req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK|
- IOSQE_ASYNC);
+ req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
+ IOSQE_ASYNC | IOSQE_FIXED_FILE |
+ IOSQE_BUFFER_SELECT);
- ret = io_req_set_file(state, req, sqe);
+ fd = READ_ONCE(sqe->fd);
+ ret = io_req_set_file(state, req, fd, sqe_flags);
if (unlikely(ret)) {
err_req:
io_cqring_add_event(req, ret);
@@ -4976,8 +5761,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
* used, it's important that those reads are done through READ_ONCE() to
* prevent a re-load down the line.
*/
-static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe **sqe_ptr)
+static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
u32 *sq_array = ctx->sq_array;
unsigned head;
@@ -4991,25 +5775,40 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
* though the application is the one updating it.
*/
head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
- if (likely(head < ctx->sq_entries)) {
- /*
- * All io need record the previous position, if LINK vs DARIN,
- * it can be used to mark the position of the first IO in the
- * link list.
- */
- req->sequence = ctx->cached_sq_head;
- *sqe_ptr = &ctx->sq_sqes[head];
- req->opcode = READ_ONCE((*sqe_ptr)->opcode);
- req->user_data = READ_ONCE((*sqe_ptr)->user_data);
- ctx->cached_sq_head++;
- return true;
- }
+ if (likely(head < ctx->sq_entries))
+ return &ctx->sq_sqes[head];
/* drop invalid entries */
- ctx->cached_sq_head++;
ctx->cached_sq_dropped++;
WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
- return false;
+ return NULL;
+}
+
+static inline void io_consume_sqe(struct io_ring_ctx *ctx)
+{
+ ctx->cached_sq_head++;
+}
+
+static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ /*
+ * All io need record the previous position, if LINK vs DARIN,
+ * it can be used to mark the position of the first IO in the
+ * link list.
+ */
+ req->sequence = ctx->cached_sq_head;
+ req->opcode = READ_ONCE(sqe->opcode);
+ req->user_data = READ_ONCE(sqe->user_data);
+ req->io = NULL;
+ req->file = NULL;
+ req->ctx = ctx;
+ req->flags = 0;
+ /* one is dropped after submission, the other at completion */
+ refcount_set(&req->refs, 2);
+ req->task = NULL;
+ req->result = 0;
+ INIT_IO_WORK(&req->work, io_wq_submit_work);
}
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
@@ -5047,17 +5846,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
struct io_kiocb *req;
int err;
- req = io_get_req(ctx, statep);
+ sqe = io_get_sqe(ctx);
+ if (unlikely(!sqe)) {
+ io_consume_sqe(ctx);
+ break;
+ }
+ req = io_alloc_req(ctx, statep);
if (unlikely(!req)) {
if (!submitted)
submitted = -EAGAIN;
break;
}
- if (!io_get_sqring(ctx, req, &sqe)) {
- __io_req_do_free(req);
- break;
- }
+ io_init_req(ctx, req, sqe);
+ io_consume_sqe(ctx);
/* will complete beyond this point, count as submitted */
submitted++;
@@ -5079,7 +5881,6 @@ fail_req:
*mm = ctx->sqo_mm;
}
- req->in_async = async;
req->needs_fixed_file = async;
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
true, async);
@@ -5163,6 +5964,8 @@ static int io_sq_thread(void *data)
if (!list_empty(&ctx->poll_list) ||
(!time_after(jiffies, timeout) && ret != -EBUSY &&
!percpu_ref_is_dying(&ctx->refs))) {
+ if (current->task_works)
+ task_work_run();
cond_resched();
continue;
}
@@ -5194,6 +5997,11 @@ static int io_sq_thread(void *data)
finish_wait(&ctx->sqo_wait, &wait);
break;
}
+ if (current->task_works) {
+ task_work_run();
+ finish_wait(&ctx->sqo_wait, &wait);
+ continue;
+ }
if (signal_pending(current))
flush_signals(current);
schedule();
@@ -5213,6 +6021,9 @@ static int io_sq_thread(void *data)
timeout = jiffies + ctx->sq_thread_idle;
}
+ if (current->task_works)
+ task_work_run();
+
set_fs(old_fs);
if (cur_mm) {
unuse_mm(cur_mm);
@@ -5277,8 +6088,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
struct io_rings *rings = ctx->rings;
int ret = 0;
- if (io_cqring_events(ctx, false) >= min_events)
- return 0;
+ do {
+ if (io_cqring_events(ctx, false) >= min_events)
+ return 0;
+ if (!current->task_works)
+ break;
+ task_work_run();
+ } while (1);
if (sig) {
#ifdef CONFIG_COMPAT
@@ -5298,6 +6114,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
do {
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
+ if (current->task_works)
+ task_work_run();
if (io_should_wake(&iowq, false))
break;
schedule();
@@ -5344,43 +6162,36 @@ static void io_file_ref_kill(struct percpu_ref *ref)
complete(&data->done);
}
-static void io_file_ref_exit_and_free(struct work_struct *work)
-{
- struct fixed_file_data *data;
-
- data = container_of(work, struct fixed_file_data, ref_work);
-
- /*
- * Ensure any percpu-ref atomic switch callback has run, it could have
- * been in progress when the files were being unregistered. Once
- * that's done, we can safely exit and free the ref and containing
- * data structure.
- */
- rcu_barrier();
- percpu_ref_exit(&data->refs);
- kfree(data);
-}
-
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
struct fixed_file_data *data = ctx->file_data;
+ struct fixed_file_ref_node *ref_node = NULL;
unsigned nr_tables, i;
+ unsigned long flags;
if (!data)
return -ENXIO;
- percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
- flush_work(&data->ref_work);
+ spin_lock_irqsave(&data->lock, flags);
+ if (!list_empty(&data->ref_list))
+ ref_node = list_first_entry(&data->ref_list,
+ struct fixed_file_ref_node, node);
+ spin_unlock_irqrestore(&data->lock, flags);
+ if (ref_node)
+ percpu_ref_kill(&ref_node->refs);
+
+ percpu_ref_kill(&data->refs);
+
+ /* wait for all refs nodes to complete */
wait_for_completion(&data->done);
- io_ring_file_ref_flush(data);
__io_sqe_files_unregister(ctx);
nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
for (i = 0; i < nr_tables; i++)
kfree(data->table[i].files);
kfree(data->table);
- INIT_WORK(&data->ref_work, io_file_ref_exit_and_free);
- queue_work(system_wq, &data->ref_work);
+ percpu_ref_exit(&data->refs);
+ kfree(data);
ctx->file_data = NULL;
ctx->nr_user_files = 0;
return 0;
@@ -5424,13 +6235,6 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
struct sk_buff *skb;
int i, nr_files;
- if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
- unsigned long inflight = ctx->user->unix_inflight + nr;
-
- if (inflight > task_rlimit(current, RLIMIT_NOFILE))
- return -EMFILE;
- }
-
fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
if (!fpl)
return -ENOMEM;
@@ -5605,50 +6409,72 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
}
struct io_file_put {
- struct llist_node llist;
+ struct list_head list;
struct file *file;
- struct completion *done;
};
-static void io_ring_file_ref_flush(struct fixed_file_data *data)
+static void io_file_put_work(struct work_struct *work)
{
+ struct fixed_file_ref_node *ref_node;
+ struct fixed_file_data *file_data;
+ struct io_ring_ctx *ctx;
struct io_file_put *pfile, *tmp;
- struct llist_node *node;
+ unsigned long flags;
- while ((node = llist_del_all(&data->put_llist)) != NULL) {
- llist_for_each_entry_safe(pfile, tmp, node, llist) {
- io_ring_file_put(data->ctx, pfile->file);
- if (pfile->done)
- complete(pfile->done);
- else
- kfree(pfile);
- }
+ ref_node = container_of(work, struct fixed_file_ref_node, work);
+ file_data = ref_node->file_data;
+ ctx = file_data->ctx;
+
+ list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
+ list_del_init(&pfile->list);
+ io_ring_file_put(ctx, pfile->file);
+ kfree(pfile);
}
+
+ spin_lock_irqsave(&file_data->lock, flags);
+ list_del_init(&ref_node->node);
+ spin_unlock_irqrestore(&file_data->lock, flags);
+
+ percpu_ref_exit(&ref_node->refs);
+ kfree(ref_node);
+ percpu_ref_put(&file_data->refs);
}
-static void io_ring_file_ref_switch(struct work_struct *work)
+static void io_file_data_ref_zero(struct percpu_ref *ref)
{
- struct fixed_file_data *data;
+ struct fixed_file_ref_node *ref_node;
+
+ ref_node = container_of(ref, struct fixed_file_ref_node, refs);
- data = container_of(work, struct fixed_file_data, ref_work);
- io_ring_file_ref_flush(data);
- percpu_ref_switch_to_percpu(&data->refs);
+ queue_work(system_wq, &ref_node->work);
}
-static void io_file_data_ref_zero(struct percpu_ref *ref)
+static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
+ struct io_ring_ctx *ctx)
{
- struct fixed_file_data *data;
+ struct fixed_file_ref_node *ref_node;
- data = container_of(ref, struct fixed_file_data, refs);
+ ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
+ if (!ref_node)
+ return ERR_PTR(-ENOMEM);
- /*
- * We can't safely switch from inside this context, punt to wq. If
- * the table ref is going away, the table is being unregistered.
- * Don't queue up the async work for that case, the caller will
- * handle it.
- */
- if (!percpu_ref_is_dying(&data->refs))
- queue_work(system_wq, &data->ref_work);
+ if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
+ 0, GFP_KERNEL)) {
+ kfree(ref_node);
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_LIST_HEAD(&ref_node->node);
+ INIT_LIST_HEAD(&ref_node->file_list);
+ INIT_WORK(&ref_node->work, io_file_put_work);
+ ref_node->file_data = ctx->file_data;
+ return ref_node;
+
+}
+
+static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
+{
+ percpu_ref_exit(&ref_node->refs);
+ kfree(ref_node);
}
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -5659,6 +6485,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
struct file *file;
int fd, ret = 0;
unsigned i;
+ struct fixed_file_ref_node *ref_node;
+ unsigned long flags;
if (ctx->file_data)
return -EBUSY;
@@ -5672,6 +6500,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -ENOMEM;
ctx->file_data->ctx = ctx;
init_completion(&ctx->file_data->done);
+ INIT_LIST_HEAD(&ctx->file_data->ref_list);
+ spin_lock_init(&ctx->file_data->lock);
nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
ctx->file_data->table = kcalloc(nr_tables,
@@ -5683,15 +6513,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -ENOMEM;
}
- if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
+ if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
kfree(ctx->file_data->table);
kfree(ctx->file_data);
ctx->file_data = NULL;
return -ENOMEM;
}
- ctx->file_data->put_llist.first = NULL;
- INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
percpu_ref_exit(&ctx->file_data->refs);
@@ -5754,9 +6582,22 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
}
ret = io_sqe_files_scm(ctx);
- if (ret)
+ if (ret) {
io_sqe_files_unregister(ctx);
+ return ret;
+ }
+ ref_node = alloc_fixed_file_ref_node(ctx);
+ if (IS_ERR(ref_node)) {
+ io_sqe_files_unregister(ctx);
+ return PTR_ERR(ref_node);
+ }
+
+ ctx->file_data->cur_refs = &ref_node->refs;
+ spin_lock_irqsave(&ctx->file_data->lock, flags);
+ list_add(&ref_node->node, &ctx->file_data->ref_list);
+ spin_unlock_irqrestore(&ctx->file_data->lock, flags);
+ percpu_ref_get(&ctx->file_data->refs);
return ret;
}
@@ -5803,46 +6644,22 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
-static void io_atomic_switch(struct percpu_ref *ref)
+static int io_queue_file_removal(struct fixed_file_data *data,
+ struct file *file)
{
- struct fixed_file_data *data;
+ struct io_file_put *pfile;
+ struct percpu_ref *refs = data->cur_refs;
+ struct fixed_file_ref_node *ref_node;
- /*
- * Juggle reference to ensure we hit zero, if needed, so we can
- * switch back to percpu mode
- */
- data = container_of(ref, struct fixed_file_data, refs);
- percpu_ref_put(&data->refs);
- percpu_ref_get(&data->refs);
-}
-
-static bool io_queue_file_removal(struct fixed_file_data *data,
- struct file *file)
-{
- struct io_file_put *pfile, pfile_stack;
- DECLARE_COMPLETION_ONSTACK(done);
-
- /*
- * If we fail allocating the struct we need for doing async reomval
- * of this file, just punt to sync and wait for it.
- */
pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
- if (!pfile) {
- pfile = &pfile_stack;
- pfile->done = &done;
- }
+ if (!pfile)
+ return -ENOMEM;
+ ref_node = container_of(refs, struct fixed_file_ref_node, refs);
pfile->file = file;
- llist_add(&pfile->llist, &data->put_llist);
+ list_add(&pfile->list, &ref_node->file_list);
- if (pfile == &pfile_stack) {
- percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
- wait_for_completion(&done);
- flush_work(&data->ref_work);
- return false;
- }
-
- return true;
+ return 0;
}
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
@@ -5850,17 +6667,23 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
unsigned nr_args)
{
struct fixed_file_data *data = ctx->file_data;
- bool ref_switch = false;
+ struct fixed_file_ref_node *ref_node;
struct file *file;
__s32 __user *fds;
int fd, i, err;
__u32 done;
+ unsigned long flags;
+ bool needs_switch = false;
if (check_add_overflow(up->offset, nr_args, &done))
return -EOVERFLOW;
if (done > ctx->nr_user_files)
return -EINVAL;
+ ref_node = alloc_fixed_file_ref_node(ctx);
+ if (IS_ERR(ref_node))
+ return PTR_ERR(ref_node);
+
done = 0;
fds = u64_to_user_ptr(up->fds);
while (nr_args) {
@@ -5877,9 +6700,11 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
index = i & IORING_FILE_TABLE_MASK;
if (table->files[index]) {
file = io_file_from_index(ctx, index);
+ err = io_queue_file_removal(data, file);
+ if (err)
+ break;
table->files[index] = NULL;
- if (io_queue_file_removal(data, file))
- ref_switch = true;
+ needs_switch = true;
}
if (fd != -1) {
file = fget(fd);
@@ -5910,11 +6735,19 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
up->offset++;
}
- if (ref_switch)
- percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
+ if (needs_switch) {
+ percpu_ref_kill(data->cur_refs);
+ spin_lock_irqsave(&data->lock, flags);
+ list_add(&ref_node->node, &data->ref_list);
+ data->cur_refs = &ref_node->refs;
+ spin_unlock_irqrestore(&data->lock, flags);
+ percpu_ref_get(&ctx->file_data->refs);
+ } else
+ destroy_fixed_file_ref_node(ref_node);
return done ? done : err;
}
+
static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
@@ -5932,20 +6765,14 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
return __io_sqe_files_update(ctx, &up, nr_args);
}
-static void io_put_work(struct io_wq_work *work)
+static void io_free_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ /* Consider that io_steal_work() relies on this ref */
io_put_req(req);
}
-static void io_get_work(struct io_wq_work *work)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
- refcount_inc(&req->refs);
-}
-
static int io_init_wq_offload(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
@@ -5956,8 +6783,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
int ret = 0;
data.user = ctx->user;
- data.get_work = io_get_work;
- data.put_work = io_put_work;
+ data.free_work = io_free_work;
if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
/* Do QD, or 4 * CPUS, whatever is smallest */
@@ -6359,6 +7185,21 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
return -ENXIO;
}
+static int __io_destroy_buffers(int id, void *p, void *data)
+{
+ struct io_ring_ctx *ctx = data;
+ struct io_buffer *buf = p;
+
+ __io_remove_buffers(ctx, buf, id, -1U);
+ return 0;
+}
+
+static void io_destroy_buffers(struct io_ring_ctx *ctx)
+{
+ idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
+ idr_destroy(&ctx->io_buffer_idr);
+}
+
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_finish_async(ctx);
@@ -6369,6 +7210,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_sqe_buffer_unregister(ctx);
io_sqe_files_unregister(ctx);
io_eventfd_unregister(ctx);
+ io_destroy_buffers(ctx);
idr_destroy(&ctx->personality_idr);
#if defined(CONFIG_UNIX)
@@ -6431,6 +7273,18 @@ static int io_remove_personalities(int id, void *p, void *data)
return 0;
}
+static void io_ring_exit_work(struct work_struct *work)
+{
+ struct io_ring_ctx *ctx;
+
+ ctx = container_of(work, struct io_ring_ctx, exit_work);
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true);
+
+ wait_for_completion(&ctx->completions[0]);
+ io_ring_ctx_free(ctx);
+}
+
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
mutex_lock(&ctx->uring_lock);
@@ -6458,8 +7312,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
- wait_for_completion(&ctx->completions[0]);
- io_ring_ctx_free(ctx);
+ INIT_WORK(&ctx->exit_work, io_ring_exit_work);
+ queue_work(system_wq, &ctx->exit_work);
}
static int io_uring_release(struct inode *inode, struct file *file)
@@ -6623,6 +7477,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
int submitted = 0;
struct fd f;
+ if (current->task_works)
+ task_work_run();
+
if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
return -EINVAL;
@@ -6669,7 +7526,14 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
min_complete = min(min_complete, ctx->cq_entries);
- if (ctx->flags & IORING_SETUP_IOPOLL) {
+ /*
+ * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
+ * space applications don't need to do io completion events
+ * polling again, they can rely on io_sq_thread to do polling
+ * work, which can reduce cpu usage and uring_lock contention.
+ */
+ if (ctx->flags & IORING_SETUP_IOPOLL &&
+ !(ctx->flags & IORING_SETUP_SQPOLL)) {
ret = io_iopoll_check(ctx, &nr_events, min_complete);
} else {
ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
@@ -6745,6 +7609,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, "Personalities:\n");
idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
}
+ seq_printf(m, "PollList:\n");
+ spin_lock_irq(&ctx->completion_lock);
+ for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
+ struct hlist_head *list = &ctx->cancel_hash[i];
+ struct io_kiocb *req;
+
+ hlist_for_each_entry(req, list, hash_node)
+ seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
+ req->task->task_works != NULL);
+ }
+ spin_unlock_irq(&ctx->completion_lock);
mutex_unlock(&ctx->uring_lock);
}
@@ -6961,7 +7836,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
- IORING_FEAT_CUR_PERSONALITY;
+ IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
@@ -7239,6 +8114,7 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(8, __u64, off);
BUILD_BUG_SQE_ELEM(8, __u64, addr2);
BUILD_BUG_SQE_ELEM(16, __u64, addr);
+ BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
BUILD_BUG_SQE_ELEM(24, __u32, len);
BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
@@ -7253,11 +8129,14 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
+ BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
BUILD_BUG_SQE_ELEM(32, __u64, user_data);
BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
BUILD_BUG_SQE_ELEM(42, __u16, personality);
+ BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
+ BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
return 0;
};
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 7c84c4c027c4..89e21961d1ad 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -302,6 +302,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) {
gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+ gfp_t orig_gfp = gfp;
int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (ctx->bio)
@@ -310,6 +311,13 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (ctx->is_readahead) /* same as readahead_gfp_mask */
gfp |= __GFP_NORETRY | __GFP_NOWARN;
ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
+ /*
+ * If the bio_alloc fails, try it again for a single page to
+ * avoid having to deal with partial page reads. This emulates
+ * what do_mpage_readpage does.
+ */
+ if (!ctx->bio)
+ ctx->bio = bio_alloc(orig_gfp, 1);
ctx->bio->bi_opf = REQ_OP_READ;
if (ctx->is_readahead)
ctx->bio->bi_opf |= REQ_RAHEAD;
@@ -503,7 +511,8 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
int
iomap_releasepage(struct page *page, gfp_t gfp_mask)
{
- trace_iomap_releasepage(page->mapping->host, page, 0, 0);
+ trace_iomap_releasepage(page->mapping->host, page_offset(page),
+ PAGE_SIZE);
/*
* mm accommodates an old ext3 case where clean pages might not have had
@@ -520,7 +529,7 @@ EXPORT_SYMBOL_GPL(iomap_releasepage);
void
iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
{
- trace_iomap_invalidatepage(page->mapping->host, page, offset, len);
+ trace_iomap_invalidatepage(page->mapping->host, offset, len);
/*
* If we are invalidating the entire page, clear the dirty state from it
@@ -974,13 +983,6 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
}
-static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
- struct iomap *iomap)
-{
- return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
- iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
-}
-
static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
void *data, struct iomap *iomap, struct iomap *srcmap)
@@ -1000,7 +1002,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
bytes = min_t(loff_t, PAGE_SIZE - offset, count);
if (IS_DAX(inode))
- status = iomap_dax_zero(pos, offset, bytes, iomap);
+ status = dax_iomap_zero(pos, offset, bytes, iomap);
else
status = iomap_zero(inode, pos, offset, bytes, iomap,
srcmap);
@@ -1519,7 +1521,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
u64 end_offset;
loff_t offset;
- trace_iomap_writepage(inode, page, 0, 0);
+ trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE);
/*
* Refuse to write the page out if we are called from reclaim context.
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 23837926c0c5..20dde5aadcdd 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -534,8 +534,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
/*
* We are about to drop our additional submission reference, which
- * might be the last reference to the dio. There are three three
- * different ways we can progress here:
+ * might be the last reference to the dio. There are three different
+ * ways we can progress here:
*
* (a) If this is the last reference we will always complete and free
* the dio ourselves.
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 6dc227b8c47e..4df19c66f597 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -41,14 +41,12 @@ DEFINE_EVENT(iomap_readpage_class, name, \
DEFINE_READPAGE_EVENT(iomap_readpage);
DEFINE_READPAGE_EVENT(iomap_readpages);
-DECLARE_EVENT_CLASS(iomap_page_class,
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
- unsigned int len),
- TP_ARGS(inode, page, off, len),
+DECLARE_EVENT_CLASS(iomap_range_class,
+ TP_PROTO(struct inode *inode, unsigned long off, unsigned int len),
+ TP_ARGS(inode, off, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(u64, ino)
- __field(pgoff_t, pgoff)
__field(loff_t, size)
__field(unsigned long, offset)
__field(unsigned int, length)
@@ -56,29 +54,26 @@ DECLARE_EVENT_CLASS(iomap_page_class,
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
- __entry->pgoff = page_offset(page);
__entry->size = i_size_read(inode);
__entry->offset = off;
__entry->length = len;
),
- TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset %lx "
"length %x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __entry->pgoff,
__entry->size,
__entry->offset,
__entry->length)
)
-#define DEFINE_PAGE_EVENT(name) \
-DEFINE_EVENT(iomap_page_class, name, \
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
- unsigned int len), \
- TP_ARGS(inode, page, off, len))
-DEFINE_PAGE_EVENT(iomap_writepage);
-DEFINE_PAGE_EVENT(iomap_releasepage);
-DEFINE_PAGE_EVENT(iomap_invalidatepage);
+#define DEFINE_RANGE_EVENT(name) \
+DEFINE_EVENT(iomap_range_class, name, \
+ TP_PROTO(struct inode *inode, unsigned long off, unsigned int len),\
+ TP_ARGS(inode, off, len))
+DEFINE_RANGE_EVENT(iomap_writepage);
+DEFINE_RANGE_EVENT(iomap_releasepage);
+DEFINE_RANGE_EVENT(iomap_invalidatepage);
#define IOMAP_TYPE_STRINGS \
{ IOMAP_HOLE, "HOLE" }, \
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 27373f5792a4..e855d8260433 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -997,9 +997,10 @@ restart_loop:
* journalled data) we need to unmap buffer and clear
* more bits. We also need to be careful about the check
* because the data page mapping can get cleared under
- * out hands, which alse need not to clear more bits
- * because the page and buffers will be freed and can
- * never be reused once we are done with them.
+ * our hands. Note that if mapping == NULL, we don't
+ * need to make buffer unmapped because the page is
+ * already detached from the mapping and buffers cannot
+ * get reused.
*/
mapping = READ_ONCE(bh->b_page->mapping);
if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index d0f7a5abd9a9..fc2469a20fed 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -53,6 +53,8 @@ static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc)
kn->iattr->ia_ctime = kn->iattr->ia_atime;
simple_xattrs_init(&kn->iattr->xattrs);
+ atomic_set(&kn->iattr->nr_user_xattrs, 0);
+ atomic_set(&kn->iattr->user_xattr_size, 0);
out_unlock:
ret = kn->iattr;
mutex_unlock(&iattr_mutex);
@@ -303,7 +305,7 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
if (!attrs)
return -ENOMEM;
- return simple_xattr_set(&attrs->xattrs, name, value, size, flags);
+ return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL);
}
static int kernfs_vfs_xattr_get(const struct xattr_handler *handler,
@@ -327,6 +329,86 @@ static int kernfs_vfs_xattr_set(const struct xattr_handler *handler,
return kernfs_xattr_set(kn, name, value, size, flags);
}
+static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
+ const char *full_name,
+ struct simple_xattrs *xattrs,
+ const void *value, size_t size, int flags)
+{
+ atomic_t *sz = &kn->iattr->user_xattr_size;
+ atomic_t *nr = &kn->iattr->nr_user_xattrs;
+ ssize_t removed_size;
+ int ret;
+
+ if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) {
+ ret = -ENOSPC;
+ goto dec_count_out;
+ }
+
+ if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) {
+ ret = -ENOSPC;
+ goto dec_size_out;
+ }
+
+ ret = simple_xattr_set(xattrs, full_name, value, size, flags,
+ &removed_size);
+
+ if (!ret && removed_size >= 0)
+ size = removed_size;
+ else if (!ret)
+ return 0;
+dec_size_out:
+ atomic_sub(size, sz);
+dec_count_out:
+ atomic_dec(nr);
+ return ret;
+}
+
+static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
+ const char *full_name,
+ struct simple_xattrs *xattrs,
+ const void *value, size_t size, int flags)
+{
+ atomic_t *sz = &kn->iattr->user_xattr_size;
+ atomic_t *nr = &kn->iattr->nr_user_xattrs;
+ ssize_t removed_size;
+ int ret;
+
+ ret = simple_xattr_set(xattrs, full_name, value, size, flags,
+ &removed_size);
+
+ if (removed_size >= 0) {
+ atomic_sub(removed_size, sz);
+ atomic_dec(nr);
+ }
+
+ return ret;
+}
+
+static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *suffix, const void *value,
+ size_t size, int flags)
+{
+ const char *full_name = xattr_full_name(handler, suffix);
+ struct kernfs_node *kn = inode->i_private;
+ struct kernfs_iattrs *attrs;
+
+ if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR))
+ return -EOPNOTSUPP;
+
+ attrs = kernfs_iattrs(kn);
+ if (!attrs)
+ return -ENOMEM;
+
+ if (value)
+ return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs,
+ value, size, flags);
+ else
+ return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs,
+ value, size, flags);
+
+}
+
static const struct xattr_handler kernfs_trusted_xattr_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.get = kernfs_vfs_xattr_get,
@@ -339,8 +421,15 @@ static const struct xattr_handler kernfs_security_xattr_handler = {
.set = kernfs_vfs_xattr_set,
};
+static const struct xattr_handler kernfs_user_xattr_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = kernfs_vfs_xattr_get,
+ .set = kernfs_vfs_user_xattr_set,
+};
+
const struct xattr_handler *kernfs_xattr_handlers[] = {
&kernfs_trusted_xattr_handler,
&kernfs_security_xattr_handler,
+ &kernfs_user_xattr_handler,
NULL
};
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 2f3c51d55261..7ee97ef59184 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -26,6 +26,8 @@ struct kernfs_iattrs {
struct timespec64 ia_ctime;
struct simple_xattrs xattrs;
+ atomic_t nr_user_xattrs;
+ atomic_t user_xattr_size;
};
/* +1 to avoid triggering overflow warning when negating it */
diff --git a/fs/libfs.c b/fs/libfs.c
index c686bd9caac6..3759fbacf522 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -891,7 +891,7 @@ int simple_attr_open(struct inode *inode, struct file *file,
{
struct simple_attr *attr;
- attr = kmalloc(sizeof(*attr), GFP_KERNEL);
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
if (!attr)
return -ENOMEM;
@@ -931,9 +931,11 @@ ssize_t simple_attr_read(struct file *file, char __user *buf,
if (ret)
return ret;
- if (*ppos) { /* continued read */
+ if (*ppos && attr->get_buf[0]) {
+ /* continued read */
size = strlen(attr->get_buf);
- } else { /* first read */
+ } else {
+ /* first read */
u64 val;
ret = attr->get(attr->data, &val);
if (ret)
diff --git a/fs/namei.c b/fs/namei.c
index db6565c99825..a320371899cf 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -503,9 +503,10 @@ struct nameidata {
} *stack, internal[EMBEDDED_LEVELS];
struct filename *name;
struct nameidata *saved;
- struct inode *link_inode;
unsigned root_seq;
int dfd;
+ kuid_t dir_uid;
+ umode_t dir_mode;
} __randomize_layout;
static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
@@ -530,52 +531,34 @@ static void restore_nameidata(void)
kfree(now->stack);
}
-static int __nd_alloc_stack(struct nameidata *nd)
+static bool nd_alloc_stack(struct nameidata *nd)
{
struct saved *p;
- if (nd->flags & LOOKUP_RCU) {
- p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
- GFP_ATOMIC);
- if (unlikely(!p))
- return -ECHILD;
- } else {
- p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
- GFP_KERNEL);
- if (unlikely(!p))
- return -ENOMEM;
- }
+ p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
+ nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
+ if (unlikely(!p))
+ return false;
memcpy(p, nd->internal, sizeof(nd->internal));
nd->stack = p;
- return 0;
+ return true;
}
/**
- * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
- * @path: nameidate to verify
+ * path_connected - Verify that a dentry is below mnt.mnt_root
*
* Rename can sometimes move a file or directory outside of a bind
* mount, path_connected allows those cases to be detected.
*/
-static bool path_connected(const struct path *path)
+static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
{
- struct vfsmount *mnt = path->mnt;
struct super_block *sb = mnt->mnt_sb;
/* Bind mounts and multi-root filesystems can have disconnected paths */
if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
return true;
- return is_subdir(path->dentry, mnt->mnt_root);
-}
-
-static inline int nd_alloc_stack(struct nameidata *nd)
-{
- if (likely(nd->depth != EMBEDDED_LEVELS))
- return 0;
- if (likely(nd->stack != nd->internal))
- return 0;
- return __nd_alloc_stack(nd);
+ return is_subdir(dentry, mnt->mnt_root);
}
static void drop_links(struct nameidata *nd)
@@ -608,10 +591,9 @@ static void terminate_walk(struct nameidata *nd)
}
/* path_put is needed afterwards regardless of success or failure */
-static bool legitimize_path(struct nameidata *nd,
- struct path *path, unsigned seq)
+static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
{
- int res = __legitimize_mnt(path->mnt, nd->m_seq);
+ int res = __legitimize_mnt(path->mnt, mseq);
if (unlikely(res)) {
if (res > 0)
path->mnt = NULL;
@@ -625,6 +607,12 @@ static bool legitimize_path(struct nameidata *nd,
return !read_seqcount_retry(&path->dentry->d_seq, seq);
}
+static inline bool legitimize_path(struct nameidata *nd,
+ struct path *path, unsigned seq)
+{
+ return __legitimize_path(path, seq, nd->m_seq);
+}
+
static bool legitimize_links(struct nameidata *nd)
{
int i;
@@ -858,25 +846,6 @@ static int set_root(struct nameidata *nd)
return 0;
}
-static void path_put_conditional(struct path *path, struct nameidata *nd)
-{
- dput(path->dentry);
- if (path->mnt != nd->path.mnt)
- mntput(path->mnt);
-}
-
-static inline void path_to_nameidata(const struct path *path,
- struct nameidata *nd)
-{
- if (!(nd->flags & LOOKUP_RCU)) {
- dput(nd->path.dentry);
- if (nd->path.mnt != path->mnt)
- mntput(nd->path.mnt);
- }
- nd->path.mnt = path->mnt;
- nd->path.dentry = path->dentry;
-}
-
static int nd_jump_root(struct nameidata *nd)
{
if (unlikely(nd->flags & LOOKUP_BENEATH))
@@ -969,28 +938,21 @@ int sysctl_protected_regular __read_mostly;
*
* Returns 0 if following the symlink is allowed, -ve on error.
*/
-static inline int may_follow_link(struct nameidata *nd)
+static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
{
- const struct inode *inode;
- const struct inode *parent;
- kuid_t puid;
-
if (!sysctl_protected_symlinks)
return 0;
/* Allowed if owner and follower match. */
- inode = nd->link_inode;
if (uid_eq(current_cred()->fsuid, inode->i_uid))
return 0;
/* Allowed if parent directory not sticky and world-writable. */
- parent = nd->inode;
- if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
+ if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
return 0;
/* Allowed if parent directory and link owner match. */
- puid = parent->i_uid;
- if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
+ if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid))
return 0;
if (nd->flags & LOOKUP_RCU)
@@ -1113,63 +1075,6 @@ static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
return 0;
}
-static __always_inline
-const char *get_link(struct nameidata *nd)
-{
- struct saved *last = nd->stack + nd->depth - 1;
- struct dentry *dentry = last->link.dentry;
- struct inode *inode = nd->link_inode;
- int error;
- const char *res;
-
- if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS))
- return ERR_PTR(-ELOOP);
-
- if (!(nd->flags & LOOKUP_RCU)) {
- touch_atime(&last->link);
- cond_resched();
- } else if (atime_needs_update(&last->link, inode)) {
- if (unlikely(unlazy_walk(nd)))
- return ERR_PTR(-ECHILD);
- touch_atime(&last->link);
- }
-
- error = security_inode_follow_link(dentry, inode,
- nd->flags & LOOKUP_RCU);
- if (unlikely(error))
- return ERR_PTR(error);
-
- nd->last_type = LAST_BIND;
- res = READ_ONCE(inode->i_link);
- if (!res) {
- const char * (*get)(struct dentry *, struct inode *,
- struct delayed_call *);
- get = inode->i_op->get_link;
- if (nd->flags & LOOKUP_RCU) {
- res = get(NULL, inode, &last->done);
- if (res == ERR_PTR(-ECHILD)) {
- if (unlikely(unlazy_walk(nd)))
- return ERR_PTR(-ECHILD);
- res = get(dentry, inode, &last->done);
- }
- } else {
- res = get(dentry, inode, &last->done);
- }
- if (IS_ERR_OR_NULL(res))
- return res;
- }
- if (*res == '/') {
- error = nd_jump_root(nd);
- if (unlikely(error))
- return ERR_PTR(error);
- while (unlikely(*++res == '/'))
- ;
- }
- if (!*res)
- res = NULL;
- return res;
-}
-
/*
* follow_up - Find the mountpoint of path's vfsmount
*
@@ -1203,19 +1108,59 @@ int follow_up(struct path *path)
}
EXPORT_SYMBOL(follow_up);
+static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
+ struct path *path, unsigned *seqp)
+{
+ while (mnt_has_parent(m)) {
+ struct dentry *mountpoint = m->mnt_mountpoint;
+
+ m = m->mnt_parent;
+ if (unlikely(root->dentry == mountpoint &&
+ root->mnt == &m->mnt))
+ break;
+ if (mountpoint != m->mnt.mnt_root) {
+ path->mnt = &m->mnt;
+ path->dentry = mountpoint;
+ *seqp = read_seqcount_begin(&mountpoint->d_seq);
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool choose_mountpoint(struct mount *m, const struct path *root,
+ struct path *path)
+{
+ bool found;
+
+ rcu_read_lock();
+ while (1) {
+ unsigned seq, mseq = read_seqbegin(&mount_lock);
+
+ found = choose_mountpoint_rcu(m, root, path, &seq);
+ if (unlikely(!found)) {
+ if (!read_seqretry(&mount_lock, mseq))
+ break;
+ } else {
+ if (likely(__legitimize_path(path, seq, mseq)))
+ break;
+ rcu_read_unlock();
+ path_put(path);
+ rcu_read_lock();
+ }
+ }
+ rcu_read_unlock();
+ return found;
+}
+
/*
* Perform an automount
* - return -EISDIR to tell follow_managed() to stop and return the path we
* were called with.
*/
-static int follow_automount(struct path *path, struct nameidata *nd,
- bool *need_mntput)
+static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
{
- struct vfsmount *mnt;
- int err;
-
- if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
- return -EREMOTE;
+ struct dentry *dentry = path->dentry;
/* We don't want to mount if someone's just doing a stat -
* unless they're stat'ing a directory and appended a '/' to
@@ -1228,138 +1173,91 @@ static int follow_automount(struct path *path, struct nameidata *nd,
* as being automount points. These will need the attentions
* of the daemon to instantiate them before they can be used.
*/
- if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
+ if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
- path->dentry->d_inode)
+ dentry->d_inode)
return -EISDIR;
- nd->total_link_count++;
- if (nd->total_link_count >= 40)
+ if (count && (*count)++ >= MAXSYMLINKS)
return -ELOOP;
- mnt = path->dentry->d_op->d_automount(path);
- if (IS_ERR(mnt)) {
- /*
- * The filesystem is allowed to return -EISDIR here to indicate
- * it doesn't want to automount. For instance, autofs would do
- * this so that its userspace daemon can mount on this dentry.
- *
- * However, we can only permit this if it's a terminal point in
- * the path being looked up; if it wasn't then the remainder of
- * the path is inaccessible and we should say so.
- */
- if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
- return -EREMOTE;
- return PTR_ERR(mnt);
- }
-
- if (!mnt) /* mount collision */
- return 0;
-
- if (!*need_mntput) {
- /* lock_mount() may release path->mnt on error */
- mntget(path->mnt);
- *need_mntput = true;
- }
- err = finish_automount(mnt, path);
-
- switch (err) {
- case -EBUSY:
- /* Someone else made a mount here whilst we were busy */
- return 0;
- case 0:
- path_put(path);
- path->mnt = mnt;
- path->dentry = dget(mnt->mnt_root);
- return 0;
- default:
- return err;
- }
-
+ return finish_automount(dentry->d_op->d_automount(path), path);
}
/*
- * Handle a dentry that is managed in some way.
- * - Flagged for transit management (autofs)
- * - Flagged as mountpoint
- * - Flagged as automount point
- *
- * This may only be called in refwalk mode.
- * On success path->dentry is known positive.
- *
- * Serialization is taken care of in namespace.c
+ * mount traversal - out-of-line part. One note on ->d_flags accesses -
+ * dentries are pinned but not locked here, so negative dentry can go
+ * positive right under us. Use of smp_load_acquire() provides a barrier
+ * sufficient for ->d_inode and ->d_flags consistency.
*/
-static int follow_managed(struct path *path, struct nameidata *nd)
+static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
+ int *count, unsigned lookup_flags)
{
- struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
- unsigned flags;
+ struct vfsmount *mnt = path->mnt;
bool need_mntput = false;
int ret = 0;
- /* Given that we're not holding a lock here, we retain the value in a
- * local variable for each dentry as we look at it so that we don't see
- * the components of that value change under us */
- while (flags = smp_load_acquire(&path->dentry->d_flags),
- unlikely(flags & DCACHE_MANAGED_DENTRY)) {
+ while (flags & DCACHE_MANAGED_DENTRY) {
/* Allow the filesystem to manage the transit without i_mutex
* being held. */
if (flags & DCACHE_MANAGE_TRANSIT) {
- BUG_ON(!path->dentry->d_op);
- BUG_ON(!path->dentry->d_op->d_manage);
ret = path->dentry->d_op->d_manage(path, false);
flags = smp_load_acquire(&path->dentry->d_flags);
if (ret < 0)
break;
}
- /* Transit to a mounted filesystem. */
- if (flags & DCACHE_MOUNTED) {
+ if (flags & DCACHE_MOUNTED) { // something's mounted on it..
struct vfsmount *mounted = lookup_mnt(path);
- if (mounted) {
+ if (mounted) { // ... in our namespace
dput(path->dentry);
if (need_mntput)
mntput(path->mnt);
path->mnt = mounted;
path->dentry = dget(mounted->mnt_root);
+ // here we know it's positive
+ flags = path->dentry->d_flags;
need_mntput = true;
continue;
}
-
- /* Something is mounted on this dentry in another
- * namespace and/or whatever was mounted there in this
- * namespace got unmounted before lookup_mnt() could
- * get it */
}
- /* Handle an automount point */
- if (flags & DCACHE_NEED_AUTOMOUNT) {
- ret = follow_automount(path, nd, &need_mntput);
- if (ret < 0)
- break;
- continue;
- }
+ if (!(flags & DCACHE_NEED_AUTOMOUNT))
+ break;
- /* We didn't change the current path point */
- break;
+ // uncovered automount point
+ ret = follow_automount(path, count, lookup_flags);
+ flags = smp_load_acquire(&path->dentry->d_flags);
+ if (ret < 0)
+ break;
}
- if (need_mntput) {
- if (path->mnt == mnt)
- mntput(path->mnt);
- if (unlikely(nd->flags & LOOKUP_NO_XDEV))
- ret = -EXDEV;
- else
- nd->flags |= LOOKUP_JUMPED;
- }
- if (ret == -EISDIR || !ret)
- ret = 1;
- if (ret > 0 && unlikely(d_flags_negative(flags)))
+ if (ret == -EISDIR)
+ ret = 0;
+ // possible if you race with several mount --move
+ if (need_mntput && path->mnt == mnt)
+ mntput(path->mnt);
+ if (!ret && unlikely(d_flags_negative(flags)))
ret = -ENOENT;
- if (unlikely(ret < 0))
- path_put_conditional(path, nd);
+ *jumped = need_mntput;
return ret;
}
+static inline int traverse_mounts(struct path *path, bool *jumped,
+ int *count, unsigned lookup_flags)
+{
+ unsigned flags = smp_load_acquire(&path->dentry->d_flags);
+
+ /* fastpath */
+ if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
+ *jumped = false;
+ if (unlikely(d_flags_negative(flags)))
+ return -ENOENT;
+ return 0;
+ }
+ return __traverse_mounts(path, flags, jumped, count, lookup_flags);
+}
+
int follow_down_one(struct path *path)
{
struct vfsmount *mounted;
@@ -1376,11 +1274,22 @@ int follow_down_one(struct path *path)
}
EXPORT_SYMBOL(follow_down_one);
-static inline int managed_dentry_rcu(const struct path *path)
+/*
+ * Follow down to the covering mount currently visible to userspace. At each
+ * point, the filesystem owning that dentry may be queried as to whether the
+ * caller is permitted to proceed or not.
+ */
+int follow_down(struct path *path)
{
- return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
- path->dentry->d_op->d_manage(path, true) : 0;
+ struct vfsmount *mnt = path->mnt;
+ bool jumped;
+ int ret = traverse_mounts(path, &jumped, NULL, 0);
+
+ if (path->mnt != mnt)
+ mntput(mnt);
+ return ret;
}
+EXPORT_SYMBOL(follow_down);
/*
* Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
@@ -1389,204 +1298,88 @@ static inline int managed_dentry_rcu(const struct path *path)
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
struct inode **inode, unsigned *seqp)
{
+ struct dentry *dentry = path->dentry;
+ unsigned int flags = dentry->d_flags;
+
+ if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
+ return true;
+
+ if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+ return false;
+
for (;;) {
- struct mount *mounted;
/*
* Don't forget we might have a non-mountpoint managed dentry
* that wants to block transit.
*/
- switch (managed_dentry_rcu(path)) {
- case -ECHILD:
- default:
- return false;
- case -EISDIR:
- return true;
- case 0:
- break;
- }
-
- if (!d_mountpoint(path->dentry))
- return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
-
- mounted = __lookup_mnt(path->mnt, path->dentry);
- if (!mounted)
- break;
- if (unlikely(nd->flags & LOOKUP_NO_XDEV))
- return false;
- path->mnt = &mounted->mnt;
- path->dentry = mounted->mnt.mnt_root;
- nd->flags |= LOOKUP_JUMPED;
- *seqp = read_seqcount_begin(&path->dentry->d_seq);
- /*
- * Update the inode too. We don't need to re-check the
- * dentry sequence number here after this d_inode read,
- * because a mount-point is always pinned.
- */
- *inode = path->dentry->d_inode;
- }
- return !read_seqretry(&mount_lock, nd->m_seq) &&
- !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
-}
-
-static int follow_dotdot_rcu(struct nameidata *nd)
-{
- struct inode *inode = nd->inode;
-
- while (1) {
- if (path_equal(&nd->path, &nd->root)) {
- if (unlikely(nd->flags & LOOKUP_BENEATH))
- return -ECHILD;
- break;
+ if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
+ int res = dentry->d_op->d_manage(path, true);
+ if (res)
+ return res == -EISDIR;
+ flags = dentry->d_flags;
}
- if (nd->path.dentry != nd->path.mnt->mnt_root) {
- struct dentry *old = nd->path.dentry;
- struct dentry *parent = old->d_parent;
- unsigned seq;
- inode = parent->d_inode;
- seq = read_seqcount_begin(&parent->d_seq);
- if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
- return -ECHILD;
- nd->path.dentry = parent;
- nd->seq = seq;
- if (unlikely(!path_connected(&nd->path)))
- return -ECHILD;
- break;
- } else {
- struct mount *mnt = real_mount(nd->path.mnt);
- struct mount *mparent = mnt->mnt_parent;
- struct dentry *mountpoint = mnt->mnt_mountpoint;
- struct inode *inode2 = mountpoint->d_inode;
- unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
- if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
- return -ECHILD;
- if (&mparent->mnt == nd->path.mnt)
- break;
- if (unlikely(nd->flags & LOOKUP_NO_XDEV))
- return -ECHILD;
- /* we know that mountpoint was pinned */
- nd->path.dentry = mountpoint;
- nd->path.mnt = &mparent->mnt;
- inode = inode2;
- nd->seq = seq;
+ if (flags & DCACHE_MOUNTED) {
+ struct mount *mounted = __lookup_mnt(path->mnt, dentry);
+ if (mounted) {
+ path->mnt = &mounted->mnt;
+ dentry = path->dentry = mounted->mnt.mnt_root;
+ nd->flags |= LOOKUP_JUMPED;
+ *seqp = read_seqcount_begin(&dentry->d_seq);
+ *inode = dentry->d_inode;
+ /*
+ * We don't need to re-check ->d_seq after this
+ * ->d_inode read - there will be an RCU delay
+ * between mount hash removal and ->mnt_root
+ * becoming unpinned.
+ */
+ flags = dentry->d_flags;
+ continue;
+ }
+ if (read_seqretry(&mount_lock, nd->m_seq))
+ return false;
}
+ return !(flags & DCACHE_NEED_AUTOMOUNT);
}
- while (unlikely(d_mountpoint(nd->path.dentry))) {
- struct mount *mounted;
- mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
- if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
- return -ECHILD;
- if (!mounted)
- break;
- if (unlikely(nd->flags & LOOKUP_NO_XDEV))
- return -ECHILD;
- nd->path.mnt = &mounted->mnt;
- nd->path.dentry = mounted->mnt.mnt_root;
- inode = nd->path.dentry->d_inode;
- nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
- }
- nd->inode = inode;
- return 0;
}
-/*
- * Follow down to the covering mount currently visible to userspace. At each
- * point, the filesystem owning that dentry may be queried as to whether the
- * caller is permitted to proceed or not.
- */
-int follow_down(struct path *path)
+static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
+ struct path *path, struct inode **inode,
+ unsigned int *seqp)
{
- unsigned managed;
+ bool jumped;
int ret;
- while (managed = READ_ONCE(path->dentry->d_flags),
- unlikely(managed & DCACHE_MANAGED_DENTRY)) {
- /* Allow the filesystem to manage the transit without i_mutex
- * being held.
- *
- * We indicate to the filesystem if someone is trying to mount
- * something here. This gives autofs the chance to deny anyone
- * other than its daemon the right to mount on its
- * superstructure.
- *
- * The filesystem may sleep at this point.
- */
- if (managed & DCACHE_MANAGE_TRANSIT) {
- BUG_ON(!path->dentry->d_op);
- BUG_ON(!path->dentry->d_op->d_manage);
- ret = path->dentry->d_op->d_manage(path, false);
- if (ret < 0)
- return ret == -EISDIR ? 0 : ret;
- }
-
- /* Transit to a mounted filesystem. */
- if (managed & DCACHE_MOUNTED) {
- struct vfsmount *mounted = lookup_mnt(path);
- if (!mounted)
- break;
- dput(path->dentry);
- mntput(path->mnt);
- path->mnt = mounted;
- path->dentry = dget(mounted->mnt_root);
- continue;
- }
-
- /* Don't handle automount points here */
- break;
- }
- return 0;
-}
-EXPORT_SYMBOL(follow_down);
-
-/*
- * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
- */
-static void follow_mount(struct path *path)
-{
- while (d_mountpoint(path->dentry)) {
- struct vfsmount *mounted = lookup_mnt(path);
- if (!mounted)
- break;
- dput(path->dentry);
- mntput(path->mnt);
- path->mnt = mounted;
- path->dentry = dget(mounted->mnt_root);
+ path->mnt = nd->path.mnt;
+ path->dentry = dentry;
+ if (nd->flags & LOOKUP_RCU) {
+ unsigned int seq = *seqp;
+ if (unlikely(!*inode))
+ return -ENOENT;
+ if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
+ return 0;
+ if (unlazy_child(nd, dentry, seq))
+ return -ECHILD;
+ // *path might've been clobbered by __follow_mount_rcu()
+ path->mnt = nd->path.mnt;
+ path->dentry = dentry;
}
-}
-
-static int path_parent_directory(struct path *path)
-{
- struct dentry *old = path->dentry;
- /* rare case of legitimate dget_parent()... */
- path->dentry = dget_parent(path->dentry);
- dput(old);
- if (unlikely(!path_connected(path)))
- return -ENOENT;
- return 0;
-}
-
-static int follow_dotdot(struct nameidata *nd)
-{
- while (1) {
- if (path_equal(&nd->path, &nd->root)) {
- if (unlikely(nd->flags & LOOKUP_BENEATH))
- return -EXDEV;
- break;
- }
- if (nd->path.dentry != nd->path.mnt->mnt_root) {
- int ret = path_parent_directory(&nd->path);
- if (ret)
- return ret;
- break;
- }
- if (!follow_up(&nd->path))
- break;
+ ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
+ if (jumped) {
if (unlikely(nd->flags & LOOKUP_NO_XDEV))
- return -EXDEV;
+ ret = -EXDEV;
+ else
+ nd->flags |= LOOKUP_JUMPED;
}
- follow_mount(&nd->path);
- nd->inode = nd->path.dentry->d_inode;
- return 0;
+ if (unlikely(ret)) {
+ dput(path->dentry);
+ if (path->mnt != nd->path.mnt)
+ mntput(path->mnt);
+ } else {
+ *inode = d_backing_inode(path->dentry);
+ *seqp = 0; /* out of RCU mode, so the value doesn't matter */
+ }
+ return ret;
}
/*
@@ -1643,14 +1436,12 @@ static struct dentry *__lookup_hash(const struct qstr *name,
return dentry;
}
-static int lookup_fast(struct nameidata *nd,
- struct path *path, struct inode **inode,
- unsigned *seqp)
+static struct dentry *lookup_fast(struct nameidata *nd,
+ struct inode **inode,
+ unsigned *seqp)
{
- struct vfsmount *mnt = nd->path.mnt;
struct dentry *dentry, *parent = nd->path.dentry;
int status = 1;
- int err;
/*
* Rename seqlock is not required here because in the off chance
@@ -1659,12 +1450,11 @@ static int lookup_fast(struct nameidata *nd,
*/
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
- bool negative;
dentry = __d_lookup_rcu(parent, &nd->last, &seq);
if (unlikely(!dentry)) {
if (unlazy_walk(nd))
- return -ECHILD;
- return 0;
+ return ERR_PTR(-ECHILD);
+ return NULL;
}
/*
@@ -1672,9 +1462,8 @@ static int lookup_fast(struct nameidata *nd,
* the dentry name information from lookup.
*/
*inode = d_backing_inode(dentry);
- negative = d_is_negative(dentry);
if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
- return -ECHILD;
+ return ERR_PTR(-ECHILD);
/*
* This sequence count validates that the parent had no
@@ -1684,46 +1473,30 @@ static int lookup_fast(struct nameidata *nd,
* enough, we can use __read_seqcount_retry here.
*/
if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
- return -ECHILD;
+ return ERR_PTR(-ECHILD);
*seqp = seq;
status = d_revalidate(dentry, nd->flags);
- if (likely(status > 0)) {
- /*
- * Note: do negative dentry check after revalidation in
- * case that drops it.
- */
- if (unlikely(negative))
- return -ENOENT;
- path->mnt = mnt;
- path->dentry = dentry;
- if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
- return 1;
- }
+ if (likely(status > 0))
+ return dentry;
if (unlazy_child(nd, dentry, seq))
- return -ECHILD;
+ return ERR_PTR(-ECHILD);
if (unlikely(status == -ECHILD))
/* we'd been told to redo it in non-rcu mode */
status = d_revalidate(dentry, nd->flags);
} else {
dentry = __d_lookup(parent, &nd->last);
if (unlikely(!dentry))
- return 0;
+ return NULL;
status = d_revalidate(dentry, nd->flags);
}
if (unlikely(status <= 0)) {
if (!status)
d_invalidate(dentry);
dput(dentry);
- return status;
+ return ERR_PTR(status);
}
-
- path->mnt = mnt;
- path->dentry = dentry;
- err = follow_managed(path, nd);
- if (likely(err > 0))
- *inode = d_backing_inode(path->dentry);
- return err;
+ return dentry;
}
/* Fast lookup failed, do it the slow way */
@@ -1788,81 +1561,107 @@ static inline int may_lookup(struct nameidata *nd)
return inode_permission(nd->inode, MAY_EXEC);
}
-static inline int handle_dots(struct nameidata *nd, int type)
+static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
{
- if (type == LAST_DOTDOT) {
- int error = 0;
+ if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
+ return -ELOOP;
- if (!nd->root.mnt) {
- error = set_root(nd);
- if (error)
- return error;
- }
- if (nd->flags & LOOKUP_RCU)
- error = follow_dotdot_rcu(nd);
- else
- error = follow_dotdot(nd);
- if (error)
- return error;
+ if (likely(nd->depth != EMBEDDED_LEVELS))
+ return 0;
+ if (likely(nd->stack != nd->internal))
+ return 0;
+ if (likely(nd_alloc_stack(nd)))
+ return 0;
- if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
- /*
- * If there was a racing rename or mount along our
- * path, then we can't be sure that ".." hasn't jumped
- * above nd->root (and so userspace should retry or use
- * some fallback).
- */
- smp_rmb();
- if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
- return -EAGAIN;
- if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
- return -EAGAIN;
- }
+ if (nd->flags & LOOKUP_RCU) {
+ // we need to grab link before we do unlazy. And we can't skip
+ // unlazy even if we fail to grab the link - cleanup needs it
+ bool grabbed_link = legitimize_path(nd, link, seq);
+
+ if (unlazy_walk(nd) != 0 || !grabbed_link)
+ return -ECHILD;
+
+ if (nd_alloc_stack(nd))
+ return 0;
}
- return 0;
+ return -ENOMEM;
}
-static int pick_link(struct nameidata *nd, struct path *link,
- struct inode *inode, unsigned seq)
+enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
+
+static const char *pick_link(struct nameidata *nd, struct path *link,
+ struct inode *inode, unsigned seq, int flags)
{
- int error;
struct saved *last;
- if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
- path_to_nameidata(link, nd);
- return -ELOOP;
- }
- if (!(nd->flags & LOOKUP_RCU)) {
- if (link->mnt == nd->path.mnt)
- mntget(link->mnt);
- }
- error = nd_alloc_stack(nd);
+ const char *res;
+ int error = reserve_stack(nd, link, seq);
+
if (unlikely(error)) {
- if (error == -ECHILD) {
- if (unlikely(!legitimize_path(nd, link, seq))) {
- drop_links(nd);
- nd->depth = 0;
- nd->flags &= ~LOOKUP_RCU;
- nd->path.mnt = NULL;
- nd->path.dentry = NULL;
- rcu_read_unlock();
- } else if (likely(unlazy_walk(nd)) == 0)
- error = nd_alloc_stack(nd);
- }
- if (error) {
+ if (!(nd->flags & LOOKUP_RCU))
path_put(link);
- return error;
- }
+ return ERR_PTR(error);
}
-
last = nd->stack + nd->depth++;
last->link = *link;
clear_delayed_call(&last->done);
- nd->link_inode = inode;
last->seq = seq;
- return 1;
-}
-enum {WALK_FOLLOW = 1, WALK_MORE = 2};
+ if (flags & WALK_TRAILING) {
+ error = may_follow_link(nd, inode);
+ if (unlikely(error))
+ return ERR_PTR(error);
+ }
+
+ if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS))
+ return ERR_PTR(-ELOOP);
+
+ if (!(nd->flags & LOOKUP_RCU)) {
+ touch_atime(&last->link);
+ cond_resched();
+ } else if (atime_needs_update(&last->link, inode)) {
+ if (unlikely(unlazy_walk(nd)))
+ return ERR_PTR(-ECHILD);
+ touch_atime(&last->link);
+ }
+
+ error = security_inode_follow_link(link->dentry, inode,
+ nd->flags & LOOKUP_RCU);
+ if (unlikely(error))
+ return ERR_PTR(error);
+
+ res = READ_ONCE(inode->i_link);
+ if (!res) {
+ const char * (*get)(struct dentry *, struct inode *,
+ struct delayed_call *);
+ get = inode->i_op->get_link;
+ if (nd->flags & LOOKUP_RCU) {
+ res = get(NULL, inode, &last->done);
+ if (res == ERR_PTR(-ECHILD)) {
+ if (unlikely(unlazy_walk(nd)))
+ return ERR_PTR(-ECHILD);
+ res = get(link->dentry, inode, &last->done);
+ }
+ } else {
+ res = get(link->dentry, inode, &last->done);
+ }
+ if (!res)
+ goto all_done;
+ if (IS_ERR(res))
+ return res;
+ }
+ if (*res == '/') {
+ error = nd_jump_root(nd);
+ if (unlikely(error))
+ return ERR_PTR(error);
+ while (unlikely(*++res == '/'))
+ ;
+ }
+ if (*res)
+ return res;
+all_done: // pure jump
+ put_link(nd);
+ return NULL;
+}
/*
* Do we need to follow links? We _really_ want to be able
@@ -1870,63 +1669,187 @@ enum {WALK_FOLLOW = 1, WALK_MORE = 2};
* so we keep a cache of "no, this doesn't need follow_link"
* for the common case.
*/
-static inline int step_into(struct nameidata *nd, struct path *path,
- int flags, struct inode *inode, unsigned seq)
+static const char *step_into(struct nameidata *nd, int flags,
+ struct dentry *dentry, struct inode *inode, unsigned seq)
{
- if (!(flags & WALK_MORE) && nd->depth)
- put_link(nd);
- if (likely(!d_is_symlink(path->dentry)) ||
- !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) {
+ struct path path;
+ int err = handle_mounts(nd, dentry, &path, &inode, &seq);
+
+ if (err < 0)
+ return ERR_PTR(err);
+ if (likely(!d_is_symlink(path.dentry)) ||
+ ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
+ (flags & WALK_NOFOLLOW)) {
/* not a symlink or should not follow */
- path_to_nameidata(path, nd);
+ if (!(nd->flags & LOOKUP_RCU)) {
+ dput(nd->path.dentry);
+ if (nd->path.mnt != path.mnt)
+ mntput(nd->path.mnt);
+ }
+ nd->path = path;
nd->inode = inode;
nd->seq = seq;
- return 0;
+ return NULL;
}
- /* make sure that d_is_symlink above matches inode */
if (nd->flags & LOOKUP_RCU) {
- if (read_seqcount_retry(&path->dentry->d_seq, seq))
- return -ECHILD;
+ /* make sure that d_is_symlink above matches inode */
+ if (read_seqcount_retry(&path.dentry->d_seq, seq))
+ return ERR_PTR(-ECHILD);
+ } else {
+ if (path.mnt == nd->path.mnt)
+ mntget(path.mnt);
}
- return pick_link(nd, path, inode, seq);
+ return pick_link(nd, &path, inode, seq, flags);
}
-static int walk_component(struct nameidata *nd, int flags)
+static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
+ struct inode **inodep,
+ unsigned *seqp)
{
- struct path path;
+ struct dentry *parent, *old;
+
+ if (path_equal(&nd->path, &nd->root))
+ goto in_root;
+ if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
+ struct path path;
+ unsigned seq;
+ if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
+ &nd->root, &path, &seq))
+ goto in_root;
+ if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+ return ERR_PTR(-ECHILD);
+ nd->path = path;
+ nd->inode = path.dentry->d_inode;
+ nd->seq = seq;
+ if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+ return ERR_PTR(-ECHILD);
+ /* we know that mountpoint was pinned */
+ }
+ old = nd->path.dentry;
+ parent = old->d_parent;
+ *inodep = parent->d_inode;
+ *seqp = read_seqcount_begin(&parent->d_seq);
+ if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
+ return ERR_PTR(-ECHILD);
+ if (unlikely(!path_connected(nd->path.mnt, parent)))
+ return ERR_PTR(-ECHILD);
+ return parent;
+in_root:
+ if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+ return ERR_PTR(-ECHILD);
+ if (unlikely(nd->flags & LOOKUP_BENEATH))
+ return ERR_PTR(-ECHILD);
+ return NULL;
+}
+
+static struct dentry *follow_dotdot(struct nameidata *nd,
+ struct inode **inodep,
+ unsigned *seqp)
+{
+ struct dentry *parent;
+
+ if (path_equal(&nd->path, &nd->root))
+ goto in_root;
+ if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
+ struct path path;
+
+ if (!choose_mountpoint(real_mount(nd->path.mnt),
+ &nd->root, &path))
+ goto in_root;
+ path_put(&nd->path);
+ nd->path = path;
+ nd->inode = path.dentry->d_inode;
+ if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+ return ERR_PTR(-EXDEV);
+ }
+ /* rare case of legitimate dget_parent()... */
+ parent = dget_parent(nd->path.dentry);
+ if (unlikely(!path_connected(nd->path.mnt, parent))) {
+ dput(parent);
+ return ERR_PTR(-ENOENT);
+ }
+ *seqp = 0;
+ *inodep = parent->d_inode;
+ return parent;
+
+in_root:
+ if (unlikely(nd->flags & LOOKUP_BENEATH))
+ return ERR_PTR(-EXDEV);
+ dget(nd->path.dentry);
+ return NULL;
+}
+
+static const char *handle_dots(struct nameidata *nd, int type)
+{
+ if (type == LAST_DOTDOT) {
+ const char *error = NULL;
+ struct dentry *parent;
+ struct inode *inode;
+ unsigned seq;
+
+ if (!nd->root.mnt) {
+ error = ERR_PTR(set_root(nd));
+ if (error)
+ return error;
+ }
+ if (nd->flags & LOOKUP_RCU)
+ parent = follow_dotdot_rcu(nd, &inode, &seq);
+ else
+ parent = follow_dotdot(nd, &inode, &seq);
+ if (IS_ERR(parent))
+ return ERR_CAST(parent);
+ if (unlikely(!parent))
+ error = step_into(nd, WALK_NOFOLLOW,
+ nd->path.dentry, nd->inode, nd->seq);
+ else
+ error = step_into(nd, WALK_NOFOLLOW,
+ parent, inode, seq);
+ if (unlikely(error))
+ return error;
+
+ if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
+ /*
+ * If there was a racing rename or mount along our
+ * path, then we can't be sure that ".." hasn't jumped
+ * above nd->root (and so userspace should retry or use
+ * some fallback).
+ */
+ smp_rmb();
+ if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
+ return ERR_PTR(-EAGAIN);
+ if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
+ return ERR_PTR(-EAGAIN);
+ }
+ }
+ return NULL;
+}
+
+static const char *walk_component(struct nameidata *nd, int flags)
+{
+ struct dentry *dentry;
struct inode *inode;
unsigned seq;
- int err;
/*
* "." and ".." are special - ".." especially so because it has
* to be able to know about the current root directory and
* parent relationships.
*/
if (unlikely(nd->last_type != LAST_NORM)) {
- err = handle_dots(nd, nd->last_type);
if (!(flags & WALK_MORE) && nd->depth)
put_link(nd);
- return err;
+ return handle_dots(nd, nd->last_type);
}
- err = lookup_fast(nd, &path, &inode, &seq);
- if (unlikely(err <= 0)) {
- if (err < 0)
- return err;
- path.dentry = lookup_slow(&nd->last, nd->path.dentry,
- nd->flags);
- if (IS_ERR(path.dentry))
- return PTR_ERR(path.dentry);
-
- path.mnt = nd->path.mnt;
- err = follow_managed(&path, nd);
- if (unlikely(err < 0))
- return err;
-
- seq = 0; /* we are already out of RCU mode */
- inode = d_backing_inode(path.dentry);
+ dentry = lookup_fast(nd, &inode, &seq);
+ if (IS_ERR(dentry))
+ return ERR_CAST(dentry);
+ if (unlikely(!dentry)) {
+ dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
+ if (IS_ERR(dentry))
+ return ERR_CAST(dentry);
}
-
- return step_into(nd, &path, flags, inode, seq);
+ if (!(flags & WALK_MORE) && nd->depth)
+ put_link(nd);
+ return step_into(nd, flags, dentry, inode, seq);
}
/*
@@ -2167,8 +2090,11 @@ static inline u64 hash_name(const void *salt, const char *name)
*/
static int link_path_walk(const char *name, struct nameidata *nd)
{
+ int depth = 0; // depth <= nd->depth
int err;
+ nd->last_type = LAST_ROOT;
+ nd->flags |= LOOKUP_PARENT;
if (IS_ERR(name))
return PTR_ERR(name);
while (*name=='/')
@@ -2178,6 +2104,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
/* At this point we know we have a real path component. */
for(;;) {
+ const char *link;
u64 hash_len;
int type;
@@ -2227,36 +2154,27 @@ static int link_path_walk(const char *name, struct nameidata *nd)
} while (unlikely(*name == '/'));
if (unlikely(!*name)) {
OK:
- /* pathname body, done */
- if (!nd->depth)
- return 0;
- name = nd->stack[nd->depth - 1].name;
- /* trailing symlink, done */
- if (!name)
+ /* pathname or trailing symlink, done */
+ if (!depth) {
+ nd->dir_uid = nd->inode->i_uid;
+ nd->dir_mode = nd->inode->i_mode;
+ nd->flags &= ~LOOKUP_PARENT;
return 0;
+ }
/* last component of nested symlink */
- err = walk_component(nd, WALK_FOLLOW);
+ name = nd->stack[--depth].name;
+ link = walk_component(nd, 0);
} else {
/* not the last component */
- err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
+ link = walk_component(nd, WALK_MORE);
}
- if (err < 0)
- return err;
-
- if (err) {
- const char *s = get_link(nd);
-
- if (IS_ERR(s))
- return PTR_ERR(s);
- err = 0;
- if (unlikely(!s)) {
- /* jumped */
- put_link(nd);
- } else {
- nd->stack[nd->depth - 1].name = name;
- name = s;
- continue;
- }
+ if (unlikely(link)) {
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+ /* a symlink to follow */
+ nd->stack[depth++].name = name;
+ name = link;
+ continue;
}
if (unlikely(!d_can_lookup(nd->path.dentry))) {
if (nd->flags & LOOKUP_RCU) {
@@ -2279,8 +2197,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
if (flags & LOOKUP_RCU)
rcu_read_lock();
- nd->last_type = LAST_ROOT; /* if there are only slashes... */
- nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
+ nd->flags = flags | LOOKUP_JUMPED;
nd->depth = 0;
nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
@@ -2370,54 +2287,20 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
return s;
}
-static const char *trailing_symlink(struct nameidata *nd)
-{
- const char *s;
- int error = may_follow_link(nd);
- if (unlikely(error))
- return ERR_PTR(error);
- nd->flags |= LOOKUP_PARENT;
- nd->stack[0].name = NULL;
- s = get_link(nd);
- return s ? s : "";
-}
-
-static inline int lookup_last(struct nameidata *nd)
+static inline const char *lookup_last(struct nameidata *nd)
{
if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
- nd->flags &= ~LOOKUP_PARENT;
- return walk_component(nd, 0);
+ return walk_component(nd, WALK_TRAILING);
}
static int handle_lookup_down(struct nameidata *nd)
{
- struct path path = nd->path;
- struct inode *inode = nd->inode;
- unsigned seq = nd->seq;
- int err;
-
- if (nd->flags & LOOKUP_RCU) {
- /*
- * don't bother with unlazy_walk on failure - we are
- * at the very beginning of walk, so we lose nothing
- * if we simply redo everything in non-RCU mode
- */
- if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq)))
- return -ECHILD;
- } else {
- dget(path.dentry);
- err = follow_managed(&path, nd);
- if (unlikely(err < 0))
- return err;
- inode = d_backing_inode(path.dentry);
- seq = 0;
- }
- path_to_nameidata(&path, nd);
- nd->inode = inode;
- nd->seq = seq;
- return 0;
+ if (!(nd->flags & LOOKUP_RCU))
+ dget(nd->path.dentry);
+ return PTR_ERR(step_into(nd, WALK_NOFOLLOW,
+ nd->path.dentry, nd->inode, nd->seq));
}
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
@@ -2432,16 +2315,19 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
s = ERR_PTR(err);
}
- while (!(err = link_path_walk(s, nd))
- && ((err = lookup_last(nd)) > 0)) {
- s = trailing_symlink(nd);
- }
+ while (!(err = link_path_walk(s, nd)) &&
+ (s = lookup_last(nd)) != NULL)
+ ;
if (!err)
err = complete_walk(nd);
if (!err && nd->flags & LOOKUP_DIRECTORY)
if (!d_can_lookup(nd->path.dentry))
err = -ENOTDIR;
+ if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
+ err = handle_lookup_down(nd);
+ nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please...
+ }
if (!err) {
*path = nd->path;
nd->path.mnt = NULL;
@@ -2470,7 +2356,8 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags,
retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
if (likely(!retval))
- audit_inode(name, path->dentry, 0);
+ audit_inode(name, path->dentry,
+ flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
restore_nameidata();
putname(name);
return retval;
@@ -2718,24 +2605,23 @@ int path_pts(struct path *path)
/* Find something mounted on "pts" in the same directory as
* the input path.
*/
- struct dentry *child, *parent;
- struct qstr this;
- int ret;
-
- ret = path_parent_directory(path);
- if (ret)
- return ret;
+ struct dentry *parent = dget_parent(path->dentry);
+ struct dentry *child;
+ struct qstr this = QSTR_INIT("pts", 3);
- parent = path->dentry;
- this.name = "pts";
- this.len = 3;
+ if (unlikely(!path_connected(path->mnt, parent))) {
+ dput(parent);
+ return -ENOENT;
+ }
+ dput(path->dentry);
+ path->dentry = parent;
child = d_hash_and_lookup(parent, &this);
if (!child)
return -ENOENT;
path->dentry = child;
dput(parent);
- follow_mount(path);
+ follow_down(path);
return 0;
}
#endif
@@ -2748,88 +2634,6 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
}
EXPORT_SYMBOL(user_path_at_empty);
-/**
- * path_mountpoint - look up a path to be umounted
- * @nd: lookup context
- * @flags: lookup flags
- * @path: pointer to container for result
- *
- * Look up the given name, but don't attempt to revalidate the last component.
- * Returns 0 and "path" will be valid on success; Returns error otherwise.
- */
-static int
-path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
-{
- const char *s = path_init(nd, flags);
- int err;
-
- while (!(err = link_path_walk(s, nd)) &&
- (err = lookup_last(nd)) > 0) {
- s = trailing_symlink(nd);
- }
- if (!err && (nd->flags & LOOKUP_RCU))
- err = unlazy_walk(nd);
- if (!err)
- err = handle_lookup_down(nd);
- if (!err) {
- *path = nd->path;
- nd->path.mnt = NULL;
- nd->path.dentry = NULL;
- }
- terminate_walk(nd);
- return err;
-}
-
-static int
-filename_mountpoint(int dfd, struct filename *name, struct path *path,
- unsigned int flags)
-{
- struct nameidata nd;
- int error;
- if (IS_ERR(name))
- return PTR_ERR(name);
- set_nameidata(&nd, dfd, name);
- error = path_mountpoint(&nd, flags | LOOKUP_RCU, path);
- if (unlikely(error == -ECHILD))
- error = path_mountpoint(&nd, flags, path);
- if (unlikely(error == -ESTALE))
- error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
- if (likely(!error))
- audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL);
- restore_nameidata();
- putname(name);
- return error;
-}
-
-/**
- * user_path_mountpoint_at - lookup a path from userland in order to umount it
- * @dfd: directory file descriptor
- * @name: pathname from userland
- * @flags: lookup flags
- * @path: pointer to container to hold result
- *
- * A umount is a special case for path walking. We're not actually interested
- * in the inode in this situation, and ESTALE errors can be a problem. We
- * simply want track down the dentry and vfsmount attached at the mountpoint
- * and avoid revalidating the last component.
- *
- * Returns 0 and populates "path" on success.
- */
-int
-user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
- struct path *path)
-{
- return filename_mountpoint(dfd, getname(name), path, flags);
-}
-
-int
-kern_path_mountpoint(int dfd, const char *name, struct path *path,
- unsigned int flags)
-{
- return filename_mountpoint(dfd, getname_kernel(name), path, flags);
-}
-EXPORT_SYMBOL(kern_path_mountpoint);
-
int __check_sticky(struct inode *dir, struct inode *inode)
{
kuid_t fsuid = current_fsuid();
@@ -3127,18 +2931,14 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m
*
* Returns an error code otherwise.
*/
-static int atomic_open(struct nameidata *nd, struct dentry *dentry,
- struct path *path, struct file *file,
- const struct open_flags *op,
- int open_flag, umode_t mode)
+static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
+ struct file *file,
+ int open_flag, umode_t mode)
{
struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
struct inode *dir = nd->path.dentry->d_inode;
int error;
- if (!(~open_flag & (O_EXCL | O_CREAT))) /* both O_EXCL and O_CREAT */
- open_flag &= ~O_TRUNC;
-
if (nd->flags & LOOKUP_DIRECTORY)
open_flag |= O_DIRECTORY;
@@ -3149,19 +2949,10 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
d_lookup_done(dentry);
if (!error) {
if (file->f_mode & FMODE_OPENED) {
- /*
- * We didn't have the inode before the open, so check open
- * permission here.
- */
- int acc_mode = op->acc_mode;
- if (file->f_mode & FMODE_CREATED) {
- WARN_ON(!(open_flag & O_CREAT));
- fsnotify_create(dir, dentry);
- acc_mode = 0;
+ if (unlikely(dentry != file->f_path.dentry)) {
+ dput(dentry);
+ dentry = dget(file->f_path.dentry);
}
- error = may_open(&file->f_path, acc_mode, open_flag);
- if (WARN_ON(error > 0))
- error = -EINVAL;
} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
error = -EIO;
} else {
@@ -3169,19 +2960,15 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
dput(dentry);
dentry = file->f_path.dentry;
}
- if (file->f_mode & FMODE_CREATED)
- fsnotify_create(dir, dentry);
- if (unlikely(d_is_negative(dentry))) {
+ if (unlikely(d_is_negative(dentry)))
error = -ENOENT;
- } else {
- path->dentry = dentry;
- path->mnt = nd->path.mnt;
- return 0;
- }
}
}
- dput(dentry);
- return error;
+ if (error) {
+ dput(dentry);
+ dentry = ERR_PTR(error);
+ }
+ return dentry;
}
/*
@@ -3199,10 +2986,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
*
* An error code is returned on failure.
*/
-static int lookup_open(struct nameidata *nd, struct path *path,
- struct file *file,
- const struct open_flags *op,
- bool got_write)
+static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
+ const struct open_flags *op,
+ bool got_write)
{
struct dentry *dir = nd->path.dentry;
struct inode *dir_inode = dir->d_inode;
@@ -3213,7 +2999,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
if (unlikely(IS_DEADDIR(dir_inode)))
- return -ENOENT;
+ return ERR_PTR(-ENOENT);
file->f_mode &= ~FMODE_CREATED;
dentry = d_lookup(dir, &nd->last);
@@ -3221,7 +3007,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
if (!dentry) {
dentry = d_alloc_parallel(dir, &nd->last, &wq);
if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ return dentry;
}
if (d_in_lookup(dentry))
break;
@@ -3237,7 +3023,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
}
if (dentry->d_inode) {
/* Cached positive dentry: will open in f_op->open */
- goto out_no_open;
+ return dentry;
}
/*
@@ -3249,41 +3035,27 @@ static int lookup_open(struct nameidata *nd, struct path *path,
* Another problem is returing the "right" error value (e.g. for an
* O_EXCL open we want to return EEXIST not EROFS).
*/
+ if (unlikely(!got_write))
+ open_flag &= ~O_TRUNC;
if (open_flag & O_CREAT) {
+ if (open_flag & O_EXCL)
+ open_flag &= ~O_TRUNC;
if (!IS_POSIXACL(dir->d_inode))
mode &= ~current_umask();
- if (unlikely(!got_write)) {
- create_error = -EROFS;
- open_flag &= ~O_CREAT;
- if (open_flag & (O_EXCL | O_TRUNC))
- goto no_open;
- /* No side effects, safe to clear O_CREAT */
- } else {
+ if (likely(got_write))
create_error = may_o_create(&nd->path, dentry, mode);
- if (create_error) {
- open_flag &= ~O_CREAT;
- if (open_flag & O_EXCL)
- goto no_open;
- }
- }
- } else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) &&
- unlikely(!got_write)) {
- /*
- * No O_CREATE -> atomicity not a requirement -> fall
- * back to lookup + open
- */
- goto no_open;
+ else
+ create_error = -EROFS;
}
-
+ if (create_error)
+ open_flag &= ~O_CREAT;
if (dir_inode->i_op->atomic_open) {
- error = atomic_open(nd, dentry, path, file, op, open_flag,
- mode);
- if (unlikely(error == -ENOENT) && create_error)
- error = create_error;
- return error;
+ dentry = atomic_open(nd, dentry, file, open_flag, mode);
+ if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
+ dentry = ERR_PTR(create_error);
+ return dentry;
}
-no_open:
if (d_in_lookup(dentry)) {
struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
nd->flags);
@@ -3310,78 +3082,60 @@ no_open:
open_flag & O_EXCL);
if (error)
goto out_dput;
- fsnotify_create(dir_inode, dentry);
}
if (unlikely(create_error) && !dentry->d_inode) {
error = create_error;
goto out_dput;
}
-out_no_open:
- path->dentry = dentry;
- path->mnt = nd->path.mnt;
- return 0;
+ return dentry;
out_dput:
dput(dentry);
- return error;
+ return ERR_PTR(error);
}
-/*
- * Handle the last step of open()
- */
-static int do_last(struct nameidata *nd,
+static const char *open_last_lookups(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
struct dentry *dir = nd->path.dentry;
- kuid_t dir_uid = nd->inode->i_uid;
- umode_t dir_mode = nd->inode->i_mode;
int open_flag = op->open_flag;
- bool will_truncate = (open_flag & O_TRUNC) != 0;
bool got_write = false;
- int acc_mode = op->acc_mode;
unsigned seq;
struct inode *inode;
- struct path path;
+ struct dentry *dentry;
+ const char *res;
int error;
- nd->flags &= ~LOOKUP_PARENT;
nd->flags |= op->intent;
if (nd->last_type != LAST_NORM) {
- error = handle_dots(nd, nd->last_type);
- if (unlikely(error))
- return error;
- goto finish_open;
+ if (nd->depth)
+ put_link(nd);
+ return handle_dots(nd, nd->last_type);
}
if (!(open_flag & O_CREAT)) {
if (nd->last.name[nd->last.len])
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
/* we _can_ be in RCU mode here */
- error = lookup_fast(nd, &path, &inode, &seq);
- if (likely(error > 0))
+ dentry = lookup_fast(nd, &inode, &seq);
+ if (IS_ERR(dentry))
+ return ERR_CAST(dentry);
+ if (likely(dentry))
goto finish_lookup;
- if (error < 0)
- return error;
-
- BUG_ON(nd->inode != dir->d_inode);
BUG_ON(nd->flags & LOOKUP_RCU);
} else {
/* create side of things */
- /*
- * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
- * has been cleared when we got to the last component we are
- * about to look up
- */
- error = complete_walk(nd);
- if (error)
- return error;
-
+ if (nd->flags & LOOKUP_RCU) {
+ error = unlazy_walk(nd);
+ if (unlikely(error))
+ return ERR_PTR(error);
+ }
audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
/* trailing slashes? */
if (unlikely(nd->last.name[nd->last.len]))
- return -EISDIR;
+ return ERR_PTR(-EISDIR);
}
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
@@ -3398,108 +3152,90 @@ static int do_last(struct nameidata *nd,
inode_lock(dir->d_inode);
else
inode_lock_shared(dir->d_inode);
- error = lookup_open(nd, &path, file, op, got_write);
+ dentry = lookup_open(nd, file, op, got_write);
+ if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
+ fsnotify_create(dir->d_inode, dentry);
if (open_flag & O_CREAT)
inode_unlock(dir->d_inode);
else
inode_unlock_shared(dir->d_inode);
- if (error)
- goto out;
-
- if (file->f_mode & FMODE_OPENED) {
- if ((file->f_mode & FMODE_CREATED) ||
- !S_ISREG(file_inode(file)->i_mode))
- will_truncate = false;
-
- audit_inode(nd->name, file->f_path.dentry, 0);
- goto opened;
- }
+ if (got_write)
+ mnt_drop_write(nd->path.mnt);
- if (file->f_mode & FMODE_CREATED) {
- /* Don't check for write permission, don't truncate */
- open_flag &= ~O_TRUNC;
- will_truncate = false;
- acc_mode = 0;
- path_to_nameidata(&path, nd);
- goto finish_open_created;
- }
+ if (IS_ERR(dentry))
+ return ERR_CAST(dentry);
- /*
- * If atomic_open() acquired write access it is dropped now due to
- * possible mount and symlink following (this might be optimized away if
- * necessary...)
- */
- if (got_write) {
- mnt_drop_write(nd->path.mnt);
- got_write = false;
+ if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
+ dput(nd->path.dentry);
+ nd->path.dentry = dentry;
+ return NULL;
}
- error = follow_managed(&path, nd);
- if (unlikely(error < 0))
- return error;
+finish_lookup:
+ if (nd->depth)
+ put_link(nd);
+ res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
+ if (unlikely(res))
+ nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
+ return res;
+}
- /*
- * create/update audit record if it already exists.
- */
- audit_inode(nd->name, path.dentry, 0);
+/*
+ * Handle the last step of open()
+ */
+static int do_open(struct nameidata *nd,
+ struct file *file, const struct open_flags *op)
+{
+ int open_flag = op->open_flag;
+ bool do_truncate;
+ int acc_mode;
+ int error;
- if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
- path_to_nameidata(&path, nd);
- return -EEXIST;
+ if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
+ error = complete_walk(nd);
+ if (error)
+ return error;
}
-
- seq = 0; /* out of RCU mode, so the value doesn't matter */
- inode = d_backing_inode(path.dentry);
-finish_lookup:
- error = step_into(nd, &path, 0, inode, seq);
- if (unlikely(error))
- return error;
-finish_open:
- /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
- error = complete_walk(nd);
- if (error)
- return error;
- audit_inode(nd->name, nd->path.dentry, 0);
+ if (!(file->f_mode & FMODE_CREATED))
+ audit_inode(nd->name, nd->path.dentry, 0);
if (open_flag & O_CREAT) {
- error = -EISDIR;
+ if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
+ return -EEXIST;
if (d_is_dir(nd->path.dentry))
- goto out;
- error = may_create_in_sticky(dir_mode, dir_uid,
+ return -EISDIR;
+ error = may_create_in_sticky(nd->dir_mode, nd->dir_uid,
d_backing_inode(nd->path.dentry));
if (unlikely(error))
- goto out;
+ return error;
}
- error = -ENOTDIR;
if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
- goto out;
- if (!d_is_reg(nd->path.dentry))
- will_truncate = false;
+ return -ENOTDIR;
- if (will_truncate) {
+ do_truncate = false;
+ acc_mode = op->acc_mode;
+ if (file->f_mode & FMODE_CREATED) {
+ /* Don't check for write permission, don't truncate */
+ open_flag &= ~O_TRUNC;
+ acc_mode = 0;
+ } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
error = mnt_want_write(nd->path.mnt);
if (error)
- goto out;
- got_write = true;
+ return error;
+ do_truncate = true;
}
-finish_open_created:
error = may_open(&nd->path, acc_mode, open_flag);
- if (error)
- goto out;
- BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
- error = vfs_open(&nd->path, file);
- if (error)
- goto out;
-opened:
- error = ima_file_check(file, op->acc_mode);
- if (!error && will_truncate)
+ if (!error && !(file->f_mode & FMODE_OPENED))
+ error = vfs_open(&nd->path, file);
+ if (!error)
+ error = ima_file_check(file, op->acc_mode);
+ if (!error && do_truncate)
error = handle_truncate(file);
-out:
if (unlikely(error > 0)) {
WARN_ON(1);
error = -EINVAL;
}
- if (got_write)
+ if (do_truncate)
mnt_drop_write(nd->path.mnt);
return error;
}
@@ -3604,10 +3340,10 @@ static struct file *path_openat(struct nameidata *nd,
} else {
const char *s = path_init(nd, flags);
while (!(error = link_path_walk(s, nd)) &&
- (error = do_last(nd, file, op)) > 0) {
- nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
- s = trailing_symlink(nd);
- }
+ (s = open_last_lookups(nd, file, op)) != NULL)
+ ;
+ if (!error)
+ error = do_open(nd, file, op);
terminate_walk(nd);
}
if (likely(!error)) {
diff --git a/fs/namespace.c b/fs/namespace.c
index 85b5f7bea82e..a28e4db075ed 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1669,7 +1669,7 @@ int ksys_umount(char __user *name, int flags)
struct path path;
struct mount *mnt;
int retval;
- int lookup_flags = 0;
+ int lookup_flags = LOOKUP_MOUNTPOINT;
if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
return -EINVAL;
@@ -1680,7 +1680,7 @@ int ksys_umount(char __user *name, int flags)
if (!(flags & UMOUNT_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
- retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
+ retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
if (retval)
goto out;
mnt = real_mount(path.mnt);
@@ -2697,45 +2697,32 @@ static int do_move_mount_old(struct path *path, const char *old_name)
/*
* add a mount into a namespace's mount tree
*/
-static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
+static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
+ struct path *path, int mnt_flags)
{
- struct mountpoint *mp;
- struct mount *parent;
- int err;
+ struct mount *parent = real_mount(path->mnt);
mnt_flags &= ~MNT_INTERNAL_FLAGS;
- mp = lock_mount(path);
- if (IS_ERR(mp))
- return PTR_ERR(mp);
-
- parent = real_mount(path->mnt);
- err = -EINVAL;
if (unlikely(!check_mnt(parent))) {
/* that's acceptable only for automounts done in private ns */
if (!(mnt_flags & MNT_SHRINKABLE))
- goto unlock;
+ return -EINVAL;
/* ... and for those we'd better have mountpoint still alive */
if (!parent->mnt_ns)
- goto unlock;
+ return -EINVAL;
}
/* Refuse the same filesystem on the same mount point */
- err = -EBUSY;
if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
path->mnt->mnt_root == path->dentry)
- goto unlock;
+ return -EBUSY;
- err = -EINVAL;
if (d_is_symlink(newmnt->mnt.mnt_root))
- goto unlock;
+ return -EINVAL;
newmnt->mnt.mnt_flags = mnt_flags;
- err = graft_tree(newmnt, parent, mp);
-
-unlock:
- unlock_mount(mp);
- return err;
+ return graft_tree(newmnt, parent, mp);
}
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
@@ -2748,6 +2735,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
unsigned int mnt_flags)
{
struct vfsmount *mnt;
+ struct mountpoint *mp;
struct super_block *sb = fc->root->d_sb;
int error;
@@ -2768,7 +2756,13 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
mnt_warn_timestamp_expiry(mountpoint, mnt);
- error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
+ mp = lock_mount(mountpoint);
+ if (IS_ERR(mp)) {
+ mntput(mnt);
+ return PTR_ERR(mp);
+ }
+ error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
+ unlock_mount(mp);
if (error < 0)
mntput(mnt);
return error;
@@ -2829,23 +2823,63 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
int finish_automount(struct vfsmount *m, struct path *path)
{
- struct mount *mnt = real_mount(m);
+ struct dentry *dentry = path->dentry;
+ struct mountpoint *mp;
+ struct mount *mnt;
int err;
+
+ if (!m)
+ return 0;
+ if (IS_ERR(m))
+ return PTR_ERR(m);
+
+ mnt = real_mount(m);
/* The new mount record should have at least 2 refs to prevent it being
* expired before we get a chance to add it
*/
BUG_ON(mnt_get_count(mnt) < 2);
if (m->mnt_sb == path->mnt->mnt_sb &&
- m->mnt_root == path->dentry) {
+ m->mnt_root == dentry) {
err = -ELOOP;
- goto fail;
+ goto discard;
}
- err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
- if (!err)
- return 0;
-fail:
+ /*
+ * we don't want to use lock_mount() - in this case finding something
+ * that overmounts our mountpoint to be means "quitely drop what we've
+ * got", not "try to mount it on top".
+ */
+ inode_lock(dentry->d_inode);
+ namespace_lock();
+ if (unlikely(cant_mount(dentry))) {
+ err = -ENOENT;
+ goto discard_locked;
+ }
+ rcu_read_lock();
+ if (unlikely(__lookup_mnt(path->mnt, dentry))) {
+ rcu_read_unlock();
+ err = 0;
+ goto discard_locked;
+ }
+ rcu_read_unlock();
+ mp = get_mountpoint(dentry);
+ if (IS_ERR(mp)) {
+ err = PTR_ERR(mp);
+ goto discard_locked;
+ }
+
+ err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
+ unlock_mount(mp);
+ if (unlikely(err))
+ goto discard;
+ mntput(m);
+ return 0;
+
+discard_locked:
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
+discard:
/* remove m from any expiration list it may be on */
if (!list_empty(&mnt->mnt_expire)) {
namespace_lock();
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 40b6c5ac46c0..88e1763e02f3 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -164,7 +164,7 @@ config ROOT_NFS
If you want your system to mount its root file system via NFS,
choose Y here. This is common practice for managing systems
without local permanent storage. For details, read
- <file:Documentation/filesystems/nfs/nfsroot.txt>.
+ <file:Documentation/admin-guide/nfs/nfsroot.rst>.
Most people say N here.
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 690221747b47..d1a0e2c8b1b4 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -476,7 +476,7 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
err = ext_tree_remove(bl, true, 0, LLONG_MAX);
WARN_ON(err);
- kfree(bl);
+ kfree_rcu(bl, bl_layout.plh_rcu);
}
static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 549350259840..6a2033131c06 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -127,7 +127,9 @@ extern __be32 nfs4_callback_sequence(void *argp, void *resp,
#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9
#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
-#define RCA4_TYPE_MASK_ALL 0xf31f
+#define PNFS_FF_RCA4_TYPE_MASK_READ 16
+#define PNFS_FF_RCA4_TYPE_MASK_RW 17
+#define RCA4_TYPE_MASK_ALL 0x3f31f
struct cb_recallanyargs {
uint32_t craa_objs_to_keep;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index cd4c6bc81cae..e61dbc9b86ae 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -121,31 +121,31 @@ out:
*/
static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp,
const nfs4_stateid *stateid)
+ __must_hold(RCU)
{
struct nfs_server *server;
struct inode *inode;
struct pnfs_layout_hdr *lo;
+ rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- list_for_each_entry(lo, &server->layouts, plh_layouts) {
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
+ if (!pnfs_layout_is_valid(lo))
+ continue;
if (stateid != NULL &&
!nfs4_stateid_match_other(stateid, &lo->plh_stateid))
continue;
+ if (!nfs_sb_active(server->super))
+ continue;
inode = igrab(lo->plh_inode);
- if (!inode)
- return ERR_PTR(-EAGAIN);
- if (!nfs_sb_active(inode->i_sb)) {
- rcu_read_unlock();
- spin_unlock(&clp->cl_lock);
- iput(inode);
- spin_lock(&clp->cl_lock);
- rcu_read_lock();
- return ERR_PTR(-EAGAIN);
- }
- return inode;
+ rcu_read_unlock();
+ if (inode)
+ return inode;
+ nfs_sb_deactive(server->super);
+ return ERR_PTR(-EAGAIN);
}
}
-
+ rcu_read_unlock();
return ERR_PTR(-ENOENT);
}
@@ -163,28 +163,25 @@ static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp,
struct inode *inode;
struct pnfs_layout_hdr *lo;
+ rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- list_for_each_entry(lo, &server->layouts, plh_layouts) {
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
nfsi = NFS_I(lo->plh_inode);
if (nfs_compare_fh(fh, &nfsi->fh))
continue;
if (nfsi->layout != lo)
continue;
+ if (!nfs_sb_active(server->super))
+ continue;
inode = igrab(lo->plh_inode);
- if (!inode)
- return ERR_PTR(-EAGAIN);
- if (!nfs_sb_active(inode->i_sb)) {
- rcu_read_unlock();
- spin_unlock(&clp->cl_lock);
- iput(inode);
- spin_lock(&clp->cl_lock);
- rcu_read_lock();
- return ERR_PTR(-EAGAIN);
- }
- return inode;
+ rcu_read_unlock();
+ if (inode)
+ return inode;
+ nfs_sb_deactive(server->super);
+ return ERR_PTR(-EAGAIN);
}
}
-
+ rcu_read_unlock();
return ERR_PTR(-ENOENT);
}
@@ -194,14 +191,9 @@ static struct inode *nfs_layout_find_inode(struct nfs_client *clp,
{
struct inode *inode;
- spin_lock(&clp->cl_lock);
- rcu_read_lock();
inode = nfs_layout_find_inode_by_stateid(clp, stateid);
if (inode == ERR_PTR(-ENOENT))
inode = nfs_layout_find_inode_by_fh(clp, fh);
- rcu_read_unlock();
- spin_unlock(&clp->cl_lock);
-
return inode;
}
@@ -280,7 +272,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
goto unlock;
}
- pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+ pnfs_set_layout_stateid(lo, &args->cbl_stateid, NULL, true);
switch (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
&args->cbl_range,
be32_to_cpu(args->cbl_stateid.seqid))) {
@@ -605,6 +597,7 @@ __be32 nfs4_callback_recallany(void *argp, void *resp,
struct cb_recallanyargs *args = argp;
__be32 status;
fmode_t flags = 0;
+ bool schedule_manager = false;
status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
if (!cps->clp) /* set in cb_sequence */
@@ -627,6 +620,18 @@ __be32 nfs4_callback_recallany(void *argp, void *resp,
if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT))
pnfs_recall_all_layouts(cps->clp);
+
+ if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_READ)) {
+ set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &cps->clp->cl_state);
+ schedule_manager = true;
+ }
+ if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_RW)) {
+ set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &cps->clp->cl_state);
+ schedule_manager = true;
+ }
+ if (schedule_manager)
+ nfs4_schedule_state_manager(cps->clp);
+
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 1865322de142..816e1427f17e 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -378,6 +378,18 @@ nfs_inode_detach_delegation(struct inode *inode)
}
static void
+nfs_update_delegation_cred(struct nfs_delegation *delegation,
+ const struct cred *cred)
+{
+ const struct cred *old;
+
+ if (cred_fscmp(delegation->cred, cred) != 0) {
+ old = xchg(&delegation->cred, get_cred(cred));
+ put_cred(old);
+ }
+}
+
+static void
nfs_update_inplace_delegation(struct nfs_delegation *delegation,
const struct nfs_delegation *update)
{
@@ -385,8 +397,14 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation,
delegation->stateid.seqid = update->stateid.seqid;
smp_wmb();
delegation->type = update->type;
- if (test_and_clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ delegation->pagemod_limit = update->pagemod_limit;
+ if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+ delegation->change_attr = update->change_attr;
+ nfs_update_delegation_cred(delegation, update->cred);
+ /* smp_mb__before_atomic() is implicit due to xchg() */
+ clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
atomic_long_inc(&nfs_active_delegations);
+ }
}
}
@@ -545,21 +563,11 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
return ret;
}
-/**
- * nfs_client_return_marked_delegations - return previously marked delegations
- * @clp: nfs_client to process
- *
- * Note that this function is designed to be called by the state
- * manager thread. For this reason, it cannot flush the dirty data,
- * since that could deadlock in case of a state recovery error.
- *
- * Returns zero on success, or a negative errno value.
- */
-int nfs_client_return_marked_delegations(struct nfs_client *clp)
+static int nfs_server_return_marked_delegations(struct nfs_server *server,
+ void __always_unused *data)
{
struct nfs_delegation *delegation;
struct nfs_delegation *prev;
- struct nfs_server *server;
struct inode *inode;
struct inode *place_holder = NULL;
struct nfs_delegation *place_holder_deleg = NULL;
@@ -569,78 +577,79 @@ restart:
/*
* To avoid quadratic looping we hold a reference
* to an inode place_holder. Each time we restart, we
- * list nfs_servers from the server of that inode, and
- * delegation in the server from the delegations of that
- * inode.
+ * list delegation in the server from the delegations
+ * of that inode.
* prev is an RCU-protected pointer to a delegation which
* wasn't marked for return and might be a good choice for
* the next place_holder.
*/
- rcu_read_lock();
prev = NULL;
+ delegation = NULL;
+ rcu_read_lock();
if (place_holder)
- server = NFS_SERVER(place_holder);
- else
- server = list_entry_rcu(clp->cl_superblocks.next,
- struct nfs_server, client_link);
- list_for_each_entry_from_rcu(server, &clp->cl_superblocks, client_link) {
- delegation = NULL;
- if (place_holder && server == NFS_SERVER(place_holder))
- delegation = rcu_dereference(NFS_I(place_holder)->delegation);
- if (!delegation || delegation != place_holder_deleg)
- delegation = list_entry_rcu(server->delegations.next,
- struct nfs_delegation, super_list);
- list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) {
- struct inode *to_put = NULL;
-
- if (!nfs_delegation_need_return(delegation)) {
+ delegation = rcu_dereference(NFS_I(place_holder)->delegation);
+ if (!delegation || delegation != place_holder_deleg)
+ delegation = list_entry_rcu(server->delegations.next,
+ struct nfs_delegation, super_list);
+ list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) {
+ struct inode *to_put = NULL;
+
+ if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags))
+ continue;
+ if (!nfs_delegation_need_return(delegation)) {
+ if (nfs4_is_valid_delegation(delegation, 0))
prev = delegation;
- continue;
- }
- if (!nfs_sb_active(server->super))
- break; /* continue in outer loop */
-
- if (prev) {
- struct inode *tmp;
-
- tmp = nfs_delegation_grab_inode(prev);
- if (tmp) {
- to_put = place_holder;
- place_holder = tmp;
- place_holder_deleg = prev;
- }
- }
+ continue;
+ }
- inode = nfs_delegation_grab_inode(delegation);
- if (inode == NULL) {
- rcu_read_unlock();
- if (to_put)
- iput(to_put);
- nfs_sb_deactive(server->super);
- goto restart;
+ if (prev) {
+ struct inode *tmp = nfs_delegation_grab_inode(prev);
+ if (tmp) {
+ to_put = place_holder;
+ place_holder = tmp;
+ place_holder_deleg = prev;
}
- delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+ }
+
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL) {
rcu_read_unlock();
+ iput(to_put);
+ goto restart;
+ }
+ delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+ rcu_read_unlock();
- if (to_put)
- iput(to_put);
+ iput(to_put);
- err = nfs_end_delegation_return(inode, delegation, 0);
- iput(inode);
- nfs_sb_deactive(server->super);
- cond_resched();
- if (!err)
- goto restart;
- set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
- if (place_holder)
- iput(place_holder);
- return err;
- }
+ err = nfs_end_delegation_return(inode, delegation, 0);
+ iput(inode);
+ cond_resched();
+ if (!err)
+ goto restart;
+ set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+ goto out;
}
rcu_read_unlock();
- if (place_holder)
- iput(place_holder);
- return 0;
+out:
+ iput(place_holder);
+ return err;
+}
+
+/**
+ * nfs_client_return_marked_delegations - return previously marked delegations
+ * @clp: nfs_client to process
+ *
+ * Note that this function is designed to be called by the state
+ * manager thread. For this reason, it cannot flush the dirty data,
+ * since that could deadlock in case of a state recovery error.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs_client_return_marked_delegations(struct nfs_client *clp)
+{
+ return nfs_client_for_each_server(clp,
+ nfs_server_return_marked_delegations, NULL);
}
/**
@@ -1083,53 +1092,51 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
rcu_read_unlock();
}
-/**
- * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
- * @clp: nfs_client to process
- *
- */
-void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
+static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server,
+ void __always_unused *data)
{
struct nfs_delegation *delegation;
- struct nfs_server *server;
struct inode *inode;
-
restart:
rcu_read_lock();
- list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- list_for_each_entry_rcu(delegation, &server->delegations,
- super_list) {
- if (test_bit(NFS_DELEGATION_INODE_FREEING,
- &delegation->flags) ||
- test_bit(NFS_DELEGATION_RETURNING,
- &delegation->flags) ||
- test_bit(NFS_DELEGATION_NEED_RECLAIM,
- &delegation->flags) == 0)
- continue;
- if (!nfs_sb_active(server->super))
- break; /* continue in outer loop */
- inode = nfs_delegation_grab_inode(delegation);
- if (inode == NULL) {
- rcu_read_unlock();
- nfs_sb_deactive(server->super);
- goto restart;
- }
- delegation = nfs_start_delegation_return_locked(NFS_I(inode));
- rcu_read_unlock();
- if (delegation != NULL) {
- if (nfs_detach_delegation(NFS_I(inode), delegation,
- server) != NULL)
- nfs_free_delegation(delegation);
- /* Match nfs_start_delegation_return_locked */
- nfs_put_delegation(delegation);
- }
- iput(inode);
- nfs_sb_deactive(server->super);
- cond_resched();
- goto restart;
+restart_locked:
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_NEED_RECLAIM,
+ &delegation->flags) == 0)
+ continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL)
+ goto restart_locked;
+ delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+ rcu_read_unlock();
+ if (delegation != NULL) {
+ if (nfs_detach_delegation(NFS_I(inode), delegation,
+ server) != NULL)
+ nfs_free_delegation(delegation);
+ /* Match nfs_start_delegation_return_locked */
+ nfs_put_delegation(delegation);
}
+ iput(inode);
+ cond_resched();
+ goto restart;
}
rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
+ * @clp: nfs_client to process
+ *
+ */
+void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
+{
+ nfs_client_for_each_server(clp, nfs_server_reap_unclaimed_delegations,
+ NULL);
}
static inline bool nfs4_server_rebooted(const struct nfs_client *clp)
@@ -1215,62 +1222,61 @@ nfs_delegation_test_free_expired(struct inode *inode,
nfs_remove_bad_delegation(inode, stateid);
}
-/**
- * nfs_reap_expired_delegations - reap expired delegations
- * @clp: nfs_client to process
- *
- * Iterates through all the delegations associated with this server and
- * checks if they have may have been revoked. This function is usually
- * expected to be called in cases where the server may have lost its
- * lease.
- */
-void nfs_reap_expired_delegations(struct nfs_client *clp)
+static int nfs_server_reap_expired_delegations(struct nfs_server *server,
+ void __always_unused *data)
{
struct nfs_delegation *delegation;
- struct nfs_server *server;
struct inode *inode;
const struct cred *cred;
nfs4_stateid stateid;
-
restart:
rcu_read_lock();
- list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- list_for_each_entry_rcu(delegation, &server->delegations,
- super_list) {
- if (test_bit(NFS_DELEGATION_INODE_FREEING,
- &delegation->flags) ||
- test_bit(NFS_DELEGATION_RETURNING,
- &delegation->flags) ||
- test_bit(NFS_DELEGATION_TEST_EXPIRED,
- &delegation->flags) == 0)
- continue;
- if (!nfs_sb_active(server->super))
- break; /* continue in outer loop */
- inode = nfs_delegation_grab_inode(delegation);
- if (inode == NULL) {
- rcu_read_unlock();
- nfs_sb_deactive(server->super);
- goto restart;
- }
- cred = get_cred_rcu(delegation->cred);
- nfs4_stateid_copy(&stateid, &delegation->stateid);
- clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
- rcu_read_unlock();
- nfs_delegation_test_free_expired(inode, &stateid, cred);
- put_cred(cred);
- if (nfs4_server_rebooted(clp)) {
- nfs_inode_mark_test_expired_delegation(server,inode);
- iput(inode);
- nfs_sb_deactive(server->super);
- return;
- }
+restart_locked:
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_TEST_EXPIRED,
+ &delegation->flags) == 0)
+ continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL)
+ goto restart_locked;
+ spin_lock(&delegation->lock);
+ cred = get_cred_rcu(delegation->cred);
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
+ spin_unlock(&delegation->lock);
+ clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+ rcu_read_unlock();
+ nfs_delegation_test_free_expired(inode, &stateid, cred);
+ put_cred(cred);
+ if (!nfs4_server_rebooted(server->nfs_client)) {
iput(inode);
- nfs_sb_deactive(server->super);
cond_resched();
goto restart;
}
+ nfs_inode_mark_test_expired_delegation(server,inode);
+ iput(inode);
+ return -EAGAIN;
}
rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * nfs_reap_expired_delegations - reap expired delegations
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with this server and
+ * checks if they have may have been revoked. This function is usually
+ * expected to be called in cases where the server may have lost its
+ * lease.
+ */
+void nfs_reap_expired_delegations(struct nfs_client *clp)
+{
+ nfs_client_for_each_server(clp, nfs_server_reap_expired_delegations,
+ NULL);
}
void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
@@ -1359,11 +1365,14 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_delegation *delegation;
- bool ret;
+ bool ret = false;
flags &= FMODE_READ|FMODE_WRITE;
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
+ if (!delegation)
+ goto out;
+ spin_lock(&delegation->lock);
ret = nfs4_is_valid_delegation(delegation, flags);
if (ret) {
nfs4_stateid_copy(dst, &delegation->stateid);
@@ -1371,6 +1380,8 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
if (cred)
*cred = get_cred(delegation->cred);
}
+ spin_unlock(&delegation->lock);
+out:
rcu_read_unlock();
return ret;
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 193d6fb363b7..5a331da5f55a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -141,10 +141,9 @@ struct nfs_cache_array {
int size;
int eof_index;
u64 last_cookie;
- struct nfs_cache_array_entry array[0];
+ struct nfs_cache_array_entry array[];
};
-typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool);
typedef struct {
struct file *file;
struct page *page;
@@ -153,7 +152,7 @@ typedef struct {
u64 *dir_cookie;
u64 last_cookie;
loff_t current_index;
- decode_dirent_t decode;
+ loff_t prev_index;
unsigned long dir_verifier;
unsigned long timestamp;
@@ -240,6 +239,25 @@ out:
return ret;
}
+static inline
+int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+ return in_compat_syscall();
+#else
+ return (BITS_PER_LONG == 32);
+#endif
+}
+
+static
+bool nfs_readdir_use_cookie(const struct file *filp)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return false;
+ return true;
+}
+
static
int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
{
@@ -289,7 +307,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
!nfs_readdir_inode_mapping_valid(nfsi)) {
ctx->duped = 0;
ctx->attr_gencount = nfsi->attr_gencount;
- } else if (new_pos < desc->ctx->pos) {
+ } else if (new_pos < desc->prev_index) {
if (ctx->duped > 0
&& ctx->dup_cookie == *desc->dir_cookie) {
if (printk_ratelimit()) {
@@ -305,7 +323,11 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
ctx->dup_cookie = *desc->dir_cookie;
ctx->duped = -1;
}
- desc->ctx->pos = new_pos;
+ if (nfs_readdir_use_cookie(desc->file))
+ desc->ctx->pos = *desc->dir_cookie;
+ else
+ desc->ctx->pos = new_pos;
+ desc->prev_index = new_pos;
desc->cache_entry_index = i;
return 0;
}
@@ -376,9 +398,10 @@ error:
static int xdr_decode(nfs_readdir_descriptor_t *desc,
struct nfs_entry *entry, struct xdr_stream *xdr)
{
+ struct inode *inode = file_inode(desc->file);
int error;
- error = desc->decode(xdr, entry, desc->plus);
+ error = NFS_PROTO(inode)->decode_dirent(xdr, entry, desc->plus);
if (error)
return error;
entry->fattr->time_start = desc->timestamp;
@@ -756,6 +779,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
if (desc->page_index == 0) {
desc->current_index = 0;
+ desc->prev_index = 0;
desc->last_cookie = 0;
}
do {
@@ -786,11 +810,14 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
desc->eof = true;
break;
}
- desc->ctx->pos++;
if (i < (array->size-1))
*desc->dir_cookie = array->array[i+1].cookie;
else
*desc->dir_cookie = array->last_cookie;
+ if (nfs_readdir_use_cookie(file))
+ desc->ctx->pos = *desc->dir_cookie;
+ else
+ desc->ctx->pos++;
if (ctx->duped != 0)
ctx->duped = 1;
}
@@ -860,9 +887,14 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
- nfs_readdir_descriptor_t my_desc,
- *desc = &my_desc;
struct nfs_open_dir_context *dir_ctx = file->private_data;
+ nfs_readdir_descriptor_t my_desc = {
+ .file = file,
+ .ctx = ctx,
+ .dir_cookie = &dir_ctx->dir_cookie,
+ .plus = nfs_use_readdirplus(inode, ctx),
+ },
+ *desc = &my_desc;
int res = 0;
dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
@@ -875,14 +907,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
* to either find the entry with the appropriate number or
* revalidate the cookie.
*/
- memset(desc, 0, sizeof(*desc));
-
- desc->file = file;
- desc->ctx = ctx;
- desc->dir_cookie = &dir_ctx->dir_cookie;
- desc->decode = NFS_PROTO(inode)->decode_dirent;
- desc->plus = nfs_use_readdirplus(inode, ctx);
-
if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
@@ -954,7 +978,10 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
}
if (offset != filp->f_pos) {
filp->f_pos = offset;
- dir_ctx->dir_cookie = 0;
+ if (nfs_readdir_use_cookie(filp))
+ dir_ctx->dir_cookie = offset;
+ else
+ dir_ctx->dir_cookie = 0;
dir_ctx->duped = 0;
}
inode_unlock(inode);
@@ -2282,7 +2309,7 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock);
static LIST_HEAD(nfs_access_lru_list);
static atomic_long_t nfs_access_nr_entries;
-static unsigned long nfs_access_max_cachesize = ULONG_MAX;
+static unsigned long nfs_access_max_cachesize = 4*1024*1024;
module_param(nfs_access_max_cachesize, ulong, 0644);
MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
@@ -2489,7 +2516,7 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre
rcu_read_lock();
if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
goto out;
- lh = rcu_dereference(nfsi->access_cache_entry_lru.prev);
+ lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru));
cache = list_entry(lh, struct nfs_access_entry, lru);
if (lh == &nfsi->access_cache_entry_lru ||
cred_fscmp(cred, cache->cred) != 0)
@@ -2642,9 +2669,10 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
status = NFS_PROTO(inode)->access(inode, &cache);
if (status != 0) {
if (status == -ESTALE) {
- nfs_zap_caches(inode);
if (!S_ISDIR(inode->i_mode))
- set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+ nfs_set_inode_stale(inode);
+ else
+ nfs_zap_caches(inode);
}
goto out;
}
@@ -2732,14 +2760,7 @@ force_lookup:
if (!NFS_PROTO(inode)->access)
goto out_notsup;
- /* Always try fast lookups first */
- rcu_read_lock();
- res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
- rcu_read_unlock();
- if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
- /* Fast lookup failed, try the slow way */
- res = nfs_do_access(inode, cred, mask);
- }
+ res = nfs_do_access(inode, cred, mask);
out:
if (!res && (mask & MAY_EXEC))
res = nfs_execute_ok(inode, mask);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index b768a0b42e82..a57e7c72c7f4 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -94,7 +94,7 @@ struct nfs_direct_req {
#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
/* for read */
#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */
- struct nfs_writeverf verf; /* unstable write verifier */
+#define NFS_ODIRECT_DONE INT_MAX /* write verification failed */
};
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
@@ -151,106 +151,6 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq,
dreq->count = dreq_len;
}
-/*
- * nfs_direct_select_verf - select the right verifier
- * @dreq - direct request possibly spanning multiple servers
- * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
- * @commit_idx - commit bucket index for the DS
- *
- * returns the correct verifier to use given the role of the server
- */
-static struct nfs_writeverf *
-nfs_direct_select_verf(struct nfs_direct_req *dreq,
- struct nfs_client *ds_clp,
- int commit_idx)
-{
- struct nfs_writeverf *verfp = &dreq->verf;
-
-#ifdef CONFIG_NFS_V4_1
- /*
- * pNFS is in use, use the DS verf except commit_through_mds is set
- * for layout segment where nbuckets is zero.
- */
- if (ds_clp && dreq->ds_cinfo.nbuckets > 0) {
- if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
- verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
- else
- WARN_ON_ONCE(1);
- }
-#endif
- return verfp;
-}
-
-
-/*
- * nfs_direct_set_hdr_verf - set the write/commit verifier
- * @dreq - direct request possibly spanning multiple servers
- * @hdr - pageio header to validate against previously seen verfs
- *
- * Set the server's (MDS or DS) "seen" verifier
- */
-static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_writeverf *verfp;
-
- verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
- WARN_ON_ONCE(verfp->committed >= 0);
- memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
- WARN_ON_ONCE(verfp->committed < 0);
-}
-
-static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1,
- const struct nfs_writeverf *v2)
-{
- return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier);
-}
-
-/*
- * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
- * @dreq - direct request possibly spanning multiple servers
- * @hdr - pageio header to validate against previously seen verf
- *
- * set the server's "seen" verf if not initialized.
- * returns result of comparison between @hdr->verf and the "seen"
- * verf of the server used by @hdr (DS or MDS)
- */
-static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_writeverf *verfp;
-
- verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
- if (verfp->committed < 0) {
- nfs_direct_set_hdr_verf(dreq, hdr);
- return 0;
- }
- return nfs_direct_cmp_verf(verfp, &hdr->verf);
-}
-
-/*
- * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
- * @dreq - direct request possibly spanning multiple servers
- * @data - commit data to validate against previously seen verf
- *
- * returns result of comparison between @data->verf and the verf of
- * the server used by @data (DS or MDS)
- */
-static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
- struct nfs_commit_data *data)
-{
- struct nfs_writeverf *verfp;
-
- verfp = nfs_direct_select_verf(dreq, data->ds_clp,
- data->ds_commit_index);
-
- /* verifier not set so always fail */
- if (verfp->committed < 0 || data->res.verf->committed <= NFS_UNSTABLE)
- return 1;
-
- return nfs_direct_cmp_verf(verfp, data->res.verf);
-}
-
/**
* nfs_direct_IO - NFS address space operation for direct I/O
* @iocb: target I/O control block
@@ -305,7 +205,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
kref_get(&dreq->kref);
init_completion(&dreq->completion);
INIT_LIST_HEAD(&dreq->mds_cinfo.list);
- dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
+ pnfs_init_ds_commit_info(&dreq->ds_cinfo);
INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
spin_lock_init(&dreq->lock);
@@ -316,7 +216,7 @@ static void nfs_direct_req_free(struct kref *kref)
{
struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
- nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
+ pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode);
if (dreq->l_ctx != NULL)
nfs_put_lock_context(dreq->l_ctx);
if (dreq->ctx != NULL)
@@ -571,6 +471,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
result = PTR_ERR(l_ctx);
+ nfs_direct_req_release(dreq);
goto out_release;
}
dreq->l_ctx = l_ctx;
@@ -605,15 +506,30 @@ out:
}
static void
+nfs_direct_join_group(struct list_head *list, struct inode *inode)
+{
+ struct nfs_page *req, *next;
+
+ list_for_each_entry(req, list, wb_list) {
+ if (req->wb_head != req || req->wb_this_page == req)
+ continue;
+ for (next = req->wb_this_page;
+ next != req->wb_head;
+ next = next->wb_this_page) {
+ nfs_list_remove_request(next);
+ nfs_release_request(next);
+ }
+ nfs_join_page_group(req, inode);
+ }
+}
+
+static void
nfs_direct_write_scan_commit_list(struct inode *inode,
struct list_head *list,
struct nfs_commit_info *cinfo)
{
mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
-#ifdef CONFIG_NFS_V4_1
- if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
- NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
-#endif
+ pnfs_recover_commit_reqs(list, cinfo);
nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}
@@ -629,11 +545,12 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
nfs_init_cinfo_from_dreq(&cinfo, dreq);
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
+ nfs_direct_join_group(&reqs, dreq->inode);
+
dreq->count = 0;
dreq->max_count = 0;
list_for_each_entry(req, &reqs, wb_list)
dreq->max_count += req->wb_bytes;
- dreq->verf.committed = NFS_INVALID_STABLE_HOW;
nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
get_dreq(dreq);
@@ -670,27 +587,35 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
static void nfs_direct_commit_complete(struct nfs_commit_data *data)
{
+ const struct nfs_writeverf *verf = data->res.verf;
struct nfs_direct_req *dreq = data->dreq;
struct nfs_commit_info cinfo;
struct nfs_page *req;
int status = data->task.tk_status;
+ if (status < 0) {
+ /* Errors in commit are fatal */
+ dreq->error = status;
+ dreq->max_count = 0;
+ dreq->count = 0;
+ dreq->flags = NFS_ODIRECT_DONE;
+ } else if (dreq->flags == NFS_ODIRECT_DONE)
+ status = dreq->error;
+
nfs_init_cinfo_from_dreq(&cinfo, dreq);
- if (status < 0 || nfs_direct_cmp_commit_data_verf(dreq, data))
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
+ if (status >= 0 && !nfs_write_match_verf(verf, req)) {
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
/*
* Despite the reboot, the write was successful,
* so reset wb_nio.
*/
req->wb_nio = 0;
- /* Note the rewrite will go through mds */
nfs_mark_request_commit(req, NULL, &cinfo, 0);
- } else
+ } else /* Error or match */
nfs_release_request(req);
nfs_unlock_and_release_request(req);
}
@@ -705,7 +630,8 @@ static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
struct nfs_direct_req *dreq = cinfo->dreq;
spin_lock(&dreq->lock);
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ if (dreq->flags != NFS_ODIRECT_DONE)
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
spin_unlock(&dreq->lock);
nfs_mark_request_commit(req, NULL, cinfo, 0);
}
@@ -728,6 +654,23 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
nfs_direct_write_reschedule(dreq);
}
+static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
+{
+ struct nfs_commit_info cinfo;
+ struct nfs_page *req;
+ LIST_HEAD(reqs);
+
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+ nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
+
+ while (!list_empty(&reqs)) {
+ req = nfs_list_entry(reqs.next);
+ nfs_list_remove_request(req);
+ nfs_release_request(req);
+ nfs_unlock_and_release_request(req);
+ }
+}
+
static void nfs_direct_write_schedule_work(struct work_struct *work)
{
struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
@@ -742,6 +685,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
nfs_direct_write_reschedule(dreq);
break;
default:
+ nfs_direct_write_clear_reqs(dreq);
nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
nfs_direct_complete(dreq);
}
@@ -768,20 +712,15 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
}
nfs_direct_count_bytes(dreq, hdr);
- if (hdr->good_bytes != 0) {
- if (nfs_write_need_commit(hdr)) {
- if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
- request_commit = true;
- else if (dreq->flags == 0) {
- nfs_direct_set_hdr_verf(dreq, hdr);
- request_commit = true;
- dreq->flags = NFS_ODIRECT_DO_COMMIT;
- } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
- request_commit = true;
- if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
- dreq->flags =
- NFS_ODIRECT_RESCHED_WRITES;
- }
+ if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) {
+ switch (dreq->flags) {
+ case 0:
+ dreq->flags = NFS_ODIRECT_DO_COMMIT;
+ request_commit = true;
+ break;
+ case NFS_ODIRECT_RESCHED_WRITES:
+ case NFS_ODIRECT_DO_COMMIT:
+ request_commit = true;
}
}
spin_unlock(&dreq->lock);
@@ -990,11 +929,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
result = PTR_ERR(l_ctx);
+ nfs_direct_req_release(dreq);
goto out_release;
}
dreq->l_ctx = l_ctx;
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
+ pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
nfs_start_io_direct(inode);
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 89bd5581f317..963800037609 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -152,12 +152,13 @@ static int nfs_dns_upcall(struct cache_detail *cd,
struct cache_head *ch)
{
struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
- int ret;
- ret = nfs_cache_upcall(cd, key->hostname);
- if (ret)
- ret = sunrpc_cache_pipe_upcall(cd, ch);
- return ret;
+ if (test_and_set_bit(CACHE_PENDING, &ch->flags))
+ return 0;
+ if (!nfs_cache_upcall(cd, key->hostname))
+ return 0;
+ clear_bit(CACHE_PENDING, &ch->flags);
+ return sunrpc_cache_pipe_upcall_timeout(cd, ch);
}
static int nfs_dns_match(struct cache_head *ca,
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index c9b605f6c9cb..a13e69009f19 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -49,6 +49,7 @@ MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
MODULE_DESCRIPTION("The NFSv4 file layout driver");
#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
+static const struct pnfs_commit_ops filelayout_commit_ops;
static loff_t
filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
@@ -750,72 +751,17 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg)
/* This assumes a single RW lseg */
if (lseg->pls_range.iomode == IOMODE_RW) {
struct nfs4_filelayout *flo;
+ struct inode *inode;
flo = FILELAYOUT_FROM_HDR(lseg->pls_layout);
- flo->commit_info.nbuckets = 0;
- kfree(flo->commit_info.buckets);
- flo->commit_info.buckets = NULL;
+ inode = flo->generic_hdr.plh_inode;
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_release_lseg(&flo->commit_info, lseg);
+ spin_unlock(&inode->i_lock);
}
_filelayout_free_lseg(fl);
}
-static int
-filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo,
- gfp_t gfp_flags)
-{
- struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
- struct pnfs_commit_bucket *buckets;
- int size, i;
-
- if (fl->commit_through_mds)
- return 0;
-
- size = (fl->stripe_type == STRIPE_SPARSE) ?
- fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
-
- if (cinfo->ds->nbuckets >= size) {
- /* This assumes there is only one IOMODE_RW lseg. What
- * we really want to do is have a layout_hdr level
- * dictionary of <multipath_list4, fh> keys, each
- * associated with a struct list_head, populated by calls
- * to filelayout_write_pagelist().
- * */
- return 0;
- }
-
- buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
- gfp_flags);
- if (!buckets)
- return -ENOMEM;
- for (i = 0; i < size; i++) {
- INIT_LIST_HEAD(&buckets[i].written);
- INIT_LIST_HEAD(&buckets[i].committing);
- /* mark direct verifier as unset */
- buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
- }
-
- spin_lock(&cinfo->inode->i_lock);
- if (cinfo->ds->nbuckets >= size)
- goto out;
- for (i = 0; i < cinfo->ds->nbuckets; i++) {
- list_splice(&cinfo->ds->buckets[i].written,
- &buckets[i].written);
- list_splice(&cinfo->ds->buckets[i].committing,
- &buckets[i].committing);
- buckets[i].direct_verf.committed =
- cinfo->ds->buckets[i].direct_verf.committed;
- buckets[i].wlseg = cinfo->ds->buckets[i].wlseg;
- buckets[i].clseg = cinfo->ds->buckets[i].clseg;
- }
- swap(cinfo->ds->buckets, buckets);
- cinfo->ds->nbuckets = size;
-out:
- spin_unlock(&cinfo->inode->i_lock);
- kfree(buckets);
- return 0;
-}
-
static struct pnfs_layout_segment *
filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
struct nfs4_layoutget_res *lgr,
@@ -938,9 +884,6 @@ static void
filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- struct nfs_commit_info cinfo;
- int status;
-
pnfs_generic_pg_check_layout(pgio);
if (!pgio->pg_lseg) {
pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
@@ -959,17 +902,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
- goto out_mds;
- nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
- status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
- if (status < 0) {
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
- goto out_mds;
- }
- return;
-out_mds:
- nfs_pageio_reset_write_mds(pgio);
+ nfs_pageio_reset_write_mds(pgio);
}
static const struct nfs_pageio_ops filelayout_pg_read_ops = {
@@ -1078,36 +1011,6 @@ out_err:
return -EAGAIN;
}
-/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest
- * for @page
- * @cinfo - commit info for current inode
- * @page - page to search for matching head request
- *
- * Returns a the head request if one is found, otherwise returns NULL.
- */
-static struct nfs_page *
-filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
-{
- struct nfs_page *freq, *t;
- struct pnfs_commit_bucket *b;
- int i;
-
- /* Linearly search the commit lists for each bucket until a matching
- * request is found */
- for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
- list_for_each_entry_safe(freq, t, &b->written, wb_list) {
- if (freq->wb_page == page)
- return freq->wb_head;
- }
- list_for_each_entry_safe(freq, t, &b->committing, wb_list) {
- if (freq->wb_page == page)
- return freq->wb_head;
- }
- }
-
- return NULL;
-}
-
static int
filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
int how, struct nfs_commit_info *cinfo)
@@ -1140,13 +1043,17 @@ filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
struct nfs4_filelayout *flo;
flo = kzalloc(sizeof(*flo), gfp_flags);
- return flo != NULL ? &flo->generic_hdr : NULL;
+ if (flo == NULL)
+ return NULL;
+ pnfs_init_ds_commit_info(&flo->commit_info);
+ flo->commit_info.ops = &filelayout_commit_ops;
+ return &flo->generic_hdr;
}
static void
filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
- kfree(FILELAYOUT_FROM_HDR(lo));
+ kfree_rcu(FILELAYOUT_FROM_HDR(lo), generic_hdr.plh_rcu);
}
static struct pnfs_ds_commit_info *
@@ -1160,6 +1067,46 @@ filelayout_get_ds_info(struct inode *inode)
return &FILELAYOUT_FROM_HDR(layout)->commit_info;
}
+static void
+filelayout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ struct pnfs_commit_array *array, *new;
+ unsigned int size = (fl->stripe_type == STRIPE_SPARSE) ?
+ fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
+
+ new = pnfs_alloc_commit_array(size, GFP_NOIO);
+ if (new) {
+ spin_lock(&inode->i_lock);
+ array = pnfs_add_commit_array(fl_cinfo, new, lseg);
+ spin_unlock(&inode->i_lock);
+ if (array != new)
+ pnfs_free_commit_array(new);
+ }
+}
+
+static void
+filelayout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_destroy(fl_cinfo);
+ spin_unlock(&inode->i_lock);
+}
+
+static const struct pnfs_commit_ops filelayout_commit_ops = {
+ .setup_ds_info = filelayout_setup_ds_info,
+ .release_ds_info = filelayout_release_ds_info,
+ .mark_request_commit = filelayout_mark_request_commit,
+ .clear_request_commit = pnfs_generic_clear_request_commit,
+ .scan_commit_lists = pnfs_generic_scan_commit_lists,
+ .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
+ .search_commit_reqs = pnfs_generic_search_commit_reqs,
+ .commit_pagelist = filelayout_commit_pagelist,
+};
+
static struct pnfs_layoutdriver_type filelayout_type = {
.id = LAYOUT_NFSV4_1_FILES,
.name = "LAYOUT_NFSV4_1_FILES",
@@ -1173,12 +1120,6 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.pg_read_ops = &filelayout_pg_read_ops,
.pg_write_ops = &filelayout_pg_write_ops,
.get_ds_info = &filelayout_get_ds_info,
- .mark_request_commit = filelayout_mark_request_commit,
- .clear_request_commit = pnfs_generic_clear_request_commit,
- .scan_commit_lists = pnfs_generic_scan_commit_lists,
- .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
- .search_commit_reqs = filelayout_search_commit_reqs,
- .commit_pagelist = filelayout_commit_pagelist,
.read_pagelist = filelayout_read_pagelist,
.write_pagelist = filelayout_write_pagelist,
.alloc_deviceid_node = filelayout_alloc_deviceid_node,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index bb9148b83166..7d399f72ebbb 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -32,6 +32,7 @@
static unsigned short io_maxretrans;
+static const struct pnfs_commit_ops ff_layout_commit_ops;
static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
struct nfs_pgio_header *hdr);
static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
@@ -48,9 +49,11 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
ffl = kzalloc(sizeof(*ffl), gfp_flags);
if (ffl) {
+ pnfs_init_ds_commit_info(&ffl->commit_info);
INIT_LIST_HEAD(&ffl->error_list);
INIT_LIST_HEAD(&ffl->mirrors);
ffl->last_report_time = ktime_get();
+ ffl->commit_info.ops = &ff_layout_commit_ops;
return &ffl->generic_hdr;
} else
return NULL;
@@ -59,14 +62,14 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
static void
ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
+ struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(lo);
struct nfs4_ff_layout_ds_err *err, *n;
- list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
- list) {
+ list_for_each_entry_safe(err, n, &ffl->error_list, list) {
list_del(&err->list);
kfree(err);
}
- kfree(FF_LAYOUT_FROM_HDR(lo));
+ kfree_rcu(ffl, generic_hdr.plh_rcu);
}
static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
@@ -248,36 +251,10 @@ static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
{
- int i;
-
- if (fls->mirror_array) {
- for (i = 0; i < fls->mirror_array_cnt; i++) {
- /* normally mirror_ds is freed in
- * .free_deviceid_node but we still do it here
- * for .alloc_lseg error path */
- ff_layout_put_mirror(fls->mirror_array[i]);
- }
- kfree(fls->mirror_array);
- fls->mirror_array = NULL;
- }
-}
-
-static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
-{
- int ret = 0;
+ u32 i;
- dprintk("--> %s\n", __func__);
-
- /* FIXME: remove this check when layout segment support is added */
- if (lgr->range.offset != 0 ||
- lgr->range.length != NFS4_MAX_UINT64) {
- dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
- __func__);
- ret = -EINVAL;
- }
-
- dprintk("--> %s returns %d\n", __func__, ret);
- return ret;
+ for (i = 0; i < fls->mirror_array_cnt; i++)
+ ff_layout_put_mirror(fls->mirror_array[i]);
}
static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
@@ -289,6 +266,23 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
}
static bool
+ff_lseg_match_mirrors(struct pnfs_layout_segment *l1,
+ struct pnfs_layout_segment *l2)
+{
+ const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
+ const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1);
+ u32 i;
+
+ if (fl1->mirror_array_cnt != fl2->mirror_array_cnt)
+ return false;
+ for (i = 0; i < fl1->mirror_array_cnt; i++) {
+ if (fl1->mirror_array[i] != fl2->mirror_array[i])
+ return false;
+ }
+ return true;
+}
+
+static bool
ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
const struct pnfs_layout_range *l2)
{
@@ -323,6 +317,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new,
new->pls_range.length);
if (new_end < old->pls_range.offset)
return false;
+ if (!ff_lseg_match_mirrors(new, old))
+ return false;
/* Mergeable: copy info from 'old' to 'new' */
if (new_end < old_end)
@@ -400,16 +396,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
goto out_err_free;
rc = -ENOMEM;
- fls = kzalloc(sizeof(*fls), gfp_flags);
+ fls = kzalloc(struct_size(fls, mirror_array, mirror_array_cnt),
+ gfp_flags);
if (!fls)
goto out_err_free;
fls->mirror_array_cnt = mirror_array_cnt;
fls->stripe_unit = stripe_unit;
- fls->mirror_array = kcalloc(fls->mirror_array_cnt,
- sizeof(fls->mirror_array[0]), gfp_flags);
- if (fls->mirror_array == NULL)
- goto out_err_free;
for (i = 0; i < fls->mirror_array_cnt; i++) {
struct nfs4_ff_layout_mirror *mirror;
@@ -545,9 +538,6 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
out_sort_mirrors:
ff_layout_sort_mirrors(fls);
- rc = ff_layout_check_layout(lgr);
- if (rc)
- goto out_err_free;
ret = &fls->generic_hdr;
dprintk("<-- %s (success)\n", __func__);
out_free_page:
@@ -560,17 +550,6 @@ out_err_free:
goto out_free_page;
}
-static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
-{
- struct pnfs_layout_segment *lseg;
-
- list_for_each_entry(lseg, &layout->plh_segs, pls_list)
- if (lseg->pls_range.iomode == IOMODE_RW)
- return true;
-
- return false;
-}
-
static void
ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
{
@@ -585,23 +564,12 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
inode = ffl->generic_hdr.plh_inode;
spin_lock(&inode->i_lock);
- if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
- ffl->commit_info.nbuckets = 0;
- kfree(ffl->commit_info.buckets);
- ffl->commit_info.buckets = NULL;
- }
+ pnfs_generic_ds_cinfo_release_lseg(&ffl->commit_info, lseg);
spin_unlock(&inode->i_lock);
}
_ff_layout_free_lseg(fls);
}
-/* Return 1 until we have multiple lsegs support */
-static int
-ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
-{
- return 1;
-}
-
static void
nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
@@ -746,52 +714,6 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
spin_unlock(&mirror->lock);
}
-static int
-ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo,
- gfp_t gfp_flags)
-{
- struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
- struct pnfs_commit_bucket *buckets;
- int size;
-
- if (cinfo->ds->nbuckets != 0) {
- /* This assumes there is only one RW lseg per file.
- * To support multiple lseg per file, we need to
- * change struct pnfs_commit_bucket to allow dynamic
- * increasing nbuckets.
- */
- return 0;
- }
-
- size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);
-
- buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
- gfp_flags);
- if (!buckets)
- return -ENOMEM;
- else {
- int i;
-
- spin_lock(&cinfo->inode->i_lock);
- if (cinfo->ds->nbuckets != 0)
- kfree(buckets);
- else {
- cinfo->ds->buckets = buckets;
- cinfo->ds->nbuckets = size;
- for (i = 0; i < size; i++) {
- INIT_LIST_HEAD(&buckets[i].written);
- INIT_LIST_HEAD(&buckets[i].committing);
- /* mark direct verifier as unset */
- buckets[i].direct_verf.committed =
- NFS_INVALID_STABLE_HOW;
- }
- }
- spin_unlock(&cinfo->inode->i_lock);
- return 0;
- }
-}
-
static void
ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
{
@@ -876,8 +798,8 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
pnfs_put_lseg(pgio->pg_lseg);
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
nfs_req_openctx(req),
- 0,
- NFS4_MAX_UINT64,
+ req_offset(req),
+ req->wb_bytes,
IOMODE_READ,
strict_iomode,
GFP_KERNEL);
@@ -888,6 +810,14 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
}
static void
+ff_layout_pg_check_layout(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ pnfs_generic_pg_check_layout(pgio);
+ pnfs_generic_pg_check_range(pgio, req);
+}
+
+static void
ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
@@ -897,7 +827,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
int ds_idx;
retry:
- pnfs_generic_pg_check_layout(pgio);
+ ff_layout_pg_check_layout(pgio, req);
/* Use full layout for now */
if (!pgio->pg_lseg) {
ff_layout_pg_get_read(pgio, req, false);
@@ -953,18 +883,16 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
{
struct nfs4_ff_layout_mirror *mirror;
struct nfs_pgio_mirror *pgm;
- struct nfs_commit_info cinfo;
struct nfs4_pnfs_ds *ds;
int i;
- int status;
retry:
- pnfs_generic_pg_check_layout(pgio);
+ ff_layout_pg_check_layout(pgio, req);
if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
nfs_req_openctx(req),
- 0,
- NFS4_MAX_UINT64,
+ req_offset(req),
+ req->wb_bytes,
IOMODE_RW,
false,
GFP_NOFS);
@@ -978,11 +906,6 @@ retry:
if (pgio->pg_lseg == NULL)
goto out_mds;
- nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
- status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
- if (status < 0)
- goto out_mds;
-
/* Use a direct mapping of ds_idx to pgio mirror_idx */
if (WARN_ON_ONCE(pgio->pg_mirror_count !=
FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
@@ -1297,21 +1220,23 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
}
}
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+ mirror, offset, length, status, opnum,
+ GFP_NOIO);
+
switch (status) {
case NFS4ERR_DELAY:
case NFS4ERR_GRACE:
- return;
- default:
break;
+ case NFS4ERR_NXIO:
+ ff_layout_mark_ds_unreachable(lseg, idx);
+ /* Fallthrough */
+ default:
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+ lseg);
}
- mirror = FF_LAYOUT_COMP(lseg, idx);
- err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
- mirror, offset, length, status, opnum,
- GFP_NOIO);
- if (status == NFS4ERR_NXIO)
- ff_layout_mark_ds_unreachable(lseg, idx);
- pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
}
@@ -2012,6 +1937,33 @@ ff_layout_get_ds_info(struct inode *inode)
}
static void
+ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ struct pnfs_commit_array *array, *new;
+
+ new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, GFP_NOIO);
+ if (new) {
+ spin_lock(&inode->i_lock);
+ array = pnfs_add_commit_array(fl_cinfo, new, lseg);
+ spin_unlock(&inode->i_lock);
+ if (array != new)
+ pnfs_free_commit_array(new);
+ }
+}
+
+static void
+ff_layout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_destroy(fl_cinfo);
+ spin_unlock(&inode->i_lock);
+}
+
+static void
ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
{
nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
@@ -2496,6 +2448,16 @@ ff_layout_set_layoutdriver(struct nfs_server *server,
return 0;
}
+static const struct pnfs_commit_ops ff_layout_commit_ops = {
+ .setup_ds_info = ff_layout_setup_ds_info,
+ .release_ds_info = ff_layout_release_ds_info,
+ .mark_request_commit = pnfs_layout_mark_request_commit,
+ .clear_request_commit = pnfs_generic_clear_request_commit,
+ .scan_commit_lists = pnfs_generic_scan_commit_lists,
+ .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
+ .commit_pagelist = ff_layout_commit_pagelist,
+};
+
static struct pnfs_layoutdriver_type flexfilelayout_type = {
.id = LAYOUT_FLEX_FILES,
.name = "LAYOUT_FLEX_FILES",
@@ -2512,11 +2474,6 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.pg_write_ops = &ff_layout_pg_write_ops,
.get_ds_info = ff_layout_get_ds_info,
.free_deviceid_node = ff_layout_free_deviceid_node,
- .mark_request_commit = pnfs_layout_mark_request_commit,
- .clear_request_commit = pnfs_generic_clear_request_commit,
- .scan_commit_lists = pnfs_generic_scan_commit_lists,
- .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
- .commit_pagelist = ff_layout_commit_pagelist,
.read_pagelist = ff_layout_read_pagelist,
.write_pagelist = ff_layout_write_pagelist,
.alloc_deviceid_node = ff_layout_alloc_deviceid_node,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 2f369966abf7..354a031c69b1 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -99,7 +99,7 @@ struct nfs4_ff_layout_segment {
u64 stripe_unit;
u32 flags;
u32 mirror_array_cnt;
- struct nfs4_ff_layout_mirror **mirror_array;
+ struct nfs4_ff_layout_mirror *mirror_array[];
};
struct nfs4_flexfile_layout {
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index e113fcb4bb4c..ccc88be88d6a 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -190,6 +190,7 @@ static const struct constant_table nfs_vers_tokens[] = {
{ "4.0", Opt_vers_4_0 },
{ "4.1", Opt_vers_4_1 },
{ "4.2", Opt_vers_4_2 },
+ {}
};
enum {
@@ -202,13 +203,14 @@ enum {
nr__Opt_xprt
};
-static const struct constant_table nfs_xprt_protocol_tokens[nr__Opt_xprt] = {
+static const struct constant_table nfs_xprt_protocol_tokens[] = {
{ "rdma", Opt_xprt_rdma },
{ "rdma6", Opt_xprt_rdma6 },
{ "tcp", Opt_xprt_tcp },
{ "tcp6", Opt_xprt_tcp6 },
{ "udp", Opt_xprt_udp },
{ "udp6", Opt_xprt_udp6 },
+ {}
};
enum {
@@ -239,6 +241,7 @@ static const struct constant_table nfs_secflavor_tokens[] = {
{ "spkm3i", Opt_sec_spkmi },
{ "spkm3p", Opt_sec_spkmp },
{ "sys", Opt_sec_sys },
+ {}
};
/*
@@ -1135,7 +1138,7 @@ out_no_address:
return nfs_invalf(fc, "NFS4: mount program didn't pass remote address");
out_invalid_transport_udp:
- return nfs_invalf(fc, "NFSv4: Unsupported transport protocol udp");
+ return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
}
#endif
@@ -1257,7 +1260,7 @@ out_v4_not_compiled:
nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel");
return -EPROTONOSUPPORT;
out_invalid_transport_udp:
- return nfs_invalf(fc, "NFSv4: Unsupported transport protocol udp");
+ return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
out_no_address:
return nfs_invalf(fc, "NFS: mount program didn't pass remote address");
out_mountproto_mismatch:
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b012c2668a1f..aaeeb4659bff 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -73,6 +73,7 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
struct inode *inode;
char *name;
int error = -ENOMEM;
+ unsigned long kflags = 0, kflags_out = 0;
name = kstrdup(fc->source, GFP_KERNEL);
if (!name)
@@ -83,11 +84,14 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
if (fsinfo.fattr == NULL)
goto out_name;
+ fsinfo.fattr->label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(fsinfo.fattr->label))
+ goto out_fattr;
error = server->nfs_client->rpc_ops->getroot(server, ctx->mntfh, &fsinfo);
if (error < 0) {
dprintk("nfs_get_root: getattr error = %d\n", -error);
nfs_errorf(fc, "NFS: Couldn't getattr on root");
- goto out_fattr;
+ goto out_label;
}
inode = nfs_fhget(s, ctx->mntfh, fsinfo.fattr, NULL);
@@ -95,12 +99,12 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
dprintk("nfs_get_root: get root inode failed\n");
error = PTR_ERR(inode);
nfs_errorf(fc, "NFS: Couldn't get root inode");
- goto out_fattr;
+ goto out_label;
}
error = nfs_superblock_set_dummy_root(s, inode);
if (error != 0)
- goto out_fattr;
+ goto out_label;
/* root dentries normally start off anonymous and get spliced in later
* if the dentry tree reaches them; however if the dentry already
@@ -111,7 +115,7 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
dprintk("nfs_get_root: get root dentry failed\n");
error = PTR_ERR(root);
nfs_errorf(fc, "NFS: Couldn't get root dentry");
- goto out_fattr;
+ goto out_label;
}
security_d_instantiate(root, inode);
@@ -123,12 +127,39 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
}
spin_unlock(&root->d_lock);
fc->root = root;
+ if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
+ kflags |= SECURITY_LSM_NATIVE_LABELS;
+ if (ctx->clone_data.sb) {
+ if (d_inode(fc->root)->i_fop != &nfs_dir_operations) {
+ error = -ESTALE;
+ goto error_splat_root;
+ }
+ /* clone lsm security options from the parent to the new sb */
+ error = security_sb_clone_mnt_opts(ctx->clone_data.sb,
+ s, kflags, &kflags_out);
+ } else {
+ error = security_sb_set_mnt_opts(s, fc->security,
+ kflags, &kflags_out);
+ }
+ if (error)
+ goto error_splat_root;
+ if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
+ !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
+ NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
+
+ nfs_setsecurity(inode, fsinfo.fattr, fsinfo.fattr->label);
error = 0;
+out_label:
+ nfs4_label_free(fsinfo.fattr->label);
out_fattr:
nfs_free_fattr(fsinfo.fattr);
out_name:
kfree(name);
out:
return error;
+error_splat_root:
+ dput(fc->root);
+ fc->root = NULL;
+ goto out_label;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 11bf15800ac9..b9d0921cb4fe 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -62,7 +62,6 @@
/* Default is to see 64-bit inode numbers */
static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
-static void nfs_invalidate_inode(struct inode *);
static int nfs_update_inode(struct inode *, struct nfs_fattr *);
static struct kmem_cache * nfs_inode_cachep;
@@ -284,10 +283,18 @@ EXPORT_SYMBOL_GPL(nfs_invalidate_atime);
* Invalidate, but do not unhash, the inode.
* NB: must be called with inode->i_lock held!
*/
-static void nfs_invalidate_inode(struct inode *inode)
+static void nfs_set_inode_stale_locked(struct inode *inode)
{
set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
nfs_zap_caches_locked(inode);
+ trace_nfs_set_inode_stale(inode);
+}
+
+void nfs_set_inode_stale(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ nfs_set_inode_stale_locked(inode);
+ spin_unlock(&inode->i_lock);
}
struct nfs_find_desc {
@@ -959,16 +966,16 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
struct file *filp)
{
struct nfs_open_context *ctx;
- const struct cred *cred = get_current_cred();
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx) {
- put_cred(cred);
+ if (!ctx)
return ERR_PTR(-ENOMEM);
- }
nfs_sb_active(dentry->d_sb);
ctx->dentry = dget(dentry);
- ctx->cred = cred;
+ if (filp)
+ ctx->cred = get_cred(filp->f_cred);
+ else
+ ctx->cred = get_current_cred();
ctx->ll_cred = NULL;
ctx->state = NULL;
ctx->mode = f_mode;
@@ -1163,9 +1170,10 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
status = 0;
break;
case -ESTALE:
- nfs_zap_caches(inode);
if (!S_ISDIR(inode->i_mode))
- set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+ nfs_set_inode_stale(inode);
+ else
+ nfs_zap_caches(inode);
}
goto err_out;
}
@@ -2064,7 +2072,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
* lookup validation will know that the inode is bad.
* (But we fall through to invalidate the caches.)
*/
- nfs_invalidate_inode(inode);
+ nfs_set_inode_stale_locked(inode);
return -ESTALE;
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f80c47d5ff27..1f32a9fbfdaf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -274,12 +274,6 @@ void nfs_free_request(struct nfs_page *req);
struct nfs_pgio_mirror *
nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
-static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
-{
- WARN_ON_ONCE(desc->pg_mirror_count < 1);
- return desc->pg_mirror_count > 1;
-}
-
static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
const struct nfs_open_context *ctx2)
{
@@ -417,7 +411,9 @@ extern int __init register_nfs_fs(void);
extern void __exit unregister_nfs_fs(void);
extern bool nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb);
-
+extern int nfs_client_for_each_server(struct nfs_client *clp,
+ int (*fn)(struct nfs_server *, void *),
+ void *data);
/* io.c */
extern void nfs_start_io_read(struct inode *inode);
extern void nfs_end_io_read(struct inode *inode);
@@ -515,13 +511,25 @@ int nfs_filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
#ifdef CONFIG_NFS_V4_1
+static inline void
+pnfs_bucket_clear_pnfs_ds_commit_verifiers(struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets)
+{
+ unsigned int i;
+
+ for (i = 0; i < nbuckets; i++)
+ buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+}
static inline
void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
{
- int i;
+ struct pnfs_commit_array *array;
- for (i = 0; i < cinfo->nbuckets; i++)
- cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list)
+ pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets,
+ array->nbuckets);
+ rcu_read_unlock();
}
#else
static inline
@@ -542,6 +550,14 @@ nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
return memcmp(v1->data, v2->data, sizeof(v1->data));
}
+static inline bool
+nfs_write_match_verf(const struct nfs_writeverf *verf,
+ struct nfs_page *req)
+{
+ return verf->committed > NFS_UNSTABLE &&
+ !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier);
+}
+
/* unlink.c */
extern struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index f3ece8ed3203..6b063227e34e 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -145,6 +145,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
struct vfsmount *mnt = ERR_PTR(-ENOMEM);
struct nfs_server *server = NFS_SERVER(d_inode(path->dentry));
struct nfs_client *client = server->nfs_client;
+ int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout);
int ret;
if (IS_ROOT(path->dentry))
@@ -190,12 +191,12 @@ struct vfsmount *nfs_d_automount(struct path *path)
if (IS_ERR(mnt))
goto out_fc;
- if (nfs_mountpoint_expiry_timeout < 0)
+ mntget(mnt); /* prevent immediate expiration */
+ if (timeout <= 0)
goto out_fc;
- mntget(mnt); /* prevent immediate expiration */
mnt_set_expiry(mnt, &nfs_automount_list);
- schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+ schedule_delayed_work(&nfs_automount_task, timeout);
out_fc:
put_fs_context(fc);
@@ -233,10 +234,11 @@ const struct inode_operations nfs_referral_inode_operations = {
static void nfs_expire_automounts(struct work_struct *work)
{
struct list_head *list = &nfs_automount_list;
+ int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout);
mark_mounts_for_expiry(list);
- if (!list_empty(list))
- schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+ if (!list_empty(list) && timeout > 0)
+ schedule_delayed_work(&nfs_automount_task, timeout);
}
void nfs_release_automount_timer(void)
@@ -247,10 +249,7 @@ void nfs_release_automount_timer(void)
/**
* nfs_do_submount - set up mountpoint when crossing a filesystem boundary
- * @dentry: parent directory
- * @fh: filehandle for new root dentry
- * @fattr: attributes for new root inode
- * @authflavor: security flavor to use when performing the mount
+ * @fc: pointer to struct nfs_fs_context
*
*/
int nfs_do_submount(struct fs_context *fc)
@@ -312,3 +311,53 @@ int nfs_submount(struct fs_context *fc, struct nfs_server *server)
return nfs_do_submount(fc);
}
EXPORT_SYMBOL_GPL(nfs_submount);
+
+static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp)
+{
+ long num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = kstrtol(val, 0, &num);
+ if (ret)
+ return -EINVAL;
+ if (num > 0) {
+ if (num >= INT_MAX / HZ)
+ num = INT_MAX;
+ else
+ num *= HZ;
+ *((int *)kp->arg) = num;
+ if (!list_empty(&nfs_automount_list))
+ mod_delayed_work(system_wq, &nfs_automount_task, num);
+ } else {
+ *((int *)kp->arg) = -1*HZ;
+ cancel_delayed_work(&nfs_automount_task);
+ }
+ return 0;
+}
+
+static int param_get_nfs_timeout(char *buffer, const struct kernel_param *kp)
+{
+ long num = *((int *)kp->arg);
+
+ if (num > 0) {
+ if (num >= INT_MAX - (HZ - 1))
+ num = INT_MAX / HZ;
+ else
+ num = (num + (HZ - 1)) / HZ;
+ } else
+ num = -1;
+ return scnprintf(buffer, PAGE_SIZE, "%li\n", num);
+}
+
+static const struct kernel_param_ops param_ops_nfs_timeout = {
+ .set = param_set_nfs_timeout,
+ .get = param_get_nfs_timeout,
+};
+#define param_check_nfs_timeout(name, p) __param_check(name, p, int);
+
+module_param(nfs_mountpoint_expiry_timeout, nfs_timeout, 0644);
+MODULE_PARM_DESC(nfs_mountpoint_expiry_timeout,
+ "Set the NFS automounted mountpoint timeout value (seconds)."
+ "Values <= 0 turn expiration off.");
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 8be1ba7c62bb..2b7f6dcd2eb8 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -42,7 +42,9 @@ enum nfs4_client_state {
NFS4CLNT_LEASE_MOVED,
NFS4CLNT_DELEGATION_EXPIRED,
NFS4CLNT_RUN_MANAGER,
- NFS4CLNT_DELEGRETURN_RUNNING,
+ NFS4CLNT_RECALL_RUNNING,
+ NFS4CLNT_RECALL_ANY_LAYOUT_READ,
+ NFS4CLNT_RECALL_ANY_LAYOUT_RW,
};
#define NFS4_RENEW_TIMEOUT 0x01
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 1297919e0fce..8e5d6223ddd3 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -252,6 +252,9 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
if (remap_flags & ~REMAP_FILE_ADVISORY)
return -EINVAL;
+ if (IS_SWAPFILE(dst_inode) || IS_SWAPFILE(src_inode))
+ return -ETXTBSY;
+
/* check alignment w.r.t. clone_blksize */
ret = -EINVAL;
if (bs) {
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 84026e7b8a5f..a3ab6e219061 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -354,7 +354,7 @@ static int try_location(struct fs_context *fc,
/**
* nfs_follow_referral - set up mountpoint when hitting a referral on moved error
- * @dentry: parent directory
+ * @fc: pointer to struct nfs_fs_context
* @locations: array of NFSv4 server location information
*
*/
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 69b7ab7a5815..512afb1c7867 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2346,7 +2346,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
.callback_ops = &nfs4_open_confirm_ops,
.callback_data = data,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int status;
@@ -2511,7 +2511,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data,
.callback_ops = &nfs4_open_ops,
.callback_data = data,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int status;
@@ -2790,16 +2790,19 @@ static int nfs41_check_delegation_stateid(struct nfs4_state *state)
return NFS_OK;
}
+ spin_lock(&delegation->lock);
nfs4_stateid_copy(&stateid, &delegation->stateid);
if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED,
&delegation->flags)) {
+ spin_unlock(&delegation->lock);
rcu_read_unlock();
return NFS_OK;
}
if (delegation->cred)
cred = get_cred(delegation->cred);
+ spin_unlock(&delegation->lock);
rcu_read_unlock();
status = nfs41_test_and_free_expired_stateid(server, &stateid, cred);
trace_nfs4_test_delegation_stateid(state, NULL, status);
@@ -3651,7 +3654,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
.rpc_message = &msg,
.callback_ops = &nfs4_close_ops,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int status = -ENOMEM;
@@ -4002,7 +4005,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
{
int error;
struct nfs_fattr *fattr = info->fattr;
- struct nfs4_label *label = NULL;
+ struct nfs4_label *label = fattr->label;
error = nfs4_server_capabilities(server, mntfh);
if (error < 0) {
@@ -4010,23 +4013,17 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
return error;
}
- label = nfs4_label_alloc(server, GFP_KERNEL);
- if (IS_ERR(label))
- return PTR_ERR(label);
-
error = nfs4_proc_getattr(server, mntfh, fattr, label, NULL);
if (error < 0) {
dprintk("nfs4_get_root: getattr error = %d\n", -error);
- goto err_free_label;
+ goto out;
}
if (fattr->valid & NFS_ATTR_FATTR_FSID &&
!nfs_fsid_equal(&server->fsid, &fattr->fsid))
memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
-err_free_label:
- nfs4_label_free(label);
-
+out:
return error;
}
@@ -5550,7 +5547,7 @@ unwind:
struct nfs4_cached_acl {
int cached;
size_t len;
- char data[0];
+ char data[];
};
static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl)
@@ -6259,6 +6256,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
/* Fallthrough */
case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_STALE_STATEID:
+ case -ETIMEDOUT:
task->tk_status = 0;
break;
case -NFS4ERR_OLD_STATEID:
@@ -6349,7 +6347,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
.rpc_client = server->client,
.rpc_message = &msg,
.callback_ops = &nfs4_delegreturn_ops,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | RPC_TASK_TIMEOUT,
};
int status = 0;
@@ -6932,7 +6930,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
.rpc_message = &msg,
.callback_ops = &nfs4_lock_ops,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int ret;
@@ -9176,7 +9174,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
.rpc_message = &msg,
.callback_ops = &nfs4_layoutget_call_ops,
.callback_data = lgp,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
struct pnfs_layout_segment *lseg = NULL;
struct nfs4_exception exception = {
@@ -9293,6 +9291,7 @@ static void nfs4_layoutreturn_release(void *calldata)
lrp->ld_private.ops->free(&lrp->ld_private);
pnfs_put_layout_hdr(lrp->args.layout);
nfs_iput_and_deactive(lrp->inode);
+ put_cred(lrp->cred);
kfree(calldata);
dprintk("<-- %s\n", __func__);
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f7723d221945..ac93715c05a4 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2524,6 +2524,21 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
}
return 0;
}
+
+static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
+{
+ int iomode = 0;
+
+ if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &clp->cl_state))
+ iomode += IOMODE_READ;
+ if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &clp->cl_state))
+ iomode += IOMODE_RW;
+ /* Note: IOMODE_READ + IOMODE_RW == IOMODE_ANY */
+ if (iomode) {
+ pnfs_layout_return_unused_byclid(clp, iomode);
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+ }
+}
#else /* CONFIG_NFS_V4_1 */
static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
@@ -2531,6 +2546,10 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
{
return 0;
}
+
+static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
+{
+}
#endif /* CONFIG_NFS_V4_1 */
static void nfs4_state_manager(struct nfs_client *clp)
@@ -2635,12 +2654,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
- if (!test_and_set_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state)) {
+ if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) {
if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
nfs_client_return_marked_delegations(clp);
set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
}
- clear_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state);
+ nfs4_layoutreturn_any_run(clp);
+ clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state);
}
/* Did we race with an attempt to give us more work? */
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 1e97e5e04cb4..543541173a3d 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -584,7 +584,9 @@ TRACE_DEFINE_ENUM(NFS4CLNT_MOVED);
TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED);
TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED);
TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER);
-TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_RUNNING);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_READ);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_RW);
#define show_nfs4_clp_state(state) \
__print_flags(state, "|", \
@@ -605,7 +607,9 @@ TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING);
{ NFS4CLNT_LEASE_MOVED, "LEASE_MOVED" }, \
{ NFS4CLNT_DELEGATION_EXPIRED, "DELEGATION_EXPIRED" }, \
{ NFS4CLNT_RUN_MANAGER, "RUN_MANAGER" }, \
- { NFS4CLNT_DELEGRETURN_RUNNING, "DELEGRETURN_RUNNING" })
+ { NFS4CLNT_RECALL_RUNNING, "RECALL_RUNNING" }, \
+ { NFS4CLNT_RECALL_ANY_LAYOUT_READ, "RECALL_ANY_LAYOUT_READ" }, \
+ { NFS4CLNT_RECALL_ANY_LAYOUT_RW, "RECALL_ANY_LAYOUT_RW" })
TRACE_EVENT(nfs4_state_mgr,
TP_PROTO(
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index effaa4247b91..8d3278805602 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -88,7 +88,7 @@
#define NFS_ROOT "/tftpboot/%s"
/* Default NFSROOT mount options. */
-#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096"
+#define NFS_DEF_OPTIONS "vers=2,tcp,rsize=4096,wsize=4096"
/* Parameters passed from the kernel command line */
static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = "";
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index a9588d19a5ae..7e7a97ae21ed 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -181,6 +181,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done,
int error \
), \
TP_ARGS(inode, error))
+DEFINE_NFS_INODE_EVENT(nfs_set_inode_stale);
DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit);
DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 20b3717cd7ca..f61f96603df7 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -33,9 +33,7 @@ static const struct rpc_call_ops nfs_pgio_common_ops;
struct nfs_pgio_mirror *
nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
{
- return nfs_pgio_has_mirroring(desc) ?
- &desc->pg_mirrors[desc->pg_mirror_idx] :
- &desc->pg_mirrors[0];
+ return &desc->pg_mirrors[desc->pg_mirror_idx];
}
EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror);
@@ -133,47 +131,166 @@ nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx)
EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait);
/*
- * nfs_page_group_lock - lock the head of the page group
- * @req - request in group that is to be locked
+ * nfs_page_lock_head_request - page lock the head of the page group
+ * @req: any member of the page group
+ */
+struct nfs_page *
+nfs_page_group_lock_head(struct nfs_page *req)
+{
+ struct nfs_page *head = req->wb_head;
+
+ while (!nfs_lock_request(head)) {
+ int ret = nfs_wait_on_request(head);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ }
+ if (head != req)
+ kref_get(&head->wb_kref);
+ return head;
+}
+
+/*
+ * nfs_unroll_locks - unlock all newly locked reqs and wait on @req
+ * @head: head request of page group, must be holding head lock
+ * @req: request that couldn't lock and needs to wait on the req bit lock
*
- * this lock must be held when traversing or modifying the page
- * group list
+ * This is a helper function for nfs_lock_and_join_requests
+ * returns 0 on success, < 0 on error.
+ */
+static void
+nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req)
+{
+ struct nfs_page *tmp;
+
+ /* relinquish all the locks successfully grabbed this run */
+ for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) {
+ if (!kref_read(&tmp->wb_kref))
+ continue;
+ nfs_unlock_and_release_request(tmp);
+ }
+}
+
+/*
+ * nfs_page_group_lock_subreq - try to lock a subrequest
+ * @head: head request of page group
+ * @subreq: request to lock
*
- * return 0 on success, < 0 on error
+ * This is a helper function for nfs_lock_and_join_requests which
+ * must be called with the head request and page group both locked.
+ * On error, it returns with the page group unlocked.
*/
-int
-nfs_page_group_lock(struct nfs_page *req)
+static int
+nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq)
{
- struct nfs_page *head = req->wb_head;
+ int ret;
+
+ if (!kref_get_unless_zero(&subreq->wb_kref))
+ return 0;
+ while (!nfs_lock_request(subreq)) {
+ nfs_page_group_unlock(head);
+ ret = nfs_wait_on_request(subreq);
+ if (!ret)
+ ret = nfs_page_group_lock(head);
+ if (ret < 0) {
+ nfs_unroll_locks(head, subreq);
+ nfs_release_request(subreq);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/*
+ * nfs_page_group_lock_subrequests - try to lock the subrequests
+ * @head: head request of page group
+ *
+ * This is a helper function for nfs_lock_and_join_requests which
+ * must be called with the head request locked.
+ */
+int nfs_page_group_lock_subrequests(struct nfs_page *head)
+{
+ struct nfs_page *subreq;
+ int ret;
- WARN_ON_ONCE(head != head->wb_head);
+ ret = nfs_page_group_lock(head);
+ if (ret < 0)
+ return ret;
+ /* lock each request in the page group */
+ for (subreq = head->wb_this_page; subreq != head;
+ subreq = subreq->wb_this_page) {
+ ret = nfs_page_group_lock_subreq(head, subreq);
+ if (ret < 0)
+ return ret;
+ }
+ nfs_page_group_unlock(head);
+ return 0;
+}
- if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags))
+/*
+ * nfs_page_set_headlock - set the request PG_HEADLOCK
+ * @req: request that is to be locked
+ *
+ * this lock must be held when modifying req->wb_head
+ *
+ * return 0 on success, < 0 on error
+ */
+int
+nfs_page_set_headlock(struct nfs_page *req)
+{
+ if (!test_and_set_bit(PG_HEADLOCK, &req->wb_flags))
return 0;
- set_bit(PG_CONTENDED1, &head->wb_flags);
+ set_bit(PG_CONTENDED1, &req->wb_flags);
smp_mb__after_atomic();
- return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+ return wait_on_bit_lock(&req->wb_flags, PG_HEADLOCK,
TASK_UNINTERRUPTIBLE);
}
/*
- * nfs_page_group_unlock - unlock the head of the page group
- * @req - request in group that is to be unlocked
+ * nfs_page_clear_headlock - clear the request PG_HEADLOCK
+ * @req: request that is to be locked
*/
void
-nfs_page_group_unlock(struct nfs_page *req)
+nfs_page_clear_headlock(struct nfs_page *req)
{
- struct nfs_page *head = req->wb_head;
-
- WARN_ON_ONCE(head != head->wb_head);
-
smp_mb__before_atomic();
- clear_bit(PG_HEADLOCK, &head->wb_flags);
+ clear_bit(PG_HEADLOCK, &req->wb_flags);
smp_mb__after_atomic();
- if (!test_bit(PG_CONTENDED1, &head->wb_flags))
+ if (!test_bit(PG_CONTENDED1, &req->wb_flags))
return;
- wake_up_bit(&head->wb_flags, PG_HEADLOCK);
+ wake_up_bit(&req->wb_flags, PG_HEADLOCK);
+}
+
+/*
+ * nfs_page_group_lock - lock the head of the page group
+ * @req: request in group that is to be locked
+ *
+ * this lock must be held when traversing or modifying the page
+ * group list
+ *
+ * return 0 on success, < 0 on error
+ */
+int
+nfs_page_group_lock(struct nfs_page *req)
+{
+ int ret;
+
+ ret = nfs_page_set_headlock(req);
+ if (ret || req->wb_head == req)
+ return ret;
+ return nfs_page_set_headlock(req->wb_head);
+}
+
+/*
+ * nfs_page_group_unlock - unlock the head of the page group
+ * @req: request in group that is to be unlocked
+ */
+void
+nfs_page_group_unlock(struct nfs_page *req)
+{
+ if (req != req->wb_head)
+ nfs_page_clear_headlock(req->wb_head);
+ nfs_page_clear_headlock(req);
}
/*
@@ -359,15 +476,23 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
}
static struct nfs_page *
-nfs_create_subreq(struct nfs_page *req, struct nfs_page *last,
- unsigned int pgbase, unsigned int offset,
+nfs_create_subreq(struct nfs_page *req,
+ unsigned int pgbase,
+ unsigned int offset,
unsigned int count)
{
+ struct nfs_page *last;
struct nfs_page *ret;
ret = __nfs_create_request(req->wb_lock_context, req->wb_page,
pgbase, offset, count);
if (!IS_ERR(ret)) {
+ /* find the last request */
+ for (last = req->wb_head;
+ last->wb_this_page != req->wb_head;
+ last = last->wb_this_page)
+ ;
+
nfs_lock_request(ret);
ret->wb_index = req->wb_index;
nfs_page_group_init(ret, last);
@@ -627,9 +752,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
.callback_ops = call_ops,
.callback_data = hdr,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC | flags,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags,
};
- int ret = 0;
hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
@@ -641,18 +765,10 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
(unsigned long long)hdr->args.offset);
task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task)) {
- ret = PTR_ERR(task);
- goto out;
- }
- if (how & FLUSH_SYNC) {
- ret = rpc_wait_for_completion_task(task);
- if (ret == 0)
- ret = task->tk_status;
- }
+ if (IS_ERR(task))
+ return PTR_ERR(task);
rpc_put_task(task);
-out:
- return ret;
+ return 0;
}
EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
@@ -886,15 +1002,6 @@ static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
pgio->pg_mirror_count = mirror_count;
}
-/*
- * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
- */
-void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
-{
- pgio->pg_mirror_count = 1;
- pgio->pg_mirror_idx = 0;
-}
-
static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
{
pgio->pg_mirror_count = 1;
@@ -911,7 +1018,7 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
}
/**
- * nfs_can_coalesce_requests - test two requests for compatibility
+ * nfs_coalesce_size - test two requests for compatibility
* @prev: pointer to nfs_page
* @req: pointer to nfs_page
* @pgio: pointer to nfs_pagio_descriptor
@@ -920,41 +1027,36 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
* page data area they describe is contiguous, and that their RPC
* credentials, NFSv4 open state, and lockowners are the same.
*
- * Return 'true' if this is the case, else return 'false'.
+ * Returns size of the request that can be coalesced
*/
-static bool nfs_can_coalesce_requests(struct nfs_page *prev,
+static unsigned int nfs_coalesce_size(struct nfs_page *prev,
struct nfs_page *req,
struct nfs_pageio_descriptor *pgio)
{
- size_t size;
struct file_lock_context *flctx;
if (prev) {
if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev)))
- return false;
+ return 0;
flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx;
if (flctx != NULL &&
!(list_empty_careful(&flctx->flc_posix) &&
list_empty_careful(&flctx->flc_flock)) &&
!nfs_match_lock_context(req->wb_lock_context,
prev->wb_lock_context))
- return false;
+ return 0;
if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
- return false;
+ return 0;
if (req->wb_page == prev->wb_page) {
if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes)
- return false;
+ return 0;
} else {
if (req->wb_pgbase != 0 ||
prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE)
- return false;
+ return 0;
}
}
- size = pgio->pg_ops->pg_test(pgio, prev, req);
- WARN_ON_ONCE(size > req->wb_bytes);
- if (size && size < req->wb_bytes)
- req->wb_bytes = size;
- return size > 0;
+ return pgio->pg_ops->pg_test(pgio, prev, req);
}
/**
@@ -962,15 +1064,16 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
* @desc: destination io descriptor
* @req: request
*
- * Returns true if the request 'req' was successfully coalesced into the
- * existing list of pages 'desc'.
+ * If the request 'req' was successfully coalesced into the existing list
+ * of pages 'desc', it returns the size of req.
*/
-static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
- struct nfs_page *req)
+static unsigned int
+nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
{
struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_page *prev = NULL;
+ unsigned int size;
if (mirror->pg_count != 0) {
prev = nfs_list_entry(mirror->pg_list.prev);
@@ -990,11 +1093,12 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
return 0;
}
- if (!nfs_can_coalesce_requests(prev, req, desc))
- return 0;
+ size = nfs_coalesce_size(prev, req, desc);
+ if (size < req->wb_bytes)
+ return size;
nfs_list_move_request(req, &mirror->pg_list);
mirror->pg_count += req->wb_bytes;
- return 1;
+ return req->wb_bytes;
}
/*
@@ -1034,7 +1138,8 @@ nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
* @req: request
*
* This may split a request into subrequests which are all part of the
- * same page group.
+ * same page group. If so, it will submit @req as the last one, to ensure
+ * the pointer to @req is still valid in case of failure.
*
* Returns true if the request 'req' was successfully coalesced into the
* existing list of pages 'desc'.
@@ -1043,51 +1148,50 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_page *subreq;
- unsigned int bytes_left = 0;
- unsigned int offset, pgbase;
+ unsigned int size, subreq_size;
nfs_page_group_lock(req);
subreq = req;
- bytes_left = subreq->wb_bytes;
- offset = subreq->wb_offset;
- pgbase = subreq->wb_pgbase;
-
- do {
- if (!nfs_pageio_do_add_request(desc, subreq)) {
- /* make sure pg_test call(s) did nothing */
- WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
- WARN_ON_ONCE(subreq->wb_offset != offset);
- WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
-
+ subreq_size = subreq->wb_bytes;
+ for(;;) {
+ size = nfs_pageio_do_add_request(desc, subreq);
+ if (size == subreq_size) {
+ /* We successfully submitted a request */
+ if (subreq == req)
+ break;
+ req->wb_pgbase += size;
+ req->wb_bytes -= size;
+ req->wb_offset += size;
+ subreq_size = req->wb_bytes;
+ subreq = req;
+ continue;
+ }
+ if (WARN_ON_ONCE(subreq != req)) {
+ nfs_page_group_unlock(req);
+ nfs_pageio_cleanup_request(desc, subreq);
+ subreq = req;
+ subreq_size = req->wb_bytes;
+ nfs_page_group_lock(req);
+ }
+ if (!size) {
+ /* Can't coalesce any more, so do I/O */
nfs_page_group_unlock(req);
desc->pg_moreio = 1;
nfs_pageio_doio(desc);
if (desc->pg_error < 0 || mirror->pg_recoalesce)
- goto out_cleanup_subreq;
+ return 0;
/* retry add_request for this subreq */
nfs_page_group_lock(req);
continue;
}
-
- /* check for buggy pg_test call(s) */
- WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
- WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
- WARN_ON_ONCE(subreq->wb_bytes == 0);
-
- bytes_left -= subreq->wb_bytes;
- offset += subreq->wb_bytes;
- pgbase += subreq->wb_bytes;
-
- if (bytes_left) {
- subreq = nfs_create_subreq(req, subreq, pgbase,
- offset, bytes_left);
- if (IS_ERR(subreq))
- goto err_ptr;
- }
- } while (bytes_left > 0);
+ subreq = nfs_create_subreq(req, req->wb_pgbase,
+ req->wb_offset, size);
+ if (IS_ERR(subreq))
+ goto err_ptr;
+ subreq_size = size;
+ }
nfs_page_group_unlock(req);
return 1;
@@ -1095,10 +1199,6 @@ err_ptr:
desc->pg_error = PTR_ERR(subreq);
nfs_page_group_unlock(req);
return 0;
-out_cleanup_subreq:
- if (req != subreq)
- nfs_pageio_cleanup_request(desc, subreq);
- return 0;
}
static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
@@ -1167,7 +1267,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
{
u32 midx;
unsigned int pgbase, offset, bytes;
- struct nfs_page *dupreq, *lastreq;
+ struct nfs_page *dupreq;
pgbase = req->wb_pgbase;
offset = req->wb_offset;
@@ -1177,38 +1277,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (desc->pg_error < 0)
goto out_failed;
- for (midx = 0; midx < desc->pg_mirror_count; midx++) {
- if (midx) {
- nfs_page_group_lock(req);
+ /* Create the mirror instances first, and fire them off */
+ for (midx = 1; midx < desc->pg_mirror_count; midx++) {
+ nfs_page_group_lock(req);
- /* find the last request */
- for (lastreq = req->wb_head;
- lastreq->wb_this_page != req->wb_head;
- lastreq = lastreq->wb_this_page)
- ;
+ dupreq = nfs_create_subreq(req,
+ pgbase, offset, bytes);
- dupreq = nfs_create_subreq(req, lastreq,
- pgbase, offset, bytes);
-
- nfs_page_group_unlock(req);
- if (IS_ERR(dupreq)) {
- desc->pg_error = PTR_ERR(dupreq);
- goto out_failed;
- }
- } else
- dupreq = req;
+ nfs_page_group_unlock(req);
+ if (IS_ERR(dupreq)) {
+ desc->pg_error = PTR_ERR(dupreq);
+ goto out_failed;
+ }
- if (nfs_pgio_has_mirroring(desc))
- desc->pg_mirror_idx = midx;
+ desc->pg_mirror_idx = midx;
if (!nfs_pageio_add_request_mirror(desc, dupreq))
goto out_cleanup_subreq;
}
+ desc->pg_mirror_idx = 0;
+ if (!nfs_pageio_add_request_mirror(desc, req))
+ goto out_failed;
+
return 1;
out_cleanup_subreq:
- if (req != dupreq)
- nfs_pageio_cleanup_request(desc, dupreq);
+ nfs_pageio_cleanup_request(desc, dupreq);
out_failed:
nfs_pageio_error_cleanup(desc);
return 0;
@@ -1226,8 +1320,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx];
u32 restore_idx = desc->pg_mirror_idx;
- if (nfs_pgio_has_mirroring(desc))
- desc->pg_mirror_idx = mirror_idx;
+ desc->pg_mirror_idx = mirror_idx;
for (;;) {
nfs_pageio_doio(desc);
if (desc->pg_error < 0 || !mirror->pg_recoalesce)
@@ -1320,6 +1413,14 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
}
}
+/*
+ * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
+ */
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
+{
+ nfs_pageio_complete(pgio);
+}
+
int __init nfs_init_nfspagecache(void)
{
nfs_page_cachep = kmem_cache_create("nfs_page",
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 542ea8dfd1bc..b8d78f393365 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -268,11 +268,11 @@ pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
struct nfs_server *server = NFS_SERVER(lo->plh_inode);
struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
- if (!list_empty(&lo->plh_layouts)) {
+ if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
struct nfs_client *clp = server->nfs_client;
spin_lock(&clp->cl_lock);
- list_del_init(&lo->plh_layouts);
+ list_del_rcu(&lo->plh_layouts);
spin_unlock(&clp->cl_lock);
}
put_cred(lo->plh_lc_cred);
@@ -309,6 +309,16 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
}
}
+static struct inode *
+pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct inode *inode = igrab(lo->plh_inode);
+ if (inode)
+ return inode;
+ set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
+ return NULL;
+}
+
static void
pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
u32 seq)
@@ -496,6 +506,7 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
{
INIT_LIST_HEAD(&lseg->pls_list);
INIT_LIST_HEAD(&lseg->pls_lc_list);
+ INIT_LIST_HEAD(&lseg->pls_commits);
refcount_set(&lseg->pls_refcount, 1);
set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
lseg->pls_layout = lo;
@@ -782,9 +793,10 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
/* If the sb is being destroyed, just bail */
if (!nfs_sb_active(server->super))
break;
- inode = igrab(lo->plh_inode);
+ inode = pnfs_grab_inode_layout_hdr(lo);
if (inode != NULL) {
- list_del_init(&lo->plh_layouts);
+ if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags))
+ list_del_rcu(&lo->plh_layouts);
if (pnfs_layout_add_bulk_destroy_list(inode,
layout_list))
continue;
@@ -794,7 +806,6 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
} else {
rcu_read_unlock();
spin_unlock(&clp->cl_lock);
- set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
}
nfs_sb_deactive(server->super);
spin_lock(&clp->cl_lock);
@@ -903,10 +914,21 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
pnfs_destroy_layouts_byclid(clp, false);
}
+static void
+pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred)
+{
+ const struct cred *old;
+
+ if (cred && cred_fscmp(lo->plh_lc_cred, cred) != 0) {
+ old = xchg(&lo->plh_lc_cred, get_cred(cred));
+ put_cred(old);
+ }
+}
+
/* update lo->plh_stateid with new if is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
- bool update_barrier)
+ const struct cred *cred, bool update_barrier)
{
u32 oldseq, newseq, new_barrier = 0;
@@ -914,6 +936,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
newseq = be32_to_cpu(new->seqid);
if (!pnfs_layout_is_valid(lo)) {
+ pnfs_set_layout_cred(lo, cred);
nfs4_stateid_copy(&lo->plh_stateid, new);
lo->plh_barrier = newseq;
pnfs_clear_layoutreturn_info(lo);
@@ -1061,7 +1084,7 @@ pnfs_alloc_init_layoutget_args(struct inode *ino,
lgp->args.ctx = get_nfs_open_context(ctx);
nfs4_stateid_copy(&lgp->args.stateid, stateid);
lgp->gfp_flags = gfp_flags;
- lgp->cred = get_cred(ctx->cred);
+ lgp->cred = ctx->cred;
return lgp;
}
@@ -1072,7 +1095,6 @@ void pnfs_layoutget_free(struct nfs4_layoutget *lgp)
nfs4_free_pages(lgp->args.layout.pages, max_pages);
if (lgp->args.inode)
pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout);
- put_cred(lgp->cred);
put_nfs_open_context(lgp->args.ctx);
kfree(lgp);
}
@@ -1109,7 +1131,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
pnfs_free_returned_lsegs(lo, &freeme, range, seq);
- pnfs_set_layout_stateid(lo, stateid, true);
+ pnfs_set_layout_stateid(lo, stateid, NULL, true);
} else
pnfs_mark_layout_stateid_invalid(lo, &freeme);
out_unlock:
@@ -1122,6 +1144,7 @@ out_unlock:
static bool
pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
nfs4_stateid *stateid,
+ const struct cred **cred,
enum pnfs_iomode *iomode)
{
/* Serialise LAYOUTGET/LAYOUTRETURN */
@@ -1132,18 +1155,17 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
pnfs_get_layout_hdr(lo);
if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
- if (stateid != NULL) {
- nfs4_stateid_copy(stateid, &lo->plh_stateid);
- if (lo->plh_return_seq != 0)
- stateid->seqid = cpu_to_be32(lo->plh_return_seq);
- }
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ *cred = get_cred(lo->plh_lc_cred);
+ if (lo->plh_return_seq != 0)
+ stateid->seqid = cpu_to_be32(lo->plh_return_seq);
if (iomode != NULL)
*iomode = lo->plh_return_iomode;
pnfs_clear_layoutreturn_info(lo);
return true;
}
- if (stateid != NULL)
- nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ *cred = get_cred(lo->plh_lc_cred);
if (iomode != NULL)
*iomode = IOMODE_ANY;
return true;
@@ -1167,20 +1189,26 @@ pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args,
}
static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
- enum pnfs_iomode iomode, bool sync)
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *stateid,
+ const struct cred **pcred,
+ enum pnfs_iomode iomode,
+ bool sync)
{
struct inode *ino = lo->plh_inode;
struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
struct nfs4_layoutreturn *lrp;
+ const struct cred *cred = *pcred;
int status = 0;
+ *pcred = NULL;
lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
if (unlikely(lrp == NULL)) {
status = -ENOMEM;
spin_lock(&ino->i_lock);
pnfs_clear_layoutreturn_waitbit(lo);
spin_unlock(&ino->i_lock);
+ put_cred(cred);
pnfs_put_layout_hdr(lo);
goto out;
}
@@ -1188,7 +1216,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode);
lrp->args.ld_private = &lrp->ld_private;
lrp->clp = NFS_SERVER(ino)->nfs_client;
- lrp->cred = lo->plh_lc_cred;
+ lrp->cred = cred;
if (ld->prepare_layoutreturn)
ld->prepare_layoutreturn(&lrp->args);
@@ -1233,15 +1261,16 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
return;
spin_lock(&inode->i_lock);
if (pnfs_layout_need_return(lo)) {
+ const struct cred *cred;
nfs4_stateid stateid;
enum pnfs_iomode iomode;
bool send;
- send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
+ send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
spin_unlock(&inode->i_lock);
if (send) {
/* Send an async layoutreturn so we dont deadlock */
- pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
}
} else
spin_unlock(&inode->i_lock);
@@ -1261,6 +1290,7 @@ _pnfs_return_layout(struct inode *ino)
struct pnfs_layout_hdr *lo = NULL;
struct nfs_inode *nfsi = NFS_I(ino);
LIST_HEAD(tmp_list);
+ const struct cred *cred;
nfs4_stateid stateid;
int status = 0;
bool send, valid_layout;
@@ -1305,10 +1335,10 @@ _pnfs_return_layout(struct inode *ino)
goto out_put_layout_hdr;
}
- send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
+ send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL);
spin_unlock(&ino->i_lock);
if (send)
- status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
+ status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true);
out_put_layout_hdr:
pnfs_free_lseg_list(&tmp_list);
pnfs_put_layout_hdr(lo);
@@ -1354,6 +1384,7 @@ bool pnfs_roc(struct inode *ino,
struct nfs4_state *state;
struct pnfs_layout_hdr *lo;
struct pnfs_layout_segment *lseg, *next;
+ const struct cred *lc_cred;
nfs4_stateid stateid;
enum pnfs_iomode iomode = 0;
bool layoutreturn = false, roc = false;
@@ -1423,16 +1454,20 @@ retry:
* 2. we don't send layoutreturn
*/
/* lo ref dropped in pnfs_roc_release() */
- layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
+ layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode);
/* If the creds don't match, we can't compound the layoutreturn */
- if (!layoutreturn || cred_fscmp(cred, lo->plh_lc_cred) != 0)
+ if (!layoutreturn)
goto out_noroc;
+ if (cred_fscmp(cred, lc_cred) != 0)
+ goto out_noroc_put_cred;
roc = layoutreturn;
pnfs_init_layoutreturn_args(args, lo, &stateid, iomode);
res->lrs_present = 0;
layoutreturn = false;
+out_noroc_put_cred:
+ put_cred(lc_cred);
out_noroc:
spin_unlock(&ino->i_lock);
rcu_read_unlock();
@@ -1445,7 +1480,7 @@ out_noroc:
return true;
}
if (layoutreturn)
- pnfs_send_layoutreturn(lo, &stateid, iomode, true);
+ pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true);
pnfs_put_layout_hdr(lo);
return false;
}
@@ -1859,15 +1894,14 @@ static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
static void _add_to_server_list(struct pnfs_layout_hdr *lo,
struct nfs_server *server)
{
- if (list_empty(&lo->plh_layouts)) {
+ if (!test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
struct nfs_client *clp = server->nfs_client;
/* The lo must be on the clp list if there is any
* chance of a CB_LAYOUTRECALL(FILE) coming in.
*/
spin_lock(&clp->cl_lock);
- if (list_empty(&lo->plh_layouts))
- list_add_tail(&lo->plh_layouts, &server->layouts);
+ list_add_tail_rcu(&lo->plh_layouts, &server->layouts);
spin_unlock(&clp->cl_lock);
}
}
@@ -1989,6 +2023,7 @@ lookup_again:
goto lookup_again;
}
+ spin_unlock(&ino->i_lock);
first = true;
status = nfs4_select_rw_stateid(ctx->state,
iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ,
@@ -1998,12 +2033,12 @@ lookup_again:
trace_pnfs_update_layout(ino, pos, count,
iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_INVALID_OPEN);
- spin_unlock(&ino->i_lock);
nfs4_schedule_stateid_recovery(server, ctx->state);
pnfs_clear_first_layoutget(lo);
pnfs_put_layout_hdr(lo);
goto lookup_again;
}
+ spin_lock(&ino->i_lock);
} else {
nfs4_stateid_copy(&stateid, &lo->plh_stateid);
}
@@ -2323,14 +2358,14 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
if (!pnfs_layout_is_valid(lo)) {
/* We have a completely new layout */
- pnfs_set_layout_stateid(lo, &res->stateid, true);
+ pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true);
} else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
/* existing state ID, make sure the sequence number matches. */
if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
dprintk("%s forget reply due to sequence\n", __func__);
goto out_forget;
}
- pnfs_set_layout_stateid(lo, &res->stateid, false);
+ pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false);
} else {
/*
* We got an entirely new state ID. Mark all segments for the
@@ -2423,43 +2458,159 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
return -ENOENT;
}
-void pnfs_error_mark_layout_for_return(struct inode *inode,
- struct pnfs_layout_segment *lseg)
+static void
+pnfs_mark_layout_for_return(struct inode *inode,
+ const struct pnfs_layout_range *range)
{
- struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
- struct pnfs_layout_range range = {
- .iomode = lseg->pls_range.iomode,
- .offset = 0,
- .length = NFS4_MAX_UINT64,
- };
+ struct pnfs_layout_hdr *lo;
bool return_now = false;
spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
if (!pnfs_layout_is_valid(lo)) {
spin_unlock(&inode->i_lock);
return;
}
- pnfs_set_plh_return_info(lo, range.iomode, 0);
+ pnfs_set_plh_return_info(lo, range->iomode, 0);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works.
*/
- if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, &range, 0) != -EBUSY) {
+ if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) {
+ const struct cred *cred;
nfs4_stateid stateid;
enum pnfs_iomode iomode;
- return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
+ return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
spin_unlock(&inode->i_lock);
if (return_now)
- pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
} else {
spin_unlock(&inode->i_lock);
nfs_commit_inode(inode, 0);
}
}
+
+void pnfs_error_mark_layout_for_return(struct inode *inode,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_layout_range range = {
+ .iomode = lseg->pls_range.iomode,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ pnfs_mark_layout_for_return(inode, &range);
+}
EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
+static bool
+pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo)
+{
+ return pnfs_layout_is_valid(lo) &&
+ !test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) &&
+ !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
+}
+
+static struct pnfs_layout_segment *
+pnfs_find_first_lseg(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range,
+ enum pnfs_iomode iomode)
+{
+ struct pnfs_layout_segment *lseg;
+
+ list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
+ if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+ continue;
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ continue;
+ if (lseg->pls_range.iomode != iomode && iomode != IOMODE_ANY)
+ continue;
+ if (pnfs_lseg_range_intersecting(&lseg->pls_range, range))
+ return lseg;
+ }
+ return NULL;
+}
+
+/* Find open file states whose mode matches that of the range */
+static bool
+pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range)
+{
+ struct list_head *head;
+ struct nfs_open_context *ctx;
+ fmode_t mode = 0;
+
+ if (!pnfs_layout_can_be_returned(lo) ||
+ !pnfs_find_first_lseg(lo, range, range->iomode))
+ return false;
+
+ head = &NFS_I(lo->plh_inode)->open_files;
+ list_for_each_entry_rcu(ctx, head, list) {
+ if (ctx->state)
+ mode |= ctx->state->state & (FMODE_READ|FMODE_WRITE);
+ }
+
+ switch (range->iomode) {
+ default:
+ break;
+ case IOMODE_READ:
+ mode &= ~FMODE_WRITE;
+ break;
+ case IOMODE_RW:
+ if (pnfs_find_first_lseg(lo, range, IOMODE_READ))
+ mode &= ~FMODE_READ;
+ }
+ return mode == 0;
+}
+
+static int
+pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data)
+{
+ const struct pnfs_layout_range *range = data;
+ struct pnfs_layout_hdr *lo;
+ struct inode *inode;
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
+ if (!pnfs_layout_can_be_returned(lo) ||
+ test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ continue;
+ inode = lo->plh_inode;
+ spin_lock(&inode->i_lock);
+ if (!pnfs_should_return_unused_layout(lo, range)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ spin_unlock(&inode->i_lock);
+ inode = pnfs_grab_inode_layout_hdr(lo);
+ if (!inode)
+ continue;
+ rcu_read_unlock();
+ pnfs_mark_layout_for_return(inode, range);
+ iput(inode);
+ cond_resched();
+ goto restart;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+void
+pnfs_layout_return_unused_byclid(struct nfs_client *clp,
+ enum pnfs_iomode iomode)
+{
+ struct pnfs_layout_range range = {
+ .iomode = iomode,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ nfs_client_for_each_server(clp, pnfs_layout_return_unused_byserver,
+ &range);
+}
+
void
pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
{
@@ -2475,7 +2626,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
* Check for any intersection between the request and the pgio->pg_lseg,
* and if none, put this pgio->pg_lseg away.
*/
-static void
+void
pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) {
@@ -2483,6 +2634,7 @@ pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page
pgio->pg_lseg = NULL;
}
}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range);
void
pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
@@ -3000,10 +3152,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
end_pos = nfsi->layout->plh_lwb;
nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
+ data->cred = get_cred(nfsi->layout->plh_lc_cred);
spin_unlock(&inode->i_lock);
data->args.inode = inode;
- data->cred = get_cred(nfsi->layout->plh_lc_cred);
nfs_fattr_init(&data->fattr);
data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
data->res.fattr = &data->fattr;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 0fafdadc9c8d..8e0ada581b92 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -66,6 +66,7 @@ struct nfs4_pnfs_ds {
struct pnfs_layout_segment {
struct list_head pls_list;
struct list_head pls_lc_list;
+ struct list_head pls_commits;
struct pnfs_layout_range pls_range;
refcount_t pls_refcount;
u32 pls_seq;
@@ -105,6 +106,7 @@ enum {
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */
+ NFS_LAYOUT_HASHED, /* The layout visible */
};
enum layoutdriver_policy_flags {
@@ -148,22 +150,6 @@ struct pnfs_layoutdriver_type {
const struct nfs_pageio_ops *pg_write_ops;
struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
- void (*mark_request_commit) (struct nfs_page *req,
- struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo,
- u32 ds_commit_idx);
- void (*clear_request_commit) (struct nfs_page *req,
- struct nfs_commit_info *cinfo);
- int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
- int max);
- void (*recover_commit_reqs) (struct list_head *list,
- struct nfs_commit_info *cinfo);
- struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
- struct page *page);
- int (*commit_pagelist)(struct inode *inode,
- struct list_head *mds_pages,
- int how,
- struct nfs_commit_info *cinfo);
int (*sync)(struct inode *inode, bool datasync);
@@ -186,6 +172,29 @@ struct pnfs_layoutdriver_type {
int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
};
+struct pnfs_commit_ops {
+ void (*setup_ds_info)(struct pnfs_ds_commit_info *,
+ struct pnfs_layout_segment *);
+ void (*release_ds_info)(struct pnfs_ds_commit_info *,
+ struct inode *inode);
+ int (*commit_pagelist)(struct inode *inode,
+ struct list_head *mds_pages,
+ int how,
+ struct nfs_commit_info *cinfo);
+ void (*mark_request_commit) (struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
+ void (*clear_request_commit) (struct nfs_page *req,
+ struct nfs_commit_info *cinfo);
+ int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
+ int max);
+ void (*recover_commit_reqs) (struct list_head *list,
+ struct nfs_commit_info *cinfo);
+ struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
+ struct page *page);
+};
+
struct pnfs_layout_hdr {
refcount_t plh_refcount;
atomic_t plh_outstanding; /* number of RPCs out */
@@ -203,6 +212,7 @@ struct pnfs_layout_hdr {
loff_t plh_lwb; /* last write byte for layoutcommit */
const struct cred *plh_lc_cred; /* layoutcommit cred */
struct inode *plh_inode;
+ struct rcu_head plh_rcu;
};
struct pnfs_device {
@@ -242,6 +252,7 @@ void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
void unset_pnfs_layoutdriver(struct nfs_server *);
void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio);
+void pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req);
void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
@@ -267,6 +278,7 @@ bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new,
+ const struct cred *cred,
bool update_barrier);
int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
@@ -326,6 +338,9 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
void pnfs_error_mark_layout_for_return(struct inode *inode,
struct pnfs_layout_segment *lseg);
+void pnfs_layout_return_unused_byclid(struct nfs_client *clp,
+ enum pnfs_iomode iomode);
+
/* nfs4_deviceid_flags */
enum {
NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */
@@ -360,6 +375,16 @@ bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
void nfs4_deviceid_purge_client(const struct nfs_client *);
/* pnfs_nfs.c */
+struct pnfs_commit_array *pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags);
+void pnfs_free_commit_array(struct pnfs_commit_array *p);
+struct pnfs_commit_array *pnfs_add_commit_array(struct pnfs_ds_commit_info *,
+ struct pnfs_commit_array *,
+ struct pnfs_layout_segment *);
+
+void pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg);
+void pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo);
+
void pnfs_generic_clear_request_commit(struct nfs_page *req,
struct nfs_commit_info *cinfo);
void pnfs_generic_commit_release(void *calldata);
@@ -367,6 +392,8 @@ void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data);
void pnfs_generic_rw_release(void *data);
void pnfs_generic_recover_commit_reqs(struct list_head *dst,
struct nfs_commit_info *cinfo);
+struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo,
+ struct page *page);
int pnfs_generic_commit_pagelist(struct inode *inode,
struct list_head *mds_pages,
int how,
@@ -438,9 +465,11 @@ static inline int
pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
struct nfs_commit_info *cinfo)
{
- if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0)
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (fl_cinfo == NULL || fl_cinfo->ncommitting == 0)
return PNFS_NOT_ATTEMPTED;
- return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo);
+ return fl_cinfo->ops->commit_pagelist(inode, mds_pages, how, cinfo);
}
static inline struct pnfs_ds_commit_info *
@@ -454,6 +483,28 @@ pnfs_get_ds_info(struct inode *inode)
}
static inline void
+pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+ struct pnfs_ds_commit_info *inode_cinfo = pnfs_get_ds_info(inode);
+ if (inode_cinfo != NULL)
+ fl_cinfo->ops = inode_cinfo->ops;
+}
+
+static inline void
+pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo)
+{
+ INIT_LIST_HEAD(&fl_cinfo->commits);
+ fl_cinfo->ops = NULL;
+}
+
+static inline void
+pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+ if (fl_cinfo->ops != NULL && fl_cinfo->ops->release_ds_info != NULL)
+ fl_cinfo->ops->release_ds_info(fl_cinfo, inode);
+}
+
+static inline void
pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node)
{
set_bit(NFS_DEVICEID_INVALID, &node->flags);
@@ -463,24 +514,22 @@ static inline bool
pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
struct nfs_commit_info *cinfo, u32 ds_commit_idx)
{
- struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
- struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- if (lseg == NULL || ld->mark_request_commit == NULL)
+ if (!lseg || !fl_cinfo->ops->mark_request_commit)
return false;
- ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
+ fl_cinfo->ops->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
return true;
}
static inline bool
pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
{
- struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
- struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- if (ld == NULL || ld->clear_request_commit == NULL)
+ if (!fl_cinfo || !fl_cinfo->ops || !fl_cinfo->ops->clear_request_commit)
return false;
- ld->clear_request_commit(req, cinfo);
+ fl_cinfo->ops->clear_request_commit(req, cinfo);
return true;
}
@@ -488,21 +537,31 @@ static inline int
pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
int max)
{
- if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (!fl_cinfo || fl_cinfo->nwritten == 0)
return 0;
- else
- return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max);
+ return fl_cinfo->ops->scan_commit_lists(cinfo, max);
+}
+
+static inline void
+pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (fl_cinfo && fl_cinfo->nwritten != 0)
+ fl_cinfo->ops->recover_commit_reqs(head, cinfo);
}
static inline struct nfs_page *
pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
struct page *page)
{
- struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- if (ld == NULL || ld->search_commit_reqs == NULL)
+ if (!fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs)
return NULL;
- return ld->search_commit_reqs(cinfo, page);
+ return fl_cinfo->ops->search_commit_reqs(cinfo, page);
}
/* Should the pNFS client commit and return the layout upon a setattr */
@@ -750,6 +809,21 @@ pnfs_get_ds_info(struct inode *inode)
return NULL;
}
+static inline void
+pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+}
+
+static inline void
+pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo)
+{
+}
+
+static inline void
+pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+}
+
static inline bool
pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
struct nfs_commit_info *cinfo, u32 ds_commit_idx)
@@ -770,6 +844,11 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
return 0;
}
+static inline void
+pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
+{
+}
+
static inline struct nfs_page *
pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
struct page *page)
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 8b37e7f8e789..e7ddbce48321 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -59,6 +59,17 @@ void pnfs_generic_commit_release(void *calldata)
}
EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
+static struct pnfs_layout_segment *
+pnfs_free_bucket_lseg(struct pnfs_commit_bucket *bucket)
+{
+ if (list_empty(&bucket->committing) && list_empty(&bucket->written)) {
+ struct pnfs_layout_segment *freeme = bucket->lseg;
+ bucket->lseg = NULL;
+ return freeme;
+ }
+ return NULL;
+}
+
/* The generic layer is about to remove the req from the commit list.
* If this will make the bucket empty, it will need to put the lseg reference.
* Note this must be called holding nfsi->commit_mutex
@@ -78,8 +89,7 @@ pnfs_generic_clear_request_commit(struct nfs_page *req,
bucket = list_first_entry(&req->wb_list,
struct pnfs_commit_bucket,
written);
- freeme = bucket->wlseg;
- bucket->wlseg = NULL;
+ freeme = pnfs_free_bucket_lseg(bucket);
}
out:
nfs_request_remove_commit_list(req, cinfo);
@@ -87,10 +97,154 @@ out:
}
EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
+struct pnfs_commit_array *
+pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags)
+{
+ struct pnfs_commit_array *p;
+ struct pnfs_commit_bucket *b;
+
+ p = kmalloc(struct_size(p, buckets, n), gfp_flags);
+ if (!p)
+ return NULL;
+ p->nbuckets = n;
+ INIT_LIST_HEAD(&p->cinfo_list);
+ INIT_LIST_HEAD(&p->lseg_list);
+ p->lseg = NULL;
+ for (b = &p->buckets[0]; n != 0; b++, n--) {
+ INIT_LIST_HEAD(&b->written);
+ INIT_LIST_HEAD(&b->committing);
+ b->lseg = NULL;
+ b->direct_verf.committed = NFS_INVALID_STABLE_HOW;
+ }
+ return p;
+}
+EXPORT_SYMBOL_GPL(pnfs_alloc_commit_array);
+
+void
+pnfs_free_commit_array(struct pnfs_commit_array *p)
+{
+ kfree_rcu(p, rcu);
+}
+EXPORT_SYMBOL_GPL(pnfs_free_commit_array);
+
+static struct pnfs_commit_array *
+pnfs_find_commit_array_by_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (array->lseg == lseg)
+ return array;
+ }
+ return NULL;
+}
+
+struct pnfs_commit_array *
+pnfs_add_commit_array(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_commit_array *new,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ if (array)
+ return array;
+ new->lseg = lseg;
+ refcount_set(&new->refcount, 1);
+ list_add_rcu(&new->cinfo_list, &fl_cinfo->commits);
+ list_add(&new->lseg_list, &lseg->pls_commits);
+ return new;
+}
+EXPORT_SYMBOL_GPL(pnfs_add_commit_array);
+
+static struct pnfs_commit_array *
+pnfs_lookup_commit_array(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ rcu_read_lock();
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ if (!array) {
+ rcu_read_unlock();
+ fl_cinfo->ops->setup_ds_info(fl_cinfo, lseg);
+ rcu_read_lock();
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ }
+ rcu_read_unlock();
+ return array;
+}
+
+static void
+pnfs_release_commit_array_locked(struct pnfs_commit_array *array)
+{
+ list_del_rcu(&array->cinfo_list);
+ list_del(&array->lseg_list);
+ pnfs_free_commit_array(array);
+}
+
+static void
+pnfs_put_commit_array_locked(struct pnfs_commit_array *array)
+{
+ if (refcount_dec_and_test(&array->refcount))
+ pnfs_release_commit_array_locked(array);
+}
+
+static void
+pnfs_put_commit_array(struct pnfs_commit_array *array, struct inode *inode)
+{
+ if (refcount_dec_and_lock(&array->refcount, &inode->i_lock)) {
+ pnfs_release_commit_array_locked(array);
+ spin_unlock(&inode->i_lock);
+ }
+}
+
+static struct pnfs_commit_array *
+pnfs_get_commit_array(struct pnfs_commit_array *array)
+{
+ if (refcount_inc_not_zero(&array->refcount))
+ return array;
+ return NULL;
+}
+
+static void
+pnfs_remove_and_free_commit_array(struct pnfs_commit_array *array)
+{
+ array->lseg = NULL;
+ list_del_init(&array->lseg_list);
+ pnfs_put_commit_array_locked(array);
+}
+
+void
+pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array, *tmp;
+
+ list_for_each_entry_safe(array, tmp, &lseg->pls_commits, lseg_list)
+ pnfs_remove_and_free_commit_array(array);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_release_lseg);
+
+void
+pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo)
+{
+ struct pnfs_commit_array *array, *tmp;
+
+ list_for_each_entry_safe(array, tmp, &fl_cinfo->commits, cinfo_list)
+ pnfs_remove_and_free_commit_array(array);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_destroy);
+
+/*
+ * Locks the nfs_page requests for commit and moves them to
+ * @bucket->committing.
+ */
static int
-pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
- struct nfs_commit_info *cinfo,
- int max)
+pnfs_bucket_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo,
+ int max)
{
struct list_head *src = &bucket->written;
struct list_head *dst = &bucket->committing;
@@ -101,158 +255,254 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
if (ret) {
cinfo->ds->nwritten -= ret;
cinfo->ds->ncommitting += ret;
- if (bucket->clseg == NULL)
- bucket->clseg = pnfs_get_lseg(bucket->wlseg);
- if (list_empty(src)) {
- pnfs_put_lseg(bucket->wlseg);
- bucket->wlseg = NULL;
- }
}
return ret;
}
+static int pnfs_bucket_scan_array(struct nfs_commit_info *cinfo,
+ struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ int max)
+{
+ unsigned int i;
+ int rv = 0, cnt;
+
+ for (i = 0; i < nbuckets && max != 0; i++) {
+ cnt = pnfs_bucket_scan_ds_commit_list(&buckets[i], cinfo, max);
+ rv += cnt;
+ max -= cnt;
+ }
+ return rv;
+}
+
/* Move reqs from written to committing lists, returning count
* of number moved.
*/
-int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
- int max)
+int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max)
{
- int i, rv = 0, cnt;
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ int rv = 0, cnt;
- lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
- for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
- cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
- cinfo, max);
- max -= cnt;
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ rcu_read_unlock();
+ cnt = pnfs_bucket_scan_array(cinfo, array->buckets,
+ array->nbuckets, max);
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
rv += cnt;
+ max -= cnt;
+ if (!max)
+ break;
}
+ rcu_read_unlock();
return rv;
}
EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists);
-/* Pull everything off the committing lists and dump into @dst. */
-void pnfs_generic_recover_commit_reqs(struct list_head *dst,
- struct nfs_commit_info *cinfo)
+static unsigned int
+pnfs_bucket_recover_commit_reqs(struct list_head *dst,
+ struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ struct nfs_commit_info *cinfo)
{
struct pnfs_commit_bucket *b;
struct pnfs_layout_segment *freeme;
- int nwritten;
- int i;
+ unsigned int nwritten, ret = 0;
+ unsigned int i;
- lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
restart:
- for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
+ for (i = 0, b = buckets; i < nbuckets; i++, b++) {
nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0);
if (!nwritten)
continue;
- cinfo->ds->nwritten -= nwritten;
- if (list_empty(&b->written)) {
- freeme = b->wlseg;
- b->wlseg = NULL;
+ ret += nwritten;
+ freeme = pnfs_free_bucket_lseg(b);
+ if (freeme) {
pnfs_put_lseg(freeme);
goto restart;
}
}
+ return ret;
+}
+
+/* Pull everything off the committing lists and dump into @dst. */
+void pnfs_generic_recover_commit_reqs(struct list_head *dst,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ unsigned int nwritten;
+
+ lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ rcu_read_unlock();
+ nwritten = pnfs_bucket_recover_commit_reqs(dst,
+ array->buckets,
+ array->nbuckets,
+ cinfo);
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
+ fl_cinfo->nwritten -= nwritten;
+ }
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
-static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
+static struct nfs_page *
+pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets, struct page *page)
+{
+ struct nfs_page *req;
+ struct pnfs_commit_bucket *b;
+ unsigned int i;
+
+ /* Linearly search the commit lists for each bucket until a matching
+ * request is found */
+ for (i = 0, b = buckets; i < nbuckets; i++, b++) {
+ list_for_each_entry(req, &b->written, wb_list) {
+ if (req->wb_page == page)
+ return req->wb_head;
+ }
+ list_for_each_entry(req, &b->committing, wb_list) {
+ if (req->wb_page == page)
+ return req->wb_head;
+ }
+ }
+ return NULL;
+}
+
+/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head reqest
+ * for @page
+ * @cinfo - commit info for current inode
+ * @page - page to search for matching head request
+ *
+ * Returns a the head request if one is found, otherwise returns NULL.
+ */
+struct nfs_page *
+pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
{
struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ struct nfs_page *req;
+
+ list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) {
+ req = pnfs_bucket_search_commit_reqs(array->buckets,
+ array->nbuckets, page);
+ if (req)
+ return req;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_search_commit_reqs);
+
+static struct pnfs_layout_segment *
+pnfs_bucket_get_committing(struct list_head *head,
+ struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo)
+{
+ struct list_head *pos;
+
+ list_for_each(pos, &bucket->committing)
+ cinfo->ds->ncommitting--;
+ list_splice_init(&bucket->committing, head);
+ return pnfs_free_bucket_lseg(bucket);
+}
+
+static struct nfs_commit_data *
+pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo)
+{
+ struct nfs_commit_data *data = nfs_commitdata_alloc(false);
+
+ if (!data)
+ return NULL;
+ data->lseg = pnfs_bucket_get_committing(&data->pages, bucket, cinfo);
+ if (!data->lseg)
+ data->lseg = pnfs_get_lseg(bucket->lseg);
+ return data;
+}
+
+static void pnfs_generic_retry_commit(struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ struct nfs_commit_info *cinfo,
+ unsigned int idx)
+{
struct pnfs_commit_bucket *bucket;
struct pnfs_layout_segment *freeme;
- struct list_head *pos;
LIST_HEAD(pages);
- int i;
- mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
- for (i = idx; i < fl_cinfo->nbuckets; i++) {
- bucket = &fl_cinfo->buckets[i];
+ for (bucket = buckets; idx < nbuckets; bucket++, idx++) {
if (list_empty(&bucket->committing))
continue;
- freeme = bucket->clseg;
- bucket->clseg = NULL;
- list_for_each(pos, &bucket->committing)
- cinfo->ds->ncommitting--;
- list_splice_init(&bucket->committing, &pages);
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ freeme = pnfs_bucket_get_committing(&pages, bucket, cinfo);
mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
- nfs_retry_commit(&pages, freeme, cinfo, i);
+ nfs_retry_commit(&pages, freeme, cinfo, idx);
pnfs_put_lseg(freeme);
- mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
}
- mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}
static unsigned int
-pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
- struct list_head *list)
+pnfs_bucket_alloc_ds_commits(struct list_head *list,
+ struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ struct nfs_commit_info *cinfo)
{
- struct pnfs_ds_commit_info *fl_cinfo;
struct pnfs_commit_bucket *bucket;
struct nfs_commit_data *data;
- int i;
+ unsigned int i;
unsigned int nreq = 0;
- fl_cinfo = cinfo->ds;
- bucket = fl_cinfo->buckets;
- for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
+ for (i = 0, bucket = buckets; i < nbuckets; i++, bucket++) {
if (list_empty(&bucket->committing))
continue;
- data = nfs_commitdata_alloc(false);
- if (!data)
- break;
- data->ds_commit_index = i;
- list_add(&data->pages, list);
- nreq++;
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ if (!list_empty(&bucket->committing)) {
+ data = pnfs_bucket_fetch_commitdata(bucket, cinfo);
+ if (!data)
+ goto out_error;
+ data->ds_commit_index = i;
+ list_add_tail(&data->list, list);
+ atomic_inc(&cinfo->mds->rpcs_out);
+ nreq++;
+ }
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}
-
+ return nreq;
+out_error:
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
/* Clean up on error */
- pnfs_generic_retry_commit(cinfo, i);
+ pnfs_generic_retry_commit(buckets, nbuckets, cinfo, i);
return nreq;
}
-static inline
-void pnfs_fetch_commit_bucket_list(struct list_head *pages,
- struct nfs_commit_data *data,
- struct nfs_commit_info *cinfo)
+static unsigned int
+pnfs_alloc_ds_commits_list(struct list_head *list,
+ struct pnfs_ds_commit_info *fl_cinfo,
+ struct nfs_commit_info *cinfo)
{
- struct pnfs_commit_bucket *bucket;
- struct list_head *pos;
-
- bucket = &cinfo->ds->buckets[data->ds_commit_index];
- mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
- list_for_each(pos, &bucket->committing)
- cinfo->ds->ncommitting--;
- list_splice_init(&bucket->committing, pages);
- data->lseg = bucket->clseg;
- bucket->clseg = NULL;
- mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
-
-}
+ struct pnfs_commit_array *array;
+ unsigned int ret = 0;
-/* Helper function for pnfs_generic_commit_pagelist to catch an empty
- * page list. This can happen when two commits race.
- *
- * This must be called instead of nfs_init_commit - call one or the other, but
- * not both!
- */
-static bool
-pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages,
- struct nfs_commit_data *data,
- struct nfs_commit_info *cinfo)
-{
- if (list_empty(pages)) {
- if (atomic_dec_and_test(&cinfo->mds->rpcs_out))
- wake_up_var(&cinfo->mds->rpcs_out);
- /* don't call nfs_commitdata_release - it tries to put
- * the open_context which is not acquired until nfs_init_commit
- * which has not been called on @data */
- WARN_ON_ONCE(data->context);
- nfs_commit_free(data);
- return true;
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ rcu_read_unlock();
+ ret += pnfs_bucket_alloc_ds_commits(list, array->buckets,
+ array->nbuckets, cinfo);
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
}
-
- return false;
+ rcu_read_unlock();
+ return ret;
}
/* This follows nfs_commit_list pretty closely */
@@ -262,6 +512,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
int (*initiate_commit)(struct nfs_commit_data *data,
int how))
{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
struct nfs_commit_data *data, *tmp;
LIST_HEAD(list);
unsigned int nreq = 0;
@@ -269,40 +520,25 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
if (!list_empty(mds_pages)) {
data = nfs_commitdata_alloc(true);
data->ds_commit_index = -1;
- list_add(&data->pages, &list);
+ list_splice_init(mds_pages, &data->pages);
+ list_add_tail(&data->list, &list);
+ atomic_inc(&cinfo->mds->rpcs_out);
nreq++;
}
- nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
-
+ nreq += pnfs_alloc_ds_commits_list(&list, fl_cinfo, cinfo);
if (nreq == 0)
goto out;
- atomic_add(nreq, &cinfo->mds->rpcs_out);
-
- list_for_each_entry_safe(data, tmp, &list, pages) {
- list_del_init(&data->pages);
+ list_for_each_entry_safe(data, tmp, &list, list) {
+ list_del(&data->list);
if (data->ds_commit_index < 0) {
- /* another commit raced with us */
- if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages,
- data, cinfo))
- continue;
-
- nfs_init_commit(data, mds_pages, NULL, cinfo);
+ nfs_init_commit(data, NULL, NULL, cinfo);
nfs_initiate_commit(NFS_CLIENT(inode), data,
NFS_PROTO(data->inode),
data->mds_ops, how, 0);
} else {
- LIST_HEAD(pages);
-
- pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
-
- /* another commit raced with us */
- if (pnfs_generic_commit_cancel_empty_pagelist(&pages,
- data, cinfo))
- continue;
-
- nfs_init_commit(data, &pages, data->lseg, cinfo);
+ nfs_init_commit(data, NULL, data->lseg, cinfo);
initiate_commit(data, how);
}
}
@@ -930,32 +1166,33 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
u32 ds_commit_idx)
{
struct list_head *list;
- struct pnfs_commit_bucket *buckets;
+ struct pnfs_commit_array *array;
+ struct pnfs_commit_bucket *bucket;
mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
- buckets = cinfo->ds->buckets;
- list = &buckets[ds_commit_idx].written;
- if (list_empty(list)) {
- if (!pnfs_is_valid_lseg(lseg)) {
- mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
- cinfo->completion_ops->resched_write(cinfo, req);
- return;
- }
- /* Non-empty buckets hold a reference on the lseg. That ref
- * is normally transferred to the COMMIT call and released
- * there. It could also be released if the last req is pulled
- * off due to a rewrite, in which case it will be done in
- * pnfs_common_clear_request_commit
- */
- WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
- buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
- }
+ array = pnfs_lookup_commit_array(cinfo->ds, lseg);
+ if (!array || !pnfs_is_valid_lseg(lseg))
+ goto out_resched;
+ bucket = &array->buckets[ds_commit_idx];
+ list = &bucket->written;
+ /* Non-empty buckets hold a reference on the lseg. That ref
+ * is normally transferred to the COMMIT call and released
+ * there. It could also be released if the last req is pulled
+ * off due to a rewrite, in which case it will be done in
+ * pnfs_common_clear_request_commit
+ */
+ if (!bucket->lseg)
+ bucket->lseg = pnfs_get_lseg(lseg);
set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
cinfo->ds->nwritten++;
nfs_request_add_commit_list_locked(req, list, cinfo);
mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
nfs_mark_page_unstable(req->wb_page, cinfo);
+ return;
+out_resched:
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+ cinfo->completion_ops->resched_write(cinfo, req);
}
EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 34bb9add2302..13b22e898116 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -250,7 +250,7 @@ static int nfs_readpage_done(struct rpc_task *task,
trace_nfs_readpage_done(task, hdr);
if (task->tk_status == -ESTALE) {
- set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+ nfs_set_inode_stale(inode);
nfs_mark_for_revalidate(inode);
}
return 0;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index dada09b391c6..59ef3b13ccca 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -176,6 +176,41 @@ void nfs_sb_deactive(struct super_block *sb)
}
EXPORT_SYMBOL_GPL(nfs_sb_deactive);
+static int __nfs_list_for_each_server(struct list_head *head,
+ int (*fn)(struct nfs_server *, void *),
+ void *data)
+{
+ struct nfs_server *server, *last = NULL;
+ int ret = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, head, client_link) {
+ if (!nfs_sb_active(server->super))
+ continue;
+ rcu_read_unlock();
+ if (last)
+ nfs_sb_deactive(last->super);
+ last = server;
+ ret = fn(server, data);
+ if (ret)
+ goto out;
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+out:
+ if (last)
+ nfs_sb_deactive(last->super);
+ return ret;
+}
+
+int nfs_client_for_each_server(struct nfs_client *clp,
+ int (*fn)(struct nfs_server *, void *),
+ void *data)
+{
+ return __nfs_list_for_each_server(&clp->cl_superblocks, fn, data);
+}
+EXPORT_SYMBOL_GPL(nfs_client_for_each_server);
+
/*
* Deliver file system statistics to userspace
*/
@@ -1179,7 +1214,6 @@ int nfs_get_tree_common(struct fs_context *fc)
struct super_block *s;
int (*compare_super)(struct super_block *, struct fs_context *) = nfs_compare_super;
struct nfs_server *server = ctx->server;
- unsigned long kflags = 0, kflags_out = 0;
int error;
ctx->server = NULL;
@@ -1239,26 +1273,6 @@ int nfs_get_tree_common(struct fs_context *fc)
goto error_splat_super;
}
- if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
- kflags |= SECURITY_LSM_NATIVE_LABELS;
- if (ctx->clone_data.sb) {
- if (d_inode(fc->root)->i_fop != &nfs_dir_operations) {
- error = -ESTALE;
- goto error_splat_root;
- }
- /* clone any lsm security options from the parent to the new sb */
- error = security_sb_clone_mnt_opts(ctx->clone_data.sb, s, kflags,
- &kflags_out);
- } else {
- error = security_sb_set_mnt_opts(s, fc->security,
- kflags, &kflags_out);
- }
- if (error)
- goto error_splat_root;
- if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
- !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
- NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
-
s->s_flags |= SB_ACTIVE;
error = 0;
@@ -1268,10 +1282,6 @@ out:
out_err_nosb:
nfs_free_server(server);
goto out;
-
-error_splat_root:
- dput(fc->root);
- fc->root = NULL;
error_splat_super:
deactivate_locked_super(s);
goto out;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 0effeee28352..b27ebdccef70 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -98,7 +98,7 @@ static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data)
.callback_ops = &nfs_unlink_ops,
.callback_data = data,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
struct rpc_task *task;
struct inode *dir = d_inode(data->dentry->d_parent);
@@ -341,7 +341,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
.callback_ops = &nfs_rename_ops,
.workqueue = nfsiod_workqueue,
.rpc_client = NFS_CLIENT(old_dir),
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
data = kzalloc(sizeof(*data), GFP_KERNEL);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c478b772cc49..df4b87c30ac9 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -149,6 +149,31 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
kref_put(&ioc->refcount, nfs_io_completion_release);
}
+static void
+nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode)
+{
+ if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) {
+ kref_get(&req->wb_kref);
+ atomic_long_inc(&NFS_I(inode)->nrequests);
+ }
+}
+
+static int
+nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
+{
+ int ret;
+
+ if (!test_bit(PG_REMOVE, &req->wb_flags))
+ return 0;
+ ret = nfs_page_group_lock(req);
+ if (ret)
+ return ret;
+ if (test_and_clear_bit(PG_REMOVE, &req->wb_flags))
+ nfs_page_set_inode_ref(req, inode);
+ nfs_page_group_unlock(req);
+ return 0;
+}
+
static struct nfs_page *
nfs_page_private_request(struct page *page)
{
@@ -218,6 +243,36 @@ static struct nfs_page *nfs_page_find_head_request(struct page *page)
return req;
}
+static struct nfs_page *nfs_find_and_lock_page_request(struct page *page)
+{
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_page *req, *head;
+ int ret;
+
+ for (;;) {
+ req = nfs_page_find_head_request(page);
+ if (!req)
+ return req;
+ head = nfs_page_group_lock_head(req);
+ if (head != req)
+ nfs_release_request(req);
+ if (IS_ERR(head))
+ return head;
+ ret = nfs_cancel_remove_inode(head, inode);
+ if (ret < 0) {
+ nfs_unlock_and_release_request(head);
+ return ERR_PTR(ret);
+ }
+ /* Ensure that nobody removed the request before we locked it */
+ if (head == nfs_page_private_request(page))
+ break;
+ if (PageSwapCache(page))
+ break;
+ nfs_unlock_and_release_request(head);
+ }
+ return head;
+}
+
/* Adjust the file length if we're writing beyond the end */
static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
{
@@ -380,34 +435,6 @@ static void nfs_end_page_writeback(struct nfs_page *req)
}
/*
- * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req
- *
- * this is a helper function for nfs_lock_and_join_requests
- *
- * @inode - inode associated with request page group, must be holding inode lock
- * @head - head request of page group, must be holding head lock
- * @req - request that couldn't lock and needs to wait on the req bit lock
- *
- * NOTE: this must be called holding page_group bit lock
- * which will be released before returning.
- *
- * returns 0 on success, < 0 on error.
- */
-static void
-nfs_unroll_locks(struct inode *inode, struct nfs_page *head,
- struct nfs_page *req)
-{
- struct nfs_page *tmp;
-
- /* relinquish all the locks successfully grabbed this run */
- for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) {
- if (!kref_read(&tmp->wb_kref))
- continue;
- nfs_unlock_and_release_request(tmp);
- }
-}
-
-/*
* nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests
*
* @destroy_list - request list (using wb_this_page) terminated by @old_head
@@ -428,22 +455,29 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
destroy_list = (subreq->wb_this_page == old_head) ?
NULL : subreq->wb_this_page;
+ /* Note: lock subreq in order to change subreq->wb_head */
+ nfs_page_set_headlock(subreq);
WARN_ON_ONCE(old_head != subreq->wb_head);
/* make sure old group is not used */
subreq->wb_this_page = subreq;
+ subreq->wb_head = subreq;
clear_bit(PG_REMOVE, &subreq->wb_flags);
/* Note: races with nfs_page_group_destroy() */
if (!kref_read(&subreq->wb_kref)) {
/* Check if we raced with nfs_page_group_destroy() */
- if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags))
+ if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) {
+ nfs_page_clear_headlock(subreq);
nfs_free_request(subreq);
+ } else
+ nfs_page_clear_headlock(subreq);
continue;
}
+ nfs_page_clear_headlock(subreq);
- subreq->wb_head = subreq;
+ nfs_release_request(old_head);
if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) {
nfs_release_request(subreq);
@@ -457,105 +491,43 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
}
/*
- * nfs_lock_and_join_requests - join all subreqs to the head req and return
- * a locked reference, cancelling any pending
- * operations for this page.
- *
- * @page - the page used to lookup the "page group" of nfs_page structures
+ * nfs_join_page_group - destroy subrequests of the head req
+ * @head: the page used to lookup the "page group" of nfs_page structures
+ * @inode: Inode to which the request belongs.
*
* This function joins all sub requests to the head request by first
* locking all requests in the group, cancelling any pending operations
* and finally updating the head request to cover the whole range covered by
* the (former) group. All subrequests are removed from any write or commit
* lists, unlinked from the group and destroyed.
- *
- * Returns a locked, referenced pointer to the head request - which after
- * this call is guaranteed to be the only request associated with the page.
- * Returns NULL if no requests are found for @page, or a ERR_PTR if an
- * error was encountered.
*/
-static struct nfs_page *
-nfs_lock_and_join_requests(struct page *page)
+void
+nfs_join_page_group(struct nfs_page *head, struct inode *inode)
{
- struct inode *inode = page_file_mapping(page)->host;
- struct nfs_page *head, *subreq;
+ struct nfs_page *subreq;
struct nfs_page *destroy_list = NULL;
- unsigned int total_bytes;
- int ret;
+ unsigned int pgbase, off, bytes;
-try_again:
- /*
- * A reference is taken only on the head request which acts as a
- * reference to the whole page group - the group will not be destroyed
- * until the head reference is released.
- */
- head = nfs_page_find_head_request(page);
- if (!head)
- return NULL;
-
- /* lock the page head first in order to avoid an ABBA inefficiency */
- if (!nfs_lock_request(head)) {
- ret = nfs_wait_on_request(head);
- nfs_release_request(head);
- if (ret < 0)
- return ERR_PTR(ret);
- goto try_again;
- }
-
- /* Ensure that nobody removed the request before we locked it */
- if (head != nfs_page_private_request(page) && !PageSwapCache(page)) {
- nfs_unlock_and_release_request(head);
- goto try_again;
- }
-
- ret = nfs_page_group_lock(head);
- if (ret < 0)
- goto release_request;
-
- /* lock each request in the page group */
- total_bytes = head->wb_bytes;
+ pgbase = head->wb_pgbase;
+ bytes = head->wb_bytes;
+ off = head->wb_offset;
for (subreq = head->wb_this_page; subreq != head;
subreq = subreq->wb_this_page) {
-
- if (!kref_get_unless_zero(&subreq->wb_kref)) {
- if (subreq->wb_offset == head->wb_offset + total_bytes)
- total_bytes += subreq->wb_bytes;
- continue;
- }
-
- while (!nfs_lock_request(subreq)) {
- /*
- * Unlock page to allow nfs_page_group_sync_on_bit()
- * to succeed
- */
- nfs_page_group_unlock(head);
- ret = nfs_wait_on_request(subreq);
- if (!ret)
- ret = nfs_page_group_lock(head);
- if (ret < 0) {
- nfs_unroll_locks(inode, head, subreq);
- nfs_release_request(subreq);
- goto release_request;
- }
- }
- /*
- * Subrequests are always contiguous, non overlapping
- * and in order - but may be repeated (mirrored writes).
- */
- if (subreq->wb_offset == (head->wb_offset + total_bytes)) {
- /* keep track of how many bytes this group covers */
- total_bytes += subreq->wb_bytes;
- } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
- ((subreq->wb_offset + subreq->wb_bytes) >
- (head->wb_offset + total_bytes)))) {
- nfs_page_group_unlock(head);
- nfs_unroll_locks(inode, head, subreq);
- nfs_unlock_and_release_request(subreq);
- ret = -EIO;
- goto release_request;
+ /* Subrequests should always form a contiguous range */
+ if (pgbase > subreq->wb_pgbase) {
+ off -= pgbase - subreq->wb_pgbase;
+ bytes += pgbase - subreq->wb_pgbase;
+ pgbase = subreq->wb_pgbase;
}
+ bytes = max(subreq->wb_pgbase + subreq->wb_bytes
+ - pgbase, bytes);
}
+ /* Set the head request's range to cover the former page group */
+ head->wb_pgbase = pgbase;
+ head->wb_bytes = bytes;
+ head->wb_offset = off;
+
/* Now that all requests are locked, make sure they aren't on any list.
* Commit list removal accounting is done after locks are dropped */
subreq = head;
@@ -569,36 +541,52 @@ try_again:
/* destroy list will be terminated by head */
destroy_list = head->wb_this_page;
head->wb_this_page = head;
-
- /* change head request to cover whole range that
- * the former page group covered */
- head->wb_bytes = total_bytes;
}
- /* Postpone destruction of this request */
- if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) {
- set_bit(PG_INODE_REF, &head->wb_flags);
- kref_get(&head->wb_kref);
- atomic_long_inc(&NFS_I(inode)->nrequests);
- }
+ nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
+}
- nfs_page_group_unlock(head);
+/*
+ * nfs_lock_and_join_requests - join all subreqs to the head req
+ * @page: the page used to lookup the "page group" of nfs_page structures
+ *
+ * This function joins all sub requests to the head request by first
+ * locking all requests in the group, cancelling any pending operations
+ * and finally updating the head request to cover the whole range covered by
+ * the (former) group. All subrequests are removed from any write or commit
+ * lists, unlinked from the group and destroyed.
+ *
+ * Returns a locked, referenced pointer to the head request - which after
+ * this call is guaranteed to be the only request associated with the page.
+ * Returns NULL if no requests are found for @page, or a ERR_PTR if an
+ * error was encountered.
+ */
+static struct nfs_page *
+nfs_lock_and_join_requests(struct page *page)
+{
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_page *head;
+ int ret;
- nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
+ /*
+ * A reference is taken only on the head request which acts as a
+ * reference to the whole page group - the group will not be destroyed
+ * until the head reference is released.
+ */
+ head = nfs_find_and_lock_page_request(page);
+ if (IS_ERR_OR_NULL(head))
+ return head;
- /* Did we lose a race with nfs_inode_remove_request()? */
- if (!(PagePrivate(page) || PageSwapCache(page))) {
+ /* lock each request in the page group */
+ ret = nfs_page_group_lock_subrequests(head);
+ if (ret < 0) {
nfs_unlock_and_release_request(head);
- return NULL;
+ return ERR_PTR(ret);
}
- /* still holds ref on head from nfs_page_find_head_request
- * and still has lock on head from lock loop */
- return head;
+ nfs_join_page_group(head, inode);
-release_request:
- nfs_unlock_and_release_request(head);
- return ERR_PTR(ret);
+ return head;
}
static void nfs_write_error(struct nfs_page *req, int error)
@@ -1707,7 +1695,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
.callback_ops = call_ops,
.callback_data = data,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC | flags,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags,
.priority = priority,
};
/* Set up the initial task struct. */
@@ -1746,14 +1734,19 @@ void nfs_init_commit(struct nfs_commit_data *data,
struct pnfs_layout_segment *lseg,
struct nfs_commit_info *cinfo)
{
- struct nfs_page *first = nfs_list_entry(head->next);
- struct nfs_open_context *ctx = nfs_req_openctx(first);
- struct inode *inode = d_inode(ctx->dentry);
+ struct nfs_page *first;
+ struct nfs_open_context *ctx;
+ struct inode *inode;
/* Set up the RPC argument and reply structs
* NB: take care not to mess about with data->commit et al. */
- list_splice_init(head, &data->pages);
+ if (head)
+ list_splice_init(head, &data->pages);
+
+ first = nfs_list_entry(data->pages.next);
+ ctx = nfs_req_openctx(first);
+ inode = d_inode(ctx->dentry);
data->inode = inode;
data->cred = ctx->cred;
@@ -1869,8 +1862,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* Okay, COMMIT succeeded, apparently. Check the verifier
* returned by the server against all stored verfs. */
- if (verf->committed > NFS_UNSTABLE &&
- !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier)) {
+ if (nfs_write_match_verf(verf, req)) {
/* We have a match */
if (req->wb_page)
nfs_inode_remove_request(req);
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index f368f3215f88..99d2cae91bd6 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -136,7 +136,7 @@ config NFSD_FLEXFILELAYOUT
config NFSD_V4_2_INTER_SSC
bool "NFSv4.2 inter server to server COPY"
- depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2
+ depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2 && NFS_FS=y
help
This option enables support for NFSv4.2 inter server to
server copy where the destination server calls the NFSv4.2
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 15422c951fd1..cb777fe82988 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -23,6 +23,7 @@
#include "netns.h"
#include "pnfs.h"
#include "filecache.h"
+#include "trace.h"
#define NFSDDBG_FACILITY NFSDDBG_EXPORT
@@ -50,6 +51,11 @@ static void expkey_put(struct kref *ref)
kfree_rcu(key, ek_rcu);
}
+static int expkey_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall(cd, h);
+}
+
static void expkey_request(struct cache_detail *cd,
struct cache_head *h,
char **bpp, int *blen)
@@ -140,7 +146,9 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
if (len == 0) {
set_bit(CACHE_NEGATIVE, &key.h.flags);
ek = svc_expkey_update(cd, &key, ek);
- if (!ek)
+ if (ek)
+ trace_nfsd_expkey_update(ek, NULL);
+ else
err = -ENOMEM;
} else {
err = kern_path(buf, 0, &key.ek_path);
@@ -150,7 +158,9 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
dprintk("Found the path %s\n", buf);
ek = svc_expkey_update(cd, &key, ek);
- if (!ek)
+ if (ek)
+ trace_nfsd_expkey_update(ek, buf);
+ else
err = -ENOMEM;
path_put(&key.ek_path);
}
@@ -249,6 +259,7 @@ static const struct cache_detail svc_expkey_cache_template = {
.hash_size = EXPKEY_HASHMAX,
.name = "nfsd.fh",
.cache_put = expkey_put,
+ .cache_upcall = expkey_upcall,
.cache_request = expkey_request,
.cache_parse = expkey_parse,
.cache_show = expkey_show,
@@ -330,6 +341,11 @@ static void svc_export_put(struct kref *ref)
kfree_rcu(exp, ex_rcu);
}
+static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall(cd, h);
+}
+
static void svc_export_request(struct cache_detail *cd,
struct cache_head *h,
char **bpp, int *blen)
@@ -643,15 +659,17 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
}
expp = svc_export_lookup(&exp);
- if (expp)
- expp = svc_export_update(&exp, expp);
- else
- err = -ENOMEM;
- cache_flush();
- if (expp == NULL)
+ if (!expp) {
err = -ENOMEM;
- else
+ goto out4;
+ }
+ expp = svc_export_update(&exp, expp);
+ if (expp) {
+ trace_nfsd_export_update(expp);
+ cache_flush();
exp_put(expp);
+ } else
+ err = -ENOMEM;
out4:
nfsd4_fslocs_free(&exp.ex_fslocs);
kfree(exp.ex_uuid);
@@ -767,6 +785,7 @@ static const struct cache_detail svc_export_cache_template = {
.hash_size = EXPORT_HASHMAX,
.name = "nfsd.export",
.cache_put = svc_export_put,
+ .cache_upcall = svc_export_upcall,
.cache_request = svc_export_request,
.cache_parse = svc_export_parse,
.cache_show = svc_export_show,
@@ -832,8 +851,10 @@ exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type,
if (ek == NULL)
return ERR_PTR(-ENOMEM);
err = cache_check(cd, &ek->h, reqp);
- if (err)
+ if (err) {
+ trace_nfsd_exp_find_key(&key, err);
return ERR_PTR(err);
+ }
return ek;
}
@@ -855,8 +876,10 @@ exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp,
if (exp == NULL)
return ERR_PTR(-ENOMEM);
err = cache_check(cd, &exp->h, reqp);
- if (err)
+ if (err) {
+ trace_nfsd_exp_get_by_name(&key, err);
return ERR_PTR(err);
+ }
return exp;
}
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 22e77ede9f14..82198d747c4c 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -890,7 +890,7 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
- nf_node) {
+ nf_node, lockdep_is_held(&nfsd_file_hashtbl[hashval].nfb_lock)) {
if ((need & nf->nf_may) != need)
continue;
if (nf->nf_inode != inode)
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 2baf32311e00..09aa545825bd 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -172,6 +172,8 @@ struct nfsd_net {
unsigned int longest_chain_cachesize;
struct shrinker nfsd_reply_cache_shrinker;
+ /* utsname taken from the the process that starts the server */
+ char nfsd_name[UNX_MAXNODENAME+1];
};
/* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index d1f285245af8..9460be8a8321 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -122,6 +122,12 @@ idtoname_hash(struct ent *ent)
return hash;
}
+static int
+idtoname_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall_timeout(cd, h);
+}
+
static void
idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
int *blen)
@@ -184,6 +190,7 @@ static const struct cache_detail idtoname_cache_template = {
.hash_size = ENT_HASHMAX,
.name = "nfs4.idtoname",
.cache_put = ent_put,
+ .cache_upcall = idtoname_upcall,
.cache_request = idtoname_request,
.cache_parse = idtoname_parse,
.cache_show = idtoname_show,
@@ -295,6 +302,12 @@ nametoid_hash(struct ent *ent)
return hash_str(ent->name, ENT_HASHBITS);
}
+static int
+nametoid_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall_timeout(cd, h);
+}
+
static void
nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
int *blen)
@@ -347,6 +360,7 @@ static const struct cache_detail nametoid_cache_template = {
.hash_size = ENT_HASHMAX,
.name = "nfs4.nametoid",
.cache_put = ent_put,
+ .cache_upcall = nametoid_upcall,
.cache_request = nametoid_request,
.cache_parse = nametoid_parse,
.cache_show = nametoid_show,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 65cfe9ab47be..e32ecedece0f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -494,6 +494,8 @@ find_any_file(struct nfs4_file *f)
{
struct nfsd_file *ret;
+ if (!f)
+ return NULL;
spin_lock(&f->fi_lock);
ret = __nfs4_get_fd(f, O_RDWR);
if (!ret) {
@@ -1309,6 +1311,12 @@ static void nfs4_put_stateowner(struct nfs4_stateowner *sop)
nfs4_free_stateowner(sop);
}
+static bool
+nfs4_ol_stateid_unhashed(const struct nfs4_ol_stateid *stp)
+{
+ return list_empty(&stp->st_perfile);
+}
+
static bool unhash_ol_stateid(struct nfs4_ol_stateid *stp)
{
struct nfs4_file *fp = stp->st_stid.sc_file;
@@ -1379,9 +1387,11 @@ static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp)
{
lockdep_assert_held(&stp->st_stid.sc_client->cl_lock);
+ if (!unhash_ol_stateid(stp))
+ return false;
list_del_init(&stp->st_locks);
nfs4_unhash_stid(&stp->st_stid);
- return unhash_ol_stateid(stp);
+ return true;
}
static void release_lock_stateid(struct nfs4_ol_stateid *stp)
@@ -1446,13 +1456,12 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
static bool unhash_open_stateid(struct nfs4_ol_stateid *stp,
struct list_head *reaplist)
{
- bool unhashed;
-
lockdep_assert_held(&stp->st_stid.sc_client->cl_lock);
- unhashed = unhash_ol_stateid(stp);
+ if (!unhash_ol_stateid(stp))
+ return false;
release_open_stateid_locks(stp, reaplist);
- return unhashed;
+ return true;
}
static void release_open_stateid(struct nfs4_ol_stateid *stp)
@@ -2636,7 +2645,7 @@ static const struct file_operations client_ctl_fops = {
static const struct tree_descr client_files[] = {
[0] = {"info", &client_info_fops, S_IRUSR},
[1] = {"states", &client_states_fops, S_IRUSR},
- [2] = {"ctl", &client_ctl_fops, S_IRUSR|S_IWUSR},
+ [2] = {"ctl", &client_ctl_fops, S_IWUSR},
[3] = {""},
};
@@ -4343,7 +4352,8 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
{
struct nfs4_file *fp;
- hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) {
+ hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
+ lockdep_is_held(&state_lock)) {
if (fh_match(&fp->fi_fhandle, fh)) {
if (refcount_inc_not_zero(&fp->fi_ref))
return fp;
@@ -5521,15 +5531,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) ||
CLOSE_STATEID(stateid))
return status;
- /* Client debugging aid. */
- if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) {
- char addr_str[INET6_ADDRSTRLEN];
- rpc_ntop((struct sockaddr *)&cl->cl_addr, addr_str,
- sizeof(addr_str));
- pr_warn_ratelimited("NFSD: client %s testing state ID "
- "with incorrect client ID\n", addr_str);
+ if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid))
return status;
- }
spin_lock(&cl->cl_lock);
s = find_stateid_locked(cl, stateid);
if (!s)
@@ -6393,21 +6396,21 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp,
}
static struct nfs4_ol_stateid *
-find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp)
+find_lock_stateid(const struct nfs4_lockowner *lo,
+ const struct nfs4_ol_stateid *ost)
{
struct nfs4_ol_stateid *lst;
- struct nfs4_client *clp = lo->lo_owner.so_client;
- lockdep_assert_held(&clp->cl_lock);
+ lockdep_assert_held(&ost->st_stid.sc_client->cl_lock);
- list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) {
- if (lst->st_stid.sc_type != NFS4_LOCK_STID)
- continue;
- if (lst->st_stid.sc_file == fp) {
- refcount_inc(&lst->st_stid.sc_count);
- return lst;
+ /* If ost is not hashed, ost->st_locks will not be valid */
+ if (!nfs4_ol_stateid_unhashed(ost))
+ list_for_each_entry(lst, &ost->st_locks, st_locks) {
+ if (lst->st_stateowner == &lo->lo_owner) {
+ refcount_inc(&lst->st_stid.sc_count);
+ return lst;
+ }
}
- }
return NULL;
}
@@ -6423,11 +6426,11 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX);
retry:
spin_lock(&clp->cl_lock);
- spin_lock(&fp->fi_lock);
- retstp = find_lock_stateid(lo, fp);
+ if (nfs4_ol_stateid_unhashed(open_stp))
+ goto out_close;
+ retstp = find_lock_stateid(lo, open_stp);
if (retstp)
- goto out_unlock;
-
+ goto out_found;
refcount_inc(&stp->st_stid.sc_count);
stp->st_stid.sc_type = NFS4_LOCK_STID;
stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner);
@@ -6436,22 +6439,26 @@ retry:
stp->st_access_bmap = 0;
stp->st_deny_bmap = open_stp->st_deny_bmap;
stp->st_openstp = open_stp;
+ spin_lock(&fp->fi_lock);
list_add(&stp->st_locks, &open_stp->st_locks);
list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
list_add(&stp->st_perfile, &fp->fi_stateids);
-out_unlock:
spin_unlock(&fp->fi_lock);
spin_unlock(&clp->cl_lock);
- if (retstp) {
- if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) {
- nfs4_put_stid(&retstp->st_stid);
- goto retry;
- }
- /* To keep mutex tracking happy */
- mutex_unlock(&stp->st_mutex);
- stp = retstp;
- }
return stp;
+out_found:
+ spin_unlock(&clp->cl_lock);
+ if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) {
+ nfs4_put_stid(&retstp->st_stid);
+ goto retry;
+ }
+ /* To keep mutex tracking happy */
+ mutex_unlock(&stp->st_mutex);
+ return retstp;
+out_close:
+ spin_unlock(&clp->cl_lock);
+ mutex_unlock(&stp->st_mutex);
+ return NULL;
}
static struct nfs4_ol_stateid *
@@ -6466,7 +6473,7 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi,
*new = false;
spin_lock(&clp->cl_lock);
- lst = find_lock_stateid(lo, fi);
+ lst = find_lock_stateid(lo, ost);
spin_unlock(&clp->cl_lock);
if (lst != NULL) {
if (nfsd4_lock_ol_stateid(lst) == nfs_ok)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9761512674a0..996ac01ee977 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3591,23 +3591,22 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
__be32 nfserr;
__be32 tmp;
__be32 *p;
- u32 zzz = 0;
int pad;
+ /*
+ * svcrdma requires every READ payload to start somewhere
+ * in xdr->pages.
+ */
+ if (xdr->iov == xdr->buf->head) {
+ xdr->iov = NULL;
+ xdr->end = xdr->p;
+ }
+
len = maxcount;
v = 0;
-
- thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p));
- p = xdr_reserve_space(xdr, (thislen+3)&~3);
- WARN_ON_ONCE(!p);
- resp->rqstp->rq_vec[v].iov_base = p;
- resp->rqstp->rq_vec[v].iov_len = thislen;
- v++;
- len -= thislen;
-
while (len) {
thislen = min_t(long, len, PAGE_SIZE);
- p = xdr_reserve_space(xdr, (thislen+3)&~3);
+ p = xdr_reserve_space(xdr, thislen);
WARN_ON_ONCE(!p);
resp->rqstp->rq_vec[v].iov_base = p;
resp->rqstp->rq_vec[v].iov_len = thislen;
@@ -3616,23 +3615,25 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
}
read->rd_vlen = v;
- len = maxcount;
nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
resp->rqstp->rq_vec, read->rd_vlen, &maxcount,
&eof);
read->rd_length = maxcount;
if (nfserr)
return nfserr;
- xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
+ if (svc_encode_read_payload(resp->rqstp, starting_len + 8, maxcount))
+ return nfserr_io;
+ xdr_truncate_encode(xdr, starting_len + 8 + xdr_align_size(maxcount));
tmp = htonl(eof);
write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
tmp = htonl(maxcount);
write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
+ tmp = xdr_zero;
pad = (maxcount&3) ? 4 - (maxcount&3) : 0;
write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount,
- &zzz, pad);
+ &tmp, pad);
return 0;
}
@@ -4005,11 +4006,12 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
int major_id_sz;
int server_scope_sz;
uint64_t minor_id = 0;
+ struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id);
- major_id = utsname()->nodename;
- major_id_sz = strlen(major_id);
- server_scope = utsname()->nodename;
- server_scope_sz = strlen(server_scope);
+ major_id = nn->nfsd_name;
+ major_id_sz = strlen(nn->nfsd_name);
+ server_scope = nn->nfsd_name;
+ server_scope_sz = strlen(nn->nfsd_name);
p = xdr_reserve_space(xdr,
8 /* eir_clientid */ +
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index e109a1007704..3bb2db947d29 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1333,6 +1333,7 @@ void nfsd_client_rmdir(struct dentry *dentry)
dget(dentry);
ret = simple_rmdir(dir, dentry);
WARN_ON_ONCE(ret);
+ fsnotify_rmdir(dir, dentry);
d_delete(dentry);
inode_unlock(dir);
}
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index b319080288c3..37bc8f5f4514 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -14,6 +14,7 @@
#include "nfsd.h"
#include "vfs.h"
#include "auth.h"
+#include "trace.h"
#define NFSDDBG_FACILITY NFSDDBG_FH
@@ -209,11 +210,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
}
error = nfserr_stale;
- if (PTR_ERR(exp) == -ENOENT)
- return error;
+ if (IS_ERR(exp)) {
+ trace_nfsd_set_fh_dentry_badexport(rqstp, fhp, PTR_ERR(exp));
+
+ if (PTR_ERR(exp) == -ENOENT)
+ return error;
- if (IS_ERR(exp))
return nfserrno(PTR_ERR(exp));
+ }
if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) {
/* Elevate privileges so that the lack of 'r' or 'x'
@@ -267,6 +271,9 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
dentry = exportfs_decode_fh(exp->ex_path.mnt, fid,
data_left, fileid_type,
nfsd_acceptable, exp);
+ if (IS_ERR_OR_NULL(dentry))
+ trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp,
+ dentry ? PTR_ERR(dentry) : -ESTALE);
}
if (dentry == NULL)
goto out;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 3b77b904212d..ca9fd348548b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -749,6 +749,9 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
if (nrservs == 0 && nn->nfsd_serv == NULL)
goto out;
+ strlcpy(nn->nfsd_name, utsname()->nodename,
+ sizeof(nn->nfsd_name));
+
error = nfsd_create_serv(net);
if (error)
goto out;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 06dd0d337049..78c574251c60 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -9,6 +9,7 @@
#define _NFSD_TRACE_H
#include <linux/tracepoint.h>
+#include "export.h"
#include "nfsfh.h"
TRACE_EVENT(nfsd_compound,
@@ -50,6 +51,127 @@ TRACE_EVENT(nfsd_compound_status,
__get_str(name), __entry->status)
)
+DECLARE_EVENT_CLASS(nfsd_fh_err_class,
+ TP_PROTO(struct svc_rqst *rqstp,
+ struct svc_fh *fhp,
+ int status),
+ TP_ARGS(rqstp, fhp, status),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(u32, fh_hash)
+ __field(int, status)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->status = status;
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x status=%d",
+ __entry->xid, __entry->fh_hash,
+ __entry->status)
+)
+
+#define DEFINE_NFSD_FH_ERR_EVENT(name) \
+DEFINE_EVENT(nfsd_fh_err_class, nfsd_##name, \
+ TP_PROTO(struct svc_rqst *rqstp, \
+ struct svc_fh *fhp, \
+ int status), \
+ TP_ARGS(rqstp, fhp, status))
+
+DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badexport);
+DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badhandle);
+
+TRACE_EVENT(nfsd_exp_find_key,
+ TP_PROTO(const struct svc_expkey *key,
+ int status),
+ TP_ARGS(key, status),
+ TP_STRUCT__entry(
+ __field(int, fsidtype)
+ __array(u32, fsid, 6)
+ __string(auth_domain, key->ek_client->name)
+ __field(int, status)
+ ),
+ TP_fast_assign(
+ __entry->fsidtype = key->ek_fsidtype;
+ memcpy(__entry->fsid, key->ek_fsid, 4*6);
+ __assign_str(auth_domain, key->ek_client->name);
+ __entry->status = status;
+ ),
+ TP_printk("fsid=%x::%s domain=%s status=%d",
+ __entry->fsidtype,
+ __print_array(__entry->fsid, 6, 4),
+ __get_str(auth_domain),
+ __entry->status
+ )
+);
+
+TRACE_EVENT(nfsd_expkey_update,
+ TP_PROTO(const struct svc_expkey *key, const char *exp_path),
+ TP_ARGS(key, exp_path),
+ TP_STRUCT__entry(
+ __field(int, fsidtype)
+ __array(u32, fsid, 6)
+ __string(auth_domain, key->ek_client->name)
+ __string(path, exp_path)
+ __field(bool, cache)
+ ),
+ TP_fast_assign(
+ __entry->fsidtype = key->ek_fsidtype;
+ memcpy(__entry->fsid, key->ek_fsid, 4*6);
+ __assign_str(auth_domain, key->ek_client->name);
+ __assign_str(path, exp_path);
+ __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags);
+ ),
+ TP_printk("fsid=%x::%s domain=%s path=%s cache=%s",
+ __entry->fsidtype,
+ __print_array(__entry->fsid, 6, 4),
+ __get_str(auth_domain),
+ __get_str(path),
+ __entry->cache ? "pos" : "neg"
+ )
+);
+
+TRACE_EVENT(nfsd_exp_get_by_name,
+ TP_PROTO(const struct svc_export *key,
+ int status),
+ TP_ARGS(key, status),
+ TP_STRUCT__entry(
+ __string(path, key->ex_path.dentry->d_name.name)
+ __string(auth_domain, key->ex_client->name)
+ __field(int, status)
+ ),
+ TP_fast_assign(
+ __assign_str(path, key->ex_path.dentry->d_name.name);
+ __assign_str(auth_domain, key->ex_client->name);
+ __entry->status = status;
+ ),
+ TP_printk("path=%s domain=%s status=%d",
+ __get_str(path),
+ __get_str(auth_domain),
+ __entry->status
+ )
+);
+
+TRACE_EVENT(nfsd_export_update,
+ TP_PROTO(const struct svc_export *key),
+ TP_ARGS(key),
+ TP_STRUCT__entry(
+ __string(path, key->ex_path.dentry->d_name.name)
+ __string(auth_domain, key->ex_client->name)
+ __field(bool, cache)
+ ),
+ TP_fast_assign(
+ __assign_str(path, key->ex_path.dentry->d_name.name);
+ __assign_str(auth_domain, key->ex_client->name);
+ __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags);
+ ),
+ TP_printk("path=%s domain=%s cache=%s",
+ __get_str(path),
+ __get_str(auth_domain),
+ __entry->cache ? "pos" : "neg"
+ )
+);
+
DECLARE_EVENT_CLASS(nfsd_io_class,
TP_PROTO(struct svc_rqst *rqstp,
struct svc_fh *fhp,
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 5778d1347b35..5435a40f82be 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -17,6 +17,59 @@
#include "fanotify.h"
+static bool fanotify_path_equal(struct path *p1, struct path *p2)
+{
+ return p1->mnt == p2->mnt && p1->dentry == p2->dentry;
+}
+
+static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1,
+ __kernel_fsid_t *fsid2)
+{
+ return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1];
+}
+
+static bool fanotify_fh_equal(struct fanotify_fh *fh1,
+ struct fanotify_fh *fh2)
+{
+ if (fh1->type != fh2->type || fh1->len != fh2->len)
+ return false;
+
+ /* Do not merge events if we failed to encode fh */
+ if (fh1->type == FILEID_INVALID)
+ return false;
+
+ return !fh1->len ||
+ !memcmp(fanotify_fh_buf(fh1), fanotify_fh_buf(fh2), fh1->len);
+}
+
+static bool fanotify_fid_event_equal(struct fanotify_fid_event *ffe1,
+ struct fanotify_fid_event *ffe2)
+{
+ /* Do not merge fid events without object fh */
+ if (!ffe1->object_fh.len)
+ return false;
+
+ return fanotify_fsid_equal(&ffe1->fsid, &ffe2->fsid) &&
+ fanotify_fh_equal(&ffe1->object_fh, &ffe2->object_fh);
+}
+
+static bool fanotify_name_event_equal(struct fanotify_name_event *fne1,
+ struct fanotify_name_event *fne2)
+{
+ /*
+ * Do not merge name events without dir fh.
+ * FAN_DIR_MODIFY does not encode object fh, so it may be empty.
+ */
+ if (!fne1->dir_fh.len)
+ return false;
+
+ if (fne1->name_len != fne2->name_len ||
+ !fanotify_fh_equal(&fne1->dir_fh, &fne2->dir_fh))
+ return false;
+
+ return !memcmp(fne1->name, fne2->name, fne1->name_len);
+}
+
static bool should_merge(struct fsnotify_event *old_fsn,
struct fsnotify_event *new_fsn)
{
@@ -26,14 +79,15 @@ static bool should_merge(struct fsnotify_event *old_fsn,
old = FANOTIFY_E(old_fsn);
new = FANOTIFY_E(new_fsn);
- if (old_fsn->inode != new_fsn->inode || old->pid != new->pid ||
- old->fh_type != new->fh_type || old->fh_len != new->fh_len)
+ if (old_fsn->objectid != new_fsn->objectid ||
+ old->type != new->type || old->pid != new->pid)
return false;
- if (fanotify_event_has_path(old)) {
- return old->path.mnt == new->path.mnt &&
- old->path.dentry == new->path.dentry;
- } else if (fanotify_event_has_fid(old)) {
+ switch (old->type) {
+ case FANOTIFY_EVENT_TYPE_PATH:
+ return fanotify_path_equal(fanotify_event_path(old),
+ fanotify_event_path(new));
+ case FANOTIFY_EVENT_TYPE_FID:
/*
* We want to merge many dirent events in the same dir (i.e.
* creates/unlinks/renames), but we do not want to merge dirent
@@ -42,11 +96,18 @@ static bool should_merge(struct fsnotify_event *old_fsn,
* mask FAN_CREATE|FAN_DELETE|FAN_ONDIR if it describes mkdir+
* unlink pair or rmdir+create pair of events.
*/
- return (old->mask & FS_ISDIR) == (new->mask & FS_ISDIR) &&
- fanotify_fid_equal(&old->fid, &new->fid, old->fh_len);
+ if ((old->mask & FS_ISDIR) != (new->mask & FS_ISDIR))
+ return false;
+
+ return fanotify_fid_event_equal(FANOTIFY_FE(old),
+ FANOTIFY_FE(new));
+ case FANOTIFY_EVENT_TYPE_FID_NAME:
+ return fanotify_name_event_equal(FANOTIFY_NE(old),
+ FANOTIFY_NE(new));
+ default:
+ WARN_ON_ONCE(1);
}
- /* Do not merge events if we failed to encode fid */
return false;
}
@@ -151,7 +212,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
{
__u32 marks_mask = 0, marks_ignored_mask = 0;
__u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS;
- const struct path *path = data;
+ const struct path *path = fsnotify_data_path(data, data_type);
struct fsnotify_mark *mark;
int type;
@@ -160,7 +221,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
if (!FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
/* Do we have path to open a file descriptor? */
- if (data_type != FSNOTIFY_EVENT_PATH)
+ if (!path)
return 0;
/* Path type events are only relevant for files and dirs */
if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry))
@@ -172,6 +233,13 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
continue;
mark = iter_info->marks[type];
/*
+ * If the event is on dir and this mark doesn't care about
+ * events on dir, don't send it!
+ */
+ if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR))
+ continue;
+
+ /*
* If the event is for a child and this mark doesn't care about
* events on a child, don't send it!
*/
@@ -187,9 +255,9 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
test_mask = event_mask & marks_mask & ~marks_ignored_mask;
/*
- * dirent modification events (create/delete/move) do not carry the
- * child entry name/inode information. Instead, we report FAN_ONDIR
- * for mkdir/rmdir so user can differentiate them from creat/unlink.
+ * For dirent modification events (create/delete/move) that do not carry
+ * the child entry name information, we report FAN_ONDIR for mkdir/rmdir
+ * so user can differentiate them from creat/unlink.
*
* For backward compatibility and consistency, do not report FAN_ONDIR
* to user in legacy fanotify mode (reporting fd) and report FAN_ONDIR
@@ -203,22 +271,20 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
user_mask &= ~FAN_ONDIR;
}
- if (event_mask & FS_ISDIR &&
- !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
- return 0;
-
return test_mask & user_mask;
}
-static int fanotify_encode_fid(struct fanotify_event *event,
- struct inode *inode, gfp_t gfp,
- __kernel_fsid_t *fsid)
+static void fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
+ gfp_t gfp)
{
- struct fanotify_fid *fid = &event->fid;
- int dwords, bytes = 0;
- int err, type;
+ int dwords, type, bytes = 0;
+ char *ext_buf = NULL;
+ void *buf = fh->buf;
+ int err;
+
+ if (!inode)
+ goto out;
- fid->ext_fh = NULL;
dwords = 0;
err = -ENOENT;
type = exportfs_encode_inode_fh(inode, NULL, &dwords, NULL);
@@ -229,31 +295,33 @@ static int fanotify_encode_fid(struct fanotify_event *event,
if (bytes > FANOTIFY_INLINE_FH_LEN) {
/* Treat failure to allocate fh as failure to allocate event */
err = -ENOMEM;
- fid->ext_fh = kmalloc(bytes, gfp);
- if (!fid->ext_fh)
+ ext_buf = kmalloc(bytes, gfp);
+ if (!ext_buf)
goto out_err;
+
+ *fanotify_fh_ext_buf_ptr(fh) = ext_buf;
+ buf = ext_buf;
}
- type = exportfs_encode_inode_fh(inode, fanotify_fid_fh(fid, bytes),
- &dwords, NULL);
+ type = exportfs_encode_inode_fh(inode, buf, &dwords, NULL);
err = -EINVAL;
if (!type || type == FILEID_INVALID || bytes != dwords << 2)
goto out_err;
- fid->fsid = *fsid;
- event->fh_len = bytes;
+ fh->type = type;
+ fh->len = bytes;
- return type;
+ return;
out_err:
- pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, "
- "type=%d, bytes=%d, err=%i)\n",
- fsid->val[0], fsid->val[1], type, bytes, err);
- kfree(fid->ext_fh);
- fid->ext_fh = NULL;
- event->fh_len = 0;
-
- return FILEID_INVALID;
+ pr_warn_ratelimited("fanotify: failed to encode fid (type=%d, len=%d, err=%i)\n",
+ type, bytes, err);
+ kfree(ext_buf);
+ *fanotify_fh_ext_buf_ptr(fh) = NULL;
+out:
+ /* Report the event without a file identifier on encode error */
+ fh->type = FILEID_INVALID;
+ fh->len = 0;
}
/*
@@ -269,21 +337,22 @@ static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask,
{
if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
return to_tell;
- else if (data_type == FSNOTIFY_EVENT_INODE)
- return (struct inode *)data;
- else if (data_type == FSNOTIFY_EVENT_PATH)
- return d_inode(((struct path *)data)->dentry);
- return NULL;
+
+ return (struct inode *)fsnotify_data_inode(data, data_type);
}
struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
struct inode *inode, u32 mask,
const void *data, int data_type,
+ const struct qstr *file_name,
__kernel_fsid_t *fsid)
{
struct fanotify_event *event = NULL;
+ struct fanotify_fid_event *ffe = NULL;
+ struct fanotify_name_event *fne = NULL;
gfp_t gfp = GFP_KERNEL_ACCOUNT;
struct inode *id = fanotify_fid_inode(inode, mask, data, data_type);
+ const struct path *path = fsnotify_data_path(data, data_type);
/*
* For queues with unlimited length lost events are not expected and
@@ -305,33 +374,81 @@ struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
if (!pevent)
goto out;
+
event = &pevent->fae;
+ event->type = FANOTIFY_EVENT_TYPE_PATH_PERM;
pevent->response = 0;
pevent->state = FAN_EVENT_INIT;
goto init;
}
- event = kmem_cache_alloc(fanotify_event_cachep, gfp);
- if (!event)
- goto out;
-init: __maybe_unused
- fsnotify_init_event(&event->fse, inode);
+
+ /*
+ * For FAN_DIR_MODIFY event, we report the fid of the directory and
+ * the name of the modified entry.
+ * Allocate an fanotify_name_event struct and copy the name.
+ */
+ if (mask & FAN_DIR_MODIFY && !(WARN_ON_ONCE(!file_name))) {
+ fne = kmalloc(sizeof(*fne) + file_name->len + 1, gfp);
+ if (!fne)
+ goto out;
+
+ event = &fne->fae;
+ event->type = FANOTIFY_EVENT_TYPE_FID_NAME;
+ fne->name_len = file_name->len;
+ strcpy(fne->name, file_name->name);
+ goto init;
+ }
+
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ ffe = kmem_cache_alloc(fanotify_fid_event_cachep, gfp);
+ if (!ffe)
+ goto out;
+
+ event = &ffe->fae;
+ event->type = FANOTIFY_EVENT_TYPE_FID;
+ } else {
+ struct fanotify_path_event *pevent;
+
+ pevent = kmem_cache_alloc(fanotify_path_event_cachep, gfp);
+ if (!pevent)
+ goto out;
+
+ event = &pevent->fae;
+ event->type = FANOTIFY_EVENT_TYPE_PATH;
+ }
+
+init:
+ /*
+ * Use the victim inode instead of the watching inode as the id for
+ * event queue, so event reported on parent is merged with event
+ * reported on child when both directory and child watches exist.
+ */
+ fsnotify_init_event(&event->fse, (unsigned long)id);
event->mask = mask;
if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
event->pid = get_pid(task_pid(current));
else
event->pid = get_pid(task_tgid(current));
- event->fh_len = 0;
- if (id && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
- /* Report the event without a file identifier on encode error */
- event->fh_type = fanotify_encode_fid(event, id, gfp, fsid);
- } else if (data_type == FSNOTIFY_EVENT_PATH) {
- event->fh_type = FILEID_ROOT;
- event->path = *((struct path *)data);
- path_get(&event->path);
- } else {
- event->fh_type = FILEID_INVALID;
- event->path.mnt = NULL;
- event->path.dentry = NULL;
+
+ if (fsid && fanotify_event_fsid(event))
+ *fanotify_event_fsid(event) = *fsid;
+
+ if (fanotify_event_object_fh(event))
+ fanotify_encode_fh(fanotify_event_object_fh(event), id, gfp);
+
+ if (fanotify_event_dir_fh(event))
+ fanotify_encode_fh(fanotify_event_dir_fh(event), id, gfp);
+
+ if (fanotify_event_has_path(event)) {
+ struct path *p = fanotify_event_path(event);
+
+ if (path) {
+ *p = *path;
+ path_get(path);
+ } else {
+ p->mnt = NULL;
+ p->dentry = NULL;
+ }
}
out:
memalloc_unuse_memcg();
@@ -392,6 +509,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM);
BUILD_BUG_ON(FAN_CREATE != FS_CREATE);
BUILD_BUG_ON(FAN_DELETE != FS_DELETE);
+ BUILD_BUG_ON(FAN_DIR_MODIFY != FS_DIR_MODIFY);
BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF);
BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF);
BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
@@ -402,7 +520,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
- BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20);
mask = fanotify_group_event_mask(group, iter_info, mask, data,
data_type);
@@ -429,7 +547,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
}
event = fanotify_alloc_event(group, inode, mask, data, data_type,
- &fsid);
+ file_name, &fsid);
ret = -ENOMEM;
if (unlikely(!event)) {
/*
@@ -451,7 +569,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
ret = 0;
} else if (fanotify_is_perm_event(mask)) {
- ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event),
+ ret = fanotify_get_response(group, FANOTIFY_PERM(event),
iter_info);
}
finish:
@@ -470,22 +588,58 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
free_uid(user);
}
+static void fanotify_free_path_event(struct fanotify_event *event)
+{
+ path_put(fanotify_event_path(event));
+ kmem_cache_free(fanotify_path_event_cachep, FANOTIFY_PE(event));
+}
+
+static void fanotify_free_perm_event(struct fanotify_event *event)
+{
+ path_put(fanotify_event_path(event));
+ kmem_cache_free(fanotify_perm_event_cachep, FANOTIFY_PERM(event));
+}
+
+static void fanotify_free_fid_event(struct fanotify_event *event)
+{
+ struct fanotify_fid_event *ffe = FANOTIFY_FE(event);
+
+ if (fanotify_fh_has_ext_buf(&ffe->object_fh))
+ kfree(fanotify_fh_ext_buf(&ffe->object_fh));
+ kmem_cache_free(fanotify_fid_event_cachep, ffe);
+}
+
+static void fanotify_free_name_event(struct fanotify_event *event)
+{
+ struct fanotify_name_event *fne = FANOTIFY_NE(event);
+
+ if (fanotify_fh_has_ext_buf(&fne->dir_fh))
+ kfree(fanotify_fh_ext_buf(&fne->dir_fh));
+ kfree(fne);
+}
+
static void fanotify_free_event(struct fsnotify_event *fsn_event)
{
struct fanotify_event *event;
event = FANOTIFY_E(fsn_event);
- if (fanotify_event_has_path(event))
- path_put(&event->path);
- else if (fanotify_event_has_ext_fh(event))
- kfree(event->fid.ext_fh);
put_pid(event->pid);
- if (fanotify_is_perm_event(event->mask)) {
- kmem_cache_free(fanotify_perm_event_cachep,
- FANOTIFY_PE(fsn_event));
- return;
+ switch (event->type) {
+ case FANOTIFY_EVENT_TYPE_PATH:
+ fanotify_free_path_event(event);
+ break;
+ case FANOTIFY_EVENT_TYPE_PATH_PERM:
+ fanotify_free_perm_event(event);
+ break;
+ case FANOTIFY_EVENT_TYPE_FID:
+ fanotify_free_fid_event(event);
+ break;
+ case FANOTIFY_EVENT_TYPE_FID_NAME:
+ fanotify_free_name_event(event);
+ break;
+ default:
+ WARN_ON_ONCE(1);
}
- kmem_cache_free(fanotify_event_cachep, event);
}
static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 68b30504284c..35bfbf4a7aac 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -5,7 +5,8 @@
#include <linux/exportfs.h>
extern struct kmem_cache *fanotify_mark_cache;
-extern struct kmem_cache *fanotify_event_cachep;
+extern struct kmem_cache *fanotify_fid_event_cachep;
+extern struct kmem_cache *fanotify_path_event_cachep;
extern struct kmem_cache *fanotify_perm_event_cachep;
/* Possible states of the permission event */
@@ -18,94 +19,140 @@ enum {
/*
* 3 dwords are sufficient for most local fs (64bit ino, 32bit generation).
- * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and
- * fh_* fields increase the size of fanotify_event by another 4 bytes.
- * For 64bit arch, fid increases the size of fanotify_fid by 8 bytes and
- * fh_* fields are packed in a hole after mask.
+ * fh buf should be dword aligned. On 64bit arch, the ext_buf pointer is
+ * stored in either the first or last 2 dwords.
*/
-#if BITS_PER_LONG == 32
#define FANOTIFY_INLINE_FH_LEN (3 << 2)
-#else
-#define FANOTIFY_INLINE_FH_LEN (4 << 2)
-#endif
-struct fanotify_fid {
- __kernel_fsid_t fsid;
- union {
- unsigned char fh[FANOTIFY_INLINE_FH_LEN];
- unsigned char *ext_fh;
- };
-};
+struct fanotify_fh {
+ unsigned char buf[FANOTIFY_INLINE_FH_LEN];
+ u8 type;
+ u8 len;
+} __aligned(4);
+
+static inline bool fanotify_fh_has_ext_buf(struct fanotify_fh *fh)
+{
+ return fh->len > FANOTIFY_INLINE_FH_LEN;
+}
+
+static inline char **fanotify_fh_ext_buf_ptr(struct fanotify_fh *fh)
+{
+ BUILD_BUG_ON(__alignof__(char *) - 4 + sizeof(char *) >
+ FANOTIFY_INLINE_FH_LEN);
+ return (char **)ALIGN((unsigned long)(fh->buf), __alignof__(char *));
+}
-static inline void *fanotify_fid_fh(struct fanotify_fid *fid,
- unsigned int fh_len)
+static inline void *fanotify_fh_ext_buf(struct fanotify_fh *fh)
{
- return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh;
+ return *fanotify_fh_ext_buf_ptr(fh);
}
-static inline bool fanotify_fid_equal(struct fanotify_fid *fid1,
- struct fanotify_fid *fid2,
- unsigned int fh_len)
+static inline void *fanotify_fh_buf(struct fanotify_fh *fh)
{
- return fid1->fsid.val[0] == fid2->fsid.val[0] &&
- fid1->fsid.val[1] == fid2->fsid.val[1] &&
- !memcmp(fanotify_fid_fh(fid1, fh_len),
- fanotify_fid_fh(fid2, fh_len), fh_len);
+ return fanotify_fh_has_ext_buf(fh) ? fanotify_fh_ext_buf(fh) : fh->buf;
}
/*
- * Structure for normal fanotify events. It gets allocated in
+ * Common structure for fanotify events. Concrete structs are allocated in
* fanotify_handle_event() and freed when the information is retrieved by
- * userspace
+ * userspace. The type of event determines how it was allocated, how it will
+ * be freed and which concrete struct it may be cast to.
*/
+enum fanotify_event_type {
+ FANOTIFY_EVENT_TYPE_FID, /* fixed length */
+ FANOTIFY_EVENT_TYPE_FID_NAME, /* variable length */
+ FANOTIFY_EVENT_TYPE_PATH,
+ FANOTIFY_EVENT_TYPE_PATH_PERM,
+};
+
struct fanotify_event {
struct fsnotify_event fse;
u32 mask;
- /*
- * Those fields are outside fanotify_fid to pack fanotify_event nicely
- * on 64bit arch and to use fh_type as an indication of whether path
- * or fid are used in the union:
- * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither.
- */
- u8 fh_type;
- u8 fh_len;
- u16 pad;
- union {
- /*
- * We hold ref to this path so it may be dereferenced at any
- * point during this object's lifetime
- */
- struct path path;
- /*
- * With FAN_REPORT_FID, we do not hold any reference on the
- * victim object. Instead we store its NFS file handle and its
- * filesystem's fsid as a unique identifier.
- */
- struct fanotify_fid fid;
- };
+ enum fanotify_event_type type;
struct pid *pid;
};
-static inline bool fanotify_event_has_path(struct fanotify_event *event)
+struct fanotify_fid_event {
+ struct fanotify_event fae;
+ __kernel_fsid_t fsid;
+ struct fanotify_fh object_fh;
+};
+
+static inline struct fanotify_fid_event *
+FANOTIFY_FE(struct fanotify_event *event)
{
- return event->fh_type == FILEID_ROOT;
+ return container_of(event, struct fanotify_fid_event, fae);
}
-static inline bool fanotify_event_has_fid(struct fanotify_event *event)
+struct fanotify_name_event {
+ struct fanotify_event fae;
+ __kernel_fsid_t fsid;
+ struct fanotify_fh dir_fh;
+ u8 name_len;
+ char name[0];
+};
+
+static inline struct fanotify_name_event *
+FANOTIFY_NE(struct fanotify_event *event)
{
- return event->fh_type != FILEID_ROOT &&
- event->fh_type != FILEID_INVALID;
+ return container_of(event, struct fanotify_name_event, fae);
}
-static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event)
+static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event)
{
- return fanotify_event_has_fid(event) &&
- event->fh_len > FANOTIFY_INLINE_FH_LEN;
+ if (event->type == FANOTIFY_EVENT_TYPE_FID)
+ return &FANOTIFY_FE(event)->fsid;
+ else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME)
+ return &FANOTIFY_NE(event)->fsid;
+ else
+ return NULL;
}
-static inline void *fanotify_event_fh(struct fanotify_event *event)
+static inline struct fanotify_fh *fanotify_event_object_fh(
+ struct fanotify_event *event)
{
- return fanotify_fid_fh(&event->fid, event->fh_len);
+ if (event->type == FANOTIFY_EVENT_TYPE_FID)
+ return &FANOTIFY_FE(event)->object_fh;
+ else
+ return NULL;
+}
+
+static inline struct fanotify_fh *fanotify_event_dir_fh(
+ struct fanotify_event *event)
+{
+ if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME)
+ return &FANOTIFY_NE(event)->dir_fh;
+ else
+ return NULL;
+}
+
+static inline int fanotify_event_object_fh_len(struct fanotify_event *event)
+{
+ struct fanotify_fh *fh = fanotify_event_object_fh(event);
+
+ return fh ? fh->len : 0;
+}
+
+static inline bool fanotify_event_has_name(struct fanotify_event *event)
+{
+ return event->type == FANOTIFY_EVENT_TYPE_FID_NAME;
+}
+
+static inline int fanotify_event_name_len(struct fanotify_event *event)
+{
+ return fanotify_event_has_name(event) ?
+ FANOTIFY_NE(event)->name_len : 0;
+}
+
+struct fanotify_path_event {
+ struct fanotify_event fae;
+ struct path path;
+};
+
+static inline struct fanotify_path_event *
+FANOTIFY_PE(struct fanotify_event *event)
+{
+ return container_of(event, struct fanotify_path_event, fae);
}
/*
@@ -117,15 +164,16 @@ static inline void *fanotify_event_fh(struct fanotify_event *event)
*/
struct fanotify_perm_event {
struct fanotify_event fae;
+ struct path path;
unsigned short response; /* userspace answer to the event */
unsigned short state; /* state of the event */
int fd; /* fd we passed to userspace for this event */
};
static inline struct fanotify_perm_event *
-FANOTIFY_PE(struct fsnotify_event *fse)
+FANOTIFY_PERM(struct fanotify_event *event)
{
- return container_of(fse, struct fanotify_perm_event, fae.fse);
+ return container_of(event, struct fanotify_perm_event, fae);
}
static inline bool fanotify_is_perm_event(u32 mask)
@@ -139,7 +187,24 @@ static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
return container_of(fse, struct fanotify_event, fse);
}
+static inline bool fanotify_event_has_path(struct fanotify_event *event)
+{
+ return event->type == FANOTIFY_EVENT_TYPE_PATH ||
+ event->type == FANOTIFY_EVENT_TYPE_PATH_PERM;
+}
+
+static inline struct path *fanotify_event_path(struct fanotify_event *event)
+{
+ if (event->type == FANOTIFY_EVENT_TYPE_PATH)
+ return &FANOTIFY_PE(event)->path;
+ else if (event->type == FANOTIFY_EVENT_TYPE_PATH_PERM)
+ return &FANOTIFY_PERM(event)->path;
+ else
+ return NULL;
+}
+
struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
struct inode *inode, u32 mask,
const void *data, int data_type,
+ const struct qstr *file_name,
__kernel_fsid_t *fsid);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 0aa362b88550..42cb794c62ac 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -46,32 +46,53 @@
extern const struct fsnotify_ops fanotify_fsnotify_ops;
struct kmem_cache *fanotify_mark_cache __read_mostly;
-struct kmem_cache *fanotify_event_cachep __read_mostly;
+struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
+struct kmem_cache *fanotify_path_event_cachep __read_mostly;
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
#define FANOTIFY_EVENT_ALIGN 4
+#define FANOTIFY_INFO_HDR_LEN \
+ (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
+
+static int fanotify_fid_info_len(int fh_len, int name_len)
+{
+ int info_len = fh_len;
+
+ if (name_len)
+ info_len += name_len + 1;
+
+ return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN);
+}
static int fanotify_event_info_len(struct fanotify_event *event)
{
- if (!fanotify_event_has_fid(event))
- return 0;
+ int info_len = 0;
+ int fh_len = fanotify_event_object_fh_len(event);
+
+ if (fh_len)
+ info_len += fanotify_fid_info_len(fh_len, 0);
- return roundup(sizeof(struct fanotify_event_info_fid) +
- sizeof(struct file_handle) + event->fh_len,
- FANOTIFY_EVENT_ALIGN);
+ if (fanotify_event_name_len(event)) {
+ struct fanotify_name_event *fne = FANOTIFY_NE(event);
+
+ info_len += fanotify_fid_info_len(fne->dir_fh.len,
+ fne->name_len);
+ }
+
+ return info_len;
}
/*
- * Get an fsnotify notification event if one exists and is small
+ * Get an fanotify notification event if one exists and is small
* enough to fit in "count". Return an error pointer if the count
* is not large enough. When permission event is dequeued, its state is
* updated accordingly.
*/
-static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
+static struct fanotify_event *get_one_event(struct fsnotify_group *group,
size_t count)
{
size_t event_size = FAN_EVENT_METADATA_LEN;
- struct fsnotify_event *fsn_event = NULL;
+ struct fanotify_event *event = NULL;
pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
@@ -85,26 +106,23 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
}
if (event_size > count) {
- fsn_event = ERR_PTR(-EINVAL);
+ event = ERR_PTR(-EINVAL);
goto out;
}
- fsn_event = fsnotify_remove_first_event(group);
- if (fanotify_is_perm_event(FANOTIFY_E(fsn_event)->mask))
- FANOTIFY_PE(fsn_event)->state = FAN_EVENT_REPORTED;
+ event = FANOTIFY_E(fsnotify_remove_first_event(group));
+ if (fanotify_is_perm_event(event->mask))
+ FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
out:
spin_unlock(&group->notification_lock);
- return fsn_event;
+ return event;
}
-static int create_fd(struct fsnotify_group *group,
- struct fanotify_event *event,
+static int create_fd(struct fsnotify_group *group, struct path *path,
struct file **file)
{
int client_fd;
struct file *new_file;
- pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-
client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
if (client_fd < 0)
return client_fd;
@@ -113,14 +131,9 @@ static int create_fd(struct fsnotify_group *group,
* we need a new file handle for the userspace program so it can read even if it was
* originally opened O_WRONLY.
*/
- /* it's possible this event was an overflow event. in that case dentry and mnt
- * are NULL; That's fine, just don't call dentry open */
- if (event->path.dentry && event->path.mnt)
- new_file = dentry_open(&event->path,
- group->fanotify_data.f_flags | FMODE_NONOTIFY,
- current_cred());
- else
- new_file = ERR_PTR(-EOVERFLOW);
+ new_file = dentry_open(path,
+ group->fanotify_data.f_flags | FMODE_NONOTIFY,
+ current_cred());
if (IS_ERR(new_file)) {
/*
* we still send an event even if we can't open the file. this
@@ -204,83 +217,111 @@ static int process_access_response(struct fsnotify_group *group,
return -ENOENT;
}
-static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
+static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
+ const char *name, size_t name_len,
+ char __user *buf, size_t count)
{
struct fanotify_event_info_fid info = { };
struct file_handle handle = { };
- unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh;
- size_t fh_len = event->fh_len;
- size_t len = fanotify_event_info_len(event);
+ unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
+ size_t fh_len = fh ? fh->len : 0;
+ size_t info_len = fanotify_fid_info_len(fh_len, name_len);
+ size_t len = info_len;
+
+ pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
+ __func__, fh_len, name_len, info_len, count);
- if (!len)
+ if (!fh_len || (name && !name_len))
return 0;
- if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len))
+ if (WARN_ON_ONCE(len < sizeof(info) || len > count))
return -EFAULT;
- /* Copy event info fid header followed by vaiable sized file handle */
- info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID;
+ /*
+ * Copy event info fid header followed by variable sized file handle
+ * and optionally followed by variable sized filename.
+ */
+ info.hdr.info_type = name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
+ FAN_EVENT_INFO_TYPE_FID;
info.hdr.len = len;
- info.fsid = event->fid.fsid;
+ info.fsid = *fsid;
if (copy_to_user(buf, &info, sizeof(info)))
return -EFAULT;
buf += sizeof(info);
len -= sizeof(info);
- handle.handle_type = event->fh_type;
+ if (WARN_ON_ONCE(len < sizeof(handle)))
+ return -EFAULT;
+
+ handle.handle_type = fh->type;
handle.handle_bytes = fh_len;
if (copy_to_user(buf, &handle, sizeof(handle)))
return -EFAULT;
buf += sizeof(handle);
len -= sizeof(handle);
+ if (WARN_ON_ONCE(len < fh_len))
+ return -EFAULT;
+
/*
- * For an inline fh, copy through stack to exclude the copy from
- * usercopy hardening protections.
+ * For an inline fh and inline file name, copy through stack to exclude
+ * the copy from usercopy hardening protections.
*/
- fh = fanotify_event_fh(event);
+ fh_buf = fanotify_fh_buf(fh);
if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
- memcpy(bounce, fh, fh_len);
- fh = bounce;
+ memcpy(bounce, fh_buf, fh_len);
+ fh_buf = bounce;
}
- if (copy_to_user(buf, fh, fh_len))
+ if (copy_to_user(buf, fh_buf, fh_len))
return -EFAULT;
- /* Pad with 0's */
buf += fh_len;
len -= fh_len;
+
+ if (name_len) {
+ /* Copy the filename with terminating null */
+ name_len++;
+ if (WARN_ON_ONCE(len < name_len))
+ return -EFAULT;
+
+ if (copy_to_user(buf, name, name_len))
+ return -EFAULT;
+
+ buf += name_len;
+ len -= name_len;
+ }
+
+ /* Pad with 0's */
WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
if (len > 0 && clear_user(buf, len))
return -EFAULT;
- return 0;
+ return info_len;
}
static ssize_t copy_event_to_user(struct fsnotify_group *group,
- struct fsnotify_event *fsn_event,
+ struct fanotify_event *event,
char __user *buf, size_t count)
{
struct fanotify_event_metadata metadata;
- struct fanotify_event *event;
+ struct path *path = fanotify_event_path(event);
struct file *f = NULL;
int ret, fd = FAN_NOFD;
- pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
+ pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- event = container_of(fsn_event, struct fanotify_event, fse);
- metadata.event_len = FAN_EVENT_METADATA_LEN;
+ metadata.event_len = FAN_EVENT_METADATA_LEN +
+ fanotify_event_info_len(event);
metadata.metadata_len = FAN_EVENT_METADATA_LEN;
metadata.vers = FANOTIFY_METADATA_VERSION;
metadata.reserved = 0;
metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
metadata.pid = pid_vnr(event->pid);
- if (fanotify_event_has_path(event)) {
- fd = create_fd(group, event, &f);
+ if (path && path->mnt && path->dentry) {
+ fd = create_fd(group, path, &f);
if (fd < 0)
return fd;
- } else if (fanotify_event_has_fid(event)) {
- metadata.event_len += fanotify_event_info_len(event);
}
metadata.fd = fd;
@@ -295,15 +336,39 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
goto out_close_fd;
+ buf += FAN_EVENT_METADATA_LEN;
+ count -= FAN_EVENT_METADATA_LEN;
+
if (fanotify_is_perm_event(event->mask))
- FANOTIFY_PE(fsn_event)->fd = fd;
+ FANOTIFY_PERM(event)->fd = fd;
- if (fanotify_event_has_path(event)) {
+ if (f)
fd_install(fd, f);
- } else if (fanotify_event_has_fid(event)) {
- ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN);
+
+ /* Event info records order is: dir fid + name, child fid */
+ if (fanotify_event_name_len(event)) {
+ struct fanotify_name_event *fne = FANOTIFY_NE(event);
+
+ ret = copy_info_to_user(fanotify_event_fsid(event),
+ fanotify_event_dir_fh(event),
+ fne->name, fne->name_len,
+ buf, count);
+ if (ret < 0)
+ return ret;
+
+ buf += ret;
+ count -= ret;
+ }
+
+ if (fanotify_event_object_fh_len(event)) {
+ ret = copy_info_to_user(fanotify_event_fsid(event),
+ fanotify_event_object_fh(event),
+ NULL, 0, buf, count);
if (ret < 0)
return ret;
+
+ buf += ret;
+ count -= ret;
}
return metadata.event_len;
@@ -335,7 +400,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
size_t count, loff_t *pos)
{
struct fsnotify_group *group;
- struct fsnotify_event *kevent;
+ struct fanotify_event *event;
char __user *start;
int ret;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
@@ -347,13 +412,13 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
add_wait_queue(&group->notification_waitq, &wait);
while (1) {
- kevent = get_one_event(group, count);
- if (IS_ERR(kevent)) {
- ret = PTR_ERR(kevent);
+ event = get_one_event(group, count);
+ if (IS_ERR(event)) {
+ ret = PTR_ERR(event);
break;
}
- if (!kevent) {
+ if (!event) {
ret = -EAGAIN;
if (file->f_flags & O_NONBLOCK)
break;
@@ -369,7 +434,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
continue;
}
- ret = copy_event_to_user(group, kevent, buf, count);
+ ret = copy_event_to_user(group, event, buf, count);
if (unlikely(ret == -EOPENSTALE)) {
/*
* We cannot report events with stale fd so drop it.
@@ -384,17 +449,17 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
* Permission events get queued to wait for response. Other
* events can be destroyed now.
*/
- if (!fanotify_is_perm_event(FANOTIFY_E(kevent)->mask)) {
- fsnotify_destroy_event(group, kevent);
+ if (!fanotify_is_perm_event(event->mask)) {
+ fsnotify_destroy_event(group, &event->fse);
} else {
if (ret <= 0) {
spin_lock(&group->notification_lock);
finish_permission_event(group,
- FANOTIFY_PE(kevent), FAN_DENY);
+ FANOTIFY_PERM(event), FAN_DENY);
wake_up(&group->fanotify_data.access_waitq);
} else {
spin_lock(&group->notification_lock);
- list_add_tail(&kevent->list,
+ list_add_tail(&event->fse.list,
&group->fanotify_data.access_list);
spin_unlock(&group->notification_lock);
}
@@ -440,8 +505,6 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
static int fanotify_release(struct inode *ignored, struct file *file)
{
struct fsnotify_group *group = file->private_data;
- struct fanotify_perm_event *event;
- struct fsnotify_event *fsn_event;
/*
* Stop new events from arriving in the notification queue. since
@@ -456,6 +519,8 @@ static int fanotify_release(struct inode *ignored, struct file *file)
*/
spin_lock(&group->notification_lock);
while (!list_empty(&group->fanotify_data.access_list)) {
+ struct fanotify_perm_event *event;
+
event = list_first_entry(&group->fanotify_data.access_list,
struct fanotify_perm_event, fae.fse.list);
list_del_init(&event->fae.fse.list);
@@ -469,12 +534,14 @@ static int fanotify_release(struct inode *ignored, struct file *file)
* response is consumed and fanotify_get_response() returns.
*/
while (!fsnotify_notify_queue_is_empty(group)) {
- fsn_event = fsnotify_remove_first_event(group);
- if (!(FANOTIFY_E(fsn_event)->mask & FANOTIFY_PERM_EVENTS)) {
+ struct fanotify_event *event;
+
+ event = FANOTIFY_E(fsnotify_remove_first_event(group));
+ if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
spin_unlock(&group->notification_lock);
- fsnotify_destroy_event(group, fsn_event);
+ fsnotify_destroy_event(group, &event->fse);
} else {
- finish_permission_event(group, FANOTIFY_PE(fsn_event),
+ finish_permission_event(group, FANOTIFY_PERM(event),
FAN_ALLOW);
}
spin_lock(&group->notification_lock);
@@ -824,7 +891,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->memcg = get_mem_cgroup_from_mm(current->mm);
oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL,
- FSNOTIFY_EVENT_NONE, NULL);
+ FSNOTIFY_EVENT_NONE, NULL, NULL);
if (unlikely(!oevent)) {
fd = -ENOMEM;
goto out_destroy_group;
@@ -1139,7 +1206,10 @@ static int __init fanotify_user_setup(void)
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
SLAB_PANIC|SLAB_ACCOUNT);
- fanotify_event_cachep = KMEM_CACHE(fanotify_event, SLAB_PANIC);
+ fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
+ SLAB_PANIC);
+ fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
+ SLAB_PANIC);
if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
fanotify_perm_event_cachep =
KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 46f225580009..72d332ce8e12 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -143,15 +143,13 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
}
/* Notify this dentry's parent about a child's events. */
-int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask)
+int fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
+ int data_type)
{
struct dentry *parent;
struct inode *p_inode;
int ret = 0;
- if (!dentry)
- dentry = path->dentry;
-
if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
return 0;
@@ -168,12 +166,7 @@ int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask
mask |= FS_EVENT_ON_CHILD;
take_dentry_name_snapshot(&name, dentry);
- if (path)
- ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
- &name.name, 0);
- else
- ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
- &name.name, 0);
+ ret = fsnotify(p_inode, mask, data, data_type, &name.name, 0);
release_dentry_name_snapshot(&name);
}
@@ -181,7 +174,7 @@ int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask
return ret;
}
-EXPORT_SYMBOL_GPL(__fsnotify_parent);
+EXPORT_SYMBOL_GPL(fsnotify_parent);
static int send_to_group(struct inode *to_tell,
__u32 mask, const void *data,
@@ -318,6 +311,7 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
const struct qstr *file_name, u32 cookie)
{
+ const struct path *path = fsnotify_data_path(data, data_is);
struct fsnotify_iter_info iter_info = {};
struct super_block *sb = to_tell->i_sb;
struct mount *mnt = NULL;
@@ -325,8 +319,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
int ret = 0;
__u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
- if (data_is == FSNOTIFY_EVENT_PATH) {
- mnt = real_mount(((const struct path *)data)->mnt);
+ if (path) {
+ mnt = real_mount(path->mnt);
mnt_or_sb_mask |= mnt->mnt_fsnotify_mask;
}
/* An event "on child" is not intended for a mount/sb mark */
@@ -389,7 +383,7 @@ static __init int fsnotify_init(void)
{
int ret;
- BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26);
ret = init_srcu_struct(&fsnotify_mark_srcu);
if (ret)
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index d510223d302c..2ebc89047153 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -39,7 +39,7 @@ static bool event_compare(struct fsnotify_event *old_fsn,
if (old->mask & FS_IN_IGNORED)
return false;
if ((old->mask == new->mask) &&
- (old_fsn->inode == new_fsn->inode) &&
+ (old_fsn->objectid == new_fsn->objectid) &&
(old->name_len == new->name_len) &&
(!old->name_len || !strcmp(old->name, new->name)))
return true;
@@ -61,6 +61,7 @@ int inotify_handle_event(struct fsnotify_group *group,
const struct qstr *file_name, u32 cookie,
struct fsnotify_iter_info *iter_info)
{
+ const struct path *path = fsnotify_data_path(data, data_type);
struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
struct inotify_inode_mark *i_mark;
struct inotify_event_info *event;
@@ -73,12 +74,9 @@ int inotify_handle_event(struct fsnotify_group *group,
return 0;
if ((inode_mark->mask & FS_EXCL_UNLINK) &&
- (data_type == FSNOTIFY_EVENT_PATH)) {
- const struct path *path = data;
+ path && d_unlinked(path->dentry))
+ return 0;
- if (d_unlinked(path->dentry))
- return 0;
- }
if (file_name) {
len = file_name->len;
alloc_len += len + 1;
@@ -118,7 +116,7 @@ int inotify_handle_event(struct fsnotify_group *group,
mask &= ~IN_ISDIR;
fsn_event = &event->fse;
- fsnotify_init_event(fsn_event, inode);
+ fsnotify_init_event(fsn_event, (unsigned long)inode);
event->mask = mask;
event->wd = i_mark->wd;
event->sync_cookie = cookie;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 107537a543fd..81ffc8629fc4 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -635,7 +635,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
return ERR_PTR(-ENOMEM);
}
group->overflow_event = &oevent->fse;
- fsnotify_init_event(group->overflow_event, NULL);
+ fsnotify_init_event(group->overflow_event, 0);
oevent->mask = FS_Q_OVERFLOW;
oevent->wd = -1;
oevent->sync_cookie = 0;
diff --git a/fs/nsfs.c b/fs/nsfs.c
index b13bfd406820..4f1205725cfe 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -247,6 +247,20 @@ out_invalid:
return ERR_PTR(-EINVAL);
}
+/**
+ * ns_match() - Returns true if current namespace matches dev/ino provided.
+ * @ns_common: current ns
+ * @dev: dev_t from nsfs that will be matched against current nsfs
+ * @ino: ino_t from nsfs that will be matched against current nsfs
+ *
+ * Return: true if dev and ino matches the current nsfs.
+ */
+bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino)
+{
+ return (ns->inum == ino) && (nsfs_mnt->mnt_sb->s_dev == dev);
+}
+
+
static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 7202a1e39d70..554b744f41bf 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -92,8 +92,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
"0x%llx.", (unsigned long long)bh->b_blocknr);
}
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
@@ -108,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
}
tmp = tmp->b_this_page;
} while (tmp != bh);
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
/*
* If none of the buffers had errors then we can set the page uptodate,
* but we first have to perform the post read mst fixups, if the
@@ -142,8 +140,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
unlock_page(page);
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 88534eb0e7c2..2f834add165b 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1060,7 +1060,6 @@ bail:
brelse(bhs[i]);
bhs[i] = NULL;
}
- mlog_errno(status);
}
return status;
}
@@ -3942,7 +3941,7 @@ rotate:
* above.
*
* This leaf needs to have space, either by the empty 1st
- * extent record, or by virtue of an l_next_rec < l_count.
+ * extent record, or by virtue of an l_next_free_rec < l_count.
*/
ocfs2_rotate_leaf(el, insert_rec);
}
@@ -7403,6 +7402,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_inline_data *idata = &di->id2.i_data;
+ /* No need to punch hole beyond i_size. */
+ if (start >= i_size_read(inode))
+ return 0;
+
if (end > i_size_read(inode))
end = i_size_read(inode);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a368350d4c27..89d13e0705fe 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -101,8 +101,6 @@ static struct o2hb_callback {
static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
-#define O2HB_DEFAULT_BLOCK_BITS 9
-
enum o2hb_heartbeat_modes {
O2HB_HEARTBEAT_LOCAL = 0,
O2HB_HEARTBEAT_GLOBAL,
@@ -1309,7 +1307,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
case O2HB_DB_TYPE_REGION_NUMBER:
reg = (struct o2hb_region *)db->db_data;
- out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
+ out += scnprintf(buf + out, PAGE_SIZE - out, "%d\n",
reg->hr_region_num);
goto done;
@@ -1319,12 +1317,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
/* If 0, it has never been set before */
if (lts)
lts = jiffies_to_msecs(jiffies - lts);
- out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
+ out += scnprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
goto done;
case O2HB_DB_TYPE_REGION_PINNED:
reg = (struct o2hb_region *)db->db_data;
- out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+ out += scnprintf(buf + out, PAGE_SIZE - out, "%u\n",
!!reg->hr_item_pinned);
goto done;
@@ -1333,8 +1331,8 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
}
while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
- out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
- out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+ out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+ out += scnprintf(buf + out, PAGE_SIZE - out, "\n");
done:
i_size_write(inode, out);
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 02bf4a1774cc..667a5c5e1f66 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -443,8 +443,8 @@ static int o2net_fill_bitmap(char *buf, int len)
o2net_fill_node_map(map, sizeof(map));
while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
- out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
- out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+ out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+ out += scnprintf(buf + out, PAGE_SIZE - out, "\n");
return out;
}
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 48a3398f0bf5..2c512b40a940 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1570,15 +1570,13 @@ static void o2net_start_connect(struct work_struct *work)
struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
int ret = 0, stop;
unsigned int timeout;
- unsigned int noio_flag;
+ unsigned int nofs_flag;
/*
- * sock_create allocates the sock with GFP_KERNEL. We must set
- * per-process flag PF_MEMALLOC_NOIO so that all allocations done
- * by this process are done as if GFP_NOIO was specified. So we
- * are not reentering filesystem while doing memory reclaim.
+ * sock_create allocates the sock with GFP_KERNEL. We must
+ * prevent the filesystem from being reentered by memory reclaim.
*/
- noio_flag = memalloc_noio_save();
+ nofs_flag = memalloc_nofs_save();
/* if we're greater we initiate tx, otherwise we accept */
if (o2nm_this_node() <= o2net_num_from_nn(nn))
goto out;
@@ -1683,7 +1681,7 @@ out:
if (mynode)
o2nm_node_put(mynode);
- memalloc_noio_restore(noio_flag);
+ memalloc_nofs_restore(nofs_flag);
return;
}
@@ -1810,15 +1808,13 @@ static int o2net_accept_one(struct socket *sock, int *more)
struct o2nm_node *local_node = NULL;
struct o2net_sock_container *sc = NULL;
struct o2net_node *nn;
- unsigned int noio_flag;
+ unsigned int nofs_flag;
/*
- * sock_create_lite allocates the sock with GFP_KERNEL. We must set
- * per-process flag PF_MEMALLOC_NOIO so that all allocations done
- * by this process are done as if GFP_NOIO was specified. So we
- * are not reentering filesystem while doing memory reclaim.
+ * sock_create_lite allocates the sock with GFP_KERNEL. We must
+ * prevent the filesystem from being reentered by memory reclaim.
*/
- noio_flag = memalloc_noio_save();
+ nofs_flag = memalloc_nofs_save();
BUG_ON(sock == NULL);
*more = 0;
@@ -1934,7 +1930,7 @@ out:
if (sc)
sc_put(sc);
- memalloc_noio_restore(noio_flag);
+ memalloc_nofs_restore(nofs_flag);
return ret;
}
@@ -1948,7 +1944,6 @@ static void o2net_accept_many(struct work_struct *work)
{
struct socket *sock = o2net_listen_sock;
int more;
- int err;
/*
* It is critical to note that due to interrupt moderation
@@ -1963,7 +1958,7 @@ static void o2net_accept_many(struct work_struct *work)
*/
for (;;) {
- err = o2net_accept_one(sock, &more);
+ o2net_accept_one(sock, &more);
if (!more)
break;
cond_resched();
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index de87cbffd175..736338f45c59 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -32,7 +32,7 @@ struct o2net_msg
__be32 status;
__be32 key;
__be32 msg_num;
- __u8 buf[0];
+ __u8 buf[];
};
typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data,
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index bdef72c0f099..5761060d2ba8 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -676,7 +676,7 @@ static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
int ra_ptr = 0; /* Current index into readahead
buffer */
int num = 0;
- int nblocks, i, err;
+ int nblocks, i;
sb = dir->i_sb;
@@ -708,7 +708,7 @@ restart:
num++;
bh = NULL;
- err = ocfs2_read_dir_block(dir, b++, &bh,
+ ocfs2_read_dir_block(dir, b++, &bh,
OCFS2_BH_READAHEAD);
bh_use[ra_max] = bh;
}
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 0463dce65bb2..c8a444622faa 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -564,7 +564,7 @@ struct dlm_migratable_lockres
// 48 bytes
u8 lvb[DLM_LVB_LEN];
// 112 bytes
- struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112
+ struct dlm_migratable_lock ml[]; // 16 bytes each, begins at byte 112
};
#define DLM_MIG_LOCKRES_MAX_LEN \
(sizeof(struct dlm_migratable_lockres) + \
@@ -601,7 +601,7 @@ struct dlm_convert_lock
u8 name[O2NM_MAX_NAME_LEN];
- s8 lvb[0];
+ s8 lvb[];
};
#define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN)
@@ -616,7 +616,7 @@ struct dlm_unlock_lock
u8 name[O2NM_MAX_NAME_LEN];
- s8 lvb[0];
+ s8 lvb[];
};
#define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN)
@@ -632,7 +632,7 @@ struct dlm_proxy_ast
u8 name[O2NM_MAX_NAME_LEN];
- s8 lvb[0];
+ s8 lvb[];
};
#define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN)
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index c5c6efba7b5e..4b8b41d23e91 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -244,11 +244,11 @@ static int stringify_lockname(const char *lockname, int locklen, char *buf,
memcpy((__be64 *)&inode_blkno_be,
(char *)&lockname[OCFS2_DENTRY_LOCK_INO_START],
sizeof(__be64));
- out += snprintf(buf + out, len - out, "%.*s%08x",
+ out += scnprintf(buf + out, len - out, "%.*s%08x",
OCFS2_DENTRY_LOCK_INO_START - 1, lockname,
(unsigned int)be64_to_cpu(inode_blkno_be));
} else
- out += snprintf(buf + out, len - out, "%.*s",
+ out += scnprintf(buf + out, len - out, "%.*s",
locklen, lockname);
return out;
}
@@ -260,7 +260,7 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
int i = -1;
while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes)
- out += snprintf(buf + out, len - out, "%d ", i);
+ out += scnprintf(buf + out, len - out, "%d ", i);
return out;
}
@@ -278,34 +278,34 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
mle_type = "MIG";
out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
mle_type, mle->master, mle->new_master,
!list_empty(&mle->hb_events),
!!mle->inuse,
kref_read(&mle->mle_refs));
- out += snprintf(buf + out, len - out, "Maybe=");
+ out += scnprintf(buf + out, len - out, "Maybe=");
out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
- out += snprintf(buf + out, len - out, "Vote=");
+ out += scnprintf(buf + out, len - out, "Vote=");
out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
- out += snprintf(buf + out, len - out, "Response=");
+ out += scnprintf(buf + out, len - out, "Response=");
out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
- out += snprintf(buf + out, len - out, "Node=");
+ out += scnprintf(buf + out, len - out, "Node=");
out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
return out;
}
@@ -353,7 +353,7 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
int out = 0;
unsigned long total = 0;
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Dumping Purgelist for Domain: %s\n", dlm->name);
spin_lock(&dlm->spinlock);
@@ -365,13 +365,13 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
out += stringify_lockname(res->lockname.name,
res->lockname.len,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\t%ld\n",
+ out += scnprintf(buf + out, len - out, "\t%ld\n",
(jiffies - res->last_used)/HZ);
spin_unlock(&res->spinlock);
}
spin_unlock(&dlm->spinlock);
- out += snprintf(buf + out, len - out, "Total on list: %lu\n", total);
+ out += scnprintf(buf + out, len - out, "Total on list: %lu\n", total);
return out;
}
@@ -410,7 +410,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
int i, out = 0;
unsigned long total = 0, longest = 0, bucket_count = 0;
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Dumping MLEs for Domain: %s\n", dlm->name);
spin_lock(&dlm->master_lock);
@@ -428,7 +428,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
}
spin_unlock(&dlm->master_lock);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Total: %lu, Longest: %lu\n", total, longest);
return out;
}
@@ -467,7 +467,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
#define DEBUG_LOCK_VERSION 1
spin_lock(&lock->spinlock);
- out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
+ out = scnprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
"%d,%d,%d,%d\n",
DEBUG_LOCK_VERSION,
list_type, lock->ml.type, lock->ml.convert_type,
@@ -491,13 +491,13 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
int i;
int out = 0;
- out += snprintf(buf + out, len - out, "NAME:");
+ out += scnprintf(buf + out, len - out, "NAME:");
out += stringify_lockname(res->lockname.name, res->lockname.len,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
#define DEBUG_LRES_VERSION 1
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n",
DEBUG_LRES_VERSION,
res->owner, res->state, res->last_used,
@@ -509,17 +509,17 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
kref_read(&res->refs));
/* refmap */
- out += snprintf(buf + out, len - out, "RMAP:");
+ out += scnprintf(buf + out, len - out, "RMAP:");
out += stringify_nodemap(res->refmap, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
/* lvb */
- out += snprintf(buf + out, len - out, "LVBX:");
+ out += scnprintf(buf + out, len - out, "LVBX:");
for (i = 0; i < DLM_LVB_LEN; i++)
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%02x", (unsigned char)res->lvb[i]);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
/* granted */
list_for_each_entry(lock, &res->granted, list)
@@ -533,7 +533,7 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
list_for_each_entry(lock, &res->blocked, list)
out += dump_lock(lock, 2, buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
return out;
}
@@ -683,41 +683,41 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
}
/* Domain: xxxxxxxxxx Key: 0xdfbac769 */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Domain: %s Key: 0x%08x Protocol: %d.%d\n",
dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
dlm->dlm_locking_proto.pv_minor);
/* Thread Pid: xxx Node: xxx State: xxxxx */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Thread Pid: %d Node: %d State: %s\n",
task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
/* Number of Joins: xxx Joining Node: xxx */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Number of Joins: %d Joining Node: %d\n",
dlm->num_joins, dlm->joining_node);
/* Domain Map: xx xx xx */
- out += snprintf(buf + out, len - out, "Domain Map: ");
+ out += scnprintf(buf + out, len - out, "Domain Map: ");
out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
/* Exit Domain Map: xx xx xx */
- out += snprintf(buf + out, len - out, "Exit Domain Map: ");
+ out += scnprintf(buf + out, len - out, "Exit Domain Map: ");
out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
/* Live Map: xx xx xx */
- out += snprintf(buf + out, len - out, "Live Map: ");
+ out += scnprintf(buf + out, len - out, "Live Map: ");
out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
/* Lock Resources: xxx (xxx) */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Lock Resources: %d (%d)\n",
atomic_read(&dlm->res_cur_count),
atomic_read(&dlm->res_tot_count));
@@ -729,29 +729,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
cur_mles += atomic_read(&dlm->mle_cur_count[i]);
/* MLEs: xxx (xxx) */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"MLEs: %d (%d)\n", cur_mles, tot_mles);
/* Blocking: xxx (xxx) */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
" Blocking: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
/* Mastery: xxx (xxx) */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
" Mastery: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
/* Migration: xxx (xxx) */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
" Migration: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
/* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Lists: Dirty=%s Purge=%s PendingASTs=%s "
"PendingBASTs=%s\n",
(list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
@@ -760,12 +760,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
/* Purge Count: xxx Refs: xxx */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Purge Count: %d Refs: %d\n", dlm->purge_count,
kref_read(&dlm->dlm_refs));
/* Dead Node: xxx */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Dead Node: %d\n", dlm->reco.dead_node);
/* What about DLM_RECO_STATE_FINALIZE? */
@@ -775,19 +775,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
state = "INACTIVE";
/* Recovery Pid: xxxx Master: xxx State: xxxx */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Recovery Pid: %d Master: %d State: %s\n",
task_pid_nr(dlm->dlm_reco_thread_task),
dlm->reco.new_master, state);
/* Recovery Map: xx xx */
- out += snprintf(buf + out, len - out, "Recovery Map: ");
+ out += scnprintf(buf + out, len - out, "Recovery Map: ");
out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
/* Recovery Node State: */
- out += snprintf(buf + out, len - out, "Recovery Node State:\n");
+ out += scnprintf(buf + out, len - out, "Recovery Node State:\n");
list_for_each_entry(node, &dlm->reco.node_data, list) {
switch (node->state) {
case DLM_RECO_NODE_DATA_INIT:
@@ -815,7 +815,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
state = "BAD";
break;
}
- out += snprintf(buf + out, len - out, "\t%u - %s\n",
+ out += scnprintf(buf + out, len - out, "\t%u - %s\n",
node->node_num, state);
}
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 900f7e466d11..55a6512e9fde 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2749,8 +2749,6 @@ leave:
return ret;
}
-#define DLM_MIGRATION_RETRY_MS 100
-
/*
* Should be called only after beginning the domain leave process.
* There should not be any remaining locks on nonlocal lock resources,
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index fd40c17cd022..5ccc4ff0b82a 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -39,8 +39,6 @@
static int dlm_thread(void *data);
static void dlm_flush_asts(struct dlm_ctxt *dlm);
-#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num)
-
/* will exit holding res->spinlock, but may drop in function */
/* waits until flags are cleared on res->state */
void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags)
@@ -680,7 +678,6 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
#define DLM_THREAD_TIMEOUT_MS (4 * 1000)
#define DLM_THREAD_MAX_DIRTY 100
-#define DLM_THREAD_MAX_ASTS 10
static int dlm_thread(void *data)
{
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index cb9e6a73bea9..152a0fc4e905 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2133,7 +2133,7 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
}
#define OCFS2_SEC_BITS 34
-#define OCFS2_SEC_SHIFT (64 - 34)
+#define OCFS2_SEC_SHIFT (64 - OCFS2_SEC_BITS)
#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1)
/* LVB only has room for 64 bits of time here so we pack it for
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 68ba354cf361..b425f0b01dce 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -91,7 +91,7 @@ enum ocfs2_replay_state {
struct ocfs2_replay_map {
unsigned int rm_slots;
enum ocfs2_replay_state rm_state;
- unsigned char rm_replay_slots[0];
+ unsigned char rm_replay_slots[];
};
static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index da65251ef815..5381020aaa9a 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -406,7 +406,7 @@ static int ocfs2_mknod(struct inode *dir,
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto roll_back;
}
if (si.enable) {
@@ -414,7 +414,7 @@ static int ocfs2_mknod(struct inode *dir,
meta_ac, data_ac);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto roll_back;
}
}
@@ -427,7 +427,7 @@ static int ocfs2_mknod(struct inode *dir,
OCFS2_I(dir)->ip_blkno);
if (status) {
mlog_errno(status);
- goto leave;
+ goto roll_back;
}
dl = dentry->d_fsdata;
@@ -437,12 +437,19 @@ static int ocfs2_mknod(struct inode *dir,
&lookup);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto roll_back;
}
insert_inode_hash(inode);
d_instantiate(dentry, inode);
status = 0;
+
+roll_back:
+ if (status < 0 && S_ISDIR(mode)) {
+ ocfs2_add_links_count(dirfe, -1);
+ drop_nlink(dir);
+ }
+
leave:
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 0db4a7ec58a2..0dd8c41bafd4 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -470,7 +470,7 @@ struct ocfs2_extent_list {
__le16 l_reserved1;
__le64 l_reserved2; /* Pad to
sizeof(ocfs2_extent_rec) */
-/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */
+/*10*/ struct ocfs2_extent_rec l_recs[]; /* Extent records */
};
/*
@@ -484,7 +484,7 @@ struct ocfs2_chain_list {
__le16 cl_count; /* Total chains in this list */
__le16 cl_next_free_rec; /* Next unused chain slot */
__le64 cl_reserved1;
-/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */
+/*10*/ struct ocfs2_chain_rec cl_recs[]; /* Chain records */
};
/*
@@ -496,7 +496,7 @@ struct ocfs2_truncate_log {
/*00*/ __le16 tl_count; /* Total records in this log */
__le16 tl_used; /* Number of records in use */
__le32 tl_reserved1;
-/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */
+/*08*/ struct ocfs2_truncate_rec tl_recs[]; /* Truncate records */
};
/*
@@ -640,7 +640,7 @@ struct ocfs2_local_alloc
__le16 la_size; /* Size of included bitmap, in bytes */
__le16 la_reserved1;
__le64 la_reserved2;
-/*10*/ __u8 la_bitmap[0];
+/*10*/ __u8 la_bitmap[];
};
/*
@@ -653,7 +653,7 @@ struct ocfs2_inline_data
* for data, starting at id_data */
__le16 id_reserved0;
__le32 id_reserved1;
- __u8 id_data[0]; /* Start of user data */
+ __u8 id_data[]; /* Start of user data */
};
/*
@@ -798,7 +798,7 @@ struct ocfs2_dx_entry_list {
* possible in de_entries */
__le16 de_num_used; /* Current number of
* de_entries entries */
- struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries
+ struct ocfs2_dx_entry de_entries[]; /* Indexed dir entries
* in a packed array of
* length de_num_used */
};
@@ -935,7 +935,7 @@ struct ocfs2_refcount_list {
__le16 rl_used; /* Current number of used records */
__le32 rl_reserved2;
__le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */
-/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */
+/*10*/ struct ocfs2_refcount_rec rl_recs[]; /* Refcount records */
};
@@ -1021,7 +1021,7 @@ struct ocfs2_xattr_header {
buckets. A block uses
xb_check and sets
this field to zero.) */
- struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
+ struct ocfs2_xattr_entry xh_entries[]; /* xattr entry list. */
};
/*
@@ -1207,7 +1207,7 @@ struct ocfs2_local_disk_dqinfo {
/* Header of one chunk of a quota file */
struct ocfs2_local_disk_chunk {
__le32 dqc_free; /* Number of free entries in the bitmap */
- __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding
+ __u8 dqc_bitmap[]; /* Bitmap of entries in the corresponding
* chunk of quota file */
};
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index ee43e51188be..cfb77f70c888 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -154,6 +154,7 @@ ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
}
static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
+__acquires(&rf->rf_lock)
{
struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
@@ -161,6 +162,7 @@ static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
}
static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
+__releases(&rf->rf_lock)
{
struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 0249e8ca1028..bf3842e34fb9 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -33,9 +33,6 @@
static DEFINE_SPINLOCK(resv_lock);
-#define OCFS2_MIN_RESV_WINDOW_BITS 8
-#define OCFS2_MAX_RESV_WINDOW_BITS 1024
-
int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
{
return (osb->osb_resv_level && osb->osb_dir_resv_level);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 8aa6a667860c..a191094694c6 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -656,8 +656,6 @@ error:
* and easier to preserve the name.
*/
-#define FS_OCFS2_NM 1
-
static struct ctl_table ocfs2_nm_table[] = {
{
.procname = "hb_ctl_path",
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 939df99d2dec..4836becb7578 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2509,9 +2509,6 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
bail:
brelse(group_bh);
-
- if (status)
- mlog_errno(status);
return status;
}
@@ -2582,8 +2579,6 @@ static int _ocfs2_free_clusters(handle_t *handle,
num_clusters);
out:
- if (status)
- mlog_errno(status);
return status;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 05dd68ade293..ac61eeaf3837 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -220,31 +220,31 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
int i, out = 0;
unsigned long flags;
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
"Device", osb->dev_str, osb->uuid_str,
osb->fs_generation, osb->vol_label);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => State: %d Flags: 0x%lX\n", "Volume",
atomic_read(&osb->vol_state), osb->osb_flags);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => Block: %lu Cluster: %d\n", "Sizes",
osb->sb->s_blocksize, osb->s_clustersize);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => Compat: 0x%X Incompat: 0x%X "
"ROcompat: 0x%X\n",
"Features", osb->s_feature_compat,
osb->s_feature_incompat, osb->s_feature_ro_compat);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
osb->s_mount_opt, osb->s_atime_quantum);
if (cconn) {
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => Stack: %s Name: %*s "
"Version: %d.%d\n", "Cluster",
(*osb->osb_cluster_stack == '\0' ?
@@ -255,7 +255,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
}
spin_lock_irqsave(&osb->dc_task_lock, flags);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => Pid: %d Count: %lu WakeSeq: %lu "
"WorkSeq: %lu\n", "DownCnvt",
(osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
@@ -264,32 +264,32 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
spin_unlock_irqrestore(&osb->dc_task_lock, flags);
spin_lock(&osb->osb_lock);
- out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
+ out += scnprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
"Recovery",
(osb->recovery_thread_task ?
task_pid_nr(osb->recovery_thread_task) : -1));
if (rm->rm_used == 0)
- out += snprintf(buf + out, len - out, " None\n");
+ out += scnprintf(buf + out, len - out, " None\n");
else {
for (i = 0; i < rm->rm_used; i++)
- out += snprintf(buf + out, len - out, " %d",
+ out += scnprintf(buf + out, len - out, " %d",
rm->rm_entries[i]);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
}
spin_unlock(&osb->osb_lock);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => Pid: %d Interval: %lu\n", "Commit",
(osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
osb->osb_commit_interval);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => State: %d TxnId: %lu NumTxns: %d\n",
"Journal", osb->journal->j_state,
osb->journal->j_trans_id,
atomic_read(&osb->journal->j_num_trans));
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => GlobalAllocs: %d LocalAllocs: %d "
"SubAllocs: %d LAWinMoves: %d SAExtends: %d\n",
"Stats",
@@ -299,7 +299,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
atomic_read(&osb->alloc_stats.moves),
atomic_read(&osb->alloc_stats.bg_extends));
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => State: %u Descriptor: %llu Size: %u bits "
"Default: %u bits\n",
"LocalAlloc", osb->local_alloc_state,
@@ -307,7 +307,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
osb->local_alloc_bits, osb->local_alloc_default_bits);
spin_lock(&osb->osb_lock);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => InodeSlot: %d StolenInodes: %d, "
"MetaSlot: %d StolenMeta: %d\n", "Steal",
osb->s_inode_steal_slot,
@@ -316,20 +316,20 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
atomic_read(&osb->s_num_meta_stolen));
spin_unlock(&osb->osb_lock);
- out += snprintf(buf + out, len - out, "OrphanScan => ");
- out += snprintf(buf + out, len - out, "Local: %u Global: %u ",
+ out += scnprintf(buf + out, len - out, "OrphanScan => ");
+ out += scnprintf(buf + out, len - out, "Local: %u Global: %u ",
os->os_count, os->os_seqno);
- out += snprintf(buf + out, len - out, " Last Scan: ");
+ out += scnprintf(buf + out, len - out, " Last Scan: ");
if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
- out += snprintf(buf + out, len - out, "Disabled\n");
+ out += scnprintf(buf + out, len - out, "Disabled\n");
else
- out += snprintf(buf + out, len - out, "%lu seconds ago\n",
+ out += scnprintf(buf + out, len - out, "%lu seconds ago\n",
(unsigned long)(ktime_get_seconds() - os->os_scantime));
- out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
+ out += scnprintf(buf + out, len - out, "%10s => %3s %10s\n",
"Slots", "Num", "RecoGen");
for (i = 0; i < osb->max_slots; ++i) {
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s %c %3d %10d\n",
" ",
(i == osb->slot_num ? '*' : ' '),
diff --git a/fs/open.c b/fs/open.c
index b69d6eed67e6..719b320ede52 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1046,8 +1046,10 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
if (flags & O_CREAT) {
op->intent |= LOOKUP_CREATE;
- if (flags & O_EXCL)
+ if (flags & O_EXCL) {
op->intent |= LOOKUP_EXCL;
+ flags |= O_NOFOLLOW;
+ }
}
if (flags & O_DIRECTORY)
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index c740159d9ad1..af375e049aae 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -346,23 +346,8 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
struct iov_iter *iter)
{
int ret;
- struct orangefs_read_options *ro;
-
orangefs_stats.reads++;
- /*
- * Remember how they set "count" in read(2) or pread(2) or whatever -
- * users can use count as a knob to control orangefs io size and later
- * we can try to help them fill as many pages as possible in readpage.
- */
- if (!iocb->ki_filp->private_data) {
- iocb->ki_filp->private_data = kmalloc(sizeof *ro, GFP_KERNEL);
- if (!iocb->ki_filp->private_data)
- return(ENOMEM);
- ro = iocb->ki_filp->private_data;
- ro->blksiz = iter->count;
- }
-
down_read(&file_inode(iocb->ki_filp)->i_rwsem);
ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
if (ret)
@@ -650,12 +635,6 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
return rc;
}
-static int orangefs_file_open(struct inode * inode, struct file *file)
-{
- file->private_data = NULL;
- return generic_file_open(inode, file);
-}
-
static int orangefs_flush(struct file *file, fl_owner_t id)
{
/*
@@ -666,19 +645,8 @@ static int orangefs_flush(struct file *file, fl_owner_t id)
* on an explicit fsync call. This duplicates historical OrangeFS
* behavior.
*/
- struct inode *inode = file->f_mapping->host;
int r;
- kfree(file->private_data);
- file->private_data = NULL;
-
- if (inode->i_state & I_DIRTY_TIME) {
- spin_lock(&inode->i_lock);
- inode->i_state &= ~I_DIRTY_TIME;
- spin_unlock(&inode->i_lock);
- mark_inode_dirty_sync(inode);
- }
-
r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX);
if (r > 0)
return 0;
@@ -694,7 +662,7 @@ const struct file_operations orangefs_file_operations = {
.lock = orangefs_lock,
.unlocked_ioctl = orangefs_ioctl,
.mmap = orangefs_file_mmap,
- .open = orangefs_file_open,
+ .open = generic_file_open,
.flush = orangefs_flush,
.release = orangefs_file_release,
.fsync = orangefs_fsync,
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 961c0fd8675a..12ae630fbed7 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -259,46 +259,19 @@ static int orangefs_readpage(struct file *file, struct page *page)
pgoff_t index; /* which page */
struct page *next_page;
char *kaddr;
- struct orangefs_read_options *ro = file->private_data;
loff_t read_size;
- loff_t roundedup;
int buffer_index = -1; /* orangefs shared memory slot */
int slot_index; /* index into slot */
int remaining;
/*
- * If they set some miniscule size for "count" in read(2)
- * (for example) then let's try to read a page, or the whole file
- * if it is smaller than a page. Once "count" goes over a page
- * then lets round up to the highest page size multiple that is
- * less than or equal to "count" and do that much orangefs IO and
- * try to fill as many pages as we can from it.
- *
- * "count" should be represented in ro->blksiz.
- *
- * inode->i_size = file size.
+ * Get up to this many bytes from Orangefs at a time and try
+ * to fill them into the page cache at once. Tests with dd made
+ * this seem like a reasonable static number, if there was
+ * interest perhaps this number could be made setable through
+ * sysfs...
*/
- if (ro) {
- if (ro->blksiz < PAGE_SIZE) {
- if (inode->i_size < PAGE_SIZE)
- read_size = inode->i_size;
- else
- read_size = PAGE_SIZE;
- } else {
- roundedup = ((PAGE_SIZE - 1) & ro->blksiz) ?
- ((ro->blksiz + PAGE_SIZE) & ~(PAGE_SIZE -1)) :
- ro->blksiz;
- if (roundedup > inode->i_size)
- read_size = inode->i_size;
- else
- read_size = roundedup;
-
- }
- } else {
- read_size = PAGE_SIZE;
- }
- if (!read_size)
- read_size = PAGE_SIZE;
+ read_size = 524288;
if (PageDirty(page))
orangefs_launder_page(page);
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index ed67f39fa7ce..e12aeb9623d6 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -239,10 +239,6 @@ struct orangefs_write_range {
kgid_t gid;
};
-struct orangefs_read_options {
- ssize_t blksiz;
-};
-
extern struct orangefs_stats orangefs_stats;
/*
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 9fc47c2e078d..9709cf22cab3 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -36,6 +36,13 @@ static int ovl_ccup_get(char *buf, const struct kernel_param *param)
module_param_call(check_copy_up, ovl_ccup_set, ovl_ccup_get, NULL, 0644);
MODULE_PARM_DESC(check_copy_up, "Obsolete; does nothing");
+static bool ovl_must_copy_xattr(const char *name)
+{
+ return !strcmp(name, XATTR_POSIX_ACL_ACCESS) ||
+ !strcmp(name, XATTR_POSIX_ACL_DEFAULT) ||
+ !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
+}
+
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
@@ -107,8 +114,13 @@ retry:
continue; /* Discard */
}
error = vfs_setxattr(new, name, value, size, 0);
- if (error)
- break;
+ if (error) {
+ if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name))
+ break;
+
+ /* Ignore failure to copy unknown xattrs */
+ error = 0;
+ }
}
kfree(value);
out:
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 8e57d5372b8f..279009dee366 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -42,7 +42,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
return err;
}
-static struct dentry *ovl_lookup_temp(struct dentry *workdir)
+struct dentry *ovl_lookup_temp(struct dentry *workdir)
{
struct dentry *temp;
char name[20];
@@ -243,6 +243,9 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
ovl_dir_modified(dentry->d_parent, false);
ovl_dentry_set_upper_alias(dentry);
+ ovl_dentry_update_reval(dentry, newdentry,
+ DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+
if (!hardlink) {
/*
* ovl_obtain_alias() can be called after ovl_create_real()
@@ -819,6 +822,28 @@ static bool ovl_pure_upper(struct dentry *dentry)
!ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry));
}
+static void ovl_drop_nlink(struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ struct dentry *alias;
+
+ /* Try to find another, hashed alias */
+ spin_lock(&inode->i_lock);
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ if (alias != dentry && !d_unhashed(alias))
+ break;
+ }
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * Changes to underlying layers may cause i_nlink to lose sync with
+ * reality. In this case prevent the link count from going to zero
+ * prematurely.
+ */
+ if (inode->i_nlink > !!alias)
+ drop_nlink(inode);
+}
+
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
int err;
@@ -856,7 +881,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
if (is_dir)
clear_nlink(dentry->d_inode);
else
- drop_nlink(dentry->d_inode);
+ ovl_drop_nlink(dentry);
}
ovl_nlink_end(dentry);
@@ -1201,7 +1226,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
if (new_is_dir)
clear_nlink(d_inode(new));
else
- drop_nlink(d_inode(new));
+ ovl_drop_nlink(new);
}
ovl_dir_modified(old->d_parent, ovl_type_origin(old) ||
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 6f54d70cef27..475c61f53f0f 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -308,29 +308,35 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb,
ovl_set_flag(OVL_UPPERDATA, inode);
dentry = d_find_any_alias(inode);
- if (!dentry) {
- dentry = d_alloc_anon(inode->i_sb);
- if (!dentry)
- goto nomem;
- oe = ovl_alloc_entry(lower ? 1 : 0);
- if (!oe)
- goto nomem;
-
- if (lower) {
- oe->lowerstack->dentry = dget(lower);
- oe->lowerstack->layer = lowerpath->layer;
- }
- dentry->d_fsdata = oe;
- if (upper_alias)
- ovl_dentry_set_upper_alias(dentry);
+ if (dentry)
+ goto out_iput;
+
+ dentry = d_alloc_anon(inode->i_sb);
+ if (unlikely(!dentry))
+ goto nomem;
+ oe = ovl_alloc_entry(lower ? 1 : 0);
+ if (!oe)
+ goto nomem;
+
+ if (lower) {
+ oe->lowerstack->dentry = dget(lower);
+ oe->lowerstack->layer = lowerpath->layer;
}
+ dentry->d_fsdata = oe;
+ if (upper_alias)
+ ovl_dentry_set_upper_alias(dentry);
+
+ ovl_dentry_update_reval(dentry, upper,
+ DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
return d_instantiate_anon(dentry, inode);
nomem:
- iput(inode);
dput(dentry);
- return ERR_PTR(-ENOMEM);
+ dentry = ERR_PTR(-ENOMEM);
+out_iput:
+ iput(inode);
+ return dentry;
}
/* Get the upper or lower dentry in stach whose on layer @idx */
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 79e8994e3bc1..b0d42ece4d7c 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -79,6 +79,7 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
{
bool samefs = ovl_same_fs(dentry->d_sb);
unsigned int xinobits = ovl_xino_bits(dentry->d_sb);
+ unsigned int xinoshift = 64 - xinobits;
if (samefs) {
/*
@@ -89,22 +90,22 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
stat->dev = dentry->d_sb->s_dev;
return 0;
} else if (xinobits) {
- unsigned int shift = 64 - xinobits;
/*
* All inode numbers of underlying fs should not be using the
* high xinobits, so we use high xinobits to partition the
* overlay st_ino address space. The high bits holds the fsid
- * (upper fsid is 0). This way overlay inode numbers are unique
- * and all inodes use overlay st_dev. Inode numbers are also
- * persistent for a given layer configuration.
+ * (upper fsid is 0). The lowest xinobit is reserved for mapping
+ * the non-peresistent inode numbers range in case of overflow.
+ * This way all overlay inode numbers are unique and use the
+ * overlay st_dev.
*/
- if (stat->ino >> shift) {
- pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
- dentry, stat->ino, xinobits);
- } else {
- stat->ino |= ((u64)fsid) << shift;
+ if (likely(!(stat->ino >> xinoshift))) {
+ stat->ino |= ((u64)fsid) << (xinoshift + 1);
stat->dev = dentry->d_sb->s_dev;
return 0;
+ } else if (ovl_xino_warn(dentry->d_sb)) {
+ pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
+ dentry, stat->ino, xinobits);
}
}
@@ -504,7 +505,7 @@ static const struct address_space_operations ovl_aops = {
/*
* It is possible to stack overlayfs instance on top of another
- * overlayfs instance as lower layer. We need to annonate the
+ * overlayfs instance as lower layer. We need to annotate the
* stackable i_mutex locks according to stack level of the super
* block instance. An overlayfs instance can never be in stack
* depth 0 (there is always a real fs below it). An overlayfs
@@ -561,27 +562,73 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
#endif
}
-static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
- unsigned long ino, int fsid)
+static void ovl_next_ino(struct inode *inode)
+{
+ struct ovl_fs *ofs = inode->i_sb->s_fs_info;
+
+ inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
+ if (unlikely(!inode->i_ino))
+ inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
+}
+
+static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid)
{
int xinobits = ovl_xino_bits(inode->i_sb);
+ unsigned int xinoshift = 64 - xinobits;
/*
* When d_ino is consistent with st_ino (samefs or i_ino has enough
* bits to encode layer), set the same value used for st_ino to i_ino,
* so inode number exposed via /proc/locks and a like will be
* consistent with d_ino and st_ino values. An i_ino value inconsistent
- * with d_ino also causes nfsd readdirplus to fail. When called from
- * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real
- * upper inode i_ino on ovl_inode_init() or ovl_inode_update().
+ * with d_ino also causes nfsd readdirplus to fail.
*/
- if (ovl_same_dev(inode->i_sb)) {
- inode->i_ino = ino;
- if (xinobits && fsid && !(ino >> (64 - xinobits)))
- inode->i_ino |= (unsigned long)fsid << (64 - xinobits);
- } else {
- inode->i_ino = get_next_ino();
+ inode->i_ino = ino;
+ if (ovl_same_fs(inode->i_sb)) {
+ return;
+ } else if (xinobits && likely(!(ino >> xinoshift))) {
+ inode->i_ino |= (unsigned long)fsid << (xinoshift + 1);
+ return;
+ }
+
+ /*
+ * For directory inodes on non-samefs with xino disabled or xino
+ * overflow, we allocate a non-persistent inode number, to be used for
+ * resolving st_ino collisions in ovl_map_dev_ino().
+ *
+ * To avoid ino collision with legitimate xino values from upper
+ * layer (fsid 0), use the lowest xinobit to map the non
+ * persistent inode numbers to the unified st_ino address space.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ ovl_next_ino(inode);
+ if (xinobits) {
+ inode->i_ino &= ~0UL >> xinobits;
+ inode->i_ino |= 1UL << xinoshift;
+ }
}
+}
+
+void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
+ unsigned long ino, int fsid)
+{
+ struct inode *realinode;
+
+ if (oip->upperdentry)
+ OVL_I(inode)->__upperdentry = oip->upperdentry;
+ if (oip->lowerpath && oip->lowerpath->dentry)
+ OVL_I(inode)->lower = igrab(d_inode(oip->lowerpath->dentry));
+ if (oip->lowerdata)
+ OVL_I(inode)->lowerdata = igrab(d_inode(oip->lowerdata));
+
+ realinode = ovl_inode_real(inode);
+ ovl_copyattr(realinode, inode);
+ ovl_copyflags(realinode, inode);
+ ovl_map_ino(inode, ino, fsid);
+}
+
+static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
inode->i_mode = mode;
inode->i_flags |= S_NOCMTIME;
#ifdef CONFIG_FS_POSIX_ACL
@@ -719,7 +766,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
inode = new_inode(sb);
if (inode)
- ovl_fill_inode(inode, mode, rdev, 0, 0);
+ ovl_fill_inode(inode, mode, rdev);
return inode;
}
@@ -891,7 +938,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
oip->index);
- int fsid = bylower ? oip->lowerpath->layer->fsid : 0;
+ int fsid = bylower ? lowerpath->layer->fsid : 0;
bool is_dir, metacopy = false;
unsigned long ino = 0;
int err = oip->newinode ? -EEXIST : -ENOMEM;
@@ -941,9 +988,11 @@ struct inode *ovl_get_inode(struct super_block *sb,
err = -ENOMEM;
goto out_err;
}
+ ino = realinode->i_ino;
+ fsid = lowerpath->layer->fsid;
}
- ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid);
- ovl_inode_init(inode, upperdentry, lowerdentry, oip->lowerdata);
+ ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
+ ovl_inode_init(inode, oip, ino, fsid);
if (upperdentry && ovl_is_impuredir(upperdentry))
ovl_set_flag(OVL_IMPURE, inode);
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index ed9e129fae04..0db23baf98e7 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -845,7 +845,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (err)
goto out;
- if (upperdentry && unlikely(ovl_dentry_remote(upperdentry))) {
+ if (upperdentry && upperdentry->d_flags & DCACHE_OP_REAL) {
dput(upperdentry);
err = -EREMOTE;
goto out;
@@ -1076,6 +1076,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
goto out_free_oe;
}
+ ovl_dentry_update_reval(dentry, upperdentry,
+ DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+
revert_creds(old_cred);
if (origin_path) {
dput(origin_path->dentry);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 3d3f2b8bdae5..e6f3670146ed 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -48,6 +48,12 @@ enum ovl_entry_flag {
OVL_E_CONNECTED,
};
+enum {
+ OVL_XINO_OFF,
+ OVL_XINO_AUTO,
+ OVL_XINO_ON,
+};
+
/*
* The tuple (fh,uuid) is a universal unique identifier for a copy up origin,
* where:
@@ -87,7 +93,7 @@ struct ovl_fb {
u8 flags; /* OVL_FH_FLAG_* */
u8 type; /* fid_type of fid */
uuid_t uuid; /* uuid of filesystem */
- u32 fid[0]; /* file identifier should be 32bit aligned in-memory */
+ u32 fid[]; /* file identifier should be 32bit aligned in-memory */
} __packed;
/* In-memory and on-wire format for overlay file handle */
@@ -230,6 +236,8 @@ bool ovl_index_all(struct super_block *sb);
bool ovl_verify_lower(struct super_block *sb);
struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
bool ovl_dentry_remote(struct dentry *dentry);
+void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *upperdentry,
+ unsigned int mask);
bool ovl_dentry_weird(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
@@ -264,8 +272,6 @@ void ovl_set_upperdata(struct inode *inode);
bool ovl_redirect_dir(struct super_block *sb);
const char *ovl_dentry_get_redirect(struct dentry *dentry);
void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
-void ovl_inode_init(struct inode *inode, struct dentry *upperdentry,
- struct dentry *lowerdentry, struct dentry *lowerdata);
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
void ovl_dir_modified(struct dentry *dentry, bool impurity);
u64 ovl_dentry_version_get(struct dentry *dentry);
@@ -301,6 +307,16 @@ static inline bool ovl_is_impuredir(struct dentry *dentry)
return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE);
}
+/*
+ * With xino=auto, we do best effort to keep all inodes on same st_dev and
+ * d_ino consistent with st_ino.
+ * With xino=on, we do the same effort but we warn if we failed.
+ */
+static inline bool ovl_xino_warn(struct super_block *sb)
+{
+ return OVL_FS(sb)->config.xino == OVL_XINO_ON;
+}
+
/* All layers on same fs? */
static inline bool ovl_same_fs(struct super_block *sb)
{
@@ -410,6 +426,8 @@ struct ovl_inode_params {
char *redirect;
struct dentry *lowerdata;
};
+void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
+ unsigned long ino, int fsid);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
bool is_upper);
@@ -451,6 +469,7 @@ struct ovl_cattr {
struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct ovl_cattr *attr);
int ovl_cleanup(struct inode *dir, struct dentry *dentry);
+struct dentry *ovl_lookup_temp(struct dentry *workdir);
struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr);
/* file.c */
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 89015ea822e7..5762d802fe01 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -75,6 +75,8 @@ struct ovl_fs {
struct inode *indexdir_trap;
/* -1: disabled, 0: same fs, 1..32: number of unused ino bits */
int xino_mode;
+ /* For allocation of non-persistent inode numbers */
+ atomic_long_t last_ino;
};
static inline struct ovl_fs *OVL_FS(struct super_block *sb)
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 40ac9ce2465a..e452ff7d583d 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -438,15 +438,23 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
/* Map inode number to lower fs unique range */
static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
- const char *name, int namelen)
+ const char *name, int namelen, bool warn)
{
- if (ino >> (64 - xinobits)) {
- pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
- namelen, name, ino, xinobits);
+ unsigned int xinoshift = 64 - xinobits;
+
+ if (unlikely(ino >> xinoshift)) {
+ if (warn) {
+ pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
+ namelen, name, ino, xinobits);
+ }
return ino;
}
- return ino | ((u64)fsid) << (64 - xinobits);
+ /*
+ * The lowest xinobit is reserved for mapping the non-peresistent inode
+ * numbers range, but this range is only exposed via st_ino, not here.
+ */
+ return ino | ((u64)fsid) << (xinoshift + 1);
}
/*
@@ -515,7 +523,8 @@ get:
} else if (xinobits && !OVL_TYPE_UPPER(type)) {
ino = ovl_remap_lower_ino(ino, xinobits,
ovl_layer_lower(this)->fsid,
- p->name, p->len);
+ p->name, p->len,
+ ovl_xino_warn(dir->d_sb));
}
out:
@@ -645,6 +654,7 @@ struct ovl_readdir_translate {
u64 parent_ino;
int fsid;
int xinobits;
+ bool xinowarn;
};
static int ovl_fill_real(struct dir_context *ctx, const char *name,
@@ -665,7 +675,7 @@ static int ovl_fill_real(struct dir_context *ctx, const char *name,
ino = p->ino;
} else if (rdt->xinobits) {
ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid,
- name, namelen);
+ name, namelen, rdt->xinowarn);
}
return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
@@ -696,6 +706,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
.ctx.actor = ovl_fill_real,
.orig_ctx = ctx,
.xinobits = ovl_xino_bits(dir->d_sb),
+ .xinowarn = ovl_xino_warn(dir->d_sb),
};
if (rdt.xinobits && lower_layer)
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index ac967f1cb6e5..732ad5495c92 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -113,53 +113,54 @@ bug:
return dentry;
}
-static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak)
{
- struct ovl_entry *oe = dentry->d_fsdata;
- unsigned int i;
int ret = 1;
- for (i = 0; i < oe->numlower; i++) {
- struct dentry *d = oe->lowerstack[i].dentry;
-
- if (d->d_flags & DCACHE_OP_REVALIDATE) {
- ret = d->d_op->d_revalidate(d, flags);
- if (ret < 0)
- return ret;
- if (!ret) {
- if (!(flags & LOOKUP_RCU))
- d_invalidate(d);
- return -ESTALE;
- }
+ if (weak) {
+ if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE)
+ ret = d->d_op->d_weak_revalidate(d, flags);
+ } else if (d->d_flags & DCACHE_OP_REVALIDATE) {
+ ret = d->d_op->d_revalidate(d, flags);
+ if (!ret) {
+ if (!(flags & LOOKUP_RCU))
+ d_invalidate(d);
+ ret = -ESTALE;
}
}
- return 1;
+ return ret;
}
-static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
+static int ovl_dentry_revalidate_common(struct dentry *dentry,
+ unsigned int flags, bool weak)
{
struct ovl_entry *oe = dentry->d_fsdata;
+ struct dentry *upper;
unsigned int i;
int ret = 1;
- for (i = 0; i < oe->numlower; i++) {
- struct dentry *d = oe->lowerstack[i].dentry;
+ upper = ovl_dentry_upper(dentry);
+ if (upper)
+ ret = ovl_revalidate_real(upper, flags, weak);
- if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) {
- ret = d->d_op->d_weak_revalidate(d, flags);
- if (ret <= 0)
- break;
- }
+ for (i = 0; ret > 0 && i < oe->numlower; i++) {
+ ret = ovl_revalidate_real(oe->lowerstack[i].dentry, flags,
+ weak);
}
return ret;
}
-static const struct dentry_operations ovl_dentry_operations = {
- .d_release = ovl_dentry_release,
- .d_real = ovl_d_real,
-};
+static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return ovl_dentry_revalidate_common(dentry, flags, false);
+}
+
+static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return ovl_dentry_revalidate_common(dentry, flags, true);
+}
-static const struct dentry_operations ovl_reval_dentry_operations = {
+static const struct dentry_operations ovl_dentry_operations = {
.d_release = ovl_dentry_release,
.d_real = ovl_d_real,
.d_revalidate = ovl_dentry_revalidate,
@@ -316,12 +317,6 @@ static const char *ovl_redirect_mode_def(void)
return ovl_redirect_dir_def ? "on" : "off";
}
-enum {
- OVL_XINO_OFF,
- OVL_XINO_AUTO,
- OVL_XINO_ON,
-};
-
static const char * const ovl_xino_str[] = {
"off",
"auto",
@@ -751,13 +746,12 @@ static int ovl_mount_dir(const char *name, struct path *path)
ovl_unescape(tmp);
err = ovl_mount_dir_noesc(tmp, path);
- if (!err)
- if (ovl_dentry_remote(path->dentry)) {
- pr_err("filesystem on '%s' not supported as upperdir\n",
- tmp);
- path_put_init(path);
- err = -EINVAL;
- }
+ if (!err && path->dentry->d_flags & DCACHE_OP_REAL) {
+ pr_err("filesystem on '%s' not supported as upperdir\n",
+ tmp);
+ path_put_init(path);
+ err = -EINVAL;
+ }
kfree(tmp);
}
return err;
@@ -778,7 +772,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs,
}
static int ovl_lower_dir(const char *name, struct path *path,
- struct ovl_fs *ofs, int *stack_depth, bool *remote)
+ struct ovl_fs *ofs, int *stack_depth)
{
int fh_type;
int err;
@@ -793,9 +787,6 @@ static int ovl_lower_dir(const char *name, struct path *path,
*stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
- if (ovl_dentry_remote(path->dentry))
- *remote = true;
-
/*
* The inodes index feature and NFS export need to encode and decode
* file handles, so they require that all layers support them.
@@ -1074,11 +1065,73 @@ out:
return err;
}
+/*
+ * Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and
+ * negative values if error is encountered.
+ */
+static int ovl_check_rename_whiteout(struct dentry *workdir)
+{
+ struct inode *dir = d_inode(workdir);
+ struct dentry *temp;
+ struct dentry *dest;
+ struct dentry *whiteout;
+ struct name_snapshot name;
+ int err;
+
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+
+ temp = ovl_create_temp(workdir, OVL_CATTR(S_IFREG | 0));
+ err = PTR_ERR(temp);
+ if (IS_ERR(temp))
+ goto out_unlock;
+
+ dest = ovl_lookup_temp(workdir);
+ err = PTR_ERR(dest);
+ if (IS_ERR(dest)) {
+ dput(temp);
+ goto out_unlock;
+ }
+
+ /* Name is inline and stable - using snapshot as a copy helper */
+ take_dentry_name_snapshot(&name, temp);
+ err = ovl_do_rename(dir, temp, dir, dest, RENAME_WHITEOUT);
+ if (err) {
+ if (err == -EINVAL)
+ err = 0;
+ goto cleanup_temp;
+ }
+
+ whiteout = lookup_one_len(name.name.name, workdir, name.name.len);
+ err = PTR_ERR(whiteout);
+ if (IS_ERR(whiteout))
+ goto cleanup_temp;
+
+ err = ovl_is_whiteout(whiteout);
+
+ /* Best effort cleanup of whiteout and temp file */
+ if (err)
+ ovl_cleanup(dir, whiteout);
+ dput(whiteout);
+
+cleanup_temp:
+ ovl_cleanup(dir, temp);
+ release_dentry_name_snapshot(&name);
+ dput(temp);
+ dput(dest);
+
+out_unlock:
+ inode_unlock(dir);
+
+ return err;
+}
+
static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
struct path *workpath)
{
struct vfsmount *mnt = ofs->upper_mnt;
struct dentry *temp;
+ bool rename_whiteout;
+ bool d_type;
int fh_type;
int err;
@@ -1104,11 +1157,8 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
if (err < 0)
goto out;
- /*
- * We allowed this configuration and don't want to break users over
- * kernel upgrade. So warn instead of erroring out.
- */
- if (!err)
+ d_type = err;
+ if (!d_type)
pr_warn("upper fs needs to support d_type.\n");
/* Check if upper/work fs supports O_TMPFILE */
@@ -1119,6 +1169,16 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
else
pr_warn("upper fs does not support tmpfile.\n");
+
+ /* Check if upper/work fs supports RENAME_WHITEOUT */
+ err = ovl_check_rename_whiteout(ofs->workdir);
+ if (err < 0)
+ goto out;
+
+ rename_whiteout = err;
+ if (!rename_whiteout)
+ pr_warn("upper fs does not support RENAME_WHITEOUT.\n");
+
/*
* Check if upper/work fs supports trusted.overlay.* xattr
*/
@@ -1133,6 +1193,18 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE);
}
+ /*
+ * We allowed sub-optimal upper fs configuration and don't want to break
+ * users over kernel upgrade, but we never allowed remote upper fs, so
+ * we can enforce strict requirements for remote upper fs.
+ */
+ if (ovl_dentry_remote(ofs->workdir) &&
+ (!d_type || !rename_whiteout || ofs->noxattr)) {
+ pr_err("upper fs missing required features.\n");
+ err = -EINVAL;
+ goto out;
+ }
+
/* Check if upper/work fs supports file handles */
fh_type = ovl_can_decode_fh(ofs->workdir->d_sb);
if (ofs->config.index && !fh_type) {
@@ -1401,11 +1473,12 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
/*
* When all layers on same fs, overlay can use real inode numbers.
- * With mount option "xino=on", mounter declares that there are enough
- * free high bits in underlying fs to hold the unique fsid.
+ * With mount option "xino=<on|auto>", mounter declares that there are
+ * enough free high bits in underlying fs to hold the unique fsid.
* If overlayfs does encounter underlying inodes using the high xino
* bits reserved for fsid, it emits a warning and uses the original
- * inode number.
+ * inode number or a non persistent inode number allocated from a
+ * dedicated range.
*/
if (ofs->numfs - !ofs->upper_mnt == 1) {
if (ofs->config.xino == OVL_XINO_ON)
@@ -1413,14 +1486,16 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
ofs->xino_mode = 0;
} else if (ofs->config.xino == OVL_XINO_OFF) {
ofs->xino_mode = -1;
- } else if (ofs->config.xino == OVL_XINO_ON && ofs->xino_mode < 0) {
+ } else if (ofs->xino_mode < 0) {
/*
* This is a roundup of number of bits needed for encoding
- * fsid, where fsid 0 is reserved for upper fs even with
- * lower only overlay.
+ * fsid, where fsid 0 is reserved for upper fs (even with
+ * lower only overlay) +1 extra bit is reserved for the non
+ * persistent inode number range that is used for resolving
+ * xino lower bits overflow.
*/
- BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31);
- ofs->xino_mode = ilog2(ofs->numfs - 1) + 1;
+ BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 30);
+ ofs->xino_mode = ilog2(ofs->numfs - 1) + 2;
}
if (ofs->xino_mode > 0) {
@@ -1440,7 +1515,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
char *lowertmp, *lower;
struct path *stack = NULL;
unsigned int stacklen, numlower = 0, i;
- bool remote = false;
struct ovl_entry *oe;
err = -ENOMEM;
@@ -1472,7 +1546,7 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
lower = lowertmp;
for (numlower = 0; numlower < stacklen; numlower++) {
err = ovl_lower_dir(lower, &stack[numlower], ofs,
- &sb->s_stack_depth, &remote);
+ &sb->s_stack_depth);
if (err)
goto out_err;
@@ -1500,11 +1574,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
oe->lowerstack[i].layer = &ofs->layers[i+1];
}
- if (remote)
- sb->s_d_op = &ovl_reval_dentry_operations;
- else
- sb->s_d_op = &ovl_dentry_operations;
-
out:
for (i = 0; i < numlower; i++)
path_put(&stack[i]);
@@ -1589,6 +1658,44 @@ static int ovl_check_overlapping_layers(struct super_block *sb,
return 0;
}
+static struct dentry *ovl_get_root(struct super_block *sb,
+ struct dentry *upperdentry,
+ struct ovl_entry *oe)
+{
+ struct dentry *root;
+ struct ovl_path *lowerpath = &oe->lowerstack[0];
+ unsigned long ino = d_inode(lowerpath->dentry)->i_ino;
+ int fsid = lowerpath->layer->fsid;
+ struct ovl_inode_params oip = {
+ .upperdentry = upperdentry,
+ .lowerpath = lowerpath,
+ };
+
+ root = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
+ if (!root)
+ return NULL;
+
+ root->d_fsdata = oe;
+
+ if (upperdentry) {
+ /* Root inode uses upper st_ino/i_ino */
+ ino = d_inode(upperdentry)->i_ino;
+ fsid = 0;
+ ovl_dentry_set_upper_alias(root);
+ if (ovl_is_impuredir(upperdentry))
+ ovl_set_flag(OVL_IMPURE, d_inode(root));
+ }
+
+ /* Root is always merge -> can have whiteouts */
+ ovl_set_flag(OVL_WHITEOUTS, d_inode(root));
+ ovl_dentry_set_flag(OVL_E_CONNECTED, root);
+ ovl_set_upperdata(d_inode(root));
+ ovl_inode_init(d_inode(root), &oip, ino, fsid);
+ ovl_dentry_update_reval(root, upperdentry, DCACHE_OP_WEAK_REVALIDATE);
+
+ return root;
+}
+
static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
struct path upperpath = { };
@@ -1598,6 +1705,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
struct cred *cred;
int err;
+ sb->s_d_op = &ovl_dentry_operations;
+
err = -ENOMEM;
ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
if (!ofs)
@@ -1624,6 +1733,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_stack_depth = 0;
sb->s_maxbytes = MAX_LFS_FILESIZE;
+ atomic_long_set(&ofs->last_ino, 1);
/* Assume underlaying fs uses 32bit inodes unless proven otherwise */
if (ofs->config.xino != OVL_XINO_OFF) {
ofs->xino_mode = BITS_PER_LONG - 32;
@@ -1710,25 +1820,11 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags |= SB_POSIXACL;
err = -ENOMEM;
- root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
+ root_dentry = ovl_get_root(sb, upperpath.dentry, oe);
if (!root_dentry)
goto out_free_oe;
- root_dentry->d_fsdata = oe;
-
mntput(upperpath.mnt);
- if (upperpath.dentry) {
- ovl_dentry_set_upper_alias(root_dentry);
- if (ovl_is_impuredir(upperpath.dentry))
- ovl_set_flag(OVL_IMPURE, d_inode(root_dentry));
- }
-
- /* Root is always merge -> can have whiteouts */
- ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry));
- ovl_dentry_set_flag(OVL_E_CONNECTED, root_dentry);
- ovl_set_upperdata(d_inode(root_dentry));
- ovl_inode_init(d_inode(root_dentry), upperpath.dentry,
- ovl_dentry_lower(root_dentry), NULL);
sb->s_root = root_dentry;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 042f7eb4f7f4..36b60788ee47 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -93,8 +93,24 @@ struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
bool ovl_dentry_remote(struct dentry *dentry)
{
return dentry->d_flags &
- (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE |
- DCACHE_OP_REAL);
+ (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+}
+
+void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *upperdentry,
+ unsigned int mask)
+{
+ struct ovl_entry *oe = OVL_E(dentry);
+ unsigned int i, flags = 0;
+
+ if (upperdentry)
+ flags |= upperdentry->d_flags;
+ for (i = 0; i < oe->numlower; i++)
+ flags |= oe->lowerstack[i].dentry->d_flags;
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~mask;
+ dentry->d_flags |= flags & mask;
+ spin_unlock(&dentry->d_lock);
}
bool ovl_dentry_weird(struct dentry *dentry)
@@ -386,24 +402,6 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect)
oi->redirect = redirect;
}
-void ovl_inode_init(struct inode *inode, struct dentry *upperdentry,
- struct dentry *lowerdentry, struct dentry *lowerdata)
-{
- struct inode *realinode = d_inode(upperdentry ?: lowerdentry);
-
- if (upperdentry)
- OVL_I(inode)->__upperdentry = upperdentry;
- if (lowerdentry)
- OVL_I(inode)->lower = igrab(d_inode(lowerdentry));
- if (lowerdata)
- OVL_I(inode)->lowerdata = igrab(d_inode(lowerdata));
-
- ovl_copyattr(realinode, inode);
- ovl_copyflags(realinode, inode);
- if (!inode->i_ino)
- inode->i_ino = realinode->i_ino;
-}
-
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
{
struct inode *upperinode = d_inode(upperdentry);
@@ -416,8 +414,6 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
smp_wmb();
OVL_I(inode)->__upperdentry = upperdentry;
if (inode_unhashed(inode)) {
- if (!inode->i_ino)
- inode->i_ino = upperinode->i_ino;
inode->i_private = upperinode;
__insert_inode_hash(inode, (unsigned long) upperinode);
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 2144507447c5..16fb72e9abf7 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -146,7 +146,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
struct page *page = buf->page;
if (page_count(page) == 1) {
- memcg_kmem_uncharge(page, 0);
+ memcg_kmem_uncharge_page(page, 0);
__SetPageLocked(page);
return 0;
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 5efaf3708ec6..8e16f14bb05a 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -635,28 +635,35 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
struct mm_struct *mm = get_task_mm(task);
if (mm) {
+ unsigned long size;
+ unsigned long resident = 0;
+ unsigned long shared = 0;
+ unsigned long text = 0;
+ unsigned long data = 0;
+
size = task_statm(mm, &shared, &text, &data, &resident);
mmput(mm);
- }
- /*
- * For quick read, open code by putting numbers directly
- * expected format is
- * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
- * size, resident, shared, text, data);
- */
- seq_put_decimal_ull(m, "", size);
- seq_put_decimal_ull(m, " ", resident);
- seq_put_decimal_ull(m, " ", shared);
- seq_put_decimal_ull(m, " ", text);
- seq_put_decimal_ull(m, " ", 0);
- seq_put_decimal_ull(m, " ", data);
- seq_put_decimal_ull(m, " ", 0);
- seq_putc(m, '\n');
+ /*
+ * For quick read, open code by putting numbers directly
+ * expected format is
+ * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
+ * size, resident, shared, text, data);
+ */
+ seq_put_decimal_ull(m, "", size);
+ seq_put_decimal_ull(m, " ", resident);
+ seq_put_decimal_ull(m, " ", shared);
+ seq_put_decimal_ull(m, " ", text);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ull(m, " ", data);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_putc(m, '\n');
+ } else {
+ seq_write(m, "0 0 0 0 0 0 0\n", 14);
+ }
return 0;
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7c64272b0fa..6042b646ab27 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -405,11 +405,11 @@ print0:
static int lock_trace(struct task_struct *task)
{
- int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ int err = mutex_lock_killable(&task->signal->exec_update_mutex);
if (err)
return err;
if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
- mutex_unlock(&task->signal->cred_guard_mutex);
+ mutex_unlock(&task->signal->exec_update_mutex);
return -EPERM;
}
return 0;
@@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task)
static void unlock_trace(struct task_struct *task)
{
- mutex_unlock(&task->signal->cred_guard_mutex);
+ mutex_unlock(&task->signal->exec_update_mutex);
}
#ifdef CONFIG_STACKTRACE
@@ -1834,11 +1834,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
*rgid = gid;
}
+void proc_pid_evict_inode(struct proc_inode *ei)
+{
+ struct pid *pid = ei->pid;
+
+ if (S_ISDIR(ei->vfs_inode.i_mode)) {
+ spin_lock(&pid->lock);
+ hlist_del_init_rcu(&ei->sibling_inodes);
+ spin_unlock(&pid->lock);
+ }
+
+ put_pid(pid);
+}
+
struct inode *proc_pid_make_inode(struct super_block * sb,
struct task_struct *task, umode_t mode)
{
struct inode * inode;
struct proc_inode *ei;
+ struct pid *pid;
/* We need a new inode */
@@ -1856,10 +1870,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
/*
* grab the reference to task.
*/
- ei->pid = get_task_pid(task, PIDTYPE_PID);
- if (!ei->pid)
+ pid = get_task_pid(task, PIDTYPE_PID);
+ if (!pid)
goto out_unlock;
+ /* Let the pid remember us for quick removal */
+ ei->pid = pid;
+ if (S_ISDIR(mode)) {
+ spin_lock(&pid->lock);
+ hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+ spin_unlock(&pid->lock);
+ }
+
task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
security_task_to_inode(task, inode);
@@ -2861,7 +2883,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
unsigned long flags;
int result;
- result = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ result = mutex_lock_killable(&task->signal->exec_update_mutex);
if (result)
return result;
@@ -2897,7 +2919,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
result = 0;
out_unlock:
- mutex_unlock(&task->signal->cred_guard_mutex);
+ mutex_unlock(&task->signal->exec_update_mutex);
return result;
}
@@ -3230,90 +3252,29 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
.permission = proc_pid_permission,
};
-static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
-{
- struct dentry *dentry, *leader, *dir;
- char buf[10 + 1];
- struct qstr name;
-
- name.name = buf;
- name.len = snprintf(buf, sizeof(buf), "%u", pid);
- /* no ->d_hash() rejects on procfs */
- dentry = d_hash_and_lookup(mnt->mnt_root, &name);
- if (dentry) {
- d_invalidate(dentry);
- dput(dentry);
- }
-
- if (pid == tgid)
- return;
-
- name.name = buf;
- name.len = snprintf(buf, sizeof(buf), "%u", tgid);
- leader = d_hash_and_lookup(mnt->mnt_root, &name);
- if (!leader)
- goto out;
-
- name.name = "task";
- name.len = strlen(name.name);
- dir = d_hash_and_lookup(leader, &name);
- if (!dir)
- goto out_put_leader;
-
- name.name = buf;
- name.len = snprintf(buf, sizeof(buf), "%u", pid);
- dentry = d_hash_and_lookup(dir, &name);
- if (dentry) {
- d_invalidate(dentry);
- dput(dentry);
- }
-
- dput(dir);
-out_put_leader:
- dput(leader);
-out:
- return;
-}
-
/**
- * proc_flush_task - Remove dcache entries for @task from the /proc dcache.
- * @task: task that should be flushed.
+ * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache.
+ * @pid: pid that should be flushed.
*
- * When flushing dentries from proc, one needs to flush them from global
- * proc (proc_mnt) and from all the namespaces' procs this task was seen
- * in. This call is supposed to do all of this job.
- *
- * Looks in the dcache for
- * /proc/@pid
- * /proc/@tgid/task/@pid
- * if either directory is present flushes it and all of it'ts children
- * from the dcache.
+ * This function walks a list of inodes (that belong to any proc
+ * filesystem) that are attached to the pid and flushes them from
+ * the dentry cache.
*
* It is safe and reasonable to cache /proc entries for a task until
* that task exits. After that they just clog up the dcache with
* useless entries, possibly causing useful dcache entries to be
- * flushed instead. This routine is proved to flush those useless
- * dcache entries at process exit time.
+ * flushed instead. This routine is provided to flush those useless
+ * dcache entries when a process is reaped.
*
* NOTE: This routine is just an optimization so it does not guarantee
- * that no dcache entries will exist at process exit time it
- * just makes it very unlikely that any will persist.
+ * that no dcache entries will exist after a process is reaped
+ * it just makes it very unlikely that any will persist.
*/
-void proc_flush_task(struct task_struct *task)
+void proc_flush_pid(struct pid *pid)
{
- int i;
- struct pid *pid, *tgid;
- struct upid *upid;
-
- pid = task_pid(task);
- tgid = task_tgid(task);
-
- for (i = 0; i <= pid->level; i++) {
- upid = &pid->numbers[i];
- proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
- tgid->numbers[i].nr);
- }
+ proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
+ put_pid(pid);
}
static struct dentry *proc_pid_instantiate(struct dentry * dentry,
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index c1dea9b8222e..d0989a443c77 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -17,6 +17,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file)
}
static const struct proc_ops cpuinfo_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = cpuinfo_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 3faed94e4b65..4ed6dabdf6ff 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -531,6 +531,12 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
return p;
}
+static inline void pde_set_flags(struct proc_dir_entry *pde)
+{
+ if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
+ pde->flags |= PROC_ENTRY_PERMANENT;
+}
+
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct proc_ops *proc_ops, void *data)
@@ -541,6 +547,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
if (!p)
return NULL;
p->proc_ops = proc_ops;
+ pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_data);
@@ -572,6 +579,7 @@ static int proc_seq_release(struct inode *inode, struct file *file)
}
static const struct proc_ops proc_seq_ops = {
+ /* not permanent -- can call into arbitrary seq_operations */
.proc_open = proc_seq_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
@@ -602,6 +610,7 @@ static int proc_single_open(struct inode *inode, struct file *file)
}
static const struct proc_ops proc_single_ops = {
+ /* not permanent -- can call into arbitrary ->single_show */
.proc_open = proc_single_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
@@ -662,9 +671,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
de = pde_subdir_find(parent, fn, len);
if (de) {
- rb_erase(&de->subdir_node, &parent->subdir);
- if (S_ISDIR(de->mode)) {
- parent->nlink--;
+ if (unlikely(pde_is_permanent(de))) {
+ WARN(1, "removing permanent /proc entry '%s'", de->name);
+ de = NULL;
+ } else {
+ rb_erase(&de->subdir_node, &parent->subdir);
+ if (S_ISDIR(de->mode))
+ parent->nlink--;
}
}
write_unlock(&proc_subdir_lock);
@@ -700,12 +713,24 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
write_unlock(&proc_subdir_lock);
return -ENOENT;
}
+ if (unlikely(pde_is_permanent(root))) {
+ write_unlock(&proc_subdir_lock);
+ WARN(1, "removing permanent /proc entry '%s/%s'",
+ root->parent->name, root->name);
+ return -EINVAL;
+ }
rb_erase(&root->subdir_node, &parent->subdir);
de = root;
while (1) {
next = pde_subdir_first(de);
if (next) {
+ if (unlikely(pde_is_permanent(root))) {
+ write_unlock(&proc_subdir_lock);
+ WARN(1, "removing permanent /proc entry '%s/%s'",
+ next->parent->name, next->name);
+ return -EINVAL;
+ }
rb_erase(&next->subdir_node, &de->subdir);
de = next;
continue;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 6da18316d209..fb4cace9ea41 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -33,21 +33,27 @@ static void proc_evict_inode(struct inode *inode)
{
struct proc_dir_entry *de;
struct ctl_table_header *head;
+ struct proc_inode *ei = PROC_I(inode);
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
/* Stop tracking associated processes */
- put_pid(PROC_I(inode)->pid);
+ if (ei->pid) {
+ proc_pid_evict_inode(ei);
+ ei->pid = NULL;
+ }
/* Let go of any associated proc directory entry */
- de = PDE(inode);
- if (de)
+ de = ei->pde;
+ if (de) {
pde_put(de);
+ ei->pde = NULL;
+ }
- head = PROC_I(inode)->sysctl;
+ head = ei->sysctl;
if (head) {
- RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
+ RCU_INIT_POINTER(ei->sysctl, NULL);
proc_sys_evict_inode(inode, head);
}
}
@@ -68,6 +74,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei->pde = NULL;
ei->sysctl = NULL;
ei->sysctl_entry = NULL;
+ INIT_HLIST_NODE(&ei->sibling_inodes);
ei->ns_ops = NULL;
return &ei->vfs_inode;
}
@@ -102,6 +109,62 @@ void __init proc_init_kmemcache(void)
BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);
}
+void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
+{
+ struct inode *inode;
+ struct proc_inode *ei;
+ struct hlist_node *node;
+ struct super_block *old_sb = NULL;
+
+ rcu_read_lock();
+ for (;;) {
+ struct super_block *sb;
+ node = hlist_first_rcu(inodes);
+ if (!node)
+ break;
+ ei = hlist_entry(node, struct proc_inode, sibling_inodes);
+ spin_lock(lock);
+ hlist_del_init_rcu(&ei->sibling_inodes);
+ spin_unlock(lock);
+
+ inode = &ei->vfs_inode;
+ sb = inode->i_sb;
+ if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active))
+ continue;
+ inode = igrab(inode);
+ rcu_read_unlock();
+ if (sb != old_sb) {
+ if (old_sb)
+ deactivate_super(old_sb);
+ old_sb = sb;
+ }
+ if (unlikely(!inode)) {
+ rcu_read_lock();
+ continue;
+ }
+
+ if (S_ISDIR(inode->i_mode)) {
+ struct dentry *dir = d_find_any_alias(inode);
+ if (dir) {
+ d_invalidate(dir);
+ dput(dir);
+ }
+ } else {
+ struct dentry *dentry;
+ while ((dentry = d_find_alias(inode))) {
+ d_invalidate(dentry);
+ dput(dentry);
+ }
+ }
+ iput(inode);
+
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+ if (old_sb)
+ deactivate_super(old_sb);
+}
+
static int proc_show_options(struct seq_file *seq, struct dentry *root)
{
struct super_block *sb = root->d_sb;
@@ -139,6 +202,7 @@ static void unuse_pde(struct proc_dir_entry *pde)
/* pde is locked on entry, unlocked on exit */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
+ __releases(&pde->pde_unload_lock)
{
/*
* close() (proc_reg_release()) can't delete an entry and proceed:
@@ -195,135 +259,204 @@ void proc_entry_rundown(struct proc_dir_entry *de)
spin_unlock(&de->pde_unload_lock);
}
+static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence)
+{
+ typeof_member(struct proc_ops, proc_lseek) lseek;
+
+ lseek = pde->proc_ops->proc_lseek;
+ if (!lseek)
+ lseek = default_llseek;
+ return lseek(file, offset, whence);
+}
+
static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
loff_t rv = -EINVAL;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_lseek) lseek;
- lseek = pde->proc_ops->proc_lseek;
- if (!lseek)
- lseek = default_llseek;
- rv = lseek(file, offset, whence);
+ if (pde_is_permanent(pde)) {
+ return pde_lseek(pde, file, offset, whence);
+ } else if (use_pde(pde)) {
+ rv = pde_lseek(pde, file, offset, whence);
unuse_pde(pde);
}
return rv;
}
+static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ typeof_member(struct proc_ops, proc_read) read;
+
+ read = pde->proc_ops->proc_read;
+ if (read)
+ return read(file, buf, count, ppos);
+ return -EIO;
+}
+
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_read) read;
- read = pde->proc_ops->proc_read;
- if (read)
- rv = read(file, buf, count, ppos);
+ if (pde_is_permanent(pde)) {
+ return pde_read(pde, file, buf, count, ppos);
+ } else if (use_pde(pde)) {
+ rv = pde_read(pde, file, buf, count, ppos);
unuse_pde(pde);
}
return rv;
}
+static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+ typeof_member(struct proc_ops, proc_write) write;
+
+ write = pde->proc_ops->proc_write;
+ if (write)
+ return write(file, buf, count, ppos);
+ return -EIO;
+}
+
static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_write) write;
- write = pde->proc_ops->proc_write;
- if (write)
- rv = write(file, buf, count, ppos);
+ if (pde_is_permanent(pde)) {
+ return pde_write(pde, file, buf, count, ppos);
+ } else if (use_pde(pde)) {
+ rv = pde_write(pde, file, buf, count, ppos);
unuse_pde(pde);
}
return rv;
}
+static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
+{
+ typeof_member(struct proc_ops, proc_poll) poll;
+
+ poll = pde->proc_ops->proc_poll;
+ if (poll)
+ return poll(file, pts);
+ return DEFAULT_POLLMASK;
+}
+
static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
__poll_t rv = DEFAULT_POLLMASK;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_poll) poll;
- poll = pde->proc_ops->proc_poll;
- if (poll)
- rv = poll(file, pts);
+ if (pde_is_permanent(pde)) {
+ return pde_poll(pde, file, pts);
+ } else if (use_pde(pde)) {
+ rv = pde_poll(pde, file, pts);
unuse_pde(pde);
}
return rv;
}
+static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+ typeof_member(struct proc_ops, proc_ioctl) ioctl;
+
+ ioctl = pde->proc_ops->proc_ioctl;
+ if (ioctl)
+ return ioctl(file, cmd, arg);
+ return -ENOTTY;
+}
+
static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_ioctl) ioctl;
- ioctl = pde->proc_ops->proc_ioctl;
- if (ioctl)
- rv = ioctl(file, cmd, arg);
+ if (pde_is_permanent(pde)) {
+ return pde_ioctl(pde, file, cmd, arg);
+ } else if (use_pde(pde)) {
+ rv = pde_ioctl(pde, file, cmd, arg);
unuse_pde(pde);
}
return rv;
}
#ifdef CONFIG_COMPAT
+static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+ typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
+
+ compat_ioctl = pde->proc_ops->proc_compat_ioctl;
+ if (compat_ioctl)
+ return compat_ioctl(file, cmd, arg);
+ return -ENOTTY;
+}
+
static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
-
- compat_ioctl = pde->proc_ops->proc_compat_ioctl;
- if (compat_ioctl)
- rv = compat_ioctl(file, cmd, arg);
+ if (pde_is_permanent(pde)) {
+ return pde_compat_ioctl(pde, file, cmd, arg);
+ } else if (use_pde(pde)) {
+ rv = pde_compat_ioctl(pde, file, cmd, arg);
unuse_pde(pde);
}
return rv;
}
#endif
+static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
+{
+ typeof_member(struct proc_ops, proc_mmap) mmap;
+
+ mmap = pde->proc_ops->proc_mmap;
+ if (mmap)
+ return mmap(file, vma);
+ return -EIO;
+}
+
static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
int rv = -EIO;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_mmap) mmap;
- mmap = pde->proc_ops->proc_mmap;
- if (mmap)
- rv = mmap(file, vma);
+ if (pde_is_permanent(pde)) {
+ return pde_mmap(pde, file, vma);
+ } else if (use_pde(pde)) {
+ rv = pde_mmap(pde, file, vma);
unuse_pde(pde);
}
return rv;
}
static unsigned long
-proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
- struct proc_dir_entry *pde = PDE(file_inode(file));
- unsigned long rv = -EIO;
+ typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
-
- get_area = pde->proc_ops->proc_get_unmapped_area;
+ get_area = pde->proc_ops->proc_get_unmapped_area;
#ifdef CONFIG_MMU
- if (!get_area)
- get_area = current->mm->get_unmapped_area;
+ if (!get_area)
+ get_area = current->mm->get_unmapped_area;
#endif
+ if (get_area)
+ return get_area(file, orig_addr, len, pgoff, flags);
+ return orig_addr;
+}
- if (get_area)
- rv = get_area(file, orig_addr, len, pgoff, flags);
- else
- rv = orig_addr;
+static unsigned long
+proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ unsigned long rv = -EIO;
+
+ if (pde_is_permanent(pde)) {
+ return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
+ } else if (use_pde(pde)) {
+ rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
unuse_pde(pde);
}
return rv;
@@ -337,6 +470,13 @@ static int proc_reg_open(struct inode *inode, struct file *file)
typeof_member(struct proc_ops, proc_release) release;
struct pde_opener *pdeo;
+ if (pde_is_permanent(pde)) {
+ open = pde->proc_ops->proc_open;
+ if (open)
+ rv = open(inode, file);
+ return rv;
+ }
+
/*
* Ensure that
* 1) PDE's ->release hook will be called no matter what
@@ -386,6 +526,17 @@ static int proc_reg_release(struct inode *inode, struct file *file)
{
struct proc_dir_entry *pde = PDE(inode);
struct pde_opener *pdeo;
+
+ if (pde_is_permanent(pde)) {
+ typeof_member(struct proc_ops, proc_release) release;
+
+ release = pde->proc_ops->proc_release;
+ if (release) {
+ return release(inode, file);
+ }
+ return 0;
+ }
+
spin_lock(&pde->pde_unload_lock);
list_for_each_entry(pdeo, &pde->pde_openers, lh) {
if (pdeo->file == file) {
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 41587276798e..917cc85e3466 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,6 +61,7 @@ struct proc_dir_entry {
struct rb_node subdir_node;
char *name;
umode_t mode;
+ u8 flags;
u8 namelen;
char inline_name[];
} __randomize_layout;
@@ -73,6 +74,11 @@ struct proc_dir_entry {
0)
#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))
+static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_PERMANENT;
+}
+
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
@@ -91,7 +97,7 @@ struct proc_inode {
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
struct ctl_table *sysctl_entry;
- struct hlist_node sysctl_inodes;
+ struct hlist_node sibling_inodes;
const struct proc_ns_operations *ns_ops;
struct inode vfs_inode;
} __randomize_layout;
@@ -158,6 +164,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern int proc_setattr(struct dentry *, struct iattr *);
+extern void proc_pid_evict_inode(struct proc_inode *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
@@ -210,6 +217,7 @@ extern const struct inode_operations proc_pid_link_inode_operations;
extern const struct super_operations proc_sops;
void proc_init_kmemcache(void);
+void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern void proc_entry_rundown(struct proc_dir_entry *);
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index ec1b7d2fb773..b38ad552887f 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -50,6 +50,7 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait)
static const struct proc_ops kmsg_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_read = kmsg_read,
.proc_poll = kmsg_poll,
.proc_open = kmsg_open,
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index c75bb4632ed1..b6f5d459b087 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -267,42 +267,9 @@ static void unuse_table(struct ctl_table_header *p)
complete(p->unregistering);
}
-static void proc_sys_prune_dcache(struct ctl_table_header *head)
+static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
{
- struct inode *inode;
- struct proc_inode *ei;
- struct hlist_node *node;
- struct super_block *sb;
-
- rcu_read_lock();
- for (;;) {
- node = hlist_first_rcu(&head->inodes);
- if (!node)
- break;
- ei = hlist_entry(node, struct proc_inode, sysctl_inodes);
- spin_lock(&sysctl_lock);
- hlist_del_init_rcu(&ei->sysctl_inodes);
- spin_unlock(&sysctl_lock);
-
- inode = &ei->vfs_inode;
- sb = inode->i_sb;
- if (!atomic_inc_not_zero(&sb->s_active))
- continue;
- inode = igrab(inode);
- rcu_read_unlock();
- if (unlikely(!inode)) {
- deactivate_super(sb);
- rcu_read_lock();
- continue;
- }
-
- d_prune_aliases(inode);
- iput(inode);
- deactivate_super(sb);
-
- rcu_read_lock();
- }
- rcu_read_unlock();
+ proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
}
/* called under sysctl_lock, will reacquire if has to wait */
@@ -324,10 +291,10 @@ static void start_unregistering(struct ctl_table_header *p)
spin_unlock(&sysctl_lock);
}
/*
- * Prune dentries for unregistered sysctls: namespaced sysctls
+ * Invalidate dentries for unregistered sysctls: namespaced sysctls
* can have duplicate names and contaminate dcache very badly.
*/
- proc_sys_prune_dcache(p);
+ proc_sys_invalidate_dcache(p);
/*
* do not remove from the list until nobody holds it; walking the
* list in do_sysctl() relies on that.
@@ -483,7 +450,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
}
ei->sysctl = head;
ei->sysctl_entry = table;
- hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes);
+ hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes);
head->count++;
spin_unlock(&sysctl_lock);
@@ -514,7 +481,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
{
spin_lock(&sysctl_lock);
- hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes);
+ hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes);
if (!--head->count)
kfree_rcu(head, rcu);
spin_unlock(&sysctl_lock);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 608233dfd29c..2633f10446c3 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -292,39 +292,3 @@ struct proc_dir_entry proc_root = {
.subdir = RB_ROOT,
.name = "/proc",
};
-
-int pid_ns_prepare_proc(struct pid_namespace *ns)
-{
- struct proc_fs_context *ctx;
- struct fs_context *fc;
- struct vfsmount *mnt;
-
- fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT);
- if (IS_ERR(fc))
- return PTR_ERR(fc);
-
- if (fc->user_ns != ns->user_ns) {
- put_user_ns(fc->user_ns);
- fc->user_ns = get_user_ns(ns->user_ns);
- }
-
- ctx = fc->fs_private;
- if (ctx->pid_ns != ns) {
- put_pid_ns(ctx->pid_ns);
- get_pid_ns(ns);
- ctx->pid_ns = ns;
- }
-
- mnt = fc_mount(fc);
- put_fs_context(fc);
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
-
- ns->proc_mnt = mnt;
- return 0;
-}
-
-void pid_ns_release_proc(struct pid_namespace *ns)
-{
- kern_unmount(ns->proc_mnt);
-}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 0449edf460f5..46b3293015fe 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -224,6 +224,7 @@ static int stat_open(struct inode *inode, struct file *file)
}
static const struct proc_ops stat_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = stat_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3ba9ae83bff5..8d382d4ec067 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -123,38 +123,14 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
-static void vma_stop(struct proc_maps_private *priv)
-{
- struct mm_struct *mm = priv->mm;
-
- release_task_mempolicy(priv);
- up_read(&mm->mmap_sem);
- mmput(mm);
-}
-
-static struct vm_area_struct *
-m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
-{
- if (vma == priv->tail_vma)
- return NULL;
- return vma->vm_next ?: priv->tail_vma;
-}
-
-static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
-{
- if (m->count < m->size) /* vma is copied successfully */
- m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;
-}
-
static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
- unsigned long last_addr = m->version;
+ unsigned long last_addr = *ppos;
struct mm_struct *mm;
struct vm_area_struct *vma;
- unsigned int pos = *ppos;
- /* See m_cache_vma(). Zero at the start or after lseek. */
+ /* See m_next(). Zero at the start or after lseek. */
if (last_addr == -1UL)
return NULL;
@@ -163,64 +139,59 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
return ERR_PTR(-ESRCH);
mm = priv->mm;
- if (!mm || !mmget_not_zero(mm))
+ if (!mm || !mmget_not_zero(mm)) {
+ put_task_struct(priv->task);
+ priv->task = NULL;
return NULL;
+ }
if (down_read_killable(&mm->mmap_sem)) {
mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
return ERR_PTR(-EINTR);
}
hold_task_mempolicy(priv);
priv->tail_vma = get_gate_vma(mm);
- if (last_addr) {
- vma = find_vma(mm, last_addr - 1);
- if (vma && vma->vm_start <= last_addr)
- vma = m_next_vma(priv, vma);
- if (vma)
- return vma;
- }
-
- m->version = 0;
- if (pos < mm->map_count) {
- for (vma = mm->mmap; pos; pos--) {
- m->version = vma->vm_start;
- vma = vma->vm_next;
- }
+ vma = find_vma(mm, last_addr);
+ if (vma)
return vma;
- }
-
- /* we do not bother to update m->version in this case */
- if (pos == mm->map_count && priv->tail_vma)
- return priv->tail_vma;
- vma_stop(priv);
- return NULL;
+ return priv->tail_vma;
}
-static void *m_next(struct seq_file *m, void *v, loff_t *pos)
+static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
- struct vm_area_struct *next;
+ struct vm_area_struct *next, *vma = v;
+
+ if (vma == priv->tail_vma)
+ next = NULL;
+ else if (vma->vm_next)
+ next = vma->vm_next;
+ else
+ next = priv->tail_vma;
+
+ *ppos = next ? next->vm_start : -1UL;
- (*pos)++;
- next = m_next_vma(priv, v);
- if (!next)
- vma_stop(priv);
return next;
}
static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
+ struct mm_struct *mm = priv->mm;
- if (!IS_ERR_OR_NULL(v))
- vma_stop(priv);
- if (priv->task) {
- put_task_struct(priv->task);
- priv->task = NULL;
- }
+ if (!priv->task)
+ return;
+
+ release_task_mempolicy(priv);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
}
static int proc_maps_open(struct inode *inode, struct file *file,
@@ -363,7 +334,6 @@ done:
static int show_map(struct seq_file *m, void *v)
{
show_map_vma(m, v);
- m_cache_vma(m, v);
return 0;
}
@@ -847,8 +817,6 @@ static int show_smap(struct seq_file *m, void *v)
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
show_smap_vma_flags(m, vma);
- m_cache_vma(m, vma);
-
return 0;
}
@@ -1887,7 +1855,6 @@ static int show_numa_map(struct seq_file *m, void *v)
seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
out:
seq_putc(m, '\n');
- m_cache_vma(m, vma);
return 0;
}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 7fbe8f058220..d99b5d39aa90 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -87,11 +87,11 @@ static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos)
struct pstore_private *ps = s->private;
struct pstore_ftrace_seq_data *data = v;
+ (*pos)++;
data->off += REC_SIZE;
if (data->off + REC_SIZE > ps->total_size)
return NULL;
- (*pos)++;
return data;
}
@@ -101,6 +101,9 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
struct pstore_ftrace_seq_data *data = v;
struct pstore_ftrace_record *rec;
+ if (!data)
+ return 0;
+
rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off);
seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %ps <- %pS\n",
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index d896457e7c11..408277ee3cdb 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -823,9 +823,9 @@ static int __init pstore_init(void)
ret = pstore_init_fs();
if (ret)
- return ret;
+ free_buf_for_compression();
- return 0;
+ return ret;
}
late_initcall(pstore_init);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 013486b5125e..795622190c01 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -963,7 +963,6 @@ static void __init ramoops_register_dummy(void)
pr_info("could not create platform device: %ld\n",
PTR_ERR(dummy));
dummy = NULL;
- ramoops_unregister_dummy();
}
}
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 1f4d8c06f9be..c917c191e78c 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -34,7 +34,7 @@ struct persistent_ram_buffer {
uint32_t sig;
atomic_t start;
atomic_t size;
- uint8_t data[0];
+ uint8_t data[];
};
#define PERSISTENT_RAM_SIG (0x43474244) /* DBGC */
diff --git a/fs/read_write.c b/fs/read_write.c
index 59d819c5b92e..bbfa9b12b15e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -331,7 +331,8 @@ COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned i
}
#endif
-#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT)
+#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
+ defined(__ARCH_WANT_SYS_LLSEEK)
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
unsigned long, offset_low, loff_t __user *, result,
unsigned int, whence)
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 4075e41408b4..5129efc6f2e6 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -842,7 +842,7 @@ static void balance_leaf_paste_right_whole(struct tree_balance *tb,
struct item_head *pasted;
struct buffer_info bi;
- buffer_info_init_right(tb, &bi);
+ buffer_info_init_right(tb, &bi);
leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
/* append item in R[0] */
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 45e1a5d11af3..adb21bea3d60 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -184,11 +184,12 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
}
/* we need to make sure nobody is changing the file size beneath us */
-{
- int depth = reiserfs_write_unlock_nested(inode->i_sb);
- inode_lock(inode);
- reiserfs_write_lock_nested(inode->i_sb, depth);
-}
+ {
+ int depth = reiserfs_write_unlock_nested(inode->i_sb);
+
+ inode_lock(inode);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
+ }
reiserfs_write_lock(inode->i_sb);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 072156c4f895..5c766330e493 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2599,7 +2599,6 @@ static int journal_init_dev(struct super_block *super,
int result;
dev_t jdev;
fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
- char b[BDEVNAME_SIZE];
result = 0;
@@ -2621,8 +2620,8 @@ static int journal_init_dev(struct super_block *super,
result = PTR_ERR(journal->j_dev_bd);
journal->j_dev_bd = NULL;
reiserfs_warning(super, "sh-458",
- "cannot init journal device '%s': %i",
- __bdevname(jdev, b), result);
+ "cannot init journal device unknown-block(%u,%u): %i",
+ MAJOR(jdev), MINOR(jdev), result);
return result;
} else if (jdev != super->s_dev)
set_blocksize(journal->j_dev_bd, super->s_blocksize);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 959a066b7bb0..1594687582f0 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -838,10 +838,10 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
*/
INC_DIR_INODE_NLINK(dir)
- retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ ,
- old_format_only(dir->i_sb) ?
- EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
- dentry, inode, &security);
+ retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */,
+ old_format_only(dir->i_sb) ?
+ EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
+ dentry, inode, &security);
if (retval) {
DEC_DIR_INODE_NLINK(dir)
goto out_failed;
@@ -967,7 +967,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
reiserfs_update_sd(&th, inode);
DEC_DIR_INODE_NLINK(dir)
- dir->i_size -= (DEH_SIZE + de.de_entrylen);
+ dir->i_size -= (DEH_SIZE + de.de_entrylen);
reiserfs_update_sd(&th, dir);
/* prevent empty directory from getting lost */
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 1600034a929b..70f5fdf99bf6 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -68,13 +68,6 @@ int seq_open(struct file *file, const struct seq_operations *op)
p->file = file;
/*
- * Wrappers around seq_open(e.g. swaps_open) need to be
- * aware of this. If they set f_version themselves, they
- * should call seq_open first and then set f_version.
- */
- file->f_version = 0;
-
- /*
* seq_files support lseek() and pread(). They do not implement
* write() at all, but we clear FMODE_PWRITE here for historical
* reasons.
@@ -94,7 +87,6 @@ static int traverse(struct seq_file *m, loff_t offset)
int error = 0;
void *p;
- m->version = 0;
m->index = 0;
m->count = m->from = 0;
if (!offset)
@@ -161,25 +153,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
mutex_lock(&m->lock);
/*
- * seq_file->op->..m_start/m_stop/m_next may do special actions
- * or optimisations based on the file->f_version, so we want to
- * pass the file->f_version to those methods.
- *
- * seq_file->version is just copy of f_version, and seq_file
- * methods can treat it simply as file version.
- * It is copied in first and copied out after all operations.
- * It is convenient to have it as part of structure to avoid the
- * need of passing another argument to all the seq_file methods.
- */
- m->version = file->f_version;
-
- /*
* if request is to read from zero offset, reset iterator to first
* record as it might have been already advanced by previous requests
*/
if (*ppos == 0) {
m->index = 0;
- m->version = 0;
m->count = 0;
}
@@ -190,7 +168,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
if (err) {
/* With prejudice... */
m->read_pos = 0;
- m->version = 0;
m->index = 0;
m->count = 0;
goto Done;
@@ -243,7 +220,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
m->buf = seq_buf_alloc(m->size <<= 1);
if (!m->buf)
goto Enomem;
- m->version = 0;
p = m->op->start(m, &m->index);
}
m->op->stop(m, p);
@@ -256,9 +232,12 @@ Fill:
loff_t pos = m->index;
p = m->op->next(m, p, &m->index);
- if (pos == m->index)
- /* Buggy ->next function */
+ if (pos == m->index) {
+ pr_info_ratelimited("buggy seq_file .next function %ps "
+ "did not updated position index\n",
+ m->op->next);
m->index++;
+ }
if (!p || IS_ERR(p)) {
err = PTR_ERR(p);
break;
@@ -287,7 +266,6 @@ Done:
*ppos += copied;
m->read_pos += copied;
}
- file->f_version = m->version;
mutex_unlock(&m->lock);
return copied;
Enomem:
@@ -313,7 +291,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)
loff_t retval = -EINVAL;
mutex_lock(&m->lock);
- m->version = file->f_version;
switch (whence) {
case SEEK_CUR:
offset += file->f_pos;
@@ -329,7 +306,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)
/* with extreme prejudice... */
file->f_pos = 0;
m->read_pos = 0;
- m->version = 0;
m->index = 0;
m->count = 0;
} else {
@@ -340,7 +316,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)
file->f_pos = offset;
}
}
- file->f_version = m->version;
mutex_unlock(&m->lock);
return retval;
}
diff --git a/fs/splice.c b/fs/splice.c
index d671936d0aad..4735defc46ee 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1109,9 +1109,9 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
/*
* Determine where to splice to/from.
*/
-static long do_splice(struct file *in, loff_t __user *off_in,
- struct file *out, loff_t __user *off_out,
- size_t len, unsigned int flags)
+long do_splice(struct file *in, loff_t __user *off_in,
+ struct file *out, loff_t __user *off_out,
+ size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 130fc6fbcc03..26bbf960e2a2 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -558,3 +558,151 @@ void sysfs_remove_bin_file(struct kobject *kobj,
kernfs_remove_by_name(kobj->sd, attr->attr.name);
}
EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
+
+static int internal_change_owner(struct kernfs_node *kn, kuid_t kuid,
+ kgid_t kgid)
+{
+ struct iattr newattrs = {
+ .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = kuid,
+ .ia_gid = kgid,
+ };
+ return kernfs_setattr(kn, &newattrs);
+}
+
+/**
+ * sysfs_link_change_owner - change owner of a sysfs file.
+ * @kobj: object of the kernfs_node the symlink is located in.
+ * @targ: object of the kernfs_node the symlink points to.
+ * @name: name of the link.
+ * @kuid: new owner's kuid
+ * @kgid: new owner's kgid
+ *
+ * This function looks up the sysfs symlink entry @name under @kobj and changes
+ * the ownership to @kuid/@kgid. The symlink is looked up in the namespace of
+ * @targ.
+ *
+ * Returns 0 on success or error code on failure.
+ */
+int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ,
+ const char *name, kuid_t kuid, kgid_t kgid)
+{
+ struct kernfs_node *kn = NULL;
+ int error;
+
+ if (!name || !kobj->state_in_sysfs || !targ->state_in_sysfs)
+ return -EINVAL;
+
+ error = -ENOENT;
+ kn = kernfs_find_and_get_ns(kobj->sd, name, targ->sd->ns);
+ if (!kn)
+ goto out;
+
+ error = -EINVAL;
+ if (kernfs_type(kn) != KERNFS_LINK)
+ goto out;
+ if (kn->symlink.target_kn->priv != targ)
+ goto out;
+
+ error = internal_change_owner(kn, kuid, kgid);
+
+out:
+ kernfs_put(kn);
+ return error;
+}
+
+/**
+ * sysfs_file_change_owner - change owner of a sysfs file.
+ * @kobj: object.
+ * @name: name of the file to change.
+ * @kuid: new owner's kuid
+ * @kgid: new owner's kgid
+ *
+ * This function looks up the sysfs entry @name under @kobj and changes the
+ * ownership to @kuid/@kgid.
+ *
+ * Returns 0 on success or error code on failure.
+ */
+int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid,
+ kgid_t kgid)
+{
+ struct kernfs_node *kn;
+ int error;
+
+ if (!name)
+ return -EINVAL;
+
+ if (!kobj->state_in_sysfs)
+ return -EINVAL;
+
+ kn = kernfs_find_and_get(kobj->sd, name);
+ if (!kn)
+ return -ENOENT;
+
+ error = internal_change_owner(kn, kuid, kgid);
+
+ kernfs_put(kn);
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_file_change_owner);
+
+/**
+ * sysfs_change_owner - change owner of the given object.
+ * @kobj: object.
+ * @kuid: new owner's kuid
+ * @kgid: new owner's kgid
+ *
+ * Change the owner of the default directory, files, groups, and attributes of
+ * @kobj to @kuid/@kgid. Note that sysfs_change_owner mirrors how the sysfs
+ * entries for a kobject are added by driver core. In summary,
+ * sysfs_change_owner() takes care of the default directory entry for @kobj,
+ * the default attributes associated with the ktype of @kobj and the default
+ * attributes associated with the ktype of @kobj.
+ * Additional properties not added by driver core have to be changed by the
+ * driver or subsystem which created them. This is similar to how
+ * driver/subsystem specific entries are removed.
+ *
+ * Returns 0 on success or error code on failure.
+ */
+int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid)
+{
+ int error;
+ const struct kobj_type *ktype;
+
+ if (!kobj->state_in_sysfs)
+ return -EINVAL;
+
+ /* Change the owner of the kobject itself. */
+ error = internal_change_owner(kobj->sd, kuid, kgid);
+ if (error)
+ return error;
+
+ ktype = get_ktype(kobj);
+ if (ktype) {
+ struct attribute **kattr;
+
+ /*
+ * Change owner of the default attributes associated with the
+ * ktype of @kobj.
+ */
+ for (kattr = ktype->default_attrs; kattr && *kattr; kattr++) {
+ error = sysfs_file_change_owner(kobj, (*kattr)->name,
+ kuid, kgid);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Change owner of the default groups associated with the
+ * ktype of @kobj.
+ */
+ error = sysfs_groups_change_owner(kobj, ktype->default_groups,
+ kuid, kgid);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sysfs_change_owner);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index c4ab045926b7..64e6a6698935 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -13,6 +13,7 @@
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/err.h>
+#include <linux/fs.h>
#include "sysfs.h"
@@ -415,15 +416,18 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
/**
- * __compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing
+ * compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing
* to a group or an attribute
* @kobj: The kobject containing the group.
* @target_kobj: The target kobject.
* @target_name: The name of the target group or attribute.
+ * @symlink_name: The name of the symlink file (target_name will be
+ * considered if symlink_name is NULL).
*/
-int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
- struct kobject *target_kobj,
- const char *target_name)
+int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
+ struct kobject *target_kobj,
+ const char *target_name,
+ const char *symlink_name)
{
struct kernfs_node *target;
struct kernfs_node *entry;
@@ -448,12 +452,129 @@ int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
return -ENOENT;
}
- link = kernfs_create_link(kobj->sd, target_name, entry);
+ if (!symlink_name)
+ symlink_name = target_name;
+
+ link = kernfs_create_link(kobj->sd, symlink_name, entry);
if (PTR_ERR(link) == -EEXIST)
- sysfs_warn_dup(kobj->sd, target_name);
+ sysfs_warn_dup(kobj->sd, symlink_name);
kernfs_put(entry);
kernfs_put(target);
return PTR_ERR_OR_ZERO(link);
}
-EXPORT_SYMBOL_GPL(__compat_only_sysfs_link_entry_to_kobj);
+EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj);
+
+static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn,
+ const struct attribute_group *grp,
+ struct iattr *newattrs)
+{
+ struct kernfs_node *kn;
+ int error;
+
+ if (grp->attrs) {
+ struct attribute *const *attr;
+
+ for (attr = grp->attrs; *attr; attr++) {
+ kn = kernfs_find_and_get(grp_kn, (*attr)->name);
+ if (!kn)
+ return -ENOENT;
+
+ error = kernfs_setattr(kn, newattrs);
+ kernfs_put(kn);
+ if (error)
+ return error;
+ }
+ }
+
+ if (grp->bin_attrs) {
+ struct bin_attribute *const *bin_attr;
+
+ for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+ kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name);
+ if (!kn)
+ return -ENOENT;
+
+ error = kernfs_setattr(kn, newattrs);
+ kernfs_put(kn);
+ if (error)
+ return error;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * sysfs_group_change_owner - change owner of an attribute group.
+ * @kobj: The kobject containing the group.
+ * @grp: The attribute group.
+ * @kuid: new owner's kuid
+ * @kgid: new owner's kgid
+ *
+ * Returns 0 on success or error code on failure.
+ */
+int sysfs_group_change_owner(struct kobject *kobj,
+ const struct attribute_group *grp, kuid_t kuid,
+ kgid_t kgid)
+{
+ struct kernfs_node *grp_kn;
+ int error;
+ struct iattr newattrs = {
+ .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = kuid,
+ .ia_gid = kgid,
+ };
+
+ if (!kobj->state_in_sysfs)
+ return -EINVAL;
+
+ if (grp->name) {
+ grp_kn = kernfs_find_and_get(kobj->sd, grp->name);
+ } else {
+ kernfs_get(kobj->sd);
+ grp_kn = kobj->sd;
+ }
+ if (!grp_kn)
+ return -ENOENT;
+
+ error = kernfs_setattr(grp_kn, &newattrs);
+ if (!error)
+ error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs);
+
+ kernfs_put(grp_kn);
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_group_change_owner);
+
+/**
+ * sysfs_groups_change_owner - change owner of a set of attribute groups.
+ * @kobj: The kobject containing the groups.
+ * @groups: The attribute groups.
+ * @kuid: new owner's kuid
+ * @kgid: new owner's kgid
+ *
+ * Returns 0 on success or error code on failure.
+ */
+int sysfs_groups_change_owner(struct kobject *kobj,
+ const struct attribute_group **groups,
+ kuid_t kuid, kgid_t kgid)
+{
+ int error = 0, i;
+
+ if (!kobj->state_in_sysfs)
+ return -EINVAL;
+
+ if (!groups)
+ return 0;
+
+ for (i = 0; groups[i]; i++) {
+ error = sysfs_group_change_owner(kobj, groups[i], kuid, kgid);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_groups_change_owner);
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 8ceb51478800..7e4bfaf2871f 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -225,7 +225,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum)
int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
int offs, int quiet, int must_chk_crc)
{
- int err = -EINVAL, type, node_len;
+ int err = -EINVAL, type, node_len, dump_node = 1;
uint32_t crc, node_crc, magic;
const struct ubifs_ch *ch = buf;
@@ -278,10 +278,22 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
out_len:
if (!quiet)
ubifs_err(c, "bad node length %d", node_len);
+ if (type == UBIFS_DATA_NODE && node_len > UBIFS_DATA_NODE_SZ)
+ dump_node = 0;
out:
if (!quiet) {
ubifs_err(c, "bad node at LEB %d:%d", lnum, offs);
- ubifs_dump_node(c, buf);
+ if (dump_node) {
+ ubifs_dump_node(c, buf);
+ } else {
+ int safe_len = min3(node_len, c->leb_size - offs,
+ (int)UBIFS_MAX_DATA_NODE_SZ);
+ pr_err("\tprevent out-of-bounds memory access\n");
+ pr_err("\ttruncated data node length %d\n", safe_len);
+ pr_err("\tcorrupted data node:\n");
+ print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1,
+ buf, safe_len, 0);
+ }
dump_stack();
}
return err;
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index d49fc04f2d7d..3df9be2c684c 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -208,6 +208,9 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
return fscrypt_ioctl_get_key_status(file, (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_NONCE:
+ return fscrypt_ioctl_get_nonce(file, (void __user *)arg);
+
default:
return -ENOTTY;
}
@@ -230,6 +233,7 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_REMOVE_ENCRYPTION_KEY:
case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ case FS_IOC_GET_ENCRYPTION_NONCE:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 3bf8b1fda9d7..e5ec1afe1c66 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -905,6 +905,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
ubifs_err(c, "dead directory entry '%s', error %d",
xent->name, err);
ubifs_ro_mode(c, err);
+ kfree(xent);
goto out_release;
}
ubifs_assert(c, ubifs_inode(xino)->xattr);
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index edf43ddd7dce..283f9eb48410 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -157,7 +157,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
int err = 0;
ino_t xattr_inum;
union ubifs_key key;
- struct ubifs_dent_node *xent;
+ struct ubifs_dent_node *xent, *pxent = NULL;
struct fscrypt_name nm = {0};
struct ubifs_orphan *xattr_orphan;
struct ubifs_orphan *orphan;
@@ -181,11 +181,16 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
xattr_inum = le64_to_cpu(xent->inum);
xattr_orphan = orphan_add(c, xattr_inum, orphan);
- if (IS_ERR(xattr_orphan))
+ if (IS_ERR(xattr_orphan)) {
+ kfree(xent);
return PTR_ERR(xattr_orphan);
+ }
+ kfree(pxent);
+ pxent = xent;
key_read(c, &xent->key, &key);
}
+ kfree(pxent);
return 0;
}
@@ -688,14 +693,14 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
ino_key_init(c, &key1, inum);
err = ubifs_tnc_lookup(c, &key1, ino);
- if (err)
+ if (err && err != -ENOENT)
goto out_free;
/*
* Check whether an inode can really get deleted.
* linkat() with O_TMPFILE allows rebirth of an inode.
*/
- if (ino->nlink == 0) {
+ if (err == 0 && ino->nlink == 0) {
dbg_rcvry("deleting orphaned inode %lu",
(unsigned long)inum);
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index 3fd85464abd5..736ebc5dc441 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -5,7 +5,7 @@
* http://www.ecma.ch
*
* Copyright (c) 2001-2002 Ben Fennema
- * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (c) 2017-2019 Pali Rohár <pali@kernel.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index 35e61b2cacfe..d5fbfab3ddb6 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -5,7 +5,7 @@
* http://www.osta.org
*
* Copyright (c) 2001-2004 Ben Fennema
- * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (c) 2017-2019 Pali Rohár <pali@kernel.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 3d83be54c474..758efe557a19 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -83,7 +83,7 @@ struct udf_virtual_data {
struct udf_bitmap {
__u32 s_extPosition;
int s_nr_groups;
- struct buffer_head *s_block_bitmap[0];
+ struct buffer_head *s_block_bitmap[];
};
struct udf_part_map {
diff --git a/fs/unicode/.gitignore b/fs/unicode/.gitignore
index 0381e2221480..9b2467e77b2d 100644
--- a/fs/unicode/.gitignore
+++ b/fs/unicode/.gitignore
@@ -1,2 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
mkutf8data
utf8data.h
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 37df7c9eedb1..e39fdec8a0b0 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -314,8 +314,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
if (!pmd_present(_pmd))
goto out;
- if (pmd_trans_huge(_pmd))
+ if (pmd_trans_huge(_pmd)) {
+ if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
+ ret = true;
goto out;
+ }
/*
* the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
@@ -328,12 +331,38 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
*/
if (pte_none(*pte))
ret = true;
+ if (!pte_write(*pte) && (reason & VM_UFFD_WP))
+ ret = true;
pte_unmap(pte);
out:
return ret;
}
+/* Should pair with userfaultfd_signal_pending() */
+static inline long userfaultfd_get_blocking_state(unsigned int flags)
+{
+ if (flags & FAULT_FLAG_INTERRUPTIBLE)
+ return TASK_INTERRUPTIBLE;
+
+ if (flags & FAULT_FLAG_KILLABLE)
+ return TASK_KILLABLE;
+
+ return TASK_UNINTERRUPTIBLE;
+}
+
+/* Should pair with userfaultfd_get_blocking_state() */
+static inline bool userfaultfd_signal_pending(unsigned int flags)
+{
+ if (flags & FAULT_FLAG_INTERRUPTIBLE)
+ return signal_pending(current);
+
+ if (flags & FAULT_FLAG_KILLABLE)
+ return fatal_signal_pending(current);
+
+ return false;
+}
+
/*
* The locking rules involved in returning VM_FAULT_RETRY depending on
* FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
@@ -355,7 +384,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue uwq;
vm_fault_t ret = VM_FAULT_SIGBUS;
- bool must_wait, return_to_userland;
+ bool must_wait;
long blocking_state;
/*
@@ -462,11 +491,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
uwq.ctx = ctx;
uwq.waken = false;
- return_to_userland =
- (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
- (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
- blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
- TASK_KILLABLE;
+ blocking_state = userfaultfd_get_blocking_state(vmf->flags);
spin_lock_irq(&ctx->fault_pending_wqh.lock);
/*
@@ -492,8 +517,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
up_read(&mm->mmap_sem);
if (likely(must_wait && !READ_ONCE(ctx->released) &&
- (return_to_userland ? !signal_pending(current) :
- !fatal_signal_pending(current)))) {
+ !userfaultfd_signal_pending(vmf->flags))) {
wake_up_poll(&ctx->fd_wqh, EPOLLIN);
schedule();
ret |= VM_FAULT_MAJOR;
@@ -515,8 +539,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
set_current_state(blocking_state);
if (READ_ONCE(uwq.waken) ||
READ_ONCE(ctx->released) ||
- (return_to_userland ? signal_pending(current) :
- fatal_signal_pending(current)))
+ userfaultfd_signal_pending(vmf->flags))
break;
schedule();
}
@@ -524,30 +547,6 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
__set_current_state(TASK_RUNNING);
- if (return_to_userland) {
- if (signal_pending(current) &&
- !fatal_signal_pending(current)) {
- /*
- * If we got a SIGSTOP or SIGCONT and this is
- * a normal userland page fault, just let
- * userland return so the signal will be
- * handled and gdb debugging works. The page
- * fault code immediately after we return from
- * this function is going to release the
- * mmap_sem and it's not depending on it
- * (unlike gup would if we were not to return
- * VM_FAULT_RETRY).
- *
- * If a fatal signal is pending we still take
- * the streamlined VM_FAULT_RETRY failure path
- * and there's no need to retake the mmap_sem
- * in such case.
- */
- down_read(&mm->mmap_sem);
- ret = VM_FAULT_NOPAGE;
- }
- }
-
/*
* Here we race with the list_del; list_add in
* userfaultfd_ctx_read(), however because we don't ever run
@@ -1293,10 +1292,13 @@ static __always_inline int validate_range(struct mm_struct *mm,
return 0;
}
-static inline bool vma_can_userfault(struct vm_area_struct *vma)
+static inline bool vma_can_userfault(struct vm_area_struct *vma,
+ unsigned long vm_flags)
{
- return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
- vma_is_shmem(vma);
+ /* FIXME: add WP support to hugetlbfs and shmem */
+ return vma_is_anonymous(vma) ||
+ ((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
+ !(vm_flags & VM_UFFD_WP));
}
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
@@ -1328,15 +1330,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vm_flags = 0;
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
vm_flags |= VM_UFFD_MISSING;
- if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
vm_flags |= VM_UFFD_WP;
- /*
- * FIXME: remove the below error constraint by
- * implementing the wprotect tracking mode.
- */
- ret = -EINVAL;
- goto out;
- }
ret = validate_range(mm, &uffdio_register.range.start,
uffdio_register.range.len);
@@ -1386,7 +1381,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
/* check not compatible vmas */
ret = -EINVAL;
- if (!vma_can_userfault(cur))
+ if (!vma_can_userfault(cur, vm_flags))
goto out_unlock;
/*
@@ -1414,6 +1409,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
if (end & (vma_hpagesize - 1))
goto out_unlock;
}
+ if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
+ goto out_unlock;
/*
* Check that this vma isn't already owned by a
@@ -1443,7 +1440,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
do {
cond_resched();
- BUG_ON(!vma_can_userfault(vma));
+ BUG_ON(!vma_can_userfault(vma, vm_flags));
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
vma->vm_userfaultfd_ctx.ctx != ctx);
WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1498,14 +1495,24 @@ out_unlock:
up_write(&mm->mmap_sem);
mmput(mm);
if (!ret) {
+ __u64 ioctls_out;
+
+ ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
+ UFFD_API_RANGE_IOCTLS;
+
+ /*
+ * Declare the WP ioctl only if the WP mode is
+ * specified and all checks passed with the range
+ */
+ if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
+ ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
+
/*
* Now that we scanned all vmas we can already tell
* userland which ioctls methods are guaranteed to
* succeed on this range.
*/
- if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
- UFFD_API_RANGE_IOCTLS,
- &user_uffdio_register->ioctls))
+ if (put_user(ioctls_out, &user_uffdio_register->ioctls))
ret = -EFAULT;
}
out:
@@ -1581,7 +1588,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* provides for more strict behavior to notice
* unregistration errors.
*/
- if (!vma_can_userfault(cur))
+ if (!vma_can_userfault(cur, cur->vm_flags))
goto out_unlock;
found = true;
@@ -1595,7 +1602,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
do {
cond_resched();
- BUG_ON(!vma_can_userfault(vma));
+ BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
/*
* Nothing to do: this vma is already registered into this
@@ -1730,11 +1737,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
ret = -EINVAL;
if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
goto out;
- if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
+ if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
goto out;
if (mmget_not_zero(ctx->mm)) {
ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len, &ctx->mmap_changing);
+ uffdio_copy.len, &ctx->mmap_changing,
+ uffdio_copy.mode);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1807,6 +1815,53 @@ out:
return ret;
}
+static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ int ret;
+ struct uffdio_writeprotect uffdio_wp;
+ struct uffdio_writeprotect __user *user_uffdio_wp;
+ struct userfaultfd_wake_range range;
+ bool mode_wp, mode_dontwake;
+
+ if (READ_ONCE(ctx->mmap_changing))
+ return -EAGAIN;
+
+ user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
+
+ if (copy_from_user(&uffdio_wp, user_uffdio_wp,
+ sizeof(struct uffdio_writeprotect)))
+ return -EFAULT;
+
+ ret = validate_range(ctx->mm, &uffdio_wp.range.start,
+ uffdio_wp.range.len);
+ if (ret)
+ return ret;
+
+ if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
+ UFFDIO_WRITEPROTECT_MODE_WP))
+ return -EINVAL;
+
+ mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
+ mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
+
+ if (mode_wp && mode_dontwake)
+ return -EINVAL;
+
+ ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
+ uffdio_wp.range.len, mode_wp,
+ &ctx->mmap_changing);
+ if (ret)
+ return ret;
+
+ if (!mode_wp && !mode_dontwake) {
+ range.start = uffdio_wp.range.start;
+ range.len = uffdio_wp.range.len;
+ wake_userfault(ctx, &range);
+ }
+ return ret;
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
@@ -1888,6 +1943,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_ZEROPAGE:
ret = userfaultfd_zeropage(ctx, arg);
break;
+ case UFFDIO_WRITEPROTECT:
+ ret = userfaultfd_writeprotect(ctx, arg);
+ break;
}
return ret;
}
diff --git a/fs/xattr.c b/fs/xattr.c
index 90dd78f0eb27..e13265e65871 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -817,7 +817,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
if (len < sizeof(*new_xattr))
return NULL;
- new_xattr = kmalloc(len, GFP_KERNEL);
+ new_xattr = kvmalloc(len, GFP_KERNEL);
if (!new_xattr)
return NULL;
@@ -860,6 +860,7 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
* @value: value of the xattr. If %NULL, will remove the attribute.
* @size: size of the new xattr
* @flags: %XATTR_{CREATE|REPLACE}
+ * @removed_size: returns size of the removed xattr, -1 if none removed
*
* %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
* with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
@@ -868,7 +869,8 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
* Returns 0 on success, -errno on failure.
*/
int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
- const void *value, size_t size, int flags)
+ const void *value, size_t size, int flags,
+ ssize_t *removed_size)
{
struct simple_xattr *xattr;
struct simple_xattr *new_xattr = NULL;
@@ -882,7 +884,7 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
new_xattr->name = kstrdup(name, GFP_KERNEL);
if (!new_xattr->name) {
- kfree(new_xattr);
+ kvfree(new_xattr);
return -ENOMEM;
}
}
@@ -895,8 +897,12 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
err = -EEXIST;
} else if (new_xattr) {
list_replace(&xattr->list, &new_xattr->list);
+ if (removed_size)
+ *removed_size = xattr->size;
} else {
list_del(&xattr->list);
+ if (removed_size)
+ *removed_size = xattr->size;
}
goto out;
}
@@ -908,11 +914,14 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
list_add(&new_xattr->list, &xattrs->head);
xattr = NULL;
}
+
+ if (removed_size)
+ *removed_size = -1;
out:
spin_unlock(&xattrs->lock);
if (xattr) {
kfree(xattr->name);
- kfree(xattr);
+ kvfree(xattr);
}
return err;
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index aceca2f9a3db..4f95df476181 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -26,6 +26,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_bmap.o \
xfs_bmap_btree.o \
xfs_btree.o \
+ xfs_btree_staging.o \
xfs_da_btree.o \
xfs_defer.o \
xfs_dir2.o \
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 08d6beb54f8c..9d84007a5c65 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -231,7 +231,7 @@ xfs_sbblock_init(
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+ struct xfs_dsb *dsb = bp->b_addr;
xfs_sb_to_disk(dsb, &mp->m_sb);
dsb->sb_inprogress = 1;
@@ -243,7 +243,7 @@ xfs_agfblock_init(
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
+ struct xfs_agf *agf = bp->b_addr;
xfs_extlen_t tmpsize;
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
@@ -301,7 +301,7 @@ xfs_agflblock_init(
uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
}
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
+ agfl_bno = xfs_buf_to_agfl_bno(bp);
for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
}
@@ -312,7 +312,7 @@ xfs_agiblock_init(
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
+ struct xfs_agi *agi = bp->b_addr;
int bucket;
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
@@ -502,7 +502,7 @@ xfs_ag_extend_space(
if (error)
return error;
- agi = XFS_BUF_TO_AGI(bp);
+ agi = bp->b_addr;
be32_add_cpu(&agi->agi_length, len);
ASSERT(id->agno == mp->m_sb.sb_agcount - 1 ||
be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
@@ -515,7 +515,7 @@ xfs_ag_extend_space(
if (error)
return error;
- agf = XFS_BUF_TO_AGF(bp);
+ agf = bp->b_addr;
be32_add_cpu(&agf->agf_length, len);
ASSERT(agf->agf_length == agi->agi_length);
xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
@@ -569,11 +569,11 @@ xfs_ag_get_geometry(
memset(ageo, 0, sizeof(*ageo));
ageo->ag_number = agno;
- agi = XFS_BUF_TO_AGI(agi_bp);
+ agi = agi_bp->b_addr;
ageo->ag_icount = be32_to_cpu(agi->agi_count);
ageo->ag_ifree = be32_to_cpu(agi->agi_freecount);
- agf = XFS_BUF_TO_AGF(agf_bp);
+ agf = agf_bp->b_addr;
ageo->ag_length = be32_to_cpu(agf->agf_length);
freeblks = pag->pagf_freeblks +
pag->pagf_flcount +
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index d8053bc96c4d..203e74fa64aa 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -151,7 +151,7 @@ xfs_alloc_lookup_eq(
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
- cur->bc_private.a.priv.abt.active = (*stat == 1);
+ cur->bc_ag.abt.active = (*stat == 1);
return error;
}
@@ -171,7 +171,7 @@ xfs_alloc_lookup_ge(
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
- cur->bc_private.a.priv.abt.active = (*stat == 1);
+ cur->bc_ag.abt.active = (*stat == 1);
return error;
}
@@ -190,7 +190,7 @@ xfs_alloc_lookup_le(
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
- cur->bc_private.a.priv.abt.active = (*stat == 1);
+ cur->bc_ag.abt.active = (*stat == 1);
return error;
}
@@ -198,7 +198,7 @@ static inline bool
xfs_alloc_cur_active(
struct xfs_btree_cur *cur)
{
- return cur && cur->bc_private.a.priv.abt.active;
+ return cur && cur->bc_ag.abt.active;
}
/*
@@ -230,7 +230,7 @@ xfs_alloc_get_rec(
int *stat) /* output: success/failure */
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_private.a.agno;
+ xfs_agnumber_t agno = cur->bc_ag.agno;
union xfs_btree_rec *rec;
int error;
@@ -589,6 +589,7 @@ xfs_agfl_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+ __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp);
int i;
/*
@@ -614,8 +615,8 @@ xfs_agfl_verify(
return __this_address;
for (i = 0; i < xfs_agfl_size(mp); i++) {
- if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
- be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+ if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
+ be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
return __this_address;
}
@@ -713,7 +714,7 @@ xfs_alloc_update_counters(
struct xfs_buf *agbp,
long len)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_agf *agf = agbp->b_addr;
pag->pagf_freeblks += len;
be32_add_cpu(&agf->agf_freeblks, len);
@@ -721,7 +722,7 @@ xfs_alloc_update_counters(
xfs_trans_agblocks_delta(tp, len);
if (unlikely(be32_to_cpu(agf->agf_freeblks) >
be32_to_cpu(agf->agf_length))) {
- xfs_buf_corruption_error(agbp);
+ xfs_buf_mark_corrupt(agbp);
return -EFSCORRUPTED;
}
@@ -907,7 +908,7 @@ xfs_alloc_cur_check(
deactivate = true;
out:
if (deactivate)
- cur->bc_private.a.priv.abt.active = false;
+ cur->bc_ag.abt.active = false;
trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
*new);
return 0;
@@ -922,13 +923,13 @@ xfs_alloc_cur_finish(
struct xfs_alloc_arg *args,
struct xfs_alloc_cur *acur)
{
+ struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
int error;
ASSERT(acur->cnt && acur->bnolt);
ASSERT(acur->bno >= acur->rec_bno);
ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
- ASSERT(acur->rec_bno + acur->rec_len <=
- be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+ ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
acur->rec_len, acur->bno, acur->len, 0);
@@ -1026,6 +1027,7 @@ xfs_alloc_ag_vextent_small(
xfs_extlen_t *flenp, /* result length */
int *stat) /* status: 0-freelist, 1-normal/none */
{
+ struct xfs_agf *agf = args->agbp->b_addr;
int error = 0;
xfs_agblock_t fbno = NULLAGBLOCK;
xfs_extlen_t flen = 0;
@@ -1054,8 +1056,7 @@ xfs_alloc_ag_vextent_small(
if (args->minlen != 1 || args->alignment != 1 ||
args->resv == XFS_AG_RESV_AGFL ||
- (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
- args->minleft))
+ be32_to_cpu(agf->agf_flcount) <= args->minleft)
goto out;
error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
@@ -1079,9 +1080,7 @@ xfs_alloc_ag_vextent_small(
}
*fbnop = args->agbno = fbno;
*flenp = args->len = 1;
- if (XFS_IS_CORRUPT(args->mp,
- fbno >= be32_to_cpu(
- XFS_BUF_TO_AGF(args->agbp)->agf_length))) {
+ if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
error = -EFSCORRUPTED;
goto error;
}
@@ -1203,6 +1202,7 @@ STATIC int /* error */
xfs_alloc_ag_vextent_exact(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
+ struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
int error;
@@ -1281,8 +1281,7 @@ xfs_alloc_ag_vextent_exact(
*/
cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
args->agno, XFS_BTNUM_CNT);
- ASSERT(args->agbno + args->len <=
- be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+ ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
args->len, XFSA_FIXUP_BNO_OK);
if (error) {
@@ -1353,7 +1352,7 @@ xfs_alloc_walk_iter(
if (error)
return error;
if (i == 0)
- cur->bc_private.a.priv.abt.active = false;
+ cur->bc_ag.abt.active = false;
if (count > 0)
count--;
@@ -1468,7 +1467,7 @@ xfs_alloc_ag_vextent_locality(
if (error)
return error;
if (i) {
- acur->cnt->bc_private.a.priv.abt.active = true;
+ acur->cnt->bc_ag.abt.active = true;
fbcur = acur->cnt;
fbinc = false;
}
@@ -1515,7 +1514,7 @@ xfs_alloc_ag_vextent_lastblock(
* maxlen, go to the start of this block, and skip all those smaller
* than minlen.
*/
- if (len || args->alignment > 1) {
+ if (*len || args->alignment > 1) {
acur->cnt->bc_ptrs[0] = 1;
do {
error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
@@ -1661,6 +1660,7 @@ STATIC int /* error */
xfs_alloc_ag_vextent_size(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
+ struct xfs_agf *agf = args->agbp->b_addr;
xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
int error; /* error result */
@@ -1851,8 +1851,7 @@ restart:
args->agbno = rbno;
if (XFS_IS_CORRUPT(args->mp,
args->agbno + args->len >
- be32_to_cpu(
- XFS_BUF_TO_AGF(args->agbp)->agf_length))) {
+ be32_to_cpu(agf->agf_length))) {
error = -EFSCORRUPTED;
goto error0;
}
@@ -2424,7 +2423,7 @@ xfs_agfl_reset(
struct xfs_perag *pag)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_agf *agf = agbp->b_addr;
ASSERT(pag->pagf_agflreset);
trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
@@ -2655,7 +2654,7 @@ xfs_alloc_get_freelist(
xfs_agblock_t *bnop, /* block address retrieved from freelist */
int btreeblk) /* destination is a AGF btree */
{
- xfs_agf_t *agf; /* a.g. freespace structure */
+ struct xfs_agf *agf = agbp->b_addr;
xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
xfs_agblock_t bno; /* block number returned */
__be32 *agfl_bno;
@@ -2667,7 +2666,6 @@ xfs_alloc_get_freelist(
/*
* Freelist is empty, give up.
*/
- agf = XFS_BUF_TO_AGF(agbp);
if (!agf->agf_flcount) {
*bnop = NULLAGBLOCK;
return 0;
@@ -2684,7 +2682,7 @@ xfs_alloc_get_freelist(
/*
* Get the block number and update the data structures.
*/
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
be32_add_cpu(&agf->agf_flfirst, 1);
xfs_trans_brelse(tp, agflbp);
@@ -2745,7 +2743,7 @@ xfs_alloc_log_agf(
sizeof(xfs_agf_t)
};
- trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+ trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
@@ -2783,18 +2781,15 @@ xfs_alloc_put_freelist(
xfs_agblock_t bno, /* block being freed */
int btreeblk) /* block came from a AGF btree */
{
- xfs_agf_t *agf; /* a.g. freespace structure */
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agf *agf = agbp->b_addr;
__be32 *blockp;/* pointer to array entry */
int error;
int logflags;
- xfs_mount_t *mp; /* mount structure */
xfs_perag_t *pag; /* per allocation group data */
__be32 *agfl_bno;
int startoff;
- agf = XFS_BUF_TO_AGF(agbp);
- mp = tp->t_mountp;
-
if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
be32_to_cpu(agf->agf_seqno), &agflbp)))
return error;
@@ -2820,7 +2815,7 @@ xfs_alloc_put_freelist(
ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
*blockp = cpu_to_be32(bno);
startoff = (char *)blockp - (char *)agflbp->b_addr;
@@ -2838,13 +2833,12 @@ xfs_agf_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_mount;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
+ struct xfs_agf *agf = bp->b_addr;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (!xfs_log_check_lsn(mp,
- be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
return __this_address;
}
@@ -2858,6 +2852,13 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
return __this_address;
+ if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
+ return __this_address;
+
+ if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
+ be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
+ return __this_address;
+
if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
@@ -2869,6 +2870,10 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
return __this_address;
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
+ be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
+ return __this_address;
+
/*
* during growfs operations, the perag is not fully initialised,
* so we can't use it for any useful checking. growfs ensures we can't
@@ -2883,6 +2888,11 @@ xfs_agf_verify(
return __this_address;
if (xfs_sb_version_hasreflink(&mp->m_sb) &&
+ be32_to_cpu(agf->agf_refcount_blocks) >
+ be32_to_cpu(agf->agf_length))
+ return __this_address;
+
+ if (xfs_sb_version_hasreflink(&mp->m_sb) &&
(be32_to_cpu(agf->agf_refcount_level) < 1 ||
be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
return __this_address;
@@ -2914,6 +2924,7 @@ xfs_agf_write_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_agf *agf = bp->b_addr;
xfs_failaddr_t fa;
fa = xfs_agf_verify(bp);
@@ -2926,7 +2937,7 @@ xfs_agf_write_verify(
return;
if (bip)
- XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
}
@@ -2994,7 +3005,7 @@ xfs_alloc_read_agf(
return error;
ASSERT(!(*bpp)->b_error);
- agf = XFS_BUF_TO_AGF(*bpp);
+ agf = (*bpp)->b_addr;
pag = xfs_perag_get(mp, agno);
if (!pag->pagf_init) {
pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
@@ -3275,6 +3286,7 @@ __xfs_free_extent(
struct xfs_buf *agbp;
xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ struct xfs_agf *agf;
int error;
unsigned int busy_flags = 0;
@@ -3288,6 +3300,7 @@ __xfs_free_extent(
error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
if (error)
return error;
+ agf = agbp->b_addr;
if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
error = -EFSCORRUPTED;
@@ -3295,9 +3308,7 @@ __xfs_free_extent(
}
/* validate the extent size is legal now we have the agf locked */
- if (XFS_IS_CORRUPT(mp,
- agbno + len >
- be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length))) {
+ if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
error = -EFSCORRUPTED;
goto err;
}
@@ -3408,7 +3419,7 @@ xfs_agfl_walk(
unsigned int i;
int error;
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
i = be32_to_cpu(agf->agf_flfirst);
/* Nothing to walk in an empty AGFL. */
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 7380fbe4a3ff..a851bf77f17b 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -236,4 +236,13 @@ typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno,
int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf,
struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv);
+static inline __be32 *
+xfs_buf_to_agfl_bno(
+ struct xfs_buf *bp)
+{
+ if (xfs_sb_version_hascrc(&bp->b_mount->m_sb))
+ return bp->b_addr + sizeof(struct xfs_agfl);
+ return bp->b_addr;
+}
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 279694d73e4e..60c453cb3ee3 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -12,6 +12,7 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
@@ -25,7 +26,7 @@ xfs_allocbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno,
+ cur->bc_ag.agbp, cur->bc_ag.agno,
cur->bc_btnum);
}
@@ -35,8 +36,8 @@ xfs_allocbt_set_root(
union xfs_btree_ptr *ptr,
int inc)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
int btnum = cur->bc_btnum;
struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
@@ -62,7 +63,7 @@ xfs_allocbt_alloc_block(
xfs_agblock_t bno;
/* Allocate the new block from the freelist. If we can't, give up. */
- error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+ error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp,
&bno, 1);
if (error)
return error;
@@ -72,7 +73,7 @@ xfs_allocbt_alloc_block(
return 0;
}
- xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false);
xfs_trans_agbtree_delta(cur->bc_tp, 1);
new->s = cpu_to_be32(bno);
@@ -86,8 +87,8 @@ xfs_allocbt_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
xfs_agblock_t bno;
int error;
@@ -113,7 +114,7 @@ xfs_allocbt_update_lastrec(
int ptr,
int reason)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
struct xfs_perag *pag;
__be32 len;
@@ -162,7 +163,7 @@ xfs_allocbt_update_lastrec(
pag = xfs_perag_get(cur->bc_mp, seqno);
pag->pagf_longest = be32_to_cpu(len);
xfs_perag_put(pag);
- xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
+ xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST);
}
STATIC int
@@ -226,9 +227,9 @@ xfs_allocbt_init_ptr_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_roots[cur->bc_btnum];
}
@@ -471,18 +472,14 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
.recs_inorder = xfs_cntbt_recs_inorder,
};
-/*
- * Allocate a new allocation btree cursor.
- */
-struct xfs_btree_cur * /* new alloc btree cursor */
-xfs_allocbt_init_cursor(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer for agf structure */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_btnum_t btnum) /* btree identifier */
+/* Allocate most of a new allocation btree cursor. */
+STATIC struct xfs_btree_cur *
+xfs_allocbt_init_common(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_btnum_t btnum)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
struct xfs_btree_cur *cur;
ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
@@ -495,19 +492,16 @@ xfs_allocbt_init_cursor(
cur->bc_blocklog = mp->m_sb.sb_blocklog;
if (btnum == XFS_BTNUM_CNT) {
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
cur->bc_ops = &xfs_cntbt_ops;
- cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
} else {
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
cur->bc_ops = &xfs_bnobt_ops;
- cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
}
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
- cur->bc_private.a.priv.abt.active = false;
+ cur->bc_ag.agno = agno;
+ cur->bc_ag.abt.active = false;
if (xfs_sb_version_hascrc(&mp->m_sb))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
@@ -516,6 +510,73 @@ xfs_allocbt_init_cursor(
}
/*
+ * Allocate a new allocation btree cursor.
+ */
+struct xfs_btree_cur * /* new alloc btree cursor */
+xfs_allocbt_init_cursor(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *agbp, /* buffer for agf structure */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_btnum_t btnum) /* btree identifier */
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_allocbt_init_common(mp, tp, agno, btnum);
+ if (btnum == XFS_BTNUM_CNT)
+ cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+ else
+ cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+
+ cur->bc_ag.agbp = agbp;
+
+ return cur;
+}
+
+/* Create a free space btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_allocbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ xfs_agnumber_t agno,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_allocbt_init_common(mp, NULL, agno, btnum);
+ xfs_btree_stage_afakeroot(cur, afake);
+ return cur;
+}
+
+/*
+ * Install a new free space btree root. Caller is responsible for invalidating
+ * and freeing the old btree blocks.
+ */
+void
+xfs_allocbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
+ agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+
+ if (cur->bc_btnum == XFS_BTNUM_BNO) {
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops);
+ } else {
+ cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE;
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops);
+ }
+}
+
+/*
* Calculate number of records in an alloc btree block.
*/
int
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index c9305ebb69f6..047f09f0be3c 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -13,6 +13,7 @@
struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
+struct xbtree_afakeroot;
/*
* Btree block header size depends on a superblock flag.
@@ -48,8 +49,14 @@ struct xfs_mount;
extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_buf *,
xfs_agnumber_t, xfs_btnum_t);
+struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, xfs_agnumber_t agno,
+ xfs_btnum_t btnum);
extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
+void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index e6149720ce02..e4fe3dca9883 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -56,33 +56,6 @@ STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
-
-STATIC int
-xfs_attr_args_init(
- struct xfs_da_args *args,
- struct xfs_inode *dp,
- const unsigned char *name,
- size_t namelen,
- int flags)
-{
-
- if (!name)
- return -EINVAL;
-
- memset(args, 0, sizeof(*args));
- args->geo = dp->i_mount->m_attr_geo;
- args->whichfork = XFS_ATTR_FORK;
- args->dp = dp;
- args->flags = flags;
- args->name = name;
- args->namelen = namelen;
- if (args->namelen >= MAXNAMELEN)
- return -EFAULT; /* match IRIX behaviour */
-
- args->hashval = xfs_da_hashname(args->name, args->namelen);
- return 0;
-}
-
int
xfs_inode_hasattr(
struct xfs_inode *ip)
@@ -104,85 +77,60 @@ xfs_inode_hasattr(
*/
int
xfs_attr_get_ilocked(
- struct xfs_inode *ip,
struct xfs_da_args *args)
{
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- if (!xfs_inode_hasattr(ip))
+ if (!xfs_inode_hasattr(args->dp))
return -ENOATTR;
- else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+
+ if (args->dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
return xfs_attr_shortform_getvalue(args);
- else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
+ if (xfs_bmap_one_block(args->dp, XFS_ATTR_FORK))
return xfs_attr_leaf_get(args);
- else
- return xfs_attr_node_get(args);
+ return xfs_attr_node_get(args);
}
/*
* Retrieve an extended attribute by name, and its value if requested.
*
- * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value,
- * just an indication whether the attribute exists and the size of the value if
- * it exists. The size is returned in @valuelenp,
+ * If args->valuelen is zero, then the caller does not want the value, just an
+ * indication whether the attribute exists and the size of the value if it
+ * exists. The size is returned in args.valuelen.
*
- * If the attribute is found, but exceeds the size limit set by the caller in
- * @valuelenp, return -ERANGE with the size of the attribute that was found in
- * @valuelenp.
+ * If args->value is NULL but args->valuelen is non-zero, allocate the buffer
+ * for the value after existence of the attribute has been determined. The
+ * caller always has to free args->value if it is set, no matter if this
+ * function was successful or not.
*
- * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after
- * existence of the attribute has been determined. On success, return that
- * buffer to the caller and leave them to free it. On failure, free any
- * allocated buffer and ensure the buffer pointer returned to the caller is
- * null.
+ * If the attribute is found, but exceeds the size limit set by the caller in
+ * args->valuelen, return -ERANGE with the size of the attribute that was found
+ * in args->valuelen.
*/
int
xfs_attr_get(
- struct xfs_inode *ip,
- const unsigned char *name,
- size_t namelen,
- unsigned char **value,
- int *valuelenp,
- int flags)
+ struct xfs_da_args *args)
{
- struct xfs_da_args args;
uint lock_mode;
int error;
- ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value);
-
- XFS_STATS_INC(ip->i_mount, xs_attr_get);
+ XFS_STATS_INC(args->dp->i_mount, xs_attr_get);
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ if (XFS_FORCED_SHUTDOWN(args->dp->i_mount))
return -EIO;
- error = xfs_attr_args_init(&args, ip, name, namelen, flags);
- if (error)
- return error;
+ args->geo = args->dp->i_mount->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
/* Entirely possible to look up a name which doesn't exist */
- args.op_flags = XFS_DA_OP_OKNOENT;
- if (flags & ATTR_ALLOC)
- args.op_flags |= XFS_DA_OP_ALLOCVAL;
- else
- args.value = *value;
- args.valuelen = *valuelenp;
+ args->op_flags = XFS_DA_OP_OKNOENT;
- lock_mode = xfs_ilock_attr_map_shared(ip);
- error = xfs_attr_get_ilocked(ip, &args);
- xfs_iunlock(ip, lock_mode);
- *valuelenp = args.valuelen;
+ lock_mode = xfs_ilock_attr_map_shared(args->dp);
+ error = xfs_attr_get_ilocked(args);
+ xfs_iunlock(args->dp, lock_mode);
- /* on error, we have to clean up allocated value buffers */
- if (error) {
- if (flags & ATTR_ALLOC) {
- kmem_free(args.value);
- *value = NULL;
- }
- return error;
- }
- *value = args.value;
- return 0;
+ return error;
}
/*
@@ -238,7 +186,7 @@ xfs_attr_try_sf_addname(
* Commit the shortform mods, and we're done.
* NOTE: this is also the error path (EEXIST, etc).
*/
- if (!error && (args->flags & ATTR_KERNOTIME) == 0)
+ if (!error && !(args->op_flags & XFS_DA_OP_NOTIME))
xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
if (mp->m_flags & XFS_MOUNT_WSYNC)
@@ -336,188 +284,127 @@ xfs_attr_remove_args(
return error;
}
+/*
+ * Note: If args->value is NULL the attribute will be removed, just like the
+ * Linux ->setattr API.
+ */
int
xfs_attr_set(
- struct xfs_inode *dp,
- const unsigned char *name,
- size_t namelen,
- unsigned char *value,
- int valuelen,
- int flags)
+ struct xfs_da_args *args)
{
+ struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
- struct xfs_da_args args;
struct xfs_trans_res tres;
- int rsvd = (flags & ATTR_ROOT) != 0;
+ bool rsvd = (args->attr_filter & XFS_ATTR_ROOT);
int error, local;
-
- XFS_STATS_INC(mp, xs_attr_set);
+ unsigned int total;
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
- error = xfs_attr_args_init(&args, dp, name, namelen, flags);
- if (error)
- return error;
-
- args.value = value;
- args.valuelen = valuelen;
- args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
- args.total = xfs_attr_calc_size(&args, &local);
-
error = xfs_qm_dqattach(dp);
if (error)
return error;
- /*
- * If the inode doesn't have an attribute fork, add one.
- * (inode must not be locked when we call this routine)
- */
- if (XFS_IFORK_Q(dp) == 0) {
- int sf_size = sizeof(xfs_attr_sf_hdr_t) +
- XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
-
- error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
- if (error)
- return error;
- }
-
- tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
- M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
- tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-
- /*
- * Root fork attributes can use reserved data blocks for this
- * operation if necessary
- */
- error = xfs_trans_alloc(mp, &tres, args.total, 0,
- rsvd ? XFS_TRANS_RESERVE : 0, &args.trans);
- if (error)
- return error;
-
- xfs_ilock(dp, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
- rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
- XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto out_trans_cancel;
-
- xfs_trans_ijoin(args.trans, dp, 0);
- error = xfs_attr_set_args(&args);
- if (error)
- goto out_trans_cancel;
- if (!args.trans) {
- /* shortform attribute has already been committed */
- goto out_unlock;
- }
-
- /*
- * If this is a synchronous mount, make sure that the
- * transaction goes to disk before returning to the user.
- */
- if (mp->m_flags & XFS_MOUNT_WSYNC)
- xfs_trans_set_sync(args.trans);
-
- if ((flags & ATTR_KERNOTIME) == 0)
- xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+ args->geo = mp->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
/*
- * Commit the last in the sequence of transactions.
+ * We have no control over the attribute names that userspace passes us
+ * to remove, so we have to allow the name lookup prior to attribute
+ * removal to fail as well.
*/
- xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(args.trans);
-out_unlock:
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return error;
+ args->op_flags = XFS_DA_OP_OKNOENT;
-out_trans_cancel:
- if (args.trans)
- xfs_trans_cancel(args.trans);
- goto out_unlock;
-}
+ if (args->value) {
+ XFS_STATS_INC(mp, xs_attr_set);
-/*
- * Generic handler routine to remove a name from an attribute list.
- * Transitions attribute list from Btree to shortform as necessary.
- */
-int
-xfs_attr_remove(
- struct xfs_inode *dp,
- const unsigned char *name,
- size_t namelen,
- int flags)
-{
- struct xfs_mount *mp = dp->i_mount;
- struct xfs_da_args args;
- int error;
-
- XFS_STATS_INC(mp, xs_attr_remove);
+ args->op_flags |= XFS_DA_OP_ADDNAME;
+ args->total = xfs_attr_calc_size(args, &local);
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return -EIO;
+ /*
+ * If the inode doesn't have an attribute fork, add one.
+ * (inode must not be locked when we call this routine)
+ */
+ if (XFS_IFORK_Q(dp) == 0) {
+ int sf_size = sizeof(struct xfs_attr_sf_hdr) +
+ XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen,
+ args->valuelen);
- error = xfs_attr_args_init(&args, dp, name, namelen, flags);
- if (error)
- return error;
+ error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+ if (error)
+ return error;
+ }
- /*
- * we have no control over the attribute names that userspace passes us
- * to remove, so we have to allow the name lookup prior to attribute
- * removal to fail.
- */
- args.op_flags = XFS_DA_OP_OKNOENT;
+ tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres *
+ args->total;
+ tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ total = args->total;
+ } else {
+ XFS_STATS_INC(mp, xs_attr_remove);
- error = xfs_qm_dqattach(dp);
- if (error)
- return error;
+ tres = M_RES(mp)->tr_attrrm;
+ total = XFS_ATTRRM_SPACE_RES(mp);
+ }
/*
* Root fork attributes can use reserved data blocks for this
* operation if necessary
*/
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm,
- XFS_ATTRRM_SPACE_RES(mp), 0,
- (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0,
- &args.trans);
+ error = xfs_trans_alloc(mp, &tres, total, 0,
+ rsvd ? XFS_TRANS_RESERVE : 0, &args->trans);
if (error)
return error;
xfs_ilock(dp, XFS_ILOCK_EXCL);
- /*
- * No need to make quota reservations here. We expect to release some
- * blocks not allocate in the common case.
- */
- xfs_trans_ijoin(args.trans, dp, 0);
-
- error = xfs_attr_remove_args(&args);
- if (error)
- goto out;
+ xfs_trans_ijoin(args->trans, dp, 0);
+ if (args->value) {
+ unsigned int quota_flags = XFS_QMOPT_RES_REGBLKS;
+
+ if (rsvd)
+ quota_flags |= XFS_QMOPT_FORCE_RES;
+ error = xfs_trans_reserve_quota_nblks(args->trans, dp,
+ args->total, 0, quota_flags);
+ if (error)
+ goto out_trans_cancel;
+ error = xfs_attr_set_args(args);
+ if (error)
+ goto out_trans_cancel;
+ /* shortform attribute has already been committed */
+ if (!args->trans)
+ goto out_unlock;
+ } else {
+ error = xfs_attr_remove_args(args);
+ if (error)
+ goto out_trans_cancel;
+ }
/*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
if (mp->m_flags & XFS_MOUNT_WSYNC)
- xfs_trans_set_sync(args.trans);
+ xfs_trans_set_sync(args->trans);
- if ((flags & ATTR_KERNOTIME) == 0)
- xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+ if (!(args->op_flags & XFS_DA_OP_NOTIME))
+ xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
/*
* Commit the last in the sequence of transactions.
*/
- xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(args.trans);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+ error = xfs_trans_commit(args->trans);
+out_unlock:
xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
return error;
-out:
- if (args.trans)
- xfs_trans_cancel(args.trans);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return error;
+out_trans_cancel:
+ if (args->trans)
+ xfs_trans_cancel(args->trans);
+ goto out_unlock;
}
/*========================================================================
@@ -536,10 +423,10 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
trace_xfs_attr_sf_addname(args);
retval = xfs_attr_shortform_lookup(args);
- if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+ if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
return retval;
- } else if (retval == -EEXIST) {
- if (args->flags & ATTR_CREATE)
+ if (retval == -EEXIST) {
+ if (args->attr_flags & XATTR_CREATE)
return retval;
retval = xfs_attr_shortform_remove(args);
if (retval)
@@ -549,7 +436,7 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
* that the leaf format add routine won't trip over the attr
* not being around.
*/
- args->flags &= ~ATTR_REPLACE;
+ args->attr_flags &= ~XATTR_REPLACE;
}
if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
@@ -602,14 +489,11 @@ xfs_attr_leaf_addname(
* the given flags produce an error or call for an atomic rename.
*/
retval = xfs_attr3_leaf_lookup_int(bp, args);
- if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
- xfs_trans_brelse(args->trans, bp);
- return retval;
- } else if (retval == -EEXIST) {
- if (args->flags & ATTR_CREATE) { /* pure create op */
- xfs_trans_brelse(args->trans, bp);
- return retval;
- }
+ if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
+ goto out_brelse;
+ if (retval == -EEXIST) {
+ if (args->attr_flags & XATTR_CREATE)
+ goto out_brelse;
trace_xfs_attr_leaf_replace(args);
@@ -750,6 +634,9 @@ xfs_attr_leaf_addname(
error = xfs_attr3_leaf_clearflag(args);
}
return error;
+out_brelse:
+ xfs_trans_brelse(args->trans, bp);
+ return retval;
}
/*
@@ -876,10 +763,10 @@ restart:
goto out;
blk = &state->path.blk[ state->path.active-1 ];
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+ if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
goto out;
- } else if (retval == -EEXIST) {
- if (args->flags & ATTR_CREATE)
+ if (retval == -EEXIST) {
+ if (args->attr_flags & XATTR_CREATE)
goto out;
trace_xfs_attr_node_replace(args);
@@ -1011,7 +898,7 @@ restart:
* The INCOMPLETE flag means that we will find the "old"
* attr, not the "new" one.
*/
- args->op_flags |= XFS_DA_OP_INCOMPLETE;
+ args->attr_filter |= XFS_ATTR_INCOMPLETE;
state = xfs_da_state_alloc();
state->args = args;
state->mp = mp;
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 4243b2272642..0d2d05908537 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -21,39 +21,6 @@ struct xfs_attr_list_context;
* as possible so as to fit into the literal area of the inode.
*/
-/*========================================================================
- * External interfaces
- *========================================================================*/
-
-
-#define ATTR_DONTFOLLOW 0x0001 /* -- ignored, from IRIX -- */
-#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */
-#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */
-#define ATTR_SECURE 0x0008 /* use attrs in security namespace */
-#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */
-#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */
-
-#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */
-#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
-
-#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */
-#define ATTR_ALLOC 0x8000 /* [kernel] allocate xattr buffer on demand */
-
-#define ATTR_KERNEL_FLAGS \
- (ATTR_KERNOTIME | ATTR_KERNOVAL | ATTR_INCOMPLETE | ATTR_ALLOC)
-
-#define XFS_ATTR_FLAGS \
- { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \
- { ATTR_ROOT, "ROOT" }, \
- { ATTR_TRUST, "TRUST" }, \
- { ATTR_SECURE, "SECURE" }, \
- { ATTR_CREATE, "CREATE" }, \
- { ATTR_REPLACE, "REPLACE" }, \
- { ATTR_KERNOTIME, "KERNOTIME" }, \
- { ATTR_KERNOVAL, "KERNOVAL" }, \
- { ATTR_INCOMPLETE, "INCOMPLETE" }, \
- { ATTR_ALLOC, "ALLOC" }
-
/*
* The maximum size (into the kernel or returned from the kernel) of an
* attribute value or the buffer used for an attr_list() call. Larger
@@ -62,45 +29,16 @@ struct xfs_attr_list_context;
#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */
/*
- * Define how lists of attribute names are returned to the user from
- * the attr_list() call. A large, 32bit aligned, buffer is passed in
- * along with its size. We put an array of offsets at the top that each
- * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom.
- */
-typedef struct attrlist {
- __s32 al_count; /* number of entries in attrlist */
- __s32 al_more; /* T/F: more attrs (do call again) */
- __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
-} attrlist_t;
-
-/*
- * Show the interesting info about one attribute. This is what the
- * al_offset[i] entry points to.
- */
-typedef struct attrlist_ent { /* data from attr_list() */
- __u32 a_valuelen; /* number bytes in value of attr */
- char a_name[1]; /* attr name (NULL terminated) */
-} attrlist_ent_t;
-
-/*
- * Given a pointer to the (char*) buffer containing the attr_list() result,
- * and an index, return a pointer to the indicated attribute in the buffer.
- */
-#define ATTR_ENTRY(buffer, index) \
- ((attrlist_ent_t *) \
- &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])
-
-/*
* Kernel-internal version of the attrlist cursor.
*/
-typedef struct attrlist_cursor_kern {
+struct xfs_attrlist_cursor_kern {
__u32 hashval; /* hash value of next entry to add */
__u32 blkno; /* block containing entry (suggestion) */
__u32 offset; /* offset in list of equal-hashvals */
__u16 pad1; /* padding to match user-level */
__u8 pad2; /* padding to match user-level */
__u8 initted; /* T/F: cursor has been initialized */
-} attrlist_cursor_kern_t;
+};
/*========================================================================
@@ -112,27 +50,28 @@ typedef struct attrlist_cursor_kern {
typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int,
unsigned char *, int, int);
-typedef struct xfs_attr_list_context {
- struct xfs_trans *tp;
- struct xfs_inode *dp; /* inode */
- struct attrlist_cursor_kern *cursor; /* position in list */
- char *alist; /* output buffer */
+struct xfs_attr_list_context {
+ struct xfs_trans *tp;
+ struct xfs_inode *dp; /* inode */
+ struct xfs_attrlist_cursor_kern cursor; /* position in list */
+ void *buffer; /* output buffer */
/*
* Abort attribute list iteration if non-zero. Can be used to pass
* error values to the xfs_attr_list caller.
*/
- int seen_enough;
+ int seen_enough;
+ bool allow_incomplete;
- ssize_t count; /* num used entries */
- int dupcnt; /* count dup hashvals seen */
- int bufsize; /* total buffer size */
- int firstu; /* first used byte in buffer */
- int flags; /* from VOP call */
- int resynch; /* T/F: resynch with cursor */
- put_listent_func_t put_listent; /* list output fmt function */
- int index; /* index into output buffer */
-} xfs_attr_list_context_t;
+ ssize_t count; /* num used entries */
+ int dupcnt; /* count dup hashvals seen */
+ int bufsize; /* total buffer size */
+ int firstu; /* first used byte in buffer */
+ unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE} */
+ int resynch; /* T/F: resynch with cursor */
+ put_listent_func_t put_listent; /* list output fmt function */
+ int index; /* index into output buffer */
+};
/*========================================================================
@@ -143,21 +82,14 @@ typedef struct xfs_attr_list_context {
* Overall external interface routines.
*/
int xfs_attr_inactive(struct xfs_inode *dp);
-int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *);
-int xfs_attr_list_int(struct xfs_attr_list_context *);
+int xfs_attr_list_ilocked(struct xfs_attr_list_context *);
+int xfs_attr_list(struct xfs_attr_list_context *);
int xfs_inode_hasattr(struct xfs_inode *ip);
-int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
-int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
- size_t namelen, unsigned char **value, int *valuelenp,
- int flags);
-int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
- size_t namelen, unsigned char *value, int valuelen, int flags);
+int xfs_attr_get_ilocked(struct xfs_da_args *args);
+int xfs_attr_get(struct xfs_da_args *args);
+int xfs_attr_set(struct xfs_da_args *args);
int xfs_attr_set_args(struct xfs_da_args *args);
-int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name,
- size_t namelen, int flags);
int xfs_attr_remove_args(struct xfs_da_args *args);
-int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
- int flags, struct attrlist_cursor_kern *cursor);
bool xfs_attr_namecheck(const void *name, size_t length);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index fed537a4353d..863444e2dda7 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -445,14 +445,25 @@ xfs_attr3_leaf_read(
* Namespace helper routines
*========================================================================*/
-/*
- * If namespace bits don't match return 0.
- * If all match then return 1.
- */
-STATIC int
-xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
+static bool
+xfs_attr_match(
+ struct xfs_da_args *args,
+ uint8_t namelen,
+ unsigned char *name,
+ int flags)
{
- return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
+ if (args->namelen != namelen)
+ return false;
+ if (memcmp(args->name, name, namelen) != 0)
+ return false;
+ /*
+ * If we are looking for incomplete entries, show only those, else only
+ * show complete entries.
+ */
+ if (args->attr_filter !=
+ (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE)))
+ return false;
+ return true;
}
static int
@@ -464,7 +475,7 @@ xfs_attr_copy_value(
/*
* No copy if all we have to do is get the length
*/
- if (args->flags & ATTR_KERNOVAL) {
+ if (!args->valuelen) {
args->valuelen = valuelen;
return 0;
}
@@ -477,7 +488,7 @@ xfs_attr_copy_value(
return -ERANGE;
}
- if (args->op_flags & XFS_DA_OP_ALLOCVAL) {
+ if (!args->value) {
args->value = kmem_alloc_large(valuelen, 0);
if (!args->value)
return -ENOMEM;
@@ -526,7 +537,7 @@ xfs_attr_shortform_bytesfit(
int offset;
/* rounded down */
- offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
+ offset = (XFS_LITINO(mp) - bytes) >> 3;
if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) {
minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
@@ -593,8 +604,7 @@ xfs_attr_shortform_bytesfit(
minforkoff = roundup(minforkoff, 8) >> 3;
/* attr fork btree root can have at least this many key/ptr pairs */
- maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
- XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
maxforkoff = maxforkoff >> 3; /* rounded down */
if (offset >= maxforkoff)
@@ -678,15 +688,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
-#ifdef DEBUG
- if (sfe->namelen != args->namelen)
- continue;
- if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, sfe->flags))
- continue;
- ASSERT(0);
-#endif
+ ASSERT(!xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags));
}
offset = (char *)sfe - (char *)sf;
@@ -697,7 +700,7 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
sfe->namelen = args->namelen;
sfe->valuelen = args->valuelen;
- sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+ sfe->flags = args->attr_filter;
memcpy(sfe->nameval, args->name, args->namelen);
memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
sf->hdr.count++;
@@ -749,13 +752,9 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
base += size, i++) {
size = XFS_ATTR_SF_ENTSIZE(sfe);
- if (sfe->namelen != args->namelen)
- continue;
- if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, sfe->flags))
- continue;
- break;
+ if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ break;
}
if (i == end)
return -ENOATTR;
@@ -816,13 +815,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count;
sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
- if (sfe->namelen != args->namelen)
- continue;
- if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, sfe->flags))
- continue;
- return -EEXIST;
+ if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ return -EEXIST;
}
return -ENOATTR;
}
@@ -830,9 +825,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
/*
* Retrieve the attribute value and length.
*
- * If ATTR_KERNOVAL is specified, only the length needs to be returned.
- * Unlike a lookup, we only return an error if the attribute does not
- * exist or we can't retrieve the value.
+ * If args->valuelen is zero, only the length needs to be returned. Unlike a
+ * lookup, we only return an error if the attribute does not exist or we can't
+ * retrieve the value.
*/
int
xfs_attr_shortform_getvalue(
@@ -847,14 +842,10 @@ xfs_attr_shortform_getvalue(
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count;
sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
- if (sfe->namelen != args->namelen)
- continue;
- if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, sfe->flags))
- continue;
- return xfs_attr_copy_value(args, &sfe->nameval[args->namelen],
- sfe->valuelen);
+ if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ return xfs_attr_copy_value(args,
+ &sfe->nameval[args->namelen], sfe->valuelen);
}
return -ENOATTR;
}
@@ -918,7 +909,7 @@ xfs_attr_shortform_to_leaf(
nargs.valuelen = sfe->valuelen;
nargs.hashval = xfs_da_hashname(sfe->nameval,
sfe->namelen);
- nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
+ nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK;
error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
ASSERT(error == -ENOATTR);
error = xfs_attr3_leaf_add(bp, &nargs);
@@ -1124,7 +1115,7 @@ xfs_attr3_leaf_to_shortform(
nargs.value = &name_loc->nameval[nargs.namelen];
nargs.valuelen = be16_to_cpu(name_loc->valuelen);
nargs.hashval = be32_to_cpu(entry->hashval);
- nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
+ nargs.attr_filter = entry->flags & XFS_ATTR_NSP_ONDISK_MASK;
xfs_attr_shortform_add(&nargs, forkoff);
}
error = 0;
@@ -1449,8 +1440,9 @@ xfs_attr3_leaf_add_work(
entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
ichdr->freemap[mapindex].size);
entry->hashval = cpu_to_be32(args->hashval);
- entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
- entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+ entry->flags = args->attr_filter;
+ if (tmp)
+ entry->flags |= XFS_ATTR_LOCAL;
if (args->op_flags & XFS_DA_OP_RENAME) {
entry->flags |= XFS_ATTR_INCOMPLETE;
if ((args->blkno2 == args->blkno) &&
@@ -2346,7 +2338,7 @@ xfs_attr3_leaf_lookup_int(
xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
if (ichdr.count >= args->geo->blksize / 8) {
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
}
@@ -2365,11 +2357,11 @@ xfs_attr3_leaf_lookup_int(
break;
}
if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) {
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
}
if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) {
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
}
@@ -2399,33 +2391,17 @@ xfs_attr3_leaf_lookup_int(
/*
* GROT: Add code to remove incomplete entries.
*/
- /*
- * If we are looking for INCOMPLETE entries, show only those.
- * If we are looking for complete entries, show only those.
- */
- if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) !=
- !!(entry->flags & XFS_ATTR_INCOMPLETE)) {
- continue;
- }
if (entry->flags & XFS_ATTR_LOCAL) {
name_loc = xfs_attr3_leaf_name_local(leaf, probe);
- if (name_loc->namelen != args->namelen)
- continue;
- if (memcmp(args->name, name_loc->nameval,
- args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, entry->flags))
+ if (!xfs_attr_match(args, name_loc->namelen,
+ name_loc->nameval, entry->flags))
continue;
args->index = probe;
return -EEXIST;
} else {
name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
- if (name_rmt->namelen != args->namelen)
- continue;
- if (memcmp(args->name, name_rmt->name,
- args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, entry->flags))
+ if (!xfs_attr_match(args, name_rmt->namelen,
+ name_rmt->name, entry->flags))
continue;
args->index = probe;
args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
@@ -2444,9 +2420,9 @@ xfs_attr3_leaf_lookup_int(
* Get the value associated with an attribute name from a leaf attribute
* list structure.
*
- * If ATTR_KERNOVAL is specified, only the length needs to be returned.
- * Unlike a lookup, we only return an error if the attribute does not
- * exist or we can't retrieve the value.
+ * If args->valuelen is zero, only the length needs to be returned. Unlike a
+ * lookup, we only return an error if the attribute does not exist or we can't
+ * retrieve the value.
*/
int
xfs_attr3_leaf_getvalue(
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 73615b1dd1a8..6dd2d937a42a 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -8,7 +8,6 @@
#define __XFS_ATTR_LEAF_H__
struct attrlist;
-struct attrlist_cursor_kern;
struct xfs_attr_list_context;
struct xfs_da_args;
struct xfs_da_state;
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 8b7f74b3bea2..01ad7f353e08 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -397,7 +397,7 @@ xfs_attr_rmtval_get(
trace_xfs_attr_rmtval_get(args);
- ASSERT(!(args->flags & ATTR_KERNOVAL));
+ ASSERT(args->valuelen != 0);
ASSERT(args->rmtvaluelen == args->valuelen);
valuelen = args->rmtvaluelen;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 9a6d7a84689a..fda13cd7add0 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -193,14 +193,12 @@ xfs_default_attroffset(
struct xfs_mount *mp = ip->i_mount;
uint offset;
- if (mp->m_sb.sb_inodesize == 256) {
- offset = XFS_LITINO(mp, ip->i_d.di_version) -
- XFS_BMDR_SPACE_CALC(MINABTPTRS);
- } else {
+ if (mp->m_sb.sb_inodesize == 256)
+ offset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ else
offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
- }
- ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
+ ASSERT(offset < XFS_LITINO(mp));
return offset;
}
@@ -690,7 +688,7 @@ xfs_bmap_extents_to_btree(
* Need a cursor. Can't allocate until bb_level is filled in.
*/
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+ cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
/*
* Convert to a btree with two levels, one record in root.
*/
@@ -727,7 +725,7 @@ xfs_bmap_extents_to_btree(
ASSERT(tp->t_firstblock == NULLFSBLOCK ||
args.agno >= XFS_FSB_TO_AGNO(mp, tp->t_firstblock));
tp->t_firstblock = args.fsbno;
- cur->bc_private.b.allocated++;
+ cur->bc_ino.allocated++;
ip->i_d.di_nblocks++;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
@@ -953,7 +951,7 @@ xfs_bmap_add_attrfork_btree(
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return -ENOSPC;
}
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
}
return 0;
@@ -980,7 +978,7 @@ xfs_bmap_add_attrfork_extents(
error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags,
XFS_DATA_FORK);
if (cur) {
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
return error;
@@ -1178,13 +1176,13 @@ xfs_iread_bmbt_block(
{
struct xfs_iread_state *ir = priv;
struct xfs_mount *mp = cur->bc_mp;
- struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_inode *ip = cur->bc_ino.ip;
struct xfs_btree_block *block;
struct xfs_buf *bp;
struct xfs_bmbt_rec *frp;
xfs_extnum_t num_recs;
xfs_extnum_t j;
- int whichfork = cur->bc_private.b.whichfork;
+ int whichfork = cur->bc_ino.whichfork;
block = xfs_btree_get_block(cur, level, &bp);
@@ -1528,7 +1526,7 @@ xfs_bmap_add_extent_delay_real(
ASSERT(!isnullstartblock(new->br_startblock));
ASSERT(!bma->cur ||
- (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+ (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -1818,7 +1816,7 @@ xfs_bmap_add_extent_delay_real(
temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
- (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+ (bma->cur ? bma->cur->bc_ino.allocated : 0));
PREV.br_startoff = new_endoff;
PREV.br_blockcount = temp;
@@ -1904,7 +1902,7 @@ xfs_bmap_add_extent_delay_real(
temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
- (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+ (bma->cur ? bma->cur->bc_ino.allocated : 0));
PREV.br_startblock = nullstartblock(da_new);
PREV.br_blockcount = temp;
@@ -2025,8 +2023,8 @@ xfs_bmap_add_extent_delay_real(
xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
if (bma->cur) {
- da_new += bma->cur->bc_private.b.allocated;
- bma->cur->bc_private.b.allocated = 0;
+ da_new += bma->cur->bc_ino.allocated;
+ bma->cur->bc_ino.allocated = 0;
}
/* adjust for changes in reserved delayed indirect blocks */
@@ -2573,7 +2571,7 @@ xfs_bmap_add_extent_unwritten_real(
/* clear out the allocated field, done with it now in any case. */
if (cur) {
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
*curp = cur;
}
@@ -2752,7 +2750,7 @@ xfs_bmap_add_extent_hole_real(
struct xfs_bmbt_irec old;
ASSERT(!isnullstartblock(new->br_startblock));
- ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+ ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -2955,7 +2953,7 @@ xfs_bmap_add_extent_hole_real(
/* clear out the allocated field, done with it now in any case. */
if (cur)
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_bmap_check_leaf_extents(cur, ip, whichfork);
done:
@@ -4187,8 +4185,8 @@ xfs_bmapi_allocate(
bma->nallocs++;
if (bma->cur)
- bma->cur->bc_private.b.flags =
- bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+ bma->cur->bc_ino.flags =
+ bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
bma->got.br_startoff = bma->offset;
bma->got.br_startblock = bma->blkno;
@@ -4709,7 +4707,7 @@ xfs_bmapi_remap(
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
}
got.br_startoff = bno;
@@ -5364,7 +5362,7 @@ __xfs_bunmapi(
if (ifp->if_flags & XFS_IFBROOT) {
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
} else
cur = NULL;
@@ -5620,7 +5618,7 @@ error0:
xfs_trans_log_inode(tp, ip, logflags);
if (cur) {
if (!error)
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
return error;
@@ -5839,7 +5837,7 @@ xfs_bmap_collapse_extents(
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
}
if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
@@ -5956,7 +5954,7 @@ xfs_bmap_insert_extents(
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
}
if (*next_fsb == NULLFSBLOCK) {
@@ -6025,8 +6023,8 @@ del_cursor:
* @split_fsb is a block where the extents is split. If split_fsb lies in a
* hole or the first block of extents, just return 0.
*/
-STATIC int
-xfs_bmap_split_extent_at(
+int
+xfs_bmap_split_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t split_fsb)
@@ -6074,7 +6072,7 @@ xfs_bmap_split_extent_at(
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
goto del_cursor;
@@ -6133,7 +6131,7 @@ xfs_bmap_split_extent_at(
del_cursor:
if (cur) {
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
@@ -6142,34 +6140,6 @@ del_cursor:
return error;
}
-int
-xfs_bmap_split_extent(
- struct xfs_inode *ip,
- xfs_fileoff_t split_fsb)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
- XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
- error = xfs_bmap_split_extent_at(tp, ip, split_fsb);
- if (error)
- goto out;
-
- return xfs_trans_commit(tp);
-
-out:
- xfs_trans_cancel(tp);
- return error;
-}
-
/* Deferred mapping is only for real extents in the data fork. */
static bool
xfs_bmap_is_update_needed(
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 14d25e0b7d9c..f3259ad5c22c 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -222,7 +222,8 @@ int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off,
int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
bool *done, xfs_fileoff_t stop_fsb);
-int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
+int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t split_offset);
int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index ffe608d2a2d9..295a59cf8840 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -166,13 +166,13 @@ xfs_bmbt_dup_cursor(
struct xfs_btree_cur *new;
new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+ cur->bc_ino.ip, cur->bc_ino.whichfork);
/*
* Copy the firstblock, dfops, and flags values,
* since init cursor doesn't get them.
*/
- new->bc_private.b.flags = cur->bc_private.b.flags;
+ new->bc_ino.flags = cur->bc_ino.flags;
return new;
}
@@ -183,12 +183,12 @@ xfs_bmbt_update_cursor(
struct xfs_btree_cur *dst)
{
ASSERT((dst->bc_tp->t_firstblock != NULLFSBLOCK) ||
- (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
+ (dst->bc_ino.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
- dst->bc_private.b.allocated += src->bc_private.b.allocated;
+ dst->bc_ino.allocated += src->bc_ino.allocated;
dst->bc_tp->t_firstblock = src->bc_tp->t_firstblock;
- src->bc_private.b.allocated = 0;
+ src->bc_ino.allocated = 0;
}
STATIC int
@@ -205,8 +205,8 @@ xfs_bmbt_alloc_block(
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
args.fsbno = cur->bc_tp->t_firstblock;
- xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino,
- cur->bc_private.b.whichfork);
+ xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino,
+ cur->bc_ino.whichfork);
if (args.fsbno == NULLFSBLOCK) {
args.fsbno = be64_to_cpu(start->l);
@@ -230,7 +230,7 @@ xfs_bmbt_alloc_block(
}
args.minlen = args.maxlen = args.prod = 1;
- args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+ args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL;
if (!args.wasdel && args.tp->t_blk_res == 0) {
error = -ENOSPC;
goto error0;
@@ -259,10 +259,10 @@ xfs_bmbt_alloc_block(
ASSERT(args.len == 1);
cur->bc_tp->t_firstblock = args.fsbno;
- cur->bc_private.b.allocated++;
- cur->bc_private.b.ip->i_d.di_nblocks++;
- xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
- xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
+ cur->bc_ino.allocated++;
+ cur->bc_ino.ip->i_d.di_nblocks++;
+ xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE);
+ xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip,
XFS_TRANS_DQ_BCOUNT, 1L);
new->l = cpu_to_be64(args.fsbno);
@@ -280,12 +280,12 @@ xfs_bmbt_free_block(
struct xfs_buf *bp)
{
struct xfs_mount *mp = cur->bc_mp;
- struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_inode *ip = cur->bc_ino.ip;
struct xfs_trans *tp = cur->bc_tp;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
struct xfs_owner_info oinfo;
- xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_private.b.whichfork);
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
xfs_bmap_add_free(cur->bc_tp, fsbno, 1, &oinfo);
ip->i_d.di_nblocks--;
@@ -302,8 +302,8 @@ xfs_bmbt_get_minrecs(
if (level == cur->bc_nlevels - 1) {
struct xfs_ifork *ifp;
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
- cur->bc_private.b.whichfork);
+ ifp = XFS_IFORK_PTR(cur->bc_ino.ip,
+ cur->bc_ino.whichfork);
return xfs_bmbt_maxrecs(cur->bc_mp,
ifp->if_broot_bytes, level == 0) / 2;
@@ -320,8 +320,8 @@ xfs_bmbt_get_maxrecs(
if (level == cur->bc_nlevels - 1) {
struct xfs_ifork *ifp;
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
- cur->bc_private.b.whichfork);
+ ifp = XFS_IFORK_PTR(cur->bc_ino.ip,
+ cur->bc_ino.whichfork);
return xfs_bmbt_maxrecs(cur->bc_mp,
ifp->if_broot_bytes, level == 0);
@@ -347,7 +347,7 @@ xfs_bmbt_get_dmaxrecs(
{
if (level != cur->bc_nlevels - 1)
return cur->bc_mp->m_bmap_dmxr[level != 0];
- return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0);
+ return xfs_bmdr_maxrecs(cur->bc_ino.forksize, level == 0);
}
STATIC void
@@ -566,11 +566,11 @@ xfs_bmbt_init_cursor(
if (xfs_sb_version_hascrc(&mp->m_sb))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
- cur->bc_private.b.ip = ip;
- cur->bc_private.b.allocated = 0;
- cur->bc_private.b.flags = 0;
- cur->bc_private.b.whichfork = whichfork;
+ cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork);
+ cur->bc_ino.ip = ip;
+ cur->bc_ino.allocated = 0;
+ cur->bc_ino.flags = 0;
+ cur->bc_ino.whichfork = whichfork;
return cur;
}
@@ -644,7 +644,7 @@ xfs_bmbt_change_owner(
cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
if (!cur)
return -ENOMEM;
- cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER;
+ cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER;
error = xfs_btree_change_owner(cur, new_owner, buffer_list);
xfs_btree_del_cursor(cur, error);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index fd300dc93ca4..2d25bab68764 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -20,6 +20,7 @@
#include "xfs_trace.h"
#include "xfs_alloc.h"
#include "xfs_log.h"
+#include "xfs_btree_staging.h"
/*
* Cursor allocation zone.
@@ -214,7 +215,7 @@ xfs_btree_check_sptr(
{
if (level <= 0)
return false;
- return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno);
+ return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.agno, agbno);
}
/*
@@ -234,8 +235,8 @@ xfs_btree_check_ptr(
return 0;
xfs_err(cur->bc_mp,
"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
- cur->bc_private.b.ip->i_ino,
- cur->bc_private.b.whichfork, cur->bc_btnum,
+ cur->bc_ino.ip->i_ino,
+ cur->bc_ino.whichfork, cur->bc_btnum,
level, index);
} else {
if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
@@ -243,7 +244,7 @@ xfs_btree_check_ptr(
return 0;
xfs_err(cur->bc_mp,
"AG %u: Corrupt btree %d pointer at level %d index %d.",
- cur->bc_private.a.agno, cur->bc_btnum,
+ cur->bc_ag.agno, cur->bc_btnum,
level, index);
}
@@ -378,10 +379,12 @@ xfs_btree_del_cursor(
* allocated indirect blocks' accounting.
*/
ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
- cur->bc_private.b.allocated == 0);
+ cur->bc_ino.allocated == 0);
/*
* Free the cursor.
*/
+ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
+ kmem_free((void *)cur->bc_ops);
kmem_cache_free(xfs_btree_cur_zone, cur);
}
@@ -642,6 +645,17 @@ xfs_btree_ptr_addr(
((char *)block + xfs_btree_ptr_offset(cur, n, level));
}
+struct xfs_ifork *
+xfs_btree_ifork_ptr(
+ struct xfs_btree_cur *cur)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+ if (cur->bc_flags & XFS_BTREE_STAGING)
+ return cur->bc_ino.ifake->if_fork;
+ return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork);
+}
+
/*
* Get the root block which is stored in the inode.
*
@@ -652,9 +666,8 @@ STATIC struct xfs_btree_block *
xfs_btree_get_iroot(
struct xfs_btree_cur *cur)
{
- struct xfs_ifork *ifp;
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
return (struct xfs_btree_block *)ifp->if_broot;
}
@@ -881,13 +894,13 @@ xfs_btree_readahead_sblock(
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno,
left, 1, cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno,
right, 1, cur->bc_ops->buf_ops);
rval++;
}
@@ -945,7 +958,7 @@ xfs_btree_ptr_to_daddr(
*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
} else {
agbno = be32_to_cpu(ptr->s);
- *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+ *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.agno,
agbno);
}
@@ -1014,7 +1027,7 @@ xfs_btree_ptr_is_null(
return ptr->s == cpu_to_be32(NULLAGBLOCK);
}
-STATIC void
+void
xfs_btree_set_ptr_null(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
@@ -1050,7 +1063,7 @@ xfs_btree_get_sibling(
}
}
-STATIC void
+void
xfs_btree_set_sibling(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
@@ -1128,7 +1141,7 @@ xfs_btree_init_block(
btnum, level, numrecs, owner, 0);
}
-STATIC void
+void
xfs_btree_init_block_cur(
struct xfs_btree_cur *cur,
struct xfs_buf *bp,
@@ -1144,9 +1157,9 @@ xfs_btree_init_block_cur(
* code.
*/
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- owner = cur->bc_private.b.ip->i_ino;
+ owner = cur->bc_ino.ip->i_ino;
else
- owner = cur->bc_private.a.agno;
+ owner = cur->bc_ag.agno;
xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
cur->bc_btnum, level, numrecs,
@@ -1220,7 +1233,7 @@ xfs_btree_set_refs(
}
}
-STATIC int
+int
xfs_btree_get_buf_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr,
@@ -1280,7 +1293,7 @@ xfs_btree_read_buf_block(
/*
* Copy keys from one btree block to another.
*/
-STATIC void
+void
xfs_btree_copy_keys(
struct xfs_btree_cur *cur,
union xfs_btree_key *dst_key,
@@ -1308,11 +1321,11 @@ xfs_btree_copy_recs(
/*
* Copy block pointers from one btree block to another.
*/
-STATIC void
+void
xfs_btree_copy_ptrs(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *dst_ptr,
- union xfs_btree_ptr *src_ptr,
+ const union xfs_btree_ptr *src_ptr,
int numptrs)
{
ASSERT(numptrs >= 0);
@@ -1393,8 +1406,8 @@ xfs_btree_log_keys(
xfs_btree_key_offset(cur, first),
xfs_btree_key_offset(cur, last + 1) - 1);
} else {
- xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
- xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
}
}
@@ -1436,8 +1449,8 @@ xfs_btree_log_ptrs(
xfs_btree_ptr_offset(cur, first, level),
xfs_btree_ptr_offset(cur, last + 1, level) - 1);
} else {
- xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
- xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
}
}
@@ -1505,8 +1518,8 @@ xfs_btree_log_block(
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp, first, last);
} else {
- xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
- xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
}
}
@@ -1743,10 +1756,10 @@ xfs_btree_lookup_get_block(
/* Check the inode owner since the verifiers don't. */
if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
- !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
+ !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
(cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
- cur->bc_private.b.ip->i_ino)
+ cur->bc_ino.ip->i_ino)
goto out_bad;
/* Did we get the level we were looking for? */
@@ -1762,7 +1775,7 @@ xfs_btree_lookup_get_block(
out_bad:
*blkp = NULL;
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(cur->bc_tp, bp);
return -EFSCORRUPTED;
}
@@ -2938,9 +2951,9 @@ xfs_btree_new_iroot(
xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
- xfs_iroot_realloc(cur->bc_private.b.ip,
+ xfs_iroot_realloc(cur->bc_ino.ip,
1 - xfs_btree_get_numrecs(cblock),
- cur->bc_private.b.whichfork);
+ cur->bc_ino.whichfork);
xfs_btree_setbuf(cur, level, cbp);
@@ -2953,7 +2966,7 @@ xfs_btree_new_iroot(
xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
*logflags |=
- XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
+ XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
*stat = 1;
return 0;
error0:
@@ -3105,11 +3118,11 @@ xfs_btree_make_block_unfull(
if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
level == cur->bc_nlevels - 1) {
- struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_inode *ip = cur->bc_ino.ip;
if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
/* A root block that can be made bigger. */
- xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+ xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
*stat = 1;
} else {
/* A root block that needs replacing */
@@ -3455,8 +3468,8 @@ STATIC int
xfs_btree_kill_iroot(
struct xfs_btree_cur *cur)
{
- int whichfork = cur->bc_private.b.whichfork;
- struct xfs_inode *ip = cur->bc_private.b.ip;
+ int whichfork = cur->bc_ino.whichfork;
+ struct xfs_inode *ip = cur->bc_ino.ip;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_btree_block *block;
struct xfs_btree_block *cblock;
@@ -3514,8 +3527,8 @@ xfs_btree_kill_iroot(
index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
if (index) {
- xfs_iroot_realloc(cur->bc_private.b.ip, index,
- cur->bc_private.b.whichfork);
+ xfs_iroot_realloc(cur->bc_ino.ip, index,
+ cur->bc_ino.whichfork);
block = ifp->if_broot;
}
@@ -3544,7 +3557,7 @@ xfs_btree_kill_iroot(
cur->bc_bufs[level - 1] = NULL;
be16_add_cpu(&block->bb_level, -1);
xfs_trans_log_inode(cur->bc_tp, ip,
- XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
cur->bc_nlevels--;
out0:
return 0;
@@ -3712,8 +3725,8 @@ xfs_btree_delrec(
*/
if (level == cur->bc_nlevels - 1) {
if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
- xfs_iroot_realloc(cur->bc_private.b.ip, -1,
- cur->bc_private.b.whichfork);
+ xfs_iroot_realloc(cur->bc_ino.ip, -1,
+ cur->bc_ino.whichfork);
error = xfs_btree_kill_iroot(cur);
if (error)
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 3eff7c321d43..8626c5a81aad 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -10,6 +10,7 @@ struct xfs_buf;
struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
+struct xfs_ifork;
extern kmem_zone_t *xfs_btree_cur_zone;
@@ -177,15 +178,37 @@ union xfs_btree_irec {
struct xfs_refcount_irec rc;
};
-/* Per-AG btree private information. */
-union xfs_btree_cur_private {
- struct {
- unsigned long nr_ops; /* # record updates */
- int shape_changes; /* # of extent splits */
- } refc;
- struct {
- bool active; /* allocation cursor state */
- } abt;
+/* Per-AG btree information. */
+struct xfs_btree_cur_ag {
+ union {
+ struct xfs_buf *agbp;
+ struct xbtree_afakeroot *afake; /* for staging cursor */
+ };
+ xfs_agnumber_t agno;
+ union {
+ struct {
+ unsigned long nr_ops; /* # record updates */
+ int shape_changes; /* # of extent splits */
+ } refc;
+ struct {
+ bool active; /* allocation cursor state */
+ } abt;
+ };
+};
+
+/* Btree-in-inode cursor information */
+struct xfs_btree_cur_ino {
+ struct xfs_inode *ip;
+ struct xbtree_ifakeroot *ifake; /* for staging cursor */
+ int allocated;
+ short forksize;
+ char whichfork;
+ char flags;
+/* We are converting a delalloc reservation */
+#define XFS_BTCUR_BMBT_WASDEL (1 << 0)
+
+/* For extent swap, ignore owner check in verifier */
+#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1)
};
/*
@@ -209,21 +232,9 @@ typedef struct xfs_btree_cur
xfs_btnum_t bc_btnum; /* identifies which btree type */
int bc_statoff; /* offset of btre stats array */
union {
- struct { /* needed for BNO, CNT, INO */
- struct xfs_buf *agbp; /* agf/agi buffer pointer */
- xfs_agnumber_t agno; /* ag number */
- union xfs_btree_cur_private priv;
- } a;
- struct { /* needed for BMAP */
- struct xfs_inode *ip; /* pointer to our inode */
- int allocated; /* count of alloced */
- short forksize; /* fork's inode space */
- char whichfork; /* data or attr fork */
- char flags; /* flags */
-#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */
-#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */
- } b;
- } bc_private; /* per-btree type data */
+ struct xfs_btree_cur_ag bc_ag;
+ struct xfs_btree_cur_ino bc_ino;
+ };
} xfs_btree_cur_t;
/* cursor flags */
@@ -232,6 +243,12 @@ typedef struct xfs_btree_cur
#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */
+/*
+ * The root of this btree is a fakeroot structure so that we can stage a btree
+ * rebuild without leaving it accessible via primary metadata. The ops struct
+ * is dynamically allocated and must be freed when the cursor is deleted.
+ */
+#define XFS_BTREE_STAGING (1<<5)
#define XFS_BTREE_NOERROR 0
@@ -494,6 +511,7 @@ union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
union xfs_btree_irec *high, bool *exists);
bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
+struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur);
/* Does this cursor point to the last block in the given level? */
static inline bool
@@ -512,4 +530,20 @@ xfs_btree_islastblock(
return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
}
+void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
+int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr,
+ struct xfs_btree_block **block, struct xfs_buf **bpp);
+void xfs_btree_set_sibling(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, union xfs_btree_ptr *ptr,
+ int lr);
+void xfs_btree_init_block_cur(struct xfs_btree_cur *cur,
+ struct xfs_buf *bp, int level, int numrecs);
+void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *dst_ptr,
+ const union xfs_btree_ptr *src_ptr, int numptrs);
+void xfs_btree_copy_keys(struct xfs_btree_cur *cur,
+ union xfs_btree_key *dst_key, union xfs_btree_key *src_key,
+ int numkeys);
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
new file mode 100644
index 000000000000..f464a7c7cf22
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -0,0 +1,879 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_btree.h"
+#include "xfs_trace.h"
+#include "xfs_btree_staging.h"
+
+/*
+ * Staging Cursors and Fake Roots for Btrees
+ * =========================================
+ *
+ * A staging btree cursor is a special type of btree cursor that callers must
+ * use to construct a new btree index using the btree bulk loader code. The
+ * bulk loading code uses the staging btree cursor to abstract the details of
+ * initializing new btree blocks and filling them with records or key/ptr
+ * pairs. Regular btree operations (e.g. queries and modifications) are not
+ * supported with staging cursors, and callers must not invoke them.
+ *
+ * Fake root structures contain all the information about a btree that is under
+ * construction by the bulk loading code. Staging btree cursors point to fake
+ * root structures instead of the usual AG header or inode structure.
+ *
+ * Callers are expected to initialize a fake root structure and pass it into
+ * the _stage_cursor function for a specific btree type. When bulk loading is
+ * complete, callers should call the _commit_staged_btree function for that
+ * specific btree type to commit the new btree into the filesystem.
+ */
+
+/*
+ * Don't allow staging cursors to be duplicated because they're supposed to be
+ * kept private to a single thread.
+ */
+STATIC struct xfs_btree_cur *
+xfs_btree_fakeroot_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ ASSERT(0);
+ return NULL;
+}
+
+/*
+ * Don't allow block allocation for a staging cursor, because staging cursors
+ * do not support regular btree modifications.
+ *
+ * Bulk loading uses a separate callback to obtain new blocks from a
+ * preallocated list, which prevents ENOSPC failures during loading.
+ */
+STATIC int
+xfs_btree_fakeroot_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start_bno,
+ union xfs_btree_ptr *new_bno,
+ int *stat)
+{
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Don't allow block freeing for a staging cursor, because staging cursors
+ * do not support regular btree modifications.
+ */
+STATIC int
+xfs_btree_fakeroot_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+/* Initialize a pointer to the root block from the fakeroot. */
+STATIC void
+xfs_btree_fakeroot_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xbtree_afakeroot *afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ afake = cur->bc_ag.afake;
+ ptr->s = cpu_to_be32(afake->af_root);
+}
+
+/*
+ * Bulk Loading for AG Btrees
+ * ==========================
+ *
+ * For a btree rooted in an AG header, pass a xbtree_afakeroot structure to the
+ * staging cursor. Callers should initialize this to zero.
+ *
+ * The _stage_cursor() function for a specific btree type should call
+ * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
+ * cursor. The corresponding _commit_staged_btree() function should log the
+ * new root and call xfs_btree_commit_afakeroot() to transform the staging
+ * cursor into a regular btree cursor.
+ */
+
+/* Update the btree root information for a per-AG fake root. */
+STATIC void
+xfs_btree_afakeroot_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ afake->af_root = be32_to_cpu(ptr->s);
+ afake->af_levels += inc;
+}
+
+/*
+ * Initialize a AG-rooted btree cursor with the given AG btree fake root.
+ * The btree cursor's bc_ops will be overridden as needed to make the staging
+ * functionality work.
+ */
+void
+xfs_btree_stage_afakeroot(
+ struct xfs_btree_cur *cur,
+ struct xbtree_afakeroot *afake)
+{
+ struct xfs_btree_ops *nops;
+
+ ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+ ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
+ ASSERT(cur->bc_tp == NULL);
+
+ nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+ memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
+ nops->alloc_block = xfs_btree_fakeroot_alloc_block;
+ nops->free_block = xfs_btree_fakeroot_free_block;
+ nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+ nops->set_root = xfs_btree_afakeroot_set_root;
+ nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+ cur->bc_ag.afake = afake;
+ cur->bc_nlevels = afake->af_levels;
+ cur->bc_ops = nops;
+ cur->bc_flags |= XFS_BTREE_STAGING;
+}
+
+/*
+ * Transform an AG-rooted staging btree cursor back into a regular cursor by
+ * substituting a real btree root for the fake one and restoring normal btree
+ * cursor ops. The caller must log the btree root change prior to calling
+ * this.
+ */
+void
+xfs_btree_commit_afakeroot(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ const struct xfs_btree_ops *ops)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(cur->bc_tp == NULL);
+
+ trace_xfs_btree_commit_afakeroot(cur);
+
+ kmem_free((void *)cur->bc_ops);
+ cur->bc_ag.agbp = agbp;
+ cur->bc_ops = ops;
+ cur->bc_flags &= ~XFS_BTREE_STAGING;
+ cur->bc_tp = tp;
+}
+
+/*
+ * Bulk Loading for Inode-Rooted Btrees
+ * ====================================
+ *
+ * For a btree rooted in an inode fork, pass a xbtree_ifakeroot structure to
+ * the staging cursor. This structure should be initialized as follows:
+ *
+ * - if_fork_size field should be set to the number of bytes available to the
+ * fork in the inode.
+ *
+ * - if_fork should point to a freshly allocated struct xfs_ifork.
+ *
+ * - if_format should be set to the appropriate fork type (e.g.
+ * XFS_DINODE_FMT_BTREE).
+ *
+ * All other fields must be zero.
+ *
+ * The _stage_cursor() function for a specific btree type should call
+ * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
+ * cursor. The corresponding _commit_staged_btree() function should log the
+ * new root and call xfs_btree_commit_ifakeroot() to transform the staging
+ * cursor into a regular btree cursor.
+ */
+
+/*
+ * Initialize an inode-rooted btree cursor with the given inode btree fake
+ * root. The btree cursor's bc_ops will be overridden as needed to make the
+ * staging functionality work. If new_ops is not NULL, these new ops will be
+ * passed out to the caller for further overriding.
+ */
+void
+xfs_btree_stage_ifakeroot(
+ struct xfs_btree_cur *cur,
+ struct xbtree_ifakeroot *ifake,
+ struct xfs_btree_ops **new_ops)
+{
+ struct xfs_btree_ops *nops;
+
+ ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_tp == NULL);
+
+ nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+ memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
+ nops->alloc_block = xfs_btree_fakeroot_alloc_block;
+ nops->free_block = xfs_btree_fakeroot_free_block;
+ nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+ nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+ cur->bc_ino.ifake = ifake;
+ cur->bc_nlevels = ifake->if_levels;
+ cur->bc_ops = nops;
+ cur->bc_flags |= XFS_BTREE_STAGING;
+
+ if (new_ops)
+ *new_ops = nops;
+}
+
+/*
+ * Transform an inode-rooted staging btree cursor back into a regular cursor by
+ * substituting a real btree root for the fake one and restoring normal btree
+ * cursor ops. The caller must log the btree root change prior to calling
+ * this.
+ */
+void
+xfs_btree_commit_ifakeroot(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ int whichfork,
+ const struct xfs_btree_ops *ops)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(cur->bc_tp == NULL);
+
+ trace_xfs_btree_commit_ifakeroot(cur);
+
+ kmem_free((void *)cur->bc_ops);
+ cur->bc_ino.ifake = NULL;
+ cur->bc_ino.whichfork = whichfork;
+ cur->bc_ops = ops;
+ cur->bc_flags &= ~XFS_BTREE_STAGING;
+ cur->bc_tp = tp;
+}
+
+/*
+ * Bulk Loading of Staged Btrees
+ * =============================
+ *
+ * This interface is used with a staged btree cursor to create a totally new
+ * btree with a large number of records (i.e. more than what would fit in a
+ * single root block). When the creation is complete, the new root can be
+ * linked atomically into the filesystem by committing the staged cursor.
+ *
+ * Creation of a new btree proceeds roughly as follows:
+ *
+ * The first step is to initialize an appropriate fake btree root structure and
+ * then construct a staged btree cursor. Refer to the block comments about
+ * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
+ * more information about how to do this.
+ *
+ * The second step is to initialize a struct xfs_btree_bload context as
+ * documented in the structure definition.
+ *
+ * The third step is to call xfs_btree_bload_compute_geometry to compute the
+ * height of and the number of blocks needed to construct the btree. See the
+ * section "Computing the Geometry of the New Btree" for details about this
+ * computation.
+ *
+ * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
+ * save them for later use by ->claim_block(). Bulk loading requires all
+ * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
+ * rebuild, and to minimize seek distances of the new btree.
+ *
+ * Step five is to call xfs_btree_bload() to start constructing the btree.
+ *
+ * The final step is to commit the staging btree cursor, which logs the new
+ * btree root and turns the staging cursor into a regular cursor. The caller
+ * is responsible for cleaning up the previous btree blocks, if any.
+ *
+ * Computing the Geometry of the New Btree
+ * =======================================
+ *
+ * The number of items placed in each btree block is computed via the following
+ * algorithm: For leaf levels, the number of items for the level is nr_records
+ * in the bload structure. For node levels, the number of items for the level
+ * is the number of blocks in the next lower level of the tree. For each
+ * level, the desired number of items per block is defined as:
+ *
+ * desired = max(minrecs, maxrecs - slack factor)
+ *
+ * The number of blocks for the level is defined to be:
+ *
+ * blocks = floor(nr_items / desired)
+ *
+ * Note this is rounded down so that the npb calculation below will never fall
+ * below minrecs. The number of items that will actually be loaded into each
+ * btree block is defined as:
+ *
+ * npb = nr_items / blocks
+ *
+ * Some of the leftmost blocks in the level will contain one extra record as
+ * needed to handle uneven division. If the number of records in any block
+ * would exceed maxrecs for that level, blocks is incremented and npb is
+ * recalculated.
+ *
+ * In other words, we compute the number of blocks needed to satisfy a given
+ * loading level, then spread the items as evenly as possible.
+ *
+ * The height and number of fs blocks required to create the btree are computed
+ * and returned via btree_height and nr_blocks.
+ */
+
+/*
+ * Put a btree block that we're loading onto the ordered list and release it.
+ * The btree blocks will be written to disk when bulk loading is finished.
+ */
+static void
+xfs_btree_bload_drop_buf(
+ struct list_head *buffers_list,
+ struct xfs_buf **bpp)
+{
+ if (*bpp == NULL)
+ return;
+
+ if (!xfs_buf_delwri_queue(*bpp, buffers_list))
+ ASSERT(0);
+
+ xfs_buf_relse(*bpp);
+ *bpp = NULL;
+}
+
+/*
+ * Allocate and initialize one btree block for bulk loading.
+ *
+ * The new btree block will have its level and numrecs fields set to the values
+ * of the level and nr_this_block parameters, respectively.
+ *
+ * The caller should ensure that ptrp, bpp, and blockp refer to the left
+ * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp
+ * will all point to the new block.
+ */
+STATIC int
+xfs_btree_bload_prep_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ struct list_head *buffers_list,
+ unsigned int level,
+ unsigned int nr_this_block,
+ union xfs_btree_ptr *ptrp, /* in/out */
+ struct xfs_buf **bpp, /* in/out */
+ struct xfs_btree_block **blockp, /* in/out */
+ void *priv)
+{
+ union xfs_btree_ptr new_ptr;
+ struct xfs_buf *new_bp;
+ struct xfs_btree_block *new_block;
+ int ret;
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+ size_t new_size;
+
+ ASSERT(*bpp == NULL);
+
+ /* Allocate a new incore btree root block. */
+ new_size = bbl->iroot_size(cur, nr_this_block, priv);
+ ifp->if_broot = kmem_zalloc(new_size, 0);
+ ifp->if_broot_bytes = (int)new_size;
+ ifp->if_flags |= XFS_IFBROOT;
+
+ /* Initialize it and send it out. */
+ xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
+ XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
+ nr_this_block, cur->bc_ino.ip->i_ino,
+ cur->bc_flags);
+
+ *bpp = NULL;
+ *blockp = ifp->if_broot;
+ xfs_btree_set_ptr_null(cur, ptrp);
+ return 0;
+ }
+
+ /* Claim one of the caller's preallocated blocks. */
+ xfs_btree_set_ptr_null(cur, &new_ptr);
+ ret = bbl->claim_block(cur, &new_ptr, priv);
+ if (ret)
+ return ret;
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));
+
+ ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
+ if (ret)
+ return ret;
+
+ /*
+ * The previous block (if any) is the left sibling of the new block,
+ * so set its right sibling pointer to the new block and drop it.
+ */
+ if (*blockp)
+ xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
+ xfs_btree_bload_drop_buf(buffers_list, bpp);
+
+ /* Initialize the new btree block. */
+ xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
+ xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);
+
+ /* Set the out parameters. */
+ *bpp = new_bp;
+ *blockp = new_block;
+ xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
+ return 0;
+}
+
+/* Load one leaf block. */
+STATIC int
+xfs_btree_bload_leaf(
+ struct xfs_btree_cur *cur,
+ unsigned int recs_this_block,
+ xfs_btree_bload_get_record_fn get_record,
+ struct xfs_btree_block *block,
+ void *priv)
+{
+ unsigned int j;
+ int ret;
+
+ /* Fill the leaf block with records. */
+ for (j = 1; j <= recs_this_block; j++) {
+ union xfs_btree_rec *block_rec;
+
+ ret = get_record(cur, priv);
+ if (ret)
+ return ret;
+ block_rec = xfs_btree_rec_addr(cur, j, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return 0;
+}
+
+/*
+ * Load one node block with key/ptr pairs.
+ *
+ * child_ptr must point to a block within the next level down in the tree. A
+ * key/ptr entry will be created in the new node block to the block pointed to
+ * by child_ptr. On exit, child_ptr points to the next block on the child
+ * level that needs processing.
+ */
+STATIC int
+xfs_btree_bload_node(
+ struct xfs_btree_cur *cur,
+ unsigned int recs_this_block,
+ union xfs_btree_ptr *child_ptr,
+ struct xfs_btree_block *block)
+{
+ unsigned int j;
+ int ret;
+
+ /* Fill the node block with keys and pointers. */
+ for (j = 1; j <= recs_this_block; j++) {
+ union xfs_btree_key child_key;
+ union xfs_btree_ptr *block_ptr;
+ union xfs_btree_key *block_key;
+ struct xfs_btree_block *child_block;
+ struct xfs_buf *child_bp;
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));
+
+ ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block,
+ &child_bp);
+ if (ret)
+ return ret;
+
+ block_ptr = xfs_btree_ptr_addr(cur, j, block);
+ xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);
+
+ block_key = xfs_btree_key_addr(cur, j, block);
+ xfs_btree_get_keys(cur, child_block, &child_key);
+ xfs_btree_copy_keys(cur, block_key, &child_key, 1);
+
+ xfs_btree_get_sibling(cur, child_block, child_ptr,
+ XFS_BB_RIGHTSIB);
+ xfs_buf_relse(child_bp);
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the maximum number of records (or keyptrs) per block that we want to
+ * install at this level in the btree. Caller is responsible for having set
+ * @cur->bc_ino.forksize to the desired fork size, if appropriate.
+ */
+STATIC unsigned int
+xfs_btree_bload_max_npb(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level)
+{
+ unsigned int ret;
+
+ if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
+ return cur->bc_ops->get_dmaxrecs(cur, level);
+
+ ret = cur->bc_ops->get_maxrecs(cur, level);
+ if (level == 0)
+ ret -= bbl->leaf_slack;
+ else
+ ret -= bbl->node_slack;
+ return ret;
+}
+
+/*
+ * Compute the desired number of records (or keyptrs) per block that we want to
+ * install at this level in the btree, which must be somewhere between minrecs
+ * and max_npb. The caller is free to install fewer records per block.
+ */
+STATIC unsigned int
+xfs_btree_bload_desired_npb(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level)
+{
+ unsigned int npb = xfs_btree_bload_max_npb(cur, bbl, level);
+
+ /* Root blocks are not subject to minrecs rules. */
+ if (level == cur->bc_nlevels - 1)
+ return max(1U, npb);
+
+ return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
+}
+
+/*
+ * Compute the number of records to be stored in each block at this level and
+ * the number of blocks for this level. For leaf levels, we must populate an
+ * empty root block even if there are no records, so we have to have at least
+ * one block.
+ */
+STATIC void
+xfs_btree_bload_level_geometry(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level,
+ uint64_t nr_this_level,
+ unsigned int *avg_per_block,
+ uint64_t *blocks,
+ uint64_t *blocks_with_extra)
+{
+ uint64_t npb;
+ uint64_t dontcare;
+ unsigned int desired_npb;
+ unsigned int maxnr;
+
+ maxnr = cur->bc_ops->get_maxrecs(cur, level);
+
+ /*
+ * Compute the number of blocks we need to fill each block with the
+ * desired number of records/keyptrs per block. Because desired_npb
+ * could be minrecs, we use regular integer division (which rounds
+ * the block count down) so that in the next step the effective # of
+ * items per block will never be less than desired_npb.
+ */
+ desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
+ *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
+ *blocks = max(1ULL, *blocks);
+
+ /*
+ * Compute the number of records that we will actually put in each
+ * block, assuming that we want to spread the records evenly between
+ * the blocks. Take care that the effective # of items per block (npb)
+ * won't exceed maxrecs even for the blocks that get an extra record,
+ * since desired_npb could be maxrecs, and in the previous step we
+ * rounded the block count down.
+ */
+ npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
+ if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
+ (*blocks)++;
+ npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
+ }
+
+ *avg_per_block = min_t(uint64_t, npb, nr_this_level);
+
+ trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
+ *avg_per_block, desired_npb, *blocks,
+ *blocks_with_extra);
+}
+
+/*
+ * Ensure a slack value is appropriate for the btree.
+ *
+ * If the slack value is negative, set slack so that we fill the block to
+ * halfway between minrecs and maxrecs. Make sure the slack is never so large
+ * that we can underflow minrecs.
+ */
+static void
+xfs_btree_bload_ensure_slack(
+ struct xfs_btree_cur *cur,
+ int *slack,
+ int level)
+{
+ int maxr;
+ int minr;
+
+ maxr = cur->bc_ops->get_maxrecs(cur, level);
+ minr = cur->bc_ops->get_minrecs(cur, level);
+
+ /*
+ * If slack is negative, automatically set slack so that we load the
+ * btree block approximately halfway between minrecs and maxrecs.
+ * Generally, this will net us 75% loading.
+ */
+ if (*slack < 0)
+ *slack = maxr - ((maxr + minr) >> 1);
+
+ *slack = min(*slack, maxr - minr);
+}
+
+/*
+ * Prepare a btree cursor for a bulk load operation by computing the geometry
+ * fields in bbl. Caller must ensure that the btree cursor is a staging
+ * cursor. This function can be called multiple times.
+ */
+int
+xfs_btree_bload_compute_geometry(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ uint64_t nr_records)
+{
+ uint64_t nr_blocks = 0;
+ uint64_t nr_this_level;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ /*
+ * Make sure that the slack values make sense for traditional leaf and
+ * node blocks. Inode-rooted btrees will return different minrecs and
+ * maxrecs values for the root block (bc_nlevels == level - 1). We're
+ * checking levels 0 and 1 here, so set bc_nlevels such that the btree
+ * code doesn't interpret either as the root level.
+ */
+ cur->bc_nlevels = XFS_BTREE_MAXLEVELS - 1;
+ xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
+ xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
+
+ bbl->nr_records = nr_this_level = nr_records;
+ for (cur->bc_nlevels = 1; cur->bc_nlevels < XFS_BTREE_MAXLEVELS;) {
+ uint64_t level_blocks;
+ uint64_t dontcare64;
+ unsigned int level = cur->bc_nlevels - 1;
+ unsigned int avg_per_block;
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &level_blocks, &dontcare64);
+
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ /*
+ * If all the items we want to store at this level
+ * would fit in the inode root block, then we have our
+ * btree root and are done.
+ *
+ * Note that bmap btrees forbid records in the root.
+ */
+ if (level != 0 && nr_this_level <= avg_per_block) {
+ nr_blocks++;
+ break;
+ }
+
+ /*
+ * Otherwise, we have to store all the items for this
+ * level in traditional btree blocks and therefore need
+ * another level of btree to point to those blocks.
+ *
+ * We have to re-compute the geometry for each level of
+ * an inode-rooted btree because the geometry differs
+ * between a btree root in an inode fork and a
+ * traditional btree block.
+ *
+ * This distinction is made in the btree code based on
+ * whether level == bc_nlevels - 1. Based on the
+ * previous root block size check against the root
+ * block geometry, we know that we aren't yet ready to
+ * populate the root. Increment bc_nevels and
+ * recalculate the geometry for a traditional
+ * block-based btree level.
+ */
+ cur->bc_nlevels++;
+ xfs_btree_bload_level_geometry(cur, bbl, level,
+ nr_this_level, &avg_per_block,
+ &level_blocks, &dontcare64);
+ } else {
+ /*
+ * If all the items we want to store at this level
+ * would fit in a single root block, we're done.
+ */
+ if (nr_this_level <= avg_per_block) {
+ nr_blocks++;
+ break;
+ }
+
+ /* Otherwise, we need another level of btree. */
+ cur->bc_nlevels++;
+ }
+
+ nr_blocks += level_blocks;
+ nr_this_level = level_blocks;
+ }
+
+ if (cur->bc_nlevels == XFS_BTREE_MAXLEVELS)
+ return -EOVERFLOW;
+
+ bbl->btree_height = cur->bc_nlevels;
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ bbl->nr_blocks = nr_blocks - 1;
+ else
+ bbl->nr_blocks = nr_blocks;
+ return 0;
+}
+
+/* Bulk load a btree given the parameters and geometry established in bbl. */
+int
+xfs_btree_bload(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ void *priv)
+{
+ struct list_head buffers_list;
+ union xfs_btree_ptr child_ptr;
+ union xfs_btree_ptr ptr;
+ struct xfs_buf *bp = NULL;
+ struct xfs_btree_block *block = NULL;
+ uint64_t nr_this_level = bbl->nr_records;
+ uint64_t blocks;
+ uint64_t i;
+ uint64_t blocks_with_extra;
+ uint64_t total_blocks = 0;
+ unsigned int avg_per_block;
+ unsigned int level = 0;
+ int ret;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ INIT_LIST_HEAD(&buffers_list);
+ cur->bc_nlevels = bbl->btree_height;
+ xfs_btree_set_ptr_null(cur, &child_ptr);
+ xfs_btree_set_ptr_null(cur, &ptr);
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &blocks, &blocks_with_extra);
+
+ /* Load each leaf block. */
+ for (i = 0; i < blocks; i++) {
+ unsigned int nr_this_block = avg_per_block;
+
+ /*
+ * Due to rounding, btree blocks will not be evenly populated
+ * in most cases. blocks_with_extra tells us how many blocks
+ * will receive an extra record to distribute the excess across
+ * the current level as evenly as possible.
+ */
+ if (i < blocks_with_extra)
+ nr_this_block++;
+
+ ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
+ nr_this_block, &ptr, &bp, &block, priv);
+ if (ret)
+ goto out;
+
+ trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
+ nr_this_block);
+
+ ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record,
+ block, priv);
+ if (ret)
+ goto out;
+
+ /*
+ * Record the leftmost leaf pointer so we know where to start
+ * with the first node level.
+ */
+ if (i == 0)
+ xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
+ }
+ total_blocks += blocks;
+ xfs_btree_bload_drop_buf(&buffers_list, &bp);
+
+ /* Populate the internal btree nodes. */
+ for (level = 1; level < cur->bc_nlevels; level++) {
+ union xfs_btree_ptr first_ptr;
+
+ nr_this_level = blocks;
+ block = NULL;
+ xfs_btree_set_ptr_null(cur, &ptr);
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &blocks, &blocks_with_extra);
+
+ /* Load each node block. */
+ for (i = 0; i < blocks; i++) {
+ unsigned int nr_this_block = avg_per_block;
+
+ if (i < blocks_with_extra)
+ nr_this_block++;
+
+ ret = xfs_btree_bload_prep_block(cur, bbl,
+ &buffers_list, level, nr_this_block,
+ &ptr, &bp, &block, priv);
+ if (ret)
+ goto out;
+
+ trace_xfs_btree_bload_block(cur, level, i, blocks,
+ &ptr, nr_this_block);
+
+ ret = xfs_btree_bload_node(cur, nr_this_block,
+ &child_ptr, block);
+ if (ret)
+ goto out;
+
+ /*
+ * Record the leftmost node pointer so that we know
+ * where to start the next node level above this one.
+ */
+ if (i == 0)
+ xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
+ }
+ total_blocks += blocks;
+ xfs_btree_bload_drop_buf(&buffers_list, &bp);
+ xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
+ }
+
+ /* Initialize the new root. */
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+ cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
+ cur->bc_ino.ifake->if_blocks = total_blocks - 1;
+ } else {
+ cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
+ cur->bc_ag.afake->af_levels = cur->bc_nlevels;
+ cur->bc_ag.afake->af_blocks = total_blocks;
+ }
+
+ /*
+ * Write the new blocks to disk. If the ordered list isn't empty after
+ * that, then something went wrong and we have to fail. This should
+ * never happen, but we'll check anyway.
+ */
+ ret = xfs_buf_delwri_submit(&buffers_list);
+ if (ret)
+ goto out;
+ if (!list_empty(&buffers_list)) {
+ ASSERT(list_empty(&buffers_list));
+ ret = -EIO;
+ }
+
+out:
+ xfs_buf_delwri_cancel(&buffers_list);
+ if (bp)
+ xfs_buf_relse(bp);
+ return ret;
+}
diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h
new file mode 100644
index 000000000000..643f0f9b2994
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_staging.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_BTREE_STAGING_H__
+#define __XFS_BTREE_STAGING_H__
+
+/* Fake root for an AG-rooted btree. */
+struct xbtree_afakeroot {
+ /* AG block number of the new btree root. */
+ xfs_agblock_t af_root;
+
+ /* Height of the new btree. */
+ unsigned int af_levels;
+
+ /* Number of blocks used by the btree. */
+ unsigned int af_blocks;
+};
+
+/* Cursor interactions with with fake roots for AG-rooted btrees. */
+void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
+ struct xbtree_afakeroot *afake);
+void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
+ struct xfs_buf *agbp, const struct xfs_btree_ops *ops);
+
+/* Fake root for an inode-rooted btree. */
+struct xbtree_ifakeroot {
+ /* Fake inode fork. */
+ struct xfs_ifork *if_fork;
+
+ /* Number of blocks used by the btree. */
+ int64_t if_blocks;
+
+ /* Height of the new btree. */
+ unsigned int if_levels;
+
+ /* Number of bytes available for this fork in the inode. */
+ unsigned int if_fork_size;
+
+ /* Fork format. */
+ unsigned int if_format;
+
+ /* Number of records. */
+ unsigned int if_extents;
+};
+
+/* Cursor interactions with with fake roots for inode-rooted btrees. */
+void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur,
+ struct xbtree_ifakeroot *ifake,
+ struct xfs_btree_ops **new_ops);
+void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
+ int whichfork, const struct xfs_btree_ops *ops);
+
+/* Bulk loading of staged btrees. */
+typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv);
+typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr, void *priv);
+typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur,
+ unsigned int nr_this_level, void *priv);
+
+struct xfs_btree_bload {
+ /*
+ * This function will be called nr_records times to load records into
+ * the btree. The function does this by setting the cursor's bc_rec
+ * field in in-core format. Records must be returned in sort order.
+ */
+ xfs_btree_bload_get_record_fn get_record;
+
+ /*
+ * This function will be called nr_blocks times to obtain a pointer
+ * to a new btree block on disk. Callers must preallocate all space
+ * for the new btree before calling xfs_btree_bload, and this function
+ * is what claims that reservation.
+ */
+ xfs_btree_bload_claim_block_fn claim_block;
+
+ /*
+ * This function should return the size of the in-core btree root
+ * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree
+ * types.
+ */
+ xfs_btree_bload_iroot_size_fn iroot_size;
+
+ /*
+ * The caller should set this to the number of records that will be
+ * stored in the new btree.
+ */
+ uint64_t nr_records;
+
+ /*
+ * Number of free records to leave in each leaf block. If the caller
+ * sets this to -1, the slack value will be calculated to be be halfway
+ * between maxrecs and minrecs. This typically leaves the block 75%
+ * full. Note that slack values are not enforced on inode root blocks.
+ */
+ int leaf_slack;
+
+ /*
+ * Number of free key/ptrs pairs to leave in each node block. This
+ * field has the same semantics as leaf_slack.
+ */
+ int node_slack;
+
+ /*
+ * The xfs_btree_bload_compute_geometry function will set this to the
+ * number of btree blocks needed to store nr_records records.
+ */
+ uint64_t nr_blocks;
+
+ /*
+ * The xfs_btree_bload_compute_geometry function will set this to the
+ * height of the new btree.
+ */
+ unsigned int btree_height;
+};
+
+int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl, uint64_t nr_records);
+int xfs_btree_bload(struct xfs_btree_cur *cur, struct xfs_btree_bload *bbl,
+ void *priv);
+
+#endif /* __XFS_BTREE_STAGING_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 875e04f82541..897749c41f36 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -590,7 +590,7 @@ xfs_da3_split(
node = oldblk->bp->b_addr;
if (node->hdr.info.forw) {
if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) {
- xfs_buf_corruption_error(oldblk->bp);
+ xfs_buf_mark_corrupt(oldblk->bp);
error = -EFSCORRUPTED;
goto out;
}
@@ -603,7 +603,7 @@ xfs_da3_split(
node = oldblk->bp->b_addr;
if (node->hdr.info.back) {
if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) {
- xfs_buf_corruption_error(oldblk->bp);
+ xfs_buf_mark_corrupt(oldblk->bp);
error = -EFSCORRUPTED;
goto out;
}
@@ -1624,7 +1624,7 @@ xfs_da3_node_lookup_int(
}
if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) {
- xfs_buf_corruption_error(blk->bp);
+ xfs_buf_mark_corrupt(blk->bp);
return -EFSCORRUPTED;
}
@@ -1639,7 +1639,7 @@ xfs_da3_node_lookup_int(
/* Tree taller than we can handle; bail out! */
if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
- xfs_buf_corruption_error(blk->bp);
+ xfs_buf_mark_corrupt(blk->bp);
return -EFSCORRUPTED;
}
@@ -1647,7 +1647,7 @@ xfs_da3_node_lookup_int(
if (blkno == args->geo->leafblk)
expected_level = nodehdr.level - 1;
else if (expected_level != nodehdr.level) {
- xfs_buf_corruption_error(blk->bp);
+ xfs_buf_mark_corrupt(blk->bp);
return -EFSCORRUPTED;
} else
expected_level--;
@@ -1986,7 +1986,8 @@ xfs_da3_path_shift(
ASSERT(path != NULL);
ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
level = (path->active-1) - 1; /* skip bottom layer in path */
- for (blk = &path->blk[level]; level >= 0; blk--, level--) {
+ for (; level >= 0; level--) {
+ blk = &path->blk[level];
xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr,
blk->bp->b_addr);
@@ -2520,8 +2521,10 @@ xfs_dabuf_map(
*/
if (nirecs > 1) {
map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS);
- if (!map)
+ if (!map) {
+ error = -ENOMEM;
goto out_free_irecs;
+ }
*mapp = map;
}
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 0f4fbb0889ff..53e503b6f186 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -57,9 +57,10 @@ typedef struct xfs_da_args {
const uint8_t *name; /* string (maybe not NULL terminated) */
int namelen; /* length of string (maybe no NULL) */
uint8_t filetype; /* filetype of inode for directories */
- uint8_t *value; /* set of bytes (maybe contain NULLs) */
+ void *value; /* set of bytes (maybe contain NULLs) */
int valuelen; /* length of value */
- int flags; /* argument flags (eg: ATTR_NOCREATE) */
+ unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */
+ unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */
xfs_dahash_t hashval; /* hash value of name */
xfs_ino_t inumber; /* input/output inode number */
struct xfs_inode *dp; /* directory inode to manipulate */
@@ -88,8 +89,7 @@ typedef struct xfs_da_args {
#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */
#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
-#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */
-#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */
+#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
@@ -97,8 +97,7 @@ typedef struct xfs_da_args {
{ XFS_DA_OP_ADDNAME, "ADDNAME" }, \
{ XFS_DA_OP_OKNOENT, "OKNOENT" }, \
{ XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
- { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \
- { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" }
+ { XFS_DA_OP_NOTIME, "NOTIME" }
/*
* Storage for holding state during Btree searches and split/join ops.
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 734837a9b51a..08c0a4d98b89 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -692,19 +692,7 @@ struct xfs_attr3_leafblock {
#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
-
-/*
- * Conversion macros for converting namespace bits from argument flags
- * to ondisk flags.
- */
-#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE)
#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
-#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
-#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK)
-#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
- ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
-#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
- ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
/*
* Alignment for namelist and valuelist entries (since they are mixed
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index d6ced59b9567..1dbf2f980a26 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -114,6 +114,23 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
.verify_struct = xfs_dir3_block_verify,
};
+static xfs_failaddr_t
+xfs_dir3_block_header_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (be64_to_cpu(hdr3->owner) != dp->i_ino)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
int
xfs_dir3_block_read(
struct xfs_trans *tp,
@@ -121,12 +138,24 @@ xfs_dir3_block_read(
struct xfs_buf **bpp)
{
struct xfs_mount *mp = dp->i_mount;
+ xfs_failaddr_t fa;
int err;
err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, 0, bpp,
XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
- if (!err && tp && *bpp)
- xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+ if (err || !*bpp)
+ return err;
+
+ /* Check things that we can't do in the verifier. */
+ fa = xfs_dir3_block_header_check(dp, *bpp);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
return err;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index b9eba8213180..375b3edb2ad2 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -394,6 +394,22 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
.verify_write = xfs_dir3_data_write_verify,
};
+static xfs_failaddr_t
+xfs_dir3_data_header_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_data_hdr *hdr3 = bp->b_addr;
+
+ if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ return __this_address;
+ }
+
+ return NULL;
+}
int
xfs_dir3_data_read(
@@ -403,12 +419,24 @@ xfs_dir3_data_read(
unsigned int flags,
struct xfs_buf **bpp)
{
+ xfs_failaddr_t fa;
int err;
err = xfs_da_read_buf(tp, dp, bno, flags, bpp, XFS_DATA_FORK,
&xfs_dir3_data_buf_ops);
- if (!err && tp && *bpp)
- xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
+ if (err || !*bpp)
+ return err;
+
+ /* Check things that we can't do in the verifier. */
+ fa = xfs_dir3_data_header_check(dp, *bpp);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
return err;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index a131b520aac7..95d2a3f92d75 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -1383,7 +1383,7 @@ xfs_dir2_leaf_removename(
ltp = xfs_dir2_leaf_tail_p(geo, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
if (be16_to_cpu(bestsp[db]) != oldbest) {
- xfs_buf_corruption_error(lbp);
+ xfs_buf_mark_corrupt(lbp);
return -EFSCORRUPTED;
}
/*
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index a0cc5e240306..6ac4aad98cd7 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -194,6 +194,8 @@ xfs_dir3_free_header_check(
return __this_address;
if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused))
return __this_address;
+ if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ return __this_address;
} else {
struct xfs_dir2_free_hdr *hdr = bp->b_addr;
@@ -226,8 +228,9 @@ __xfs_dir3_free_read(
/* Check things that we can't do in the verifier. */
fa = xfs_dir3_free_header_check(dp, fbno, *bpp);
if (fa) {
- xfs_verifier_error(*bpp, -EFSCORRUPTED, fa);
+ __xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
return -EFSCORRUPTED;
}
@@ -439,7 +442,7 @@ xfs_dir2_leaf_to_node(
ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
if (be32_to_cpu(ltp->bestcount) >
(uint)dp->i_d.di_size / args->geo->blksize) {
- xfs_buf_corruption_error(lbp);
+ xfs_buf_mark_corrupt(lbp);
return -EFSCORRUPTED;
}
@@ -513,7 +516,7 @@ xfs_dir2_leafn_add(
* into other peoples memory
*/
if (index < 0) {
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
}
@@ -800,7 +803,7 @@ xfs_dir2_leafn_lookup_for_entry(
xfs_dir3_leaf_check(dp, bp);
if (leafhdr.count <= 0) {
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 77e9fa385980..045556e78ee2 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -497,6 +497,23 @@ static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp)
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
+/*
+ * v5 file systems support V3 inodes only, earlier file systems support
+ * v2 and v1 inodes.
+ */
+static inline bool xfs_sb_version_has_v3inode(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+static inline bool xfs_dinode_good_version(struct xfs_sb *sbp,
+ uint8_t version)
+{
+ if (xfs_sb_version_has_v3inode(sbp))
+ return version == 3;
+ return version == 1 || version == 2;
+}
+
static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
{
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
@@ -560,7 +577,6 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
-#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr))
#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
@@ -707,7 +723,6 @@ typedef struct xfs_agf {
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
-#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr))
/*
* Size of the unlinked inode hash table in the agi.
@@ -775,7 +790,6 @@ typedef struct xfs_agi {
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
-#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr))
/*
* The third a.g. block contains the a.g. freelist, an array
@@ -783,21 +797,15 @@ typedef struct xfs_agi {
*/
#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
-#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
+#define XFS_BUF_TO_AGFL(bp) ((struct xfs_agfl *)((bp)->b_addr))
-#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
- (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
- &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
- (__be32 *)(bp)->b_addr)
-
-typedef struct xfs_agfl {
+struct xfs_agfl {
__be32 agfl_magicnum;
__be32 agfl_seqno;
uuid_t agfl_uuid;
__be64 agfl_lsn;
__be32 agfl_crc;
- __be32 agfl_bno[]; /* actually xfs_agfl_size(mp) */
-} __attribute__((packed)) xfs_agfl_t;
+} __attribute__((packed));
#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
@@ -946,8 +954,12 @@ enum xfs_dinode_fmt {
/*
* Inode size for given fs.
*/
-#define XFS_LITINO(mp, version) \
- ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
+#define XFS_DINODE_SIZE(sbp) \
+ (xfs_sb_version_has_v3inode(sbp) ? \
+ sizeof(struct xfs_dinode) : \
+ offsetof(struct xfs_dinode, di_crc))
+#define XFS_LITINO(mp) \
+ ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(&(mp)->m_sb))
/*
* Inode data & attribute fork sizes, per inode.
@@ -956,13 +968,9 @@ enum xfs_dinode_fmt {
#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
#define XFS_DFORK_DSIZE(dip,mp) \
- (XFS_DFORK_Q(dip) ? \
- XFS_DFORK_BOFF(dip) : \
- XFS_LITINO(mp, (dip)->di_version))
+ (XFS_DFORK_Q(dip) ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp))
#define XFS_DFORK_ASIZE(dip,mp) \
- (XFS_DFORK_Q(dip) ? \
- XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
- 0)
+ (XFS_DFORK_Q(dip) ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0)
#define XFS_DFORK_SIZE(dip,mp,w) \
((w) == XFS_DATA_FORK ? \
XFS_DFORK_DSIZE(dip, mp) : \
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index ef95ca07d084..245188e4f6d3 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -568,10 +568,40 @@ typedef struct xfs_fsop_setdm_handlereq {
struct fsdmidata __user *data; /* DMAPI data */
} xfs_fsop_setdm_handlereq_t;
+/*
+ * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface.
+ *
+ * NOTE: Must match the values declared in libattr without the XFS_IOC_ prefix.
+ */
+#define XFS_IOC_ATTR_ROOT 0x0002 /* use attrs in root namespace */
+#define XFS_IOC_ATTR_SECURE 0x0008 /* use attrs in security namespace */
+#define XFS_IOC_ATTR_CREATE 0x0010 /* fail if attr already exists */
+#define XFS_IOC_ATTR_REPLACE 0x0020 /* fail if attr does not exist */
+
typedef struct xfs_attrlist_cursor {
__u32 opaque[4];
} xfs_attrlist_cursor_t;
+/*
+ * Define how lists of attribute names are returned to userspace from the
+ * XFS_IOC_ATTRLIST_BY_HANDLE ioctl. struct xfs_attrlist is the header at the
+ * beginning of the returned buffer, and a each entry in al_offset contains the
+ * relative offset of an xfs_attrlist_ent containing the actual entry.
+ *
+ * NOTE: struct xfs_attrlist must match struct attrlist defined in libattr, and
+ * struct xfs_attrlist_ent must match struct attrlist_ent defined in libattr.
+ */
+struct xfs_attrlist {
+ __s32 al_count; /* number of entries in attrlist */
+ __s32 al_more; /* T/F: more attrs (do call again) */
+ __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
+};
+
+struct xfs_attrlist_ent { /* data from attr_list() */
+ __u32 a_valuelen; /* number bytes in value of attr */
+ char a_name[1]; /* attr name (NULL terminated) */
+};
+
typedef struct xfs_fsop_attrlist_handlereq {
struct xfs_fsop_handlereq hreq; /* handle interface structure */
struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
@@ -589,7 +619,7 @@ typedef struct xfs_attr_multiop {
void __user *am_attrname;
void __user *am_attrvalue;
__u32 am_length;
- __u32 am_flags;
+ __u32 am_flags; /* XFS_IOC_ATTR_* */
} xfs_attr_multiop_t;
typedef struct xfs_fsop_attrmulti_handlereq {
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index bf161e930f1d..7fcf62b324b0 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -105,7 +105,7 @@ xfs_inobt_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_private.a.agno;
+ xfs_agnumber_t agno = cur->bc_ag.agno;
union xfs_btree_rec *rec;
int error;
uint64_t realfree;
@@ -177,7 +177,7 @@ xfs_inobt_insert(
xfs_btnum_t btnum)
{
struct xfs_btree_cur *cur;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_agi *agi = agbp->b_addr;
xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
xfs_agino_t thisino;
int i;
@@ -304,7 +304,7 @@ xfs_ialloc_inode_init(
* That means for v3 inode we log the entire buffer rather than just the
* inode cores.
*/
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
version = 3;
ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));
@@ -339,7 +339,7 @@ xfs_ialloc_inode_init(
xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
int ioffset = i << mp->m_sb.sb_inodelog;
- uint isize = xfs_dinode_size(version);
+ uint isize = XFS_DINODE_SIZE(&mp->m_sb);
free = xfs_make_iptr(mp, fbuf, i);
free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
@@ -525,7 +525,7 @@ xfs_inobt_insert_sprec(
bool merge) /* merge or replace */
{
struct xfs_btree_cur *cur;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_agi *agi = agbp->b_addr;
xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
int error;
int i;
@@ -658,7 +658,7 @@ xfs_ialloc_ag_alloc(
* chunk of inodes. If the filesystem is striped, this will fill
* an entire stripe unit with inodes.
*/
- agi = XFS_BUF_TO_AGI(agbp);
+ agi = agbp->b_addr;
newino = be32_to_cpu(agi->agi_newino);
agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
@@ -1130,7 +1130,7 @@ xfs_dialloc_ag_inobt(
xfs_ino_t *inop)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_agi *agi = agbp->b_addr;
xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
@@ -1583,7 +1583,7 @@ xfs_dialloc_ag(
xfs_ino_t *inop)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_agi *agi = agbp->b_addr;
xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
@@ -1943,7 +1943,7 @@ xfs_difree_inobt(
struct xfs_icluster *xic,
struct xfs_inobt_rec_incore *orec)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_agi *agi = agbp->b_addr;
xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
struct xfs_perag *pag;
struct xfs_btree_cur *cur;
@@ -2079,7 +2079,7 @@ xfs_difree_finobt(
xfs_agino_t agino,
struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_agi *agi = agbp->b_addr;
xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
struct xfs_btree_cur *cur;
struct xfs_inobt_rec_incore rec;
@@ -2489,9 +2489,8 @@ xfs_ialloc_log_agi(
sizeof(xfs_agi_t)
};
#ifdef DEBUG
- xfs_agi_t *agi; /* allocation group header */
+ struct xfs_agi *agi = bp->b_addr;
- agi = XFS_BUF_TO_AGI(bp);
ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
#endif
@@ -2523,14 +2522,13 @@ xfs_agi_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_mount;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
+ struct xfs_agi *agi = bp->b_addr;
int i;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (!xfs_log_check_lsn(mp,
- be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn)))
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
return __this_address;
}
@@ -2593,6 +2591,7 @@ xfs_agi_write_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_agi *agi = bp->b_addr;
xfs_failaddr_t fa;
fa = xfs_agi_verify(bp);
@@ -2605,7 +2604,7 @@ xfs_agi_write_verify(
return;
if (bip)
- XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
}
@@ -2661,7 +2660,7 @@ xfs_ialloc_read_agi(
if (error)
return error;
- agi = XFS_BUF_TO_AGI(*bpp);
+ agi = (*bpp)->b_addr;
pag = xfs_perag_get(mp, agno);
if (!pag->pagi_init) {
pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
@@ -2873,7 +2872,7 @@ xfs_ialloc_setup_geometry(
* cannot change the behavior.
*/
igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
int new_size = igeo->inode_cluster_size_raw;
new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index b82992f795aa..b2c122ad8f0e 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -12,6 +12,7 @@
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
@@ -20,7 +21,6 @@
#include "xfs_trans.h"
#include "xfs_rmap.h"
-
STATIC int
xfs_inobt_get_minrecs(
struct xfs_btree_cur *cur,
@@ -34,7 +34,7 @@ xfs_inobt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno,
+ cur->bc_ag.agbp, cur->bc_ag.agno,
cur->bc_btnum);
}
@@ -44,8 +44,8 @@ xfs_inobt_set_root(
union xfs_btree_ptr *nptr,
int inc) /* level change */
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
agi->agi_root = nptr->s;
be32_add_cpu(&agi->agi_level, inc);
@@ -58,8 +58,8 @@ xfs_finobt_set_root(
union xfs_btree_ptr *nptr,
int inc) /* level change */
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
agi->agi_free_root = nptr->s;
be32_add_cpu(&agi->agi_free_level, inc);
@@ -83,7 +83,7 @@ __xfs_inobt_alloc_block(
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
args.oinfo = XFS_RMAP_OINFO_INOBT;
- args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.agno, sbno);
args.minlen = 1;
args.maxlen = 1;
args.prod = 1;
@@ -212,9 +212,9 @@ xfs_inobt_init_ptr_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+ struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+ ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno));
ptr->s = agi->agi_root;
}
@@ -224,9 +224,9 @@ xfs_finobt_init_ptr_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+ struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+ ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno));
ptr->s = agi->agi_free_root;
}
@@ -400,32 +400,27 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
};
/*
- * Allocate a new inode btree cursor.
+ * Initialize a new inode btree cursor.
*/
-struct xfs_btree_cur * /* new inode btree cursor */
-xfs_inobt_init_cursor(
+static struct xfs_btree_cur *
+xfs_inobt_init_common(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer for agi structure */
xfs_agnumber_t agno, /* allocation group number */
xfs_btnum_t btnum) /* ialloc or free ino btree */
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
struct xfs_btree_cur *cur;
cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
-
cur->bc_tp = tp;
cur->bc_mp = mp;
cur->bc_btnum = btnum;
if (btnum == XFS_BTNUM_INO) {
- cur->bc_nlevels = be32_to_cpu(agi->agi_level);
- cur->bc_ops = &xfs_inobt_ops;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
+ cur->bc_ops = &xfs_inobt_ops;
} else {
- cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
- cur->bc_ops = &xfs_finobt_ops;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2);
+ cur->bc_ops = &xfs_finobt_ops;
}
cur->bc_blocklog = mp->m_sb.sb_blocklog;
@@ -433,12 +428,75 @@ xfs_inobt_init_cursor(
if (xfs_sb_version_hascrc(&mp->m_sb))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
+ cur->bc_ag.agno = agno;
+ return cur;
+}
+
+/* Create an inode btree cursor. */
+struct xfs_btree_cur *
+xfs_inobt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = agbp->b_addr;
+ cur = xfs_inobt_init_common(mp, tp, agno, btnum);
+ if (btnum == XFS_BTNUM_INO)
+ cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+ else
+ cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ cur->bc_ag.agbp = agbp;
return cur;
}
+/* Create an inode btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_inobt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ xfs_agnumber_t agno,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_inobt_init_common(mp, NULL, agno, btnum);
+ xfs_btree_stage_afakeroot(cur, afake);
+ return cur;
+}
+
+/*
+ * Install a new inobt btree root. Caller is responsible for invalidating
+ * and freeing the old btree blocks.
+ */
+void
+xfs_inobt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agi *agi = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ if (cur->bc_btnum == XFS_BTNUM_INO) {
+ agi->agi_root = cpu_to_be32(afake->af_root);
+ agi->agi_level = cpu_to_be32(afake->af_levels);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops);
+ } else {
+ agi->agi_free_root = cpu_to_be32(afake->af_root);
+ agi->agi_free_level = cpu_to_be32(afake->af_levels);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREE_ROOT |
+ XFS_AGI_FREE_LEVEL);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops);
+ }
+}
+
/*
* Calculate number of records in an inobt btree block.
*/
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 951305ecaae1..35bbd978c272 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -48,6 +48,9 @@ struct xfs_mount;
extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t,
xfs_btnum_t);
+struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, xfs_agnumber_t agno,
+ xfs_btnum_t btnum);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
/* ir_holemask to inode allocation bitmap conversion */
@@ -68,4 +71,7 @@ int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_btnum_t btnum,
struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp);
+void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 8afacfe4be0a..39c5a6e24915 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -44,17 +44,6 @@ xfs_inobp_check(
}
#endif
-bool
-xfs_dinode_good_version(
- struct xfs_mount *mp,
- __u8 version)
-{
- if (xfs_sb_version_hascrc(&mp->m_sb))
- return version == 3;
-
- return version == 1 || version == 2;
-}
-
/*
* If we are doing readahead on an inode buffer, we might be in log recovery
* reading an inode allocation buffer that hasn't yet been replayed, and hence
@@ -93,7 +82,7 @@ xfs_inode_buf_verify(
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
- xfs_dinode_good_version(mp, dip->di_version) &&
+ xfs_dinode_good_version(&mp->m_sb, dip->di_version) &&
xfs_verify_agino_or_null(mp, agno, unlinked_ino);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
@@ -205,16 +194,14 @@ xfs_inode_from_disk(
struct xfs_icdinode *to = &ip->i_d;
struct inode *inode = VFS_I(ip);
-
/*
* Convert v1 inodes immediately to v2 inode format as this is the
* minimum inode version format we support in the rest of the code.
+ * They will also be unconditionally written back to disk as v2 inodes.
*/
- to->di_version = from->di_version;
- if (to->di_version == 1) {
+ if (unlikely(from->di_version == 1)) {
set_nlink(inode, be16_to_cpu(from->di_onlink));
to->di_projid = 0;
- to->di_version = 2;
} else {
set_nlink(inode, be32_to_cpu(from->di_nlink));
to->di_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 |
@@ -222,8 +209,8 @@ xfs_inode_from_disk(
}
to->di_format = from->di_format;
- to->di_uid = be32_to_cpu(from->di_uid);
- to->di_gid = be32_to_cpu(from->di_gid);
+ i_uid_write(inode, be32_to_cpu(from->di_uid));
+ i_gid_write(inode, be32_to_cpu(from->di_gid));
to->di_flushiter = be16_to_cpu(from->di_flushiter);
/*
@@ -252,7 +239,7 @@ xfs_inode_from_disk(
to->di_dmstate = be16_to_cpu(from->di_dmstate);
to->di_flags = be16_to_cpu(from->di_flags);
- if (to->di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
inode_set_iversion_queried(inode,
be64_to_cpu(from->di_changecount));
to->di_crtime.tv_sec = be32_to_cpu(from->di_crtime.t_sec);
@@ -274,10 +261,9 @@ xfs_inode_to_disk(
to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
to->di_onlink = 0;
- to->di_version = from->di_version;
to->di_format = from->di_format;
- to->di_uid = cpu_to_be32(from->di_uid);
- to->di_gid = cpu_to_be32(from->di_gid);
+ to->di_uid = cpu_to_be32(i_uid_read(inode));
+ to->di_gid = cpu_to_be32(i_gid_read(inode));
to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff);
to->di_projid_hi = cpu_to_be16(from->di_projid >> 16);
@@ -303,7 +289,8 @@ xfs_inode_to_disk(
to->di_dmstate = cpu_to_be16(from->di_dmstate);
to->di_flags = cpu_to_be16(from->di_flags);
- if (from->di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
+ to->di_version = 3;
to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec);
to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec);
@@ -315,6 +302,7 @@ xfs_inode_to_disk(
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
to->di_flushiter = 0;
} else {
+ to->di_version = 2;
to->di_flushiter = cpu_to_be16(from->di_flushiter);
}
}
@@ -428,7 +416,7 @@ xfs_dinode_verify_forkoff(
case XFS_DINODE_FMT_LOCAL: /* fall through ... */
case XFS_DINODE_FMT_EXTENTS: /* fall through ... */
case XFS_DINODE_FMT_BTREE:
- if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3))
+ if (dip->di_forkoff >= (XFS_LITINO(mp) >> 3))
return __this_address;
break;
default:
@@ -454,7 +442,7 @@ xfs_dinode_verify(
/* Verify v3 integrity information first */
if (dip->di_version >= 3) {
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_sb_version_has_v3inode(&mp->m_sb))
return __this_address;
if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
XFS_DINODE_CRC_OFF))
@@ -629,10 +617,9 @@ xfs_iread(
/* shortcut IO on inode allocation if possible */
if ((iget_flags & XFS_IGET_CREATE) &&
- xfs_sb_version_hascrc(&mp->m_sb) &&
+ xfs_sb_version_has_v3inode(&mp->m_sb) &&
!(mp->m_flags & XFS_MOUNT_IKEEP)) {
VFS_I(ip)->i_generation = prandom_u32();
- ip->i_d.di_version = 3;
return 0;
}
@@ -674,7 +661,6 @@ xfs_iread(
* Partial initialisation of the in-core inode. Just the bits
* that xfs_ialloc won't overwrite or relies on being correct.
*/
- ip->i_d.di_version = dip->di_version;
VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen);
ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
@@ -688,7 +674,6 @@ xfs_iread(
VFS_I(ip)->i_mode = 0;
}
- ASSERT(ip->i_d.di_version >= 2);
ip->i_delayed_blks = 0;
/*
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index fd94b1078722..9b373dcf9e34 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -16,11 +16,8 @@ struct xfs_dinode;
* format specific structures at the appropriate time.
*/
struct xfs_icdinode {
- int8_t di_version; /* inode version */
int8_t di_format; /* format of di_c data */
uint16_t di_flushiter; /* incremented on flush */
- uint32_t di_uid; /* owner's user id */
- uint32_t di_gid; /* owner's group id */
uint32_t di_projid; /* owner's project id */
xfs_fsize_t di_size; /* number of bytes in file */
xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
@@ -61,8 +58,6 @@ void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
void xfs_log_dinode_to_disk(struct xfs_log_dinode *from,
struct xfs_dinode *to);
-bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version);
-
#if defined(DEBUG)
void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
#else
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index ad2b9c313fd2..518c6f0ec3a6 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -183,7 +183,7 @@ xfs_iformat_local(
*/
if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
xfs_warn(ip->i_mount,
- "corrupt inode %Lu (bad size %d for local fork, size = %d).",
+ "corrupt inode %Lu (bad size %d for local fork, size = %zd).",
(unsigned long long) ip->i_ino, size,
XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 500333d0101e..668ee942be22 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -46,14 +46,9 @@ struct xfs_ifork {
(ip)->i_afp : \
(ip)->i_cowfp))
#define XFS_IFORK_DSIZE(ip) \
- (XFS_IFORK_Q(ip) ? \
- XFS_IFORK_BOFF(ip) : \
- XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
+ (XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount))
#define XFS_IFORK_ASIZE(ip) \
- (XFS_IFORK_Q(ip) ? \
- XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
- XFS_IFORK_BOFF(ip) : \
- 0)
+ (XFS_IFORK_Q(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0)
#define XFS_IFORK_SIZE(ip,w) \
((w) == XFS_DATA_FORK ? \
XFS_IFORK_DSIZE(ip) : \
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 9bac0d2e56dc..e3400c9c71cd 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -424,12 +424,10 @@ struct xfs_log_dinode {
/* structure must be padded to 64 bit alignment */
};
-static inline uint xfs_log_dinode_size(int version)
-{
- if (version == 3)
- return sizeof(struct xfs_log_dinode);
- return offsetof(struct xfs_log_dinode, di_next_unlinked);
-}
+#define xfs_log_dinode_size(mp) \
+ (xfs_sb_version_has_v3inode(&(mp)->m_sb) ? \
+ sizeof(struct xfs_log_dinode) : \
+ offsetof(struct xfs_log_dinode, di_next_unlinked))
/*
* Buffer Log Format definitions
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 6e1665f2cb67..2076627243b0 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -46,7 +46,7 @@ xfs_refcount_lookup_le(
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno,
XFS_LOOKUP_LE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
@@ -63,7 +63,7 @@ xfs_refcount_lookup_ge(
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno,
XFS_LOOKUP_GE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
@@ -80,7 +80,7 @@ xfs_refcount_lookup_eq(
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno,
XFS_LOOKUP_LE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
@@ -108,7 +108,7 @@ xfs_refcount_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_private.a.agno;
+ xfs_agnumber_t agno = cur->bc_ag.agno;
union xfs_btree_rec *rec;
int error;
xfs_agblock_t realstart;
@@ -119,7 +119,7 @@ xfs_refcount_get_rec(
xfs_refcount_btrec_to_irec(rec, irec);
- agno = cur->bc_private.a.agno;
+ agno = cur->bc_ag.agno;
if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
goto out_bad_rec;
@@ -144,7 +144,7 @@ xfs_refcount_get_rec(
if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
goto out_bad_rec;
- trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, irec);
+ trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.agno, irec);
return 0;
out_bad_rec:
@@ -169,14 +169,14 @@ xfs_refcount_update(
union xfs_btree_rec rec;
int error;
- trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec);
+ trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.agno, irec);
rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock);
rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount);
rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount);
error = xfs_btree_update(cur, &rec);
if (error)
trace_xfs_refcount_update_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -193,7 +193,7 @@ xfs_refcount_insert(
{
int error;
- trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec);
+ trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.agno, irec);
cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
@@ -208,7 +208,7 @@ xfs_refcount_insert(
out_error:
if (error)
trace_xfs_refcount_insert_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -234,7 +234,7 @@ xfs_refcount_delete(
error = -EFSCORRUPTED;
goto out_error;
}
- trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec);
+ trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.agno, &irec);
error = xfs_btree_delete(cur, i);
if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
error = -EFSCORRUPTED;
@@ -246,7 +246,7 @@ xfs_refcount_delete(
out_error:
if (error)
trace_xfs_refcount_delete_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -366,7 +366,7 @@ xfs_refcount_split_extent(
return 0;
*shape_changed = true;
- trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.agno,
&rcext, agbno);
/* Establish the right extent. */
@@ -391,7 +391,7 @@ xfs_refcount_split_extent(
out_error:
trace_xfs_refcount_split_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -411,7 +411,7 @@ xfs_refcount_merge_center_extents(
int found_rec;
trace_xfs_refcount_merge_center_extents(cur->bc_mp,
- cur->bc_private.a.agno, left, center, right);
+ cur->bc_ag.agno, left, center, right);
/*
* Make sure the center and right extents are not in the btree.
@@ -468,7 +468,7 @@ xfs_refcount_merge_center_extents(
out_error:
trace_xfs_refcount_merge_center_extents_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -487,7 +487,7 @@ xfs_refcount_merge_left_extent(
int found_rec;
trace_xfs_refcount_merge_left_extent(cur->bc_mp,
- cur->bc_private.a.agno, left, cleft);
+ cur->bc_ag.agno, left, cleft);
/* If the extent at agbno (cleft) wasn't synthesized, remove it. */
if (cleft->rc_refcount > 1) {
@@ -530,7 +530,7 @@ xfs_refcount_merge_left_extent(
out_error:
trace_xfs_refcount_merge_left_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -548,7 +548,7 @@ xfs_refcount_merge_right_extent(
int found_rec;
trace_xfs_refcount_merge_right_extent(cur->bc_mp,
- cur->bc_private.a.agno, cright, right);
+ cur->bc_ag.agno, cright, right);
/*
* If the extent ending at agbno+aglen (cright) wasn't synthesized,
@@ -594,7 +594,7 @@ xfs_refcount_merge_right_extent(
out_error:
trace_xfs_refcount_merge_right_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -679,13 +679,13 @@ xfs_refcount_find_left_extents(
cleft->rc_blockcount = aglen;
cleft->rc_refcount = 1;
}
- trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.agno,
left, cleft, agbno);
return error;
out_error:
trace_xfs_refcount_find_left_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -768,13 +768,13 @@ xfs_refcount_find_right_extents(
cright->rc_blockcount = aglen;
cright->rc_refcount = 1;
}
- trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.agno,
cright, right, agbno + aglen);
return error;
out_error:
trace_xfs_refcount_find_right_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -883,7 +883,7 @@ xfs_refcount_still_have_space(
{
unsigned long overhead;
- overhead = cur->bc_private.a.priv.refc.shape_changes *
+ overhead = cur->bc_ag.refc.shape_changes *
xfs_allocfree_log_count(cur->bc_mp, 1);
overhead *= cur->bc_mp->m_sb.sb_blocksize;
@@ -891,17 +891,17 @@ xfs_refcount_still_have_space(
* Only allow 2 refcount extent updates per transaction if the
* refcount continue update "error" has been injected.
*/
- if (cur->bc_private.a.priv.refc.nr_ops > 2 &&
+ if (cur->bc_ag.refc.nr_ops > 2 &&
XFS_TEST_ERROR(false, cur->bc_mp,
XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
return false;
- if (cur->bc_private.a.priv.refc.nr_ops == 0)
+ if (cur->bc_ag.refc.nr_ops == 0)
return true;
else if (overhead > cur->bc_tp->t_log_res)
return false;
return cur->bc_tp->t_log_res - overhead >
- cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
+ cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
}
/*
@@ -952,7 +952,7 @@ xfs_refcount_adjust_extents(
ext.rc_startblock - *agbno);
tmp.rc_refcount = 1 + adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_private.a.agno, &tmp);
+ cur->bc_ag.agno, &tmp);
/*
* Either cover the hole (increment) or
@@ -968,10 +968,10 @@ xfs_refcount_adjust_extents(
error = -EFSCORRUPTED;
goto out_error;
}
- cur->bc_private.a.priv.refc.nr_ops++;
+ cur->bc_ag.refc.nr_ops++;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
- cur->bc_private.a.agno,
+ cur->bc_ag.agno,
tmp.rc_startblock);
xfs_bmap_add_free(cur->bc_tp, fsbno,
tmp.rc_blockcount, oinfo);
@@ -998,12 +998,12 @@ xfs_refcount_adjust_extents(
goto skip;
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_private.a.agno, &ext);
+ cur->bc_ag.agno, &ext);
if (ext.rc_refcount > 1) {
error = xfs_refcount_update(cur, &ext);
if (error)
goto out_error;
- cur->bc_private.a.priv.refc.nr_ops++;
+ cur->bc_ag.refc.nr_ops++;
} else if (ext.rc_refcount == 1) {
error = xfs_refcount_delete(cur, &found_rec);
if (error)
@@ -1012,11 +1012,11 @@ xfs_refcount_adjust_extents(
error = -EFSCORRUPTED;
goto out_error;
}
- cur->bc_private.a.priv.refc.nr_ops++;
+ cur->bc_ag.refc.nr_ops++;
goto advloop;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
- cur->bc_private.a.agno,
+ cur->bc_ag.agno,
ext.rc_startblock);
xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount,
oinfo);
@@ -1035,7 +1035,7 @@ advloop:
return error;
out_error:
trace_xfs_refcount_modify_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -1057,10 +1057,10 @@ xfs_refcount_adjust(
*new_agbno = agbno;
*new_aglen = aglen;
if (adj == XFS_REFCOUNT_ADJUST_INCREASE)
- trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.agno,
agbno, aglen);
else
- trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.agno,
agbno, aglen);
/*
@@ -1088,7 +1088,7 @@ xfs_refcount_adjust(
if (shape_changed)
shape_changes++;
if (shape_changes)
- cur->bc_private.a.priv.refc.shape_changes++;
+ cur->bc_ag.refc.shape_changes++;
/* Now that we've taken care of the ends, adjust the middle extents */
error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen,
@@ -1099,7 +1099,7 @@ xfs_refcount_adjust(
return 0;
out_error:
- trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.agno,
error, _RET_IP_);
return error;
}
@@ -1115,7 +1115,7 @@ xfs_refcount_finish_one_cleanup(
if (rcur == NULL)
return;
- agbp = rcur->bc_private.a.agbp;
+ agbp = rcur->bc_ag.agbp;
xfs_btree_del_cursor(rcur, error);
if (error)
xfs_trans_brelse(tp, agbp);
@@ -1165,9 +1165,9 @@ xfs_refcount_finish_one(
* the startblock, get one now.
*/
rcur = *pcur;
- if (rcur != NULL && rcur->bc_private.a.agno != agno) {
- nr_ops = rcur->bc_private.a.priv.refc.nr_ops;
- shape_changes = rcur->bc_private.a.priv.refc.shape_changes;
+ if (rcur != NULL && rcur->bc_ag.agno != agno) {
+ nr_ops = rcur->bc_ag.refc.nr_ops;
+ shape_changes = rcur->bc_ag.refc.shape_changes;
xfs_refcount_finish_one_cleanup(tp, rcur, 0);
rcur = NULL;
*pcur = NULL;
@@ -1183,8 +1183,8 @@ xfs_refcount_finish_one(
error = -ENOMEM;
goto out_cur;
}
- rcur->bc_private.a.priv.refc.nr_ops = nr_ops;
- rcur->bc_private.a.priv.refc.shape_changes = shape_changes;
+ rcur->bc_ag.refc.nr_ops = nr_ops;
+ rcur->bc_ag.refc.shape_changes = shape_changes;
}
*pcur = rcur;
@@ -1303,7 +1303,7 @@ xfs_refcount_find_shared(
int have;
int error;
- trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.agno,
agbno, aglen);
/* By default, skip the whole range */
@@ -1383,12 +1383,12 @@ xfs_refcount_find_shared(
done:
trace_xfs_refcount_find_shared_result(cur->bc_mp,
- cur->bc_private.a.agno, *fbno, *flen);
+ cur->bc_ag.agno, *fbno, *flen);
out_error:
if (error)
trace_xfs_refcount_find_shared_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -1485,7 +1485,7 @@ xfs_refcount_adjust_cow_extents(
tmp.rc_blockcount = aglen;
tmp.rc_refcount = 1;
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_private.a.agno, &tmp);
+ cur->bc_ag.agno, &tmp);
error = xfs_refcount_insert(cur, &tmp,
&found_tmp);
@@ -1513,7 +1513,7 @@ xfs_refcount_adjust_cow_extents(
ext.rc_refcount = 0;
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_private.a.agno, &ext);
+ cur->bc_ag.agno, &ext);
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
@@ -1529,7 +1529,7 @@ xfs_refcount_adjust_cow_extents(
return error;
out_error:
trace_xfs_refcount_modify_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -1575,7 +1575,7 @@ xfs_refcount_adjust_cow(
return 0;
out_error:
- trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.agno,
error, _RET_IP_);
return error;
}
@@ -1589,7 +1589,7 @@ __xfs_refcount_cow_alloc(
xfs_agblock_t agbno,
xfs_extlen_t aglen)
{
- trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno,
+ trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.agno,
agbno, aglen);
/* Add refcount btree reservation */
@@ -1606,7 +1606,7 @@ __xfs_refcount_cow_free(
xfs_agblock_t agbno,
xfs_extlen_t aglen)
{
- trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno,
+ trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.agno,
agbno, aglen);
/* Remove refcount btree reservation */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 38529dbacd55..7fd6044a4f78 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -12,6 +12,7 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_refcount_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
@@ -25,7 +26,7 @@ xfs_refcountbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno);
+ cur->bc_ag.agbp, cur->bc_ag.agno);
}
STATIC void
@@ -34,8 +35,8 @@ xfs_refcountbt_set_root(
union xfs_btree_ptr *ptr,
int inc)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
@@ -57,8 +58,8 @@ xfs_refcountbt_alloc_block(
union xfs_btree_ptr *new,
int *stat)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
struct xfs_alloc_arg args; /* block allocation args */
int error; /* error return value */
@@ -66,7 +67,7 @@ xfs_refcountbt_alloc_block(
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+ args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno,
xfs_refc_block(args.mp));
args.oinfo = XFS_RMAP_OINFO_REFC;
args.minlen = args.maxlen = args.prod = 1;
@@ -75,13 +76,13 @@ xfs_refcountbt_alloc_block(
error = xfs_alloc_vextent(&args);
if (error)
goto out_error;
- trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.agno,
args.agbno, 1);
if (args.fsbno == NULLFSBLOCK) {
*stat = 0;
return 0;
}
- ASSERT(args.agno == cur->bc_private.a.agno);
+ ASSERT(args.agno == cur->bc_ag.agno);
ASSERT(args.len == 1);
new->s = cpu_to_be32(args.agbno);
@@ -101,12 +102,12 @@ xfs_refcountbt_free_block(
struct xfs_buf *bp)
{
struct xfs_mount *mp = cur->bc_mp;
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
int error;
- trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.agno,
XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
@@ -169,9 +170,9 @@ xfs_refcountbt_init_ptr_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_refcount_root;
}
@@ -311,41 +312,90 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = {
};
/*
- * Allocate a new refcount btree cursor.
+ * Initialize a new refcount btree cursor.
*/
-struct xfs_btree_cur *
-xfs_refcountbt_init_cursor(
+static struct xfs_btree_cur *
+xfs_refcountbt_init_common(
struct xfs_mount *mp,
struct xfs_trans *tp,
- struct xfs_buf *agbp,
xfs_agnumber_t agno)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
struct xfs_btree_cur *cur;
ASSERT(agno != NULLAGNUMBER);
ASSERT(agno < mp->m_sb.sb_agcount);
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
cur->bc_tp = tp;
cur->bc_mp = mp;
cur->bc_btnum = XFS_BTNUM_REFC;
cur->bc_blocklog = mp->m_sb.sb_blocklog;
- cur->bc_ops = &xfs_refcountbt_ops;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
- cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
-
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
+ cur->bc_ag.agno = agno;
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- cur->bc_private.a.priv.refc.nr_ops = 0;
- cur->bc_private.a.priv.refc.shape_changes = 0;
+ cur->bc_ag.refc.nr_ops = 0;
+ cur->bc_ag.refc.shape_changes = 0;
+ cur->bc_ops = &xfs_refcountbt_ops;
+ return cur;
+}
+/* Create a btree cursor. */
+struct xfs_btree_cur *
+xfs_refcountbt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_refcountbt_init_common(mp, tp, agno);
+ cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+ cur->bc_ag.agbp = agbp;
return cur;
}
+/* Create a btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_refcountbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ xfs_agnumber_t agno)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_refcountbt_init_common(mp, NULL, agno);
+ xfs_btree_stage_afakeroot(cur, afake);
+ return cur;
+}
+
+/*
+ * Swap in the new btree root. Once we pass this point the newly rebuilt btree
+ * is in place and we have to kill off all the old btree blocks.
+ */
+void
+xfs_refcountbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ agf->agf_refcount_root = cpu_to_be32(afake->af_root);
+ agf->agf_refcount_level = cpu_to_be32(afake->af_levels);
+ agf->agf_refcount_blocks = cpu_to_be32(afake->af_blocks);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS |
+ XFS_AGF_REFCOUNT_ROOT |
+ XFS_AGF_REFCOUNT_LEVEL);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops);
+}
+
/*
* Calculate the number of records in a refcount btree block.
*/
@@ -420,7 +470,7 @@ xfs_refcountbt_calc_reserves(
if (error)
return error;
- agf = XFS_BUF_TO_AGF(agbp);
+ agf = agbp->b_addr;
agblocks = be32_to_cpu(agf->agf_length);
tree_len = be32_to_cpu(agf->agf_refcount_blocks);
xfs_trans_brelse(tp, agbp);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index ba416f71c824..69dc515db671 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -13,6 +13,7 @@
struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
+struct xbtree_afakeroot;
/*
* Btree block header size
@@ -46,6 +47,8 @@ struct xfs_mount;
extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *agbp,
xfs_agnumber_t agno);
+struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, xfs_agnumber_t agno);
extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
@@ -58,4 +61,7 @@ extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t *ask,
xfs_extlen_t *used);
+void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
#endif /* __XFS_REFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index ff9412f113c4..27c39268c31f 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -79,7 +79,7 @@ xfs_rmap_update(
union xfs_btree_rec rec;
int error;
- trace_xfs_rmap_update(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.agno,
irec->rm_startblock, irec->rm_blockcount,
irec->rm_owner, irec->rm_offset, irec->rm_flags);
@@ -91,7 +91,7 @@ xfs_rmap_update(
error = xfs_btree_update(cur, &rec);
if (error)
trace_xfs_rmap_update_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -107,7 +107,7 @@ xfs_rmap_insert(
int i;
int error;
- trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_private.a.agno, agbno,
+ trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.agno, agbno,
len, owner, offset, flags);
error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
@@ -133,7 +133,7 @@ xfs_rmap_insert(
done:
if (error)
trace_xfs_rmap_insert_error(rcur->bc_mp,
- rcur->bc_private.a.agno, error, _RET_IP_);
+ rcur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -149,7 +149,7 @@ xfs_rmap_delete(
int i;
int error;
- trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno,
+ trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.agno, agbno,
len, owner, offset, flags);
error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
@@ -170,7 +170,7 @@ xfs_rmap_delete(
done:
if (error)
trace_xfs_rmap_delete_error(rcur->bc_mp,
- rcur->bc_private.a.agno, error, _RET_IP_);
+ rcur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -197,7 +197,7 @@ xfs_rmap_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_private.a.agno;
+ xfs_agnumber_t agno = cur->bc_ag.agno;
union xfs_btree_rec *rec;
int error;
@@ -260,7 +260,7 @@ xfs_rmap_find_left_neighbor_helper(
struct xfs_find_left_neighbor_info *info = priv;
trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp,
- cur->bc_private.a.agno, rec->rm_startblock,
+ cur->bc_ag.agno, rec->rm_startblock,
rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
rec->rm_flags);
@@ -312,7 +312,7 @@ xfs_rmap_find_left_neighbor(
info.stat = stat;
trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
- cur->bc_private.a.agno, bno, 0, owner, offset, flags);
+ cur->bc_ag.agno, bno, 0, owner, offset, flags);
error = xfs_rmap_query_range(cur, &info.high, &info.high,
xfs_rmap_find_left_neighbor_helper, &info);
@@ -320,7 +320,7 @@ xfs_rmap_find_left_neighbor(
error = 0;
if (*stat)
trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, irec->rm_startblock,
+ cur->bc_ag.agno, irec->rm_startblock,
irec->rm_blockcount, irec->rm_owner,
irec->rm_offset, irec->rm_flags);
return error;
@@ -336,7 +336,7 @@ xfs_rmap_lookup_le_range_helper(
struct xfs_find_left_neighbor_info *info = priv;
trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp,
- cur->bc_private.a.agno, rec->rm_startblock,
+ cur->bc_ag.agno, rec->rm_startblock,
rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
rec->rm_flags);
@@ -385,14 +385,14 @@ xfs_rmap_lookup_le_range(
info.stat = stat;
trace_xfs_rmap_lookup_le_range(cur->bc_mp,
- cur->bc_private.a.agno, bno, 0, owner, offset, flags);
+ cur->bc_ag.agno, bno, 0, owner, offset, flags);
error = xfs_rmap_query_range(cur, &info.high, &info.high,
xfs_rmap_lookup_le_range_helper, &info);
if (error == -ECANCELED)
error = 0;
if (*stat)
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_private.a.agno, irec->rm_startblock,
+ cur->bc_ag.agno, irec->rm_startblock,
irec->rm_blockcount, irec->rm_owner,
irec->rm_offset, irec->rm_flags);
return error;
@@ -498,7 +498,7 @@ xfs_rmap_unmap(
(flags & XFS_RMAP_BMBT_BLOCK);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
/*
@@ -522,7 +522,7 @@ xfs_rmap_unmap(
goto out_error;
}
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_private.a.agno, ltrec.rm_startblock,
+ cur->bc_ag.agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
ltrec.rm_offset, ltrec.rm_flags);
ltoff = ltrec.rm_offset;
@@ -588,7 +588,7 @@ xfs_rmap_unmap(
if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
/* exact match, simply remove the record from rmap tree */
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
ltrec.rm_startblock, ltrec.rm_blockcount,
ltrec.rm_owner, ltrec.rm_offset,
ltrec.rm_flags);
@@ -666,7 +666,7 @@ xfs_rmap_unmap(
else
cur->bc_rec.r.rm_offset = offset + len;
cur->bc_rec.r.rm_flags = flags;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.agno,
cur->bc_rec.r.rm_startblock,
cur->bc_rec.r.rm_blockcount,
cur->bc_rec.r.rm_owner,
@@ -678,11 +678,11 @@ xfs_rmap_unmap(
}
out_done:
- trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
- trace_xfs_rmap_unmap_error(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_unmap_error(mp, cur->bc_ag.agno,
error, _RET_IP_);
return error;
}
@@ -773,7 +773,7 @@ xfs_rmap_map(
(flags & XFS_RMAP_BMBT_BLOCK);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
ASSERT(!xfs_rmap_should_skip_owner_update(oinfo));
@@ -795,7 +795,7 @@ xfs_rmap_map(
goto out_error;
}
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_private.a.agno, ltrec.rm_startblock,
+ cur->bc_ag.agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
ltrec.rm_offset, ltrec.rm_flags);
@@ -831,7 +831,7 @@ xfs_rmap_map(
goto out_error;
}
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, gtrec.rm_startblock,
+ cur->bc_ag.agno, gtrec.rm_startblock,
gtrec.rm_blockcount, gtrec.rm_owner,
gtrec.rm_offset, gtrec.rm_flags);
if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
@@ -870,7 +870,7 @@ xfs_rmap_map(
* result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
*/
ltrec.rm_blockcount += gtrec.rm_blockcount;
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
gtrec.rm_startblock,
gtrec.rm_blockcount,
gtrec.rm_owner,
@@ -921,7 +921,7 @@ xfs_rmap_map(
cur->bc_rec.r.rm_owner = owner;
cur->bc_rec.r.rm_offset = offset;
cur->bc_rec.r.rm_flags = flags;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len,
owner, offset, flags);
error = xfs_btree_insert(cur, &i);
if (error)
@@ -932,11 +932,11 @@ xfs_rmap_map(
}
}
- trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
- trace_xfs_rmap_map_error(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_map_error(mp, cur->bc_ag.agno,
error, _RET_IP_);
return error;
}
@@ -1010,7 +1010,7 @@ xfs_rmap_convert(
(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
new_endoff = offset + len;
- trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
/*
@@ -1034,7 +1034,7 @@ xfs_rmap_convert(
goto done;
}
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_private.a.agno, PREV.rm_startblock,
+ cur->bc_ag.agno, PREV.rm_startblock,
PREV.rm_blockcount, PREV.rm_owner,
PREV.rm_offset, PREV.rm_flags);
@@ -1076,7 +1076,7 @@ xfs_rmap_convert(
goto done;
}
trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, LEFT.rm_startblock,
+ cur->bc_ag.agno, LEFT.rm_startblock,
LEFT.rm_blockcount, LEFT.rm_owner,
LEFT.rm_offset, LEFT.rm_flags);
if (LEFT.rm_startblock + LEFT.rm_blockcount == bno &&
@@ -1114,7 +1114,7 @@ xfs_rmap_convert(
goto done;
}
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, RIGHT.rm_startblock,
+ cur->bc_ag.agno, RIGHT.rm_startblock,
RIGHT.rm_blockcount, RIGHT.rm_owner,
RIGHT.rm_offset, RIGHT.rm_flags);
if (bno + len == RIGHT.rm_startblock &&
@@ -1132,7 +1132,7 @@ xfs_rmap_convert(
RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
state &= ~RMAP_RIGHT_CONTIG;
- trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state,
+ trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state,
_RET_IP_);
/* reset the cursor back to PREV */
@@ -1162,7 +1162,7 @@ xfs_rmap_convert(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
RIGHT.rm_startblock, RIGHT.rm_blockcount,
RIGHT.rm_owner, RIGHT.rm_offset,
RIGHT.rm_flags);
@@ -1180,7 +1180,7 @@ xfs_rmap_convert(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
PREV.rm_startblock, PREV.rm_blockcount,
PREV.rm_owner, PREV.rm_offset,
PREV.rm_flags);
@@ -1210,7 +1210,7 @@ xfs_rmap_convert(
* Setting all of a previous oldext extent to newext.
* The left neighbor is contiguous, the right is not.
*/
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
PREV.rm_startblock, PREV.rm_blockcount,
PREV.rm_owner, PREV.rm_offset,
PREV.rm_flags);
@@ -1247,7 +1247,7 @@ xfs_rmap_convert(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
RIGHT.rm_startblock, RIGHT.rm_blockcount,
RIGHT.rm_owner, RIGHT.rm_offset,
RIGHT.rm_flags);
@@ -1326,7 +1326,7 @@ xfs_rmap_convert(
NEW.rm_blockcount = len;
NEW.rm_flags = newext;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno,
len, owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
@@ -1383,7 +1383,7 @@ xfs_rmap_convert(
NEW.rm_blockcount = len;
NEW.rm_flags = newext;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno,
len, owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
@@ -1414,7 +1414,7 @@ xfs_rmap_convert(
NEW = PREV;
NEW.rm_blockcount = offset - PREV.rm_offset;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.agno,
NEW.rm_startblock, NEW.rm_blockcount,
NEW.rm_owner, NEW.rm_offset,
NEW.rm_flags);
@@ -1441,7 +1441,7 @@ xfs_rmap_convert(
/* new middle extent - newext */
cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN;
cur->bc_rec.r.rm_flags |= newext;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len,
owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
@@ -1465,12 +1465,12 @@ xfs_rmap_convert(
ASSERT(0);
}
- trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
done:
if (error)
trace_xfs_rmap_convert_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -1506,7 +1506,7 @@ xfs_rmap_convert_shared(
(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
new_endoff = offset + len;
- trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
/*
@@ -1573,7 +1573,7 @@ xfs_rmap_convert_shared(
goto done;
}
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, RIGHT.rm_startblock,
+ cur->bc_ag.agno, RIGHT.rm_startblock,
RIGHT.rm_blockcount, RIGHT.rm_owner,
RIGHT.rm_offset, RIGHT.rm_flags);
if (xfs_rmap_is_mergeable(&RIGHT, owner, newext))
@@ -1589,7 +1589,7 @@ xfs_rmap_convert_shared(
RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
state &= ~RMAP_RIGHT_CONTIG;
- trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state,
+ trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state,
_RET_IP_);
/*
* Switch out based on the FILLING and CONTIG state bits.
@@ -1880,12 +1880,12 @@ xfs_rmap_convert_shared(
ASSERT(0);
}
- trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
done:
if (error)
trace_xfs_rmap_convert_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -1923,7 +1923,7 @@ xfs_rmap_unmap_shared(
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
/*
@@ -2072,12 +2072,12 @@ xfs_rmap_unmap_shared(
goto out_error;
}
- trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
trace_xfs_rmap_unmap_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -2112,7 +2112,7 @@ xfs_rmap_map_shared(
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
/* Is there a left record that abuts our range? */
@@ -2138,7 +2138,7 @@ xfs_rmap_map_shared(
goto out_error;
}
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, gtrec.rm_startblock,
+ cur->bc_ag.agno, gtrec.rm_startblock,
gtrec.rm_blockcount, gtrec.rm_owner,
gtrec.rm_offset, gtrec.rm_flags);
@@ -2231,12 +2231,12 @@ xfs_rmap_map_shared(
goto out_error;
}
- trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
trace_xfs_rmap_map_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -2336,7 +2336,7 @@ xfs_rmap_finish_one_cleanup(
if (rcur == NULL)
return;
- agbp = rcur->bc_private.a.agbp;
+ agbp = rcur->bc_ag.agbp;
xfs_btree_del_cursor(rcur, error);
if (error)
xfs_trans_brelse(tp, agbp);
@@ -2386,7 +2386,7 @@ xfs_rmap_finish_one(
* the startblock, get one now.
*/
rcur = *pcur;
- if (rcur != NULL && rcur->bc_private.a.agno != agno) {
+ if (rcur != NULL && rcur->bc_ag.agno != agno) {
xfs_rmap_finish_one_cleanup(tp, rcur, 0);
rcur = NULL;
*pcur = NULL;
@@ -2694,7 +2694,6 @@ struct xfs_rmap_key_state {
uint64_t owner;
uint64_t offset;
unsigned int flags;
- bool has_rmap;
};
/* For each rmap given, figure out if it doesn't match the key we want. */
@@ -2709,7 +2708,6 @@ xfs_rmap_has_other_keys_helper(
if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset &&
((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
return 0;
- rks->has_rmap = true;
return -ECANCELED;
}
@@ -2731,7 +2729,7 @@ xfs_rmap_has_other_keys(
int error;
xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags);
- rks.has_rmap = false;
+ *has_rmap = false;
low.rm_startblock = bno;
memset(&high, 0xFF, sizeof(high));
@@ -2739,11 +2737,12 @@ xfs_rmap_has_other_keys(
error = xfs_rmap_query_range(cur, &low, &high,
xfs_rmap_has_other_keys_helper, &rks);
- if (error < 0)
- return error;
+ if (error == -ECANCELED) {
+ *has_rmap = true;
+ return 0;
+ }
- *has_rmap = rks.has_rmap;
- return 0;
+ return error;
}
const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index fc78efa52c94..b7c05314d07c 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -14,6 +14,7 @@
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_trace.h"
@@ -51,7 +52,7 @@ xfs_rmapbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno);
+ cur->bc_ag.agbp, cur->bc_ag.agno);
}
STATIC void
@@ -60,8 +61,8 @@ xfs_rmapbt_set_root(
union xfs_btree_ptr *ptr,
int inc)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
int btnum = cur->bc_btnum;
struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
@@ -83,25 +84,25 @@ xfs_rmapbt_alloc_block(
union xfs_btree_ptr *new,
int *stat)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
int error;
xfs_agblock_t bno;
/* Allocate the new block from the freelist. If we can't, give up. */
- error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+ error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp,
&bno, 1);
if (error)
return error;
- trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_ag.agno,
bno, 1);
if (bno == NULLAGBLOCK) {
*stat = 0;
return 0;
}
- xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1,
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1,
false);
xfs_trans_agbtree_delta(cur->bc_tp, 1);
@@ -109,7 +110,7 @@ xfs_rmapbt_alloc_block(
be32_add_cpu(&agf->agf_rmap_blocks, 1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
- xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno);
+ xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_ag.agno);
*stat = 1;
return 0;
@@ -120,13 +121,13 @@ xfs_rmapbt_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
xfs_agblock_t bno;
int error;
bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
- trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_ag.agno,
bno, 1);
be32_add_cpu(&agf->agf_rmap_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
@@ -138,7 +139,7 @@ xfs_rmapbt_free_block(
XFS_EXTENT_BUSY_SKIP_DISCARD);
xfs_trans_agbtree_delta(cur->bc_tp, -1);
- xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno);
+ xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_ag.agno);
return 0;
}
@@ -215,9 +216,9 @@ xfs_rmapbt_init_ptr_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_roots[cur->bc_btnum];
}
@@ -448,17 +449,12 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = {
.recs_inorder = xfs_rmapbt_recs_inorder,
};
-/*
- * Allocate a new allocation btree cursor.
- */
-struct xfs_btree_cur *
-xfs_rmapbt_init_cursor(
+static struct xfs_btree_cur *
+xfs_rmapbt_init_common(
struct xfs_mount *mp,
struct xfs_trans *tp,
- struct xfs_buf *agbp,
xfs_agnumber_t agno)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
struct xfs_btree_cur *cur;
cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
@@ -468,17 +464,68 @@ xfs_rmapbt_init_cursor(
cur->bc_btnum = XFS_BTNUM_RMAP;
cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
+ cur->bc_ag.agno = agno;
cur->bc_ops = &xfs_rmapbt_ops;
+
+ return cur;
+}
+
+/* Create a new reverse mapping btree cursor. */
+struct xfs_btree_cur *
+xfs_rmapbt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_rmapbt_init_common(mp, tp, agno);
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
+ cur->bc_ag.agbp = agbp;
+ return cur;
+}
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
+/* Create a new reverse mapping btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_rmapbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ xfs_agnumber_t agno)
+{
+ struct xfs_btree_cur *cur;
+ cur = xfs_rmapbt_init_common(mp, NULL, agno);
+ xfs_btree_stage_afakeroot(cur, afake);
return cur;
}
/*
+ * Install a new reverse mapping btree root. Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ */
+void
+xfs_rmapbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
+ agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+ agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
+ XFS_AGF_RMAP_BLOCKS);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops);
+}
+
+/*
* Calculate number of records in an rmap btree block.
*/
int
@@ -569,7 +616,7 @@ xfs_rmapbt_calc_reserves(
if (error)
return error;
- agf = XFS_BUF_TO_AGF(agbp);
+ agf = agbp->b_addr;
agblocks = be32_to_cpu(agf->agf_length);
tree_len = be32_to_cpu(agf->agf_rmap_blocks);
xfs_trans_brelse(tp, agbp);
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index 820d668b063d..115c3455a734 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -9,6 +9,7 @@
struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
+struct xbtree_afakeroot;
/* rmaps only exist on crc enabled filesystems */
#define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN
@@ -43,6 +44,10 @@ struct xfs_mount;
struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *bp,
xfs_agnumber_t agno);
+struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, xfs_agnumber_t agno);
+void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
int xfs_rmapbt_maxrecs(int blocklen, int leaf);
extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 2f60fc3c99a0..c526c5e5ab76 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -220,7 +220,7 @@ xfs_validate_sb_common(
struct xfs_buf *bp,
struct xfs_sb *sbp)
{
- struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+ struct xfs_dsb *dsb = bp->b_addr;
uint32_t agcount = 0;
uint32_t rem;
@@ -328,6 +328,38 @@ xfs_validate_sb_common(
return -EFSCORRUPTED;
}
+ /* Validate the realtime geometry; stolen from xfs_repair */
+ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
+ sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) {
+ xfs_notice(mp,
+ "realtime extent sanity check failed");
+ return -EFSCORRUPTED;
+ }
+
+ if (sbp->sb_rblocks == 0) {
+ if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
+ sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) {
+ xfs_notice(mp,
+ "realtime zeroed geometry check failed");
+ return -EFSCORRUPTED;
+ }
+ } else {
+ uint64_t rexts;
+ uint64_t rbmblocks;
+
+ rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize);
+ rbmblocks = howmany_64(sbp->sb_rextents,
+ NBBY * sbp->sb_blocksize);
+
+ if (sbp->sb_rextents != rexts ||
+ sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) ||
+ sbp->sb_rbmblocks != rbmblocks) {
+ xfs_notice(mp,
+ "realtime geometry sanity check failed");
+ return -EFSCORRUPTED;
+ }
+ }
+
if (sbp->sb_unit) {
if (!xfs_sb_version_hasdalign(sbp) ||
sbp->sb_unit > sbp->sb_width ||
@@ -681,7 +713,7 @@ xfs_sb_read_verify(
{
struct xfs_sb sb;
struct xfs_mount *mp = bp->b_mount;
- struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+ struct xfs_dsb *dsb = bp->b_addr;
int error;
/*
@@ -707,7 +739,7 @@ xfs_sb_read_verify(
* Check all the superblock fields. Don't byteswap the xquota flags
* because _verify_common checks the on-disk values.
*/
- __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false);
+ __xfs_sb_from_disk(&sb, dsb, false);
error = xfs_validate_sb_common(mp, bp, &sb);
if (error)
goto out_error;
@@ -730,7 +762,7 @@ static void
xfs_sb_quiet_read_verify(
struct xfs_buf *bp)
{
- struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+ struct xfs_dsb *dsb = bp->b_addr;
if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
/* XFS filesystem, verify noisily! */
@@ -748,13 +780,14 @@ xfs_sb_write_verify(
struct xfs_sb sb;
struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_dsb *dsb = bp->b_addr;
int error;
/*
* Check all the superblock fields. Don't byteswap the xquota flags
* because _verify_common checks the on-disk values.
*/
- __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false);
+ __xfs_sb_from_disk(&sb, dsb, false);
error = xfs_validate_sb_common(mp, bp, &sb);
if (error)
goto out_error;
@@ -766,7 +799,7 @@ xfs_sb_write_verify(
return;
if (bip)
- XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ dsb->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
return;
@@ -927,7 +960,7 @@ xfs_log_sb(
mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
- xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
}
@@ -1007,7 +1040,7 @@ xfs_update_secondary_sbs(
bp->b_ops = &xfs_sb_buf_ops;
xfs_buf_oneshot(bp);
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
xfs_buf_delwri_queue(bp, &buffer_list);
xfs_buf_relse(bp);
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 7a9c04920505..d1a0848cb52e 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -187,7 +187,7 @@ xfs_calc_inode_chunk_res(
XFS_FSB_TO_B(mp, 1));
if (alloc) {
/* icreate tx uses ordered buffers */
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_sb_version_has_v3inode(&mp->m_sb))
return res;
size = XFS_FSB_TO_B(mp, 1);
}
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index ba0f747c82e8..e9bcf1faa183 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -92,7 +92,7 @@ xchk_superblock(
if (!xchk_process_error(sc, agno, XFS_SB_BLOCK(mp), &error))
return error;
- sb = XFS_BUF_TO_SBP(bp);
+ sb = bp->b_addr;
/*
* Verify the geometries match. Fields that are permanently
@@ -358,7 +358,7 @@ static inline void
xchk_agf_xref_freeblks(
struct xfs_scrub *sc)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
xfs_extlen_t blocks = 0;
int error;
@@ -378,7 +378,7 @@ static inline void
xchk_agf_xref_cntbt(
struct xfs_scrub *sc)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
xfs_agblock_t agbno;
xfs_extlen_t blocks;
int have;
@@ -410,7 +410,7 @@ STATIC void
xchk_agf_xref_btreeblks(
struct xfs_scrub *sc)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
struct xfs_mount *mp = sc->mp;
xfs_agblock_t blocks;
xfs_agblock_t btreeblks;
@@ -456,7 +456,7 @@ static inline void
xchk_agf_xref_refcblks(
struct xfs_scrub *sc)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
xfs_agblock_t blocks;
int error;
@@ -525,7 +525,7 @@ xchk_agf(
goto out;
xchk_buffer_recheck(sc, sc->sa.agf_bp);
- agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ agf = sc->sa.agf_bp->b_addr;
/* Check the AG length */
eoag = be32_to_cpu(agf->agf_length);
@@ -711,7 +711,7 @@ xchk_agfl(
goto out;
/* Allocate buffer to ensure uniqueness of AGFL entries. */
- agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ agf = sc->sa.agf_bp->b_addr;
agflcount = be32_to_cpu(agf->agf_flcount);
if (agflcount > xfs_agfl_size(sc->mp)) {
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
@@ -728,7 +728,7 @@ xchk_agfl(
}
/* Check the blocks in the AGFL. */
- error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
+ error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr,
sc->sa.agfl_bp, xchk_agfl_block, &sai);
if (error == -ECANCELED) {
error = 0;
@@ -765,7 +765,7 @@ static inline void
xchk_agi_xref_icounts(
struct xfs_scrub *sc)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(sc->sa.agi_bp);
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
xfs_agino_t icount;
xfs_agino_t freecount;
int error;
@@ -834,7 +834,7 @@ xchk_agi(
goto out;
xchk_buffer_recheck(sc, sc->sa.agi_bp);
- agi = XFS_BUF_TO_AGI(sc->sa.agi_bp);
+ agi = sc->sa.agi_bp->b_addr;
/* Check the AG length */
eoag = be32_to_cpu(agi->agi_length);
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index d5e6db9af434..bca2ab1d4be9 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -49,7 +49,7 @@ xrep_superblock(
/* Copy AG 0's superblock to this one. */
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
/* Write this to disk. */
xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
@@ -140,7 +140,7 @@ xrep_agf_find_btrees(
struct xrep_find_ag_btree *fab,
struct xfs_buf *agfl_bp)
{
- struct xfs_agf *old_agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *old_agf = agf_bp->b_addr;
int error;
/* Go find the root data. */
@@ -181,7 +181,7 @@ xrep_agf_init_header(
struct xfs_agf *old_agf)
{
struct xfs_mount *mp = sc->mp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *agf = agf_bp->b_addr;
memcpy(old_agf, agf, sizeof(*old_agf));
memset(agf, 0, BBTOB(agf_bp->b_length));
@@ -238,7 +238,7 @@ xrep_agf_calc_from_btrees(
{
struct xrep_agf_allocbt raa = { .sc = sc };
struct xfs_btree_cur *cur = NULL;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *agf = agf_bp->b_addr;
struct xfs_mount *mp = sc->mp;
xfs_agblock_t btreeblks;
xfs_agblock_t blocks;
@@ -302,7 +302,7 @@ xrep_agf_commit_new(
struct xfs_buf *agf_bp)
{
struct xfs_perag *pag;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *agf = agf_bp->b_addr;
/* Trigger fdblocks recalculation */
xfs_force_summary_recalc(sc->mp);
@@ -376,7 +376,7 @@ xrep_agf(
if (error)
return error;
agf_bp->b_ops = &xfs_agf_buf_ops;
- agf = XFS_BUF_TO_AGF(agf_bp);
+ agf = agf_bp->b_addr;
/*
* Load the AGFL so that we can screen out OWN_AG blocks that are on
@@ -395,7 +395,7 @@ xrep_agf(
* Spot-check the AGFL blocks; if they're obviously corrupt then
* there's nothing we can do but bail out.
*/
- error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp,
+ error = xfs_agfl_walk(sc->mp, agf_bp->b_addr, agfl_bp,
xrep_agf_check_agfl_block, sc);
if (error)
return error;
@@ -429,10 +429,10 @@ out_revert:
struct xrep_agfl {
/* Bitmap of other OWN_AG metadata blocks. */
- struct xfs_bitmap agmetablocks;
+ struct xbitmap agmetablocks;
/* Bitmap of free space. */
- struct xfs_bitmap *freesp;
+ struct xbitmap *freesp;
struct xfs_scrub *sc;
};
@@ -453,14 +453,14 @@ xrep_agfl_walk_rmap(
/* Record all the OWN_AG blocks. */
if (rec->rm_owner == XFS_RMAP_OWN_AG) {
- fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+ fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno,
rec->rm_startblock);
- error = xfs_bitmap_set(ra->freesp, fsb, rec->rm_blockcount);
+ error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount);
if (error)
return error;
}
- return xfs_bitmap_set_btcur_path(&ra->agmetablocks, cur);
+ return xbitmap_set_btcur_path(&ra->agmetablocks, cur);
}
/*
@@ -476,19 +476,17 @@ STATIC int
xrep_agfl_collect_blocks(
struct xfs_scrub *sc,
struct xfs_buf *agf_bp,
- struct xfs_bitmap *agfl_extents,
+ struct xbitmap *agfl_extents,
xfs_agblock_t *flcount)
{
struct xrep_agfl ra;
struct xfs_mount *mp = sc->mp;
struct xfs_btree_cur *cur;
- struct xfs_bitmap_range *br;
- struct xfs_bitmap_range *n;
int error;
ra.sc = sc;
ra.freesp = agfl_extents;
- xfs_bitmap_init(&ra.agmetablocks);
+ xbitmap_init(&ra.agmetablocks);
/* Find all space used by the free space btrees & rmapbt. */
cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
@@ -500,7 +498,7 @@ xrep_agfl_collect_blocks(
/* Find all blocks currently being used by the bnobt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
XFS_BTNUM_BNO);
- error = xfs_bitmap_set_btblocks(&ra.agmetablocks, cur);
+ error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
if (error)
goto err;
xfs_btree_del_cursor(cur, error);
@@ -508,7 +506,7 @@ xrep_agfl_collect_blocks(
/* Find all blocks currently being used by the cntbt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
XFS_BTNUM_CNT);
- error = xfs_bitmap_set_btblocks(&ra.agmetablocks, cur);
+ error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
if (error)
goto err;
@@ -518,8 +516,8 @@ xrep_agfl_collect_blocks(
* Drop the freesp meta blocks that are in use by btrees.
* The remaining blocks /should/ be AGFL blocks.
*/
- error = xfs_bitmap_disunion(agfl_extents, &ra.agmetablocks);
- xfs_bitmap_destroy(&ra.agmetablocks);
+ error = xbitmap_disunion(agfl_extents, &ra.agmetablocks);
+ xbitmap_destroy(&ra.agmetablocks);
if (error)
return error;
@@ -527,18 +525,12 @@ xrep_agfl_collect_blocks(
* Calculate the new AGFL size. If we found more blocks than fit in
* the AGFL we'll free them later.
*/
- *flcount = 0;
- for_each_xfs_bitmap_extent(br, n, agfl_extents) {
- *flcount += br->len;
- if (*flcount > xfs_agfl_size(mp))
- break;
- }
- if (*flcount > xfs_agfl_size(mp))
- *flcount = xfs_agfl_size(mp);
+ *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents),
+ xfs_agfl_size(mp));
return 0;
err:
- xfs_bitmap_destroy(&ra.agmetablocks);
+ xbitmap_destroy(&ra.agmetablocks);
xfs_btree_del_cursor(cur, error);
return error;
}
@@ -550,7 +542,7 @@ xrep_agfl_update_agf(
struct xfs_buf *agf_bp,
xfs_agblock_t flcount)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *agf = agf_bp->b_addr;
ASSERT(flcount <= xfs_agfl_size(sc->mp));
@@ -573,13 +565,13 @@ STATIC void
xrep_agfl_init_header(
struct xfs_scrub *sc,
struct xfs_buf *agfl_bp,
- struct xfs_bitmap *agfl_extents,
+ struct xbitmap *agfl_extents,
xfs_agblock_t flcount)
{
struct xfs_mount *mp = sc->mp;
__be32 *agfl_bno;
- struct xfs_bitmap_range *br;
- struct xfs_bitmap_range *n;
+ struct xbitmap_range *br;
+ struct xbitmap_range *n;
struct xfs_agfl *agfl;
xfs_agblock_t agbno;
unsigned int fl_off;
@@ -602,8 +594,8 @@ xrep_agfl_init_header(
* step.
*/
fl_off = 0;
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agfl_bp);
- for_each_xfs_bitmap_extent(br, n, agfl_extents) {
+ agfl_bno = xfs_buf_to_agfl_bno(agfl_bp);
+ for_each_xbitmap_extent(br, n, agfl_extents) {
agbno = XFS_FSB_TO_AGBNO(mp, br->start);
trace_xrep_agfl_insert(mp, sc->sa.agno, agbno, br->len);
@@ -637,7 +629,7 @@ int
xrep_agfl(
struct xfs_scrub *sc)
{
- struct xfs_bitmap agfl_extents;
+ struct xbitmap agfl_extents;
struct xfs_mount *mp = sc->mp;
struct xfs_buf *agf_bp;
struct xfs_buf *agfl_bp;
@@ -649,7 +641,7 @@ xrep_agfl(
return -EOPNOTSUPP;
xchk_perag_get(sc->mp, &sc->sa);
- xfs_bitmap_init(&agfl_extents);
+ xbitmap_init(&agfl_extents);
/*
* Read the AGF so that we can query the rmapbt. We hope that there's
@@ -696,10 +688,10 @@ xrep_agfl(
goto err;
/* Dump any AGFL overflow. */
- return xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
+ error = xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
XFS_AG_RESV_AGFL);
err:
- xfs_bitmap_destroy(&agfl_extents);
+ xbitmap_destroy(&agfl_extents);
return error;
}
@@ -761,7 +753,7 @@ xrep_agi_init_header(
struct xfs_buf *agi_bp,
struct xfs_agi *old_agi)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp);
+ struct xfs_agi *agi = agi_bp->b_addr;
struct xfs_mount *mp = sc->mp;
memcpy(old_agi, agi, sizeof(*old_agi));
@@ -807,7 +799,7 @@ xrep_agi_calc_from_btrees(
struct xfs_buf *agi_bp)
{
struct xfs_btree_cur *cur;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp);
+ struct xfs_agi *agi = agi_bp->b_addr;
struct xfs_mount *mp = sc->mp;
xfs_agino_t count;
xfs_agino_t freecount;
@@ -835,7 +827,7 @@ xrep_agi_commit_new(
struct xfs_buf *agi_bp)
{
struct xfs_perag *pag;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp);
+ struct xfs_agi *agi = agi_bp->b_addr;
/* Trigger inode count recalculation */
xfs_force_summary_recalc(sc->mp);
@@ -892,7 +884,7 @@ xrep_agi(
if (error)
return error;
agi_bp->b_ops = &xfs_agi_buf_ops;
- agi = XFS_BUF_TO_AGI(agi_bp);
+ agi = agi_bp->b_addr;
/* Find the AGI btree roots. */
error = xrep_agi_find_btrees(sc, fab);
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 5533e48e605d..73d924e47565 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -94,7 +94,7 @@ xchk_allocbt_rec(
union xfs_btree_rec *rec)
{
struct xfs_mount *mp = bs->cur->bc_mp;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.agno;
xfs_agblock_t bno;
xfs_extlen_t len;
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index d9f0dd444b80..9faddb334a2c 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -98,7 +98,7 @@ struct xchk_xattr {
/*
* Check that an extended attribute key can be looked up by hash.
*
- * We use the XFS attribute list iterator (i.e. xfs_attr_list_int_ilocked)
+ * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked)
* to call this function for every attribute key in an inode. Once
* we're here, we load the attribute value to see if any errors happen,
* or if we get more or less data than we expected.
@@ -147,11 +147,8 @@ xchk_xattr_listent(
return;
}
- args.flags = ATTR_KERNOTIME;
- if (flags & XFS_ATTR_ROOT)
- args.flags |= ATTR_ROOT;
- else if (flags & XFS_ATTR_SECURE)
- args.flags |= ATTR_SECURE;
+ args.op_flags = XFS_DA_OP_NOTIME;
+ args.attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK;
args.geo = context->dp->i_mount->m_attr_geo;
args.whichfork = XFS_ATTR_FORK;
args.dp = context->dp;
@@ -162,7 +159,10 @@ xchk_xattr_listent(
args.value = xchk_xattr_valuebuf(sx->sc);
args.valuelen = valuelen;
- error = xfs_attr_get_ilocked(context->dp, &args);
+ error = xfs_attr_get_ilocked(&args);
+ /* ENODATA means the hash lookup failed and the attr is bad */
+ if (error == -ENODATA)
+ error = -EFSCORRUPTED;
if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
&error))
goto fail_xref;
@@ -474,7 +474,6 @@ xchk_xattr(
struct xfs_scrub *sc)
{
struct xchk_xattr sx;
- struct attrlist_cursor_kern cursor = { 0 };
xfs_dablk_t last_checked = -1U;
int error = 0;
@@ -493,11 +492,10 @@ xchk_xattr(
/* Check that every attr key can also be looked up by hash. */
sx.context.dp = sc->ip;
- sx.context.cursor = &cursor;
sx.context.resynch = 1;
sx.context.put_listent = xchk_xattr_listent;
sx.context.tp = sc->tp;
- sx.context.flags = ATTR_INCOMPLETE;
+ sx.context.allow_incomplete = true;
sx.sc = sc;
/*
@@ -516,7 +514,7 @@ xchk_xattr(
* iteration, which doesn't really follow the usual buffer
* locking order.
*/
- error = xfs_attr_list_int_ilocked(&sx.context);
+ error = xfs_attr_list_ilocked(&sx.context);
if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error))
goto out;
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 18a684e18a69..f88694f22d05 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -18,14 +18,14 @@
* This is the logical equivalent of bitmap |= mask(start, len).
*/
int
-xfs_bitmap_set(
- struct xfs_bitmap *bitmap,
+xbitmap_set(
+ struct xbitmap *bitmap,
uint64_t start,
uint64_t len)
{
- struct xfs_bitmap_range *bmr;
+ struct xbitmap_range *bmr;
- bmr = kmem_alloc(sizeof(struct xfs_bitmap_range), KM_MAYFAIL);
+ bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL);
if (!bmr)
return -ENOMEM;
@@ -39,13 +39,13 @@ xfs_bitmap_set(
/* Free everything related to this bitmap. */
void
-xfs_bitmap_destroy(
- struct xfs_bitmap *bitmap)
+xbitmap_destroy(
+ struct xbitmap *bitmap)
{
- struct xfs_bitmap_range *bmr;
- struct xfs_bitmap_range *n;
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
- for_each_xfs_bitmap_extent(bmr, n, bitmap) {
+ for_each_xbitmap_extent(bmr, n, bitmap) {
list_del(&bmr->list);
kmem_free(bmr);
}
@@ -53,24 +53,24 @@ xfs_bitmap_destroy(
/* Set up a per-AG block bitmap. */
void
-xfs_bitmap_init(
- struct xfs_bitmap *bitmap)
+xbitmap_init(
+ struct xbitmap *bitmap)
{
INIT_LIST_HEAD(&bitmap->list);
}
/* Compare two btree extents. */
static int
-xfs_bitmap_range_cmp(
+xbitmap_range_cmp(
void *priv,
struct list_head *a,
struct list_head *b)
{
- struct xfs_bitmap_range *ap;
- struct xfs_bitmap_range *bp;
+ struct xbitmap_range *ap;
+ struct xbitmap_range *bp;
- ap = container_of(a, struct xfs_bitmap_range, list);
- bp = container_of(b, struct xfs_bitmap_range, list);
+ ap = container_of(a, struct xbitmap_range, list);
+ bp = container_of(b, struct xbitmap_range, list);
if (ap->start > bp->start)
return 1;
@@ -96,14 +96,14 @@ xfs_bitmap_range_cmp(
#define LEFT_ALIGNED (1 << 0)
#define RIGHT_ALIGNED (1 << 1)
int
-xfs_bitmap_disunion(
- struct xfs_bitmap *bitmap,
- struct xfs_bitmap *sub)
+xbitmap_disunion(
+ struct xbitmap *bitmap,
+ struct xbitmap *sub)
{
struct list_head *lp;
- struct xfs_bitmap_range *br;
- struct xfs_bitmap_range *new_br;
- struct xfs_bitmap_range *sub_br;
+ struct xbitmap_range *br;
+ struct xbitmap_range *new_br;
+ struct xbitmap_range *sub_br;
uint64_t sub_start;
uint64_t sub_len;
int state;
@@ -113,8 +113,8 @@ xfs_bitmap_disunion(
return 0;
ASSERT(!list_empty(&sub->list));
- list_sort(NULL, &bitmap->list, xfs_bitmap_range_cmp);
- list_sort(NULL, &sub->list, xfs_bitmap_range_cmp);
+ list_sort(NULL, &bitmap->list, xbitmap_range_cmp);
+ list_sort(NULL, &sub->list, xbitmap_range_cmp);
/*
* Now that we've sorted both lists, we iterate bitmap once, rolling
@@ -124,11 +124,11 @@ xfs_bitmap_disunion(
* list traversal is similar to merge sort, but we're deleting
* instead. In this manner we avoid O(n^2) operations.
*/
- sub_br = list_first_entry(&sub->list, struct xfs_bitmap_range,
+ sub_br = list_first_entry(&sub->list, struct xbitmap_range,
list);
lp = bitmap->list.next;
while (lp != &bitmap->list) {
- br = list_entry(lp, struct xfs_bitmap_range, list);
+ br = list_entry(lp, struct xbitmap_range, list);
/*
* Advance sub_br and/or br until we find a pair that
@@ -181,7 +181,7 @@ xfs_bitmap_disunion(
* Deleting from the middle: add the new right extent
* and then shrink the left extent.
*/
- new_br = kmem_alloc(sizeof(struct xfs_bitmap_range),
+ new_br = kmem_alloc(sizeof(struct xbitmap_range),
KM_MAYFAIL);
if (!new_br) {
error = -ENOMEM;
@@ -247,8 +247,8 @@ out:
* blocks going from the leaf towards the root.
*/
int
-xfs_bitmap_set_btcur_path(
- struct xfs_bitmap *bitmap,
+xbitmap_set_btcur_path(
+ struct xbitmap *bitmap,
struct xfs_btree_cur *cur)
{
struct xfs_buf *bp;
@@ -261,7 +261,7 @@ xfs_bitmap_set_btcur_path(
if (!bp)
continue;
fsb = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn);
- error = xfs_bitmap_set(bitmap, fsb, 1);
+ error = xbitmap_set(bitmap, fsb, 1);
if (error)
return error;
}
@@ -271,12 +271,12 @@ xfs_bitmap_set_btcur_path(
/* Collect a btree's block in the bitmap. */
STATIC int
-xfs_bitmap_collect_btblock(
+xbitmap_collect_btblock(
struct xfs_btree_cur *cur,
int level,
void *priv)
{
- struct xfs_bitmap *bitmap = priv;
+ struct xbitmap *bitmap = priv;
struct xfs_buf *bp;
xfs_fsblock_t fsbno;
@@ -285,15 +285,30 @@ xfs_bitmap_collect_btblock(
return 0;
fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn);
- return xfs_bitmap_set(bitmap, fsbno, 1);
+ return xbitmap_set(bitmap, fsbno, 1);
}
/* Walk the btree and mark the bitmap wherever a btree block is found. */
int
-xfs_bitmap_set_btblocks(
- struct xfs_bitmap *bitmap,
+xbitmap_set_btblocks(
+ struct xbitmap *bitmap,
struct xfs_btree_cur *cur)
{
- return xfs_btree_visit_blocks(cur, xfs_bitmap_collect_btblock,
+ return xfs_btree_visit_blocks(cur, xbitmap_collect_btblock,
XFS_BTREE_VISIT_ALL, bitmap);
}
+
+/* How many bits are set in this bitmap? */
+uint64_t
+xbitmap_hweight(
+ struct xbitmap *bitmap)
+{
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
+ uint64_t ret = 0;
+
+ for_each_xbitmap_extent(bmr, n, bitmap)
+ ret += bmr->len;
+
+ return ret;
+}
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index ae8ecbce6fa6..900646b72de1 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -6,31 +6,32 @@
#ifndef __XFS_SCRUB_BITMAP_H__
#define __XFS_SCRUB_BITMAP_H__
-struct xfs_bitmap_range {
+struct xbitmap_range {
struct list_head list;
uint64_t start;
uint64_t len;
};
-struct xfs_bitmap {
+struct xbitmap {
struct list_head list;
};
-void xfs_bitmap_init(struct xfs_bitmap *bitmap);
-void xfs_bitmap_destroy(struct xfs_bitmap *bitmap);
+void xbitmap_init(struct xbitmap *bitmap);
+void xbitmap_destroy(struct xbitmap *bitmap);
-#define for_each_xfs_bitmap_extent(bex, n, bitmap) \
+#define for_each_xbitmap_extent(bex, n, bitmap) \
list_for_each_entry_safe((bex), (n), &(bitmap)->list, list)
-#define for_each_xfs_bitmap_block(b, bex, n, bitmap) \
+#define for_each_xbitmap_block(b, bex, n, bitmap) \
list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) \
- for ((b) = bex->start; (b) < bex->start + bex->len; (b)++)
+ for ((b) = (bex)->start; (b) < (bex)->start + (bex)->len; (b)++)
-int xfs_bitmap_set(struct xfs_bitmap *bitmap, uint64_t start, uint64_t len);
-int xfs_bitmap_disunion(struct xfs_bitmap *bitmap, struct xfs_bitmap *sub);
-int xfs_bitmap_set_btcur_path(struct xfs_bitmap *bitmap,
+int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len);
+int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub);
+int xbitmap_set_btcur_path(struct xbitmap *bitmap,
struct xfs_btree_cur *cur);
-int xfs_bitmap_set_btblocks(struct xfs_bitmap *bitmap,
+int xbitmap_set_btblocks(struct xbitmap *bitmap,
struct xfs_btree_cur *cur);
+uint64_t xbitmap_hweight(struct xbitmap *bitmap);
#endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index fa6ea6407992..add8598eacd5 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -374,7 +374,7 @@ xchk_bmapbt_rec(
struct xfs_bmbt_irec iext_irec;
struct xfs_iext_cursor icur;
struct xchk_bmap_info *info = bs->private;
- struct xfs_inode *ip = bs->cur->bc_private.b.ip;
+ struct xfs_inode *ip = bs->cur->bc_ino.ip;
struct xfs_buf *bp = NULL;
struct xfs_btree_block *block;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork);
@@ -501,7 +501,7 @@ xchk_bmap_check_rmap(
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
rec->rm_offset);
if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp,
- cur->bc_private.a.agno, rec->rm_startblock))
+ cur->bc_ag.agno, rec->rm_startblock))
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
rec->rm_offset);
if (irec.br_blockcount > rec->rm_blockcount)
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 97a15b6f2865..9a2e27ac1300 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -219,19 +219,21 @@ xchk_da_btree_block_check_sibling(
int direction,
xfs_dablk_t sibling)
{
+ struct xfs_da_state_path *path = &ds->state->path;
+ struct xfs_da_state_path *altpath = &ds->state->altpath;
int retval;
+ int plevel;
int error;
- memcpy(&ds->state->altpath, &ds->state->path,
- sizeof(ds->state->altpath));
+ memcpy(altpath, path, sizeof(ds->state->altpath));
/*
* If the pointer is null, we shouldn't be able to move the upper
* level pointer anywhere.
*/
if (sibling == 0) {
- error = xfs_da3_path_shift(ds->state, &ds->state->altpath,
- direction, false, &retval);
+ error = xfs_da3_path_shift(ds->state, altpath, direction,
+ false, &retval);
if (error == 0 && retval == 0)
xchk_da_set_corrupt(ds, level);
error = 0;
@@ -239,27 +241,33 @@ xchk_da_btree_block_check_sibling(
}
/* Move the alternate cursor one block in the direction given. */
- error = xfs_da3_path_shift(ds->state, &ds->state->altpath,
- direction, false, &retval);
+ error = xfs_da3_path_shift(ds->state, altpath, direction, false,
+ &retval);
if (!xchk_da_process_error(ds, level, &error))
- return error;
+ goto out;
if (retval) {
xchk_da_set_corrupt(ds, level);
- return error;
+ goto out;
}
- if (ds->state->altpath.blk[level].bp)
- xchk_buffer_recheck(ds->sc,
- ds->state->altpath.blk[level].bp);
+ if (altpath->blk[level].bp)
+ xchk_buffer_recheck(ds->sc, altpath->blk[level].bp);
/* Compare upper level pointer to sibling pointer. */
- if (ds->state->altpath.blk[level].blkno != sibling)
+ if (altpath->blk[level].blkno != sibling)
xchk_da_set_corrupt(ds, level);
- if (ds->state->altpath.blk[level].bp) {
- xfs_trans_brelse(ds->dargs.trans,
- ds->state->altpath.blk[level].bp);
- ds->state->altpath.blk[level].bp = NULL;
- }
+
out:
+ /* Free all buffers in the altpath that aren't referenced from path. */
+ for (plevel = 0; plevel < altpath->active; plevel++) {
+ if (altpath->blk[plevel].bp == NULL ||
+ (plevel < path->active &&
+ altpath->blk[plevel].bp == path->blk[plevel].bp))
+ continue;
+
+ xfs_trans_brelse(ds->dargs.trans, altpath->blk[plevel].bp);
+ altpath->blk[plevel].bp = NULL;
+ }
+
return error;
}
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 266da4e4bde6..fe2a6e030c8a 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -155,6 +155,9 @@ xchk_dir_actor(
xname.type = XFS_DIR3_FT_UNKNOWN;
error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL);
+ /* ENOENT means the hash lookup failed and the dir is corrupt */
+ if (error == -ENOENT)
+ error = -EFSCORRUPTED;
if (!xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset,
&error))
goto out;
@@ -500,7 +503,7 @@ xchk_directory_leaf1_bestfree(
/* Read the free space block. */
error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
- goto out;
+ return error;
xchk_buffer_recheck(sc, bp);
leaf = bp->b_addr;
@@ -565,9 +568,10 @@ xchk_directory_leaf1_bestfree(
xchk_directory_check_freesp(sc, lblk, dbp, best);
xfs_trans_brelse(sc->tp, dbp);
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- goto out;
+ break;
}
out:
+ xfs_trans_brelse(sc->tp, bp);
return error;
}
@@ -589,7 +593,7 @@ xchk_directory_free_bestfree(
/* Read the free space block */
error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
- goto out;
+ return error;
xchk_buffer_recheck(sc, bp);
if (xfs_sb_version_hascrc(&sc->mp->m_sb)) {
@@ -612,7 +616,7 @@ xchk_directory_free_bestfree(
0, &dbp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
- break;
+ goto out;
xchk_directory_check_freesp(sc, lblk, dbp, best);
xfs_trans_brelse(sc->tp, dbp);
}
@@ -620,6 +624,7 @@ xchk_directory_free_bestfree(
if (freehdr.nused + stale != freehdr.nvalid)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
out:
+ xfs_trans_brelse(sc->tp, bp);
return error;
}
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 681758704fda..64c217eb06a7 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -104,7 +104,7 @@ xchk_iallocbt_chunk(
xfs_extlen_t len)
{
struct xfs_mount *mp = bs->cur->bc_mp;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.agno;
xfs_agblock_t bno;
bno = XFS_AGINO_TO_AGBNO(mp, agino);
@@ -164,7 +164,7 @@ xchk_iallocbt_check_cluster_ifree(
* the record, compute which fs inode we're talking about.
*/
agino = irec->ir_startino + irec_ino;
- fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+ fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.agno, agino);
irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
@@ -215,7 +215,7 @@ xchk_iallocbt_check_cluster(
struct xfs_dinode *dip;
struct xfs_buf *cluster_bp;
unsigned int nr_inodes;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.agno;
xfs_agblock_t agbno;
unsigned int cluster_index;
uint16_t cluster_mask = 0;
@@ -426,7 +426,7 @@ xchk_iallocbt_rec(
struct xchk_iallocbt *iabt = bs->private;
struct xfs_inobt_rec_incore irec;
uint64_t holes;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.agno;
xfs_agino_t agino;
xfs_extlen_t len;
int holecount;
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 0cab11a5d390..beaeb6fa3119 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -336,7 +336,7 @@ xchk_refcountbt_rec(
{
struct xfs_mount *mp = bs->cur->bc_mp;
xfs_agblock_t *cow_blocks = bs->private;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.agno;
xfs_agblock_t bno;
xfs_extlen_t len;
xfs_nlink_t refcount;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index e489d7a8446a..db3cfd12803d 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -208,8 +208,10 @@ xrep_calc_ag_resblks(
/* Now grab the block counters from the AGF. */
error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
if (!error) {
- aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
- freelen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_freeblks);
+ struct xfs_agf *agf = bp->b_addr;
+
+ aglen = be32_to_cpu(agf->agf_length);
+ freelen = be32_to_cpu(agf->agf_freeblks);
usedlen = aglen - freelen;
xfs_buf_relse(bp);
}
@@ -434,10 +436,10 @@ xrep_init_btblock(
int
xrep_invalidate_blocks(
struct xfs_scrub *sc,
- struct xfs_bitmap *bitmap)
+ struct xbitmap *bitmap)
{
- struct xfs_bitmap_range *bmr;
- struct xfs_bitmap_range *n;
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
struct xfs_buf *bp;
xfs_fsblock_t fsbno;
@@ -449,7 +451,7 @@ xrep_invalidate_blocks(
* because we never own those; and if we can't TRYLOCK the buffer we
* assume it's owned by someone else.
*/
- for_each_xfs_bitmap_block(fsbno, bmr, n, bitmap) {
+ for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
/* Skip AG headers and post-EOFS blocks */
if (!xfs_verify_fsbno(sc->mp, fsbno))
continue;
@@ -595,18 +597,18 @@ out_free:
int
xrep_reap_extents(
struct xfs_scrub *sc,
- struct xfs_bitmap *bitmap,
+ struct xbitmap *bitmap,
const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type)
{
- struct xfs_bitmap_range *bmr;
- struct xfs_bitmap_range *n;
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
xfs_fsblock_t fsbno;
int error = 0;
ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb));
- for_each_xfs_bitmap_block(fsbno, bmr, n, bitmap) {
+ for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
ASSERT(sc->ip != NULL ||
XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.agno);
trace_xrep_dispose_btree_extent(sc->mp,
@@ -615,11 +617,9 @@ xrep_reap_extents(
error = xrep_reap_block(sc, fsbno, oinfo, type);
if (error)
- goto out;
+ break;
}
-out:
- xfs_bitmap_destroy(bitmap);
return error;
}
@@ -879,7 +879,7 @@ xrep_find_ag_btree_roots(
ri.sc = sc;
ri.btree_info = btree_info;
- ri.agf = XFS_BUF_TO_AGF(agf_bp);
+ ri.agf = agf_bp->b_addr;
ri.agfl_bp = agfl_bp;
for (fab = btree_info; fab->buf_ops; fab++) {
ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index c3422403b169..04a47d45605b 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -28,11 +28,11 @@ int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb,
struct xfs_buf **bpp, xfs_btnum_t btnum,
const struct xfs_buf_ops *ops);
-struct xfs_bitmap;
+struct xbitmap;
int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
-int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xfs_bitmap *btlist);
-int xrep_reap_extents(struct xfs_scrub *sc, struct xfs_bitmap *exlist,
+int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist);
+int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist,
const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
struct xrep_find_ag_btree {
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 8d4cefd761c1..f4fcb4719f41 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -92,7 +92,7 @@ xchk_rmapbt_rec(
{
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_rmap_irec irec;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.agno;
bool non_inode;
bool is_unwritten;
bool is_bmbt;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index f1775bb19313..8ebf35b115ce 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -168,6 +168,7 @@ xchk_teardown(
xfs_irele(sc->ip);
sc->ip = NULL;
}
+ sb_end_write(sc->mp->m_super);
if (sc->flags & XCHK_REAPING_DISABLED)
xchk_start_reaping(sc);
if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) {
@@ -490,6 +491,14 @@ xfs_scrub_metadata(
sc.ops = &meta_scrub_ops[sm->sm_type];
sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
retry_op:
+ /*
+ * If freeze runs concurrently with a scrub, the freeze can be delayed
+ * indefinitely as we walk the filesystem and iterate over metadata
+ * buffers. Freeze quiesces the log (which waits for the buffer LRU to
+ * be emptied) and that won't happen while checking is running.
+ */
+ sb_start_write(mp->m_super);
+
/* Set up for the operation. */
error = sc.ops->setup(&sc, ip);
if (error)
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 9eaab2eb5ed3..2c6c248be823 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -24,9 +24,9 @@ xchk_btree_cur_fsbno(
return XFS_DADDR_TO_FSB(cur->bc_mp, cur->bc_bufs[level]->b_bn);
else if (level == cur->bc_nlevels - 1 &&
cur->bc_flags & XFS_BTREE_LONG_PTRS)
- return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_private.b.ip->i_ino);
+ return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino);
else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
- return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, 0);
+ return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, 0);
return NULLFSBLOCK;
}
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 096203119934..e46f5cef90da 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -379,7 +379,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
__entry->dev = sc->mp->m_super->s_dev;
__entry->ino = sc->ip->i_ino;
- __entry->whichfork = cur->bc_private.b.whichfork;
+ __entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
__entry->btnum = cur->bc_btnum;
__entry->level = level;
@@ -459,7 +459,7 @@ TRACE_EVENT(xchk_ifork_btree_error,
xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
__entry->dev = sc->mp->m_super->s_dev;
__entry->ino = sc->ip->i_ino;
- __entry->whichfork = cur->bc_private.b.whichfork;
+ __entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
__entry->btnum = cur->bc_btnum;
__entry->level = level;
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index cd743fad8478..d4c687b5cd06 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -14,6 +14,8 @@
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_acl.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include <linux/posix_acl_xattr.h>
@@ -67,10 +69,12 @@ xfs_acl_from_disk(
switch (acl_e->e_tag) {
case ACL_USER:
- acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
+ acl_e->e_uid = make_kuid(&init_user_ns,
+ be32_to_cpu(ace->ae_id));
break;
case ACL_GROUP:
- acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));
+ acl_e->e_gid = make_kgid(&init_user_ns,
+ be32_to_cpu(ace->ae_id));
break;
case ACL_USER_OBJ:
case ACL_GROUP_OBJ:
@@ -103,10 +107,12 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
ace->ae_tag = cpu_to_be32(acl_e->e_tag);
switch (acl_e->e_tag) {
case ACL_USER:
- ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid));
+ ace->ae_id = cpu_to_be32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
break;
case ACL_GROUP:
- ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid));
+ ace->ae_id = cpu_to_be32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
break;
default:
ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
@@ -120,102 +126,86 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
struct posix_acl *
xfs_get_acl(struct inode *inode, int type)
{
- struct xfs_inode *ip = XFS_I(inode);
- struct posix_acl *acl = NULL;
- struct xfs_acl *xfs_acl = NULL;
- unsigned char *ea_name;
- int error;
- int len;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct posix_acl *acl = NULL;
+ struct xfs_da_args args = {
+ .dp = ip,
+ .attr_filter = XFS_ATTR_ROOT,
+ .valuelen = XFS_ACL_MAX_SIZE(mp),
+ };
+ int error;
trace_xfs_get_acl(ip);
switch (type) {
case ACL_TYPE_ACCESS:
- ea_name = SGI_ACL_FILE;
+ args.name = SGI_ACL_FILE;
break;
case ACL_TYPE_DEFAULT:
- ea_name = SGI_ACL_DEFAULT;
+ args.name = SGI_ACL_DEFAULT;
break;
default:
BUG();
}
+ args.namelen = strlen(args.name);
/*
- * If we have a cached ACLs value just return it, not need to
- * go out to the disk.
+ * If the attribute doesn't exist make sure we have a negative cache
+ * entry, for any other error assume it is transient.
*/
- len = XFS_ACL_MAX_SIZE(ip->i_mount);
- error = xfs_attr_get(ip, ea_name, strlen(ea_name),
- (unsigned char **)&xfs_acl, &len,
- ATTR_ALLOC | ATTR_ROOT);
- if (error) {
- /*
- * If the attribute doesn't exist make sure we have a negative
- * cache entry, for any other error assume it is transient.
- */
- if (error != -ENOATTR)
- acl = ERR_PTR(error);
- } else {
- acl = xfs_acl_from_disk(ip->i_mount, xfs_acl, len,
- XFS_ACL_MAX_ENTRIES(ip->i_mount));
- kmem_free(xfs_acl);
+ error = xfs_attr_get(&args);
+ if (!error) {
+ acl = xfs_acl_from_disk(mp, args.value, args.valuelen,
+ XFS_ACL_MAX_ENTRIES(mp));
+ } else if (error != -ENOATTR) {
+ acl = ERR_PTR(error);
}
+
+ kmem_free(args.value);
return acl;
}
int
__xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct xfs_inode *ip = XFS_I(inode);
- unsigned char *ea_name;
- int error;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_da_args args = {
+ .dp = ip,
+ .attr_filter = XFS_ATTR_ROOT,
+ };
+ int error;
switch (type) {
case ACL_TYPE_ACCESS:
- ea_name = SGI_ACL_FILE;
+ args.name = SGI_ACL_FILE;
break;
case ACL_TYPE_DEFAULT:
if (!S_ISDIR(inode->i_mode))
return acl ? -EACCES : 0;
- ea_name = SGI_ACL_DEFAULT;
+ args.name = SGI_ACL_DEFAULT;
break;
default:
return -EINVAL;
}
+ args.namelen = strlen(args.name);
if (acl) {
- struct xfs_acl *xfs_acl;
- int len = XFS_ACL_MAX_SIZE(ip->i_mount);
-
- xfs_acl = kmem_zalloc_large(len, 0);
- if (!xfs_acl)
+ args.valuelen = XFS_ACL_SIZE(acl->a_count);
+ args.value = kmem_zalloc_large(args.valuelen, 0);
+ if (!args.value)
return -ENOMEM;
-
- xfs_acl_to_disk(xfs_acl, acl);
-
- /* subtract away the unused acl entries */
- len -= sizeof(struct xfs_acl_entry) *
- (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
-
- error = xfs_attr_set(ip, ea_name, strlen(ea_name),
- (unsigned char *)xfs_acl, len, ATTR_ROOT);
-
- kmem_free(xfs_acl);
- } else {
- /*
- * A NULL ACL argument means we want to remove the ACL.
- */
- error = xfs_attr_remove(ip, ea_name,
- strlen(ea_name),
- ATTR_ROOT);
-
- /*
- * If the attribute didn't exist to start with that's fine.
- */
- if (error == -ENOATTR)
- error = 0;
+ xfs_acl_to_disk(args.value, acl);
}
+ error = xfs_attr_set(&args);
+ kmem_free(args.value);
+
+ /*
+ * If the attribute didn't exist to start with that's fine.
+ */
+ if (!acl && error == -ENOATTR)
+ error = 0;
if (!error)
set_cached_acl(inode, type, acl);
return error;
@@ -275,3 +265,19 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return error;
}
+
+/*
+ * Invalidate any cached ACLs if the user has bypassed the ACL interface.
+ * We don't validate the content whatsoever so it is caller responsibility to
+ * provide data in valid format and ensure i_mode is consistent.
+ */
+void
+xfs_forget_acl(
+ struct inode *inode,
+ const char *name)
+{
+ if (!strcmp(name, SGI_ACL_FILE))
+ forget_cached_acl(inode, ACL_TYPE_ACCESS);
+ else if (!strcmp(name, SGI_ACL_DEFAULT))
+ forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+}
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 94615e34bc86..c042c0868016 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -13,14 +13,16 @@ struct posix_acl;
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+void xfs_forget_acl(struct inode *inode, const char *name);
#else
static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
{
return NULL;
}
# define xfs_set_acl NULL
+static inline void xfs_forget_acl(struct inode *inode, const char *name)
+{
+}
#endif /* CONFIG_XFS_POSIX_ACL */
-extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags);
-
#endif /* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 58e937be24ce..9d9cebf18726 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -539,7 +539,7 @@ xfs_discard_page(
if (XFS_FORCED_SHUTDOWN(mp))
goto out_invalidate;
- xfs_alert(mp,
+ xfs_alert_ratelimited(mp,
"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
page, ip->i_ino, offset);
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index bbfa6ba84dcd..c42f90e16b4f 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -145,8 +145,8 @@ xfs_attr3_node_inactive(
* Since this code is recursive (gasp!) we must protect ourselves.
*/
if (level > XFS_DA_NODE_MAXDEPTH) {
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
- xfs_buf_corruption_error(bp);
return -EFSCORRUPTED;
}
@@ -194,7 +194,7 @@ xfs_attr3_node_inactive(
error = xfs_attr3_leaf_inactive(trans, dp, child_bp);
break;
default:
- xfs_buf_corruption_error(child_bp);
+ xfs_buf_mark_corrupt(child_bp);
xfs_trans_brelse(*trans, child_bp);
error = -EFSCORRUPTED;
break;
@@ -289,7 +289,7 @@ xfs_attr3_root_inactive(
break;
default:
error = -EFSCORRUPTED;
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(*trans, bp);
break;
}
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index d37743bdf274..5ff1d929d3b5 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -52,24 +52,19 @@ static int
xfs_attr_shortform_list(
struct xfs_attr_list_context *context)
{
- struct attrlist_cursor_kern *cursor;
+ struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
+ struct xfs_inode *dp = context->dp;
struct xfs_attr_sf_sort *sbuf, *sbp;
struct xfs_attr_shortform *sf;
struct xfs_attr_sf_entry *sfe;
- struct xfs_inode *dp;
int sbsize, nsbuf, count, i;
int error = 0;
- ASSERT(context != NULL);
- dp = context->dp;
- ASSERT(dp != NULL);
ASSERT(dp->i_afp != NULL);
sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
ASSERT(sf != NULL);
if (!sf->hdr.count)
return 0;
- cursor = context->cursor;
- ASSERT(cursor != NULL);
trace_xfs_attr_list_sf(context);
@@ -205,7 +200,7 @@ out:
STATIC int
xfs_attr_node_list_lookup(
struct xfs_attr_list_context *context,
- struct attrlist_cursor_kern *cursor,
+ struct xfs_attrlist_cursor_kern *cursor,
struct xfs_buf **pbp)
{
struct xfs_da3_icnode_hdr nodehdr;
@@ -279,7 +274,7 @@ xfs_attr_node_list_lookup(
return 0;
out_corruptbuf:
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(tp, bp);
return -EFSCORRUPTED;
}
@@ -288,8 +283,8 @@ STATIC int
xfs_attr_node_list(
struct xfs_attr_list_context *context)
{
+ struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
struct xfs_attr3_icleaf_hdr leafhdr;
- struct attrlist_cursor_kern *cursor;
struct xfs_attr_leafblock *leaf;
struct xfs_da_intnode *node;
struct xfs_buf *bp;
@@ -299,7 +294,6 @@ xfs_attr_node_list(
trace_xfs_attr_node_list(context);
- cursor = context->cursor;
cursor->initted = 1;
/*
@@ -394,7 +388,7 @@ xfs_attr3_leaf_list_int(
struct xfs_buf *bp,
struct xfs_attr_list_context *context)
{
- struct attrlist_cursor_kern *cursor;
+ struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
struct xfs_attr_leafblock *leaf;
struct xfs_attr3_icleaf_hdr ichdr;
struct xfs_attr_leaf_entry *entries;
@@ -408,7 +402,6 @@ xfs_attr3_leaf_list_int(
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
- cursor = context->cursor;
cursor->initted = 1;
/*
@@ -452,8 +445,8 @@ xfs_attr3_leaf_list_int(
}
if ((entry->flags & XFS_ATTR_INCOMPLETE) &&
- !(context->flags & ATTR_INCOMPLETE))
- continue; /* skip incomplete entries */
+ !context->allow_incomplete)
+ continue;
if (entry->flags & XFS_ATTR_LOCAL) {
xfs_attr_leaf_name_local_t *name_loc;
@@ -488,14 +481,15 @@ xfs_attr3_leaf_list_int(
* Copy out attribute entries for attr_list(), for leaf attribute lists.
*/
STATIC int
-xfs_attr_leaf_list(xfs_attr_list_context_t *context)
+xfs_attr_leaf_list(
+ struct xfs_attr_list_context *context)
{
- int error;
- struct xfs_buf *bp;
+ struct xfs_buf *bp;
+ int error;
trace_xfs_attr_leaf_list(context);
- context->cursor->blkno = 0;
+ context->cursor.blkno = 0;
error = xfs_attr3_leaf_read(context->tp, context->dp, 0, &bp);
if (error)
return error;
@@ -506,7 +500,7 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
}
int
-xfs_attr_list_int_ilocked(
+xfs_attr_list_ilocked(
struct xfs_attr_list_context *context)
{
struct xfs_inode *dp = context->dp;
@@ -526,12 +520,12 @@ xfs_attr_list_int_ilocked(
}
int
-xfs_attr_list_int(
- xfs_attr_list_context_t *context)
+xfs_attr_list(
+ struct xfs_attr_list_context *context)
{
- int error;
- xfs_inode_t *dp = context->dp;
- uint lock_mode;
+ struct xfs_inode *dp = context->dp;
+ uint lock_mode;
+ int error;
XFS_STATS_INC(dp->i_mount, xs_attr_list);
@@ -539,130 +533,7 @@ xfs_attr_list_int(
return -EIO;
lock_mode = xfs_ilock_attr_map_shared(dp);
- error = xfs_attr_list_int_ilocked(context);
+ error = xfs_attr_list_ilocked(context);
xfs_iunlock(dp, lock_mode);
return error;
}
-
-#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
- (((struct attrlist_ent *) 0)->a_name - (char *) 0)
-#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
- ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(uint32_t)-1) \
- & ~(sizeof(uint32_t)-1))
-
-/*
- * Format an attribute and copy it out to the user's buffer.
- * Take care to check values and protect against them changing later,
- * we may be reading them directly out of a user buffer.
- */
-STATIC void
-xfs_attr_put_listent(
- xfs_attr_list_context_t *context,
- int flags,
- unsigned char *name,
- int namelen,
- int valuelen)
-{
- struct attrlist *alist = (struct attrlist *)context->alist;
- attrlist_ent_t *aep;
- int arraytop;
-
- ASSERT(!context->seen_enough);
- ASSERT(!(context->flags & ATTR_KERNOVAL));
- ASSERT(context->count >= 0);
- ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
- ASSERT(context->firstu >= sizeof(*alist));
- ASSERT(context->firstu <= context->bufsize);
-
- /*
- * Only list entries in the right namespace.
- */
- if (((context->flags & ATTR_SECURE) == 0) !=
- ((flags & XFS_ATTR_SECURE) == 0))
- return;
- if (((context->flags & ATTR_ROOT) == 0) !=
- ((flags & XFS_ATTR_ROOT) == 0))
- return;
-
- arraytop = sizeof(*alist) +
- context->count * sizeof(alist->al_offset[0]);
- context->firstu -= ATTR_ENTSIZE(namelen);
- if (context->firstu < arraytop) {
- trace_xfs_attr_list_full(context);
- alist->al_more = 1;
- context->seen_enough = 1;
- return;
- }
-
- aep = (attrlist_ent_t *)&context->alist[context->firstu];
- aep->a_valuelen = valuelen;
- memcpy(aep->a_name, name, namelen);
- aep->a_name[namelen] = 0;
- alist->al_offset[context->count++] = context->firstu;
- alist->al_count = context->count;
- trace_xfs_attr_list_add(context);
- return;
-}
-
-/*
- * Generate a list of extended attribute names and optionally
- * also value lengths. Positive return value follows the XFS
- * convention of being an error, zero or negative return code
- * is the length of the buffer returned (negated), indicating
- * success.
- */
-int
-xfs_attr_list(
- xfs_inode_t *dp,
- char *buffer,
- int bufsize,
- int flags,
- attrlist_cursor_kern_t *cursor)
-{
- xfs_attr_list_context_t context;
- struct attrlist *alist;
- int error;
-
- /*
- * Validate the cursor.
- */
- if (cursor->pad1 || cursor->pad2)
- return -EINVAL;
- if ((cursor->initted == 0) &&
- (cursor->hashval || cursor->blkno || cursor->offset))
- return -EINVAL;
-
- /* Only internal consumers can retrieve incomplete attrs. */
- if (flags & ATTR_INCOMPLETE)
- return -EINVAL;
-
- /*
- * Check for a properly aligned buffer.
- */
- if (((long)buffer) & (sizeof(int)-1))
- return -EFAULT;
- if (flags & ATTR_KERNOVAL)
- bufsize = 0;
-
- /*
- * Initialize the output buffer.
- */
- memset(&context, 0, sizeof(context));
- context.dp = dp;
- context.cursor = cursor;
- context.resynch = 1;
- context.flags = flags;
- context.alist = buffer;
- context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
- context.firstu = context.bufsize;
- context.put_listent = xfs_attr_put_listent;
-
- alist = (struct attrlist *)context.alist;
- alist->al_count = 0;
- alist->al_more = 0;
- alist->al_offset[0] = context.bufsize;
-
- error = xfs_attr_list_int(&context);
- ASSERT(error <= 0);
- return error;
-}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index e62fb5216341..4f800f7fe888 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1062,7 +1062,6 @@ xfs_collapse_file_space(
int error;
xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len);
xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
- uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
bool done = false;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
@@ -1078,32 +1077,34 @@ xfs_collapse_file_space(
if (error)
return error;
- while (!error && !done) {
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
- &tp);
- if (error)
- break;
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+ if (error)
+ return error;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
- ip->i_gdquot, ip->i_pdquot, resblks, 0,
- XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto out_trans_cancel;
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+ while (!done) {
error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
&done);
if (error)
goto out_trans_cancel;
+ if (done)
+ break;
- error = xfs_trans_commit(tp);
+ /* finish any deferred frees and roll the transaction */
+ error = xfs_defer_finish(&tp);
+ if (error)
+ goto out_trans_cancel;
}
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1146,35 +1147,41 @@ xfs_insert_file_space(
if (error)
return error;
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
/*
* The extent shifting code works on extent granularity. So, if stop_fsb
* is not the starting block of extent, we need to split the extent at
* stop_fsb.
*/
- error = xfs_bmap_split_extent(ip, stop_fsb);
+ error = xfs_bmap_split_extent(tp, ip, stop_fsb);
if (error)
- return error;
+ goto out_trans_cancel;
- while (!error && !done) {
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0,
- &tp);
+ do {
+ error = xfs_trans_roll_inode(&tp, ip);
if (error)
- break;
+ goto out_trans_cancel;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb,
&done, stop_fsb);
if (error)
goto out_trans_cancel;
+ } while (!done);
- error = xfs_trans_commit(tp);
- }
-
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1442,12 +1449,12 @@ xfs_swap_extent_forks(
* event of a crash. Set the owner change log flags now and leave the
* bmbt scan as the last step.
*/
- if (ip->i_d.di_version == 3 &&
- ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
- (*target_log_flags) |= XFS_ILOG_DOWNER;
- if (tip->i_d.di_version == 3 &&
- tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
- (*src_log_flags) |= XFS_ILOG_DOWNER;
+ if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+ (*target_log_flags) |= XFS_ILOG_DOWNER;
+ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+ (*src_log_flags) |= XFS_ILOG_DOWNER;
+ }
/*
* Swap the data forks of the inodes
@@ -1482,7 +1489,7 @@ xfs_swap_extent_forks(
(*src_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
- ASSERT(ip->i_d.di_version < 3 ||
+ ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
(*src_log_flags & XFS_ILOG_DOWNER));
(*src_log_flags) |= XFS_ILOG_DBROOT;
break;
@@ -1494,7 +1501,7 @@ xfs_swap_extent_forks(
break;
case XFS_DINODE_FMT_BTREE:
(*target_log_flags) |= XFS_ILOG_DBROOT;
- ASSERT(tip->i_d.di_version < 3 ||
+ ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
(*target_log_flags & XFS_ILOG_DOWNER));
break;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 217e4f82a44a..9ec3eaf1c618 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -327,6 +327,9 @@ xfs_buf_free(
__free_page(page);
}
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab +=
+ bp->b_page_count;
} else if (bp->b_flags & _XBF_KMEM)
kmem_free(bp->b_addr);
_xfs_buf_free_pages(bp);
@@ -727,8 +730,9 @@ found:
if (!bp->b_addr) {
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
- xfs_warn(target->bt_mount,
- "%s: failed to map pagesn", __func__);
+ xfs_warn_ratelimited(target->bt_mount,
+ "%s: failed to map %u pages", __func__,
+ bp->b_page_count);
xfs_buf_relse(bp);
return error;
}
@@ -1238,7 +1242,7 @@ xfs_buf_ioerror_alert(
struct xfs_buf *bp,
xfs_failaddr_t func)
{
- xfs_alert(bp->b_mount,
+ xfs_alert_ratelimited(bp->b_mount,
"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
-bp->b_error);
@@ -1573,6 +1577,28 @@ xfs_buf_zero(
}
/*
+ * Log a message about and stale a buffer that a caller has decided is corrupt.
+ *
+ * This function should be called for the kinds of metadata corruption that
+ * cannot be detect from a verifier, such as incorrect inter-block relationship
+ * data. Do /not/ call this function from a verifier function.
+ *
+ * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
+ * be marked stale, but b_error will not be set. The caller is responsible for
+ * releasing the buffer or fixing it.
+ */
+void
+__xfs_buf_mark_corrupt(
+ struct xfs_buf *bp,
+ xfs_failaddr_t fa)
+{
+ ASSERT(bp->b_flags & XBF_DONE);
+
+ xfs_buf_corruption_error(bp, fa);
+ xfs_buf_stale(bp);
+}
+
+/*
* Handling of buffer targets (buftargs).
*/
@@ -2091,9 +2117,11 @@ xfs_buf_delwri_pushbuf(
int __init
xfs_buf_init(void)
{
- xfs_buf_zone = kmem_cache_create("xfs_buf",
- sizeof(struct xfs_buf), 0,
- SLAB_HWCACHE_ALIGN, NULL);
+ xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT |
+ SLAB_MEM_SPREAD,
+ NULL);
if (!xfs_buf_zone)
goto out;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index d79a1fe5d738..9a04c53c2488 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -272,6 +272,8 @@ static inline int xfs_buf_submit(struct xfs_buf *bp)
}
void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
+void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
+#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address)
/* Buffer Utility Routines */
extern void *xfs_buf_offset(struct xfs_buf *, size_t);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 663810e6cd59..1545657c3ca0 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -345,7 +345,7 @@ xfs_buf_item_format(
* occurs during recovery.
*/
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
- if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
+ if (xfs_sb_version_has_v3inode(&lip->li_mountp->m_sb) ||
!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
xfs_log_item_in_current_chkpt(lip)))
bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 0d3b640cf1cc..871ec22c9aee 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -147,7 +147,7 @@ xfs_dir2_block_getdents(
xfs_off_t cook;
struct xfs_da_geometry *geo = args->geo;
int lock_mode;
- unsigned int offset;
+ unsigned int offset, next_offset;
unsigned int end;
/*
@@ -173,9 +173,10 @@ xfs_dir2_block_getdents(
* Loop over the data portion of the block.
* Each object is a real entry (dep) or an unused one (dup).
*/
- offset = geo->data_entry_offset;
end = xfs_dir3_data_end_offset(geo, bp->b_addr);
- while (offset < end) {
+ for (offset = geo->data_entry_offset;
+ offset < end;
+ offset = next_offset) {
struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
uint8_t filetype;
@@ -184,14 +185,15 @@ xfs_dir2_block_getdents(
* Unused, skip it.
*/
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- offset += be16_to_cpu(dup->length);
+ next_offset = offset + be16_to_cpu(dup->length);
continue;
}
/*
* Bump pointer for the next iteration.
*/
- offset += xfs_dir2_data_entsize(dp->i_mount, dep->namelen);
+ next_offset = offset +
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen);
/*
* The entry is before the desired starting point, skip it.
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 0b8350e84d28..f979d0d7e6cd 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -31,6 +31,7 @@ xfs_trim_extents(
struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
struct xfs_btree_cur *cur;
struct xfs_buf *agbp;
+ struct xfs_agf *agf;
struct xfs_perag *pag;
int error;
int i;
@@ -47,14 +48,14 @@ xfs_trim_extents(
error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
if (error)
goto out_put_perag;
+ agf = agbp->b_addr;
cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
/*
* Look up the longest btree in the AGF and start with it.
*/
- error = xfs_alloc_lookup_ge(cur, 0,
- be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
+ error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i);
if (error)
goto out_del_cursor;
@@ -75,7 +76,7 @@ xfs_trim_extents(
error = -EFSCORRUPTED;
goto out_del_cursor;
}
- ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
+ ASSERT(flen <= be32_to_cpu(agf->agf_longest));
/*
* use daddr format for all range/len calculations as that is
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index d223e1ae90a6..af2c8e5ceea0 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -829,9 +829,9 @@ xfs_qm_id_for_quotatype(
{
switch (type) {
case XFS_DQ_USER:
- return ip->i_d.di_uid;
+ return i_uid_read(VFS_I(ip));
case XFS_DQ_GROUP:
- return ip->i_d.di_gid;
+ return i_gid_read(VFS_I(ip));
case XFS_DQ_PROJ:
return ip->i_d.di_projid;
}
@@ -1105,8 +1105,8 @@ xfs_qm_dqflush(
* Get the buffer containing the on-disk dquot
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0, &bp,
- &xfs_dquot_buf_ops);
+ mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK,
+ &bp, &xfs_dquot_buf_ops);
if (error)
goto out_unlock;
@@ -1177,7 +1177,7 @@ xfs_qm_dqflush(
out_unlock:
xfs_dqfunlock(dqp);
- return -EIO;
+ return error;
}
/*
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index d60647d7197b..baad1748d0d1 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -189,7 +189,8 @@ xfs_qm_dquot_logitem_push(
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
- }
+ } else if (error == -EAGAIN)
+ rval = XFS_ITEM_LOCKED;
spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
@@ -307,36 +308,62 @@ xfs_qm_qoffend_logitem_committed(
{
struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip);
struct xfs_qoff_logitem *qfs = qfe->qql_start_lip;
- struct xfs_ail *ailp = qfs->qql_item.li_ailp;
- /*
- * Delete the qoff-start logitem from the AIL.
- * xfs_trans_ail_delete() drops the AIL lock.
- */
- spin_lock(&ailp->ail_lock);
- xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_qm_qoff_logitem_relse(qfs);
- kmem_free(qfs->qql_item.li_lv_shadow);
kmem_free(lip->li_lv_shadow);
- kmem_free(qfs);
kmem_free(qfe);
return (xfs_lsn_t)-1;
}
+STATIC void
+xfs_qm_qoff_logitem_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_qoff_logitem *qoff = QOFF_ITEM(lip);
+
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
+ if (qoff->qql_start_lip)
+ xfs_qm_qoff_logitem_relse(qoff->qql_start_lip);
+ xfs_qm_qoff_logitem_relse(qoff);
+ }
+}
+
static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
.iop_size = xfs_qm_qoff_logitem_size,
.iop_format = xfs_qm_qoff_logitem_format,
.iop_committed = xfs_qm_qoffend_logitem_committed,
.iop_push = xfs_qm_qoff_logitem_push,
+ .iop_release = xfs_qm_qoff_logitem_release,
};
static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
.iop_size = xfs_qm_qoff_logitem_size,
.iop_format = xfs_qm_qoff_logitem_format,
.iop_push = xfs_qm_qoff_logitem_push,
+ .iop_release = xfs_qm_qoff_logitem_release,
};
/*
+ * Delete the quotaoff intent from the AIL and free it. On success,
+ * this should only be called for the start item. It can be used for
+ * either on shutdown or abort.
+ */
+void
+xfs_qm_qoff_logitem_relse(
+ struct xfs_qoff_logitem *qoff)
+{
+ struct xfs_log_item *lip = &qoff->qql_item;
+
+ ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags) ||
+ test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
+ XFS_FORCED_SHUTDOWN(lip->li_mountp));
+ xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
+ kmem_free(lip->li_lv_shadow);
+ kmem_free(qoff);
+}
+
+/*
* Allocate and initialize an quotaoff item of the correct quota type(s).
*/
struct xfs_qoff_logitem *
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 3bb19e556ade..2b86a43d7ce2 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -28,6 +28,7 @@ void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp);
struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp,
struct xfs_qoff_logitem *start,
uint flags);
+void xfs_qm_qoff_logitem_relse(struct xfs_qoff_logitem *);
struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct xfs_trans *tp,
struct xfs_qoff_logitem *startqoff,
uint flags);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 331765afc53e..a21e9cc6516a 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -345,16 +345,19 @@ xfs_corruption_error(
* Complain about the kinds of metadata corruption that we can't detect from a
* verifier, such as incorrect inter-block relationship data. Does not set
* bp->b_error.
+ *
+ * Call xfs_buf_mark_corrupt, not this function.
*/
void
xfs_buf_corruption_error(
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ xfs_failaddr_t fa)
{
struct xfs_mount *mp = bp->b_mount;
xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
"Metadata corruption detected at %pS, %s block 0x%llx",
- __return_address, bp->b_ops->name, bp->b_bn);
+ fa, bp->b_ops->name, bp->b_bn);
xfs_alert(mp, "Unmount and run xfs_repair");
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 31a5d321ba9a..1717b7508356 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -15,7 +15,7 @@ extern void xfs_corruption_error(const char *tag, int level,
struct xfs_mount *mp, const void *buf, size_t bufsize,
const char *filename, int linenum,
xfs_failaddr_t failaddr);
-void xfs_buf_corruption_error(struct xfs_buf *bp);
+void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa);
extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error,
const char *name, const void *buf, size_t bufsz,
xfs_failaddr_t failaddr);
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index f1372f9046e3..5a4b0119143a 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -15,7 +15,6 @@
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_icache.h"
-#include "xfs_log.h"
#include "xfs_pnfs.h"
/*
@@ -221,18 +220,7 @@ STATIC int
xfs_fs_nfs_commit_metadata(
struct inode *inode)
{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_lsn_t lsn = 0;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (!lsn)
- return 0;
- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_inode(XFS_I(inode));
}
const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b8a4a3f29b36..4b8bdecc3863 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -80,19 +80,9 @@ xfs_dir_fsync(
int datasync)
{
struct xfs_inode *ip = XFS_I(file->f_mapping->host);
- struct xfs_mount *mp = ip->i_mount;
- xfs_lsn_t lsn = 0;
trace_xfs_dir_fsync(ip);
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (!lsn)
- return 0;
- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_inode(ip);
}
STATIC int
@@ -1069,7 +1059,11 @@ xfs_file_remap_range(
ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
remap_flags);
+ if (ret)
+ goto out_unlock;
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_log_force_inode(dest);
out_unlock:
xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 918456ca29e1..4eebcec4aae6 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -344,7 +344,7 @@ xfs_getfsmap_datadev_helper(
xfs_fsblock_t fsb;
xfs_daddr_t rec_daddr;
- fsb = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, rec->rm_startblock);
+ fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.agno, rec->rm_startblock);
rec_daddr = XFS_FSB_TO_DADDR(mp, fsb);
return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
@@ -362,7 +362,7 @@ xfs_getfsmap_datadev_bnobt_helper(
struct xfs_rmap_irec irec;
xfs_daddr_t rec_daddr;
- rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno,
+ rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.agno,
rec->ar_startblock);
irec.rm_startblock = rec->ar_startblock;
@@ -896,6 +896,14 @@ xfs_getfsmap(
info.format_arg = arg;
info.head = head;
+ /*
+ * If fsmap runs concurrently with a scrub, the freeze can be delayed
+ * indefinitely as we walk the rmapbt and iterate over metadata
+ * buffers. Freeze quiesces the log (which waits for the buffer LRU to
+ * be emptied) and that won't happen while we're reading buffers.
+ */
+ sb_start_write(mp->m_super);
+
/* For each device we support... */
for (i = 0; i < XFS_GETFSMAP_DEVS; i++) {
/* Is this device within the range the user asked for? */
@@ -935,6 +943,7 @@ xfs_getfsmap(
if (tp)
xfs_trans_cancel(tp);
+ sb_end_write(mp->m_super);
head->fmh_oflags = FMH_OF_DEV_T;
return error;
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 8dc2e5414276..a7be7a9e5c1a 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -289,6 +289,8 @@ xfs_reinit_inode(
uint64_t version = inode_peek_iversion(inode);
umode_t mode = inode->i_mode;
dev_t dev = inode->i_rdev;
+ kuid_t uid = inode->i_uid;
+ kgid_t gid = inode->i_gid;
error = inode_init_always(mp->m_super, inode);
@@ -297,6 +299,8 @@ xfs_reinit_inode(
inode_set_iversion_queried(inode, version);
inode->i_mode = mode;
inode->i_rdev = dev;
+ inode->i_uid = uid;
+ inode->i_gid = gid;
return error;
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c5077e6326c7..d1772786af29 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -801,26 +801,18 @@ xfs_ialloc(
return error;
ASSERT(ip != NULL);
inode = VFS_I(ip);
-
- /*
- * We always convert v1 inodes to v2 now - we only support filesystems
- * with >= v2 inode capability, so there is no reason for ever leaving
- * an inode in v1 format.
- */
- if (ip->i_d.di_version == 1)
- ip->i_d.di_version = 2;
-
inode->i_mode = mode;
set_nlink(inode, nlink);
- ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
- ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
+ inode->i_uid = current_fsuid();
inode->i_rdev = rdev;
ip->i_d.di_projid = prid;
if (pip && XFS_INHERIT_GID(pip)) {
- ip->i_d.di_gid = pip->i_d.di_gid;
+ inode->i_gid = VFS_I(pip)->i_gid;
if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
inode->i_mode |= S_ISGID;
+ } else {
+ inode->i_gid = current_fsgid();
}
/*
@@ -828,9 +820,8 @@ xfs_ialloc(
* ID or one of the supplementary group IDs, the S_ISGID bit is cleared
* (and only if the irix_sgid_inherit compatibility variable is set).
*/
- if ((irix_sgid_inherit) &&
- (inode->i_mode & S_ISGID) &&
- (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
+ if (irix_sgid_inherit &&
+ (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid))
inode->i_mode &= ~S_ISGID;
ip->i_d.di_size = 0;
@@ -847,14 +838,13 @@ xfs_ialloc(
ip->i_d.di_dmstate = 0;
ip->i_d.di_flags = 0;
- if (ip->i_d.di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
inode_set_iversion(inode, 1);
ip->i_d.di_flags2 = 0;
ip->i_d.di_cowextsize = 0;
ip->i_d.di_crtime = tv;
}
-
flags = XFS_ILOG_CORE;
switch (mode & S_IFMT) {
case S_IFIFO:
@@ -907,20 +897,13 @@ xfs_ialloc(
ip->i_d.di_flags |= di_flags;
}
- if (pip &&
- (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
- pip->i_d.di_version == 3 &&
- ip->i_d.di_version == 3) {
- uint64_t di_flags2 = 0;
-
+ if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) {
if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
- di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
}
if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
- di_flags2 |= XFS_DIFLAG2_DAX;
-
- ip->i_d.di_flags2 |= di_flags2;
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX;
}
/* FALLTHROUGH */
case S_IFLNK:
@@ -1122,7 +1105,6 @@ xfs_bumplink(
{
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
- ASSERT(ip->i_d.di_version > 1);
inc_nlink(VFS_I(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -1158,8 +1140,7 @@ xfs_create(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
- xfs_kgid_to_gid(current_fsgid()), prid,
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -1219,8 +1200,7 @@ xfs_create(
unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
- resblks ?
- resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+ resblks - XFS_IALLOC_SPACE_RES(mp));
if (error) {
ASSERT(error != -ENOSPC);
goto out_trans_cancel;
@@ -1309,8 +1289,7 @@ xfs_create_tmpfile(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
- xfs_kgid_to_gid(current_fsgid()), prid,
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -2119,7 +2098,7 @@ xfs_iunlink_update_bucket(
unsigned int bucket_index,
xfs_agino_t new_agino)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
+ struct xfs_agi *agi = agibp->b_addr;
xfs_agino_t old_value;
int offset;
@@ -2135,7 +2114,7 @@ xfs_iunlink_update_bucket(
* head of the list.
*/
if (old_value == new_agino) {
- xfs_buf_corruption_error(agibp);
+ xfs_buf_mark_corrupt(agibp);
return -EFSCORRUPTED;
}
@@ -2259,7 +2238,7 @@ xfs_iunlink(
error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
- agi = XFS_BUF_TO_AGI(agibp);
+ agi = agibp->b_addr;
/*
* Get the index into the agi hash table for the list this inode will
@@ -2269,7 +2248,7 @@ xfs_iunlink(
next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
if (next_agino == agino ||
!xfs_verify_agino_or_null(mp, agno, next_agino)) {
- xfs_buf_corruption_error(agibp);
+ xfs_buf_mark_corrupt(agibp);
return -EFSCORRUPTED;
}
@@ -2443,7 +2422,7 @@ xfs_iunlink_remove(
error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
- agi = XFS_BUF_TO_AGI(agibp);
+ agi = agibp->b_addr;
/*
* Get the index into the agi hash table for the list this inode will
@@ -2524,6 +2503,88 @@ out:
}
/*
+ * Look up the inode number specified and mark it stale if it is found. If it is
+ * dirty, return the inode so it can be attached to the cluster buffer so it can
+ * be processed appropriately when the cluster free transaction completes.
+ */
+static struct xfs_inode *
+xfs_ifree_get_one_inode(
+ struct xfs_perag *pag,
+ struct xfs_inode *free_ip,
+ xfs_ino_t inum)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_inode *ip;
+
+retry:
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
+
+ /* Inode not in memory, nothing to do */
+ if (!ip)
+ goto out_rcu_unlock;
+
+ /*
+ * because this is an RCU protected lookup, we could find a recently
+ * freed or even reallocated inode during the lookup. We need to check
+ * under the i_flags_lock for a valid inode here. Skip it if it is not
+ * valid, the wrong inode or stale.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
+ spin_unlock(&ip->i_flags_lock);
+ goto out_rcu_unlock;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Don't try to lock/unlock the current inode, but we _cannot_ skip the
+ * other inodes that we did not find in the list attached to the buffer
+ * and are not already marked stale. If we can't lock it, back off and
+ * retry.
+ */
+ if (ip != free_ip) {
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ rcu_read_unlock();
+ delay(1);
+ goto retry;
+ }
+
+ /*
+ * Check the inode number again in case we're racing with
+ * freeing in xfs_reclaim_inode(). See the comments in that
+ * function for more information as to why the initial check is
+ * not sufficient.
+ */
+ if (ip->i_ino != inum) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto out_rcu_unlock;
+ }
+ }
+ rcu_read_unlock();
+
+ xfs_iflock(ip);
+ xfs_iflags_set(ip, XFS_ISTALE);
+
+ /*
+ * We don't need to attach clean inodes or those only with unlogged
+ * changes (which we throw away, anyway).
+ */
+ if (!ip->i_itemp || xfs_inode_clean(ip)) {
+ ASSERT(ip != free_ip);
+ xfs_ifunlock(ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto out_no_inode;
+ }
+ return ip;
+
+out_rcu_unlock:
+ rcu_read_unlock();
+out_no_inode:
+ return NULL;
+}
+
+/*
* A big issue when freeing the inode cluster is that we _cannot_ skip any
* inodes that are in memory - they all must be marked stale and attached to
* the cluster buffer.
@@ -2623,77 +2684,11 @@ xfs_ifree_cluster(
* even trying to lock them.
*/
for (i = 0; i < igeo->inodes_per_cluster; i++) {
-retry:
- rcu_read_lock();
- ip = radix_tree_lookup(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, (inum + i)));
-
- /* Inode not in memory, nothing to do */
- if (!ip) {
- rcu_read_unlock();
+ ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
+ if (!ip)
continue;
- }
-
- /*
- * because this is an RCU protected lookup, we could
- * find a recently freed or even reallocated inode
- * during the lookup. We need to check under the
- * i_flags_lock for a valid inode here. Skip it if it
- * is not valid, the wrong inode or stale.
- */
- spin_lock(&ip->i_flags_lock);
- if (ip->i_ino != inum + i ||
- __xfs_iflags_test(ip, XFS_ISTALE)) {
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
- continue;
- }
- spin_unlock(&ip->i_flags_lock);
-
- /*
- * Don't try to lock/unlock the current inode, but we
- * _cannot_ skip the other inodes that we did not find
- * in the list attached to the buffer and are not
- * already marked stale. If we can't lock it, back off
- * and retry.
- */
- if (ip != free_ip) {
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
- rcu_read_unlock();
- delay(1);
- goto retry;
- }
-
- /*
- * Check the inode number again in case we're
- * racing with freeing in xfs_reclaim_inode().
- * See the comments in that function for more
- * information as to why the initial check is
- * not sufficient.
- */
- if (ip->i_ino != inum + i) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- rcu_read_unlock();
- continue;
- }
- }
- rcu_read_unlock();
-
- xfs_iflock(ip);
- xfs_iflags_set(ip, XFS_ISTALE);
- /*
- * we don't need to attach clean inodes or those only
- * with unlogged changes (which we throw away, anyway).
- */
iip = ip->i_itemp;
- if (!iip || xfs_inode_clean(ip)) {
- ASSERT(ip != free_ip);
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- continue;
- }
-
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
@@ -3807,7 +3802,6 @@ xfs_iflush_int(
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
ASSERT(iip != NULL && iip->ili_fields != 0);
- ASSERT(ip->i_d.di_version > 1);
/* set *dip = inode's place in the buffer */
dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -3868,7 +3862,7 @@ xfs_iflush_int(
* backwards compatibility with old kernels that predate logging all
* inode changes.
*/
- if (ip->i_d.di_version < 3)
+ if (!xfs_sb_version_has_v3inode(&mp->m_sb))
ip->i_d.di_flushiter++;
/* Check the inline fork data before we write out. */
@@ -3951,3 +3945,22 @@ xfs_irele(
trace_xfs_irele(ip, _RET_IP_);
iput(VFS_I(ip));
}
+
+/*
+ * Ensure all commited transactions touching the inode are written to the log.
+ */
+int
+xfs_log_force_inode(
+ struct xfs_inode *ip)
+{
+ xfs_lsn_t lsn = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ lsn = ip->i_itemp->ili_last_lsn;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!lsn)
+ return 0;
+ return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 492e53992fa9..c6a63f6764a6 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -426,6 +426,7 @@ int xfs_itruncate_extents_flags(struct xfs_trans **,
struct xfs_inode *, int, xfs_fsize_t, int);
void xfs_iext_realloc(xfs_inode_t *, int, int);
+int xfs_log_force_inode(struct xfs_inode *ip);
void xfs_iunpin_wait(xfs_inode_t *);
#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 8bd5d0de6321..f779cca2346f 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -125,7 +125,7 @@ xfs_inode_item_size(
*nvecs += 2;
*nbytes += sizeof(struct xfs_inode_log_format) +
- xfs_log_dinode_size(ip->i_d.di_version);
+ xfs_log_dinode_size(ip->i_mount);
xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
if (XFS_IFORK_Q(ip))
@@ -305,11 +305,9 @@ xfs_inode_to_log_dinode(
struct inode *inode = VFS_I(ip);
to->di_magic = XFS_DINODE_MAGIC;
-
- to->di_version = from->di_version;
to->di_format = from->di_format;
- to->di_uid = from->di_uid;
- to->di_gid = from->di_gid;
+ to->di_uid = i_uid_read(inode);
+ to->di_gid = i_gid_read(inode);
to->di_projid_lo = from->di_projid & 0xffff;
to->di_projid_hi = from->di_projid >> 16;
@@ -339,7 +337,8 @@ xfs_inode_to_log_dinode(
/* log a dummy value to ensure log structure is fully initialised */
to->di_next_unlinked = NULLAGINO;
- if (from->di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
+ to->di_version = 3;
to->di_changecount = inode_peek_iversion(inode);
to->di_crtime.t_sec = from->di_crtime.tv_sec;
to->di_crtime.t_nsec = from->di_crtime.tv_nsec;
@@ -351,6 +350,7 @@ xfs_inode_to_log_dinode(
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
to->di_flushiter = 0;
} else {
+ to->di_version = 2;
to->di_flushiter = from->di_flushiter;
}
}
@@ -370,7 +370,7 @@ xfs_inode_item_format_core(
dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE);
xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn);
- xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version));
+ xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_mount));
}
/*
@@ -395,8 +395,6 @@ xfs_inode_item_format(
struct xfs_log_iovec *vecp = NULL;
struct xfs_inode_log_format *ilf;
- ASSERT(ip->i_d.di_version > 1);
-
ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
ilf->ilf_type = XFS_LI_INODE;
ilf->ilf_ino = ip->i_ino;
@@ -554,7 +552,8 @@ xfs_inode_item_push(
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
- }
+ } else if (error == -EAGAIN)
+ rval = XFS_ITEM_LOCKED;
spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
@@ -732,29 +731,27 @@ xfs_iflush_done(
* holding the lock before removing the inode from the AIL.
*/
if (need_ail) {
- bool mlip_changed = false;
+ xfs_lsn_t tail_lsn = 0;
/* this is an opencoded batch version of xfs_trans_ail_delete */
spin_lock(&ailp->ail_lock);
list_for_each_entry(blip, &tmp, li_bio_list) {
if (INODE_ITEM(blip)->ili_logged &&
- blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
- mlip_changed |= xfs_ail_delete_one(ailp, blip);
- else {
+ blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) {
+ /*
+ * xfs_ail_update_finish() only cares about the
+ * lsn of the first tail item removed, any
+ * others will be at the same or higher lsn so
+ * we just ignore them.
+ */
+ xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip);
+ if (!tail_lsn && lsn)
+ tail_lsn = lsn;
+ } else {
xfs_clear_li_failed(blip);
}
}
-
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
- xlog_assign_tail_lsn_locked(ailp->ail_mount);
- if (list_empty(&ailp->ail_head))
- wake_up_all(&ailp->ail_empty);
- }
- spin_unlock(&ailp->ail_lock);
-
- if (mlip_changed)
- xfs_log_space_wake(ailp->ail_mount);
+ xfs_ail_update_finish(ailp, tail_lsn);
}
/*
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d42de92cb283..cdfb3cd9a25b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -35,6 +35,8 @@
#include "xfs_health.h"
#include "xfs_reflink.h"
#include "xfs_ioctl.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include <linux/mount.h>
#include <linux/namei.h>
@@ -292,62 +294,173 @@ xfs_readlink_by_handle(
return error;
}
-STATIC int
-xfs_attrlist_by_handle(
- struct file *parfilp,
- void __user *arg)
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later,
+ * we may be reading them directly out of a user buffer.
+ */
+static void
+xfs_ioc_attr_put_listent(
+ struct xfs_attr_list_context *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ int valuelen)
{
- int error = -ENOMEM;
- attrlist_cursor_kern_t *cursor;
- struct xfs_fsop_attrlist_handlereq __user *p = arg;
- xfs_fsop_attrlist_handlereq_t al_hreq;
- struct dentry *dentry;
- char *kbuf;
+ struct xfs_attrlist *alist = context->buffer;
+ struct xfs_attrlist_ent *aep;
+ int arraytop;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
- return -EFAULT;
- if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XFS_XATTR_LIST_MAX)
+ ASSERT(!context->seen_enough);
+ ASSERT(context->count >= 0);
+ ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+ ASSERT(context->firstu >= sizeof(*alist));
+ ASSERT(context->firstu <= context->bufsize);
+
+ /*
+ * Only list entries in the right namespace.
+ */
+ if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK))
+ return;
+
+ arraytop = sizeof(*alist) +
+ context->count * sizeof(alist->al_offset[0]);
+
+ /* decrement by the actual bytes used by the attr */
+ context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) +
+ namelen + 1, sizeof(uint32_t));
+ if (context->firstu < arraytop) {
+ trace_xfs_attr_list_full(context);
+ alist->al_more = 1;
+ context->seen_enough = 1;
+ return;
+ }
+
+ aep = context->buffer + context->firstu;
+ aep->a_valuelen = valuelen;
+ memcpy(aep->a_name, name, namelen);
+ aep->a_name[namelen] = 0;
+ alist->al_offset[context->count++] = context->firstu;
+ alist->al_count = context->count;
+ trace_xfs_attr_list_add(context);
+}
+
+static unsigned int
+xfs_attr_filter(
+ u32 ioc_flags)
+{
+ if (ioc_flags & XFS_IOC_ATTR_ROOT)
+ return XFS_ATTR_ROOT;
+ if (ioc_flags & XFS_IOC_ATTR_SECURE)
+ return XFS_ATTR_SECURE;
+ return 0;
+}
+
+static unsigned int
+xfs_attr_flags(
+ u32 ioc_flags)
+{
+ if (ioc_flags & XFS_IOC_ATTR_CREATE)
+ return XATTR_CREATE;
+ if (ioc_flags & XFS_IOC_ATTR_REPLACE)
+ return XATTR_REPLACE;
+ return 0;
+}
+
+int
+xfs_ioc_attr_list(
+ struct xfs_inode *dp,
+ void __user *ubuf,
+ int bufsize,
+ int flags,
+ struct xfs_attrlist_cursor __user *ucursor)
+{
+ struct xfs_attr_list_context context = { };
+ struct xfs_attrlist *alist;
+ void *buffer;
+ int error;
+
+ if (bufsize < sizeof(struct xfs_attrlist) ||
+ bufsize > XFS_XATTR_LIST_MAX)
return -EINVAL;
/*
* Reject flags, only allow namespaces.
*/
- if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
+ if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+ if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
return -EINVAL;
- dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ /*
+ * Validate the cursor.
+ */
+ if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor)))
+ return -EFAULT;
+ if (context.cursor.pad1 || context.cursor.pad2)
+ return -EINVAL;
+ if (!context.cursor.initted &&
+ (context.cursor.hashval || context.cursor.blkno ||
+ context.cursor.offset))
+ return -EINVAL;
- kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
- if (!kbuf)
- goto out_dput;
+ buffer = kmem_zalloc_large(bufsize, 0);
+ if (!buffer)
+ return -ENOMEM;
- cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
- error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen,
- al_hreq.flags, cursor);
+ /*
+ * Initialize the output buffer.
+ */
+ context.dp = dp;
+ context.resynch = 1;
+ context.attr_filter = xfs_attr_filter(flags);
+ context.buffer = buffer;
+ context.bufsize = round_down(bufsize, sizeof(uint32_t));
+ context.firstu = context.bufsize;
+ context.put_listent = xfs_ioc_attr_put_listent;
+
+ alist = context.buffer;
+ alist->al_count = 0;
+ alist->al_more = 0;
+ alist->al_offset[0] = context.bufsize;
+
+ error = xfs_attr_list(&context);
if (error)
- goto out_kfree;
+ goto out_free;
- if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) {
+ if (copy_to_user(ubuf, buffer, bufsize) ||
+ copy_to_user(ucursor, &context.cursor, sizeof(context.cursor)))
error = -EFAULT;
- goto out_kfree;
- }
+out_free:
+ kmem_free(buffer);
+ return error;
+}
- if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
- error = -EFAULT;
+STATIC int
+xfs_attrlist_by_handle(
+ struct file *parfilp,
+ struct xfs_fsop_attrlist_handlereq __user *p)
+{
+ struct xfs_fsop_attrlist_handlereq al_hreq;
+ struct dentry *dentry;
+ int error = -ENOMEM;
-out_kfree:
- kmem_free(kbuf);
-out_dput:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&al_hreq, p, sizeof(al_hreq)))
+ return -EFAULT;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer,
+ al_hreq.buflen, al_hreq.flags, &p->pos);
dput(dentry);
return error;
}
-int
+static int
xfs_attrmulti_attr_get(
struct inode *inode,
unsigned char *name,
@@ -355,31 +468,33 @@ xfs_attrmulti_attr_get(
uint32_t *len,
uint32_t flags)
{
- unsigned char *kbuf;
- int error = -EFAULT;
- size_t namelen;
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = xfs_attr_filter(flags),
+ .attr_flags = xfs_attr_flags(flags),
+ .name = name,
+ .namelen = strlen(name),
+ .valuelen = *len,
+ };
+ int error;
if (*len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
- kbuf = kmem_zalloc_large(*len, 0);
- if (!kbuf)
- return -ENOMEM;
- namelen = strlen(name);
- error = xfs_attr_get(XFS_I(inode), name, namelen, &kbuf, (int *)len,
- flags);
+ error = xfs_attr_get(&args);
if (error)
goto out_kfree;
- if (copy_to_user(ubuf, kbuf, *len))
+ *len = args.valuelen;
+ if (copy_to_user(ubuf, args.value, args.valuelen))
error = -EFAULT;
out_kfree:
- kmem_free(kbuf);
+ kmem_free(args.value);
return error;
}
-int
+static int
xfs_attrmulti_attr_set(
struct inode *inode,
unsigned char *name,
@@ -387,42 +502,75 @@ xfs_attrmulti_attr_set(
uint32_t len,
uint32_t flags)
{
- unsigned char *kbuf;
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = xfs_attr_filter(flags),
+ .attr_flags = xfs_attr_flags(flags),
+ .name = name,
+ .namelen = strlen(name),
+ };
int error;
- size_t namelen;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
- if (len > XFS_XATTR_SIZE_MAX)
- return -EINVAL;
- kbuf = memdup_user(ubuf, len);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
+ if (ubuf) {
+ if (len > XFS_XATTR_SIZE_MAX)
+ return -EINVAL;
+ args.value = memdup_user(ubuf, len);
+ if (IS_ERR(args.value))
+ return PTR_ERR(args.value);
+ args.valuelen = len;
+ }
- namelen = strlen(name);
- error = xfs_attr_set(XFS_I(inode), name, namelen, kbuf, len, flags);
- if (!error)
- xfs_forget_acl(inode, name, flags);
- kfree(kbuf);
+ error = xfs_attr_set(&args);
+ if (!error && (flags & XFS_IOC_ATTR_ROOT))
+ xfs_forget_acl(inode, name);
+ kfree(args.value);
return error;
}
int
-xfs_attrmulti_attr_remove(
+xfs_ioc_attrmulti_one(
+ struct file *parfilp,
struct inode *inode,
- unsigned char *name,
+ uint32_t opcode,
+ void __user *uname,
+ void __user *value,
+ uint32_t *len,
uint32_t flags)
{
+ unsigned char *name;
int error;
- size_t namelen;
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return -EPERM;
- namelen = strlen(name);
- error = xfs_attr_remove(XFS_I(inode), name, namelen, flags);
- if (!error)
- xfs_forget_acl(inode, name, flags);
+ if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+
+ name = strndup_user(uname, MAXNAMELEN);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ switch (opcode) {
+ case ATTR_OP_GET:
+ error = xfs_attrmulti_attr_get(inode, name, value, len, flags);
+ break;
+ case ATTR_OP_REMOVE:
+ value = NULL;
+ *len = 0;
+ /* fall through */
+ case ATTR_OP_SET:
+ error = mnt_want_write_file(parfilp);
+ if (error)
+ break;
+ error = xfs_attrmulti_attr_set(inode, name, value, *len, flags);
+ mnt_drop_write_file(parfilp);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ kfree(name);
return error;
}
@@ -436,7 +584,6 @@ xfs_attrmulti_by_handle(
xfs_fsop_attrmulti_handlereq_t am_hreq;
struct dentry *dentry;
unsigned int i, size;
- unsigned char *attr_name;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -462,63 +609,17 @@ xfs_attrmulti_by_handle(
goto out_dput;
}
- error = -ENOMEM;
- attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
- if (!attr_name)
- goto out_kfree_ops;
-
error = 0;
for (i = 0; i < am_hreq.opcount; i++) {
- if ((ops[i].am_flags & ATTR_ROOT) &&
- (ops[i].am_flags & ATTR_SECURE)) {
- ops[i].am_error = -EINVAL;
- continue;
- }
- ops[i].am_flags &= ~ATTR_KERNEL_FLAGS;
-
- ops[i].am_error = strncpy_from_user((char *)attr_name,
- ops[i].am_attrname, MAXNAMELEN);
- if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = -ERANGE;
- if (ops[i].am_error < 0)
- break;
-
- switch (ops[i].am_opcode) {
- case ATTR_OP_GET:
- ops[i].am_error = xfs_attrmulti_attr_get(
- d_inode(dentry), attr_name,
- ops[i].am_attrvalue, &ops[i].am_length,
- ops[i].am_flags);
- break;
- case ATTR_OP_SET:
- ops[i].am_error = mnt_want_write_file(parfilp);
- if (ops[i].am_error)
- break;
- ops[i].am_error = xfs_attrmulti_attr_set(
- d_inode(dentry), attr_name,
- ops[i].am_attrvalue, ops[i].am_length,
- ops[i].am_flags);
- mnt_drop_write_file(parfilp);
- break;
- case ATTR_OP_REMOVE:
- ops[i].am_error = mnt_want_write_file(parfilp);
- if (ops[i].am_error)
- break;
- ops[i].am_error = xfs_attrmulti_attr_remove(
- d_inode(dentry), attr_name,
- ops[i].am_flags);
- mnt_drop_write_file(parfilp);
- break;
- default:
- ops[i].am_error = -EINVAL;
- }
+ ops[i].am_error = xfs_ioc_attrmulti_one(parfilp,
+ d_inode(dentry), ops[i].am_opcode,
+ ops[i].am_attrname, ops[i].am_attrvalue,
+ &ops[i].am_length, ops[i].am_flags);
}
if (copy_to_user(am_hreq.ops, ops, size))
error = -EFAULT;
- kfree(attr_name);
- out_kfree_ops:
kfree(ops);
out_dput:
dput(dentry);
@@ -1162,7 +1263,7 @@ xfs_ioctl_setattr_xflags(
/* diflags2 only valid for v3 inodes. */
di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
- if (di_flags2 && ip->i_d.di_version < 3)
+ if (di_flags2 && !xfs_sb_version_has_v3inode(&mp->m_sb))
return -EINVAL;
ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags);
@@ -1372,8 +1473,7 @@ xfs_ioctl_setattr_check_cowextsize(
if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE))
return 0;
- if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) ||
- ip->i_d.di_version != 3)
+ if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb))
return -EINVAL;
if (fa->fsx_cowextsize == 0)
@@ -1434,9 +1534,9 @@ xfs_ioctl_setattr(
* because the i_*dquot fields will get updated anyway.
*/
if (XFS_IS_QUOTA_ON(mp)) {
- code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
- ip->i_d.di_gid, fa->fsx_projid,
- XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
+ code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid,
+ VFS_I(ip)->i_gid, fa->fsx_projid,
+ XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
if (code)
return code;
}
@@ -1501,7 +1601,6 @@ xfs_ioctl_setattr(
olddquot = xfs_qm_vop_chown(tp, ip,
&ip->i_pdquot, pdqp);
}
- ASSERT(ip->i_d.di_version > 1);
ip->i_d.di_projid = fa->fsx_projid;
}
@@ -1514,7 +1613,7 @@ xfs_ioctl_setattr(
ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
else
ip->i_d.di_extsize = 0;
- if (ip->i_d.di_version == 3 &&
+ if (xfs_sb_version_has_v3inode(&mp->m_sb) &&
(ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
ip->i_d.di_cowextsize = fa->fsx_cowextsize >>
mp->m_sb.sb_blocklog;
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 420bd95dc326..bab6a5a92407 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -6,6 +6,11 @@
#ifndef __XFS_IOCTL_H__
#define __XFS_IOCTL_H__
+struct xfs_bstat;
+struct xfs_ibulk;
+struct xfs_inogrp;
+
+
extern int
xfs_ioc_space(
struct file *filp,
@@ -30,27 +35,11 @@ xfs_readlink_by_handle(
struct file *parfilp,
xfs_fsop_handlereq_t *hreq);
-extern int
-xfs_attrmulti_attr_get(
- struct inode *inode,
- unsigned char *name,
- unsigned char __user *ubuf,
- uint32_t *len,
- uint32_t flags);
-
-extern int
-xfs_attrmulti_attr_set(
- struct inode *inode,
- unsigned char *name,
- const unsigned char __user *ubuf,
- uint32_t len,
- uint32_t flags);
-
-extern int
-xfs_attrmulti_attr_remove(
- struct inode *inode,
- unsigned char *name,
- uint32_t flags);
+int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode,
+ uint32_t opcode, void __user *uname, void __user *value,
+ uint32_t *len, uint32_t flags);
+int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, int bufsize,
+ int flags, struct xfs_attrlist_cursor __user *ucursor);
extern struct dentry *
xfs_handle_to_dentry(
@@ -70,10 +59,6 @@ xfs_file_compat_ioctl(
unsigned int cmd,
unsigned long arg);
-struct xfs_ibulk;
-struct xfs_bstat;
-struct xfs_inogrp;
-
int xfs_fsbulkstat_one_fmt(struct xfs_ibulk *breq,
const struct xfs_bulkstat *bstat);
int xfs_fsinumbers_fmt(struct xfs_ibulk *breq, const struct xfs_inumbers *igrp);
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 769581a79c58..c1771e728117 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -352,56 +352,24 @@ xfs_compat_handlereq_to_dentry(
STATIC int
xfs_compat_attrlist_by_handle(
struct file *parfilp,
- void __user *arg)
+ compat_xfs_fsop_attrlist_handlereq_t __user *p)
{
- int error;
- attrlist_cursor_kern_t *cursor;
- compat_xfs_fsop_attrlist_handlereq_t __user *p = arg;
compat_xfs_fsop_attrlist_handlereq_t al_hreq;
struct dentry *dentry;
- char *kbuf;
+ int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (copy_from_user(&al_hreq, arg,
- sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
+ if (copy_from_user(&al_hreq, p, sizeof(al_hreq)))
return -EFAULT;
- if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XFS_XATTR_LIST_MAX)
- return -EINVAL;
-
- /*
- * Reject flags, only allow namespaces.
- */
- if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
- return -EINVAL;
dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- error = -ENOMEM;
- kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
- if (!kbuf)
- goto out_dput;
-
- cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
- error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen,
- al_hreq.flags, cursor);
- if (error)
- goto out_kfree;
-
- if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) {
- error = -EFAULT;
- goto out_kfree;
- }
-
- if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
- error = -EFAULT;
-
-out_kfree:
- kmem_free(kbuf);
-out_dput:
+ error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)),
+ compat_ptr(al_hreq.buffer), al_hreq.buflen,
+ al_hreq.flags, &p->pos);
dput(dentry);
return error;
}
@@ -416,7 +384,6 @@ xfs_compat_attrmulti_by_handle(
compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
struct dentry *dentry;
unsigned int i, size;
- unsigned char *attr_name;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -443,64 +410,18 @@ xfs_compat_attrmulti_by_handle(
goto out_dput;
}
- error = -ENOMEM;
- attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
- if (!attr_name)
- goto out_kfree_ops;
-
error = 0;
for (i = 0; i < am_hreq.opcount; i++) {
- if ((ops[i].am_flags & ATTR_ROOT) &&
- (ops[i].am_flags & ATTR_SECURE)) {
- ops[i].am_error = -EINVAL;
- continue;
- }
- ops[i].am_flags &= ~ATTR_KERNEL_FLAGS;
-
- ops[i].am_error = strncpy_from_user((char *)attr_name,
+ ops[i].am_error = xfs_ioc_attrmulti_one(parfilp,
+ d_inode(dentry), ops[i].am_opcode,
compat_ptr(ops[i].am_attrname),
- MAXNAMELEN);
- if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = -ERANGE;
- if (ops[i].am_error < 0)
- break;
-
- switch (ops[i].am_opcode) {
- case ATTR_OP_GET:
- ops[i].am_error = xfs_attrmulti_attr_get(
- d_inode(dentry), attr_name,
- compat_ptr(ops[i].am_attrvalue),
- &ops[i].am_length, ops[i].am_flags);
- break;
- case ATTR_OP_SET:
- ops[i].am_error = mnt_want_write_file(parfilp);
- if (ops[i].am_error)
- break;
- ops[i].am_error = xfs_attrmulti_attr_set(
- d_inode(dentry), attr_name,
- compat_ptr(ops[i].am_attrvalue),
- ops[i].am_length, ops[i].am_flags);
- mnt_drop_write_file(parfilp);
- break;
- case ATTR_OP_REMOVE:
- ops[i].am_error = mnt_want_write_file(parfilp);
- if (ops[i].am_error)
- break;
- ops[i].am_error = xfs_attrmulti_attr_remove(
- d_inode(dentry), attr_name,
- ops[i].am_flags);
- mnt_drop_write_file(parfilp);
- break;
- default:
- ops[i].am_error = -EINVAL;
- }
+ compat_ptr(ops[i].am_attrvalue),
+ &ops[i].am_length, ops[i].am_flags);
}
if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
error = -EFAULT;
- kfree(attr_name);
- out_kfree_ops:
kfree(ops);
out_dput:
dput(dentry);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 81f2f93caec0..f7a99b3bbcf7 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -22,7 +22,6 @@
#include "xfs_iomap.h"
#include "xfs_error.h"
-#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/security.h>
#include <linux/iversion.h>
@@ -50,10 +49,15 @@ xfs_initxattrs(
int error = 0;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- error = xfs_attr_set(ip, xattr->name,
- strlen(xattr->name),
- xattr->value, xattr->value_len,
- ATTR_SECURE);
+ struct xfs_da_args args = {
+ .dp = ip,
+ .attr_filter = XFS_ATTR_SECURE,
+ .name = xattr->name,
+ .namelen = strlen(xattr->name),
+ .value = xattr->value,
+ .valuelen = xattr->value_len,
+ };
+ error = xfs_attr_set(&args);
if (error < 0)
break;
}
@@ -553,7 +557,7 @@ xfs_vn_getattr(
stat->blocks =
XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
- if (ip->i_d.di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
stat->btime = ip->i_d.di_crtime;
@@ -692,9 +696,7 @@ xfs_setattr_nonsize(
*/
ASSERT(udqp == NULL);
ASSERT(gdqp == NULL);
- error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid),
- xfs_kgid_to_gid(gid),
- ip->i_d.di_projid,
+ error = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
qflags, &udqp, &gdqp, NULL);
if (error)
return error;
@@ -763,7 +765,6 @@ xfs_setattr_nonsize(
olddquot1 = xfs_qm_vop_chown(tp, ip,
&ip->i_udquot, udqp);
}
- ip->i_d.di_uid = xfs_kuid_to_uid(uid);
inode->i_uid = uid;
}
if (!gid_eq(igid, gid)) {
@@ -775,7 +776,6 @@ xfs_setattr_nonsize(
olddquot2 = xfs_qm_vop_chown(tp, ip,
&ip->i_gdquot, gdqp);
}
- ip->i_d.di_gid = xfs_kgid_to_gid(gid);
inode->i_gid = gid;
}
}
@@ -1304,9 +1304,6 @@ xfs_setup_inode(
/* make the inode look hashed for the writeback code */
inode_fake_hash(inode);
- inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid);
- inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid);
-
i_size_write(inode, ip->i_d.di_size);
xfs_diflags_to_iflags(inode, ip);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 4b31c29b7e6b..ff2da28fed90 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -86,8 +86,8 @@ xfs_bulkstat_one_int(
*/
buf->bs_projectid = ip->i_d.di_projid;
buf->bs_ino = ino;
- buf->bs_uid = dic->di_uid;
- buf->bs_gid = dic->di_gid;
+ buf->bs_uid = i_uid_read(inode);
+ buf->bs_gid = i_gid_read(inode);
buf->bs_size = dic->di_size;
buf->bs_nlink = inode->i_nlink;
@@ -110,7 +110,7 @@ xfs_bulkstat_one_int(
buf->bs_forkoff = XFS_IFORK_BOFF(ip);
buf->bs_version = XFS_BULKSTAT_VERSION_V5;
- if (dic->di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
buf->bs_cowextsize_blks = dic->di_cowextsize;
}
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 8738bb03f253..9f70d2f68e05 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -60,6 +60,7 @@ typedef __u32 xfs_nlink_t;
#include <linux/list_sort.h>
#include <linux/ratelimit.h>
#include <linux/rhashtable.h>
+#include <linux/xattr.h>
#include <asm/page.h>
#include <asm/div64.h>
@@ -163,32 +164,6 @@ struct xstats {
extern struct xstats xfsstats;
-/* Kernel uid/gid conversion. These are used to convert to/from the on disk
- * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
- * The conversion here is type only, the value will remain the same since we
- * are converting to the init_user_ns. The uid is later mapped to a particular
- * user namespace value when crossing the kernel/user boundary.
- */
-static inline uint32_t xfs_kuid_to_uid(kuid_t uid)
-{
- return from_kuid(&init_user_ns, uid);
-}
-
-static inline kuid_t xfs_uid_to_kuid(uint32_t uid)
-{
- return make_kuid(&init_user_ns, uid);
-}
-
-static inline uint32_t xfs_kgid_to_gid(kgid_t gid)
-{
- return from_kgid(&init_user_ns, gid);
-}
-
-static inline kgid_t xfs_gid_to_kgid(uint32_t gid)
-{
- return make_kgid(&init_user_ns, gid);
-}
-
static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev)
{
return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev));
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f6006d94a581..00fda2e8e738 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -24,13 +24,6 @@
kmem_zone_t *xfs_log_ticket_zone;
/* Local miscellaneous function prototypes */
-STATIC int
-xlog_commit_record(
- struct xlog *log,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- xfs_lsn_t *commitlsnp);
-
STATIC struct xlog *
xlog_alloc_log(
struct xfs_mount *mp,
@@ -47,8 +40,7 @@ xlog_dealloc_log(
/* local state machine functions */
STATIC void xlog_state_done_syncing(
- struct xlog_in_core *iclog,
- bool aborted);
+ struct xlog_in_core *iclog);
STATIC int
xlog_state_get_iclog_space(
struct xlog *log,
@@ -63,23 +55,10 @@ xlog_state_switch_iclogs(
struct xlog_in_core *iclog,
int eventual_size);
STATIC void
-xlog_state_want_sync(
- struct xlog *log,
- struct xlog_in_core *iclog);
-
-STATIC void
xlog_grant_push_ail(
struct xlog *log,
int need_bytes);
STATIC void
-xlog_regrant_reserve_log_space(
- struct xlog *log,
- struct xlog_ticket *ticket);
-STATIC void
-xlog_ungrant_log_space(
- struct xlog *log,
- struct xlog_ticket *ticket);
-STATIC void
xlog_sync(
struct xlog *log,
struct xlog_in_core *iclog);
@@ -484,73 +463,6 @@ out_error:
return error;
}
-
-/*
- * NOTES:
- *
- * 1. currblock field gets updated at startup and after in-core logs
- * marked as with WANT_SYNC.
- */
-
-/*
- * This routine is called when a user of a log manager ticket is done with
- * the reservation. If the ticket was ever used, then a commit record for
- * the associated transaction is written out as a log operation header with
- * no data. The flag XLOG_TIC_INITED is set when the first write occurs with
- * a given ticket. If the ticket was one with a permanent reservation, then
- * a few operations are done differently. Permanent reservation tickets by
- * default don't release the reservation. They just commit the current
- * transaction with the belief that the reservation is still needed. A flag
- * must be passed in before permanent reservations are actually released.
- * When these type of tickets are not released, they need to be set into
- * the inited state again. By doing this, a start record will be written
- * out when the next write occurs.
- */
-xfs_lsn_t
-xfs_log_done(
- struct xfs_mount *mp,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- bool regrant)
-{
- struct xlog *log = mp->m_log;
- xfs_lsn_t lsn = 0;
-
- if (XLOG_FORCED_SHUTDOWN(log) ||
- /*
- * If nothing was ever written, don't write out commit record.
- * If we get an error, just continue and give back the log ticket.
- */
- (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
- (xlog_commit_record(log, ticket, iclog, &lsn)))) {
- lsn = (xfs_lsn_t) -1;
- regrant = false;
- }
-
-
- if (!regrant) {
- trace_xfs_log_done_nonperm(log, ticket);
-
- /*
- * Release ticket if not permanent reservation or a specific
- * request has been made to release a permanent reservation.
- */
- xlog_ungrant_log_space(log, ticket);
- } else {
- trace_xfs_log_done_perm(log, ticket);
-
- xlog_regrant_reserve_log_space(log, ticket);
- /* If this ticket was a permanent reservation and we aren't
- * trying to release it, reset the inited flags; so next time
- * we write, a start record will be written out.
- */
- ticket->t_flags |= XLOG_TIC_INITED;
- }
-
- xfs_log_ticket_put(ticket);
- return lsn;
-}
-
static bool
__xlog_state_release_iclog(
struct xlog *log,
@@ -597,26 +509,21 @@ xlog_state_release_iclog(
return 0;
}
-int
+void
xfs_log_release_iclog(
- struct xfs_mount *mp,
struct xlog_in_core *iclog)
{
- struct xlog *log = mp->m_log;
- bool sync;
-
- if (iclog->ic_state == XLOG_STATE_IOERROR) {
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
- return -EIO;
- }
+ struct xlog *log = iclog->ic_log;
+ bool sync = false;
if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) {
- sync = __xlog_state_release_iclog(log, iclog);
+ if (iclog->ic_state != XLOG_STATE_IOERROR)
+ sync = __xlog_state_release_iclog(log, iclog);
spin_unlock(&log->l_icloglock);
- if (sync)
- xlog_sync(log, iclog);
}
- return 0;
+
+ if (sync)
+ xlog_sync(log, iclog);
}
/*
@@ -855,32 +762,69 @@ xfs_log_mount_cancel(
}
/*
- * Final log writes as part of unmount.
- *
- * Mark the filesystem clean as unmount happens. Note that during relocation
- * this routine needs to be executed as part of source-bag while the
- * deallocation must not be done until source-end.
+ * Wait for the iclog to be written disk, or return an error if the log has been
+ * shut down.
*/
+static int
+xlog_wait_on_iclog(
+ struct xlog_in_core *iclog)
+ __releases(iclog->ic_log->l_icloglock)
+{
+ struct xlog *log = iclog->ic_log;
-/* Actually write the unmount record to disk. */
-static void
-xfs_log_write_unmount_record(
- struct xfs_mount *mp)
+ if (!XLOG_FORCED_SHUTDOWN(log) &&
+ iclog->ic_state != XLOG_STATE_ACTIVE &&
+ iclog->ic_state != XLOG_STATE_DIRTY) {
+ XFS_STATS_INC(log->l_mp, xs_log_force_sleep);
+ xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
+ } else {
+ spin_unlock(&log->l_icloglock);
+ }
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return -EIO;
+ return 0;
+}
+
+/*
+ * Write out an unmount record using the ticket provided. We have to account for
+ * the data space used in the unmount ticket as this write is not done from a
+ * transaction context that has already done the accounting for us.
+ */
+static int
+xlog_write_unmount_record(
+ struct xlog *log,
+ struct xlog_ticket *ticket,
+ xfs_lsn_t *lsn,
+ uint flags)
{
- /* the data section must be 32 bit size aligned */
- struct xfs_unmount_log_format magic = {
+ struct xfs_unmount_log_format ulf = {
.magic = XLOG_UNMOUNT_TYPE,
};
struct xfs_log_iovec reg = {
- .i_addr = &magic,
- .i_len = sizeof(magic),
+ .i_addr = &ulf,
+ .i_len = sizeof(ulf),
.i_type = XLOG_REG_TYPE_UNMOUNT,
};
struct xfs_log_vec vec = {
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
- struct xlog *log = mp->m_log;
+
+ /* account for space used by record data */
+ ticket->t_curr_res -= sizeof(ulf);
+ return xlog_write(log, &vec, ticket, lsn, NULL, flags, false);
+}
+
+/*
+ * Mark the filesystem clean by writing an unmount record to the head of the
+ * log.
+ */
+static void
+xlog_unmount_write(
+ struct xlog *log)
+{
+ struct xfs_mount *mp = log->l_mp;
struct xlog_in_core *iclog;
struct xlog_ticket *tic = NULL;
xfs_lsn_t lsn;
@@ -891,23 +835,7 @@ xfs_log_write_unmount_record(
if (error)
goto out_err;
- /*
- * If we think the summary counters are bad, clear the unmount header
- * flag in the unmount record so that the summary counters will be
- * recalculated during log recovery at next mount. Refer to
- * xlog_check_unmount_rec for more details.
- */
- if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
- XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
- xfs_alert(mp, "%s: will fix summary counters at next mount",
- __func__);
- flags &= ~XLOG_UNMOUNT_TRANS;
- }
-
- /* remove inited flag, and account for space used */
- tic->t_flags = 0;
- tic->t_curr_res -= sizeof(magic);
- error = xlog_write(log, &vec, tic, &lsn, NULL, flags);
+ error = xlog_write_unmount_record(log, tic, &lsn, flags);
/*
* At this point, we're umounting anyway, so there's no point in
* transitioning log state to IOERROR. Just continue...
@@ -919,28 +847,32 @@ out_err:
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
atomic_inc(&iclog->ic_refcnt);
- xlog_state_want_sync(log, iclog);
+ if (iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(log, iclog, 0);
+ else
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_state == XLOG_STATE_IOERROR);
error = xlog_state_release_iclog(log, iclog);
- switch (iclog->ic_state) {
- default:
- if (!XLOG_FORCED_SHUTDOWN(log)) {
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- break;
- }
- /* fall through */
- case XLOG_STATE_ACTIVE:
- case XLOG_STATE_DIRTY:
- spin_unlock(&log->l_icloglock);
- break;
- }
+ xlog_wait_on_iclog(iclog);
if (tic) {
trace_xfs_log_umount_write(log, tic);
- xlog_ungrant_log_space(log, tic);
- xfs_log_ticket_put(tic);
+ xfs_log_ticket_ungrant(log, tic);
}
}
+static void
+xfs_log_unmount_verify_iclog(
+ struct xlog *log)
+{
+ struct xlog_in_core *iclog = log->l_iclog;
+
+ do {
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ ASSERT(iclog->ic_offset == 0);
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+}
+
/*
* Unmount record used to have a string "Unmount filesystem--" in the
* data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
@@ -948,16 +880,11 @@ out_err:
* currently architecture converted and "Unmount" is a bit foo.
* As far as I know, there weren't any dependencies on the old behaviour.
*/
-
-static int
-xfs_log_unmount_write(xfs_mount_t *mp)
+static void
+xfs_log_unmount_write(
+ struct xfs_mount *mp)
{
- struct xlog *log = mp->m_log;
- xlog_in_core_t *iclog;
-#ifdef DEBUG
- xlog_in_core_t *first_iclog;
-#endif
- int error;
+ struct xlog *log = mp->m_log;
/*
* Don't write out unmount record on norecovery mounts or ro devices.
@@ -966,57 +893,30 @@ xfs_log_unmount_write(xfs_mount_t *mp)
if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
xfs_readonly_buftarg(log->l_targ)) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
- return 0;
+ return;
}
- error = xfs_log_force(mp, XFS_LOG_SYNC);
- ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
+ xfs_log_force(mp, XFS_LOG_SYNC);
-#ifdef DEBUG
- first_iclog = iclog = log->l_iclog;
- do {
- if (iclog->ic_state != XLOG_STATE_IOERROR) {
- ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
- ASSERT(iclog->ic_offset == 0);
- }
- iclog = iclog->ic_next;
- } while (iclog != first_iclog);
-#endif
- if (! (XLOG_FORCED_SHUTDOWN(log))) {
- xfs_log_write_unmount_record(mp);
- } else {
- /*
- * We're already in forced_shutdown mode, couldn't
- * even attempt to write out the unmount transaction.
- *
- * Go through the motions of sync'ing and releasing
- * the iclog, even though no I/O will actually happen,
- * we need to wait for other log I/Os that may already
- * be in progress. Do this as a separate section of
- * code so we'll know if we ever get stuck here that
- * we're in this odd situation of trying to unmount
- * a file system that went into forced_shutdown as
- * the result of an unmount..
- */
- spin_lock(&log->l_icloglock);
- iclog = log->l_iclog;
- atomic_inc(&iclog->ic_refcnt);
- xlog_state_want_sync(log, iclog);
- error = xlog_state_release_iclog(log, iclog);
- switch (iclog->ic_state) {
- case XLOG_STATE_ACTIVE:
- case XLOG_STATE_DIRTY:
- case XLOG_STATE_IOERROR:
- spin_unlock(&log->l_icloglock);
- break;
- default:
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- break;
- }
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return;
+
+ /*
+ * If we think the summary counters are bad, avoid writing the unmount
+ * record to force log recovery at next mount, after which the summary
+ * counters will be recalculated. Refer to xlog_check_unmount_rec for
+ * more details.
+ */
+ if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
+ XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
+ xfs_alert(mp, "%s: will fix summary counters at next mount",
+ __func__);
+ return;
}
- return error;
-} /* xfs_log_unmount_write */
+ xfs_log_unmount_verify_iclog(log);
+ xlog_unmount_write(log);
+}
/*
* Empty the log for unmount/freeze.
@@ -1279,7 +1179,6 @@ xlog_ioend_work(
struct xlog_in_core *iclog =
container_of(work, struct xlog_in_core, ic_end_io_work);
struct xlog *log = iclog->ic_log;
- bool aborted = false;
int error;
error = blk_status_to_errno(iclog->ic_bio.bi_status);
@@ -1295,17 +1194,9 @@ xlog_ioend_work(
if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
xfs_alert(log->l_mp, "log I/O error %d", error);
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
- /*
- * This flag will be propagated to the trans-committed
- * callback routines to let them know that the log-commit
- * didn't succeed.
- */
- aborted = true;
- } else if (iclog->ic_state == XLOG_STATE_IOERROR) {
- aborted = true;
}
- xlog_state_done_syncing(iclog, aborted);
+ xlog_state_done_syncing(iclog);
bio_uninit(&iclog->ic_bio);
/*
@@ -1551,20 +1442,17 @@ out:
return ERR_PTR(error);
} /* xlog_alloc_log */
-
/*
* Write out the commit record of a transaction associated with the given
- * ticket. Return the lsn of the commit record.
+ * ticket to close off a running log write. Return the lsn of the commit record.
*/
-STATIC int
+int
xlog_commit_record(
struct xlog *log,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
- xfs_lsn_t *commitlsnp)
+ xfs_lsn_t *lsn)
{
- struct xfs_mount *mp = log->l_mp;
- int error;
struct xfs_log_iovec reg = {
.i_addr = NULL,
.i_len = 0,
@@ -1574,12 +1462,15 @@ xlog_commit_record(
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
+ int error;
- ASSERT_ALWAYS(iclog);
- error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
- XLOG_COMMIT_TRANS);
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return -EIO;
+
+ error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS,
+ false);
if (error)
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
return error;
}
@@ -1739,7 +1630,7 @@ xlog_bio_end_io(
&iclog->ic_end_io_work);
}
-static void
+static int
xlog_map_iclog_data(
struct bio *bio,
void *data,
@@ -1750,11 +1641,14 @@ xlog_map_iclog_data(
unsigned int off = offset_in_page(data);
size_t len = min_t(size_t, count, PAGE_SIZE - off);
- WARN_ON_ONCE(bio_add_page(bio, page, len, off) != len);
+ if (bio_add_page(bio, page, len, off) != len)
+ return -EIO;
data += len;
count -= len;
} while (count);
+
+ return 0;
}
STATIC void
@@ -1784,7 +1678,7 @@ xlog_write_iclog(
* the buffer manually, the code needs to be kept in sync
* with the I/O completion path.
*/
- xlog_state_done_syncing(iclog, true);
+ xlog_state_done_syncing(iclog);
up(&iclog->ic_sema);
return;
}
@@ -1794,11 +1688,22 @@ xlog_write_iclog(
iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
iclog->ic_bio.bi_end_io = xlog_bio_end_io;
iclog->ic_bio.bi_private = iclog;
- iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA;
+
+ /*
+ * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more
+ * IOs coming immediately after this one. This prevents the block layer
+ * writeback throttle from throttling log writes behind background
+ * metadata writeback and causing priority inversions.
+ */
+ iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC |
+ REQ_IDLE | REQ_FUA;
if (need_flush)
iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
- xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count);
+ if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ return;
+ }
if (is_vmalloc_addr(iclog->ic_data))
flush_kernel_vmap_range(iclog->ic_data, count);
@@ -2011,7 +1916,7 @@ xlog_dealloc_log(
log->l_mp->m_log = NULL;
destroy_workqueue(log->l_ioend_workqueue);
kmem_free(log);
-} /* xlog_dealloc_log */
+}
/*
* Update counters atomically now that memcpy is done.
@@ -2148,23 +2053,21 @@ xlog_print_trans(
}
/*
- * Calculate the potential space needed by the log vector. Each region gets
- * its own xlog_op_header_t and may need to be double word aligned.
+ * Calculate the potential space needed by the log vector. We may need a start
+ * record, and each region gets its own struct xlog_op_header and may need to be
+ * double word aligned.
*/
static int
xlog_write_calc_vec_length(
struct xlog_ticket *ticket,
- struct xfs_log_vec *log_vector)
+ struct xfs_log_vec *log_vector,
+ bool need_start_rec)
{
struct xfs_log_vec *lv;
- int headers = 0;
+ int headers = need_start_rec ? 1 : 0;
int len = 0;
int i;
- /* acct for start rec of xact */
- if (ticket->t_flags & XLOG_TIC_INITED)
- headers++;
-
for (lv = log_vector; lv; lv = lv->lv_next) {
/* we don't write ordered log vectors */
if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
@@ -2186,27 +2089,16 @@ xlog_write_calc_vec_length(
return len;
}
-/*
- * If first write for transaction, insert start record We can't be trying to
- * commit if we are inited. We can't have any "partial_copy" if we are inited.
- */
-static int
+static void
xlog_write_start_rec(
struct xlog_op_header *ophdr,
struct xlog_ticket *ticket)
{
- if (!(ticket->t_flags & XLOG_TIC_INITED))
- return 0;
-
ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
ophdr->oh_clientid = ticket->t_clientid;
ophdr->oh_len = 0;
ophdr->oh_flags = XLOG_START_TRANS;
ophdr->oh_res2 = 0;
-
- ticket->t_flags &= ~XLOG_TIC_INITED;
-
- return sizeof(struct xlog_op_header);
}
static xlog_op_header_t *
@@ -2328,7 +2220,11 @@ xlog_write_copy_finish(
*record_cnt = 0;
*data_cnt = 0;
- xlog_state_want_sync(log, iclog);
+ if (iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(log, iclog, 0);
+ else
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_state == XLOG_STATE_IOERROR);
if (!commit_iclog)
goto release_iclog;
spin_unlock(&log->l_icloglock);
@@ -2391,13 +2287,14 @@ xlog_write(
struct xlog_ticket *ticket,
xfs_lsn_t *start_lsn,
struct xlog_in_core **commit_iclog,
- uint flags)
+ uint flags,
+ bool need_start_rec)
{
struct xlog_in_core *iclog = NULL;
- struct xfs_log_iovec *vecp;
- struct xfs_log_vec *lv;
+ struct xfs_log_vec *lv = log_vector;
+ struct xfs_log_iovec *vecp = lv->lv_iovecp;
+ int index = 0;
int len;
- int index;
int partial_copy = 0;
int partial_copy_len = 0;
int contwr = 0;
@@ -2405,25 +2302,13 @@ xlog_write(
int data_cnt = 0;
int error = 0;
- *start_lsn = 0;
-
- len = xlog_write_calc_vec_length(ticket, log_vector);
-
/*
- * Region headers and bytes are already accounted for.
- * We only need to take into account start records and
- * split regions in this function.
+ * If this is a commit or unmount transaction, we don't need a start
+ * record to be written. We do, however, have to account for the
+ * commit or unmount header that gets written. Hence we always have
+ * to account for an extra xlog_op_header here.
*/
- if (ticket->t_flags & XLOG_TIC_INITED)
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
-
- /*
- * Commit record headers need to be accounted for. These
- * come in as separate writes so are easy to detect.
- */
- if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
-
+ ticket->t_curr_res -= sizeof(struct xlog_op_header);
if (ticket->t_curr_res < 0) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"ctx ticket reservation ran out. Need to up reservation");
@@ -2431,9 +2316,8 @@ xlog_write(
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
}
- index = 0;
- lv = log_vector;
- vecp = lv->lv_iovecp;
+ len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec);
+ *start_lsn = 0;
while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
void *ptr;
int log_offset;
@@ -2457,7 +2341,6 @@ xlog_write(
while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
struct xfs_log_iovec *reg;
struct xlog_op_header *ophdr;
- int start_rec_copy;
int copy_len;
int copy_off;
bool ordered = false;
@@ -2473,11 +2356,15 @@ xlog_write(
ASSERT(reg->i_len % sizeof(int32_t) == 0);
ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
- start_rec_copy = xlog_write_start_rec(ptr, ticket);
- if (start_rec_copy) {
- record_cnt++;
+ /*
+ * Before we start formatting log vectors, we need to
+ * write a start record. Only do this for the first
+ * iclog we write to.
+ */
+ if (need_start_rec) {
+ xlog_write_start_rec(ptr, ticket);
xlog_write_adv_cnt(&ptr, &len, &log_offset,
- start_rec_copy);
+ sizeof(struct xlog_op_header));
}
ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
@@ -2509,8 +2396,13 @@ xlog_write(
xlog_write_adv_cnt(&ptr, &len, &log_offset,
copy_len);
}
- copy_len += start_rec_copy + sizeof(xlog_op_header_t);
+ copy_len += sizeof(struct xlog_op_header);
record_cnt++;
+ if (need_start_rec) {
+ copy_len += sizeof(struct xlog_op_header);
+ record_cnt++;
+ need_start_rec = false;
+ }
data_cnt += contwr ? copy_len : 0;
error = xlog_write_copy_finish(log, iclog, flags,
@@ -2567,119 +2459,106 @@ next_lv:
return error;
}
+static void
+xlog_state_activate_iclog(
+ struct xlog_in_core *iclog,
+ int *iclogs_changed)
+{
+ ASSERT(list_empty_careful(&iclog->ic_callbacks));
-/*****************************************************************************
- *
- * State Machine functions
- *
- *****************************************************************************
- */
+ /*
+ * If the number of ops in this iclog indicate it just contains the
+ * dummy transaction, we can change state into IDLE (the second time
+ * around). Otherwise we should change the state into NEED a dummy.
+ * We don't need to cover the dummy.
+ */
+ if (*iclogs_changed == 0 &&
+ iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) {
+ *iclogs_changed = 1;
+ } else {
+ /*
+ * We have two dirty iclogs so start over. This could also be
+ * num of ops indicating this is not the dummy going out.
+ */
+ *iclogs_changed = 2;
+ }
+
+ iclog->ic_state = XLOG_STATE_ACTIVE;
+ iclog->ic_offset = 0;
+ iclog->ic_header.h_num_logops = 0;
+ memset(iclog->ic_header.h_cycle_data, 0,
+ sizeof(iclog->ic_header.h_cycle_data));
+ iclog->ic_header.h_lsn = 0;
+}
/*
- * An iclog has just finished IO completion processing, so we need to update
- * the iclog state and propagate that up into the overall log state. Hence we
- * prepare the iclog for cleaning, and then clean all the pending dirty iclogs
- * starting from the head, and then wake up any threads that are waiting for the
- * iclog to be marked clean.
- *
- * The ordering of marking iclogs ACTIVE must be maintained, so an iclog
- * doesn't become ACTIVE beyond one that is SYNCING. This is also required to
- * maintain the notion that we use a ordered wait queue to hold off would be
- * writers to the log when every iclog is trying to sync to disk.
- *
- * Caller must hold the icloglock before calling us.
- *
- * State Change: !IOERROR -> DIRTY -> ACTIVE
+ * Loop through all iclogs and mark all iclogs currently marked DIRTY as
+ * ACTIVE after iclog I/O has completed.
*/
-STATIC void
-xlog_state_clean_iclog(
+static void
+xlog_state_activate_iclogs(
struct xlog *log,
- struct xlog_in_core *dirty_iclog)
+ int *iclogs_changed)
{
- struct xlog_in_core *iclog;
- int changed = 0;
-
- /* Prepare the completed iclog. */
- if (dirty_iclog->ic_state != XLOG_STATE_IOERROR)
- dirty_iclog->ic_state = XLOG_STATE_DIRTY;
+ struct xlog_in_core *iclog = log->l_iclog;
- /* Walk all the iclogs to update the ordered active state. */
- iclog = log->l_iclog;
do {
- if (iclog->ic_state == XLOG_STATE_DIRTY) {
- iclog->ic_state = XLOG_STATE_ACTIVE;
- iclog->ic_offset = 0;
- ASSERT(list_empty_careful(&iclog->ic_callbacks));
- /*
- * If the number of ops in this iclog indicate it just
- * contains the dummy transaction, we can
- * change state into IDLE (the second time around).
- * Otherwise we should change the state into
- * NEED a dummy.
- * We don't need to cover the dummy.
- */
- if (!changed &&
- (be32_to_cpu(iclog->ic_header.h_num_logops) ==
- XLOG_COVER_OPS)) {
- changed = 1;
- } else {
- /*
- * We have two dirty iclogs so start over
- * This could also be num of ops indicates
- * this is not the dummy going out.
- */
- changed = 2;
- }
- iclog->ic_header.h_num_logops = 0;
- memset(iclog->ic_header.h_cycle_data, 0,
- sizeof(iclog->ic_header.h_cycle_data));
- iclog->ic_header.h_lsn = 0;
- } else if (iclog->ic_state == XLOG_STATE_ACTIVE)
- /* do nothing */;
- else
- break; /* stop cleaning */
- iclog = iclog->ic_next;
- } while (iclog != log->l_iclog);
-
+ if (iclog->ic_state == XLOG_STATE_DIRTY)
+ xlog_state_activate_iclog(iclog, iclogs_changed);
+ /*
+ * The ordering of marking iclogs ACTIVE must be maintained, so
+ * an iclog doesn't become ACTIVE beyond one that is SYNCING.
+ */
+ else if (iclog->ic_state != XLOG_STATE_ACTIVE)
+ break;
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+}
+static int
+xlog_covered_state(
+ int prev_state,
+ int iclogs_changed)
+{
/*
- * Wake up threads waiting in xfs_log_force() for the dirty iclog
- * to be cleaned.
+ * We usually go to NEED. But we go to NEED2 if the changed indicates we
+ * are done writing the dummy record. If we are done with the second
+ * dummy recored (DONE2), then we go to IDLE.
*/
- wake_up_all(&dirty_iclog->ic_force_wait);
+ switch (prev_state) {
+ case XLOG_STATE_COVER_IDLE:
+ case XLOG_STATE_COVER_NEED:
+ case XLOG_STATE_COVER_NEED2:
+ break;
+ case XLOG_STATE_COVER_DONE:
+ if (iclogs_changed == 1)
+ return XLOG_STATE_COVER_NEED2;
+ break;
+ case XLOG_STATE_COVER_DONE2:
+ if (iclogs_changed == 1)
+ return XLOG_STATE_COVER_IDLE;
+ break;
+ default:
+ ASSERT(0);
+ }
- /*
- * Change state for the dummy log recording.
- * We usually go to NEED. But we go to NEED2 if the changed indicates
- * we are done writing the dummy record.
- * If we are done with the second dummy recored (DONE2), then
- * we go to IDLE.
- */
- if (changed) {
- switch (log->l_covered_state) {
- case XLOG_STATE_COVER_IDLE:
- case XLOG_STATE_COVER_NEED:
- case XLOG_STATE_COVER_NEED2:
- log->l_covered_state = XLOG_STATE_COVER_NEED;
- break;
+ return XLOG_STATE_COVER_NEED;
+}
- case XLOG_STATE_COVER_DONE:
- if (changed == 1)
- log->l_covered_state = XLOG_STATE_COVER_NEED2;
- else
- log->l_covered_state = XLOG_STATE_COVER_NEED;
- break;
+STATIC void
+xlog_state_clean_iclog(
+ struct xlog *log,
+ struct xlog_in_core *dirty_iclog)
+{
+ int iclogs_changed = 0;
- case XLOG_STATE_COVER_DONE2:
- if (changed == 1)
- log->l_covered_state = XLOG_STATE_COVER_IDLE;
- else
- log->l_covered_state = XLOG_STATE_COVER_NEED;
- break;
+ dirty_iclog->ic_state = XLOG_STATE_DIRTY;
- default:
- ASSERT(0);
- }
+ xlog_state_activate_iclogs(log, &iclogs_changed);
+ wake_up_all(&dirty_iclog->ic_force_wait);
+
+ if (iclogs_changed) {
+ log->l_covered_state = xlog_covered_state(log->l_covered_state,
+ iclogs_changed);
}
}
@@ -2808,8 +2687,7 @@ xlog_state_iodone_process_iclog(
static void
xlog_state_do_iclog_callbacks(
struct xlog *log,
- struct xlog_in_core *iclog,
- bool aborted)
+ struct xlog_in_core *iclog)
__releases(&log->l_icloglock)
__acquires(&log->l_icloglock)
{
@@ -2821,7 +2699,7 @@ xlog_state_do_iclog_callbacks(
list_splice_init(&iclog->ic_callbacks, &tmp);
spin_unlock(&iclog->ic_callback_lock);
- xlog_cil_process_committed(&tmp, aborted);
+ xlog_cil_process_committed(&tmp);
spin_lock(&iclog->ic_callback_lock);
}
@@ -2836,8 +2714,7 @@ xlog_state_do_iclog_callbacks(
STATIC void
xlog_state_do_callback(
- struct xlog *log,
- bool aborted)
+ struct xlog *log)
{
struct xlog_in_core *iclog;
struct xlog_in_core *first_iclog;
@@ -2878,9 +2755,11 @@ xlog_state_do_callback(
* we'll have to run at least one more complete loop.
*/
cycled_icloglock = true;
- xlog_state_do_iclog_callbacks(log, iclog, aborted);
-
- xlog_state_clean_iclog(log, iclog);
+ xlog_state_do_iclog_callbacks(log, iclog);
+ if (XLOG_FORCED_SHUTDOWN(log))
+ wake_up_all(&iclog->ic_force_wait);
+ else
+ xlog_state_clean_iclog(log, iclog);
iclog = iclog->ic_next;
} while (first_iclog != iclog);
@@ -2916,25 +2795,22 @@ xlog_state_do_callback(
*/
STATIC void
xlog_state_done_syncing(
- struct xlog_in_core *iclog,
- bool aborted)
+ struct xlog_in_core *iclog)
{
struct xlog *log = iclog->ic_log;
spin_lock(&log->l_icloglock);
-
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
/*
* If we got an error, either on the first buffer, or in the case of
- * split log writes, on the second, we mark ALL iclogs STATE_IOERROR,
- * and none should ever be attempted to be written to disk
- * again.
+ * split log writes, on the second, we shut down the file system and
+ * no iclogs should ever be attempted to be written to disk again.
*/
- if (iclog->ic_state == XLOG_STATE_SYNCING)
+ if (!XLOG_FORCED_SHUTDOWN(log)) {
+ ASSERT(iclog->ic_state == XLOG_STATE_SYNCING);
iclog->ic_state = XLOG_STATE_DONE_SYNC;
- else
- ASSERT(iclog->ic_state == XLOG_STATE_IOERROR);
+ }
/*
* Someone could be sleeping prior to writing out the next
@@ -2943,9 +2819,8 @@ xlog_state_done_syncing(
*/
wake_up_all(&iclog->ic_write_wait);
spin_unlock(&log->l_icloglock);
- xlog_state_do_callback(log, aborted); /* also cleans log */
-} /* xlog_state_done_syncing */
-
+ xlog_state_do_callback(log);
+}
/*
* If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
@@ -3064,21 +2939,21 @@ restart:
*logoffsetp = log_offset;
return 0;
-} /* xlog_state_get_iclog_space */
-
-/* The first cnt-1 times through here we don't need to
- * move the grant write head because the permanent
- * reservation has reserved cnt times the unit amount.
- * Release part of current permanent unit reservation and
- * reset current reservation to be one units worth. Also
- * move grant reservation head forward.
+}
+
+/*
+ * The first cnt-1 times a ticket goes through here we don't need to move the
+ * grant write head because the permanent reservation has reserved cnt times the
+ * unit amount. Release part of current permanent unit reservation and reset
+ * current reservation to be one units worth. Also move grant reservation head
+ * forward.
*/
-STATIC void
-xlog_regrant_reserve_log_space(
+void
+xfs_log_ticket_regrant(
struct xlog *log,
struct xlog_ticket *ticket)
{
- trace_xfs_log_regrant_reserve_enter(log, ticket);
+ trace_xfs_log_ticket_regrant(log, ticket);
if (ticket->t_cnt > 0)
ticket->t_cnt--;
@@ -3090,21 +2965,20 @@ xlog_regrant_reserve_log_space(
ticket->t_curr_res = ticket->t_unit_res;
xlog_tic_reset_res(ticket);
- trace_xfs_log_regrant_reserve_sub(log, ticket);
+ trace_xfs_log_ticket_regrant_sub(log, ticket);
/* just return if we still have some of the pre-reserved space */
- if (ticket->t_cnt > 0)
- return;
+ if (!ticket->t_cnt) {
+ xlog_grant_add_space(log, &log->l_reserve_head.grant,
+ ticket->t_unit_res);
+ trace_xfs_log_ticket_regrant_exit(log, ticket);
- xlog_grant_add_space(log, &log->l_reserve_head.grant,
- ticket->t_unit_res);
-
- trace_xfs_log_regrant_reserve_exit(log, ticket);
-
- ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
-} /* xlog_regrant_reserve_log_space */
+ ticket->t_curr_res = ticket->t_unit_res;
+ xlog_tic_reset_res(ticket);
+ }
+ xfs_log_ticket_put(ticket);
+}
/*
* Give back the space left from a reservation.
@@ -3120,18 +2994,19 @@ xlog_regrant_reserve_log_space(
* space, the count will stay at zero and the only space remaining will be
* in the current reservation field.
*/
-STATIC void
-xlog_ungrant_log_space(
+void
+xfs_log_ticket_ungrant(
struct xlog *log,
struct xlog_ticket *ticket)
{
- int bytes;
+ int bytes;
+
+ trace_xfs_log_ticket_ungrant(log, ticket);
if (ticket->t_cnt > 0)
ticket->t_cnt--;
- trace_xfs_log_ungrant_enter(log, ticket);
- trace_xfs_log_ungrant_sub(log, ticket);
+ trace_xfs_log_ticket_ungrant_sub(log, ticket);
/*
* If this is a permanent reservation ticket, we may be able to free
@@ -3146,17 +3021,15 @@ xlog_ungrant_log_space(
xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
- trace_xfs_log_ungrant_exit(log, ticket);
+ trace_xfs_log_ticket_ungrant_exit(log, ticket);
xfs_log_space_wake(log->l_mp);
+ xfs_log_ticket_put(ticket);
}
/*
- * This routine will mark the current iclog in the ring as WANT_SYNC
- * and move the current iclog pointer to the next iclog in the ring.
- * When this routine is called from xlog_state_get_iclog_space(), the
- * exact size of the iclog has not yet been determined. All we know is
- * that every data block. We have run out of space in this log record.
+ * This routine will mark the current iclog in the ring as WANT_SYNC and move
+ * the current iclog pointer to the next iclog in the ring.
*/
STATIC void
xlog_state_switch_iclogs(
@@ -3165,6 +3038,8 @@ xlog_state_switch_iclogs(
int eventual_size)
{
ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ assert_spin_locked(&log->l_icloglock);
+
if (!eventual_size)
eventual_size = iclog->ic_offset;
iclog->ic_state = XLOG_STATE_WANT_SYNC;
@@ -3199,7 +3074,7 @@ xlog_state_switch_iclogs(
}
ASSERT(iclog == log->l_iclog);
log->l_iclog = iclog->ic_next;
-} /* xlog_state_switch_iclogs */
+}
/*
* Write out all data in the in-core log as of this exact moment in time.
@@ -3259,9 +3134,6 @@ xfs_log_force(
* previous iclog and go to sleep.
*/
iclog = iclog->ic_prev;
- if (iclog->ic_state == XLOG_STATE_ACTIVE ||
- iclog->ic_state == XLOG_STATE_DIRTY)
- goto out_unlock;
} else if (iclog->ic_state == XLOG_STATE_ACTIVE) {
if (atomic_read(&iclog->ic_refcnt) == 0) {
/*
@@ -3277,8 +3149,7 @@ xfs_log_force(
if (xlog_state_release_iclog(log, iclog))
goto out_error;
- if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn ||
- iclog->ic_state == XLOG_STATE_DIRTY)
+ if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
goto out_unlock;
} else {
/*
@@ -3298,17 +3169,8 @@ xfs_log_force(
;
}
- if (!(flags & XFS_LOG_SYNC))
- goto out_unlock;
-
- if (iclog->ic_state == XLOG_STATE_IOERROR)
- goto out_error;
- XFS_STATS_INC(mp, xs_log_force_sleep);
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- if (iclog->ic_state == XLOG_STATE_IOERROR)
- return -EIO;
- return 0;
-
+ if (flags & XFS_LOG_SYNC)
+ return xlog_wait_on_iclog(iclog);
out_unlock:
spin_unlock(&log->l_icloglock);
return 0;
@@ -3339,9 +3201,6 @@ __xfs_log_force_lsn(
goto out_unlock;
}
- if (iclog->ic_state == XLOG_STATE_DIRTY)
- goto out_unlock;
-
if (iclog->ic_state == XLOG_STATE_ACTIVE) {
/*
* We sleep here if we haven't already slept (e.g. this is the
@@ -3375,20 +3234,8 @@ __xfs_log_force_lsn(
*log_flushed = 1;
}
- if (!(flags & XFS_LOG_SYNC) ||
- (iclog->ic_state == XLOG_STATE_ACTIVE ||
- iclog->ic_state == XLOG_STATE_DIRTY))
- goto out_unlock;
-
- if (iclog->ic_state == XLOG_STATE_IOERROR)
- goto out_error;
-
- XFS_STATS_INC(mp, xs_log_force_sleep);
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- if (iclog->ic_state == XLOG_STATE_IOERROR)
- return -EIO;
- return 0;
-
+ if (flags & XFS_LOG_SYNC)
+ return xlog_wait_on_iclog(iclog);
out_unlock:
spin_unlock(&log->l_icloglock);
return 0;
@@ -3435,33 +3282,6 @@ xfs_log_force_lsn(
}
/*
- * Called when we want to mark the current iclog as being ready to sync to
- * disk.
- */
-STATIC void
-xlog_state_want_sync(
- struct xlog *log,
- struct xlog_in_core *iclog)
-{
- assert_spin_locked(&log->l_icloglock);
-
- if (iclog->ic_state == XLOG_STATE_ACTIVE) {
- xlog_state_switch_iclogs(log, iclog, 0);
- } else {
- ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- iclog->ic_state == XLOG_STATE_IOERROR);
- }
-}
-
-
-/*****************************************************************************
- *
- * TICKET functions
- *
- *****************************************************************************
- */
-
-/*
* Free a used ticket when its refcount falls to zero.
*/
void
@@ -3609,7 +3429,6 @@ xlog_ticket_alloc(
tic->t_ocnt = cnt;
tic->t_tid = prandom_u32();
tic->t_clientid = client;
- tic->t_flags = XLOG_TIC_INITED;
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
@@ -3618,13 +3437,6 @@ xlog_ticket_alloc(
return tic;
}
-
-/******************************************************************************
- *
- * Log debug routines
- *
- ******************************************************************************
- */
#if defined(DEBUG)
/*
* Make sure that the destination ptr is within the valid data region of
@@ -3710,7 +3522,7 @@ xlog_verify_tail_lsn(
if (blocks < BTOBB(iclog->ic_offset) + 1)
xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
}
-} /* xlog_verify_tail_lsn */
+}
/*
* Perform a number of checks on the iclog before writing to disk.
@@ -3813,7 +3625,7 @@ xlog_verify_iclog(
}
ptr += sizeof(xlog_op_header_t) + op_len;
}
-} /* xlog_verify_iclog */
+}
#endif
/*
@@ -3937,7 +3749,7 @@ xfs_log_force_umount(
spin_lock(&log->l_cilp->xc_push_lock);
wake_up_all(&log->l_cilp->xc_commit_wait);
spin_unlock(&log->l_cilp->xc_push_lock);
- xlog_state_do_callback(log, true);
+ xlog_state_do_callback(log);
/* return non-zero if log IOERROR transition had already happened */
return retval;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 84e06805160f..1412d6993f1e 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -105,10 +105,6 @@ struct xfs_log_item;
struct xfs_item_ops;
struct xfs_trans;
-xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- bool regrant);
int xfs_log_force(struct xfs_mount *mp, uint flags);
int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags,
int *log_forced);
@@ -121,8 +117,7 @@ void xfs_log_mount_cancel(struct xfs_mount *);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
void xfs_log_space_wake(struct xfs_mount *mp);
-int xfs_log_release_iclog(struct xfs_mount *mp,
- struct xlog_in_core *iclog);
+void xfs_log_release_iclog(struct xlog_in_core *iclog);
int xfs_log_reserve(struct xfs_mount *mp,
int length,
int count,
@@ -138,7 +133,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket);
void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_lsn_t *commit_lsn, bool regrant);
-void xlog_cil_process_committed(struct list_head *list, bool aborted);
+void xlog_cil_process_committed(struct list_head *list);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 48435cf2aa16..b43f0e8f43f2 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -574,10 +574,10 @@ xlog_discard_busy_extents(
*/
static void
xlog_cil_committed(
- struct xfs_cil_ctx *ctx,
- bool abort)
+ struct xfs_cil_ctx *ctx)
{
struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
+ bool abort = XLOG_FORCED_SHUTDOWN(ctx->cil->xc_log);
/*
* If the I/O failed, we're aborting the commit and already shutdown.
@@ -613,37 +613,38 @@ xlog_cil_committed(
void
xlog_cil_process_committed(
- struct list_head *list,
- bool aborted)
+ struct list_head *list)
{
struct xfs_cil_ctx *ctx;
while ((ctx = list_first_entry_or_null(list,
struct xfs_cil_ctx, iclog_entry))) {
list_del(&ctx->iclog_entry);
- xlog_cil_committed(ctx, aborted);
+ xlog_cil_committed(ctx);
}
}
/*
- * Push the Committed Item List to the log. If @push_seq flag is zero, then it
- * is a background flush and so we can chose to ignore it. Otherwise, if the
- * current sequence is the same as @push_seq we need to do a flush. If
- * @push_seq is less than the current sequence, then it has already been
+ * Push the Committed Item List to the log.
+ *
+ * If the current sequence is the same as xc_push_seq we need to do a flush. If
+ * xc_push_seq is less than the current sequence, then it has already been
* flushed and we don't need to do anything - the caller will wait for it to
* complete if necessary.
*
- * @push_seq is a value rather than a flag because that allows us to do an
- * unlocked check of the sequence number for a match. Hence we can allows log
- * forces to run racily and not issue pushes for the same sequence twice. If we
- * get a race between multiple pushes for the same sequence they will block on
- * the first one and then abort, hence avoiding needless pushes.
+ * xc_push_seq is checked unlocked against the sequence number for a match.
+ * Hence we can allow log forces to run racily and not issue pushes for the
+ * same sequence twice. If we get a race between multiple pushes for the same
+ * sequence they will block on the first one and then abort, hence avoiding
+ * needless pushes.
*/
-STATIC int
-xlog_cil_push(
- struct xlog *log)
+static void
+xlog_cil_push_work(
+ struct work_struct *work)
{
- struct xfs_cil *cil = log->l_cilp;
+ struct xfs_cil *cil =
+ container_of(work, struct xfs_cil, xc_push_work);
+ struct xlog *log = cil->xc_log;
struct xfs_log_vec *lv;
struct xfs_cil_ctx *ctx;
struct xfs_cil_ctx *new_ctx;
@@ -657,9 +658,6 @@ xlog_cil_push(
xfs_lsn_t commit_lsn;
xfs_lsn_t push_seq;
- if (!cil)
- return 0;
-
new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
new_ctx->ticket = xlog_cil_ticket_alloc(log);
@@ -671,6 +669,11 @@ xlog_cil_push(
ASSERT(push_seq <= ctx->sequence);
/*
+ * Wake up any background push waiters now this context is being pushed.
+ */
+ wake_up_all(&ctx->push_wait);
+
+ /*
* Check if we've anything to push. If there is nothing, then we don't
* move on to a new sequence number and so we have to be able to push
* this sequence again later.
@@ -746,6 +749,7 @@ xlog_cil_push(
*/
INIT_LIST_HEAD(&new_ctx->committing);
INIT_LIST_HEAD(&new_ctx->busy_extents);
+ init_waitqueue_head(&new_ctx->push_wait);
new_ctx->sequence = ctx->sequence + 1;
new_ctx->cil = cil;
cil->xc_ctx = new_ctx;
@@ -803,7 +807,7 @@ xlog_cil_push(
lvhdr.lv_iovecp = &lhdr;
lvhdr.lv_next = ctx->lv_chain;
- error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
+ error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true);
if (error)
goto out_abort_free_ticket;
@@ -841,10 +845,11 @@ restart:
}
spin_unlock(&cil->xc_push_lock);
- /* xfs_log_done always frees the ticket on error. */
- commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
- if (commit_lsn == -1)
- goto out_abort;
+ error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn);
+ if (error)
+ goto out_abort_free_ticket;
+
+ xfs_log_ticket_ungrant(log, tic);
spin_lock(&commit_iclog->ic_callback_lock);
if (commit_iclog->ic_state == XLOG_STATE_IOERROR) {
@@ -867,28 +872,20 @@ restart:
spin_unlock(&cil->xc_push_lock);
/* release the hounds! */
- return xfs_log_release_iclog(log->l_mp, commit_iclog);
+ xfs_log_release_iclog(commit_iclog);
+ return;
out_skip:
up_write(&cil->xc_ctx_lock);
xfs_log_ticket_put(new_ctx->ticket);
kmem_free(new_ctx);
- return 0;
+ return;
out_abort_free_ticket:
- xfs_log_ticket_put(tic);
+ xfs_log_ticket_ungrant(log, tic);
out_abort:
- xlog_cil_committed(ctx, true);
- return -EIO;
-}
-
-static void
-xlog_cil_push_work(
- struct work_struct *work)
-{
- struct xfs_cil *cil = container_of(work, struct xfs_cil,
- xc_push_work);
- xlog_cil_push(cil->xc_log);
+ ASSERT(XLOG_FORCED_SHUTDOWN(log));
+ xlog_cil_committed(ctx);
}
/*
@@ -900,7 +897,7 @@ xlog_cil_push_work(
*/
static void
xlog_cil_push_background(
- struct xlog *log)
+ struct xlog *log) __releases(cil->xc_ctx_lock)
{
struct xfs_cil *cil = log->l_cilp;
@@ -914,14 +911,36 @@ xlog_cil_push_background(
* don't do a background push if we haven't used up all the
* space available yet.
*/
- if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+ if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
+ up_read(&cil->xc_ctx_lock);
return;
+ }
spin_lock(&cil->xc_push_lock);
if (cil->xc_push_seq < cil->xc_current_sequence) {
cil->xc_push_seq = cil->xc_current_sequence;
queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
}
+
+ /*
+ * Drop the context lock now, we can't hold that if we need to sleep
+ * because we are over the blocking threshold. The push_lock is still
+ * held, so blocking threshold sleep/wakeup is still correctly
+ * serialised here.
+ */
+ up_read(&cil->xc_ctx_lock);
+
+ /*
+ * If we are well over the space limit, throttle the work that is being
+ * done until the push work on this context has begun.
+ */
+ if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
+ trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
+ ASSERT(cil->xc_ctx->space_used < log->l_logsize);
+ xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock);
+ return;
+ }
+
spin_unlock(&cil->xc_push_lock);
}
@@ -1017,7 +1036,10 @@ xfs_log_commit_cil(
if (commit_lsn)
*commit_lsn = xc_commit_lsn;
- xfs_log_done(mp, tp->t_ticket, NULL, regrant);
+ if (regrant && !XLOG_FORCED_SHUTDOWN(log))
+ xfs_log_ticket_regrant(log, tp->t_ticket);
+ else
+ xfs_log_ticket_ungrant(log, tp->t_ticket);
tp->t_ticket = NULL;
xfs_trans_unreserve_and_mod_sb(tp);
@@ -1038,9 +1060,9 @@ xfs_log_commit_cil(
if (lip->li_ops->iop_committing)
lip->li_ops->iop_committing(lip, xc_commit_lsn);
}
- xlog_cil_push_background(log);
- up_read(&cil->xc_ctx_lock);
+ /* xlog_cil_push_background() releases cil->xc_ctx_lock */
+ xlog_cil_push_background(log);
}
/*
@@ -1199,6 +1221,7 @@ xlog_cil_init(
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents);
+ init_waitqueue_head(&ctx->push_wait);
ctx->sequence = 1;
ctx->cil = cil;
cil->xc_ctx = ctx;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index b192c5a9f9fd..ec22c7a3867f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -51,13 +51,11 @@ enum xlog_iclog_state {
};
/*
- * Flags to log ticket
+ * Log ticket flags
*/
-#define XLOG_TIC_INITED 0x1 /* has been initialized */
-#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
+#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */
#define XLOG_TIC_FLAGS \
- { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
{ XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
/*
@@ -242,6 +240,7 @@ struct xfs_cil_ctx {
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
+ wait_queue_head_t push_wait; /* background push throttle */
struct work_struct discard_endio_work;
};
@@ -318,13 +317,53 @@ struct xfs_cil {
* tries to keep 25% of the log free, so we need to keep below that limit or we
* risk running out of free log space to start any new transactions.
*
- * In order to keep background CIL push efficient, we will set a lower
- * threshold at which background pushing is attempted without blocking current
- * transaction commits. A separate, higher bound defines when CIL pushes are
- * enforced to ensure we stay within our maximum checkpoint size bounds.
- * threshold, yet give us plenty of space for aggregation on large logs.
+ * In order to keep background CIL push efficient, we only need to ensure the
+ * CIL is large enough to maintain sufficient in-memory relogging to avoid
+ * repeated physical writes of frequently modified metadata. If we allow the CIL
+ * to grow to a substantial fraction of the log, then we may be pinning hundreds
+ * of megabytes of metadata in memory until the CIL flushes. This can cause
+ * issues when we are running low on memory - pinned memory cannot be reclaimed,
+ * and the CIL consumes a lot of memory. Hence we need to set an upper physical
+ * size limit for the CIL that limits the maximum amount of memory pinned by the
+ * CIL but does not limit performance by reducing relogging efficiency
+ * significantly.
+ *
+ * As such, the CIL push threshold ends up being the smaller of two thresholds:
+ * - a threshold large enough that it allows CIL to be pushed and progress to be
+ * made without excessive blocking of incoming transaction commits. This is
+ * defined to be 12.5% of the log space - half the 25% push threshold of the
+ * AIL.
+ * - small enough that it doesn't pin excessive amounts of memory but maintains
+ * close to peak relogging efficiency. This is defined to be 16x the iclog
+ * buffer window (32MB) as measurements have shown this to be roughly the
+ * point of diminishing performance increases under highly concurrent
+ * modification workloads.
+ *
+ * To prevent the CIL from overflowing upper commit size bounds, we introduce a
+ * new threshold at which we block committing transactions until the background
+ * CIL commit commences and switches to a new context. While this is not a hard
+ * limit, it forces the process committing a transaction to the CIL to block and
+ * yeild the CPU, giving the CIL push work a chance to be scheduled and start
+ * work. This prevents a process running lots of transactions from overfilling
+ * the CIL because it is not yielding the CPU. We set the blocking limit at
+ * twice the background push space threshold so we keep in line with the AIL
+ * push thresholds.
+ *
+ * Note: this is not a -hard- limit as blocking is applied after the transaction
+ * is inserted into the CIL and the push has been triggered. It is largely a
+ * throttling mechanism that allows the CIL push to be scheduled and run. A hard
+ * limit will be difficult to implement without introducing global serialisation
+ * in the CIL commit fast path, and it's not at all clear that we actually need
+ * such hard limits given the ~7 years we've run without a hard limit before
+ * finding the first situation where a checkpoint size overflow actually
+ * occurred. Hence the simple throttle, and an ASSERT check to tell us that
+ * we've overrun the max size.
*/
-#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
+#define XLOG_CIL_SPACE_LIMIT(log) \
+ min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
+
+#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \
+ (XLOG_CIL_SPACE_LIMIT(log) * 2)
/*
* ticket grant locks, queues and accounting have their own cachlines
@@ -402,7 +441,8 @@ struct xlog {
#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
-#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
+#define XLOG_FORCED_SHUTDOWN(log) \
+ (unlikely((log)->l_flags & XLOG_IO_ERROR))
/* common routines */
extern int
@@ -438,14 +478,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
-int
-xlog_write(
- struct xlog *log,
- struct xfs_log_vec *log_vector,
- struct xlog_ticket *tic,
- xfs_lsn_t *start_lsn,
- struct xlog_in_core **commit_iclog,
- uint flags);
+int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector,
+ struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
+ struct xlog_in_core **commit_iclog, uint flags,
+ bool need_start_rec);
+int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
+ struct xlog_in_core **iclog, xfs_lsn_t *lsn);
+void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
+void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
@@ -525,12 +565,6 @@ xlog_cil_force(struct xlog *log)
}
/*
- * Unmount record type is used as a pseudo transaction type for the ticket.
- * It's value must be outside the range of XFS_TRANS_* values.
- */
-#define XLOG_UNMOUNT_REC_TYPE (-1U)
-
-/*
* Wrapper function for waiting on a wait queue serialised against wakeups
* by a spinlock. This matches the semantics of all the wait queues used in the
* log code.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 25cfc85dbaa7..11c3502b07b1 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2869,8 +2869,8 @@ xfs_recover_inode_owner_change(
return -ENOMEM;
/* instantiate the inode */
+ ASSERT(dip->di_version >= 3);
xfs_inode_from_disk(ip, dip);
- ASSERT(ip->i_d.di_version >= 3);
error = xfs_iformat_fork(ip, dip);
if (error)
@@ -2997,7 +2997,7 @@ xlog_recover_inode_pass2(
* superblock flag to determine whether we need to look at di_flushiter
* to skip replay when the on disk inode is newer than the log one
*/
- if (!xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (!xfs_sb_version_has_v3inode(&mp->m_sb) &&
ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
/*
* Deal with the wrap case, DI_MAX_FLUSH is less
@@ -3068,7 +3068,7 @@ xlog_recover_inode_pass2(
error = -EFSCORRUPTED;
goto out_release;
}
- isize = xfs_log_dinode_size(ldip->di_version);
+ isize = xfs_log_dinode_size(mp);
if (unlikely(item->ri_buf[1].i_len > isize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
XFS_ERRLEVEL_LOW, mp, ldip,
@@ -4947,7 +4947,7 @@ xlog_recover_clear_agi_bucket(
if (error)
goto out_abort;
- agi = XFS_BUF_TO_AGI(agibp);
+ agi = agibp->b_addr;
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
offset = offsetof(xfs_agi_t, agi_unlinked) +
(sizeof(xfs_agino_t) * bucket);
@@ -5083,7 +5083,7 @@ xlog_recover_process_iunlinks(
* buffer reference though, so that it stays pinned in memory
* while we need the buffer.
*/
- agi = XFS_BUF_TO_AGI(agibp);
+ agi = agibp->b_addr;
xfs_buf_unlock(agibp);
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
@@ -5636,7 +5636,7 @@ xlog_do_recover(
/* Convert superblock from on-disk format */
sbp = &mp->m_sb;
- xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+ xfs_sb_from_disk(sbp, bp->b_addr);
xfs_buf_relse(bp);
/* re-initialise in-core superblock and geometry structures */
@@ -5809,7 +5809,6 @@ xlog_recover_check_summary(
struct xlog *log)
{
xfs_mount_t *mp;
- xfs_agf_t *agfp;
xfs_buf_t *agfbp;
xfs_buf_t *agibp;
xfs_agnumber_t agno;
@@ -5829,7 +5828,8 @@ xlog_recover_check_summary(
xfs_alert(mp, "%s agf read failed agno %d error %d",
__func__, agno, error);
} else {
- agfp = XFS_BUF_TO_AGF(agfbp);
+ struct xfs_agf *agfp = agfbp->b_addr;
+
freeblks += be32_to_cpu(agfp->agf_freeblks) +
be32_to_cpu(agfp->agf_flcount);
xfs_buf_relse(agfbp);
@@ -5840,7 +5840,7 @@ xlog_recover_check_summary(
xfs_alert(mp, "%s agi read failed agno %d error %d",
__func__, agno, error);
} else {
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
+ struct xfs_agi *agi = agibp->b_addr;
itotal += be32_to_cpu(agi->agi_count);
ifree += be32_to_cpu(agi->agi_freecount);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 56efe140c923..c5513e5a226a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -310,7 +310,7 @@ reread:
/*
* Initialize the mount structure from the superblock.
*/
- xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+ xfs_sb_from_disk(sbp, bp->b_addr);
/*
* If we haven't validated the superblock, do so now before we try
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 88ab09ed29e7..50c43422fa17 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -167,6 +167,7 @@ typedef struct xfs_mount {
struct xfs_kobj m_error_meta_kobj;
struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
struct xstats m_stats; /* per-fs stats */
+ struct ratelimit_state m_flush_inodes_ratelimit;
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 0b0909657bad..c225691fad15 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -121,12 +121,11 @@ xfs_qm_dqpurge(
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_quotainfo *qi = mp->m_quotainfo;
+ int error = -EAGAIN;
xfs_dqlock(dqp);
- if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
- xfs_dqunlock(dqp);
- return -EAGAIN;
- }
+ if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0)
+ goto out_unlock;
dqp->dq_flags |= XFS_DQ_FREEING;
@@ -139,7 +138,6 @@ xfs_qm_dqpurge(
*/
if (XFS_DQ_IS_DIRTY(dqp)) {
struct xfs_buf *bp = NULL;
- int error;
/*
* We don't care about getting disk errors here. We need
@@ -149,6 +147,8 @@ xfs_qm_dqpurge(
if (!error) {
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
+ } else if (error == -EAGAIN) {
+ goto out_unlock;
}
xfs_dqflock(dqp);
}
@@ -174,6 +174,10 @@ xfs_qm_dqpurge(
xfs_qm_dqdestroy(dqp);
return 0;
+
+out_unlock:
+ xfs_dqunlock(dqp);
+ return error;
}
/*
@@ -326,16 +330,16 @@ xfs_qm_dqattach_locked(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
- error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
- doalloc, &ip->i_udquot);
+ error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)),
+ XFS_DQ_USER, doalloc, &ip->i_udquot);
if (error)
goto done;
ASSERT(ip->i_udquot);
}
if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
- error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
- doalloc, &ip->i_gdquot);
+ error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)),
+ XFS_DQ_GROUP, doalloc, &ip->i_gdquot);
if (error)
goto done;
ASSERT(ip->i_gdquot);
@@ -871,12 +875,20 @@ xfs_qm_reset_dqcounts(
ddq->d_bcount = 0;
ddq->d_icount = 0;
ddq->d_rtbcount = 0;
- ddq->d_btimer = 0;
- ddq->d_itimer = 0;
- ddq->d_rtbtimer = 0;
- ddq->d_bwarns = 0;
- ddq->d_iwarns = 0;
- ddq->d_rtbwarns = 0;
+
+ /*
+ * dquot id 0 stores the default grace period and the maximum
+ * warning limit that were set by the administrator, so we
+ * should not reset them.
+ */
+ if (ddq->d_id != 0) {
+ ddq->d_btimer = 0;
+ ddq->d_itimer = 0;
+ ddq->d_rtbtimer = 0;
+ ddq->d_bwarns = 0;
+ ddq->d_iwarns = 0;
+ ddq->d_rtbwarns = 0;
+ }
if (xfs_sb_version_hascrc(&mp->m_sb)) {
xfs_update_cksum((char *)&dqb[j],
@@ -1613,8 +1625,8 @@ xfs_qm_dqfree_one(
int
xfs_qm_vop_dqalloc(
struct xfs_inode *ip,
- xfs_dqid_t uid,
- xfs_dqid_t gid,
+ kuid_t uid,
+ kgid_t gid,
prid_t prid,
uint flags,
struct xfs_dquot **O_udqpp,
@@ -1622,6 +1634,8 @@ xfs_qm_vop_dqalloc(
struct xfs_dquot **O_pdqpp)
{
struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ struct user_namespace *user_ns = inode->i_sb->s_user_ns;
struct xfs_dquot *uq = NULL;
struct xfs_dquot *gq = NULL;
struct xfs_dquot *pq = NULL;
@@ -1635,7 +1649,7 @@ xfs_qm_vop_dqalloc(
xfs_ilock(ip, lockflags);
if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip))
- gid = ip->i_d.di_gid;
+ gid = inode->i_gid;
/*
* Attach the dquot(s) to this inode, doing a dquot allocation
@@ -1650,7 +1664,7 @@ xfs_qm_vop_dqalloc(
}
if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
- if (ip->i_d.di_uid != uid) {
+ if (!uid_eq(inode->i_uid, uid)) {
/*
* What we need is the dquot that has this uid, and
* if we send the inode to dqget, the uid of the inode
@@ -1661,7 +1675,8 @@ xfs_qm_vop_dqalloc(
* holding ilock.
*/
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq);
+ error = xfs_qm_dqget(mp, from_kuid(user_ns, uid),
+ XFS_DQ_USER, true, &uq);
if (error) {
ASSERT(error != -ENOENT);
return error;
@@ -1682,9 +1697,10 @@ xfs_qm_vop_dqalloc(
}
}
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
- if (ip->i_d.di_gid != gid) {
+ if (!gid_eq(inode->i_gid, gid)) {
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq);
+ error = xfs_qm_dqget(mp, from_kgid(user_ns, gid),
+ XFS_DQ_GROUP, true, &gq);
if (error) {
ASSERT(error != -ENOENT);
goto error_rele;
@@ -1810,7 +1826,7 @@ xfs_qm_vop_chown_reserve(
XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
if (XFS_IS_UQUOTA_ON(mp) && udqp &&
- ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
+ i_uid_read(VFS_I(ip)) != be32_to_cpu(udqp->q_core.d_id)) {
udq_delblks = udqp;
/*
* If there are delayed allocation blocks, then we have to
@@ -1823,7 +1839,7 @@ xfs_qm_vop_chown_reserve(
}
}
if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
- ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) {
+ i_gid_read(VFS_I(ip)) != be32_to_cpu(gdqp->q_core.d_id)) {
gdq_delblks = gdqp;
if (delblks) {
ASSERT(ip->i_gdquot);
@@ -1920,14 +1936,15 @@ xfs_qm_vop_create_dqattach(
if (udqp && XFS_IS_UQUOTA_ON(mp)) {
ASSERT(ip->i_udquot == NULL);
- ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
+ ASSERT(i_uid_read(VFS_I(ip)) == be32_to_cpu(udqp->q_core.d_id));
ip->i_udquot = xfs_qm_dqhold(udqp);
xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
ASSERT(ip->i_gdquot == NULL);
- ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
+ ASSERT(i_gid_read(VFS_I(ip)) == be32_to_cpu(gdqp->q_core.d_id));
+
ip->i_gdquot = xfs_qm_dqhold(gdqp);
xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 1ea82764bf89..5d5ac65aa1cc 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -29,8 +29,6 @@ xfs_qm_log_quotaoff(
int error;
struct xfs_qoff_logitem *qoffi;
- *qoffstartp = NULL;
-
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
if (error)
goto out;
@@ -62,7 +60,7 @@ out:
STATIC int
xfs_qm_log_quotaoff_end(
struct xfs_mount *mp,
- struct xfs_qoff_logitem *startqoff,
+ struct xfs_qoff_logitem **startqoff,
uint flags)
{
struct xfs_trans *tp;
@@ -73,9 +71,10 @@ xfs_qm_log_quotaoff_end(
if (error)
return error;
- qoffi = xfs_trans_get_qoff_item(tp, startqoff,
+ qoffi = xfs_trans_get_qoff_item(tp, *startqoff,
flags & XFS_ALL_QUOTA_ACCT);
xfs_trans_log_quotaoff_item(tp, qoffi);
+ *startqoff = NULL;
/*
* We have to make sure that the transaction is secure on disk before we
@@ -103,7 +102,7 @@ xfs_qm_scall_quotaoff(
uint dqtype;
int error;
uint inactivate_flags;
- struct xfs_qoff_logitem *qoffstart;
+ struct xfs_qoff_logitem *qoffstart = NULL;
/*
* No file system can have quotas enabled on disk but not in core.
@@ -228,7 +227,7 @@ xfs_qm_scall_quotaoff(
* So, we have QUOTAOFF start and end logitems; the start
* logitem won't get overwritten until the end logitem appears...
*/
- error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+ error = xfs_qm_log_quotaoff_end(mp, &qoffstart, flags);
if (error) {
/* We're screwed now. Shutdown is the only option. */
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -261,6 +260,8 @@ xfs_qm_scall_quotaoff(
}
out_unlock:
+ if (error && qoffstart)
+ xfs_qm_qoff_logitem_relse(qoffstart);
mutex_unlock(&q->qi_quotaofflock);
return error;
}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index efe42ae7a2f3..aa8fc1f55fbd 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -86,7 +86,7 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
struct xfs_mount *, struct xfs_dquot *,
struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint);
-extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t,
prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
struct xfs_dquot **);
extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
@@ -109,7 +109,7 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
#else
static inline int
-xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
prid_t prid, uint flags, struct xfs_dquot **udqp,
struct xfs_dquot **gdqp, struct xfs_dquot **pdqp)
{
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 113883c4f202..f70f1255220b 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -57,13 +57,13 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
/* Loop over all stats groups */
for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
- len += snprintf(buf + len, PATH_MAX - len, "%s",
+ len += scnprintf(buf + len, PATH_MAX - len, "%s",
xstats[i].desc);
/* inner loop does each group */
for (; j < xstats[i].endpoint; j++)
- len += snprintf(buf + len, PATH_MAX - len, " %u",
+ len += scnprintf(buf + len, PATH_MAX - len, " %u",
counter_val(stats, j));
- len += snprintf(buf + len, PATH_MAX - len, "\n");
+ len += scnprintf(buf + len, PATH_MAX - len, "\n");
}
/* extra precision counters */
for_each_possible_cpu(i) {
@@ -72,9 +72,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes;
}
- len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
+ len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
- len += snprintf(buf + len, PATH_MAX-len, "debug %u\n",
+ len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n",
#if defined(DEBUG)
1);
#else
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2094386af8ac..abf06bf9c3f3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -528,6 +528,9 @@ xfs_flush_inodes(
{
struct super_block *sb = mp->m_super;
+ if (!__ratelimit(&mp->m_flush_inodes_ratelimit))
+ return;
+
if (down_read_trylock(&sb->s_umount)) {
sync_inodes_sb(sb);
up_read(&sb->s_umount);
@@ -1366,6 +1369,17 @@ xfs_fc_fill_super(
if (error)
goto out_free_names;
+ /*
+ * Cap the number of invocations of xfs_flush_inodes to 16 for every
+ * quarter of a second. The magic numbers here were determined by
+ * observation neither to cause stalls in writeback when there are a
+ * lot of IO threads and the fs is near ENOSPC, nor cause any fstest
+ * regressions. YMMV.
+ */
+ ratelimit_state_init(&mp->m_flush_inodes_ratelimit, HZ / 4, 16);
+ ratelimit_set_flags(&mp->m_flush_inodes_ratelimit,
+ RATELIMIT_MSG_ON_RELEASE);
+
error = xfs_init_mount_workqueues(mp);
if (error)
goto out_close_devices;
@@ -1861,7 +1875,8 @@ xfs_init_zones(void)
xfs_ili_zone = kmem_cache_create("xfs_ili",
sizeof(struct xfs_inode_log_item), 0,
- SLAB_MEM_SPREAD, NULL);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
if (!xfs_ili_zone)
goto out_destroy_inode_zone;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index d762d42ed0ff..13fb4b919648 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -176,15 +176,12 @@ xfs_symlink(
return -ENAMETOOLONG;
ASSERT(pathlen > 0);
- udqp = gdqp = NULL;
prid = xfs_get_initial_prid(dp);
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp,
- xfs_kuid_to_uid(current_fsuid()),
- xfs_kgid_to_gid(current_fsgid()), prid,
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -194,7 +191,7 @@ xfs_symlink(
* The symlink will fit into the inode data fork?
* There can't be any attributes so we get the whole variable part.
*/
- if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version))
+ if (pathlen <= XFS_LITINO(mp))
fs_blocks = 0;
else
fs_blocks = xfs_symlink_blocks(mp, pathlen);
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index bc85b89f88ca..120398a37c2a 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -6,6 +6,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
+#include "xfs_bit.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -27,6 +28,7 @@
#include "xfs_log_recover.h"
#include "xfs_filestream.h"
#include "xfs_fsmap.h"
+#include "xfs_btree_staging.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index e242988f57fb..a4323a63438d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -35,6 +35,12 @@ struct xfs_icreate_log;
struct xfs_owner_info;
struct xfs_trans_res;
struct xfs_inobt_rec_incore;
+union xfs_btree_ptr;
+
+#define XFS_ATTR_FILTER_FLAGS \
+ { XFS_ATTR_ROOT, "ROOT" }, \
+ { XFS_ATTR_SECURE, "SECURE" }, \
+ { XFS_ATTR_INCOMPLETE, "INCOMPLETE" }
DECLARE_EVENT_CLASS(xfs_attr_list_class,
TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -45,39 +51,39 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
__field(u32, hashval)
__field(u32, blkno)
__field(u32, offset)
- __field(void *, alist)
+ __field(void *, buffer)
__field(int, bufsize)
__field(int, count)
__field(int, firstu)
__field(int, dupcnt)
- __field(int, flags)
+ __field(unsigned int, attr_filter)
),
TP_fast_assign(
__entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
__entry->ino = ctx->dp->i_ino;
- __entry->hashval = ctx->cursor->hashval;
- __entry->blkno = ctx->cursor->blkno;
- __entry->offset = ctx->cursor->offset;
- __entry->alist = ctx->alist;
+ __entry->hashval = ctx->cursor.hashval;
+ __entry->blkno = ctx->cursor.blkno;
+ __entry->offset = ctx->cursor.offset;
+ __entry->buffer = ctx->buffer;
__entry->bufsize = ctx->bufsize;
__entry->count = ctx->count;
__entry->firstu = ctx->firstu;
- __entry->flags = ctx->flags;
+ __entry->attr_filter = ctx->attr_filter;
),
TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
- "alist %p size %u count %u firstu %u flags %d %s",
+ "buffer %p size %u count %u firstu %u filter %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->hashval,
__entry->blkno,
__entry->offset,
__entry->dupcnt,
- __entry->alist,
+ __entry->buffer,
__entry->bufsize,
__entry->count,
__entry->firstu,
- __entry->flags,
- __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS)
+ __print_flags(__entry->attr_filter, "|",
+ XFS_ATTR_FILTER_FLAGS)
)
)
@@ -169,31 +175,31 @@ TRACE_EVENT(xfs_attr_list_node_descend,
__field(u32, hashval)
__field(u32, blkno)
__field(u32, offset)
- __field(void *, alist)
+ __field(void *, buffer)
__field(int, bufsize)
__field(int, count)
__field(int, firstu)
__field(int, dupcnt)
- __field(int, flags)
+ __field(unsigned int, attr_filter)
__field(u32, bt_hashval)
__field(u32, bt_before)
),
TP_fast_assign(
__entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
__entry->ino = ctx->dp->i_ino;
- __entry->hashval = ctx->cursor->hashval;
- __entry->blkno = ctx->cursor->blkno;
- __entry->offset = ctx->cursor->offset;
- __entry->alist = ctx->alist;
+ __entry->hashval = ctx->cursor.hashval;
+ __entry->blkno = ctx->cursor.blkno;
+ __entry->offset = ctx->cursor.offset;
+ __entry->buffer = ctx->buffer;
__entry->bufsize = ctx->bufsize;
__entry->count = ctx->count;
__entry->firstu = ctx->firstu;
- __entry->flags = ctx->flags;
+ __entry->attr_filter = ctx->attr_filter;
__entry->bt_hashval = be32_to_cpu(btree->hashval);
__entry->bt_before = be32_to_cpu(btree->before);
),
TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
- "alist %p size %u count %u firstu %u flags %d %s "
+ "buffer %p size %u count %u firstu %u filter %s "
"node hashval %u, node before %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
@@ -201,12 +207,12 @@ TRACE_EVENT(xfs_attr_list_node_descend,
__entry->blkno,
__entry->offset,
__entry->dupcnt,
- __entry->alist,
+ __entry->buffer,
__entry->bufsize,
__entry->count,
__entry->firstu,
- __entry->flags,
- __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
+ __print_flags(__entry->attr_filter, "|",
+ XFS_ATTR_FILTER_FLAGS),
__entry->bt_hashval,
__entry->bt_before)
);
@@ -995,8 +1001,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
DEFINE_EVENT(xfs_loggrant_class, name, \
TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \
TP_ARGS(log, tic))
-DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
-DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
@@ -1005,12 +1009,13 @@ DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait);
DECLARE_EVENT_CLASS(xfs_log_item_class,
TP_PROTO(struct xfs_log_item *lip),
@@ -1701,7 +1706,8 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__field(int, namelen)
__field(int, valuelen)
__field(xfs_dahash_t, hashval)
- __field(int, flags)
+ __field(unsigned int, attr_filter)
+ __field(unsigned int, attr_flags)
__field(int, op_flags)
),
TP_fast_assign(
@@ -1712,11 +1718,12 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__entry->namelen = args->namelen;
__entry->valuelen = args->valuelen;
__entry->hashval = args->hashval;
- __entry->flags = args->flags;
+ __entry->attr_filter = args->attr_filter;
+ __entry->attr_flags = args->attr_flags;
__entry->op_flags = args->op_flags;
),
TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
- "hashval 0x%x flags %s op_flags %s",
+ "hashval 0x%x filter %s flags %s op_flags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->namelen,
@@ -1724,7 +1731,11 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__entry->namelen,
__entry->valuelen,
__entry->hashval,
- __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
+ __print_flags(__entry->attr_filter, "|",
+ XFS_ATTR_FILTER_FLAGS),
+ __print_flags(__entry->attr_flags, "|",
+ { XATTR_CREATE, "CREATE" },
+ { XATTR_REPLACE, "REPLACE" }),
__print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
)
@@ -3594,6 +3605,151 @@ TRACE_EVENT(xfs_check_new_dalign,
__entry->calc_rootino)
)
+TRACE_EVENT(xfs_btree_commit_afakeroot,
+ TP_PROTO(struct xfs_btree_cur *cur),
+ TP_ARGS(cur),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(unsigned int, levels)
+ __field(unsigned int, blocks)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->agno = cur->bc_ag.agno;
+ __entry->agbno = cur->bc_ag.afake->af_root;
+ __entry->levels = cur->bc_ag.afake->af_levels;
+ __entry->blocks = cur->bc_ag.afake->af_blocks;
+ ),
+ TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->agno,
+ __entry->levels,
+ __entry->blocks,
+ __entry->agbno)
+)
+
+TRACE_EVENT(xfs_btree_commit_ifakeroot,
+ TP_PROTO(struct xfs_btree_cur *cur),
+ TP_ARGS(cur),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(unsigned int, levels)
+ __field(unsigned int, blocks)
+ __field(int, whichfork)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->agno = XFS_INO_TO_AGNO(cur->bc_mp,
+ cur->bc_ino.ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(cur->bc_mp,
+ cur->bc_ino.ip->i_ino);
+ __entry->levels = cur->bc_ino.ifake->if_levels;
+ __entry->blocks = cur->bc_ino.ifake->if_blocks;
+ __entry->whichfork = cur->bc_ino.whichfork;
+ ),
+ TP_printk("dev %d:%d btree %s ag %u agino %u whichfork %s levels %u blocks %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->agno,
+ __entry->agino,
+ __entry->whichfork == XFS_ATTR_FORK ? "attr" : "data",
+ __entry->levels,
+ __entry->blocks)
+)
+
+TRACE_EVENT(xfs_btree_bload_level_geometry,
+ TP_PROTO(struct xfs_btree_cur *cur, unsigned int level,
+ uint64_t nr_this_level, unsigned int nr_per_block,
+ unsigned int desired_npb, uint64_t blocks,
+ uint64_t blocks_with_extra),
+ TP_ARGS(cur, level, nr_this_level, nr_per_block, desired_npb, blocks,
+ blocks_with_extra),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(unsigned int, level)
+ __field(unsigned int, nlevels)
+ __field(uint64_t, nr_this_level)
+ __field(unsigned int, nr_per_block)
+ __field(unsigned int, desired_npb)
+ __field(unsigned long long, blocks)
+ __field(unsigned long long, blocks_with_extra)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->nlevels = cur->bc_nlevels;
+ __entry->nr_this_level = nr_this_level;
+ __entry->nr_per_block = nr_per_block;
+ __entry->desired_npb = desired_npb;
+ __entry->blocks = blocks;
+ __entry->blocks_with_extra = blocks_with_extra;
+ ),
+ TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->level,
+ __entry->nlevels,
+ __entry->nr_this_level,
+ __entry->nr_per_block,
+ __entry->desired_npb,
+ __entry->blocks,
+ __entry->blocks_with_extra)
+)
+
+TRACE_EVENT(xfs_btree_bload_block,
+ TP_PROTO(struct xfs_btree_cur *cur, unsigned int level,
+ uint64_t block_idx, uint64_t nr_blocks,
+ union xfs_btree_ptr *ptr, unsigned int nr_records),
+ TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(unsigned int, level)
+ __field(unsigned long long, block_idx)
+ __field(unsigned long long, nr_blocks)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(unsigned int, nr_records)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->block_idx = block_idx;
+ __entry->nr_blocks = nr_blocks;
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ xfs_fsblock_t fsb = be64_to_cpu(ptr->l);
+
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb);
+ __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb);
+ } else {
+ __entry->agno = cur->bc_ag.agno;
+ __entry->agbno = be32_to_cpu(ptr->s);
+ }
+ __entry->nr_records = nr_records;
+ ),
+ TP_printk("dev %d:%d btree %s level %u block %llu/%llu fsb (%u/%u) recs %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->level,
+ __entry->block_idx,
+ __entry->nr_blocks,
+ __entry->agno,
+ __entry->agbno,
+ __entry->nr_records)
+)
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 3b208f9a865c..28b983ff8b11 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -9,6 +9,7 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
+#include "xfs_log_priv.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_extent_busy.h"
@@ -150,8 +151,9 @@ xfs_trans_reserve(
uint blocks,
uint rtextents)
{
- int error = 0;
- bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+ struct xfs_mount *mp = tp->t_mountp;
+ int error = 0;
+ bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
/* Mark this thread as being in a transaction */
current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
@@ -162,7 +164,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (blocks > 0) {
- error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
+ error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
if (error != 0) {
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return -ENOSPC;
@@ -191,9 +193,9 @@ xfs_trans_reserve(
if (tp->t_ticket != NULL) {
ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
- error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
+ error = xfs_log_regrant(mp, tp->t_ticket);
} else {
- error = xfs_log_reserve(tp->t_mountp,
+ error = xfs_log_reserve(mp,
resp->tr_logres,
resp->tr_logcount,
&tp->t_ticket, XFS_TRANSACTION,
@@ -213,7 +215,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (rtextents > 0) {
- error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
+ error = xfs_mod_frextents(mp, -((int64_t)rtextents));
if (error) {
error = -ENOSPC;
goto undo_log;
@@ -229,7 +231,7 @@ xfs_trans_reserve(
*/
undo_log:
if (resp->tr_logres > 0) {
- xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
tp->t_log_res = 0;
tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
@@ -237,7 +239,7 @@ undo_log:
undo_blocks:
if (blocks > 0) {
- xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
+ xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
tp->t_blk_res = 0;
}
@@ -306,6 +308,11 @@ xfs_trans_alloc(
*
* Note the zero-length reservation; this transaction MUST be cancelled
* without any dirty data.
+ *
+ * Callers should obtain freeze protection to avoid two conflicts with fs
+ * freezing: (1) having active transactions trip the m_active_trans ASSERTs;
+ * and (2) grabbing buffers at the same time that freeze is trying to drain
+ * the buffer LRU list.
*/
int
xfs_trans_alloc_empty(
@@ -450,7 +457,7 @@ xfs_trans_apply_sb_deltas(
int whole = 0;
bp = xfs_trans_getsb(tp, tp->t_mountp);
- sbp = XFS_BUF_TO_SBP(bp);
+ sbp = bp->b_addr;
/*
* Check that superblock mods match the mods made to AGF counters.
@@ -999,9 +1006,10 @@ out_unreserve:
*/
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
- if (commit_lsn == -1 && !error)
- error = -EIO;
+ if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log))
+ xfs_log_ticket_regrant(mp->m_log, tp->t_ticket);
+ else
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
@@ -1060,7 +1068,7 @@ xfs_trans_cancel(
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- xfs_log_done(mp, tp->t_ticket, NULL, false);
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 00cc5b8734be..564253550b75 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -32,6 +32,7 @@ STATIC void
xfs_ail_check(
struct xfs_ail *ailp,
struct xfs_log_item *lip)
+ __must_hold(&ailp->ail_lock)
{
struct xfs_log_item *prev_lip;
struct xfs_log_item *next_lip;
@@ -108,17 +109,25 @@ xfs_ail_next(
* We need the AIL lock in order to get a coherent read of the lsn of the last
* item in the AIL.
*/
+static xfs_lsn_t
+__xfs_ail_min_lsn(
+ struct xfs_ail *ailp)
+{
+ struct xfs_log_item *lip = xfs_ail_min(ailp);
+
+ if (lip)
+ return lip->li_lsn;
+ return 0;
+}
+
xfs_lsn_t
xfs_ail_min_lsn(
struct xfs_ail *ailp)
{
- xfs_lsn_t lsn = 0;
- struct xfs_log_item *lip;
+ xfs_lsn_t lsn;
spin_lock(&ailp->ail_lock);
- lip = xfs_ail_min(ailp);
- if (lip)
- lsn = lip->li_lsn;
+ lsn = __xfs_ail_min_lsn(ailp);
spin_unlock(&ailp->ail_lock);
return lsn;
@@ -529,8 +538,9 @@ xfsaild(
{
struct xfs_ail *ailp = data;
long tout = 0; /* milliseconds */
+ unsigned int noreclaim_flag;
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
set_freezable();
while (1) {
@@ -601,6 +611,7 @@ xfsaild(
tout = xfsaild_push(ailp);
}
+ memalloc_noreclaim_restore(noreclaim_flag);
return 0;
}
@@ -678,6 +689,28 @@ xfs_ail_push_all_sync(
finish_wait(&ailp->ail_empty, &wait);
}
+void
+xfs_ail_update_finish(
+ struct xfs_ail *ailp,
+ xfs_lsn_t old_lsn) __releases(ailp->ail_lock)
+{
+ struct xfs_mount *mp = ailp->ail_mount;
+
+ /* if the tail lsn hasn't changed, don't do updates or wakeups. */
+ if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
+ spin_unlock(&ailp->ail_lock);
+ return;
+ }
+
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xlog_assign_tail_lsn_locked(mp);
+
+ if (list_empty(&ailp->ail_head))
+ wake_up_all(&ailp->ail_empty);
+ spin_unlock(&ailp->ail_lock);
+ xfs_log_space_wake(mp);
+}
+
/*
* xfs_trans_ail_update - bulk AIL insertion operation.
*
@@ -709,7 +742,7 @@ xfs_trans_ail_update_bulk(
xfs_lsn_t lsn) __releases(ailp->ail_lock)
{
struct xfs_log_item *mlip;
- int mlip_changed = 0;
+ xfs_lsn_t tail_lsn = 0;
int i;
LIST_HEAD(tmp);
@@ -724,9 +757,10 @@ xfs_trans_ail_update_bulk(
continue;
trace_xfs_ail_move(lip, lip->li_lsn, lsn);
+ if (mlip == lip && !tail_lsn)
+ tail_lsn = lip->li_lsn;
+
xfs_ail_delete(ailp, lip);
- if (mlip == lip)
- mlip_changed = 1;
} else {
trace_xfs_ail_insert(lip, 0, lsn);
}
@@ -737,23 +771,23 @@ xfs_trans_ail_update_bulk(
if (!list_empty(&tmp))
xfs_ail_splice(ailp, cur, &tmp, lsn);
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
- xlog_assign_tail_lsn_locked(ailp->ail_mount);
- spin_unlock(&ailp->ail_lock);
-
- xfs_log_space_wake(ailp->ail_mount);
- } else {
- spin_unlock(&ailp->ail_lock);
- }
+ xfs_ail_update_finish(ailp, tail_lsn);
}
-bool
+/*
+ * Delete one log item from the AIL.
+ *
+ * If this item was at the tail of the AIL, return the LSN of the log item so
+ * that we can use it to check if the LSN of the tail of the log has moved
+ * when finishing up the AIL delete process in xfs_ail_update_finish().
+ */
+xfs_lsn_t
xfs_ail_delete_one(
struct xfs_ail *ailp,
struct xfs_log_item *lip)
{
struct xfs_log_item *mlip = xfs_ail_min(ailp);
+ xfs_lsn_t lsn = lip->li_lsn;
trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
@@ -761,7 +795,9 @@ xfs_ail_delete_one(
clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
lip->li_lsn = 0;
- return mlip == lip;
+ if (mlip == lip)
+ return lsn;
+ return 0;
}
/**
@@ -789,10 +825,10 @@ void
xfs_trans_ail_delete(
struct xfs_ail *ailp,
struct xfs_log_item *lip,
- int shutdown_type) __releases(ailp->ail_lock)
+ int shutdown_type)
{
struct xfs_mount *mp = ailp->ail_mount;
- bool mlip_changed;
+ xfs_lsn_t tail_lsn;
if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
@@ -805,17 +841,8 @@ xfs_trans_ail_delete(
return;
}
- mlip_changed = xfs_ail_delete_one(ailp, lip);
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(mp))
- xlog_assign_tail_lsn_locked(mp);
- if (list_empty(&ailp->ail_head))
- wake_up_all(&ailp->ail_empty);
- }
-
- spin_unlock(&ailp->ail_lock);
- if (mlip_changed)
- xfs_log_space_wake(ailp->ail_mount);
+ tail_lsn = xfs_ail_delete_one(ailp, lip);
+ xfs_ail_update_finish(ailp, tail_lsn);
}
int
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 2e073c1c4614..35655eac01a6 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -91,9 +91,11 @@ xfs_trans_ail_update(
xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
-bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn)
+ __releases(ailp->ail_lock);
void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
- int shutdown_type) __releases(ailp->ail_lock);
+ int shutdown_type);
static inline void
xfs_trans_ail_remove(
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index b0fedb543f97..fc5d7276026e 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -12,53 +12,30 @@
#include "xfs_inode.h"
#include "xfs_attr.h"
#include "xfs_acl.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
static int
xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
struct inode *inode, const char *name, void *value, size_t size)
{
- int xflags = handler->flags;
- struct xfs_inode *ip = XFS_I(inode);
- int error, asize = size;
- size_t namelen = strlen(name);
-
- /* Convert Linux syscall to XFS internal ATTR flags */
- if (!size) {
- xflags |= ATTR_KERNOVAL;
- value = NULL;
- }
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = handler->flags,
+ .name = name,
+ .namelen = strlen(name),
+ .value = value,
+ .valuelen = size,
+ };
+ int error;
- error = xfs_attr_get(ip, name, namelen, (unsigned char **)&value,
- &asize, xflags);
+ error = xfs_attr_get(&args);
if (error)
return error;
- return asize;
-}
-
-void
-xfs_forget_acl(
- struct inode *inode,
- const char *name,
- int xflags)
-{
- /*
- * Invalidate any cached ACLs if the user has bypassed the ACL
- * interface. We don't validate the content whatsoever so it is caller
- * responsibility to provide data in valid format and ensure i_mode is
- * consistent.
- */
- if (xflags & ATTR_ROOT) {
-#ifdef CONFIG_XFS_POSIX_ACL
- if (!strcmp(name, SGI_ACL_FILE))
- forget_cached_acl(inode, ACL_TYPE_ACCESS);
- else if (!strcmp(name, SGI_ACL_DEFAULT))
- forget_cached_acl(inode, ACL_TYPE_DEFAULT);
-#endif
- }
+ return args.valuelen;
}
static int
@@ -66,25 +43,20 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused,
struct inode *inode, const char *name, const void *value,
size_t size, int flags)
{
- int xflags = handler->flags;
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = handler->flags,
+ .attr_flags = flags,
+ .name = name,
+ .namelen = strlen(name),
+ .value = (void *)value,
+ .valuelen = size,
+ };
int error;
- size_t namelen = strlen(name);
-
- /* Convert Linux syscall to XFS internal ATTR flags */
- if (flags & XATTR_CREATE)
- xflags |= ATTR_CREATE;
- if (flags & XATTR_REPLACE)
- xflags |= ATTR_REPLACE;
-
- if (value)
- error = xfs_attr_set(ip, name, namelen, (void *)value, size,
- xflags);
- else
- error = xfs_attr_remove(ip, name, namelen, xflags);
- if (!error)
- xfs_forget_acl(inode, name, xflags);
+ error = xfs_attr_set(&args);
+ if (!error && (handler->flags & XFS_ATTR_ROOT))
+ xfs_forget_acl(inode, name);
return error;
}
@@ -97,14 +69,14 @@ static const struct xattr_handler xfs_xattr_user_handler = {
static const struct xattr_handler xfs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .flags = ATTR_ROOT,
+ .flags = XFS_ATTR_ROOT,
.get = xfs_xattr_get,
.set = xfs_xattr_set,
};
static const struct xattr_handler xfs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .flags = ATTR_SECURE,
+ .flags = XFS_ATTR_SECURE,
.get = xfs_xattr_get,
.set = xfs_xattr_set,
};
@@ -134,7 +106,7 @@ __xfs_xattr_put_listent(
if (context->count < 0 || context->seen_enough)
return;
- if (!context->alist)
+ if (!context->buffer)
goto compute_size;
arraytop = context->count + prefix_len + namelen + 1;
@@ -143,7 +115,7 @@ __xfs_xattr_put_listent(
context->seen_enough = 1;
return;
}
- offset = (char *)context->alist + context->count;
+ offset = context->buffer + context->count;
strncpy(offset, prefix, prefix_len);
offset += prefix_len;
strncpy(offset, (char *)name, namelen); /* real name */
@@ -218,7 +190,6 @@ xfs_vn_listxattr(
size_t size)
{
struct xfs_attr_list_context context;
- struct attrlist_cursor_kern cursor = { 0 };
struct inode *inode = d_inode(dentry);
int error;
@@ -227,14 +198,13 @@ xfs_vn_listxattr(
*/
memset(&context, 0, sizeof(context));
context.dp = XFS_I(inode);
- context.cursor = &cursor;
context.resynch = 1;
- context.alist = size ? data : NULL;
+ context.buffer = size ? data : NULL;
context.bufsize = size;
context.firstu = context.bufsize;
context.put_listent = xfs_xattr_put_listent;
- error = xfs_attr_list_int(&context);
+ error = xfs_attr_list(&context);
if (error)
return error;
if (context.count < 0)
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 69aee3dfb660..3ce9829a6936 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -178,7 +178,8 @@ static void zonefs_update_stats(struct inode *inode, loff_t new_isize)
* amount of readable data in the zone.
*/
static loff_t zonefs_check_zone_condition(struct inode *inode,
- struct blk_zone *zone, bool warn)
+ struct blk_zone *zone, bool warn,
+ bool mount)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
@@ -196,13 +197,26 @@ static loff_t zonefs_check_zone_condition(struct inode *inode,
zone->wp = zone->start;
return 0;
case BLK_ZONE_COND_READONLY:
- /* Do not allow writes in read-only zones */
+ /*
+ * The write pointer of read-only zones is invalid. If such a
+ * zone is found during mount, the file size cannot be retrieved
+ * so we treat the zone as offline (mount == true case).
+ * Otherwise, keep the file size as it was when last updated
+ * so that the user can recover data. In both cases, writes are
+ * always disabled for the zone.
+ */
if (warn)
zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n",
inode->i_ino);
inode->i_flags |= S_IMMUTABLE;
+ if (mount) {
+ zone->cond = BLK_ZONE_COND_OFFLINE;
+ inode->i_mode &= ~0777;
+ zone->wp = zone->start;
+ return 0;
+ }
inode->i_mode &= ~0222;
- /* fallthrough */
+ return i_size_read(inode);
default:
if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
return zi->i_max_size;
@@ -231,7 +245,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* as there is no inconsistency between the inode size and the amount of
* data writen in the zone (data_size).
*/
- data_size = zonefs_check_zone_condition(inode, zone, true);
+ data_size = zonefs_check_zone_condition(inode, zone, true, false);
isize = i_size_read(inode);
if (zone->cond != BLK_ZONE_COND_OFFLINE &&
zone->cond != BLK_ZONE_COND_READONLY &&
@@ -274,7 +288,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
if (zone->cond != BLK_ZONE_COND_OFFLINE) {
zone->cond = BLK_ZONE_COND_OFFLINE;
data_size = zonefs_check_zone_condition(inode, zone,
- false);
+ false, false);
}
} else if (zone->cond == BLK_ZONE_COND_READONLY ||
sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) {
@@ -283,7 +297,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
if (zone->cond != BLK_ZONE_COND_READONLY) {
zone->cond = BLK_ZONE_COND_READONLY;
data_size = zonefs_check_zone_condition(inode, zone,
- false);
+ false, false);
}
}
@@ -975,7 +989,7 @@ static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
zi->i_zsector = zone->start;
zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE,
zone->len << SECTOR_SHIFT);
- zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true);
+ zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true, true);
inode->i_uid = sbi->s_uid;
inode->i_gid = sbi->s_gid;