summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c2
-rw-r--r--fs/9p/vfs_inode_dotl.c4
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Kconfig.binfmt3
-rw-r--r--fs/Makefile3
-rw-r--r--fs/affs/affs.h2
-rw-r--r--fs/affs/amigaffs.c30
-rw-r--r--fs/affs/file.c76
-rw-r--r--fs/affs/inode.c4
-rw-r--r--fs/affs/namei.c40
-rw-r--r--fs/afs/dir.c80
-rw-r--r--fs/afs/inode.c4
-rw-r--r--fs/afs/mntpt.c22
-rw-r--r--fs/afs/rxrpc.c14
-rw-r--r--fs/afs/write.c9
-rw-r--r--fs/aio.c54
-rw-r--r--fs/autofs4/expire.c42
-rw-r--r--fs/autofs4/root.c25
-rw-r--r--fs/befs/linuxvfs.c20
-rw-r--r--fs/binfmt_aout.c8
-rw-r--r--fs/binfmt_elf.c278
-rw-r--r--fs/binfmt_em86.c4
-rw-r--r--fs/binfmt_misc.c397
-rw-r--r--fs/binfmt_script.c10
-rw-r--r--fs/block_dev.c13
-rw-r--r--fs/btrfs/check-integrity.c163
-rw-r--r--fs/btrfs/compression.c51
-rw-r--r--fs/btrfs/compression.h4
-rw-r--r--fs/btrfs/ctree.c16
-rw-r--r--fs/btrfs/ctree.h91
-rw-r--r--fs/btrfs/dev-replace.c32
-rw-r--r--fs/btrfs/dir-item.c10
-rw-r--r--fs/btrfs/disk-io.c98
-rw-r--r--fs/btrfs/extent-tree.c252
-rw-r--r--fs/btrfs/extent_io.c41
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/file-item.c2
-rw-r--r--fs/btrfs/file.c51
-rw-r--r--fs/btrfs/free-space-cache.c123
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/inode-map.c4
-rw-r--r--fs/btrfs/inode.c154
-rw-r--r--fs/btrfs/ioctl.c58
-rw-r--r--fs/btrfs/locking.c24
-rw-r--r--fs/btrfs/locking.h2
-rw-r--r--fs/btrfs/lzo.c15
-rw-r--r--fs/btrfs/ordered-data.c49
-rw-r--r--fs/btrfs/ordered-data.h12
-rw-r--r--fs/btrfs/raid56.c763
-rw-r--r--fs/btrfs/raid56.h16
-rw-r--r--fs/btrfs/scrub.c893
-rw-r--r--fs/btrfs/send.c49
-rw-r--r--fs/btrfs/super.c95
-rw-r--r--fs/btrfs/sysfs.c34
-rw-r--r--fs/btrfs/transaction.c166
-rw-r--r--fs/btrfs/transaction.h6
-rw-r--r--fs/btrfs/tree-log.c52
-rw-r--r--fs/btrfs/volumes.c90
-rw-r--r--fs/btrfs/volumes.h32
-rw-r--r--fs/btrfs/xattr.c150
-rw-r--r--fs/btrfs/zlib.c20
-rw-r--r--fs/buffer.c38
-rw-r--r--fs/cachefiles/namei.c21
-rw-r--r--fs/cachefiles/xattr.c15
-rw-r--r--fs/ceph/addr.c273
-rw-r--r--fs/ceph/caps.c134
-rw-r--r--fs/ceph/debugfs.c14
-rw-r--r--fs/ceph/dir.c83
-rw-r--r--fs/ceph/file.c103
-rw-r--r--fs/ceph/inode.c77
-rw-r--r--fs/ceph/locks.c64
-rw-r--r--fs/ceph/mds_client.c41
-rw-r--r--fs/ceph/mds_client.h10
-rw-r--r--fs/ceph/snap.c37
-rw-r--r--fs/ceph/super.c16
-rw-r--r--fs/ceph/super.h55
-rw-r--r--fs/ceph/xattr.c7
-rw-r--r--fs/char_dev.c1
-rw-r--r--fs/cifs/cifs_debug.c77
-rw-r--r--fs/cifs/cifs_debug.h7
-rw-r--r--fs/cifs/cifsacl.c2
-rw-r--r--fs/cifs/cifsfs.c3
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h6
-rw-r--r--fs/cifs/cifssmb.c20
-rw-r--r--fs/cifs/connect.c51
-rw-r--r--fs/cifs/file.c16
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/cifs/misc.c32
-rw-r--r--fs/cifs/readdir.c14
-rw-r--r--fs/cifs/sess.c7
-rw-r--r--fs/cifs/smb2file.c4
-rw-r--r--fs/cifs/smb2misc.c38
-rw-r--r--fs/cifs/smb2ops.c65
-rw-r--r--fs/cifs/smb2pdu.c5
-rw-r--r--fs/cifs/smb2pdu.h47
-rw-r--r--fs/cifs/transport.c4
-rw-r--r--fs/coda/cache.c2
-rw-r--r--fs/coda/coda_linux.c6
-rw-r--r--fs/coda/coda_linux.h1
-rw-r--r--fs/coda/dir.c16
-rw-r--r--fs/compat.c21
-rw-r--r--fs/configfs/dir.c2
-rw-r--r--fs/dcache.c270
-rw-r--r--fs/debugfs/file.c69
-rw-r--r--fs/debugfs/inode.c2
-rw-r--r--fs/dlm/debug_fs.c263
-rw-r--r--fs/dlm/lock.c76
-rw-r--r--fs/dlm/lock.h3
-rw-r--r--fs/dlm/user.c13
-rw-r--r--fs/drop_caches.c11
-rw-r--r--fs/ecryptfs/crypto.c2
-rw-r--r--fs/ecryptfs/file.c6
-rw-r--r--fs/ecryptfs/main.c7
-rw-r--r--fs/ecryptfs/mmap.c2
-rw-r--r--fs/efivarfs/file.c4
-rw-r--r--fs/efivarfs/super.c11
-rw-r--r--fs/eventfd.c9
-rw-r--r--fs/eventpoll.c13
-rw-r--r--fs/exec.c114
-rw-r--r--fs/exportfs/expfs.c7
-rw-r--r--fs/ext2/ext2.h3
-rw-r--r--fs/ext2/super.c10
-rw-r--r--fs/ext3/ext3.h4
-rw-r--r--fs/ext3/super.c17
-rw-r--r--fs/ext4/ext4.h45
-rw-r--r--fs/ext4/extents.c232
-rw-r--r--fs/ext4/extents_status.c321
-rw-r--r--fs/ext4/extents_status.h82
-rw-r--r--fs/ext4/file.c222
-rw-r--r--fs/ext4/ialloc.c4
-rw-r--r--fs/ext4/inline.c35
-rw-r--r--fs/ext4/inode.c44
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/mballoc.c15
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/move_extent.c8
-rw-r--r--fs/ext4/namei.c124
-rw-r--r--fs/ext4/resize.c8
-rw-r--r--fs/ext4/super.c76
-rw-r--r--fs/f2fs/acl.c148
-rw-r--r--fs/f2fs/acl.h5
-rw-r--r--fs/f2fs/checkpoint.c186
-rw-r--r--fs/f2fs/data.c166
-rw-r--r--fs/f2fs/debug.c15
-rw-r--r--fs/f2fs/dir.c308
-rw-r--r--fs/f2fs/f2fs.h176
-rw-r--r--fs/f2fs/file.c212
-rw-r--r--fs/f2fs/gc.c89
-rw-r--r--fs/f2fs/gc.h5
-rw-r--r--fs/f2fs/inline.c482
-rw-r--r--fs/f2fs/inode.c44
-rw-r--r--fs/f2fs/namei.c58
-rw-r--r--fs/f2fs/node.c163
-rw-r--r--fs/f2fs/node.h8
-rw-r--r--fs/f2fs/recovery.c14
-rw-r--r--fs/f2fs/segment.c122
-rw-r--r--fs/f2fs/segment.h8
-rw-r--r--fs/f2fs/super.c29
-rw-r--r--fs/f2fs/xattr.c6
-rw-r--r--fs/f2fs/xattr.h6
-rw-r--r--fs/fat/dir.c5
-rw-r--r--fs/fat/fat.h1
-rw-r--r--fs/fat/file.c3
-rw-r--r--fs/fat/inode.c12
-rw-r--r--fs/fat/namei_vfat.c20
-rw-r--r--fs/file.c2
-rw-r--r--fs/fs-writeback.c29
-rw-r--r--fs/fuse/cuse.c2
-rw-r--r--fs/fuse/dev.c29
-rw-r--r--fs/fuse/dir.c542
-rw-r--r--fs/fuse/file.c232
-rw-r--r--fs/fuse/fuse_i.h45
-rw-r--r--fs/fuse/inode.c39
-rw-r--r--fs/gfs2/dir.c39
-rw-r--r--fs/gfs2/export.c8
-rw-r--r--fs/gfs2/file.c83
-rw-r--r--fs/gfs2/glock.c3
-rw-r--r--fs/gfs2/glops.c26
-rw-r--r--fs/gfs2/glops.h2
-rw-r--r--fs/gfs2/incore.h19
-rw-r--r--fs/gfs2/inode.c72
-rw-r--r--fs/gfs2/log.c42
-rw-r--r--fs/gfs2/main.c11
-rw-r--r--fs/gfs2/ops_fstype.c19
-rw-r--r--fs/gfs2/quota.c9
-rw-r--r--fs/gfs2/rgrp.c69
-rw-r--r--fs/gfs2/rgrp.h1
-rw-r--r--fs/gfs2/super.c112
-rw-r--r--fs/gfs2/super.h1
-rw-r--r--fs/gfs2/trans.c17
-rw-r--r--fs/hfs/catalog.c14
-rw-r--r--fs/hfsplus/catalog.c89
-rw-r--r--fs/hfsplus/dir.c11
-rw-r--r--fs/hfsplus/hfsplus_fs.h4
-rw-r--r--fs/hfsplus/super.c4
-rw-r--r--fs/hppfs/hppfs.c5
-rw-r--r--fs/hugetlbfs/inode.c14
-rw-r--r--fs/inode.c16
-rw-r--r--fs/internal.h12
-rw-r--r--fs/ioctl.c8
-rw-r--r--fs/isofs/inode.c64
-rw-r--r--fs/isofs/namei.c22
-rw-r--r--fs/isofs/rock.c6
-rw-r--r--fs/jbd/journal.c3
-rw-r--r--fs/jbd/revoke.c7
-rw-r--r--fs/jbd2/journal.c8
-rw-r--r--fs/jbd2/revoke.c10
-rw-r--r--fs/jffs2/readinode.c2
-rw-r--r--fs/jffs2/summary.c1
-rw-r--r--fs/jfs/jfs_incore.h3
-rw-r--r--fs/jfs/namei.c18
-rw-r--r--fs/jfs/super.c9
-rw-r--r--fs/kernfs/dir.c2
-rw-r--r--fs/kernfs/file.c73
-rw-r--r--fs/libfs.c12
-rw-r--r--fs/lockd/mon.c2
-rw-r--r--fs/lockd/svc.c2
-rw-r--r--fs/lockd/svclock.c2
-rw-r--r--fs/lockd/svcsubs.c2
-rw-r--r--fs/mount.h3
-rw-r--r--fs/namei.c146
-rw-r--r--fs/namespace.c98
-rw-r--r--fs/ncpfs/dir.c12
-rw-r--r--fs/ncpfs/file.c14
-rw-r--r--fs/ncpfs/ioctl.c1
-rw-r--r--fs/ncpfs/mmap.c4
-rw-r--r--fs/ncpfs/ncplib_kernel.h4
-rw-r--r--fs/nfs/blocklayout/blocklayout.c4
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c16
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/delegation.c25
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c7
-rw-r--r--fs/nfs/direct.c1
-rw-r--r--fs/nfs/filelayout/filelayout.c3
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c3
-rw-r--r--fs/nfs/fscache.c24
-rw-r--r--fs/nfs/getroot.c4
-rw-r--r--fs/nfs/inode.c11
-rw-r--r--fs/nfs/iostat.h5
-rw-r--r--fs/nfs/netns.h1
-rw-r--r--fs/nfs/nfs42.h2
-rw-r--r--fs/nfs/nfs42proc.c77
-rw-r--r--fs/nfs/nfs42xdr.c139
-rw-r--r--fs/nfs/nfs4_fs.h1
-rw-r--r--fs/nfs/nfs4client.c46
-rw-r--r--fs/nfs/nfs4file.c31
-rw-r--r--fs/nfs/nfs4proc.c107
-rw-r--r--fs/nfs/nfs4xdr.c12
-rw-r--r--fs/nfs/pagelist.c11
-rw-r--r--fs/nfs/read.c2
-rw-r--r--fs/nfs/write.c21
-rw-r--r--fs/nfsd/nfs4callback.c8
-rw-r--r--fs/nfsd/nfs4proc.c64
-rw-r--r--fs/nfsd/nfs4recover.c7
-rw-r--r--fs/nfsd/nfs4state.c74
-rw-r--r--fs/nfsd/nfs4xdr.c36
-rw-r--r--fs/nfsd/nfscache.c4
-rw-r--r--fs/nfsd/nfsctl.c43
-rw-r--r--fs/nfsd/nfsd.h9
-rw-r--r--fs/nfsd/nfsfh.c2
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nfsd/state.h19
-rw-r--r--fs/nfsd/vfs.c53
-rw-r--r--fs/nfsd/vfs.h6
-rw-r--r--fs/nfsd/xdr4.h9
-rw-r--r--fs/nilfs2/file.c10
-rw-r--r--fs/nilfs2/inode.c32
-rw-r--r--fs/nilfs2/namei.c15
-rw-r--r--fs/nilfs2/the_nilfs.c3
-rw-r--r--fs/notify/dnotify/dnotify.c4
-rw-r--r--fs/notify/fdinfo.c84
-rw-r--r--fs/notify/fdinfo.h4
-rw-r--r--fs/notify/fsnotify.c44
-rw-r--r--fs/notify/fsnotify.h16
-rw-r--r--fs/notify/inode_mark.c132
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c2
-rw-r--r--fs/notify/inotify/inotify_user.c19
-rw-r--r--fs/notify/mark.c133
-rw-r--r--fs/notify/vfsmount_mark.c111
-rw-r--r--fs/nsfs.c161
-rw-r--r--fs/ntfs/namei.c4
-rw-r--r--fs/ocfs2/alloc.c28
-rw-r--r--fs/ocfs2/alloc.h2
-rw-r--r--fs/ocfs2/aops.c18
-rw-r--r--fs/ocfs2/cluster/heartbeat.c4
-rw-r--r--fs/ocfs2/cluster/tcp.c4
-rw-r--r--fs/ocfs2/dcache.c20
-rw-r--r--fs/ocfs2/dir.c12
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c24
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c18
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c4
-rw-r--r--fs/ocfs2/dlmglue.c40
-rw-r--r--fs/ocfs2/file.c6
-rw-r--r--fs/ocfs2/inode.c3
-rw-r--r--fs/ocfs2/inode.h2
-rw-r--r--fs/ocfs2/journal.c8
-rw-r--r--fs/ocfs2/move_extents.c3
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/ocfs2.h6
-rw-r--r--fs/ocfs2/slot_map.c2
-rw-r--r--fs/ocfs2/super.c11
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/open.c43
-rw-r--r--fs/overlayfs/Kconfig10
-rw-r--r--fs/overlayfs/Makefile7
-rw-r--r--fs/overlayfs/copy_up.c414
-rw-r--r--fs/overlayfs/dir.c928
-rw-r--r--fs/overlayfs/inode.c434
-rw-r--r--fs/overlayfs/overlayfs.h191
-rw-r--r--fs/overlayfs/readdir.c588
-rw-r--r--fs/overlayfs/super.c833
-rw-r--r--fs/pnode.c1
-rw-r--r--fs/proc/array.c47
-rw-r--r--fs/proc/base.c60
-rw-r--r--fs/proc/fd.c3
-rw-r--r--fs/proc/generic.c163
-rw-r--r--fs/proc/inode.c10
-rw-r--r--fs/proc/internal.h13
-rw-r--r--fs/proc/meminfo.c15
-rw-r--r--fs/proc/namespaces.c153
-rw-r--r--fs/proc/proc_net.c1
-rw-r--r--fs/proc/root.c1
-rw-r--r--fs/proc/stat.c2
-rw-r--r--fs/proc/task_mmu.c107
-rw-r--r--fs/pstore/inode.c22
-rw-r--r--fs/pstore/ram.c36
-rw-r--r--fs/pstore/ram_core.c31
-rw-r--r--fs/quota/dquot.c61
-rw-r--r--fs/quota/quota.c13
-rw-r--r--fs/read_write.c24
-rw-r--r--fs/readdir.c21
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/reiserfs/reiserfs.h4
-rw-r--r--fs/reiserfs/super.c14
-rw-r--r--fs/reiserfs/xattr.c21
-rw-r--r--fs/seq_file.c21
-rw-r--r--fs/signalfd.c4
-rw-r--r--fs/splice.c1
-rw-r--r--fs/squashfs/Kconfig15
-rw-r--r--fs/squashfs/Makefile1
-rw-r--r--fs/squashfs/decompressor.c7
-rw-r--r--fs/squashfs/decompressor.h4
-rw-r--r--fs/squashfs/lz4_wrapper.c142
-rw-r--r--fs/squashfs/squashfs_fs.h1
-rw-r--r--fs/sync.c2
-rw-r--r--fs/sysfs/file.c59
-rw-r--r--fs/timerfd.c27
-rw-r--r--fs/ubifs/file.c1
-rw-r--r--fs/ubifs/journal.c7
-rw-r--r--fs/udf/super.c11
-rw-r--r--fs/xattr.c16
-rw-r--r--fs/xfs/libxfs/xfs_ag.h281
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c1
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h3
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c1
-rw-r--r--fs/xfs/libxfs/xfs_attr.c3
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c2
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c77
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c3
-rw-r--r--fs/xfs/libxfs/xfs_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_da_format.c2
-rw-r--r--fs/xfs/libxfs/xfs_dinode.h243
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c20
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h140
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c11
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c12
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c14
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h140
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c13
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c2
-rw-r--r--fs/xfs/libxfs/xfs_format.h1107
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c43
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c3
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c4
-rw-r--r--fs/xfs/libxfs/xfs_inum.h60
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h2
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c2
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c3
-rw-r--r--fs/xfs/libxfs/xfs_sb.c2
-rw-r--r--fs/xfs/libxfs/xfs_sb.h584
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c2
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_acl.h36
-rw-r--r--fs/xfs/xfs_aops.c3
-rw-r--r--fs/xfs/xfs_attr_inactive.c3
-rw-r--r--fs/xfs/xfs_attr_list.c3
-rw-r--r--fs/xfs/xfs_bmap_util.c75
-rw-r--r--fs/xfs/xfs_buf.c27
-rw-r--r--fs/xfs/xfs_buf.h3
-rw-r--r--fs/xfs/xfs_buf_item.c2
-rw-r--r--fs/xfs/xfs_dir2_readdir.c21
-rw-r--r--fs/xfs/xfs_discard.c1
-rw-r--r--fs/xfs/xfs_dquot.c2
-rw-r--r--fs/xfs/xfs_dquot_item.c2
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_export.c3
-rw-r--r--fs/xfs/xfs_extent_busy.c1
-rw-r--r--fs/xfs/xfs_extfree_item.c3
-rw-r--r--fs/xfs/xfs_file.c9
-rw-r--r--fs/xfs/xfs_filestream.c3
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_icache.c4
-rw-r--r--fs/xfs/xfs_icache.h8
-rw-r--r--fs/xfs/xfs_icreate_item.c3
-rw-r--r--fs/xfs/xfs_inode.c29
-rw-r--r--fs/xfs/xfs_inode.h2
-rw-r--r--fs/xfs/xfs_inode_item.c3
-rw-r--r--fs/xfs/xfs_ioctl.c3
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c18
-rw-r--r--fs/xfs/xfs_iops.c5
-rw-r--r--fs/xfs/xfs_itable.c256
-rw-r--r--fs/xfs/xfs_itable.h16
-rw-r--r--fs/xfs/xfs_linux.h6
-rw-r--r--fs/xfs/xfs_log.c8
-rw-r--r--fs/xfs/xfs_log_cil.c3
-rw-r--r--fs/xfs/xfs_log_recover.c4
-rw-r--r--fs/xfs/xfs_message.c3
-rw-r--r--fs/xfs/xfs_mount.c33
-rw-r--r--fs/xfs/xfs_mount.h8
-rw-r--r--fs/xfs/xfs_qm.c14
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c27
-rw-r--r--fs/xfs/xfs_quotaops.c2
-rw-r--r--fs/xfs/xfs_rtalloc.c3
-rw-r--r--fs/xfs/xfs_super.c20
-rw-r--r--fs/xfs/xfs_symlink.c3
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trans.c2
-rw-r--r--fs/xfs/xfs_trans_ail.c3
-rw-r--r--fs/xfs/xfs_trans_buf.c137
-rw-r--r--fs/xfs/xfs_trans_dquot.c2
-rw-r--r--fs/xfs/xfs_trans_extfree.c3
-rw-r--r--fs/xfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/xfs_xattr.c2
445 files changed, 15502 insertions, 7412 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 296482fc77a9..9ee5343d4884 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -832,7 +832,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
* moved b under k and client parallely did a lookup for
* k/b.
*/
- res = d_materialise_unique(dentry, inode);
+ res = d_splice_alias(inode, dentry);
if (!res)
v9fs_fid_add(dentry, fid);
else if (!IS_ERR(res))
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 02b64f4e576a..6054c16b8fae 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -826,8 +826,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
struct dentry *dir_dentry;
struct posix_acl *dacl = NULL, *pacl = NULL;
- p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n",
- dir->i_ino, dentry->d_name.name, omode,
+ p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
+ dir->i_ino, dentry, omode,
MAJOR(rdev), MINOR(rdev));
if (!new_valid_dev(rdev))
diff --git a/fs/Kconfig b/fs/Kconfig
index db5dc1598716..664991afe0c0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -67,6 +67,7 @@ source "fs/quota/Kconfig"
source "fs/autofs4/Kconfig"
source "fs/fuse/Kconfig"
+source "fs/overlayfs/Kconfig"
menu "Caches"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 370b24cee4d8..c055d56ec63d 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -30,6 +30,9 @@ config COMPAT_BINFMT_ELF
config ARCH_BINFMT_ELF_RANDOMIZE_PIE
bool
+config ARCH_BINFMT_ELF_STATE
+ bool
+
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y
diff --git a/fs/Makefile b/fs/Makefile
index 90c88529892b..bedff48e8fdc 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o \
- stack.o fs_struct.o statfs.o fs_pin.o
+ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
@@ -104,6 +104,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/
obj-$(CONFIG_AUTOFS4_FS) += autofs4/
obj-$(CONFIG_ADFS_FS) += adfs/
obj-$(CONFIG_FUSE_FS) += fuse/
+obj-$(CONFIG_OVERLAY_FS) += overlayfs/
obj-$(CONFIG_UDF_FS) += udf/
obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
obj-$(CONFIG_OMFS_FS) += omfs/
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 9bca88159725..ff44ff3ff015 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -135,8 +135,10 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
extern void secs_to_datestamp(time_t secs, struct affs_date *ds);
extern umode_t prot_to_mode(u32 prot);
extern void mode_to_prot(struct inode *inode);
+__printf(3, 4)
extern void affs_error(struct super_block *sb, const char *function,
const char *fmt, ...);
+__printf(3, 4)
extern void affs_warning(struct super_block *sb, const char *function,
const char *fmt, ...);
extern bool affs_nofilenametruncate(const struct dentry *dentry);
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index abc853968fed..c852f2fa1710 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -10,8 +10,6 @@
#include "affs.h"
-static char ErrorBuffer[256];
-
/*
* Functions for accessing Amiga-FFS structures.
*/
@@ -125,7 +123,7 @@ affs_fix_dcache(struct inode *inode, u32 entry_ino)
{
struct dentry *dentry;
spin_lock(&inode->i_lock);
- hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
if (entry_ino == (u32)(long)dentry->d_fsdata) {
dentry->d_fsdata = (void *)inode->i_ino;
break;
@@ -444,30 +442,30 @@ mode_to_prot(struct inode *inode)
void
affs_error(struct super_block *sb, const char *function, const char *fmt, ...)
{
- va_list args;
-
- va_start(args,fmt);
- vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args);
- va_end(args);
+ struct va_format vaf;
+ va_list args;
- pr_crit("error (device %s): %s(): %s\n", sb->s_id,
- function,ErrorBuffer);
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf);
if (!(sb->s_flags & MS_RDONLY))
pr_warn("Remounting filesystem read-only\n");
sb->s_flags |= MS_RDONLY;
+ va_end(args);
}
void
affs_warning(struct super_block *sb, const char *function, const char *fmt, ...)
{
- va_list args;
+ struct va_format vaf;
+ va_list args;
- va_start(args,fmt);
- vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args);
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_warn("(device %s): %s(): %pV\n", sb->s_id, function, &vaf);
va_end(args);
-
- pr_warn("(device %s): %s(): %s\n", sb->s_id,
- function,ErrorBuffer);
}
bool
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 1ed590aafecf..8faa6593ca6d 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -12,35 +12,10 @@
* affs regular file handling primitives
*/
+#include <linux/aio.h>
#include "affs.h"
-#if PAGE_SIZE < 4096
-#error PAGE_SIZE must be at least 4096
-#endif
-
-static int affs_grow_extcache(struct inode *inode, u32 lc_idx);
-static struct buffer_head *affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext);
-static inline struct buffer_head *affs_get_extblock(struct inode *inode, u32 ext);
static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext);
-static int affs_file_open(struct inode *inode, struct file *filp);
-static int affs_file_release(struct inode *inode, struct file *filp);
-
-const struct file_operations affs_file_operations = {
- .llseek = generic_file_llseek,
- .read = new_sync_read,
- .read_iter = generic_file_read_iter,
- .write = new_sync_write,
- .write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
- .open = affs_file_open,
- .release = affs_file_release,
- .fsync = affs_file_fsync,
- .splice_read = generic_file_splice_read,
-};
-
-const struct inode_operations affs_file_inode_operations = {
- .setattr = affs_notify_change,
-};
static int
affs_file_open(struct inode *inode, struct file *filp)
@@ -355,7 +330,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
/* store new block */
if (bh_result->b_blocknr)
- affs_warning(sb, "get_block", "block already set (%x)", bh_result->b_blocknr);
+ affs_warning(sb, "get_block", "block already set (%lx)",
+ (unsigned long)bh_result->b_blocknr);
AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr);
AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1);
affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1);
@@ -377,7 +353,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
return 0;
err_big:
- affs_error(inode->i_sb,"get_block","strange block request %d", block);
+ affs_error(inode->i_sb, "get_block", "strange block request %d",
+ (int)block);
return -EIO;
err_ext:
// unlock cache
@@ -412,6 +389,22 @@ static void affs_write_failed(struct address_space *mapping, loff_t to)
}
}
+static ssize_t
+affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ size_t count = iov_iter_count(iter);
+ ssize_t ret;
+
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block);
+ if (ret < 0 && (rw & WRITE))
+ affs_write_failed(mapping, offset + count);
+ return ret;
+}
+
static int affs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -438,6 +431,7 @@ const struct address_space_operations affs_aops = {
.writepage = affs_writepage,
.write_begin = affs_write_begin,
.write_end = generic_write_end,
+ .direct_IO = affs_direct_IO,
.bmap = _affs_bmap
};
@@ -867,8 +861,9 @@ affs_truncate(struct inode *inode)
// lock cache
ext_bh = affs_get_extblock(inode, ext);
if (IS_ERR(ext_bh)) {
- affs_warning(sb, "truncate", "unexpected read error for ext block %u (%d)",
- ext, PTR_ERR(ext_bh));
+ affs_warning(sb, "truncate",
+ "unexpected read error for ext block %u (%ld)",
+ (unsigned int)ext, PTR_ERR(ext_bh));
return;
}
if (AFFS_I(inode)->i_lc) {
@@ -914,8 +909,9 @@ affs_truncate(struct inode *inode)
struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
u32 tmp;
if (IS_ERR(bh)) {
- affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)",
- ext, PTR_ERR(bh));
+ affs_warning(sb, "truncate",
+ "unexpected read error for last block %u (%ld)",
+ (unsigned int)ext, PTR_ERR(bh));
return;
}
tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
@@ -961,3 +957,19 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
mutex_unlock(&inode->i_mutex);
return ret;
}
+const struct file_operations affs_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
+ .mmap = generic_file_mmap,
+ .open = affs_file_open,
+ .release = affs_file_release,
+ .fsync = affs_file_fsync,
+ .splice_read = generic_file_splice_read,
+};
+
+const struct inode_operations affs_file_inode_operations = {
+ .setattr = affs_notify_change,
+};
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index e217c511459b..d0609a282e1d 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -348,9 +348,9 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
u32 block = 0;
int retval;
- pr_debug("%s(dir=%u, inode=%u, \"%*s\", type=%d)\n",
+ pr_debug("%s(dir=%u, inode=%u, \"%pd\", type=%d)\n",
__func__, (u32)dir->i_ino,
- (u32)inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name, type);
+ (u32)inode->i_ino, dentry, type);
retval = -EIO;
bh = affs_bread(sb, inode->i_ino);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 035bd31556fc..bbc38530e924 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -190,8 +190,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry)
toupper_t toupper = affs_get_toupper(sb);
u32 key;
- pr_debug("%s(\"%.*s\")\n",
- __func__, (int)dentry->d_name.len, dentry->d_name.name);
+ pr_debug("%s(\"%pd\")\n", __func__, dentry);
bh = affs_bread(sb, dir->i_ino);
if (!bh)
@@ -219,8 +218,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
struct buffer_head *bh;
struct inode *inode = NULL;
- pr_debug("%s(\"%.*s\")\n",
- __func__, (int)dentry->d_name.len, dentry->d_name.name);
+ pr_debug("%s(\"%pd\")\n", __func__, dentry);
affs_lock_dir(dir);
bh = affs_find_entry(dir, dentry);
@@ -250,9 +248,9 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
int
affs_unlink(struct inode *dir, struct dentry *dentry)
{
- pr_debug("%s(dir=%d, %lu \"%.*s\")\n",
+ pr_debug("%s(dir=%d, %lu \"%pd\")\n",
__func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
- (int)dentry->d_name.len, dentry->d_name.name);
+ dentry);
return affs_remove_header(dentry);
}
@@ -264,9 +262,8 @@ affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
struct inode *inode;
int error;
- pr_debug("%s(%lu,\"%.*s\",0%ho)\n",
- __func__, dir->i_ino, (int)dentry->d_name.len,
- dentry->d_name.name,mode);
+ pr_debug("%s(%lu,\"%pd\",0%ho)\n",
+ __func__, dir->i_ino, dentry, mode);
inode = affs_new_inode(dir);
if (!inode)
@@ -294,9 +291,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct inode *inode;
int error;
- pr_debug("%s(%lu,\"%.*s\",0%ho)\n",
- __func__, dir->i_ino, (int)dentry->d_name.len,
- dentry->d_name.name, mode);
+ pr_debug("%s(%lu,\"%pd\",0%ho)\n",
+ __func__, dir->i_ino, dentry, mode);
inode = affs_new_inode(dir);
if (!inode)
@@ -321,9 +317,9 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
int
affs_rmdir(struct inode *dir, struct dentry *dentry)
{
- pr_debug("%s(dir=%u, %lu \"%.*s\")\n",
+ pr_debug("%s(dir=%u, %lu \"%pd\")\n",
__func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
- (int)dentry->d_name.len, dentry->d_name.name);
+ dentry);
return affs_remove_header(dentry);
}
@@ -338,9 +334,8 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
int i, maxlen, error;
char c, lc;
- pr_debug("%s(%lu,\"%.*s\" -> \"%s\")\n",
- __func__, dir->i_ino, (int)dentry->d_name.len,
- dentry->d_name.name, symname);
+ pr_debug("%s(%lu,\"%pd\" -> \"%s\")\n",
+ __func__, dir->i_ino, dentry, symname);
maxlen = AFFS_SB(sb)->s_hashsize * sizeof(u32) - 1;
inode = affs_new_inode(dir);
@@ -409,9 +404,9 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
struct inode *inode = old_dentry->d_inode;
- pr_debug("%s(%u, %u, \"%.*s\")\n",
+ pr_debug("%s(%u, %u, \"%pd\")\n",
__func__, (u32)inode->i_ino, (u32)dir->i_ino,
- (int)dentry->d_name.len,dentry->d_name.name);
+ dentry);
return affs_add_entry(dir, inode, dentry, ST_LINKFILE);
}
@@ -424,10 +419,9 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *bh = NULL;
int retval;
- pr_debug("%s(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
- __func__, (u32)old_dir->i_ino, (int)old_dentry->d_name.len,
- old_dentry->d_name.name, (u32)new_dir->i_ino,
- (int)new_dentry->d_name.len, new_dentry->d_name.name);
+ pr_debug("%s(old=%u,\"%pd\" to new=%u,\"%pd\")\n",
+ __func__, (u32)old_dir->i_ino, old_dentry,
+ (u32)new_dir->i_ino, new_dentry);
retval = affs_check_name(new_dentry->d_name.name,
new_dentry->d_name.len,
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index a1645b88fe8a..4ec35e9130e1 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -26,7 +26,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx);
static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
static int afs_d_delete(const struct dentry *dentry);
static void afs_d_release(struct dentry *dentry);
-static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
+static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl);
@@ -391,10 +391,11 @@ static int afs_readdir(struct file *file, struct dir_context *ctx)
* - if afs_dir_iterate_block() spots this function, it'll pass the FID
* uniquifier through dtype
*/
-static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
- loff_t fpos, u64 ino, unsigned dtype)
+static int afs_lookup_filldir(struct dir_context *ctx, const char *name,
+ int nlen, loff_t fpos, u64 ino, unsigned dtype)
{
- struct afs_lookup_cookie *cookie = _cookie;
+ struct afs_lookup_cookie *cookie =
+ container_of(ctx, struct afs_lookup_cookie, ctx);
_enter("{%s,%u},%s,%u,,%llu,%u",
cookie->name.name, cookie->name.len, name, nlen,
@@ -433,7 +434,7 @@ static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
};
int ret;
- _enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name);
+ _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry);
/* search the directory */
ret = afs_dir_iterate(dir, &cookie.ctx, key);
@@ -465,8 +466,8 @@ static struct inode *afs_try_auto_mntpt(
struct afs_vnode *vnode = AFS_FS_I(dir);
struct inode *inode;
- _enter("%d, %p{%s}, {%x:%u}, %p",
- ret, dentry, devname, vnode->fid.vid, vnode->fid.vnode, key);
+ _enter("%d, %p{%pd}, {%x:%u}, %p",
+ ret, dentry, dentry, vnode->fid.vid, vnode->fid.vnode, key);
if (ret != -ENOENT ||
!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
@@ -501,8 +502,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
vnode = AFS_FS_I(dir);
- _enter("{%x:%u},%p{%s},",
- vnode->fid.vid, vnode->fid.vnode, dentry, dentry->d_name.name);
+ _enter("{%x:%u},%p{%pd},",
+ vnode->fid.vid, vnode->fid.vnode, dentry, dentry);
ASSERTCMP(dentry->d_inode, ==, NULL);
@@ -588,11 +589,11 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
vnode = AFS_FS_I(dentry->d_inode);
if (dentry->d_inode)
- _enter("{v={%x:%u} n=%s fl=%lx},",
- vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
+ _enter("{v={%x:%u} n=%pd fl=%lx},",
+ vnode->fid.vid, vnode->fid.vnode, dentry,
vnode->flags);
else
- _enter("{neg n=%s}", dentry->d_name.name);
+ _enter("{neg n=%pd}", dentry);
key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell);
if (IS_ERR(key))
@@ -607,7 +608,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
afs_validate(dir, key);
if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
- _debug("%s: parent dir deleted", dentry->d_name.name);
+ _debug("%pd: parent dir deleted", dentry);
goto out_bad;
}
@@ -625,16 +626,16 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
if (!dentry->d_inode)
goto out_bad;
if (is_bad_inode(dentry->d_inode)) {
- printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n",
- parent->d_name.name, dentry->d_name.name);
+ printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n",
+ dentry);
goto out_bad;
}
/* if the vnode ID has changed, then the dirent points to a
* different file */
if (fid.vnode != vnode->fid.vnode) {
- _debug("%s: dirent changed [%u != %u]",
- dentry->d_name.name, fid.vnode,
+ _debug("%pd: dirent changed [%u != %u]",
+ dentry, fid.vnode,
vnode->fid.vnode);
goto not_found;
}
@@ -643,8 +644,8 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
* been deleted and replaced, and the original vnode ID has
* been reused */
if (fid.unique != vnode->fid.unique) {
- _debug("%s: file deleted (uq %u -> %u I:%u)",
- dentry->d_name.name, fid.unique,
+ _debug("%pd: file deleted (uq %u -> %u I:%u)",
+ dentry, fid.unique,
vnode->fid.unique,
dentry->d_inode->i_generation);
spin_lock(&vnode->lock);
@@ -656,14 +657,14 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
case -ENOENT:
/* the filename is unknown */
- _debug("%s: dirent not found", dentry->d_name.name);
+ _debug("%pd: dirent not found", dentry);
if (dentry->d_inode)
goto not_found;
goto out_valid;
default:
- _debug("failed to iterate dir %s: %d",
- parent->d_name.name, ret);
+ _debug("failed to iterate dir %pd: %d",
+ parent, ret);
goto out_bad;
}
@@ -681,8 +682,7 @@ not_found:
spin_unlock(&dentry->d_lock);
out_bad:
- _debug("dropping dentry %s/%s",
- parent->d_name.name, dentry->d_name.name);
+ _debug("dropping dentry %pd2", dentry);
dput(parent);
key_put(key);
@@ -698,7 +698,7 @@ out_bad:
*/
static int afs_d_delete(const struct dentry *dentry)
{
- _enter("%s", dentry->d_name.name);
+ _enter("%pd", dentry);
if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
goto zap;
@@ -721,7 +721,7 @@ zap:
*/
static void afs_d_release(struct dentry *dentry)
{
- _enter("%s", dentry->d_name.name);
+ _enter("%pd", dentry);
}
/*
@@ -740,8 +740,8 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
dvnode = AFS_FS_I(dir);
- _enter("{%x:%u},{%s},%ho",
- dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
+ _enter("{%x:%u},{%pd},%ho",
+ dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
key = afs_request_key(dvnode->volume->cell);
if (IS_ERR(key)) {
@@ -801,8 +801,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
dvnode = AFS_FS_I(dir);
- _enter("{%x:%u},{%s}",
- dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
+ _enter("{%x:%u},{%pd}",
+ dvnode->fid.vid, dvnode->fid.vnode, dentry);
key = afs_request_key(dvnode->volume->cell);
if (IS_ERR(key)) {
@@ -843,8 +843,8 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
dvnode = AFS_FS_I(dir);
- _enter("{%x:%u},{%s}",
- dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
+ _enter("{%x:%u},{%pd}",
+ dvnode->fid.vid, dvnode->fid.vnode, dentry);
ret = -ENAMETOOLONG;
if (dentry->d_name.len >= AFSNAMEMAX)
@@ -917,8 +917,8 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
dvnode = AFS_FS_I(dir);
- _enter("{%x:%u},{%s},%ho,",
- dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
+ _enter("{%x:%u},{%pd},%ho,",
+ dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
key = afs_request_key(dvnode->volume->cell);
if (IS_ERR(key)) {
@@ -980,10 +980,10 @@ static int afs_link(struct dentry *from, struct inode *dir,
vnode = AFS_FS_I(from->d_inode);
dvnode = AFS_FS_I(dir);
- _enter("{%x:%u},{%x:%u},{%s}",
+ _enter("{%x:%u},{%x:%u},{%pd}",
vnode->fid.vid, vnode->fid.vnode,
dvnode->fid.vid, dvnode->fid.vnode,
- dentry->d_name.name);
+ dentry);
key = afs_request_key(dvnode->volume->cell);
if (IS_ERR(key)) {
@@ -1025,8 +1025,8 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
dvnode = AFS_FS_I(dir);
- _enter("{%x:%u},{%s},%s",
- dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name,
+ _enter("{%x:%u},{%pd},%s",
+ dvnode->fid.vid, dvnode->fid.vnode, dentry,
content);
ret = -EINVAL;
@@ -1093,11 +1093,11 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
orig_dvnode = AFS_FS_I(old_dir);
new_dvnode = AFS_FS_I(new_dir);
- _enter("{%x:%u},{%x:%u},{%x:%u},{%s}",
+ _enter("{%x:%u},{%x:%u},{%x:%u},{%pd}",
orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
vnode->fid.vid, vnode->fid.vnode,
new_dvnode->fid.vid, new_dvnode->fid.vnode,
- new_dentry->d_name.name);
+ new_dentry);
key = afs_request_key(orig_dvnode->volume->cell);
if (IS_ERR(key)) {
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 294671288449..8a1d38ef0fc2 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -462,8 +462,8 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
struct key *key;
int ret;
- _enter("{%x:%u},{n=%s},%x",
- vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
+ _enter("{%x:%u},{n=%pd},%x",
+ vnode->fid.vid, vnode->fid.vnode, dentry,
attr->ia_valid);
if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID |
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 9682c33d5daf..938c5ab06d5a 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -106,14 +106,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags)
{
- _enter("%p,%p{%p{%s},%s}",
- dir,
- dentry,
- dentry->d_parent,
- dentry->d_parent ?
- dentry->d_parent->d_name.name : (const unsigned char *) "",
- dentry->d_name.name);
-
+ _enter("%p,%p{%pd2}", dir, dentry, dentry);
return ERR_PTR(-EREMOTE);
}
@@ -122,14 +115,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
*/
static int afs_mntpt_open(struct inode *inode, struct file *file)
{
- _enter("%p,%p{%p{%s},%s}",
- inode, file,
- file->f_path.dentry->d_parent,
- file->f_path.dentry->d_parent ?
- file->f_path.dentry->d_parent->d_name.name :
- (const unsigned char *) "",
- file->f_path.dentry->d_name.name);
-
+ _enter("%p,%p{%pD2}", inode, file, file);
return -EREMOTE;
}
@@ -146,7 +132,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
bool rwpath = false;
int ret;
- _enter("{%s}", mntpt->d_name.name);
+ _enter("{%pd}", mntpt);
BUG_ON(!mntpt->d_inode);
@@ -242,7 +228,7 @@ struct vfsmount *afs_d_automount(struct path *path)
{
struct vfsmount *newmnt;
- _enter("{%s}", path->dentry->d_name.name);
+ _enter("{%pd}", path->dentry);
newmnt = afs_mntpt_do_automount(path->dentry);
if (IS_ERR(newmnt))
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 03a3beb17004..06e14bfb3496 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -306,8 +306,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg,
_debug("- range %u-%u%s",
offset, to, msg->msg_flags ? " [more]" : "");
- msg->msg_iov = (struct iovec *) iov;
- msg->msg_iovlen = 1;
+ iov_iter_init(&msg->msg_iter, WRITE,
+ (struct iovec *) iov, 1, to - offset);
/* have to change the state *before* sending the last
* packet as RxRPC might give us the reply before it
@@ -384,8 +384,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
msg.msg_name = NULL;
msg.msg_namelen = 0;
- msg.msg_iov = (struct iovec *) iov;
- msg.msg_iovlen = 1;
+ iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)iov, 1,
+ call->request_size);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = (call->send_pages ? MSG_MORE : 0);
@@ -778,8 +778,7 @@ void afs_send_empty_reply(struct afs_call *call)
iov[0].iov_len = 0;
msg.msg_name = NULL;
msg.msg_namelen = 0;
- msg.msg_iov = iov;
- msg.msg_iovlen = 0;
+ iov_iter_init(&msg.msg_iter, WRITE, iov, 0, 0); /* WTF? */
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -815,8 +814,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
iov[0].iov_len = len;
msg.msg_name = NULL;
msg.msg_namelen = 0;
- msg.msg_iov = iov;
- msg.msg_iovlen = 1;
+ iov_iter_init(&msg.msg_iter, WRITE, iov, 1, len);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index ab6adfd52516..c13cb08964ed 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -682,14 +682,13 @@ int afs_writeback_all(struct afs_vnode *vnode)
*/
int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = file_inode(file);
struct afs_writeback *wb, *xwb;
- struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+ struct afs_vnode *vnode = AFS_FS_I(inode);
int ret;
- _enter("{%x:%u},{n=%s},%d",
- vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
+ _enter("{%x:%u},{n=%pD},%d",
+ vnode->fid.vid, vnode->fid.vnode, file,
datasync);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
diff --git a/fs/aio.c b/fs/aio.c
index 84a751005f5b..1b7893ecc296 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -165,6 +165,15 @@ static struct vfsmount *aio_mnt;
static const struct file_operations aio_ring_fops;
static const struct address_space_operations aio_ctx_aops;
+/* Backing dev info for aio fs.
+ * -no dirty page accounting or writeback happens
+ */
+static struct backing_dev_info aio_fs_backing_dev_info = {
+ .name = "aiofs",
+ .state = 0,
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
+};
+
static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
{
struct qstr this = QSTR_INIT("[aio]", 5);
@@ -176,6 +185,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
inode->i_mapping->a_ops = &aio_ctx_aops;
inode->i_mapping->private_data = ctx;
+ inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info;
inode->i_size = PAGE_SIZE * nr_pages;
path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
@@ -220,6 +230,9 @@ static int __init aio_setup(void)
if (IS_ERR(aio_mnt))
panic("Failed to create aio fs mount.");
+ if (bdi_init(&aio_fs_backing_dev_info))
+ panic("Failed to init aio fs backing dev info.");
+
kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
@@ -273,19 +286,39 @@ static void aio_free_ring(struct kioctx *ctx)
static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
{
+ vma->vm_flags |= VM_DONTEXPAND;
vma->vm_ops = &generic_file_vm_ops;
return 0;
}
+static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct kioctx_table *table;
+ int i;
+
+ spin_lock(&mm->ioctx_lock);
+ rcu_read_lock();
+ table = rcu_dereference(mm->ioctx_table);
+ for (i = 0; i < table->nr; i++) {
+ struct kioctx *ctx;
+
+ ctx = table->table[i];
+ if (ctx && ctx->aio_ring_file == file) {
+ ctx->user_id = ctx->mmap_base = vma->vm_start;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+ spin_unlock(&mm->ioctx_lock);
+}
+
static const struct file_operations aio_ring_fops = {
.mmap = aio_ring_mmap,
+ .mremap = aio_ring_remap,
};
-static int aio_set_page_dirty(struct page *page)
-{
- return 0;
-}
-
#if IS_ENABLED(CONFIG_MIGRATION)
static int aio_migratepage(struct address_space *mapping, struct page *new,
struct page *old, enum migrate_mode mode)
@@ -357,7 +390,7 @@ out:
#endif
static const struct address_space_operations aio_ctx_aops = {
- .set_page_dirty = aio_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_no_writeback,
#if IS_ENABLED(CONFIG_MIGRATION)
.migratepage = aio_migratepage,
#endif
@@ -412,7 +445,6 @@ static int aio_setup_ring(struct kioctx *ctx)
pr_debug("pid(%d) page[%d]->count=%d\n",
current->pid, i, page_count(page));
SetPageUptodate(page);
- SetPageDirty(page);
unlock_page(page);
ctx->ring_pages[i] = page;
@@ -1221,8 +1253,12 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
* the ringbuffer empty. So in practice we should be ok, but it's
* something to be aware of when touching this code.
*/
- wait_event_interruptible_hrtimeout(ctx->wait,
- aio_read_events(ctx, min_nr, nr, event, &ret), until);
+ if (until.tv64 == 0)
+ aio_read_events(ctx, min_nr, nr, event, &ret);
+ else
+ wait_event_interruptible_hrtimeout(ctx->wait,
+ aio_read_events(ctx, min_nr, nr, event, &ret),
+ until);
if (!ret && signal_pending(current))
ret = -EINTR;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 683a5b9ce22a..bfdbaba9c2ba 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -41,8 +41,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
struct path path = {.mnt = mnt, .dentry = dentry};
int status = 1;
- DPRINTK("dentry %p %.*s",
- dentry, (int)dentry->d_name.len, dentry->d_name.name);
+ DPRINTK("dentry %p %pd", dentry, dentry);
path_get(&path);
@@ -85,7 +84,7 @@ static struct dentry *get_next_positive_subdir(struct dentry *prev,
spin_lock(&root->d_lock);
if (prev)
- next = prev->d_u.d_child.next;
+ next = prev->d_child.next;
else {
prev = dget_dlock(root);
next = prev->d_subdirs.next;
@@ -99,13 +98,13 @@ cont:
return NULL;
}
- q = list_entry(next, struct dentry, d_u.d_child);
+ q = list_entry(next, struct dentry, d_child);
spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
/* Already gone or negative dentry (under construction) - try next */
if (!d_count(q) || !simple_positive(q)) {
spin_unlock(&q->d_lock);
- next = q->d_u.d_child.next;
+ next = q->d_child.next;
goto cont;
}
dget_dlock(q);
@@ -155,13 +154,13 @@ again:
goto relock;
}
spin_unlock(&p->d_lock);
- next = p->d_u.d_child.next;
+ next = p->d_child.next;
p = parent;
if (next != &parent->d_subdirs)
break;
}
}
- ret = list_entry(next, struct dentry, d_u.d_child);
+ ret = list_entry(next, struct dentry, d_child);
spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
/* Negative dentry - try next */
@@ -192,8 +191,7 @@ static int autofs4_direct_busy(struct vfsmount *mnt,
unsigned long timeout,
int do_now)
{
- DPRINTK("top %p %.*s",
- top, (int) top->d_name.len, top->d_name.name);
+ DPRINTK("top %p %pd", top, top);
/* If it's busy update the expiry counters */
if (!may_umount_tree(mnt)) {
@@ -221,8 +219,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
struct autofs_info *top_ino = autofs4_dentry_ino(top);
struct dentry *p;
- DPRINTK("top %p %.*s",
- top, (int)top->d_name.len, top->d_name.name);
+ DPRINTK("top %p %pd", top, top);
/* Negative dentry - give up */
if (!simple_positive(top))
@@ -230,8 +227,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
p = NULL;
while ((p = get_next_positive_dentry(p, top))) {
- DPRINTK("dentry %p %.*s",
- p, (int) p->d_name.len, p->d_name.name);
+ DPRINTK("dentry %p %pd", p, p);
/*
* Is someone visiting anywhere in the subtree ?
@@ -277,13 +273,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
{
struct dentry *p;
- DPRINTK("parent %p %.*s",
- parent, (int)parent->d_name.len, parent->d_name.name);
+ DPRINTK("parent %p %pd", parent, parent);
p = NULL;
while ((p = get_next_positive_dentry(p, parent))) {
- DPRINTK("dentry %p %.*s",
- p, (int) p->d_name.len, p->d_name.name);
+ DPRINTK("dentry %p %pd", p, p);
if (d_mountpoint(p)) {
/* Can we umount this guy */
@@ -368,8 +362,7 @@ static struct dentry *should_expire(struct dentry *dentry,
* offset (autofs-5.0+).
*/
if (d_mountpoint(dentry)) {
- DPRINTK("checking mountpoint %p %.*s",
- dentry, (int)dentry->d_name.len, dentry->d_name.name);
+ DPRINTK("checking mountpoint %p %pd", dentry, dentry);
/* Can we umount this guy */
if (autofs4_mount_busy(mnt, dentry))
@@ -382,8 +375,7 @@ static struct dentry *should_expire(struct dentry *dentry,
}
if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
- DPRINTK("checking symlink %p %.*s",
- dentry, (int)dentry->d_name.len, dentry->d_name.name);
+ DPRINTK("checking symlink %p %pd", dentry, dentry);
/*
* A symlink can't be "busy" in the usual sense so
* just check last used for expire timeout.
@@ -479,8 +471,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
return NULL;
found:
- DPRINTK("returning %p %.*s",
- expired, (int)expired->d_name.len, expired->d_name.name);
+ DPRINTK("returning %p %pd", expired, expired);
ino->flags |= AUTOFS_INF_EXPIRING;
smp_mb();
ino->flags &= ~AUTOFS_INF_NO_RCU;
@@ -489,7 +480,7 @@ found:
spin_lock(&sbi->lookup_lock);
spin_lock(&expired->d_parent->d_lock);
spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
- list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
+ list_move(&expired->d_parent->d_subdirs, &expired->d_child);
spin_unlock(&expired->d_lock);
spin_unlock(&expired->d_parent->d_lock);
spin_unlock(&sbi->lookup_lock);
@@ -512,8 +503,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
if (ino->flags & AUTOFS_INF_EXPIRING) {
spin_unlock(&sbi->fs_lock);
- DPRINTK("waiting for expire %p name=%.*s",
- dentry, dentry->d_name.len, dentry->d_name.name);
+ DPRINTK("waiting for expire %p name=%pd", dentry, dentry);
status = autofs4_wait(sbi, dentry, NFY_NONE);
wait_for_completion(&ino->expire_complete);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index d76d083f2f06..dbb5b7212ce1 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -108,8 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
struct dentry *dentry = file->f_path.dentry;
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- DPRINTK("file=%p dentry=%p %.*s",
- file, dentry, dentry->d_name.len, dentry->d_name.name);
+ DPRINTK("file=%p dentry=%p %pD", file, dentry, dentry);
if (autofs4_oz_mode(sbi))
goto out;
@@ -279,8 +278,7 @@ static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
if (ino->flags & AUTOFS_INF_PENDING) {
if (rcu_walk)
return -ECHILD;
- DPRINTK("waiting for mount name=%.*s",
- dentry->d_name.len, dentry->d_name.name);
+ DPRINTK("waiting for mount name=%pd", dentry);
status = autofs4_wait(sbi, dentry, NFY_MOUNT);
DPRINTK("mount wait done status=%d", status);
}
@@ -340,8 +338,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
- DPRINTK("dentry=%p %.*s",
- dentry, dentry->d_name.len, dentry->d_name.name);
+ DPRINTK("dentry=%p %pd", dentry, dentry);
/* The daemon never triggers a mount. */
if (autofs4_oz_mode(sbi))
@@ -428,8 +425,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
- DPRINTK("dentry=%p %.*s",
- dentry, dentry->d_name.len, dentry->d_name.name);
+ DPRINTK("dentry=%p %pd", dentry, dentry);
/* The daemon never waits. */
if (autofs4_oz_mode(sbi)) {
@@ -504,7 +500,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
struct autofs_info *ino;
struct dentry *active;
- DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name);
+ DPRINTK("name = %pd", dentry);
/* File name too long to exist */
if (dentry->d_name.len > NAME_MAX)
@@ -558,8 +554,7 @@ static int autofs4_dir_symlink(struct inode *dir,
size_t size = strlen(symname);
char *cp;
- DPRINTK("%s <- %.*s", symname,
- dentry->d_name.len, dentry->d_name.name);
+ DPRINTK("%s <- %pd", symname, dentry);
if (!autofs4_oz_mode(sbi))
return -EACCES;
@@ -687,7 +682,7 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
/* only consider parents below dentrys in the root */
if (IS_ROOT(parent->d_parent))
return;
- d_child = &dentry->d_u.d_child;
+ d_child = &dentry->d_child;
/* Set parent managed if it's becoming empty */
if (d_child->next == &parent->d_subdirs &&
d_child->prev == &parent->d_subdirs)
@@ -701,8 +696,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
struct autofs_info *ino = autofs4_dentry_ino(dentry);
struct autofs_info *p_ino;
- DPRINTK("dentry %p, removing %.*s",
- dentry, dentry->d_name.len, dentry->d_name.name);
+ DPRINTK("dentry %p, removing %pd", dentry, dentry);
if (!autofs4_oz_mode(sbi))
return -EACCES;
@@ -744,8 +738,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
if (!autofs4_oz_mode(sbi))
return -EACCES;
- DPRINTK("dentry %p, creating %.*s",
- dentry, dentry->d_name.len, dentry->d_name.name);
+ DPRINTK("dentry %p, creating %pd", dentry, dentry);
BUG_ON(!ino);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 4cf61ec6b7a8..edf47774b03d 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -172,8 +172,8 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
char *utfname;
const char *name = dentry->d_name.name;
- befs_debug(sb, "---> %s name %s inode %ld", __func__,
- dentry->d_name.name, dir->i_ino);
+ befs_debug(sb, "---> %s name %pd inode %ld", __func__,
+ dentry, dir->i_ino);
/* Convert to UTF-8 */
if (BEFS_SB(sb)->nls) {
@@ -191,8 +191,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
}
if (ret == BEFS_BT_NOT_FOUND) {
- befs_debug(sb, "<--- %s %s not found", __func__,
- dentry->d_name.name);
+ befs_debug(sb, "<--- %s %pd not found", __func__, dentry);
return ERR_PTR(-ENOENT);
} else if (ret != BEFS_OK || offset == 0) {
@@ -222,10 +221,9 @@ befs_readdir(struct file *file, struct dir_context *ctx)
size_t keysize;
unsigned char d_type;
char keybuf[BEFS_NAME_LEN + 1];
- const char *dirname = file->f_path.dentry->d_name.name;
- befs_debug(sb, "---> %s name %s, inode %ld, ctx->pos %lld",
- __func__, dirname, inode->i_ino, ctx->pos);
+ befs_debug(sb, "---> %s name %pD, inode %ld, ctx->pos %lld",
+ __func__, file, inode->i_ino, ctx->pos);
more:
result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
@@ -233,8 +231,8 @@ more:
if (result == BEFS_ERR) {
befs_debug(sb, "<--- %s ERROR", __func__);
- befs_error(sb, "IO error reading %s (inode %lu)",
- dirname, inode->i_ino);
+ befs_error(sb, "IO error reading %pD (inode %lu)",
+ file, inode->i_ino);
return -EIO;
} else if (result == BEFS_BT_END) {
@@ -271,10 +269,6 @@ more:
}
ctx->pos++;
goto more;
-
- befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos);
-
- return 0;
}
static struct inode *
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 929dec08c348..4c556680fa74 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -292,8 +292,8 @@ static int load_aout_binary(struct linux_binprm * bprm)
if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit())
{
printk(KERN_WARNING
- "fd_offset is not page aligned. Please convert program: %s\n",
- bprm->file->f_path.dentry->d_name.name);
+ "fd_offset is not page aligned. Please convert program: %pD\n",
+ bprm->file);
}
if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
@@ -375,8 +375,8 @@ static int load_aout_library(struct file *file)
if (printk_ratelimit())
{
printk(KERN_WARNING
- "N_TXTOFF is not page aligned. Please convert library: %s\n",
- file->f_path.dentry->d_name.name);
+ "N_TXTOFF is not page aligned. Please convert library: %pD\n",
+ file);
}
vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d8fc0605b9d2..02b16910f4c9 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -386,6 +386,127 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
ELF_PAGESTART(cmds[first_idx].p_vaddr);
}
+/**
+ * load_elf_phdrs() - load ELF program headers
+ * @elf_ex: ELF header of the binary whose program headers should be loaded
+ * @elf_file: the opened ELF binary file
+ *
+ * Loads ELF program headers from the binary file elf_file, which has the ELF
+ * header pointed to by elf_ex, into a newly allocated array. The caller is
+ * responsible for freeing the allocated data. Returns an ERR_PTR upon failure.
+ */
+static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
+ struct file *elf_file)
+{
+ struct elf_phdr *elf_phdata = NULL;
+ int retval, size, err = -1;
+
+ /*
+ * If the size of this structure has changed, then punt, since
+ * we will be doing the wrong thing.
+ */
+ if (elf_ex->e_phentsize != sizeof(struct elf_phdr))
+ goto out;
+
+ /* Sanity check the number of program headers... */
+ if (elf_ex->e_phnum < 1 ||
+ elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
+ goto out;
+
+ /* ...and their total size. */
+ size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
+ if (size > ELF_MIN_ALIGN)
+ goto out;
+
+ elf_phdata = kmalloc(size, GFP_KERNEL);
+ if (!elf_phdata)
+ goto out;
+
+ /* Read in the program headers */
+ retval = kernel_read(elf_file, elf_ex->e_phoff,
+ (char *)elf_phdata, size);
+ if (retval != size) {
+ err = (retval < 0) ? retval : -EIO;
+ goto out;
+ }
+
+ /* Success! */
+ err = 0;
+out:
+ if (err) {
+ kfree(elf_phdata);
+ elf_phdata = NULL;
+ }
+ return elf_phdata;
+}
+
+#ifndef CONFIG_ARCH_BINFMT_ELF_STATE
+
+/**
+ * struct arch_elf_state - arch-specific ELF loading state
+ *
+ * This structure is used to preserve architecture specific data during
+ * the loading of an ELF file, throughout the checking of architecture
+ * specific ELF headers & through to the point where the ELF load is
+ * known to be proceeding (ie. SET_PERSONALITY).
+ *
+ * This implementation is a dummy for architectures which require no
+ * specific state.
+ */
+struct arch_elf_state {
+};
+
+#define INIT_ARCH_ELF_STATE {}
+
+/**
+ * arch_elf_pt_proc() - check a PT_LOPROC..PT_HIPROC ELF program header
+ * @ehdr: The main ELF header
+ * @phdr: The program header to check
+ * @elf: The open ELF file
+ * @is_interp: True if the phdr is from the interpreter of the ELF being
+ * loaded, else false.
+ * @state: Architecture-specific state preserved throughout the process
+ * of loading the ELF.
+ *
+ * Inspects the program header phdr to validate its correctness and/or
+ * suitability for the system. Called once per ELF program header in the
+ * range PT_LOPROC to PT_HIPROC, for both the ELF being loaded and its
+ * interpreter.
+ *
+ * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load
+ * with that return code.
+ */
+static inline int arch_elf_pt_proc(struct elfhdr *ehdr,
+ struct elf_phdr *phdr,
+ struct file *elf, bool is_interp,
+ struct arch_elf_state *state)
+{
+ /* Dummy implementation, always proceed */
+ return 0;
+}
+
+/**
+ * arch_check_elf() - check a PT_LOPROC..PT_HIPROC ELF program header
+ * @ehdr: The main ELF header
+ * @has_interp: True if the ELF has an interpreter, else false.
+ * @state: Architecture-specific state preserved throughout the process
+ * of loading the ELF.
+ *
+ * Provides a final opportunity for architecture code to reject the loading
+ * of the ELF & cause an exec syscall to return an error. This is called after
+ * all program headers to be checked by arch_elf_pt_proc have been.
+ *
+ * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load
+ * with that return code.
+ */
+static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp,
+ struct arch_elf_state *state)
+{
+ /* Dummy implementation, always proceed */
+ return 0;
+}
+
+#endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */
/* This is much more generalized than the library routine read function,
so we keep this separate. Technically the library read function
@@ -394,16 +515,15 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
struct file *interpreter, unsigned long *interp_map_addr,
- unsigned long no_base)
+ unsigned long no_base, struct elf_phdr *interp_elf_phdata)
{
- struct elf_phdr *elf_phdata;
struct elf_phdr *eppnt;
unsigned long load_addr = 0;
int load_addr_set = 0;
unsigned long last_bss = 0, elf_bss = 0;
unsigned long error = ~0UL;
unsigned long total_size;
- int retval, i, size;
+ int i;
/* First of all, some simple consistency checks */
if (interp_elf_ex->e_type != ET_EXEC &&
@@ -414,40 +534,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
if (!interpreter->f_op->mmap)
goto out;
- /*
- * If the size of this structure has changed, then punt, since
- * we will be doing the wrong thing.
- */
- if (interp_elf_ex->e_phentsize != sizeof(struct elf_phdr))
- goto out;
- if (interp_elf_ex->e_phnum < 1 ||
- interp_elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
- goto out;
-
- /* Now read in all of the header information */
- size = sizeof(struct elf_phdr) * interp_elf_ex->e_phnum;
- if (size > ELF_MIN_ALIGN)
- goto out;
- elf_phdata = kmalloc(size, GFP_KERNEL);
- if (!elf_phdata)
- goto out;
-
- retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
- (char *)elf_phdata, size);
- error = -EIO;
- if (retval != size) {
- if (retval < 0)
- error = retval;
- goto out_close;
- }
-
- total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum);
+ total_size = total_mapping_size(interp_elf_phdata,
+ interp_elf_ex->e_phnum);
if (!total_size) {
error = -EINVAL;
- goto out_close;
+ goto out;
}
- eppnt = elf_phdata;
+ eppnt = interp_elf_phdata;
for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
if (eppnt->p_type == PT_LOAD) {
int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
@@ -474,7 +568,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
*interp_map_addr = map_addr;
error = map_addr;
if (BAD_ADDR(map_addr))
- goto out_close;
+ goto out;
if (!load_addr_set &&
interp_elf_ex->e_type == ET_DYN) {
@@ -493,7 +587,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
eppnt->p_memsz > TASK_SIZE ||
TASK_SIZE - eppnt->p_memsz < k) {
error = -ENOMEM;
- goto out_close;
+ goto out;
}
/*
@@ -523,7 +617,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
*/
if (padzero(elf_bss)) {
error = -EFAULT;
- goto out_close;
+ goto out;
}
/* What we have mapped so far */
@@ -532,13 +626,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
/* Map the last of the bss segment */
error = vm_brk(elf_bss, last_bss - elf_bss);
if (BAD_ADDR(error))
- goto out_close;
+ goto out;
}
error = load_addr;
-
-out_close:
- kfree(elf_phdata);
out:
return error;
}
@@ -575,10 +666,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
int load_addr_set = 0;
char * elf_interpreter = NULL;
unsigned long error;
- struct elf_phdr *elf_ppnt, *elf_phdata;
+ struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
unsigned long elf_bss, elf_brk;
int retval, i;
- unsigned int size;
unsigned long elf_entry;
unsigned long interp_load_addr = 0;
unsigned long start_code, end_code, start_data, end_data;
@@ -589,6 +679,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
struct elfhdr elf_ex;
struct elfhdr interp_elf_ex;
} *loc;
+ struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
loc = kmalloc(sizeof(*loc), GFP_KERNEL);
if (!loc) {
@@ -611,26 +702,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
if (!bprm->file->f_op->mmap)
goto out;
- /* Now read in all of the header information */
- if (loc->elf_ex.e_phentsize != sizeof(struct elf_phdr))
- goto out;
- if (loc->elf_ex.e_phnum < 1 ||
- loc->elf_ex.e_phnum > 65536U / sizeof(struct elf_phdr))
- goto out;
- size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr);
- retval = -ENOMEM;
- elf_phdata = kmalloc(size, GFP_KERNEL);
+ elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file);
if (!elf_phdata)
goto out;
- retval = kernel_read(bprm->file, loc->elf_ex.e_phoff,
- (char *)elf_phdata, size);
- if (retval != size) {
- if (retval >= 0)
- retval = -EIO;
- goto out_free_ph;
- }
-
elf_ppnt = elf_phdata;
elf_bss = 0;
elf_brk = 0;
@@ -699,12 +774,21 @@ static int load_elf_binary(struct linux_binprm *bprm)
elf_ppnt = elf_phdata;
for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++)
- if (elf_ppnt->p_type == PT_GNU_STACK) {
+ switch (elf_ppnt->p_type) {
+ case PT_GNU_STACK:
if (elf_ppnt->p_flags & PF_X)
executable_stack = EXSTACK_ENABLE_X;
else
executable_stack = EXSTACK_DISABLE_X;
break;
+
+ case PT_LOPROC ... PT_HIPROC:
+ retval = arch_elf_pt_proc(&loc->elf_ex, elf_ppnt,
+ bprm->file, false,
+ &arch_state);
+ if (retval)
+ goto out_free_dentry;
+ break;
}
/* Some simple consistency checks for the interpreter */
@@ -716,8 +800,36 @@ static int load_elf_binary(struct linux_binprm *bprm)
/* Verify the interpreter has a valid arch */
if (!elf_check_arch(&loc->interp_elf_ex))
goto out_free_dentry;
+
+ /* Load the interpreter program headers */
+ interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex,
+ interpreter);
+ if (!interp_elf_phdata)
+ goto out_free_dentry;
+
+ /* Pass PT_LOPROC..PT_HIPROC headers to arch code */
+ elf_ppnt = interp_elf_phdata;
+ for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++)
+ switch (elf_ppnt->p_type) {
+ case PT_LOPROC ... PT_HIPROC:
+ retval = arch_elf_pt_proc(&loc->interp_elf_ex,
+ elf_ppnt, interpreter,
+ true, &arch_state);
+ if (retval)
+ goto out_free_dentry;
+ break;
+ }
}
+ /*
+ * Allow arch code to reject the ELF at this point, whilst it's
+ * still possible to return an error to the code that invoked
+ * the exec syscall.
+ */
+ retval = arch_check_elf(&loc->elf_ex, !!interpreter, &arch_state);
+ if (retval)
+ goto out_free_dentry;
+
/* Flush all traces of the currently running executable */
retval = flush_old_exec(bprm);
if (retval)
@@ -725,7 +837,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
/* Do this immediately, since STACK_TOP as used in setup_arg_pages
may depend on the personality. */
- SET_PERSONALITY(loc->elf_ex);
+ SET_PERSONALITY2(loc->elf_ex, &arch_state);
if (elf_read_implies_exec(loc->elf_ex, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
@@ -890,7 +1002,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
elf_entry = load_elf_interp(&loc->interp_elf_ex,
interpreter,
&interp_map_addr,
- load_bias);
+ load_bias, interp_elf_phdata);
if (!IS_ERR((void *)elf_entry)) {
/*
* load_elf_interp() returns relocation
@@ -917,6 +1029,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
}
}
+ kfree(interp_elf_phdata);
kfree(elf_phdata);
set_binfmt(&elf_format);
@@ -981,6 +1094,7 @@ out_ret:
/* error cleanup */
out_free_dentry:
+ kfree(interp_elf_phdata);
allow_write_access(interpreter);
if (interpreter)
fput(interpreter);
@@ -1994,18 +2108,6 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
shdr4extnum->sh_info = segs;
}
-static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma,
- unsigned long mm_flags)
-{
- struct vm_area_struct *vma;
- size_t size = 0;
-
- for (vma = first_vma(current, gate_vma); vma != NULL;
- vma = next_vma(vma, gate_vma))
- size += vma_dump_size(vma, mm_flags);
- return size;
-}
-
/*
* Actual dumper
*
@@ -2017,7 +2119,8 @@ static int elf_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
mm_segment_t fs;
- int segs;
+ int segs, i;
+ size_t vma_data_size = 0;
struct vm_area_struct *vma, *gate_vma;
struct elfhdr *elf = NULL;
loff_t offset = 0, dataoff;
@@ -2026,6 +2129,7 @@ static int elf_core_dump(struct coredump_params *cprm)
struct elf_shdr *shdr4extnum = NULL;
Elf_Half e_phnum;
elf_addr_t e_shoff;
+ elf_addr_t *vma_filesz = NULL;
/*
* We no longer stop all VM operations.
@@ -2093,7 +2197,20 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags);
+ vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL);
+ if (!vma_filesz)
+ goto end_coredump;
+
+ for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
+ vma = next_vma(vma, gate_vma)) {
+ unsigned long dump_size;
+
+ dump_size = vma_dump_size(vma, cprm->mm_flags);
+ vma_filesz[i++] = dump_size;
+ vma_data_size += dump_size;
+ }
+
+ offset += vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -2113,7 +2230,7 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* Write program headers for segments dump */
- for (vma = first_vma(current, gate_vma); vma != NULL;
+ for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
vma = next_vma(vma, gate_vma)) {
struct elf_phdr phdr;
@@ -2121,7 +2238,7 @@ static int elf_core_dump(struct coredump_params *cprm)
phdr.p_offset = offset;
phdr.p_vaddr = vma->vm_start;
phdr.p_paddr = 0;
- phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags);
+ phdr.p_filesz = vma_filesz[i++];
phdr.p_memsz = vma->vm_end - vma->vm_start;
offset += phdr.p_filesz;
phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
@@ -2149,12 +2266,12 @@ static int elf_core_dump(struct coredump_params *cprm)
if (!dump_skip(cprm, dataoff - cprm->written))
goto end_coredump;
- for (vma = first_vma(current, gate_vma); vma != NULL;
+ for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
vma = next_vma(vma, gate_vma)) {
unsigned long addr;
unsigned long end;
- end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags);
+ end = vma->vm_start + vma_filesz[i++];
for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
struct page *page;
@@ -2187,6 +2304,7 @@ end_coredump:
cleanup:
free_note_info(&info);
kfree(shdr4extnum);
+ kfree(vma_filesz);
kfree(phdr4note);
kfree(elf);
out:
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index f37b08cea1f7..490538536cb4 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -42,6 +42,10 @@ static int load_em86(struct linux_binprm *bprm)
return -ENOEXEC;
}
+ /* Need to be able to load the file after exec */
+ if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
+ return -ENOENT;
+
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = NULL;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index fd8beb9657a2..c04ef1d4f18a 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -1,21 +1,14 @@
/*
- * binfmt_misc.c
+ * binfmt_misc.c
*
- * Copyright (C) 1997 Richard Günther
+ * Copyright (C) 1997 Richard Günther
*
- * binfmt_misc detects binaries via a magic or filename extension and invokes
- * a specified wrapper. This should obsolete binfmt_java, binfmt_em86 and
- * binfmt_mz.
- *
- * 1997-04-25 first version
- * [...]
- * 1997-05-19 cleanup
- * 1997-06-26 hpa: pass the real filename rather than argv[0]
- * 1997-06-30 minor cleanup
- * 1997-08-09 removed extension stripping, locking cleanup
- * 2001-02-28 AV: rewritten into something that resembles C. Original didn't.
+ * binfmt_misc detects binaries via a magic or filename extension and invokes
+ * a specified wrapper. See Documentation/binfmt_misc.txt for more details.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
@@ -30,8 +23,13 @@
#include <linux/mount.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
+#include <linux/uaccess.h>
-#include <asm/uaccess.h>
+#ifdef DEBUG
+# define USE_DEBUG 1
+#else
+# define USE_DEBUG 0
+#endif
enum {
VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
@@ -41,9 +39,9 @@ static LIST_HEAD(entries);
static int enabled = 1;
enum {Enabled, Magic};
-#define MISC_FMT_PRESERVE_ARGV0 (1<<31)
-#define MISC_FMT_OPEN_BINARY (1<<30)
-#define MISC_FMT_CREDENTIALS (1<<29)
+#define MISC_FMT_PRESERVE_ARGV0 (1 << 31)
+#define MISC_FMT_OPEN_BINARY (1 << 30)
+#define MISC_FMT_CREDENTIALS (1 << 29)
typedef struct {
struct list_head list;
@@ -87,20 +85,24 @@ static Node *check_file(struct linux_binprm *bprm)
char *p = strrchr(bprm->interp, '.');
struct list_head *l;
+ /* Walk all the registered handlers. */
list_for_each(l, &entries) {
Node *e = list_entry(l, Node, list);
char *s;
int j;
+ /* Make sure this one is currently enabled. */
if (!test_bit(Enabled, &e->flags))
continue;
+ /* Do matching based on extension if applicable. */
if (!test_bit(Magic, &e->flags)) {
if (p && !strcmp(e->magic, p + 1))
return e;
continue;
}
+ /* Do matching based on magic & mask. */
s = bprm->buf + e->offset;
if (e->mask) {
for (j = 0; j < e->size; j++)
@@ -123,7 +125,7 @@ static Node *check_file(struct linux_binprm *bprm)
static int load_misc_binary(struct linux_binprm *bprm)
{
Node *fmt;
- struct file * interp_file = NULL;
+ struct file *interp_file = NULL;
char iname[BINPRM_BUF_SIZE];
const char *iname_addr = iname;
int retval;
@@ -131,7 +133,7 @@ static int load_misc_binary(struct linux_binprm *bprm)
retval = -ENOEXEC;
if (!enabled)
- goto _ret;
+ goto ret;
/* to keep locking time low, we copy the interpreter string */
read_lock(&entries_lock);
@@ -140,25 +142,30 @@ static int load_misc_binary(struct linux_binprm *bprm)
strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE);
read_unlock(&entries_lock);
if (!fmt)
- goto _ret;
+ goto ret;
+
+ /* Need to be able to load the file after exec */
+ if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
+ return -ENOENT;
if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) {
retval = remove_arg_zero(bprm);
if (retval)
- goto _ret;
+ goto ret;
}
if (fmt->flags & MISC_FMT_OPEN_BINARY) {
/* if the binary should be opened on behalf of the
* interpreter than keep it open and assign descriptor
- * to it */
- fd_binary = get_unused_fd();
- if (fd_binary < 0) {
- retval = fd_binary;
- goto _ret;
- }
- fd_install(fd_binary, bprm->file);
+ * to it
+ */
+ fd_binary = get_unused_fd_flags(0);
+ if (fd_binary < 0) {
+ retval = fd_binary;
+ goto ret;
+ }
+ fd_install(fd_binary, bprm->file);
/* if the binary is not readable than enforce mm->dumpable=0
regardless of the interpreter's permissions */
@@ -171,32 +178,32 @@ static int load_misc_binary(struct linux_binprm *bprm)
bprm->interp_flags |= BINPRM_FLAGS_EXECFD;
bprm->interp_data = fd_binary;
- } else {
- allow_write_access(bprm->file);
- fput(bprm->file);
- bprm->file = NULL;
- }
+ } else {
+ allow_write_access(bprm->file);
+ fput(bprm->file);
+ bprm->file = NULL;
+ }
/* make argv[1] be the path to the binary */
- retval = copy_strings_kernel (1, &bprm->interp, bprm);
+ retval = copy_strings_kernel(1, &bprm->interp, bprm);
if (retval < 0)
- goto _error;
+ goto error;
bprm->argc++;
/* add the interp as argv[0] */
- retval = copy_strings_kernel (1, &iname_addr, bprm);
+ retval = copy_strings_kernel(1, &iname_addr, bprm);
if (retval < 0)
- goto _error;
- bprm->argc ++;
+ goto error;
+ bprm->argc++;
/* Update interp in case binfmt_script needs it. */
retval = bprm_change_interp(iname, bprm);
if (retval < 0)
- goto _error;
+ goto error;
- interp_file = open_exec (iname);
- retval = PTR_ERR (interp_file);
- if (IS_ERR (interp_file))
- goto _error;
+ interp_file = open_exec(iname);
+ retval = PTR_ERR(interp_file);
+ if (IS_ERR(interp_file))
+ goto error;
bprm->file = interp_file;
if (fmt->flags & MISC_FMT_CREDENTIALS) {
@@ -207,23 +214,23 @@ static int load_misc_binary(struct linux_binprm *bprm)
memset(bprm->buf, 0, BINPRM_BUF_SIZE);
retval = kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
} else
- retval = prepare_binprm (bprm);
+ retval = prepare_binprm(bprm);
if (retval < 0)
- goto _error;
+ goto error;
retval = search_binary_handler(bprm);
if (retval < 0)
- goto _error;
+ goto error;
-_ret:
+ret:
return retval;
-_error:
+error:
if (fd_binary > 0)
sys_close(fd_binary);
bprm->interp_flags = 0;
bprm->interp_data = 0;
- goto _ret;
+ goto ret;
}
/* Command parsers */
@@ -250,36 +257,40 @@ static char *scanarg(char *s, char del)
return s;
}
-static char * check_special_flags (char * sfs, Node * e)
+static char *check_special_flags(char *sfs, Node *e)
{
- char * p = sfs;
+ char *p = sfs;
int cont = 1;
/* special flags */
while (cont) {
switch (*p) {
- case 'P':
- p++;
- e->flags |= MISC_FMT_PRESERVE_ARGV0;
- break;
- case 'O':
- p++;
- e->flags |= MISC_FMT_OPEN_BINARY;
- break;
- case 'C':
- p++;
- /* this flags also implies the
- open-binary flag */
- e->flags |= (MISC_FMT_CREDENTIALS |
- MISC_FMT_OPEN_BINARY);
- break;
- default:
- cont = 0;
+ case 'P':
+ pr_debug("register: flag: P (preserve argv0)\n");
+ p++;
+ e->flags |= MISC_FMT_PRESERVE_ARGV0;
+ break;
+ case 'O':
+ pr_debug("register: flag: O (open binary)\n");
+ p++;
+ e->flags |= MISC_FMT_OPEN_BINARY;
+ break;
+ case 'C':
+ pr_debug("register: flag: C (preserve creds)\n");
+ p++;
+ /* this flags also implies the
+ open-binary flag */
+ e->flags |= (MISC_FMT_CREDENTIALS |
+ MISC_FMT_OPEN_BINARY);
+ break;
+ default:
+ cont = 0;
}
}
return p;
}
+
/*
* This registers a new binary format, it recognises the syntax
* ':name:type:offset:magic:mask:interpreter:flags'
@@ -292,6 +303,8 @@ static Node *create_entry(const char __user *buffer, size_t count)
char *buf, *p;
char del;
+ pr_debug("register: received %zu bytes\n", count);
+
/* some sanity checks */
err = -EINVAL;
if ((count < 11) || (count > MAX_REGISTER_LENGTH))
@@ -299,7 +312,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
err = -ENOMEM;
memsize = sizeof(Node) + count + 8;
- e = kmalloc(memsize, GFP_USER);
+ e = kmalloc(memsize, GFP_KERNEL);
if (!e)
goto out;
@@ -307,98 +320,175 @@ static Node *create_entry(const char __user *buffer, size_t count)
memset(e, 0, sizeof(Node));
if (copy_from_user(buf, buffer, count))
- goto Efault;
+ goto efault;
del = *p++; /* delimeter */
- memset(buf+count, del, 8);
+ pr_debug("register: delim: %#x {%c}\n", del, del);
+
+ /* Pad the buffer with the delim to simplify parsing below. */
+ memset(buf + count, del, 8);
+ /* Parse the 'name' field. */
e->name = p;
p = strchr(p, del);
if (!p)
- goto Einval;
+ goto einval;
*p++ = '\0';
if (!e->name[0] ||
!strcmp(e->name, ".") ||
!strcmp(e->name, "..") ||
strchr(e->name, '/'))
- goto Einval;
+ goto einval;
+
+ pr_debug("register: name: {%s}\n", e->name);
+
+ /* Parse the 'type' field. */
switch (*p++) {
- case 'E': e->flags = 1<<Enabled; break;
- case 'M': e->flags = (1<<Enabled) | (1<<Magic); break;
- default: goto Einval;
+ case 'E':
+ pr_debug("register: type: E (extension)\n");
+ e->flags = 1 << Enabled;
+ break;
+ case 'M':
+ pr_debug("register: type: M (magic)\n");
+ e->flags = (1 << Enabled) | (1 << Magic);
+ break;
+ default:
+ goto einval;
}
if (*p++ != del)
- goto Einval;
+ goto einval;
+
if (test_bit(Magic, &e->flags)) {
- char *s = strchr(p, del);
+ /* Handle the 'M' (magic) format. */
+ char *s;
+
+ /* Parse the 'offset' field. */
+ s = strchr(p, del);
if (!s)
- goto Einval;
+ goto einval;
*s++ = '\0';
e->offset = simple_strtoul(p, &p, 10);
if (*p++)
- goto Einval;
+ goto einval;
+ pr_debug("register: offset: %#x\n", e->offset);
+
+ /* Parse the 'magic' field. */
e->magic = p;
p = scanarg(p, del);
if (!p)
- goto Einval;
+ goto einval;
p[-1] = '\0';
- if (!e->magic[0])
- goto Einval;
+ if (p == e->magic)
+ goto einval;
+ if (USE_DEBUG)
+ print_hex_dump_bytes(
+ KBUILD_MODNAME ": register: magic[raw]: ",
+ DUMP_PREFIX_NONE, e->magic, p - e->magic);
+
+ /* Parse the 'mask' field. */
e->mask = p;
p = scanarg(p, del);
if (!p)
- goto Einval;
+ goto einval;
p[-1] = '\0';
- if (!e->mask[0])
+ if (p == e->mask) {
e->mask = NULL;
+ pr_debug("register: mask[raw]: none\n");
+ } else if (USE_DEBUG)
+ print_hex_dump_bytes(
+ KBUILD_MODNAME ": register: mask[raw]: ",
+ DUMP_PREFIX_NONE, e->mask, p - e->mask);
+
+ /*
+ * Decode the magic & mask fields.
+ * Note: while we might have accepted embedded NUL bytes from
+ * above, the unescape helpers here will stop at the first one
+ * it encounters.
+ */
e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX);
if (e->mask &&
string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size)
- goto Einval;
+ goto einval;
if (e->size + e->offset > BINPRM_BUF_SIZE)
- goto Einval;
+ goto einval;
+ pr_debug("register: magic/mask length: %i\n", e->size);
+ if (USE_DEBUG) {
+ print_hex_dump_bytes(
+ KBUILD_MODNAME ": register: magic[decoded]: ",
+ DUMP_PREFIX_NONE, e->magic, e->size);
+
+ if (e->mask) {
+ int i;
+ char *masked = kmalloc(e->size, GFP_KERNEL);
+
+ print_hex_dump_bytes(
+ KBUILD_MODNAME ": register: mask[decoded]: ",
+ DUMP_PREFIX_NONE, e->mask, e->size);
+
+ if (masked) {
+ for (i = 0; i < e->size; ++i)
+ masked[i] = e->magic[i] & e->mask[i];
+ print_hex_dump_bytes(
+ KBUILD_MODNAME ": register: magic[masked]: ",
+ DUMP_PREFIX_NONE, masked, e->size);
+
+ kfree(masked);
+ }
+ }
+ }
} else {
+ /* Handle the 'E' (extension) format. */
+
+ /* Skip the 'offset' field. */
p = strchr(p, del);
if (!p)
- goto Einval;
+ goto einval;
*p++ = '\0';
+
+ /* Parse the 'magic' field. */
e->magic = p;
p = strchr(p, del);
if (!p)
- goto Einval;
+ goto einval;
*p++ = '\0';
if (!e->magic[0] || strchr(e->magic, '/'))
- goto Einval;
+ goto einval;
+ pr_debug("register: extension: {%s}\n", e->magic);
+
+ /* Skip the 'mask' field. */
p = strchr(p, del);
if (!p)
- goto Einval;
+ goto einval;
*p++ = '\0';
}
+
+ /* Parse the 'interpreter' field. */
e->interpreter = p;
p = strchr(p, del);
if (!p)
- goto Einval;
+ goto einval;
*p++ = '\0';
if (!e->interpreter[0])
- goto Einval;
-
-
- p = check_special_flags (p, e);
+ goto einval;
+ pr_debug("register: interpreter: {%s}\n", e->interpreter);
+ /* Parse the 'flags' field. */
+ p = check_special_flags(p, e);
if (*p == '\n')
p++;
if (p != buf + count)
- goto Einval;
+ goto einval;
+
return e;
out:
return ERR_PTR(err);
-Efault:
+efault:
kfree(e);
return ERR_PTR(-EFAULT);
-Einval:
+einval:
kfree(e);
return ERR_PTR(-EINVAL);
}
@@ -417,7 +507,7 @@ static int parse_command(const char __user *buffer, size_t count)
return -EFAULT;
if (!count)
return 0;
- if (s[count-1] == '\n')
+ if (s[count - 1] == '\n')
count--;
if (count == 1 && s[0] == '0')
return 1;
@@ -434,7 +524,7 @@ static void entry_status(Node *e, char *page)
{
char *dp;
char *status = "disabled";
- const char * flags = "flags: ";
+ const char *flags = "flags: ";
if (test_bit(Enabled, &e->flags))
status = "enabled";
@@ -448,19 +538,15 @@ static void entry_status(Node *e, char *page)
dp = page + strlen(page);
/* print the special flags */
- sprintf (dp, "%s", flags);
- dp += strlen (flags);
- if (e->flags & MISC_FMT_PRESERVE_ARGV0) {
- *dp ++ = 'P';
- }
- if (e->flags & MISC_FMT_OPEN_BINARY) {
- *dp ++ = 'O';
- }
- if (e->flags & MISC_FMT_CREDENTIALS) {
- *dp ++ = 'C';
- }
- *dp ++ = '\n';
-
+ sprintf(dp, "%s", flags);
+ dp += strlen(flags);
+ if (e->flags & MISC_FMT_PRESERVE_ARGV0)
+ *dp++ = 'P';
+ if (e->flags & MISC_FMT_OPEN_BINARY)
+ *dp++ = 'O';
+ if (e->flags & MISC_FMT_CREDENTIALS)
+ *dp++ = 'C';
+ *dp++ = '\n';
if (!test_bit(Magic, &e->flags)) {
sprintf(dp, "extension .%s\n", e->magic);
@@ -488,7 +574,7 @@ static void entry_status(Node *e, char *page)
static struct inode *bm_get_inode(struct super_block *sb, int mode)
{
- struct inode * inode = new_inode(sb);
+ struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
@@ -528,13 +614,14 @@ static void kill_node(Node *e)
/* /<entry> */
static ssize_t
-bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos)
+bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
{
Node *e = file_inode(file)->i_private;
ssize_t res;
char *page;
- if (!(page = (char*) __get_free_page(GFP_KERNEL)))
+ page = (char *) __get_free_page(GFP_KERNEL);
+ if (!page)
return -ENOMEM;
entry_status(e, page);
@@ -553,20 +640,28 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
int res = parse_command(buffer, count);
switch (res) {
- case 1: clear_bit(Enabled, &e->flags);
- break;
- case 2: set_bit(Enabled, &e->flags);
- break;
- case 3: root = dget(file->f_path.dentry->d_sb->s_root);
- mutex_lock(&root->d_inode->i_mutex);
-
- kill_node(e);
-
- mutex_unlock(&root->d_inode->i_mutex);
- dput(root);
- break;
- default: return res;
+ case 1:
+ /* Disable this handler. */
+ clear_bit(Enabled, &e->flags);
+ break;
+ case 2:
+ /* Enable this handler. */
+ set_bit(Enabled, &e->flags);
+ break;
+ case 3:
+ /* Delete this handler. */
+ root = dget(file->f_path.dentry->d_sb->s_root);
+ mutex_lock(&root->d_inode->i_mutex);
+
+ kill_node(e);
+
+ mutex_unlock(&root->d_inode->i_mutex);
+ dput(root);
+ break;
+ default:
+ return res;
}
+
return count;
}
@@ -654,26 +749,36 @@ bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
}
-static ssize_t bm_status_write(struct file * file, const char __user * buffer,
+static ssize_t bm_status_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
int res = parse_command(buffer, count);
struct dentry *root;
switch (res) {
- case 1: enabled = 0; break;
- case 2: enabled = 1; break;
- case 3: root = dget(file->f_path.dentry->d_sb->s_root);
- mutex_lock(&root->d_inode->i_mutex);
-
- while (!list_empty(&entries))
- kill_node(list_entry(entries.next, Node, list));
-
- mutex_unlock(&root->d_inode->i_mutex);
- dput(root);
- break;
- default: return res;
+ case 1:
+ /* Disable all handlers. */
+ enabled = 0;
+ break;
+ case 2:
+ /* Enable all handlers. */
+ enabled = 1;
+ break;
+ case 3:
+ /* Delete all handlers. */
+ root = dget(file->f_path.dentry->d_sb->s_root);
+ mutex_lock(&root->d_inode->i_mutex);
+
+ while (!list_empty(&entries))
+ kill_node(list_entry(entries.next, Node, list));
+
+ mutex_unlock(&root->d_inode->i_mutex);
+ dput(root);
+ break;
+ default:
+ return res;
}
+
return count;
}
@@ -690,14 +795,16 @@ static const struct super_operations s_ops = {
.evict_inode = bm_evict_inode,
};
-static int bm_fill_super(struct super_block * sb, void * data, int silent)
+static int bm_fill_super(struct super_block *sb, void *data, int silent)
{
+ int err;
static struct tree_descr bm_files[] = {
[2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
[3] = {"register", &bm_register_operations, S_IWUSR},
/* last one */ {""}
};
- int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
+
+ err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
if (!err)
sb->s_op = &s_ops;
return err;
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 5027a3e14922..afdf4e3cafc2 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -24,6 +24,16 @@ static int load_script(struct linux_binprm *bprm)
if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
return -ENOEXEC;
+
+ /*
+ * If the script filename will be inaccessible after exec, typically
+ * because it is a "/dev/fd/<fd>/.." path against an O_CLOEXEC fd, give
+ * up now (on the assumption that the interpreter will want to load
+ * this file).
+ */
+ if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
+ return -ENOENT;
+
/*
* This section does the #! interpretation.
* Sorta complicated, but hopefully it will work. -TYT
diff --git a/fs/block_dev.c b/fs/block_dev.c
index cc9d4114cda0..b48c41bf0f86 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -235,7 +235,10 @@ struct super_block *freeze_bdev(struct block_device *bdev)
sb = get_active_super(bdev);
if (!sb)
goto out;
- error = freeze_super(sb);
+ if (sb->s_op->freeze_super)
+ error = sb->s_op->freeze_super(sb);
+ else
+ error = freeze_super(sb);
if (error) {
deactivate_super(sb);
bdev->bd_fsfreeze_count--;
@@ -272,7 +275,10 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
if (!sb)
goto out;
- error = thaw_super(sb);
+ if (sb->s_op->thaw_super)
+ error = sb->s_op->thaw_super(sb);
+ else
+ error = thaw_super(sb);
if (error) {
bdev->bd_fsfreeze_count++;
mutex_unlock(&bdev->bd_fsfreeze_mutex);
@@ -1585,7 +1591,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
EXPORT_SYMBOL_GPL(blkdev_write_iter);
-static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
+ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct inode *bd_inode = file->f_mapping->host;
@@ -1599,6 +1605,7 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
iov_iter_truncate(to, size);
return generic_file_read_iter(iocb, to);
}
+EXPORT_SYMBOL_GPL(blkdev_read_iter);
/*
* Try to release a page associated with block device when the system
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index cb7f3fe9c9f6..d897ef803b3b 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -94,6 +94,7 @@
#include <linux/mutex.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
#include "ctree.h"
#include "disk-io.h"
#include "hash.h"
@@ -326,9 +327,6 @@ static int btrfsic_handle_extent_data(struct btrfsic_state *state,
static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
struct btrfsic_block_data_ctx *block_ctx_out,
int mirror_num);
-static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
- u32 len, struct block_device *bdev,
- struct btrfsic_block_data_ctx *block_ctx_out);
static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
static int btrfsic_read_block(struct btrfsic_state *state,
struct btrfsic_block_data_ctx *block_ctx);
@@ -1326,24 +1324,25 @@ static int btrfsic_create_link_to_next_block(
l = NULL;
next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
} else {
- if (next_block->logical_bytenr != next_bytenr &&
- !(!next_block->is_metadata &&
- 0 == next_block->logical_bytenr)) {
- printk(KERN_INFO
- "Referenced block @%llu (%s/%llu/%d)"
- " found in hash table, %c,"
- " bytenr mismatch (!= stored %llu).\n",
- next_bytenr, next_block_ctx->dev->name,
- next_block_ctx->dev_bytenr, *mirror_nump,
- btrfsic_get_block_type(state, next_block),
- next_block->logical_bytenr);
- } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- printk(KERN_INFO
- "Referenced block @%llu (%s/%llu/%d)"
- " found in hash table, %c.\n",
- next_bytenr, next_block_ctx->dev->name,
- next_block_ctx->dev_bytenr, *mirror_nump,
- btrfsic_get_block_type(state, next_block));
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+ if (next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr))
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+ next_bytenr, next_block_ctx->dev->name,
+ next_block_ctx->dev_bytenr, *mirror_nump,
+ btrfsic_get_block_type(state,
+ next_block),
+ next_block->logical_bytenr);
+ else
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+ next_bytenr, next_block_ctx->dev->name,
+ next_block_ctx->dev_bytenr, *mirror_nump,
+ btrfsic_get_block_type(state,
+ next_block));
+ }
next_block->logical_bytenr = next_bytenr;
next_block->mirror_num = *mirror_nump;
@@ -1529,7 +1528,9 @@ static int btrfsic_handle_extent_data(
return -1;
}
if (!block_was_created) {
- if (next_block->logical_bytenr != next_bytenr &&
+ if ((state->print_mask &
+ BTRFSIC_PRINT_MASK_VERBOSE) &&
+ next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr)) {
printk(KERN_INFO
@@ -1607,25 +1608,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
return ret;
}
-static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
- u32 len, struct block_device *bdev,
- struct btrfsic_block_data_ctx *block_ctx_out)
-{
- block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
- block_ctx_out->dev_bytenr = bytenr;
- block_ctx_out->start = bytenr;
- block_ctx_out->len = len;
- block_ctx_out->datav = NULL;
- block_ctx_out->pagev = NULL;
- block_ctx_out->mem_to_free = NULL;
- if (NULL != block_ctx_out->dev) {
- return 0;
- } else {
- printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
- return -ENXIO;
- }
-}
-
static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
{
if (block_ctx->mem_to_free) {
@@ -1901,25 +1883,26 @@ again:
dev_state,
dev_bytenr);
}
- if (block->logical_bytenr != bytenr &&
- !(!block->is_metadata &&
- block->logical_bytenr == 0))
- printk(KERN_INFO
- "Written block @%llu (%s/%llu/%d)"
- " found in hash table, %c,"
- " bytenr mismatch"
- " (!= stored %llu).\n",
- bytenr, dev_state->name, dev_bytenr,
- block->mirror_num,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr);
- else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- printk(KERN_INFO
- "Written block @%llu (%s/%llu/%d)"
- " found in hash table, %c.\n",
- bytenr, dev_state->name, dev_bytenr,
- block->mirror_num,
- btrfsic_get_block_type(state, block));
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+ if (block->logical_bytenr != bytenr &&
+ !(!block->is_metadata &&
+ block->logical_bytenr == 0))
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+ bytenr, dev_state->name,
+ dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state,
+ block),
+ block->logical_bytenr);
+ else
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+ bytenr, dev_state->name,
+ dev_bytenr, block->mirror_num,
+ btrfsic_get_block_type(state,
+ block));
+ }
block->logical_bytenr = bytenr;
} else {
if (num_pages * PAGE_CACHE_SIZE <
@@ -2002,24 +1985,13 @@ again:
}
}
- if (block->is_superblock)
- ret = btrfsic_map_superblock(state, bytenr,
- processed_len,
- bdev, &block_ctx);
- else
- ret = btrfsic_map_block(state, bytenr, processed_len,
- &block_ctx, 0);
- if (ret) {
- printk(KERN_INFO
- "btrfsic: btrfsic_map_block(root @%llu)"
- " failed!\n", bytenr);
- goto continue_loop;
- }
- block_ctx.datav = mapped_datav;
- /* the following is required in case of writes to mirrors,
- * use the same that was used for the lookup */
block_ctx.dev = dev_state;
block_ctx.dev_bytenr = dev_bytenr;
+ block_ctx.start = bytenr;
+ block_ctx.len = processed_len;
+ block_ctx.pagev = NULL;
+ block_ctx.mem_to_free = NULL;
+ block_ctx.datav = mapped_datav;
if (is_metadata || state->include_extent_data) {
block->never_written = 0;
@@ -2133,10 +2105,6 @@ again:
/* this is getting ugly for the
* include_extent_data case... */
bytenr = 0; /* unknown */
- block_ctx.start = bytenr;
- block_ctx.len = processed_len;
- block_ctx.mem_to_free = NULL;
- block_ctx.pagev = NULL;
} else {
processed_len = state->metablock_size;
bytenr = btrfs_stack_header_bytenr(
@@ -2149,22 +2117,15 @@ again:
"Written block @%llu (%s/%llu/?)"
" !found in hash table, M.\n",
bytenr, dev_state->name, dev_bytenr);
-
- ret = btrfsic_map_block(state, bytenr, processed_len,
- &block_ctx, 0);
- if (ret) {
- printk(KERN_INFO
- "btrfsic: btrfsic_map_block(root @%llu)"
- " failed!\n",
- dev_bytenr);
- goto continue_loop;
- }
}
- block_ctx.datav = mapped_datav;
- /* the following is required in case of writes to mirrors,
- * use the same that was used for the lookup */
+
block_ctx.dev = dev_state;
block_ctx.dev_bytenr = dev_bytenr;
+ block_ctx.start = bytenr;
+ block_ctx.len = processed_len;
+ block_ctx.pagev = NULL;
+ block_ctx.mem_to_free = NULL;
+ block_ctx.datav = mapped_datav;
block = btrfsic_block_alloc();
if (NULL == block) {
@@ -3130,10 +3091,13 @@ int btrfsic_mount(struct btrfs_root *root,
root->sectorsize, PAGE_CACHE_SIZE);
return -1;
}
- state = kzalloc(sizeof(*state), GFP_NOFS);
- if (NULL == state) {
- printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
- return -1;
+ state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!state) {
+ state = vzalloc(sizeof(*state));
+ if (!state) {
+ printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n");
+ return -1;
+ }
}
if (!btrfsic_is_initialized) {
@@ -3277,5 +3241,8 @@ void btrfsic_unmount(struct btrfs_root *root,
mutex_unlock(&btrfsic_mutex);
- kfree(state);
+ if (is_vmalloc_addr(state))
+ vfree(state);
+ else
+ kfree(state);
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d3220d31d3cb..e9df8862012c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -224,16 +224,19 @@ out:
* Clear the writeback bits on all of the file
* pages for a compressed write
*/
-static noinline void end_compressed_writeback(struct inode *inode, u64 start,
- unsigned long ram_size)
+static noinline void end_compressed_writeback(struct inode *inode,
+ const struct compressed_bio *cb)
{
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+ unsigned long index = cb->start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT;
struct page *pages[16];
unsigned long nr_pages = end_index - index + 1;
int i;
int ret;
+ if (cb->errors)
+ mapping_set_error(inode->i_mapping, -EIO);
+
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
min_t(unsigned long,
@@ -244,6 +247,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start,
continue;
}
for (i = 0; i < ret; i++) {
+ if (cb->errors)
+ SetPageError(pages[i]);
end_page_writeback(pages[i]);
page_cache_release(pages[i]);
}
@@ -287,10 +292,11 @@ static void end_compressed_bio_write(struct bio *bio, int err)
tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
cb->start,
cb->start + cb->len - 1,
- NULL, 1);
+ NULL,
+ err ? 0 : 1);
cb->compressed_pages[0]->mapping = NULL;
- end_compressed_writeback(inode, cb->start, cb->len);
+ end_compressed_writeback(inode, cb);
/* note, our inode could be gone now */
/*
@@ -1011,8 +1017,6 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
bytes = min(bytes, working_bytes);
kaddr = kmap_atomic(page_out);
memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
- if (*pg_index == (vcnt - 1) && *pg_offset == 0)
- memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
kunmap_atomic(kaddr);
flush_dcache_page(page_out);
@@ -1054,3 +1058,34 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
return 1;
}
+
+/*
+ * When uncompressing data, we need to make sure and zero any parts of
+ * the biovec that were not filled in by the decompression code. pg_index
+ * and pg_offset indicate the last page and the last offset of that page
+ * that have been filled in. This will zero everything remaining in the
+ * biovec.
+ */
+void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt,
+ unsigned long pg_index,
+ unsigned long pg_offset)
+{
+ while (pg_index < vcnt) {
+ struct page *page = bvec[pg_index].bv_page;
+ unsigned long off = bvec[pg_index].bv_offset;
+ unsigned long len = bvec[pg_index].bv_len;
+
+ if (pg_offset < off)
+ pg_offset = off;
+ if (pg_offset < off + len) {
+ unsigned long bytes = off + len - pg_offset;
+ char *kaddr;
+
+ kaddr = kmap_atomic(page);
+ memset(kaddr + pg_offset, 0, bytes);
+ kunmap_atomic(kaddr);
+ }
+ pg_index++;
+ pg_offset = 0;
+ }
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 0c803b4fbf93..d181f70caae0 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -45,7 +45,9 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long nr_pages);
int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-
+void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt,
+ unsigned long pg_index,
+ unsigned long pg_offset);
struct btrfs_compress_op {
struct list_head *(*alloc_workspace)(void);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 19bc6162fb8e..14a72ed14ef7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -80,13 +80,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
{
int i;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /* lockdep really cares that we take all of these spinlocks
- * in the right order. If any of the locks in the path are not
- * currently blocking, it is going to complain. So, make really
- * really sure by forcing the path to blocking before we clear
- * the path blocking.
- */
if (held) {
btrfs_set_lock_blocking_rw(held, held_rw);
if (held_rw == BTRFS_WRITE_LOCK)
@@ -95,7 +88,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
held_rw = BTRFS_READ_LOCK_BLOCKING;
}
btrfs_set_path_blocking(p);
-#endif
for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
if (p->nodes[i] && p->locks[i]) {
@@ -107,10 +99,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
}
}
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (held)
btrfs_clear_lock_blocking_rw(held, held_rw);
-#endif
}
/* this also releases the path */
@@ -2893,7 +2883,7 @@ cow_done:
}
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
- err = btrfs_try_tree_read_lock(b);
+ err = btrfs_tree_read_lock_atomic(b);
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_read_lock(b);
@@ -2939,7 +2929,7 @@ done:
*/
if (!p->leave_spinning)
btrfs_set_path_blocking(p);
- if (ret < 0)
+ if (ret < 0 && !p->skip_release_on_error)
btrfs_release_path(p);
return ret;
}
@@ -3025,7 +3015,7 @@ again:
}
level = btrfs_header_level(b);
- err = btrfs_try_tree_read_lock(b);
+ err = btrfs_tree_read_lock_atomic(b);
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_read_lock(b);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d557264ee974..7e607416755a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -607,6 +607,7 @@ struct btrfs_path {
unsigned int leave_spinning:1;
unsigned int search_commit_root:1;
unsigned int need_commit_sem:1;
+ unsigned int skip_release_on_error:1;
};
/*
@@ -1170,6 +1171,7 @@ struct btrfs_space_info {
struct percpu_counter total_bytes_pinned;
struct list_head list;
+ struct list_head ro_bgs;
struct rw_semaphore groups_sem;
/* for block groups in our same type */
@@ -1276,6 +1278,8 @@ struct btrfs_block_group_cache {
unsigned int ro:1;
unsigned int dirty:1;
unsigned int iref:1;
+ unsigned int has_caching_ctl:1;
+ unsigned int removed:1;
int disk_cache_state;
@@ -1305,6 +1309,11 @@ struct btrfs_block_group_cache {
/* For delayed block group creation or deletion of empty block groups */
struct list_head bg_list;
+
+ /* For read-only block groups */
+ struct list_head ro_list;
+
+ atomic_t trimming;
};
/* delayed seq elem */
@@ -1402,6 +1411,11 @@ struct btrfs_fs_info {
*/
u64 last_trans_log_full_commit;
unsigned long mount_opt;
+ /*
+ * Track requests for actions that need to be done during transaction
+ * commit (like for some mount options).
+ */
+ unsigned long pending_changes;
unsigned long compress_type:4;
int commit_interval;
/*
@@ -1729,6 +1743,12 @@ struct btrfs_fs_info {
/* For btrfs to record security options */
struct security_mnt_opts security_opts;
+
+ /*
+ * Chunks that can't be freed yet (under a trim/discard operation)
+ * and will be latter freed. Protected by fs_info->chunk_mutex.
+ */
+ struct list_head pinned_chunks;
};
struct btrfs_subvolume_writers {
@@ -2093,7 +2113,6 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
-#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (8192)
@@ -2103,6 +2122,7 @@ struct btrfs_ioctl_defrag_range_args {
#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
BTRFS_MOUNT_##opt)
+
#define btrfs_set_and_info(root, opt, fmt, args...) \
{ \
if (!btrfs_test_opt(root, opt)) \
@@ -2118,6 +2138,49 @@ struct btrfs_ioctl_defrag_range_args {
}
/*
+ * Requests for changes that need to be done during transaction commit.
+ *
+ * Internal mount options that are used for special handling of the real
+ * mount options (eg. cannot be set during remount and have to be set during
+ * transaction commit)
+ */
+
+#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0)
+#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1)
+#define BTRFS_PENDING_COMMIT (2)
+
+#define btrfs_test_pending(info, opt) \
+ test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_set_pending(info, opt) \
+ set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_clear_pending(info, opt) \
+ clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+
+/*
+ * Helpers for setting pending mount option changes.
+ *
+ * Expects corresponding macros
+ * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name
+ */
+#define btrfs_set_pending_and_info(info, opt, fmt, args...) \
+do { \
+ if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \
+ btrfs_info((info), fmt, ##args); \
+ btrfs_set_pending((info), SET_##opt); \
+ btrfs_clear_pending((info), CLEAR_##opt); \
+ } \
+} while(0)
+
+#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \
+do { \
+ if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \
+ btrfs_info((info), fmt, ##args); \
+ btrfs_set_pending((info), CLEAR_##opt); \
+ btrfs_clear_pending((info), SET_##opt); \
+ } \
+} while(0)
+
+/*
* Inode flags
*/
#define BTRFS_INODE_NODATASUM (1 << 0)
@@ -3276,7 +3339,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
int btrfs_async_run_delayed_refs(struct btrfs_root *root,
unsigned long count, int wait);
-int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags);
@@ -3351,7 +3414,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 type, u64 chunk_objectid, u64 chunk_offset,
u64 size);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 group_start);
+ struct btrfs_root *root, u64 group_start,
+ struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -3417,8 +3481,8 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_root *root,
u64 start, u64 end);
-int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *actual_bytes);
+int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *actual_bytes);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 type);
int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
@@ -3427,8 +3491,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int __get_raid_index(u64 flags);
-int btrfs_start_nocow_write(struct btrfs_root *root);
-void btrfs_end_nocow_write(struct btrfs_root *root);
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -3686,6 +3750,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
int verify_dir_item(struct btrfs_root *root,
struct extent_buffer *leaf,
struct btrfs_dir_item *dir_item);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name,
+ int name_len);
/* orphan.c */
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3857,6 +3925,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
+int btrfs_inode_check_errors(struct inode *inode);
extern const struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
@@ -3901,6 +3970,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
struct page **pages, size_t num_pages,
loff_t pos, size_t write_bytes,
struct extent_state **cached);
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -4097,7 +4167,12 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
/* dev-replace.c */
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
-void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
+
+static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+ btrfs_bio_counter_sub(fs_info, 1);
+}
/* reada.c */
struct reada_control {
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 6f662b34ba0e..ca6a3a3b6b6c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -316,11 +316,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
- if (btrfs_fs_incompat(fs_info, RAID56)) {
- btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
- return -EOPNOTSUPP;
- }
-
switch (args->start.cont_reading_from_srcdev_mode) {
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
@@ -422,9 +417,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(root->fs_info, ret);
- WARN_ON(ret);
+ /* don't warn if EINPROGRESS, someone else might be running scrub */
+ if (ret == -EINPROGRESS) {
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
+ ret = 0;
+ } else {
+ WARN_ON(ret);
+ }
- return 0;
+ return ret;
leave:
dev_replace->srcdev = NULL;
@@ -542,7 +543,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return 0;
+ return scrub_ret;
}
printk_in_rcu(KERN_INFO
@@ -571,15 +572,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
fs_info->fs_devices->rw_devices++;
- /* replace the sysfs entry */
- btrfs_kobj_rm_device(fs_info, src_device);
- btrfs_kobj_add_device(fs_info, tgt_device);
-
btrfs_dev_replace_unlock(dev_replace);
btrfs_rm_dev_replace_blocked(fs_info);
- btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+ btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
btrfs_rm_dev_replace_unblocked(fs_info);
@@ -594,6 +591,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
+ /* replace the sysfs entry */
+ btrfs_kobj_rm_device(fs_info, src_device);
+ btrfs_kobj_add_device(fs_info, tgt_device);
+ btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
+
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
if (!IS_ERR(trans))
@@ -920,9 +922,9 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
percpu_counter_inc(&fs_info->bio_counter);
}
-void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
- percpu_counter_dec(&fs_info->bio_counter);
+ percpu_counter_sub(&fs_info->bio_counter, amount);
if (waitqueue_active(&fs_info->replace_wait))
wake_up(&fs_info->replace_wait);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index fc8df866e919..1752625fb4dd 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,10 +21,6 @@
#include "hash.h"
#include "transaction.h"
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len);
-
/*
* insert a name into a directory, doing overflow properly if there is a hash
* collision. data_size indicates how big the item inserted should be. On
@@ -383,9 +379,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
* this walks through all the entries in a dir item and finds one
* for a specific name.
*/
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len)
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len)
{
struct btrfs_dir_item *dir_item;
unsigned long name_ptr;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1ad0f47ac850..8c63419a7f70 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2384,6 +2384,8 @@ int open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
+ INIT_LIST_HEAD(&fs_info->pinned_chunks);
+
ret = btrfs_alloc_stripe_hash_table(fs_info);
if (ret) {
err = ret;
@@ -2830,9 +2832,11 @@ retry_root_backup:
btrfs_set_opt(fs_info->mount_opt, SSD);
}
- /* Set the real inode map cache flag */
- if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE))
- btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE);
+ /*
+ * Mount does not set all options immediatelly, we can do it now and do
+ * not have to wait for transaction commit
+ */
+ btrfs_apply_pending_changes(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
@@ -3713,6 +3717,17 @@ void close_ctree(struct btrfs_root *root)
btrfs_free_block_rsv(root, root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
+
+ lock_chunks(root);
+ while (!list_empty(&fs_info->pinned_chunks)) {
+ struct extent_map *em;
+
+ em = list_first_entry(&fs_info->pinned_chunks,
+ struct extent_map, list);
+ list_del_init(&em->list);
+ free_extent_map(em);
+ }
+ unlock_chunks(root);
}
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3817,19 +3832,19 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
struct btrfs_super_block *sb = fs_info->super_copy;
int ret = 0;
- if (sb->root_level > BTRFS_MAX_LEVEL) {
- printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n",
- sb->root_level, BTRFS_MAX_LEVEL);
+ if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",
+ btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
ret = -EINVAL;
}
- if (sb->chunk_root_level > BTRFS_MAX_LEVEL) {
- printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n",
- sb->chunk_root_level, BTRFS_MAX_LEVEL);
+ if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: chunk_root level too big: %d >= %d\n",
+ btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
ret = -EINVAL;
}
- if (sb->log_root_level > BTRFS_MAX_LEVEL) {
- printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n",
- sb->log_root_level, BTRFS_MAX_LEVEL);
+ if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: log_root level too big: %d >= %d\n",
+ btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
ret = -EINVAL;
}
@@ -3837,15 +3852,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
* The common minimum, we don't know if we can trust the nodesize/sectorsize
* items yet, they'll be verified later. Issue just a warning.
*/
- if (!IS_ALIGNED(sb->root, 4096))
- printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
- sb->root);
- if (!IS_ALIGNED(sb->chunk_root, 4096))
- printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
- sb->chunk_root);
- if (!IS_ALIGNED(sb->log_root, 4096))
+ if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
- sb->log_root);
+ btrfs_super_root(sb));
+ if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
+ printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
+ btrfs_super_chunk_root(sb));
+ if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
+ printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
+ btrfs_super_log_root(sb));
if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
@@ -3857,13 +3872,13 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
* Hint to catch really bogus numbers, bitflips or so, more exact checks are
* done later
*/
- if (sb->num_devices > (1UL << 31))
+ if (btrfs_super_num_devices(sb) > (1UL << 31))
printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
- sb->num_devices);
+ btrfs_super_num_devices(sb));
- if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) {
+ if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
- sb->bytenr, BTRFS_SUPER_INFO_OFFSET);
+ btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
ret = -EINVAL;
}
@@ -3871,14 +3886,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
* The generation is a global counter, we'll trust it more than the others
* but it's still possible that it's the one that's wrong.
*/
- if (sb->generation < sb->chunk_root_generation)
+ if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
printk(KERN_WARNING
"BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
- sb->generation, sb->chunk_root_generation);
- if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1)
+ btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb));
+ if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
+ && btrfs_super_cache_generation(sb) != (u64)-1)
printk(KERN_WARNING
"BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
- sb->generation, sb->cache_generation);
+ btrfs_super_generation(sb), btrfs_super_cache_generation(sb));
return ret;
}
@@ -4105,12 +4121,6 @@ again:
if (ret)
break;
- /* opt_discard */
- if (btrfs_test_opt(root, DISCARD))
- ret = btrfs_error_discard_extent(root, start,
- end + 1 - start,
- NULL);
-
clear_extent_dirty(unpin, start, end, GFP_NOFS);
btrfs_error_unpin_extent_range(root, start, end);
cond_resched();
@@ -4128,6 +4138,25 @@ again:
return 0;
}
+static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ spin_lock(&fs_info->trans_lock);
+ while (!list_empty(&cur_trans->pending_ordered)) {
+ ordered = list_first_entry(&cur_trans->pending_ordered,
+ struct btrfs_ordered_extent,
+ trans_list);
+ list_del_init(&ordered->trans_list);
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_put_ordered_extent(ordered);
+ spin_lock(&fs_info->trans_lock);
+ }
+ spin_unlock(&fs_info->trans_lock);
+}
+
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_root *root)
{
@@ -4139,6 +4168,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&root->fs_info->transaction_wait);
+ btrfs_free_pending_ordered(cur_trans, root->fs_info);
btrfs_destroy_delayed_inodes(root);
btrfs_assert_delayed_root_empty(root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d56589571012..a80b97100d90 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -315,12 +315,6 @@ get_caching_control(struct btrfs_block_group_cache *cache)
struct btrfs_caching_control *ctl;
spin_lock(&cache->lock);
- if (cache->cached != BTRFS_CACHE_STARTED) {
- spin_unlock(&cache->lock);
- return NULL;
- }
-
- /* We're loading it the fast way, so we don't have a caching_ctl. */
if (!cache->caching_ctl) {
spin_unlock(&cache->lock);
return NULL;
@@ -594,6 +588,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
spin_unlock(&cache->lock);
if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
+ mutex_lock(&caching_ctl->mutex);
ret = load_free_space_cache(fs_info, cache);
spin_lock(&cache->lock);
@@ -601,15 +596,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
cache->caching_ctl = NULL;
cache->cached = BTRFS_CACHE_FINISHED;
cache->last_byte_to_unpin = (u64)-1;
+ caching_ctl->progress = (u64)-1;
} else {
if (load_cache_only) {
cache->caching_ctl = NULL;
cache->cached = BTRFS_CACHE_NO;
} else {
cache->cached = BTRFS_CACHE_STARTED;
+ cache->has_caching_ctl = 1;
}
}
spin_unlock(&cache->lock);
+ mutex_unlock(&caching_ctl->mutex);
+
wake_up(&caching_ctl->wait);
if (ret == 1) {
put_caching_control(caching_ctl);
@@ -627,6 +626,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
cache->cached = BTRFS_CACHE_NO;
} else {
cache->cached = BTRFS_CACHE_STARTED;
+ cache->has_caching_ctl = 1;
}
spin_unlock(&cache->lock);
wake_up(&caching_ctl->wait);
@@ -710,8 +710,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
rcu_read_unlock();
}
-/* simple helper to search for an existing extent at a given offset */
-int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
+/* simple helper to search for an existing data extent at a given offset */
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
{
int ret;
struct btrfs_key key;
@@ -726,12 +726,6 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
key.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
0, 0);
- if (ret > 0) {
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == start &&
- key.type == BTRFS_METADATA_ITEM_KEY)
- ret = 0;
- }
btrfs_free_path(path);
return ret;
}
@@ -786,7 +780,6 @@ search_again:
else
key.type = BTRFS_EXTENT_ITEM_KEY;
-again:
ret = btrfs_search_slot(trans, root->fs_info->extent_root,
&key, path, 0, 0);
if (ret < 0)
@@ -802,13 +795,6 @@ again:
key.offset == root->nodesize)
ret = 0;
}
- if (ret) {
- key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = root->nodesize;
- btrfs_release_path(path);
- goto again;
- }
}
if (ret == 0) {
@@ -1903,8 +1889,8 @@ static int btrfs_issue_discard(struct block_device *bdev,
return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
}
-static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *actual_bytes)
+int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *actual_bytes)
{
int ret;
u64 discarded_bytes = 0;
@@ -3176,7 +3162,19 @@ next_block_group(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
struct rb_node *node;
+
spin_lock(&root->fs_info->block_group_cache_lock);
+
+ /* If our block group was removed, we need a full search. */
+ if (RB_EMPTY_NODE(&cache->cache_node)) {
+ const u64 next_bytenr = cache->key.objectid + cache->key.offset;
+
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+ btrfs_put_block_group(cache);
+ cache = btrfs_lookup_first_block_group(root->fs_info,
+ next_bytenr);
+ return cache;
+ }
node = rb_next(&cache->cache_node);
btrfs_put_block_group(cache);
if (node) {
@@ -3518,6 +3516,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->chunk_alloc = 0;
found->flush = 0;
init_waitqueue_head(&found->wait);
+ INIT_LIST_HEAD(&found->ro_bgs);
ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
info->space_info_kobj, "%s",
@@ -5439,7 +5438,17 @@ static int update_block_group(struct btrfs_root *root,
spin_unlock(&cache->space_info->lock);
} else {
old_val -= num_bytes;
+ btrfs_set_block_group_used(&cache->item, old_val);
+ cache->pinned += num_bytes;
+ cache->space_info->bytes_pinned += num_bytes;
+ cache->space_info->bytes_used -= num_bytes;
+ cache->space_info->disk_used -= num_bytes * factor;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ set_extent_dirty(info->pinned_extents,
+ bytenr, bytenr + num_bytes - 1,
+ GFP_NOFS | __GFP_NOFAIL);
/*
* No longer have used bytes in this block group, queue
* it for deletion.
@@ -5453,17 +5462,6 @@ static int update_block_group(struct btrfs_root *root,
}
spin_unlock(&info->unused_bgs_lock);
}
- btrfs_set_block_group_used(&cache->item, old_val);
- cache->pinned += num_bytes;
- cache->space_info->bytes_pinned += num_bytes;
- cache->space_info->bytes_used -= num_bytes;
- cache->space_info->disk_used -= num_bytes * factor;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-
- set_extent_dirty(info->pinned_extents,
- bytenr, bytenr + num_bytes - 1,
- GFP_NOFS | __GFP_NOFAIL);
}
btrfs_put_block_group(cache);
total -= num_bytes;
@@ -5729,7 +5727,8 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
update_global_block_rsv(fs_info);
}
-static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
+ const bool return_free_space)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
@@ -5753,7 +5752,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
if (start < cache->last_byte_to_unpin) {
len = min(len, cache->last_byte_to_unpin - start);
- btrfs_add_free_space(cache, start, len);
+ if (return_free_space)
+ btrfs_add_free_space(cache, start, len);
}
start += len;
@@ -5817,7 +5817,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
end + 1 - start, NULL);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
- unpin_extent_range(root, start, end);
+ unpin_extent_range(root, start, end, true);
cond_resched();
}
@@ -8525,6 +8525,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
min_allocable_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
cache->ro = 1;
+ list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
ret = 0;
}
out:
@@ -8579,15 +8580,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
/*
* helper to account the unused space of all the readonly block group in the
- * list. takes mirrors into account.
+ * space_info. takes mirrors into account.
*/
-static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
struct btrfs_block_group_cache *block_group;
u64 free_bytes = 0;
int factor;
- list_for_each_entry(block_group, groups_list, list) {
+ /* It's df, we don't care if it's racey */
+ if (list_empty(&sinfo->ro_bgs))
+ return 0;
+
+ spin_lock(&sinfo->lock);
+ list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
spin_lock(&block_group->lock);
if (!block_group->ro) {
@@ -8608,26 +8614,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
spin_unlock(&block_group->lock);
}
-
- return free_bytes;
-}
-
-/*
- * helper to account the unused space of all the readonly block group in the
- * space_info. takes mirrors into account.
- */
-u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
-{
- int i;
- u64 free_bytes = 0;
-
- spin_lock(&sinfo->lock);
-
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
- if (!list_empty(&sinfo->block_groups[i]))
- free_bytes += __btrfs_get_ro_block_group_free_space(
- &sinfo->block_groups[i]);
-
spin_unlock(&sinfo->lock);
return free_bytes;
@@ -8647,6 +8633,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
cache->bytes_super - btrfs_block_group_used(&cache->item);
sinfo->bytes_readonly -= num_bytes;
cache->ro = 0;
+ list_del_init(&cache->ro_list);
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
}
@@ -8887,6 +8874,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
cache_node);
rb_erase(&block_group->cache_node,
&info->block_group_cache_tree);
+ RB_CLEAR_NODE(&block_group->cache_node);
spin_unlock(&info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
@@ -9016,7 +9004,9 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
+ INIT_LIST_HEAD(&cache->ro_list);
btrfs_init_free_space_ctl(cache);
+ atomic_set(&cache->trimming, 0);
return cache;
}
@@ -9143,6 +9133,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
spin_lock(&info->block_group_cache_lock);
rb_erase(&cache->cache_node,
&info->block_group_cache_tree);
+ RB_CLEAR_NODE(&cache->cache_node);
spin_unlock(&info->block_group_cache_lock);
btrfs_put_block_group(cache);
goto error;
@@ -9209,9 +9200,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
int ret = 0;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
- list_del_init(&block_group->bg_list);
if (ret)
- continue;
+ goto next;
spin_lock(&block_group->lock);
memcpy(&item, &block_group->item, sizeof(item));
@@ -9226,6 +9216,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
key.objectid, key.offset);
if (ret)
btrfs_abort_transaction(trans, extent_root, ret);
+next:
+ list_del_init(&block_group->bg_list);
}
}
@@ -9283,6 +9275,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->block_group_cache_lock);
rb_erase(&cache->cache_node,
&root->fs_info->block_group_cache_tree);
+ RB_CLEAR_NODE(&cache->cache_node);
spin_unlock(&root->fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
return ret;
@@ -9318,7 +9311,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 group_start)
+ struct btrfs_root *root, u64 group_start,
+ struct extent_map *em)
{
struct btrfs_path *path;
struct btrfs_block_group_cache *block_group;
@@ -9330,6 +9324,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
int ret;
int index;
int factor;
+ struct btrfs_caching_control *caching_ctl = NULL;
+ bool remove_em;
root = root->fs_info->extent_root;
@@ -9414,6 +9410,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->block_group_cache_lock);
rb_erase(&block_group->cache_node,
&root->fs_info->block_group_cache_tree);
+ RB_CLEAR_NODE(&block_group->cache_node);
if (root->fs_info->first_logical_byte == block_group->key.objectid)
root->fs_info->first_logical_byte = (u64)-1;
@@ -9425,6 +9422,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* are still on the list after taking the semaphore
*/
list_del_init(&block_group->list);
+ list_del_init(&block_group->ro_list);
if (list_empty(&block_group->space_info->block_groups[index])) {
kobj = block_group->space_info->block_group_kobjs[index];
block_group->space_info->block_group_kobjs[index] = NULL;
@@ -9436,8 +9434,32 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
kobject_put(kobj);
}
+ if (block_group->has_caching_ctl)
+ caching_ctl = get_caching_control(block_group);
if (block_group->cached == BTRFS_CACHE_STARTED)
wait_block_group_cache_done(block_group);
+ if (block_group->has_caching_ctl) {
+ down_write(&root->fs_info->commit_root_sem);
+ if (!caching_ctl) {
+ struct btrfs_caching_control *ctl;
+
+ list_for_each_entry(ctl,
+ &root->fs_info->caching_block_groups, list)
+ if (ctl->block_group == block_group) {
+ caching_ctl = ctl;
+ atomic_inc(&caching_ctl->count);
+ break;
+ }
+ }
+ if (caching_ctl)
+ list_del_init(&caching_ctl->list);
+ up_write(&root->fs_info->commit_root_sem);
+ if (caching_ctl) {
+ /* Once for the caching bgs list and once for us. */
+ put_caching_control(caching_ctl);
+ put_caching_control(caching_ctl);
+ }
+ }
btrfs_remove_free_space_cache(block_group);
@@ -9449,6 +9471,71 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
memcpy(&key, &block_group->key, sizeof(key));
+ lock_chunks(root);
+ if (!list_empty(&em->list)) {
+ /* We're in the transaction->pending_chunks list. */
+ free_extent_map(em);
+ }
+ spin_lock(&block_group->lock);
+ block_group->removed = 1;
+ /*
+ * At this point trimming can't start on this block group, because we
+ * removed the block group from the tree fs_info->block_group_cache_tree
+ * so no one can't find it anymore and even if someone already got this
+ * block group before we removed it from the rbtree, they have already
+ * incremented block_group->trimming - if they didn't, they won't find
+ * any free space entries because we already removed them all when we
+ * called btrfs_remove_free_space_cache().
+ *
+ * And we must not remove the extent map from the fs_info->mapping_tree
+ * to prevent the same logical address range and physical device space
+ * ranges from being reused for a new block group. This is because our
+ * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+ * completely transactionless, so while it is trimming a range the
+ * currently running transaction might finish and a new one start,
+ * allowing for new block groups to be created that can reuse the same
+ * physical device locations unless we take this special care.
+ */
+ remove_em = (atomic_read(&block_group->trimming) == 0);
+ /*
+ * Make sure a trimmer task always sees the em in the pinned_chunks list
+ * if it sees block_group->removed == 1 (needs to lock block_group->lock
+ * before checking block_group->removed).
+ */
+ if (!remove_em) {
+ /*
+ * Our em might be in trans->transaction->pending_chunks which
+ * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
+ * and so is the fs_info->pinned_chunks list.
+ *
+ * So at this point we must be holding the chunk_mutex to avoid
+ * any races with chunk allocation (more specifically at
+ * volumes.c:contains_pending_extent()), to ensure it always
+ * sees the em, either in the pending_chunks list or in the
+ * pinned_chunks list.
+ */
+ list_move_tail(&em->list, &root->fs_info->pinned_chunks);
+ }
+ spin_unlock(&block_group->lock);
+
+ if (remove_em) {
+ struct extent_map_tree *em_tree;
+
+ em_tree = &root->fs_info->mapping_tree.map_tree;
+ write_lock(&em_tree->lock);
+ /*
+ * The em might be in the pending_chunks list, so make sure the
+ * chunk mutex is locked, since remove_extent_mapping() will
+ * delete us from that list.
+ */
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ /* once for the tree */
+ free_extent_map(em);
+ }
+
+ unlock_chunks(root);
+
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -9537,10 +9624,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
start = block_group->key.objectid;
end = start + block_group->key.offset - 1;
- clear_extent_bits(&fs_info->freed_extents[0], start, end,
+ ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
EXTENT_DIRTY, GFP_NOFS);
- clear_extent_bits(&fs_info->freed_extents[1], start, end,
+ if (ret) {
+ btrfs_set_block_group_rw(root, block_group);
+ goto end_trans;
+ }
+ ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
EXTENT_DIRTY, GFP_NOFS);
+ if (ret) {
+ btrfs_set_block_group_rw(root, block_group);
+ goto end_trans;
+ }
/* Reset pinned so btrfs_put_block_group doesn't complain */
block_group->pinned = 0;
@@ -9551,6 +9646,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
ret = btrfs_remove_chunk(trans, root,
block_group->key.objectid);
+end_trans:
btrfs_end_transaction(trans, root);
next:
btrfs_put_block_group(block_group);
@@ -9599,13 +9695,7 @@ out:
int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
- return unpin_extent_range(root, start, end);
-}
-
-int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *actual_bytes)
-{
- return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
+ return unpin_extent_range(root, start, end, false);
}
int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
@@ -9671,12 +9761,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
}
/*
- * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
- * they are used to prevent the some tasks writing data into the page cache
- * by nocow before the subvolume is snapshoted, but flush the data into
- * the disk after the snapshot creation.
+ * btrfs_{start,end}_write_no_snapshoting() are similar to
+ * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
+ * data into the page cache through nocow before the subvolume is snapshoted,
+ * but flush the data into disk after the snapshot creation, or to prevent
+ * operations while snapshoting is ongoing and that cause the snapshot to be
+ * inconsistent (writes followed by expanding truncates for example).
*/
-void btrfs_end_nocow_write(struct btrfs_root *root)
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
{
percpu_counter_dec(&root->subv_writers->counter);
/*
@@ -9688,7 +9780,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
wake_up(&root->subv_writers->wait);
}
-int btrfs_start_nocow_write(struct btrfs_root *root)
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
{
if (atomic_read(&root->will_be_snapshoted))
return 0;
@@ -9699,7 +9791,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
*/
smp_mb();
if (atomic_read(&root->will_be_snapshoted)) {
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
return 0;
}
return 1;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bf3f424e0013..4ebabd237153 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -595,9 +595,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
clear = 1;
again:
if (!prealloc && (mask & __GFP_WAIT)) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover too any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
prealloc = alloc_extent_state(mask);
- if (!prealloc)
- return -ENOMEM;
}
spin_lock(&tree->lock);
@@ -796,17 +801,25 @@ static void set_state_bits(struct extent_io_tree *tree,
state->state |= bits_to_set;
}
-static void cache_state(struct extent_state *state,
- struct extent_state **cached_ptr)
+static void cache_state_if_flags(struct extent_state *state,
+ struct extent_state **cached_ptr,
+ const u64 flags)
{
if (cached_ptr && !(*cached_ptr)) {
- if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
+ if (!flags || (state->state & flags)) {
*cached_ptr = state;
atomic_inc(&state->refs);
}
}
}
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr)
+{
+ return cache_state_if_flags(state, cached_ptr,
+ EXTENT_IOBITS | EXTENT_BOUNDARY);
+}
+
/*
* set some bits on a range in the tree. This may require allocations or
* sleeping, so the gfp mask is used to indicate what is allowed.
@@ -1058,13 +1071,21 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int err = 0;
u64 last_start;
u64 last_end;
+ bool first_iteration = true;
btrfs_debug_check_extent_io_range(tree, start, end);
again:
if (!prealloc && (mask & __GFP_WAIT)) {
+ /*
+ * Best effort, don't worry if extent state allocation fails
+ * here for the first iteration. We might have a cached state
+ * that matches exactly the target range, in which case no
+ * extent state allocations are needed. We'll only know this
+ * after locking the tree.
+ */
prealloc = alloc_extent_state(mask);
- if (!prealloc)
+ if (!prealloc && !first_iteration)
return -ENOMEM;
}
@@ -1234,6 +1255,7 @@ search_again:
spin_unlock(&tree->lock);
if (mask & __GFP_WAIT)
cond_resched();
+ first_iteration = false;
goto again;
}
@@ -1482,7 +1504,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
state = find_first_extent_bit_state(tree, start, bits);
got_it:
if (state) {
- cache_state(state, cached_state);
+ cache_state_if_flags(state, cached_state, 0);
*start_ret = state->start;
*end_ret = state->end;
ret = 0;
@@ -1746,6 +1768,9 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
if (page_ops == 0)
return 0;
+ if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
+ mapping_set_error(inode->i_mapping, -EIO);
+
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
min_t(unsigned long,
@@ -1763,6 +1788,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
clear_page_dirty_for_io(pages[i]);
if (page_ops & PAGE_SET_WRITEBACK)
set_page_writeback(pages[i]);
+ if (page_ops & PAGE_SET_ERROR)
+ SetPageError(pages[i]);
if (page_ops & PAGE_END_WRITEBACK)
end_page_writeback(pages[i]);
if (page_ops & PAGE_UNLOCK)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6d4b938be986..ece9ce87edff 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -49,6 +49,7 @@
#define PAGE_SET_WRITEBACK (1 << 2)
#define PAGE_END_WRITEBACK (1 << 3)
#define PAGE_SET_PRIVATE2 (1 << 4)
+#define PAGE_SET_ERROR (1 << 5)
/*
* page->private values. Every page that is controlled by the extent
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 225302b39afb..6a98bddd8f33 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -287,8 +287,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
if (!em)
goto out;
- if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
- list_move(&em->list, &tree->modified_extents);
em->generation = gen;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
em->mod_start = em->start;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 783a94355efd..84a2d1868271 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -413,7 +413,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
ret = 0;
fail:
while (ret < 0 && !list_empty(&tmplist)) {
- sums = list_entry(&tmplist, struct btrfs_ordered_sum, list);
+ sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
list_del(&sums->list);
kfree(sums);
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a18ceabd99a8..e4090259569b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
u64 num_bytes;
int ret;
- ret = btrfs_start_nocow_write(root);
+ ret = btrfs_start_write_no_snapshoting(root);
if (!ret)
return -ENOSPC;
@@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
if (ret <= 0) {
ret = 0;
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
@@ -1543,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
btrfs_free_reserved_data_space(inode,
reserve_bytes);
else
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
break;
}
@@ -1632,7 +1632,7 @@ again:
release_bytes = 0;
if (only_release_metadata)
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
if (only_release_metadata && copied > 0) {
u64 lockstart = round_down(pos, root->sectorsize);
@@ -1661,7 +1661,7 @@ again:
if (release_bytes) {
if (only_release_metadata) {
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
} else {
btrfs_delalloc_release_space(inode, release_bytes);
@@ -1676,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
loff_t pos)
{
struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
ssize_t written;
ssize_t written_buffered;
loff_t endbyte;
@@ -1692,8 +1693,15 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
err = written_buffered;
goto out;
}
+ /*
+ * Ensure all data is persisted. We want the next direct IO read to be
+ * able to read what was just written.
+ */
endbyte = pos + written_buffered - 1;
- err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+ err = btrfs_fdatawrite_range(inode, pos, endbyte);
+ if (err)
+ goto out;
+ err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
if (err)
goto out;
written += written_buffered;
@@ -1854,10 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
int ret;
atomic_inc(&BTRFS_I(inode)->sync_writers);
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ ret = btrfs_fdatawrite_range(inode, start, end);
atomic_dec(&BTRFS_I(inode)->sync_writers);
return ret;
@@ -2810,3 +2815,29 @@ int btrfs_auto_defrag_init(void)
return 0;
}
+
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
+{
+ int ret;
+
+ /*
+ * So with compression we will find and lock a dirty page and clear the
+ * first one as dirty, setup an async extent, and immediately return
+ * with the entire range locked but with nobody actually marked with
+ * writeback. So we can't just filemap_write_and_wait_range() and
+ * expect it to work since it will just kick off a thread to do the
+ * actual work. So we need to call filemap_fdatawrite_range _again_
+ * since it will wait on the page lock, which won't be unlocked until
+ * after the pages have been marked as writeback and so we're good to go
+ * from there. We have to do this otherwise we'll miss the ordered
+ * extents and that results in badness. Please Josef, do not think you
+ * know better and pull this out at some point in the future, it is
+ * right and you are wrong.
+ */
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+
+ return ret;
+}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 33848196550e..d6c03f7f136b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -27,10 +27,17 @@
#include "disk-io.h"
#include "extent_io.h"
#include "inode-map.h"
+#include "volumes.h"
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
+struct btrfs_trim_range {
+ u64 start;
+ u64 bytes;
+ struct list_head list;
+};
+
static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
@@ -881,6 +888,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
int ret;
struct btrfs_free_cluster *cluster = NULL;
struct rb_node *node = rb_first(&ctl->free_space_offset);
+ struct btrfs_trim_range *trim_entry;
/* Get the cluster for this block_group if it exists */
if (block_group && !list_empty(&block_group->cluster_list)) {
@@ -916,6 +924,21 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
cluster = NULL;
}
}
+
+ /*
+ * Make sure we don't miss any range that was removed from our rbtree
+ * because trimming is running. Otherwise after a umount+mount (or crash
+ * after committing the transaction) we would leak free space and get
+ * an inconsistent free space cache report from fsck.
+ */
+ list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
+ ret = io_ctl_add_entry(io_ctl, trim_entry->start,
+ trim_entry->bytes, NULL);
+ if (ret)
+ goto fail;
+ *entries += 1;
+ }
+
return 0;
fail:
return -ENOSPC;
@@ -1135,12 +1158,15 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
io_ctl_set_generation(&io_ctl, trans->transid);
+ mutex_lock(&ctl->cache_writeout_mutex);
/* Write out the extent entries in the free space cache */
ret = write_cache_extent_entries(&io_ctl, ctl,
block_group, &entries, &bitmaps,
&bitmap_list);
- if (ret)
+ if (ret) {
+ mutex_unlock(&ctl->cache_writeout_mutex);
goto out_nospc;
+ }
/*
* Some spaces that are freed in the current transaction are pinned,
@@ -1148,11 +1174,18 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* committed, we shouldn't lose them.
*/
ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
- if (ret)
+ if (ret) {
+ mutex_unlock(&ctl->cache_writeout_mutex);
goto out_nospc;
+ }
- /* At last, we write out all the bitmaps. */
+ /*
+ * At last, we write out all the bitmaps and keep cache_writeout_mutex
+ * locked while doing it because a concurrent trim can be manipulating
+ * or freeing the bitmap.
+ */
ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+ mutex_unlock(&ctl->cache_writeout_mutex);
if (ret)
goto out_nospc;
@@ -2295,6 +2328,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
ctl->start = block_group->key.objectid;
ctl->private = block_group;
ctl->op = &free_space_op;
+ INIT_LIST_HEAD(&ctl->trimming_ranges);
+ mutex_init(&ctl->cache_writeout_mutex);
/*
* we only want to have 32k of ram per block group for keeping
@@ -2911,10 +2946,12 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
static int do_trimming(struct btrfs_block_group_cache *block_group,
u64 *total_trimmed, u64 start, u64 bytes,
- u64 reserved_start, u64 reserved_bytes)
+ u64 reserved_start, u64 reserved_bytes,
+ struct btrfs_trim_range *trim_entry)
{
struct btrfs_space_info *space_info = block_group->space_info;
struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
int ret;
int update = 0;
u64 trimmed = 0;
@@ -2929,12 +2966,15 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
- ret = btrfs_error_discard_extent(fs_info->extent_root,
- start, bytes, &trimmed);
+ ret = btrfs_discard_extent(fs_info->extent_root,
+ start, bytes, &trimmed);
if (!ret)
*total_trimmed += trimmed;
+ mutex_lock(&ctl->cache_writeout_mutex);
btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+ list_del(&trim_entry->list);
+ mutex_unlock(&ctl->cache_writeout_mutex);
if (update) {
spin_lock(&space_info->lock);
@@ -2962,16 +3002,21 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
u64 bytes;
while (start < end) {
+ struct btrfs_trim_range trim_entry;
+
+ mutex_lock(&ctl->cache_writeout_mutex);
spin_lock(&ctl->tree_lock);
if (ctl->free_space < minlen) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
break;
}
entry = tree_search_offset(ctl, start, 0, 1);
if (!entry) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
break;
}
@@ -2980,6 +3025,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
node = rb_next(&entry->offset_index);
if (!node) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
goto out;
}
entry = rb_entry(node, struct btrfs_free_space,
@@ -2988,6 +3034,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
if (entry->offset >= end) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
break;
}
@@ -2997,6 +3044,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
bytes = min(extent_start + extent_bytes, end) - start;
if (bytes < minlen) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
goto next;
}
@@ -3004,9 +3052,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
kmem_cache_free(btrfs_free_space_cachep, entry);
spin_unlock(&ctl->tree_lock);
+ trim_entry.start = extent_start;
+ trim_entry.bytes = extent_bytes;
+ list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
+ mutex_unlock(&ctl->cache_writeout_mutex);
ret = do_trimming(block_group, total_trimmed, start, bytes,
- extent_start, extent_bytes);
+ extent_start, extent_bytes, &trim_entry);
if (ret)
break;
next:
@@ -3035,17 +3087,21 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
while (offset < end) {
bool next_bitmap = false;
+ struct btrfs_trim_range trim_entry;
+ mutex_lock(&ctl->cache_writeout_mutex);
spin_lock(&ctl->tree_lock);
if (ctl->free_space < minlen) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
break;
}
entry = tree_search_offset(ctl, offset, 1, 0);
if (!entry) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
next_bitmap = true;
goto next;
}
@@ -3054,6 +3110,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
ret2 = search_bitmap(ctl, entry, &start, &bytes);
if (ret2 || start >= end) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
next_bitmap = true;
goto next;
}
@@ -3061,6 +3118,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
bytes = min(bytes, end - start);
if (bytes < minlen) {
spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
goto next;
}
@@ -3069,9 +3127,13 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
free_bitmap(ctl, entry);
spin_unlock(&ctl->tree_lock);
+ trim_entry.start = start;
+ trim_entry.bytes = bytes;
+ list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
+ mutex_unlock(&ctl->cache_writeout_mutex);
ret = do_trimming(block_group, total_trimmed, start, bytes,
- start, bytes);
+ start, bytes, &trim_entry);
if (ret)
break;
next:
@@ -3101,11 +3163,54 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
*trimmed = 0;
+ spin_lock(&block_group->lock);
+ if (block_group->removed) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ atomic_inc(&block_group->trimming);
+ spin_unlock(&block_group->lock);
+
ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
if (ret)
- return ret;
+ goto out;
ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+out:
+ spin_lock(&block_group->lock);
+ if (atomic_dec_and_test(&block_group->trimming) &&
+ block_group->removed) {
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+
+ spin_unlock(&block_group->lock);
+
+ lock_chunks(block_group->fs_info->chunk_root);
+ em_tree = &block_group->fs_info->mapping_tree.map_tree;
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, block_group->key.objectid,
+ 1);
+ BUG_ON(!em); /* logic error, can't happen */
+ /*
+ * remove_extent_mapping() will delete us from the pinned_chunks
+ * list, which is protected by the chunk mutex.
+ */
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ unlock_chunks(block_group->fs_info->chunk_root);
+
+ /* once for us and once for the tree */
+ free_extent_map(em);
+ free_extent_map(em);
+
+ /*
+ * We've left one free space entry and other tasks trimming
+ * this block group have left 1 entry each one. Free them.
+ */
+ __btrfs_remove_free_space_cache(block_group->free_space_ctl);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
return ret;
}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 0cf4977ef70d..88b2238a0aed 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -38,6 +38,8 @@ struct btrfs_free_space_ctl {
u64 start;
struct btrfs_free_space_op *op;
void *private;
+ struct mutex cache_writeout_mutex;
+ struct list_head trimming_ranges;
};
struct btrfs_free_space_op {
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 83d646bd2e4b..74faea3a516e 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root)
root->root_key.objectid);
if (IS_ERR(tsk)) {
btrfs_warn(root->fs_info, "failed to start inode caching task");
- btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
+ btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE,
"disabling inode map caching");
}
}
@@ -364,6 +364,8 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root)
ctl->start = 0;
ctl->private = NULL;
ctl->op = &free_ino_op;
+ INIT_LIST_HEAD(&ctl->trimming_ranges);
+ mutex_init(&ctl->cache_writeout_mutex);
/*
* Initially we allow to use 16K of ram to cache chunks of
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d23362f4464e..e687bb0dc73a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -382,7 +382,7 @@ static inline int inode_need_compress(struct inode *inode)
* are written in the same order that the flusher thread sent them
* down.
*/
-static noinline int compress_file_range(struct inode *inode,
+static noinline void compress_file_range(struct inode *inode,
struct page *locked_page,
u64 start, u64 end,
struct async_cow *async_cow,
@@ -411,14 +411,6 @@ static noinline int compress_file_range(struct inode *inode,
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
- /*
- * skip compression for a small file range(<=blocksize) that
- * isn't an inline extent, since it dosen't save disk space at all.
- */
- if ((end - start + 1) <= blocksize &&
- (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
- goto cleanup_and_bail_uncompressed;
-
actual_end = min_t(u64, isize, end + 1);
again:
will_compress = 0;
@@ -440,6 +432,14 @@ again:
total_compressed = actual_end - start;
+ /*
+ * skip compression for a small file range(<=blocksize) that
+ * isn't an inline extent, since it dosen't save disk space at all.
+ */
+ if (total_compressed <= blocksize &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ goto cleanup_and_bail_uncompressed;
+
/* we want to make sure that amount of ram required to uncompress
* an extent is reasonable, so we limit the total size in ram
* of a compressed extent to 128k. This is a crucial number
@@ -527,7 +527,10 @@ cont:
if (ret <= 0) {
unsigned long clear_flags = EXTENT_DELALLOC |
EXTENT_DEFRAG;
+ unsigned long page_error_op;
+
clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
+ page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
/*
* inline extent creation worked or returned error,
@@ -538,6 +541,7 @@ cont:
clear_flags, PAGE_UNLOCK |
PAGE_CLEAR_DIRTY |
PAGE_SET_WRITEBACK |
+ page_error_op |
PAGE_END_WRITEBACK);
goto free_pages_out;
}
@@ -620,8 +624,7 @@ cleanup_and_bail_uncompressed:
*num_added += 1;
}
-out:
- return ret;
+ return;
free_pages_out:
for (i = 0; i < nr_pages_ret; i++) {
@@ -629,8 +632,22 @@ free_pages_out:
page_cache_release(pages[i]);
}
kfree(pages);
+}
- goto out;
+static void free_async_extent_pages(struct async_extent *async_extent)
+{
+ int i;
+
+ if (!async_extent->pages)
+ return;
+
+ for (i = 0; i < async_extent->nr_pages; i++) {
+ WARN_ON(async_extent->pages[i]->mapping);
+ page_cache_release(async_extent->pages[i]);
+ }
+ kfree(async_extent->pages);
+ async_extent->nr_pages = 0;
+ async_extent->pages = NULL;
}
/*
@@ -639,7 +656,7 @@ free_pages_out:
* queued. We walk all the async extents created by compress_file_range
* and send them down to the disk.
*/
-static noinline int submit_compressed_extents(struct inode *inode,
+static noinline void submit_compressed_extents(struct inode *inode,
struct async_cow *async_cow)
{
struct async_extent *async_extent;
@@ -651,9 +668,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
struct extent_io_tree *io_tree;
int ret = 0;
- if (list_empty(&async_cow->extents))
- return 0;
-
again:
while (!list_empty(&async_cow->extents)) {
async_extent = list_entry(async_cow->extents.next,
@@ -709,15 +723,7 @@ retry:
async_extent->compressed_size,
0, alloc_hint, &ins, 1, 1);
if (ret) {
- int i;
-
- for (i = 0; i < async_extent->nr_pages; i++) {
- WARN_ON(async_extent->pages[i]->mapping);
- page_cache_release(async_extent->pages[i]);
- }
- kfree(async_extent->pages);
- async_extent->nr_pages = 0;
- async_extent->pages = NULL;
+ free_async_extent_pages(async_extent);
if (ret == -ENOSPC) {
unlock_extent(io_tree, async_extent->start,
@@ -814,15 +820,26 @@ retry:
ins.objectid,
ins.offset, async_extent->pages,
async_extent->nr_pages);
+ if (ret) {
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct page *p = async_extent->pages[0];
+ const u64 start = async_extent->start;
+ const u64 end = start + async_extent->ram_size - 1;
+
+ p->mapping = inode->i_mapping;
+ tree->ops->writepage_end_io_hook(p, start, end,
+ NULL, 0);
+ p->mapping = NULL;
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+ PAGE_END_WRITEBACK |
+ PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
+ }
alloc_hint = ins.objectid + ins.offset;
kfree(async_extent);
- if (ret)
- goto out;
cond_resched();
}
- ret = 0;
-out:
- return ret;
+ return;
out_free_reserve:
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
@@ -832,7 +849,9 @@ out_free:
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+ PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
+ PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
kfree(async_extent);
goto again;
}
@@ -1318,7 +1337,7 @@ next_slot:
* we fall into common COW way.
*/
if (!nolock) {
- err = btrfs_start_nocow_write(root);
+ err = btrfs_start_write_no_snapshoting(root);
if (!err)
goto out_check;
}
@@ -1342,7 +1361,7 @@ out_check:
if (extent_end <= start) {
path->slots[0]++;
if (!nolock && nocow)
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
goto next_slot;
}
if (!nocow) {
@@ -1362,7 +1381,7 @@ out_check:
page_started, nr_written, 1);
if (ret) {
if (!nolock && nocow)
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
goto error;
}
cow_start = (u64)-1;
@@ -1413,7 +1432,7 @@ out_check:
num_bytes);
if (ret) {
if (!nolock && nocow)
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
goto error;
}
}
@@ -1424,7 +1443,7 @@ out_check:
EXTENT_DELALLOC, PAGE_UNLOCK |
PAGE_SET_PRIVATE2);
if (!nolock && nocow)
- btrfs_end_nocow_write(root);
+ btrfs_end_write_no_snapshoting(root);
cur_offset = extent_end;
if (cur_offset > end)
break;
@@ -4580,6 +4599,26 @@ next:
return err;
}
+static int wait_snapshoting_atomic_t(atomic_t *a)
+{
+ schedule();
+ return 0;
+}
+
+static void wait_for_snapshot_creation(struct btrfs_root *root)
+{
+ while (true) {
+ int ret;
+
+ ret = btrfs_start_write_no_snapshoting(root);
+ if (ret)
+ break;
+ wait_on_atomic_t(&root->will_be_snapshoted,
+ wait_snapshoting_atomic_t,
+ TASK_UNINTERRUPTIBLE);
+ }
+}
+
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4604,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
if (newsize > oldsize) {
truncate_pagecache(inode, newsize);
+ /*
+ * Don't do an expanding truncate while snapshoting is ongoing.
+ * This is to ensure the snapshot captures a fully consistent
+ * state of this file - if the snapshot captures this expanding
+ * truncation, it must capture all writes that happened before
+ * this truncation.
+ */
+ wait_for_snapshot_creation(root);
ret = btrfs_cont_expand(inode, oldsize, newsize);
- if (ret)
+ if (ret) {
+ btrfs_end_write_no_snapshoting(root);
return ret;
+ }
trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
+ btrfs_end_write_no_snapshoting(root);
return PTR_ERR(trans);
+ }
i_size_write(inode, newsize);
btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
ret = btrfs_update_inode(trans, root, inode);
+ btrfs_end_write_no_snapshoting(root);
btrfs_end_transaction(trans, root);
} else {
@@ -5303,7 +5355,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
return ERR_CAST(inode);
}
- return d_materialise_unique(dentry, inode);
+ return d_splice_alias(inode, dentry);
}
unsigned char btrfs_filetype_table[] = {
@@ -7000,9 +7052,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
btrfs_put_ordered_extent(ordered);
} else {
/* Screw you mmap */
- ret = filemap_write_and_wait_range(inode->i_mapping,
- lockstart,
- lockend);
+ ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
+ if (ret)
+ break;
+ ret = filemap_fdatawait_range(inode->i_mapping,
+ lockstart,
+ lockend);
if (ret)
break;
@@ -9442,6 +9497,21 @@ out_inode:
}
+/* Inspired by filemap_check_errors() */
+int btrfs_inode_check_errors(struct inode *inode)
+{
+ int ret = 0;
+
+ if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
+ test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
+ ret = -ENOSPC;
+ if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
+ test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
+ ret = -EIO;
+
+ return ret;
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8d2b76e29d3b..d49fe8a0f6b5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -617,7 +617,7 @@ fail:
return ret;
}
-static void btrfs_wait_nocow_write(struct btrfs_root *root)
+static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
{
s64 writers;
DEFINE_WAIT(wait);
@@ -649,7 +649,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
atomic_inc(&root->will_be_snapshoted);
smp_mb__after_atomic();
- btrfs_wait_nocow_write(root);
+ btrfs_wait_for_no_snapshoting_writes(root);
ret = btrfs_start_delalloc_inodes(root, 0);
if (ret)
@@ -717,35 +717,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto fail;
- /*
- * If orphan cleanup did remove any orphans, it means the tree was
- * modified and therefore the commit root is not the same as the
- * current root anymore. This is a problem, because send uses the
- * commit root and therefore can see inode items that don't exist
- * in the current root anymore, and for example make calls to
- * btrfs_iget, which will do tree lookups based on the current root
- * and not on the commit root. Those lookups will fail, returning a
- * -ESTALE error, and making send fail with that error. So make sure
- * a send does not see any orphans we have just removed, and that it
- * will see the same inodes regardless of whether a transaction
- * commit happened before it started (meaning that the commit root
- * will be the same as the current root) or not.
- */
- if (readonly && pending_snapshot->snap->node !=
- pending_snapshot->snap->commit_root) {
- trans = btrfs_join_transaction(pending_snapshot->snap);
- if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
- ret = PTR_ERR(trans);
- goto fail;
- }
- if (!IS_ERR(trans)) {
- ret = btrfs_commit_transaction(trans,
- pending_snapshot->snap);
- if (ret)
- goto fail;
- }
- }
-
inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
@@ -761,27 +732,11 @@ fail:
free:
kfree(pending_snapshot);
out:
- atomic_dec(&root->will_be_snapshoted);
+ if (atomic_dec_and_test(&root->will_be_snapshoted))
+ wake_up_atomic_t(&root->will_be_snapshoted);
return ret;
}
-/* copy of check_sticky in fs/namei.c()
-* It's inline, so penalty for filesystems that don't use sticky bit is
-* minimal.
-*/
-static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
-{
- kuid_t fsuid = current_fsuid();
-
- if (!(dir->i_mode & S_ISVTX))
- return 0;
- if (uid_eq(inode->i_uid, fsuid))
- return 0;
- if (uid_eq(dir->i_uid, fsuid))
- return 0;
- return !capable(CAP_FOWNER);
-}
-
/* copy of may_delete in fs/namei.c()
* Check whether we can remove a link victim from directory dir, check
* whether the type of victim is right.
@@ -817,8 +772,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
return error;
if (IS_APPEND(dir))
return -EPERM;
- if (btrfs_check_sticky(dir, victim->d_inode)||
- IS_APPEND(victim->d_inode)||
+ if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) ||
IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
return -EPERM;
if (isdir) {
@@ -5314,7 +5268,7 @@ long btrfs_ioctl(struct file *file, unsigned int
ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
if (ret)
return ret;
- ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ ret = btrfs_sync_fs(file_inode(file)->i_sb, 1);
/*
* The transaction thread may want to do more work,
* namely it pokes the cleaner ktread that will start
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 5665d2149249..f8229ef1b46d 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -128,6 +128,26 @@ again:
}
/*
+ * take a spinning read lock.
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers
+ */
+int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
+{
+ if (atomic_read(&eb->blocking_writers))
+ return 0;
+
+ read_lock(&eb->lock);
+ if (atomic_read(&eb->blocking_writers)) {
+ read_unlock(&eb->lock);
+ return 0;
+ }
+ atomic_inc(&eb->read_locks);
+ atomic_inc(&eb->spinning_readers);
+ return 1;
+}
+
+/*
* returns 1 if we get the read lock and 0 if we don't
* this won't wait for blocking writers
*/
@@ -158,9 +178,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
atomic_read(&eb->blocking_readers))
return 0;
- if (!write_trylock(&eb->lock))
- return 0;
-
+ write_lock(&eb->lock);
if (atomic_read(&eb->blocking_writers) ||
atomic_read(&eb->blocking_readers)) {
write_unlock(&eb->lock);
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index b81e0e9a4894..c44a9d5f5362 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -35,6 +35,8 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
void btrfs_assert_tree_locked(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
+int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
+
static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
{
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 78285f30909e..617553cdb7d3 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -373,6 +373,8 @@ cont:
}
done:
kunmap(pages_in[page_in_index]);
+ if (!ret)
+ btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset);
return ret;
}
@@ -410,10 +412,23 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
goto out;
}
+ /*
+ * the caller is already checking against PAGE_SIZE, but lets
+ * move this check closer to the memcpy/memset
+ */
+ destlen = min_t(unsigned long, destlen, PAGE_SIZE);
bytes = min_t(unsigned long, destlen, out_len - start_byte);
kaddr = kmap_atomic(dest_page);
memcpy(kaddr, workspace->buf + start_byte, bytes);
+
+ /*
+ * btrfs_getblock is doing a zero on the tail of the page too,
+ * but this will cover anything missing from the decompressed
+ * data.
+ */
+ if (bytes < destlen)
+ memset(kaddr+bytes, 0, destlen-bytes);
kunmap_atomic(kaddr);
out:
return ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ac734ec4cc20..534544e08f76 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
INIT_LIST_HEAD(&entry->log_list);
+ INIT_LIST_HEAD(&entry->trans_list);
trace_btrfs_ordered_extent_add(inode, entry);
@@ -431,19 +432,31 @@ out:
/* Needs to either be called under a log transaction or the log_mutex */
void btrfs_get_logged_extents(struct inode *inode,
- struct list_head *logged_list)
+ struct list_head *logged_list,
+ const loff_t start,
+ const loff_t end)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_ordered_extent *ordered;
struct rb_node *n;
+ struct rb_node *prev;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
- for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ n = __tree_search(&tree->tree, end, &prev);
+ if (!n)
+ n = prev;
+ for (; n; n = rb_prev(n)) {
ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+ if (ordered->file_offset > end)
+ continue;
+ if (entry_end(ordered) <= start)
+ break;
if (!list_empty(&ordered->log_list))
continue;
- list_add_tail(&ordered->log_list, logged_list);
+ if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ continue;
+ list_add(&ordered->log_list, logged_list);
atomic_inc(&ordered->refs);
}
spin_unlock_irq(&tree->lock);
@@ -472,7 +485,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list,
spin_unlock_irq(&log->log_extents_lock[index]);
}
-void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, u64 transid)
{
struct btrfs_ordered_extent *ordered;
int index = transid % 2;
@@ -497,7 +511,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
&ordered->flags));
- btrfs_put_ordered_extent(ordered);
+ if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ list_add_tail(&ordered->trans_list, &trans->ordered);
spin_lock_irq(&log->log_extents_lock[index]);
}
spin_unlock_irq(&log->log_extents_lock[index]);
@@ -725,30 +740,10 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
/* start IO across the range first to instantiate any delalloc
* extents
*/
- ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+ ret = btrfs_fdatawrite_range(inode, start, orig_end);
if (ret)
return ret;
- /*
- * So with compression we will find and lock a dirty page and clear the
- * first one as dirty, setup an async extent, and immediately return
- * with the entire range locked but with nobody actually marked with
- * writeback. So we can't just filemap_write_and_wait_range() and
- * expect it to work since it will just kick off a thread to do the
- * actual work. So we need to call filemap_fdatawrite_range _again_
- * since it will wait on the page lock, which won't be unlocked until
- * after the pages have been marked as writeback and so we're good to go
- * from there. We have to do this otherwise we'll miss the ordered
- * extents and that results in badness. Please Josef, do not think you
- * know better and pull this out at some point in the future, it is
- * right and you are wrong.
- */
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags)) {
- ret = filemap_fdatawrite_range(inode->i_mapping, start,
- orig_end);
- if (ret)
- return ret;
- }
+
ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
if (ret)
return ret;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d81a274d621e..e96cd4ccd805 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -71,6 +71,8 @@ struct btrfs_ordered_sum {
ordered extent */
#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
+#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
+ * in the logging code. */
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -121,6 +123,9 @@ struct btrfs_ordered_extent {
/* If we need to wait on this to be done */
struct list_head log_list;
+ /* If the transaction needs to wait on this ordered extent */
+ struct list_head trans_list;
+
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -193,11 +198,14 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
void btrfs_get_logged_extents(struct inode *inode,
- struct list_head *logged_list);
+ struct list_head *logged_list,
+ const loff_t start,
+ const loff_t end);
void btrfs_put_logged_extents(struct list_head *logged_list);
void btrfs_submit_logged_extents(struct list_head *logged_list,
struct btrfs_root *log);
-void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, u64 transid);
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
int __init ordered_data_init(void);
void ordered_data_exit(void);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 6a41631cb959..8ab2a17bbba8 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,9 +58,23 @@
*/
#define RBIO_CACHE_READY_BIT 3
+/*
+ * bbio and raid_map is managed by the caller, so we shouldn't free
+ * them here. And besides that, all rbios with this flag should not
+ * be cached, because we need raid_map to check the rbios' stripe
+ * is the same or not, but it is very likely that the caller has
+ * free raid_map, so don't cache those rbios.
+ */
+#define RBIO_HOLD_BBIO_MAP_BIT 4
#define RBIO_CACHE_SIZE 1024
+enum btrfs_rbio_ops {
+ BTRFS_RBIO_WRITE = 0,
+ BTRFS_RBIO_READ_REBUILD = 1,
+ BTRFS_RBIO_PARITY_SCRUB = 2,
+};
+
struct btrfs_raid_bio {
struct btrfs_fs_info *fs_info;
struct btrfs_bio *bbio;
@@ -117,13 +131,16 @@ struct btrfs_raid_bio {
/* number of data stripes (no p/q) */
int nr_data;
+ int real_stripes;
+
+ int stripe_npages;
/*
* set if we're doing a parity rebuild
* for a read from higher up, which is handled
* differently from a parity rebuild as part of
* rmw
*/
- int read_rebuild;
+ enum btrfs_rbio_ops operation;
/* first bad stripe */
int faila;
@@ -131,6 +148,7 @@ struct btrfs_raid_bio {
/* second bad stripe (for raid6 use) */
int failb;
+ int scrubp;
/*
* number of pages needed to represent the full
* stripe
@@ -144,8 +162,13 @@ struct btrfs_raid_bio {
*/
int bio_list_bytes;
+ int generic_bio_cnt;
+
atomic_t refs;
+ atomic_t stripes_pending;
+
+ atomic_t error;
/*
* these are two arrays of pointers. We allocate the
* rbio big enough to hold them both and setup their
@@ -162,6 +185,11 @@ struct btrfs_raid_bio {
* here for faster lookup
*/
struct page **bio_pages;
+
+ /*
+ * bitmap to record which horizontal stripe has data
+ */
+ unsigned long *dbitmap;
};
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
@@ -176,6 +204,10 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
+static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
+ int need_check);
+static void async_scrub_parity(struct btrfs_raid_bio *rbio);
+
/*
* the stripe hash table is used for locking, and to collect
* bios in hopes of making a full stripe
@@ -324,6 +356,7 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
{
bio_list_merge(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
+ dest->generic_bio_cnt += victim->generic_bio_cnt;
bio_list_init(&victim->bio_list);
}
@@ -577,11 +610,20 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
cur->raid_map[0])
return 0;
- /* reads can't merge with writes */
- if (last->read_rebuild !=
- cur->read_rebuild) {
+ /* we can't merge with different operations */
+ if (last->operation != cur->operation)
+ return 0;
+ /*
+ * We've need read the full stripe from the drive.
+ * check and repair the parity and write the new results.
+ *
+ * We're not allowed to add any new bios to the
+ * bio list here, anyone else that wants to
+ * change this stripe needs to do their own rmw.
+ */
+ if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
+ cur->operation == BTRFS_RBIO_PARITY_SCRUB)
return 0;
- }
return 1;
}
@@ -601,7 +643,7 @@ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
*/
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
- if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
+ if (rbio->nr_data + 1 == rbio->real_stripes)
return NULL;
index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
@@ -772,11 +814,14 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
spin_unlock(&rbio->bio_list_lock);
spin_unlock_irqrestore(&h->lock, flags);
- if (next->read_rebuild)
+ if (next->operation == BTRFS_RBIO_READ_REBUILD)
async_read_rebuild(next);
- else {
+ else if (next->operation == BTRFS_RBIO_WRITE) {
steal_rbio(rbio, next);
async_rmw_stripe(next);
+ } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
+ steal_rbio(rbio, next);
+ async_scrub_parity(next);
}
goto done_nolock;
@@ -796,6 +841,21 @@ done_nolock:
remove_rbio_from_cache(rbio);
}
+static inline void
+__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
+{
+ if (need) {
+ kfree(raid_map);
+ kfree(bbio);
+ }
+}
+
+static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
+{
+ __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
+ !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
+}
+
static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
int i;
@@ -814,8 +874,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
rbio->stripe_pages[i] = NULL;
}
}
- kfree(rbio->raid_map);
- kfree(rbio->bbio);
+
+ free_bbio_and_raid_map(rbio);
+
kfree(rbio);
}
@@ -833,6 +894,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
{
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *next;
+
+ if (rbio->generic_bio_cnt)
+ btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+
free_raid_bio(rbio);
while (cur) {
@@ -858,13 +923,13 @@ static void raid_write_end_io(struct bio *bio, int err)
bio_put(bio);
- if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+ if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
err = 0;
/* OK, we have read all the stripes we need to. */
- if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
err = -EIO;
rbio_orig_end_io(rbio, err, 0);
@@ -925,16 +990,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
{
struct btrfs_raid_bio *rbio;
int nr_data = 0;
- int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
+ int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+ int num_pages = rbio_nr_pages(stripe_len, real_stripes);
+ int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
void *p;
- rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
+ rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
+ DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
GFP_NOFS);
- if (!rbio) {
- kfree(raid_map);
- kfree(bbio);
+ if (!rbio)
return ERR_PTR(-ENOMEM);
- }
bio_list_init(&rbio->bio_list);
INIT_LIST_HEAD(&rbio->plug_list);
@@ -946,9 +1011,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
rbio->fs_info = root->fs_info;
rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
+ rbio->real_stripes = real_stripes;
+ rbio->stripe_npages = stripe_npages;
rbio->faila = -1;
rbio->failb = -1;
atomic_set(&rbio->refs, 1);
+ atomic_set(&rbio->error, 0);
+ atomic_set(&rbio->stripes_pending, 0);
/*
* the stripe_pages and bio_pages array point to the extra
@@ -957,11 +1026,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
p = rbio + 1;
rbio->stripe_pages = p;
rbio->bio_pages = p + sizeof(struct page *) * num_pages;
+ rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
- if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
- nr_data = bbio->num_stripes - 2;
+ if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
+ nr_data = real_stripes - 2;
else
- nr_data = bbio->num_stripes - 1;
+ nr_data = real_stripes - 1;
rbio->nr_data = nr_data;
return rbio;
@@ -1073,7 +1143,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
if (rbio->faila >= 0 || rbio->failb >= 0) {
- BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
+ BUG_ON(rbio->faila == rbio->real_stripes - 1);
__raid56_parity_recover(rbio);
} else {
finish_rmw(rbio);
@@ -1134,7 +1204,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
struct btrfs_bio *bbio = rbio->bbio;
- void *pointers[bbio->num_stripes];
+ void *pointers[rbio->real_stripes];
int stripe_len = rbio->stripe_len;
int nr_data = rbio->nr_data;
int stripe;
@@ -1148,11 +1218,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
bio_list_init(&bio_list);
- if (bbio->num_stripes - rbio->nr_data == 1) {
- p_stripe = bbio->num_stripes - 1;
- } else if (bbio->num_stripes - rbio->nr_data == 2) {
- p_stripe = bbio->num_stripes - 2;
- q_stripe = bbio->num_stripes - 1;
+ if (rbio->real_stripes - rbio->nr_data == 1) {
+ p_stripe = rbio->real_stripes - 1;
+ } else if (rbio->real_stripes - rbio->nr_data == 2) {
+ p_stripe = rbio->real_stripes - 2;
+ q_stripe = rbio->real_stripes - 1;
} else {
BUG();
}
@@ -1169,7 +1239,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
spin_unlock_irq(&rbio->bio_list_lock);
- atomic_set(&rbio->bbio->error, 0);
+ atomic_set(&rbio->error, 0);
/*
* now that we've set rmw_locked, run through the
@@ -1209,7 +1279,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
SetPageUptodate(p);
pointers[stripe++] = kmap(p);
- raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
+ raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
pointers);
} else {
/* raid5 */
@@ -1218,7 +1288,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
}
- for (stripe = 0; stripe < bbio->num_stripes; stripe++)
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++)
kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
}
@@ -1227,7 +1297,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
* higher layers (the bio_list in our rbio) and our p/q. Ignore
* everything else.
*/
- for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
struct page *page;
if (stripe < rbio->nr_data) {
@@ -1245,8 +1315,34 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
}
}
- atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
- BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
+ if (likely(!bbio->num_tgtdevs))
+ goto write_data;
+
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ if (!bbio->tgtdev_map[stripe])
+ continue;
+
+ for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+ struct page *page;
+ if (stripe < rbio->nr_data) {
+ page = page_in_rbio(rbio, stripe, pagenr, 1);
+ if (!page)
+ continue;
+ } else {
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ }
+
+ ret = rbio_add_io_page(rbio, &bio_list, page,
+ rbio->bbio->tgtdev_map[stripe],
+ pagenr, rbio->stripe_len);
+ if (ret)
+ goto cleanup;
+ }
+ }
+
+write_data:
+ atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
+ BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
while (1) {
bio = bio_list_pop(&bio_list);
@@ -1283,7 +1379,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
stripe = &rbio->bbio->stripes[i];
stripe_start = stripe->physical;
if (physical >= stripe_start &&
- physical < stripe_start + rbio->stripe_len) {
+ physical < stripe_start + rbio->stripe_len &&
+ bio->bi_bdev == stripe->dev->bdev) {
return i;
}
}
@@ -1331,11 +1428,11 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
if (rbio->faila == -1) {
/* first failure on this rbio */
rbio->faila = failed;
- atomic_inc(&rbio->bbio->error);
+ atomic_inc(&rbio->error);
} else if (rbio->failb == -1) {
/* second failure on this rbio */
rbio->failb = failed;
- atomic_inc(&rbio->bbio->error);
+ atomic_inc(&rbio->error);
} else {
ret = -EIO;
}
@@ -1394,11 +1491,11 @@ static void raid_rmw_end_io(struct bio *bio, int err)
bio_put(bio);
- if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+ if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
err = 0;
- if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
goto cleanup;
/*
@@ -1439,7 +1536,6 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio)
static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
int bios_to_read = 0;
- struct btrfs_bio *bbio = rbio->bbio;
struct bio_list bio_list;
int ret;
int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1455,7 +1551,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio);
- atomic_set(&rbio->bbio->error, 0);
+ atomic_set(&rbio->error, 0);
/*
* build a list of bios to read all the missing parts of this
* stripe
@@ -1503,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
* the bbio may be freed once we submit the last bio. Make sure
* not to touch it after that
*/
- atomic_set(&bbio->stripes_pending, bios_to_read);
+ atomic_set(&rbio->stripes_pending, bios_to_read);
while (1) {
bio = bio_list_pop(&bio_list);
if (!bio)
@@ -1686,19 +1782,30 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
+ int ret;
rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
- if (IS_ERR(rbio))
+ if (IS_ERR(rbio)) {
+ __free_bbio_and_raid_map(bbio, raid_map, 1);
return PTR_ERR(rbio);
+ }
bio_list_add(&rbio->bio_list, bio);
rbio->bio_list_bytes = bio->bi_iter.bi_size;
+ rbio->operation = BTRFS_RBIO_WRITE;
+
+ btrfs_bio_counter_inc_noblocked(root->fs_info);
+ rbio->generic_bio_cnt = 1;
/*
* don't plug on full rbios, just get them out the door
* as quickly as we can
*/
- if (rbio_is_full(rbio))
- return full_stripe_write(rbio);
+ if (rbio_is_full(rbio)) {
+ ret = full_stripe_write(rbio);
+ if (ret)
+ btrfs_bio_counter_dec(root->fs_info);
+ return ret;
+ }
cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
sizeof(*plug));
@@ -1709,10 +1816,13 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
INIT_LIST_HEAD(&plug->rbio_list);
}
list_add_tail(&rbio->plug_list, &plug->rbio_list);
+ ret = 0;
} else {
- return __raid56_parity_write(rbio);
+ ret = __raid56_parity_write(rbio);
+ if (ret)
+ btrfs_bio_counter_dec(root->fs_info);
}
- return 0;
+ return ret;
}
/*
@@ -1730,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
int err;
int i;
- pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
+ pointers = kzalloc(rbio->real_stripes * sizeof(void *),
GFP_NOFS);
if (!pointers) {
err = -ENOMEM;
@@ -1740,7 +1850,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
faila = rbio->faila;
failb = rbio->failb;
- if (rbio->read_rebuild) {
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
spin_lock_irq(&rbio->bio_list_lock);
set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
spin_unlock_irq(&rbio->bio_list_lock);
@@ -1749,15 +1859,23 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio);
for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ /*
+ * Now we just use bitmap to mark the horizontal stripes in
+ * which we have data when doing parity scrub.
+ */
+ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
+ !test_bit(pagenr, rbio->dbitmap))
+ continue;
+
/* setup our array of pointers with pages
* from each stripe
*/
- for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
/*
* if we're rebuilding a read, we have to use
* pages from the bio list
*/
- if (rbio->read_rebuild &&
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
(stripe == faila || stripe == failb)) {
page = page_in_rbio(rbio, stripe, pagenr, 0);
} else {
@@ -1767,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
}
/* all raid6 handling here */
- if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
+ if (rbio->raid_map[rbio->real_stripes - 1] ==
RAID6_Q_STRIPE) {
/*
@@ -1817,10 +1935,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
}
if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
- raid6_datap_recov(rbio->bbio->num_stripes,
+ raid6_datap_recov(rbio->real_stripes,
PAGE_SIZE, faila, pointers);
} else {
- raid6_2data_recov(rbio->bbio->num_stripes,
+ raid6_2data_recov(rbio->real_stripes,
PAGE_SIZE, faila, failb,
pointers);
}
@@ -1850,7 +1968,7 @@ pstripe:
* know they can be trusted. If this was a read reconstruction,
* other endio functions will fiddle the uptodate bits
*/
- if (!rbio->read_rebuild) {
+ if (rbio->operation == BTRFS_RBIO_WRITE) {
for (i = 0; i < nr_pages; i++) {
if (faila != -1) {
page = rbio_stripe_page(rbio, faila, i);
@@ -1862,12 +1980,12 @@ pstripe:
}
}
}
- for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
/*
* if we're rebuilding a read, we have to use
* pages from the bio list
*/
- if (rbio->read_rebuild &&
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
(stripe == faila || stripe == failb)) {
page = page_in_rbio(rbio, stripe, pagenr, 0);
} else {
@@ -1882,9 +2000,9 @@ cleanup:
kfree(pointers);
cleanup_io:
-
- if (rbio->read_rebuild) {
- if (err == 0)
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+ if (err == 0 &&
+ !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
cache_rbio_pages(rbio);
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -1893,7 +2011,13 @@ cleanup_io:
} else if (err == 0) {
rbio->faila = -1;
rbio->failb = -1;
- finish_rmw(rbio);
+
+ if (rbio->operation == BTRFS_RBIO_WRITE)
+ finish_rmw(rbio);
+ else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
+ finish_parity_scrub(rbio, 0);
+ else
+ BUG();
} else {
rbio_orig_end_io(rbio, err, 0);
}
@@ -1917,10 +2041,10 @@ static void raid_recover_end_io(struct bio *bio, int err)
set_bio_pages_uptodate(bio);
bio_put(bio);
- if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+ if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
- if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
rbio_orig_end_io(rbio, -EIO, 0);
else
__raid_recover_end_io(rbio);
@@ -1937,7 +2061,6 @@ static void raid_recover_end_io(struct bio *bio, int err)
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
int bios_to_read = 0;
- struct btrfs_bio *bbio = rbio->bbio;
struct bio_list bio_list;
int ret;
int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1951,16 +2074,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
if (ret)
goto cleanup;
- atomic_set(&rbio->bbio->error, 0);
+ atomic_set(&rbio->error, 0);
/*
* read everything that hasn't failed. Thanks to the
* stripe cache, it is possible that some or all of these
* pages are going to be uptodate.
*/
- for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
if (rbio->faila == stripe || rbio->failb == stripe) {
- atomic_inc(&rbio->bbio->error);
+ atomic_inc(&rbio->error);
continue;
}
@@ -1990,7 +2113,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* were up to date, or we might have no bios to read because
* the devices were gone.
*/
- if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
+ if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
__raid_recover_end_io(rbio);
goto out;
} else {
@@ -2002,7 +2125,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* the bbio may be freed once we submit the last bio. Make sure
* not to touch it after that
*/
- atomic_set(&bbio->stripes_pending, bios_to_read);
+ atomic_set(&rbio->stripes_pending, bios_to_read);
while (1) {
bio = bio_list_pop(&bio_list);
if (!bio)
@@ -2021,7 +2144,7 @@ out:
return 0;
cleanup:
- if (rbio->read_rebuild)
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
rbio_orig_end_io(rbio, -EIO, 0);
return -EIO;
}
@@ -2034,34 +2157,42 @@ cleanup:
*/
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len, int mirror_num)
+ u64 stripe_len, int mirror_num, int generic_io)
{
struct btrfs_raid_bio *rbio;
int ret;
rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
- if (IS_ERR(rbio))
+ if (IS_ERR(rbio)) {
+ __free_bbio_and_raid_map(bbio, raid_map, generic_io);
return PTR_ERR(rbio);
+ }
- rbio->read_rebuild = 1;
+ rbio->operation = BTRFS_RBIO_READ_REBUILD;
bio_list_add(&rbio->bio_list, bio);
rbio->bio_list_bytes = bio->bi_iter.bi_size;
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
BUG();
- kfree(raid_map);
- kfree(bbio);
+ __free_bbio_and_raid_map(bbio, raid_map, generic_io);
kfree(rbio);
return -EIO;
}
+ if (generic_io) {
+ btrfs_bio_counter_inc_noblocked(root->fs_info);
+ rbio->generic_bio_cnt = 1;
+ } else {
+ set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
+ }
+
/*
* reconstruct from the q stripe if they are
* asking for mirror 3
*/
if (mirror_num == 3)
- rbio->failb = bbio->num_stripes - 2;
+ rbio->failb = rbio->real_stripes - 2;
ret = lock_stripe_add(rbio);
@@ -2098,3 +2229,483 @@ static void read_rebuild_work(struct btrfs_work *work)
rbio = container_of(work, struct btrfs_raid_bio, work);
__raid56_parity_recover(rbio);
}
+
+/*
+ * The following code is used to scrub/replace the parity stripe
+ *
+ * Note: We need make sure all the pages that add into the scrub/replace
+ * raid bio are correct and not be changed during the scrub/replace. That
+ * is those pages just hold metadata or file data with checksum.
+ */
+
+struct btrfs_raid_bio *
+raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len, struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors)
+{
+ struct btrfs_raid_bio *rbio;
+ int i;
+
+ rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+ if (IS_ERR(rbio))
+ return NULL;
+ bio_list_add(&rbio->bio_list, bio);
+ /*
+ * This is a special bio which is used to hold the completion handler
+ * and make the scrub rbio is similar to the other types
+ */
+ ASSERT(!bio->bi_iter.bi_size);
+ rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
+
+ for (i = 0; i < rbio->real_stripes; i++) {
+ if (bbio->stripes[i].dev == scrub_dev) {
+ rbio->scrubp = i;
+ break;
+ }
+ }
+
+ /* Now we just support the sectorsize equals to page size */
+ ASSERT(root->sectorsize == PAGE_SIZE);
+ ASSERT(rbio->stripe_npages == stripe_nsectors);
+ bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
+
+ return rbio;
+}
+
+void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
+ struct page *page, u64 logical)
+{
+ int stripe_offset;
+ int index;
+
+ ASSERT(logical >= rbio->raid_map[0]);
+ ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
+ rbio->stripe_len * rbio->nr_data);
+ stripe_offset = (int)(logical - rbio->raid_map[0]);
+ index = stripe_offset >> PAGE_CACHE_SHIFT;
+ rbio->bio_pages[index] = page;
+}
+
+/*
+ * We just scrub the parity that we have correct data on the same horizontal,
+ * so we needn't allocate all pages for all the stripes.
+ */
+static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ int bit;
+ int index;
+ struct page *page;
+
+ for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
+ for (i = 0; i < rbio->real_stripes; i++) {
+ index = i * rbio->stripe_npages + bit;
+ if (rbio->stripe_pages[index])
+ continue;
+
+ page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[index] = page;
+ ClearPageUptodate(page);
+ }
+ }
+ return 0;
+}
+
+/*
+ * end io function used by finish_rmw. When we finally
+ * get here, we've written a full stripe
+ */
+static void raid_write_parity_end_io(struct bio *bio, int err)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ if (err)
+ fail_bio_stripe(rbio, bio);
+
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->stripes_pending))
+ return;
+
+ err = 0;
+
+ if (atomic_read(&rbio->error))
+ err = -EIO;
+
+ rbio_orig_end_io(rbio, err, 0);
+}
+
+static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
+ int need_check)
+{
+ struct btrfs_bio *bbio = rbio->bbio;
+ void *pointers[rbio->real_stripes];
+ DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
+ int nr_data = rbio->nr_data;
+ int stripe;
+ int pagenr;
+ int p_stripe = -1;
+ int q_stripe = -1;
+ struct page *p_page = NULL;
+ struct page *q_page = NULL;
+ struct bio_list bio_list;
+ struct bio *bio;
+ int is_replace = 0;
+ int ret;
+
+ bio_list_init(&bio_list);
+
+ if (rbio->real_stripes - rbio->nr_data == 1) {
+ p_stripe = rbio->real_stripes - 1;
+ } else if (rbio->real_stripes - rbio->nr_data == 2) {
+ p_stripe = rbio->real_stripes - 2;
+ q_stripe = rbio->real_stripes - 1;
+ } else {
+ BUG();
+ }
+
+ if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
+ is_replace = 1;
+ bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
+ }
+
+ /*
+ * Because the higher layers(scrubber) are unlikely to
+ * use this area of the disk again soon, so don't cache
+ * it.
+ */
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ if (!need_check)
+ goto writeback;
+
+ p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!p_page)
+ goto cleanup;
+ SetPageUptodate(p_page);
+
+ if (q_stripe != -1) {
+ q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!q_page) {
+ __free_page(p_page);
+ goto cleanup;
+ }
+ SetPageUptodate(q_page);
+ }
+
+ atomic_set(&rbio->error, 0);
+
+ for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+ struct page *p;
+ void *parity;
+ /* first collect one page from each data stripe */
+ for (stripe = 0; stripe < nr_data; stripe++) {
+ p = page_in_rbio(rbio, stripe, pagenr, 0);
+ pointers[stripe] = kmap(p);
+ }
+
+ /* then add the parity stripe */
+ pointers[stripe++] = kmap(p_page);
+
+ if (q_stripe != -1) {
+
+ /*
+ * raid6, add the qstripe and call the
+ * library function to fill in our p/q
+ */
+ pointers[stripe++] = kmap(q_page);
+
+ raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+ pointers);
+ } else {
+ /* raid5 */
+ memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+ run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+ }
+
+ /* Check scrubbing pairty and repair it */
+ p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+ parity = kmap(p);
+ if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
+ memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
+ else
+ /* Parity is right, needn't writeback */
+ bitmap_clear(rbio->dbitmap, pagenr, 1);
+ kunmap(p);
+
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++)
+ kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ }
+
+ __free_page(p_page);
+ if (q_page)
+ __free_page(q_page);
+
+writeback:
+ /*
+ * time to start writing. Make bios for everything from the
+ * higher layers (the bio_list in our rbio) and our p/q. Ignore
+ * everything else.
+ */
+ for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+ struct page *page;
+
+ page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+ ret = rbio_add_io_page(rbio, &bio_list,
+ page, rbio->scrubp, pagenr, rbio->stripe_len);
+ if (ret)
+ goto cleanup;
+ }
+
+ if (!is_replace)
+ goto submit_write;
+
+ for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
+ struct page *page;
+
+ page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+ ret = rbio_add_io_page(rbio, &bio_list, page,
+ bbio->tgtdev_map[rbio->scrubp],
+ pagenr, rbio->stripe_len);
+ if (ret)
+ goto cleanup;
+ }
+
+submit_write:
+ nr_data = bio_list_size(&bio_list);
+ if (!nr_data) {
+ /* Every parity is right */
+ rbio_orig_end_io(rbio, 0, 0);
+ return;
+ }
+
+ atomic_set(&rbio->stripes_pending, nr_data);
+
+ while (1) {
+ bio = bio_list_pop(&bio_list);
+ if (!bio)
+ break;
+
+ bio->bi_private = rbio;
+ bio->bi_end_io = raid_write_parity_end_io;
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+ submit_bio(WRITE, bio);
+ }
+ return;
+
+cleanup:
+ rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
+{
+ if (stripe >= 0 && stripe < rbio->nr_data)
+ return 1;
+ return 0;
+}
+
+/*
+ * While we're doing the parity check and repair, we could have errors
+ * in reading pages off the disk. This checks for errors and if we're
+ * not able to read the page it'll trigger parity reconstruction. The
+ * parity scrub will be finished after we've reconstructed the failed
+ * stripes
+ */
+static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
+{
+ if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ goto cleanup;
+
+ if (rbio->faila >= 0 || rbio->failb >= 0) {
+ int dfail = 0, failp = -1;
+
+ if (is_data_stripe(rbio, rbio->faila))
+ dfail++;
+ else if (is_parity_stripe(rbio->faila))
+ failp = rbio->faila;
+
+ if (is_data_stripe(rbio, rbio->failb))
+ dfail++;
+ else if (is_parity_stripe(rbio->failb))
+ failp = rbio->failb;
+
+ /*
+ * Because we can not use a scrubbing parity to repair
+ * the data, so the capability of the repair is declined.
+ * (In the case of RAID5, we can not repair anything)
+ */
+ if (dfail > rbio->bbio->max_errors - 1)
+ goto cleanup;
+
+ /*
+ * If all data is good, only parity is correctly, just
+ * repair the parity.
+ */
+ if (dfail == 0) {
+ finish_parity_scrub(rbio, 0);
+ return;
+ }
+
+ /*
+ * Here means we got one corrupted data stripe and one
+ * corrupted parity on RAID6, if the corrupted parity
+ * is scrubbing parity, luckly, use the other one to repair
+ * the data, or we can not repair the data stripe.
+ */
+ if (failp != rbio->scrubp)
+ goto cleanup;
+
+ __raid_recover_end_io(rbio);
+ } else {
+ finish_parity_scrub(rbio, 1);
+ }
+ return;
+
+cleanup:
+ rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+/*
+ * end io for the read phase of the rmw cycle. All the bios here are physical
+ * stripe bios we've read from the disk so we can recalculate the parity of the
+ * stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid56_parity_scrub_end_io(struct bio *bio, int err)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ if (err)
+ fail_bio_stripe(rbio, bio);
+ else
+ set_bio_pages_uptodate(bio);
+
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->stripes_pending))
+ return;
+
+ /*
+ * this will normally call finish_rmw to start our write
+ * but if there are any failed stripes we'll reconstruct
+ * from parity first
+ */
+ validate_rbio_for_parity_scrub(rbio);
+}
+
+static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
+{
+ int bios_to_read = 0;
+ struct bio_list bio_list;
+ int ret;
+ int pagenr;
+ int stripe;
+ struct bio *bio;
+
+ ret = alloc_rbio_essential_pages(rbio);
+ if (ret)
+ goto cleanup;
+
+ bio_list_init(&bio_list);
+
+ atomic_set(&rbio->error, 0);
+ /*
+ * build a list of bios to read all the missing parts of this
+ * stripe
+ */
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+ struct page *page;
+ /*
+ * we want to find all the pages missing from
+ * the rbio and read them from the disk. If
+ * page_in_rbio finds a page in the bio list
+ * we don't need to read it off the stripe.
+ */
+ page = page_in_rbio(rbio, stripe, pagenr, 1);
+ if (page)
+ continue;
+
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ /*
+ * the bio cache may have handed us an uptodate
+ * page. If so, be happy and use it
+ */
+ if (PageUptodate(page))
+ continue;
+
+ ret = rbio_add_io_page(rbio, &bio_list, page,
+ stripe, pagenr, rbio->stripe_len);
+ if (ret)
+ goto cleanup;
+ }
+ }
+
+ bios_to_read = bio_list_size(&bio_list);
+ if (!bios_to_read) {
+ /*
+ * this can happen if others have merged with
+ * us, it means there is nothing left to read.
+ * But if there are missing devices it may not be
+ * safe to do the full stripe write yet.
+ */
+ goto finish;
+ }
+
+ /*
+ * the bbio may be freed once we submit the last bio. Make sure
+ * not to touch it after that
+ */
+ atomic_set(&rbio->stripes_pending, bios_to_read);
+ while (1) {
+ bio = bio_list_pop(&bio_list);
+ if (!bio)
+ break;
+
+ bio->bi_private = rbio;
+ bio->bi_end_io = raid56_parity_scrub_end_io;
+
+ btrfs_bio_wq_end_io(rbio->fs_info, bio,
+ BTRFS_WQ_ENDIO_RAID56);
+
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+ submit_bio(READ, bio);
+ }
+ /* the actual write will happen once the reads are done */
+ return;
+
+cleanup:
+ rbio_orig_end_io(rbio, -EIO, 0);
+ return;
+
+finish:
+ validate_rbio_for_parity_scrub(rbio);
+}
+
+static void scrub_parity_work(struct btrfs_work *work)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+ raid56_parity_scrub_stripe(rbio);
+}
+
+static void async_scrub_parity(struct btrfs_raid_bio *rbio)
+{
+ btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+ scrub_parity_work, NULL, NULL);
+
+ btrfs_queue_work(rbio->fs_info->rmw_workers,
+ &rbio->work);
+}
+
+void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
+{
+ if (!lock_stripe_add(rbio))
+ async_scrub_parity(rbio);
+}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index ea5d73bfdfbe..31d4a157b5e3 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -39,13 +39,25 @@ static inline int nr_data_stripes(struct map_lookup *map)
#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
((x) == RAID6_Q_STRIPE))
+struct btrfs_raid_bio;
+struct btrfs_device;
+
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len, int mirror_num);
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len, int mirror_num, int generic_io);
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 *raid_map,
u64 stripe_len);
+struct btrfs_raid_bio *
+raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len, struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors);
+void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
+ struct page *page, u64 logical);
+void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
+
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
#endif
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efa083113827..f2bb13a23f86 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -63,10 +63,18 @@ struct scrub_ctx;
*/
#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
+struct scrub_recover {
+ atomic_t refs;
+ struct btrfs_bio *bbio;
+ u64 *raid_map;
+ u64 map_length;
+};
+
struct scrub_page {
struct scrub_block *sblock;
struct page *page;
struct btrfs_device *dev;
+ struct list_head list;
u64 flags; /* extent flags */
u64 generation;
u64 logical;
@@ -79,6 +87,8 @@ struct scrub_page {
unsigned int io_error:1;
};
u8 csum[BTRFS_CSUM_SIZE];
+
+ struct scrub_recover *recover;
};
struct scrub_bio {
@@ -105,14 +115,52 @@ struct scrub_block {
atomic_t outstanding_pages;
atomic_t ref_count; /* free mem on transition to zero */
struct scrub_ctx *sctx;
+ struct scrub_parity *sparity;
struct {
unsigned int header_error:1;
unsigned int checksum_error:1;
unsigned int no_io_error_seen:1;
unsigned int generation_error:1; /* also sets header_error */
+
+ /* The following is for the data used to check parity */
+ /* It is for the data with checksum */
+ unsigned int data_corrected:1;
};
};
+/* Used for the chunks with parity stripe such RAID5/6 */
+struct scrub_parity {
+ struct scrub_ctx *sctx;
+
+ struct btrfs_device *scrub_dev;
+
+ u64 logic_start;
+
+ u64 logic_end;
+
+ int nsectors;
+
+ int stripe_len;
+
+ atomic_t ref_count;
+
+ struct list_head spages;
+
+ /* Work of parity check and repair */
+ struct btrfs_work work;
+
+ /* Mark the parity blocks which have data */
+ unsigned long *dbitmap;
+
+ /*
+ * Mark the parity blocks which have data, but errors happen when
+ * read data or check data
+ */
+ unsigned long *ebitmap;
+
+ unsigned long bitmap[0];
+};
+
struct scrub_wr_ctx {
struct scrub_bio *wr_curr_bio;
struct btrfs_device *tgtdev;
@@ -196,7 +244,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock, int is_metadata,
int have_csum, u8 *csum, u64 generation,
- u16 csum_size);
+ u16 csum_size, int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int is_metadata, int have_csum,
@@ -218,6 +266,8 @@ static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
+static void scrub_parity_get(struct scrub_parity *sparity);
+static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -790,6 +840,20 @@ out:
scrub_pending_trans_workers_dec(sctx);
}
+static inline void scrub_get_recover(struct scrub_recover *recover)
+{
+ atomic_inc(&recover->refs);
+}
+
+static inline void scrub_put_recover(struct scrub_recover *recover)
+{
+ if (atomic_dec_and_test(&recover->refs)) {
+ kfree(recover->bbio);
+ kfree(recover->raid_map);
+ kfree(recover);
+ }
+}
+
/*
* scrub_handle_errored_block gets called when either verification of the
* pages failed or the bio failed to read, e.g. with EIO. In the latter
@@ -906,7 +970,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
/* build and submit the bios for the failed mirror, check checksums */
scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
- csum, generation, sctx->csum_size);
+ csum, generation, sctx->csum_size, 1);
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
@@ -920,6 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
*/
spin_lock(&sctx->stat_lock);
sctx->stat.unverified_errors++;
+ sblock_to_check->data_corrected = 1;
spin_unlock(&sctx->stat_lock);
if (sctx->is_dev_replace)
@@ -1019,7 +1084,7 @@ nodatasum_case:
/* build and submit the bios, check checksums */
scrub_recheck_block(fs_info, sblock_other, is_metadata,
have_csum, csum, generation,
- sctx->csum_size);
+ sctx->csum_size, 0);
if (!sblock_other->header_error &&
!sblock_other->checksum_error &&
@@ -1169,7 +1234,7 @@ nodatasum_case:
*/
scrub_recheck_block(fs_info, sblock_bad,
is_metadata, have_csum, csum,
- generation, sctx->csum_size);
+ generation, sctx->csum_size, 1);
if (!sblock_bad->header_error &&
!sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen)
@@ -1180,6 +1245,7 @@ nodatasum_case:
corrected_error:
spin_lock(&sctx->stat_lock);
sctx->stat.corrected_errors++;
+ sblock_to_check->data_corrected = 1;
spin_unlock(&sctx->stat_lock);
printk_ratelimited_in_rcu(KERN_ERR
"BTRFS: fixed up error at logical %llu on dev %s\n",
@@ -1201,11 +1267,18 @@ out:
mirror_index++) {
struct scrub_block *sblock = sblocks_for_recheck +
mirror_index;
+ struct scrub_recover *recover;
int page_index;
for (page_index = 0; page_index < sblock->page_count;
page_index++) {
sblock->pagev[page_index]->sblock = NULL;
+ recover = sblock->pagev[page_index]->recover;
+ if (recover) {
+ scrub_put_recover(recover);
+ sblock->pagev[page_index]->recover =
+ NULL;
+ }
scrub_page_put(sblock->pagev[page_index]);
}
}
@@ -1215,14 +1288,63 @@ out:
return 0;
}
+static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
+{
+ if (raid_map) {
+ if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
+ return 3;
+ else
+ return 2;
+ } else {
+ return (int)bbio->num_stripes;
+ }
+}
+
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
+ u64 mapped_length,
+ int nstripes, int mirror,
+ int *stripe_index,
+ u64 *stripe_offset)
+{
+ int i;
+
+ if (raid_map) {
+ /* RAID5/6 */
+ for (i = 0; i < nstripes; i++) {
+ if (raid_map[i] == RAID6_Q_STRIPE ||
+ raid_map[i] == RAID5_P_STRIPE)
+ continue;
+
+ if (logical >= raid_map[i] &&
+ logical < raid_map[i] + mapped_length)
+ break;
+ }
+
+ *stripe_index = i;
+ *stripe_offset = logical - raid_map[i];
+ } else {
+ /* The other RAID type */
+ *stripe_index = mirror;
+ *stripe_offset = 0;
+ }
+}
+
static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info,
struct scrub_block *original_sblock,
u64 length, u64 logical,
struct scrub_block *sblocks_for_recheck)
{
+ struct scrub_recover *recover;
+ struct btrfs_bio *bbio;
+ u64 *raid_map;
+ u64 sublen;
+ u64 mapped_length;
+ u64 stripe_offset;
+ int stripe_index;
int page_index;
int mirror_index;
+ int nmirrors;
int ret;
/*
@@ -1233,23 +1355,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
page_index = 0;
while (length > 0) {
- u64 sublen = min_t(u64, length, PAGE_SIZE);
- u64 mapped_length = sublen;
- struct btrfs_bio *bbio = NULL;
+ sublen = min_t(u64, length, PAGE_SIZE);
+ mapped_length = sublen;
+ bbio = NULL;
+ raid_map = NULL;
/*
* with a length of PAGE_SIZE, each returned stripe
* represents one mirror
*/
- ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
- &mapped_length, &bbio, 0);
+ ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
+ &mapped_length, &bbio, 0, &raid_map);
if (ret || !bbio || mapped_length < sublen) {
kfree(bbio);
+ kfree(raid_map);
return -EIO;
}
+ recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
+ if (!recover) {
+ kfree(bbio);
+ kfree(raid_map);
+ return -ENOMEM;
+ }
+
+ atomic_set(&recover->refs, 1);
+ recover->bbio = bbio;
+ recover->raid_map = raid_map;
+ recover->map_length = mapped_length;
+
BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
- for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
+
+ nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
+ for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
struct scrub_page *page;
@@ -1265,26 +1403,38 @@ leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
- kfree(bbio);
+ scrub_put_recover(recover);
return -ENOMEM;
}
scrub_page_get(page);
sblock->pagev[page_index] = page;
page->logical = logical;
- page->physical = bbio->stripes[mirror_index].physical;
+
+ scrub_stripe_index_and_offset(logical, raid_map,
+ mapped_length,
+ bbio->num_stripes,
+ mirror_index,
+ &stripe_index,
+ &stripe_offset);
+ page->physical = bbio->stripes[stripe_index].physical +
+ stripe_offset;
+ page->dev = bbio->stripes[stripe_index].dev;
+
BUG_ON(page_index >= original_sblock->page_count);
page->physical_for_dev_replace =
original_sblock->pagev[page_index]->
physical_for_dev_replace;
/* for missing devices, dev->bdev is NULL */
- page->dev = bbio->stripes[mirror_index].dev;
page->mirror_num = mirror_index + 1;
sblock->page_count++;
page->page = alloc_page(GFP_NOFS);
if (!page->page)
goto leave_nomem;
+
+ scrub_get_recover(recover);
+ page->recover = recover;
}
- kfree(bbio);
+ scrub_put_recover(recover);
length -= sublen;
logical += sublen;
page_index++;
@@ -1293,6 +1443,51 @@ leave_nomem:
return 0;
}
+struct scrub_bio_ret {
+ struct completion event;
+ int error;
+};
+
+static void scrub_bio_wait_endio(struct bio *bio, int error)
+{
+ struct scrub_bio_ret *ret = bio->bi_private;
+
+ ret->error = error;
+ complete(&ret->event);
+}
+
+static inline int scrub_is_page_on_raid56(struct scrub_page *page)
+{
+ return page->recover && page->recover->raid_map;
+}
+
+static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
+ struct bio *bio,
+ struct scrub_page *page)
+{
+ struct scrub_bio_ret done;
+ int ret;
+
+ init_completion(&done.event);
+ done.error = 0;
+ bio->bi_iter.bi_sector = page->logical >> 9;
+ bio->bi_private = &done;
+ bio->bi_end_io = scrub_bio_wait_endio;
+
+ ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
+ page->recover->raid_map,
+ page->recover->map_length,
+ page->mirror_num, 0);
+ if (ret)
+ return ret;
+
+ wait_for_completion(&done.event);
+ if (done.error)
+ return -EIO;
+
+ return 0;
+}
+
/*
* this function will check the on disk data for checksum errors, header
* errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1303,7 +1498,7 @@ leave_nomem:
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock, int is_metadata,
int have_csum, u8 *csum, u64 generation,
- u16 csum_size)
+ u16 csum_size, int retry_failed_mirror)
{
int page_num;
@@ -1329,11 +1524,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
continue;
}
bio->bi_bdev = page->dev->bdev;
- bio->bi_iter.bi_sector = page->physical >> 9;
bio_add_page(bio, page->page, PAGE_SIZE, 0);
- if (btrfsic_submit_bio_wait(READ, bio))
- sblock->no_io_error_seen = 0;
+ if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
+ if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
+ sblock->no_io_error_seen = 0;
+ } else {
+ bio->bi_iter.bi_sector = page->physical >> 9;
+
+ if (btrfsic_submit_bio_wait(READ, bio))
+ sblock->no_io_error_seen = 0;
+ }
bio_put(bio);
}
@@ -1486,6 +1687,13 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
{
int page_num;
+ /*
+ * This block is used for the check of the parity on the source device,
+ * so the data needn't be written into the destination device.
+ */
+ if (sblock->sparity)
+ return;
+
for (page_num = 0; page_num < sblock->page_count; page_num++) {
int ret;
@@ -1867,6 +2075,9 @@ static void scrub_block_put(struct scrub_block *sblock)
if (atomic_dec_and_test(&sblock->ref_count)) {
int i;
+ if (sblock->sparity)
+ scrub_parity_put(sblock->sparity);
+
for (i = 0; i < sblock->page_count; i++)
scrub_page_put(sblock->pagev[i]);
kfree(sblock);
@@ -2124,9 +2335,51 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
scrub_pending_bio_dec(sctx);
}
+static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
+ unsigned long *bitmap,
+ u64 start, u64 len)
+{
+ int offset;
+ int nsectors;
+ int sectorsize = sparity->sctx->dev_root->sectorsize;
+
+ if (len >= sparity->stripe_len) {
+ bitmap_set(bitmap, 0, sparity->nsectors);
+ return;
+ }
+
+ start -= sparity->logic_start;
+ offset = (int)do_div(start, sparity->stripe_len);
+ offset /= sectorsize;
+ nsectors = (int)len / sectorsize;
+
+ if (offset + nsectors <= sparity->nsectors) {
+ bitmap_set(bitmap, offset, nsectors);
+ return;
+ }
+
+ bitmap_set(bitmap, offset, sparity->nsectors - offset);
+ bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
+}
+
+static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
+ u64 start, u64 len)
+{
+ __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
+}
+
+static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
+ u64 start, u64 len)
+{
+ __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
+}
+
static void scrub_block_complete(struct scrub_block *sblock)
{
+ int corrupted = 0;
+
if (!sblock->no_io_error_seen) {
+ corrupted = 1;
scrub_handle_errored_block(sblock);
} else {
/*
@@ -2134,9 +2387,19 @@ static void scrub_block_complete(struct scrub_block *sblock)
* dev replace case, otherwise write here in dev replace
* case.
*/
- if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
+ corrupted = scrub_checksum(sblock);
+ if (!corrupted && sblock->sctx->is_dev_replace)
scrub_write_block_to_dev_replace(sblock);
}
+
+ if (sblock->sparity && corrupted && !sblock->data_corrected) {
+ u64 start = sblock->pagev[0]->logical;
+ u64 end = sblock->pagev[sblock->page_count - 1]->logical +
+ PAGE_SIZE;
+
+ scrub_parity_mark_sectors_error(sblock->sparity,
+ start, end - start);
+ }
}
static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -2228,6 +2491,132 @@ behind_scrub_pages:
return 0;
}
+static int scrub_pages_for_parity(struct scrub_parity *sparity,
+ u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev,
+ u64 flags, u64 gen, int mirror_num, u8 *csum)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ struct scrub_block *sblock;
+ int index;
+
+ sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+ if (!sblock) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
+ /* one ref inside this function, plus one for each page added to
+ * a bio later on */
+ atomic_set(&sblock->ref_count, 1);
+ sblock->sctx = sctx;
+ sblock->no_io_error_seen = 1;
+ sblock->sparity = sparity;
+ scrub_parity_get(sparity);
+
+ for (index = 0; len > 0; index++) {
+ struct scrub_page *spage;
+ u64 l = min_t(u64, len, PAGE_SIZE);
+
+ spage = kzalloc(sizeof(*spage), GFP_NOFS);
+ if (!spage) {
+leave_nomem:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ scrub_block_put(sblock);
+ return -ENOMEM;
+ }
+ BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ /* For scrub block */
+ scrub_page_get(spage);
+ sblock->pagev[index] = spage;
+ /* For scrub parity */
+ scrub_page_get(spage);
+ list_add_tail(&spage->list, &sparity->spages);
+ spage->sblock = sblock;
+ spage->dev = dev;
+ spage->flags = flags;
+ spage->generation = gen;
+ spage->logical = logical;
+ spage->physical = physical;
+ spage->mirror_num = mirror_num;
+ if (csum) {
+ spage->have_csum = 1;
+ memcpy(spage->csum, csum, sctx->csum_size);
+ } else {
+ spage->have_csum = 0;
+ }
+ sblock->page_count++;
+ spage->page = alloc_page(GFP_NOFS);
+ if (!spage->page)
+ goto leave_nomem;
+ len -= l;
+ logical += l;
+ physical += l;
+ }
+
+ WARN_ON(sblock->page_count == 0);
+ for (index = 0; index < sblock->page_count; index++) {
+ struct scrub_page *spage = sblock->pagev[index];
+ int ret;
+
+ ret = scrub_add_page_to_rd_bio(sctx, spage);
+ if (ret) {
+ scrub_block_put(sblock);
+ return ret;
+ }
+ }
+
+ /* last one frees, either here or in bio completion for last page */
+ scrub_block_put(sblock);
+ return 0;
+}
+
+static int scrub_extent_for_parity(struct scrub_parity *sparity,
+ u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev,
+ u64 flags, u64 gen, int mirror_num)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ int ret;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 blocksize;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ blocksize = sctx->sectorsize;
+ } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ blocksize = sctx->nodesize;
+ } else {
+ blocksize = sctx->sectorsize;
+ WARN_ON(1);
+ }
+
+ while (len) {
+ u64 l = min_t(u64, len, blocksize);
+ int have_csum = 0;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ /* push csums to sbio */
+ have_csum = scrub_find_csum(sctx, logical, l, csum);
+ if (have_csum == 0)
+ goto skip;
+ }
+ ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
+ flags, gen, mirror_num,
+ have_csum ? csum : NULL);
+skip:
+ if (ret)
+ return ret;
+ len -= l;
+ logical += l;
+ physical += l;
+ }
+ return 0;
+}
+
/*
* Given a physical address, this will calculate it's
* logical offset. if this is a parity stripe, it will return
@@ -2236,7 +2625,8 @@ behind_scrub_pages:
* return 0 if it is a data stripe, 1 means parity stripe.
*/
static int get_raid56_logic_offset(u64 physical, int num,
- struct map_lookup *map, u64 *offset)
+ struct map_lookup *map, u64 *offset,
+ u64 *stripe_start)
{
int i;
int j = 0;
@@ -2247,6 +2637,9 @@ static int get_raid56_logic_offset(u64 physical, int num,
last_offset = (physical - map->stripes[num].physical) *
nr_data_stripes(map);
+ if (stripe_start)
+ *stripe_start = last_offset;
+
*offset = last_offset;
for (i = 0; i < nr_data_stripes(map); i++) {
*offset = last_offset + i * map->stripe_len;
@@ -2269,13 +2662,330 @@ static int get_raid56_logic_offset(u64 physical, int num,
return 1;
}
+static void scrub_free_parity(struct scrub_parity *sparity)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ struct scrub_page *curr, *next;
+ int nbits;
+
+ nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
+ if (nbits) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors += nbits;
+ sctx->stat.uncorrectable_errors += nbits;
+ spin_unlock(&sctx->stat_lock);
+ }
+
+ list_for_each_entry_safe(curr, next, &sparity->spages, list) {
+ list_del_init(&curr->list);
+ scrub_page_put(curr);
+ }
+
+ kfree(sparity);
+}
+
+static void scrub_parity_bio_endio(struct bio *bio, int error)
+{
+ struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
+ struct scrub_ctx *sctx = sparity->sctx;
+
+ if (error)
+ bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
+ sparity->nsectors);
+
+ scrub_free_parity(sparity);
+ scrub_pending_bio_dec(sctx);
+ bio_put(bio);
+}
+
+static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ struct bio *bio;
+ struct btrfs_raid_bio *rbio;
+ struct scrub_page *spage;
+ struct btrfs_bio *bbio = NULL;
+ u64 *raid_map = NULL;
+ u64 length;
+ int ret;
+
+ if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
+ sparity->nsectors))
+ goto out;
+
+ length = sparity->logic_end - sparity->logic_start + 1;
+ ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
+ sparity->logic_start,
+ &length, &bbio, 0, &raid_map);
+ if (ret || !bbio || !raid_map)
+ goto bbio_out;
+
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+ if (!bio)
+ goto bbio_out;
+
+ bio->bi_iter.bi_sector = sparity->logic_start >> 9;
+ bio->bi_private = sparity;
+ bio->bi_end_io = scrub_parity_bio_endio;
+
+ rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
+ raid_map, length,
+ sparity->scrub_dev,
+ sparity->dbitmap,
+ sparity->nsectors);
+ if (!rbio)
+ goto rbio_out;
+
+ list_for_each_entry(spage, &sparity->spages, list)
+ raid56_parity_add_scrub_pages(rbio, spage->page,
+ spage->logical);
+
+ scrub_pending_bio_inc(sctx);
+ raid56_parity_submit_scrub_rbio(rbio);
+ return;
+
+rbio_out:
+ bio_put(bio);
+bbio_out:
+ kfree(bbio);
+ kfree(raid_map);
+ bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
+ sparity->nsectors);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+out:
+ scrub_free_parity(sparity);
+}
+
+static inline int scrub_calc_parity_bitmap_len(int nsectors)
+{
+ return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
+}
+
+static void scrub_parity_get(struct scrub_parity *sparity)
+{
+ atomic_inc(&sparity->ref_count);
+}
+
+static void scrub_parity_put(struct scrub_parity *sparity)
+{
+ if (!atomic_dec_and_test(&sparity->ref_count))
+ return;
+
+ scrub_parity_check_and_repair(sparity);
+}
+
+static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
+ struct map_lookup *map,
+ struct btrfs_device *sdev,
+ struct btrfs_path *path,
+ u64 logic_start,
+ u64 logic_end)
+{
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *csum_root = fs_info->csum_root;
+ struct btrfs_extent_item *extent;
+ u64 flags;
+ int ret;
+ int slot;
+ struct extent_buffer *l;
+ struct btrfs_key key;
+ u64 generation;
+ u64 extent_logical;
+ u64 extent_physical;
+ u64 extent_len;
+ struct btrfs_device *extent_dev;
+ struct scrub_parity *sparity;
+ int nsectors;
+ int bitmap_len;
+ int extent_mirror_num;
+ int stop_loop = 0;
+
+ nsectors = map->stripe_len / root->sectorsize;
+ bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
+ sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
+ GFP_NOFS);
+ if (!sparity) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
+ sparity->stripe_len = map->stripe_len;
+ sparity->nsectors = nsectors;
+ sparity->sctx = sctx;
+ sparity->scrub_dev = sdev;
+ sparity->logic_start = logic_start;
+ sparity->logic_end = logic_end;
+ atomic_set(&sparity->ref_count, 1);
+ INIT_LIST_HEAD(&sparity->spages);
+ sparity->dbitmap = sparity->bitmap;
+ sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
+
+ ret = 0;
+ while (logic_start < logic_end) {
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.objectid = logic_start;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = btrfs_previous_extent_item(root, path, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(NULL, root, &key,
+ path, 0, 0);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ stop_loop = 0;
+ while (1) {
+ u64 bytes;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+
+ stop_loop = 1;
+ break;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ bytes = root->nodesize;
+ else
+ bytes = key.offset;
+
+ if (key.objectid + bytes <= logic_start)
+ goto next;
+
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY)
+ goto next;
+
+ if (key.objectid > logic_end) {
+ stop_loop = 1;
+ break;
+ }
+
+ while (key.objectid >= logic_start + map->stripe_len)
+ logic_start += map->stripe_len;
+
+ extent = btrfs_item_ptr(l, slot,
+ struct btrfs_extent_item);
+ flags = btrfs_extent_flags(l, extent);
+ generation = btrfs_extent_generation(l, extent);
+
+ if (key.objectid < logic_start &&
+ (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+ btrfs_err(fs_info,
+ "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+ key.objectid, logic_start);
+ goto next;
+ }
+again:
+ extent_logical = key.objectid;
+ extent_len = bytes;
+
+ if (extent_logical < logic_start) {
+ extent_len -= logic_start - extent_logical;
+ extent_logical = logic_start;
+ }
+
+ if (extent_logical + extent_len >
+ logic_start + map->stripe_len)
+ extent_len = logic_start + map->stripe_len -
+ extent_logical;
+
+ scrub_parity_mark_sectors_data(sparity, extent_logical,
+ extent_len);
+
+ scrub_remap_extent(fs_info, extent_logical,
+ extent_len, &extent_physical,
+ &extent_dev,
+ &extent_mirror_num);
+
+ ret = btrfs_lookup_csums_range(csum_root,
+ extent_logical,
+ extent_logical + extent_len - 1,
+ &sctx->csum_list, 1);
+ if (ret)
+ goto out;
+
+ ret = scrub_extent_for_parity(sparity, extent_logical,
+ extent_len,
+ extent_physical,
+ extent_dev, flags,
+ generation,
+ extent_mirror_num);
+ if (ret)
+ goto out;
+
+ scrub_free_csums(sctx);
+ if (extent_logical + extent_len <
+ key.objectid + bytes) {
+ logic_start += map->stripe_len;
+
+ if (logic_start >= logic_end) {
+ stop_loop = 1;
+ break;
+ }
+
+ if (logic_start < key.objectid + bytes) {
+ cond_resched();
+ goto again;
+ }
+ }
+next:
+ path->slots[0]++;
+ }
+
+ btrfs_release_path(path);
+
+ if (stop_loop)
+ break;
+
+ logic_start += map->stripe_len;
+ }
+out:
+ if (ret < 0)
+ scrub_parity_mark_sectors_error(sparity, logic_start,
+ logic_end - logic_start + 1);
+ scrub_parity_put(sparity);
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+ btrfs_release_path(path);
+ return ret < 0 ? ret : 0;
+}
+
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
int num, u64 base, u64 length,
int is_dev_replace)
{
- struct btrfs_path *path;
+ struct btrfs_path *path, *ppath;
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
@@ -2302,6 +3012,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 extent_logical;
u64 extent_physical;
u64 extent_len;
+ u64 stripe_logical;
+ u64 stripe_end;
struct btrfs_device *extent_dev;
int extent_mirror_num;
int stop_loop = 0;
@@ -2327,7 +3039,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
mirror_num = num % map->num_stripes + 1;
} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_RAID6)) {
- get_raid56_logic_offset(physical, num, map, &offset);
+ get_raid56_logic_offset(physical, num, map, &offset, NULL);
increment = map->stripe_len * nr_data_stripes(map);
mirror_num = 1;
} else {
@@ -2339,6 +3051,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (!path)
return -ENOMEM;
+ ppath = btrfs_alloc_path();
+ if (!ppath) {
+ btrfs_free_path(ppath);
+ return -ENOMEM;
+ }
+
/*
* work on commit root. The related disk blocks are static as
* long as COW is applied. This means, it is save to rewrite
@@ -2357,7 +3075,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_RAID6)) {
get_raid56_logic_offset(physical_end, num,
- map, &logic_end);
+ map, &logic_end, NULL);
logic_end += base;
} else {
logic_end = logical + increment * nstripes;
@@ -2404,10 +3122,18 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_RAID6)) {
ret = get_raid56_logic_offset(physical, num,
- map, &logical);
+ map, &logical, &stripe_logical);
logical += base;
- if (ret)
+ if (ret) {
+ stripe_logical += base;
+ stripe_end = stripe_logical + increment - 1;
+ ret = scrub_raid56_parity(sctx, map, scrub_dev,
+ ppath, stripe_logical,
+ stripe_end);
+ if (ret)
+ goto out;
goto skip;
+ }
}
/*
* canceled?
@@ -2558,13 +3284,25 @@ again:
* loop until we find next data stripe
* or we have finished all stripes.
*/
- do {
- physical += map->stripe_len;
- ret = get_raid56_logic_offset(
- physical, num,
- map, &logical);
- logical += base;
- } while (physical < physical_end && ret);
+loop:
+ physical += map->stripe_len;
+ ret = get_raid56_logic_offset(physical,
+ num, map, &logical,
+ &stripe_logical);
+ logical += base;
+
+ if (ret && physical < physical_end) {
+ stripe_logical += base;
+ stripe_end = stripe_logical +
+ increment - 1;
+ ret = scrub_raid56_parity(sctx,
+ map, scrub_dev, ppath,
+ stripe_logical,
+ stripe_end);
+ if (ret)
+ goto out;
+ goto loop;
+ }
} else {
physical += map->stripe_len;
logical += increment;
@@ -2605,6 +3343,7 @@ out:
blk_finish_plug(&plug);
btrfs_free_path(path);
+ btrfs_free_path(ppath);
return ret < 0 ? ret : 0;
}
@@ -3310,6 +4049,50 @@ out:
scrub_pending_trans_workers_dec(sctx);
}
+static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
+ u64 logical)
+{
+ struct extent_state *cached_state = NULL;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_io_tree *io_tree;
+ struct extent_map *em;
+ u64 lockstart = start, lockend = start + len - 1;
+ int ret = 0;
+
+ io_tree = &BTRFS_I(inode)->io_tree;
+
+ lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ ret = 1;
+ goto out_unlock;
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock;
+ }
+
+ /*
+ * This extent does not actually cover the logical extent anymore,
+ * move on to the next inode.
+ */
+ if (em->block_start > logical ||
+ em->block_start + em->block_len < logical + len) {
+ free_extent_map(em);
+ ret = 1;
+ goto out_unlock;
+ }
+ free_extent_map(em);
+
+out_unlock:
+ unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
+ GFP_NOFS);
+ return ret;
+}
+
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
struct scrub_copy_nocow_ctx *nocow_ctx)
{
@@ -3318,13 +4101,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
struct inode *inode;
struct page *page;
struct btrfs_root *local_root;
- struct btrfs_ordered_extent *ordered;
- struct extent_map *em;
- struct extent_state *cached_state = NULL;
struct extent_io_tree *io_tree;
u64 physical_for_dev_replace;
+ u64 nocow_ctx_logical;
u64 len = nocow_ctx->len;
- u64 lockstart = offset, lockend = offset + len - 1;
unsigned long index;
int srcu_index;
int ret = 0;
@@ -3356,30 +4136,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
io_tree = &BTRFS_I(inode)->io_tree;
+ nocow_ctx_logical = nocow_ctx->logical;
- lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
- if (ordered) {
- btrfs_put_ordered_extent(ordered);
- goto out_unlock;
- }
-
- em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out_unlock;
- }
-
- /*
- * This extent does not actually cover the logical extent anymore,
- * move on to the next inode.
- */
- if (em->block_start > nocow_ctx->logical ||
- em->block_start + em->block_len < nocow_ctx->logical + len) {
- free_extent_map(em);
- goto out_unlock;
+ ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
+ if (ret) {
+ ret = ret > 0 ? 0 : ret;
+ goto out;
}
- free_extent_map(em);
while (len >= PAGE_CACHE_SIZE) {
index = offset >> PAGE_CACHE_SHIFT;
@@ -3396,7 +4159,7 @@ again:
goto next_page;
} else {
ClearPageError(page);
- err = extent_read_full_page_nolock(io_tree, page,
+ err = extent_read_full_page(io_tree, page,
btrfs_get_extent,
nocow_ctx->mirror_num);
if (err) {
@@ -3421,6 +4184,14 @@ again:
goto next_page;
}
}
+
+ ret = check_extent_to_block(inode, offset, len,
+ nocow_ctx_logical);
+ if (ret) {
+ ret = ret > 0 ? 0 : ret;
+ goto next_page;
+ }
+
err = write_page_nocow(nocow_ctx->sctx,
physical_for_dev_replace, page);
if (err)
@@ -3434,12 +4205,10 @@ next_page:
offset += PAGE_CACHE_SIZE;
physical_for_dev_replace += PAGE_CACHE_SIZE;
+ nocow_ctx_logical += PAGE_CACHE_SIZE;
len -= PAGE_CACHE_SIZE;
}
ret = COPY_COMPLETE;
-out_unlock:
- unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
- GFP_NOFS);
out:
mutex_unlock(&inode->i_mutex);
iput(inode);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 874828dd0a86..804432dbc351 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5507,6 +5507,51 @@ out:
return ret;
}
+/*
+ * If orphan cleanup did remove any orphans from a root, it means the tree
+ * was modified and therefore the commit root is not the same as the current
+ * root anymore. This is a problem, because send uses the commit root and
+ * therefore can see inode items that don't exist in the current root anymore,
+ * and for example make calls to btrfs_iget, which will do tree lookups based
+ * on the current root and not on the commit root. Those lookups will fail,
+ * returning a -ESTALE error, and making send fail with that error. So make
+ * sure a send does not see any orphans we have just removed, and that it will
+ * see the same inodes regardless of whether a transaction commit happened
+ * before it started (meaning that the commit root will be the same as the
+ * current root) or not.
+ */
+static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
+{
+ int i;
+ struct btrfs_trans_handle *trans = NULL;
+
+again:
+ if (sctx->parent_root &&
+ sctx->parent_root->node != sctx->parent_root->commit_root)
+ goto commit_trans;
+
+ for (i = 0; i < sctx->clone_roots_cnt; i++)
+ if (sctx->clone_roots[i].root->node !=
+ sctx->clone_roots[i].root->commit_root)
+ goto commit_trans;
+
+ if (trans)
+ return btrfs_end_transaction(trans, sctx->send_root);
+
+ return 0;
+
+commit_trans:
+ /* Use any root, all fs roots will get their commit roots updated. */
+ if (!trans) {
+ trans = btrfs_join_transaction(sctx->send_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ goto again;
+ }
+
+ return btrfs_commit_transaction(trans, sctx->send_root);
+}
+
static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
{
spin_lock(&root->root_item_lock);
@@ -5728,6 +5773,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
NULL);
sort_clone_roots = 1;
+ ret = ensure_commit_roots_uptodate(sctx);
+ if (ret)
+ goto out;
+
current->journal_info = BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
current->journal_info = NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a2b97ef10317..60f7cbe815e9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -262,7 +262,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
trans->aborted = errno;
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
- if (!trans->blocks_used) {
+ if (!trans->blocks_used && list_empty(&trans->new_bgs)) {
const char *errstr;
errstr = btrfs_decode_error(errno);
@@ -642,11 +642,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
"disabling disk space caching");
break;
case Opt_inode_cache:
- btrfs_set_and_info(root, CHANGE_INODE_CACHE,
+ btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
"enabling inode map caching");
break;
case Opt_noinode_cache:
- btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
+ btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
"disabling inode map caching");
break;
case Opt_clear_cache:
@@ -993,9 +993,17 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
/* no transaction, don't bother */
- if (PTR_ERR(trans) == -ENOENT)
- return 0;
- return PTR_ERR(trans);
+ if (PTR_ERR(trans) == -ENOENT) {
+ /*
+ * Exit unless we have some pending changes
+ * that need to go through commit
+ */
+ if (fs_info->pending_changes == 0)
+ return 0;
+ trans = btrfs_start_transaction(root, 0);
+ } else {
+ return PTR_ERR(trans);
+ }
}
return btrfs_commit_transaction(trans, root);
}
@@ -1644,8 +1652,20 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
int i = 0, nr_devices;
int ret;
+ /*
+ * We aren't under the device list lock, so this is racey-ish, but good
+ * enough for our purposes.
+ */
nr_devices = fs_info->fs_devices->open_devices;
- BUG_ON(!nr_devices);
+ if (!nr_devices) {
+ smp_mb();
+ nr_devices = fs_info->fs_devices->open_devices;
+ ASSERT(nr_devices);
+ if (!nr_devices) {
+ *free_bytes = 0;
+ return 0;
+ }
+ }
devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
GFP_NOFS);
@@ -1670,11 +1690,17 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
else
min_stripe_size = BTRFS_STRIPE_LEN;
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (fs_info->alloc_start)
+ mutex_lock(&fs_devices->device_list_mutex);
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (!device->in_fs_metadata || !device->bdev ||
device->is_tgtdev_for_dev_replace)
continue;
+ if (i >= nr_devices)
+ break;
+
avail_space = device->total_bytes - device->bytes_used;
/* align with stripe_len */
@@ -1689,24 +1715,32 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
skip_space = 1024 * 1024;
/* user can set the offset in fs_info->alloc_start. */
- if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
- device->total_bytes)
+ if (fs_info->alloc_start &&
+ fs_info->alloc_start + BTRFS_STRIPE_LEN <=
+ device->total_bytes) {
+ rcu_read_unlock();
skip_space = max(fs_info->alloc_start, skip_space);
- /*
- * btrfs can not use the free space in [0, skip_space - 1],
- * we must subtract it from the total. In order to implement
- * it, we account the used space in this range first.
- */
- ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
- &used_space);
- if (ret) {
- kfree(devices_info);
- return ret;
- }
+ /*
+ * btrfs can not use the free space in
+ * [0, skip_space - 1], we must subtract it from the
+ * total. In order to implement it, we account the used
+ * space in this range first.
+ */
+ ret = btrfs_account_dev_extents_size(device, 0,
+ skip_space - 1,
+ &used_space);
+ if (ret) {
+ kfree(devices_info);
+ mutex_unlock(&fs_devices->device_list_mutex);
+ return ret;
+ }
+
+ rcu_read_lock();
- /* calc the free space in [0, skip_space - 1] */
- skip_space -= used_space;
+ /* calc the free space in [0, skip_space - 1] */
+ skip_space -= used_space;
+ }
/*
* we can use the free space in [0, skip_space - 1], subtract
@@ -1725,6 +1759,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
i++;
}
+ rcu_read_unlock();
+ if (fs_info->alloc_start)
+ mutex_unlock(&fs_devices->device_list_mutex);
nr_devices = i;
@@ -1787,8 +1824,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
* holding chunk_muext to avoid allocating new chunks, holding
* device_list_mutex to avoid the device being removed
*/
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- mutex_lock(&fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1824,17 +1859,12 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bfree -= block_rsv->size >> bits;
spin_unlock(&block_rsv->lock);
- buf->f_bavail = total_free_data;
+ buf->f_bavail = div_u64(total_free_data, factor);
ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
- if (ret) {
- mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ if (ret)
return ret;
- }
buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
- mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bsize = dentry->d_sb->s_blocksize;
@@ -2151,6 +2181,7 @@ static void __exit exit_btrfs_fs(void)
extent_map_exit();
extent_io_exit();
btrfs_interface_exit();
+ btrfs_end_io_wq_exit();
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b2e7bb4393f6..92db3f648df4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -111,7 +111,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info;
struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
- struct btrfs_trans_handle *trans;
u64 features, set, clear;
unsigned long val;
int ret;
@@ -153,10 +152,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
btrfs_info(fs_info, "%s %s feature flag",
val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);
- trans = btrfs_start_transaction(fs_info->fs_root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
spin_lock(&fs_info->super_lock);
features = get_features(fs_info, fa->feature_set);
if (val)
@@ -166,9 +161,11 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
set_features(fs_info, fa->feature_set, features);
spin_unlock(&fs_info->super_lock);
- ret = btrfs_commit_transaction(trans, fs_info->fs_root);
- if (ret)
- return ret;
+ /*
+ * We don't want to do full transaction commit from inside sysfs
+ */
+ btrfs_set_pending(fs_info, COMMIT);
+ wake_up_process(fs_info->transaction_kthread);
return count;
}
@@ -372,9 +369,6 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
const char *buf, size_t len)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = fs_info->fs_root;
- int ret;
size_t p_len;
if (fs_info->sb->s_flags & MS_RDONLY)
@@ -389,20 +383,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
if (p_len >= BTRFS_LABEL_SIZE)
return -EINVAL;
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- spin_lock(&root->fs_info->super_lock);
+ spin_lock(&fs_info->super_lock);
memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
memcpy(fs_info->super_copy->label, buf, p_len);
- spin_unlock(&root->fs_info->super_lock);
- ret = btrfs_commit_transaction(trans, root);
+ spin_unlock(&fs_info->super_lock);
- if (!ret)
- return len;
+ /*
+ * We don't want to do full transaction commit from inside sysfs
+ */
+ btrfs_set_pending(fs_info, COMMIT);
+ wake_up_process(fs_info->transaction_kthread);
- return ret;
+ return len;
}
BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dcaae3616728..a605d4e2f2bc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
}
}
+static void clear_btree_io_tree(struct extent_io_tree *tree)
+{
+ spin_lock(&tree->lock);
+ while (!RB_EMPTY_ROOT(&tree->state)) {
+ struct rb_node *node;
+ struct extent_state *state;
+
+ node = rb_first(&tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ /*
+ * btree io trees aren't supposed to have tasks waiting for
+ * changes in the flags of extent states ever.
+ */
+ ASSERT(!waitqueue_active(&state->wq));
+ free_extent_state(state);
+ if (need_resched()) {
+ spin_unlock(&tree->lock);
+ cond_resched();
+ spin_lock(&tree->lock);
+ }
+ }
+ spin_unlock(&tree->lock);
+}
+
static noinline void switch_commit_roots(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info)
{
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
root->commit_root = btrfs_root_node(root);
if (is_fstree(root->objectid))
btrfs_unpin_free_ino(root);
+ clear_btree_io_tree(&root->dirty_log_pages);
}
up_write(&fs_info->commit_root_sem);
}
@@ -220,6 +247,7 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits);
+ INIT_LIST_HEAD(&cur_trans->pending_ordered);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -488,6 +516,7 @@ again:
h->sync = false;
INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
+ INIT_LIST_HEAD(&h->ordered);
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -719,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
+ if (!list_empty(&trans->ordered)) {
+ spin_lock(&info->trans_lock);
+ list_splice(&trans->ordered, &cur_trans->pending_ordered);
+ spin_unlock(&info->trans_lock);
+ }
+
trans->delayed_ref_updates = 0;
if (!trans->sync) {
must_run_delayed_refs =
@@ -828,17 +863,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
mark, &cached_state)) {
- convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
- mark, &cached_state, GFP_NOFS);
- cached_state = NULL;
- err = filemap_fdatawrite_range(mapping, start, end);
+ bool wait_writeback = false;
+
+ err = convert_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT,
+ mark, &cached_state, GFP_NOFS);
+ /*
+ * convert_extent_bit can return -ENOMEM, which is most of the
+ * time a temporary error. So when it happens, ignore the error
+ * and wait for writeback of this range to finish - because we
+ * failed to set the bit EXTENT_NEED_WAIT for the range, a call
+ * to btrfs_wait_marked_extents() would not know that writeback
+ * for this range started and therefore wouldn't wait for it to
+ * finish - we don't want to commit a superblock that points to
+ * btree nodes/leafs for which writeback hasn't finished yet
+ * (and without errors).
+ * We cleanup any entries left in the io tree when committing
+ * the transaction (through clear_btree_io_tree()).
+ */
+ if (err == -ENOMEM) {
+ err = 0;
+ wait_writeback = true;
+ }
+ if (!err)
+ err = filemap_fdatawrite_range(mapping, start, end);
if (err)
werr = err;
+ else if (wait_writeback)
+ werr = filemap_fdatawait_range(mapping, start, end);
+ free_extent_state(cached_state);
+ cached_state = NULL;
cond_resched();
start = end + 1;
}
- if (err)
- werr = err;
return werr;
}
@@ -862,11 +919,25 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
EXTENT_NEED_WAIT, &cached_state)) {
- clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
- 0, 0, &cached_state, GFP_NOFS);
- err = filemap_fdatawait_range(mapping, start, end);
+ /*
+ * Ignore -ENOMEM errors returned by clear_extent_bit().
+ * When committing the transaction, we'll remove any entries
+ * left in the io tree. For a log commit, we don't remove them
+ * after committing the log because the tree can be accessed
+ * concurrently - we do it only at transaction commit time when
+ * it's safe to do it (through clear_btree_io_tree()).
+ */
+ err = clear_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT,
+ 0, 0, &cached_state, GFP_NOFS);
+ if (err == -ENOMEM)
+ err = 0;
+ if (!err)
+ err = filemap_fdatawait_range(mapping, start, end);
if (err)
werr = err;
+ free_extent_state(cached_state);
+ cached_state = NULL;
cond_resched();
start = end + 1;
}
@@ -919,17 +990,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
return 0;
}
-int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- if (!trans || !trans->transaction) {
- struct inode *btree_inode;
- btree_inode = root->fs_info->btree_inode;
- return filemap_write_and_wait(btree_inode->i_mapping);
- }
- return btrfs_write_and_wait_marked_extents(root,
+ int ret;
+
+ ret = btrfs_write_and_wait_marked_extents(root,
&trans->transaction->dirty_pages,
EXTENT_DIRTY);
+ clear_btree_io_tree(&trans->transaction->dirty_pages);
+
+ return ret;
}
/*
@@ -1652,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, -1);
}
+static inline void
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ spin_lock(&fs_info->trans_lock);
+ while (!list_empty(&cur_trans->pending_ordered)) {
+ ordered = list_first_entry(&cur_trans->pending_ordered,
+ struct btrfs_ordered_extent,
+ trans_list);
+ list_del_init(&ordered->trans_list);
+ spin_unlock(&fs_info->trans_lock);
+
+ wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+ &ordered->flags));
+ btrfs_put_ordered_extent(ordered);
+ spin_lock(&fs_info->trans_lock);
+ }
+ spin_unlock(&fs_info->trans_lock);
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -1702,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
spin_lock(&root->fs_info->trans_lock);
+ list_splice(&trans->ordered, &cur_trans->pending_ordered);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
spin_unlock(&root->fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
@@ -1754,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_wait_delalloc_flush(root->fs_info);
+ btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+
btrfs_scrub_pause(root);
/*
* Ok now we need to make sure to block out any other joins while we
@@ -1842,13 +1938,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
/*
- * Since the transaction is done, we should set the inode map cache flag
- * before any other comming transaction.
+ * Since the transaction is done, we can apply the pending changes
+ * before the next transaction.
*/
- if (btrfs_test_opt(root, CHANGE_INODE_CACHE))
- btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
- else
- btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
+ btrfs_apply_pending_changes(root->fs_info);
/* commit_fs_roots gets rid of all the tree log roots, it is now
* safe to free the root of tree log roots
@@ -2019,3 +2112,32 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
return (ret < 0) ? 0 : 1;
}
+
+void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
+{
+ unsigned long prev;
+ unsigned long bit;
+
+ prev = cmpxchg(&fs_info->pending_changes, 0, 0);
+ if (!prev)
+ return;
+
+ bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
+ if (prev & bit)
+ btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
+ prev &= ~bit;
+
+ bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
+ if (prev & bit)
+ btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
+ prev &= ~bit;
+
+ bit = 1 << BTRFS_PENDING_COMMIT;
+ if (prev & bit)
+ btrfs_debug(fs_info, "pending commit done");
+ prev &= ~bit;
+
+ if (prev)
+ btrfs_warn(fs_info,
+ "unknown pending changes left 0x%lx, ignoring", prev);
+}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d8f40e1a5d2d..00ed29c4b3f9 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -56,6 +56,7 @@ struct btrfs_transaction {
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
struct list_head pending_chunks;
+ struct list_head pending_ordered;
struct list_head switch_commits;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
@@ -105,6 +106,7 @@ struct btrfs_trans_handle {
*/
struct btrfs_root *root;
struct seq_list delayed_ref_elem;
+ struct list_head ordered;
struct list_head qgroup_ref_list;
struct list_head new_bgs;
};
@@ -145,8 +147,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
-int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
@@ -170,4 +170,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
void btrfs_put_transaction(struct btrfs_transaction *transaction);
+void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
+
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1475979e5718..9a02da16f2be 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -672,7 +672,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* is this extent already allocated in the extent
* allocation tree? If so, just add a reference
*/
- ret = btrfs_lookup_extent(root, ins.objectid,
+ ret = btrfs_lookup_data_extent(root, ins.objectid,
ins.offset);
if (ret == 0) {
ret = btrfs_inc_extent_ref(trans, root,
@@ -2599,12 +2599,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
+ mark);
+ btrfs_wait_logged_extents(trans, log, log_transid);
wait_log_commit(trans, log_root_tree,
root_log_ctx.log_transid);
- btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
- ret = root_log_ctx.log_ret;
+ if (!ret)
+ ret = root_log_ctx.log_ret;
goto out;
}
ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
@@ -2641,11 +2643,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
- btrfs_wait_marked_extents(log_root_tree,
- &log_root_tree->dirty_log_pages,
- EXTENT_NEW | EXTENT_DIRTY);
- btrfs_wait_logged_extents(log, log_transid);
+ ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ if (!ret)
+ ret = btrfs_wait_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_NEW | EXTENT_DIRTY);
+ if (ret) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_free_logged_extents(log, log_transid);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ }
+ btrfs_wait_logged_extents(trans, log, log_transid);
btrfs_set_super_log_root(root->fs_info->super_for_commit,
log_root_tree->node->start);
@@ -3626,6 +3635,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
+ /*
+ * Clear the AS_EIO/AS_ENOSPC flags from the inode's
+ * i_mapping flags, so that the next fsync won't get
+ * an outdated io error too.
+ */
+ btrfs_inode_check_errors(inode);
*ordered_io_error = true;
break;
}
@@ -3766,7 +3781,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+ btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
&token);
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
btrfs_set_token_file_extent_type(leaf, fi,
@@ -3963,7 +3978,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
mutex_lock(&BTRFS_I(inode)->log_mutex);
- btrfs_get_logged_extents(inode, &logged_list);
+ btrfs_get_logged_extents(inode, &logged_list, start, end);
/*
* a brute force approach to making sure we get the most uptodate
@@ -4089,6 +4104,21 @@ log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (fast_search) {
+ /*
+ * Some ordered extents started by fsync might have completed
+ * before we collected the ordered extents in logged_list, which
+ * means they're gone, not in our logged_list nor in the inode's
+ * ordered tree. We want the application/user space to know an
+ * error happened while attempting to persist file data so that
+ * it can take proper action. If such error happened, we leave
+ * without writing to the log tree and the fsync must report the
+ * file data write error and not commit the current transaction.
+ */
+ err = btrfs_inode_check_errors(inode);
+ if (err) {
+ ctx->io_err = err;
+ goto out_unlock;
+ }
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
&logged_list, ctx);
if (ret) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d47289c715c8..0144790e296e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
-static void lock_chunks(struct btrfs_root *root)
-{
- mutex_lock(&root->fs_info->chunk_mutex);
-}
-
-static void unlock_chunks(struct btrfs_root *root)
-{
- mutex_unlock(&root->fs_info->chunk_mutex);
-}
-
static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
struct btrfs_fs_devices *fs_devs;
@@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
u64 *start, u64 len)
{
struct extent_map *em;
+ struct list_head *search_list = &trans->transaction->pending_chunks;
int ret = 0;
- list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
+again:
+ list_for_each_entry(em, search_list, list) {
struct map_lookup *map;
int i;
@@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
ret = 1;
}
}
+ if (search_list == &trans->transaction->pending_chunks) {
+ search_list = &trans->root->fs_info->pinned_chunks;
+ goto again;
+ }
return ret;
}
@@ -1800,8 +1796,8 @@ error_undo:
goto error_brelse;
}
-void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev)
+void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev)
{
struct btrfs_fs_devices *fs_devices;
@@ -1829,6 +1825,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
if (srcdev->bdev)
fs_devices->open_devices--;
+}
+
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev)
+{
+ struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
call_rcu(&srcdev->rcu, free_device);
@@ -2647,18 +2649,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
}
}
- ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+ ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
}
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* once for the tree */
- free_extent_map(em);
out:
/* once for us */
free_extent_map(em);
@@ -4505,6 +4501,8 @@ error_del_extent:
free_extent_map(em);
/* One for the tree reference */
free_extent_map(em);
+ /* One for the pending_chunks list reference */
+ free_extent_map(em);
error:
kfree(devices_info);
return ret;
@@ -4881,13 +4879,15 @@ static inline int parity_smaller(u64 a, u64 b)
static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
{
struct btrfs_bio_stripe s;
+ int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
int i;
u64 l;
int again = 1;
+ int m;
while (again) {
again = 0;
- for (i = 0; i < bbio->num_stripes - 1; i++) {
+ for (i = 0; i < real_stripes - 1; i++) {
if (parity_smaller(raid_map[i], raid_map[i+1])) {
s = bbio->stripes[i];
l = raid_map[i];
@@ -4895,6 +4895,14 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
raid_map[i] = raid_map[i+1];
bbio->stripes[i+1] = s;
raid_map[i+1] = l;
+
+ if (bbio->tgtdev_map) {
+ m = bbio->tgtdev_map[i];
+ bbio->tgtdev_map[i] =
+ bbio->tgtdev_map[i + 1];
+ bbio->tgtdev_map[i + 1] = m;
+ }
+
again = 1;
}
}
@@ -4923,6 +4931,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
int ret = 0;
int num_stripes;
int max_errors = 0;
+ int tgtdev_indexes = 0;
struct btrfs_bio *bbio = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
@@ -5161,15 +5170,14 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
BTRFS_BLOCK_GROUP_RAID6)) {
u64 tmp;
- if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
- && raid_map_ret) {
+ if (raid_map_ret &&
+ ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+ mirror_num > 1)) {
int i, rot;
/* push stripe_nr back to the start of the full stripe */
stripe_nr = raid56_full_stripe_start;
- do_div(stripe_nr, stripe_len);
-
- stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+ do_div(stripe_nr, stripe_len * nr_data_stripes(map));
/* RAID[56] write or recovery. Return all stripes */
num_stripes = map->num_stripes;
@@ -5235,14 +5243,19 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
num_alloc_stripes <<= 1;
if (rw & REQ_GET_READ_MIRRORS)
num_alloc_stripes++;
+ tgtdev_indexes = num_stripes;
}
- bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
+
+ bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes),
+ GFP_NOFS);
if (!bbio) {
kfree(raid_map);
ret = -ENOMEM;
goto out;
}
atomic_set(&bbio->error, 0);
+ if (dev_replace_is_ongoing)
+ bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
if (rw & REQ_DISCARD) {
int factor = 0;
@@ -5327,6 +5340,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
max_errors = btrfs_chunk_max_errors(map);
+ tgtdev_indexes = 0;
if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
dev_replace->tgtdev != NULL) {
int index_where_to_add;
@@ -5355,8 +5369,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
new->physical = old->physical;
new->length = old->length;
new->dev = dev_replace->tgtdev;
+ bbio->tgtdev_map[i] = index_where_to_add;
index_where_to_add++;
max_errors++;
+ tgtdev_indexes++;
}
}
num_stripes = index_where_to_add;
@@ -5402,7 +5418,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
tgtdev_stripe->length =
bbio->stripes[index_srcdev].length;
tgtdev_stripe->dev = dev_replace->tgtdev;
+ bbio->tgtdev_map[index_srcdev] = num_stripes;
+ tgtdev_indexes++;
num_stripes++;
}
}
@@ -5412,6 +5430,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
bbio->num_stripes = num_stripes;
bbio->max_errors = max_errors;
bbio->mirror_num = mirror_num;
+ bbio->num_tgtdevs = tgtdev_indexes;
/*
* this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -5443,6 +5462,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
mirror_num, NULL);
}
+/* For Scrub/replace */
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+ u64 logical, u64 *length,
+ struct btrfs_bio **bbio_ret, int mirror_num,
+ u64 **raid_map_ret)
+{
+ return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
+ mirror_num, raid_map_ret);
+}
+
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len)
@@ -5812,12 +5841,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
} else {
ret = raid56_parity_recover(root, bio, bbio,
raid_map, map_length,
- mirror_num);
+ mirror_num, 1);
}
- /*
- * FIXME, replace dosen't support raid56 yet, please fix
- * it in the future.
- */
+
btrfs_bio_counter_dec(root->fs_info);
return ret;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 08980fa23039..d6fe73c0f4a2 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -292,7 +292,7 @@ struct btrfs_bio_stripe {
struct btrfs_bio;
typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
-#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1
+#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0)
struct btrfs_bio {
atomic_t stripes_pending;
@@ -305,6 +305,8 @@ struct btrfs_bio {
int max_errors;
int num_stripes;
int mirror_num;
+ int num_tgtdevs;
+ int *tgtdev_map;
struct btrfs_bio_stripe stripes[];
};
@@ -387,12 +389,18 @@ struct btrfs_balance_control {
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length);
-#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
- (sizeof(struct btrfs_bio_stripe) * (n)))
+#define btrfs_bio_size(total_stripes, real_stripes) \
+ (sizeof(struct btrfs_bio) + \
+ (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \
+ (sizeof(int) * (real_stripes)))
int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num);
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+ u64 logical, u64 *length,
+ struct btrfs_bio **bbio_ret, int mirror_num,
+ u64 **raid_map_ret);
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len);
@@ -448,8 +456,10 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev);
+void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev);
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev);
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
@@ -513,4 +523,16 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
struct btrfs_transaction *transaction);
+
+static inline void lock_chunks(struct btrfs_root *root)
+{
+ mutex_lock(&root->fs_info->chunk_mutex);
+}
+
+static inline void unlock_chunks(struct btrfs_root *root)
+{
+ mutex_unlock(&root->fs_info->chunk_mutex);
+}
+
+
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index dcf20131fbe4..47b19465f0dc 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -29,6 +29,7 @@
#include "xattr.h"
#include "disk-io.h"
#include "props.h"
+#include "locking.h"
ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
@@ -91,7 +92,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
- struct btrfs_dir_item *di;
+ struct btrfs_dir_item *di = NULL;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
size_t name_len = strlen(name);
@@ -103,84 +104,119 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->skip_release_on_error = 1;
+
+ if (!value) {
+ di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
+ name, name_len, -1);
+ if (!di && (flags & XATTR_REPLACE))
+ ret = -ENODATA;
+ else if (di)
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ goto out;
+ }
+ /*
+ * For a replace we can't just do the insert blindly.
+ * Do a lookup first (read-only btrfs_search_slot), and return if xattr
+ * doesn't exist. If it exists, fall down below to the insert/replace
+ * path - we can't race with a concurrent xattr delete, because the VFS
+ * locks the inode's i_mutex before calling setxattr or removexattr.
+ */
if (flags & XATTR_REPLACE) {
- di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
- name_len, -1);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- } else if (!di) {
+ ASSERT(mutex_is_locked(&inode->i_mutex));
+ di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
+ name, name_len, 0);
+ if (!di) {
ret = -ENODATA;
goto out;
}
- ret = btrfs_delete_one_dir_name(trans, root, path, di);
- if (ret)
- goto out;
btrfs_release_path(path);
+ di = NULL;
+ }
+ ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
+ name, name_len, value, size);
+ if (ret == -EOVERFLOW) {
/*
- * remove the attribute
+ * We have an existing item in a leaf, split_leaf couldn't
+ * expand it. That item might have or not a dir_item that
+ * matches our target xattr, so lets check.
*/
- if (!value)
- goto out;
- } else {
- di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
- name, name_len, 0);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
+ ret = 0;
+ btrfs_assert_tree_locked(path->nodes[0]);
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (!di && !(flags & XATTR_REPLACE)) {
+ ret = -ENOSPC;
goto out;
}
- if (!di && !value)
- goto out;
- btrfs_release_path(path);
+ } else if (ret == -EEXIST) {
+ ret = 0;
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ ASSERT(di); /* logic error */
+ } else if (ret) {
+ goto out;
}
-again:
- ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
- name, name_len, value, size);
- /*
- * If we're setting an xattr to a new value but the new value is say
- * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
- * back from split_leaf. This is because it thinks we'll be extending
- * the existing item size, but we're asking for enough space to add the
- * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
- * the rest of the function figure it out.
- */
- if (ret == -EOVERFLOW)
+ if (di && (flags & XATTR_CREATE)) {
ret = -EEXIST;
+ goto out;
+ }
- if (ret == -EEXIST) {
- if (flags & XATTR_CREATE)
- goto out;
+ if (di) {
/*
- * We can't use the path we already have since we won't have the
- * proper locking for a delete, so release the path and
- * re-lookup to delete the thing.
+ * We're doing a replace, and it must be atomic, that is, at
+ * any point in time we have either the old or the new xattr
+ * value in the tree. We don't want readers (getxattr and
+ * listxattrs) to miss a value, this is specially important
+ * for ACLs.
*/
- btrfs_release_path(path);
- di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
- name, name_len, -1);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- } else if (!di) {
- /* Shouldn't happen but just in case... */
- btrfs_release_path(path);
- goto again;
+ const int slot = path->slots[0];
+ struct extent_buffer *leaf = path->nodes[0];
+ const u16 old_data_len = btrfs_dir_data_len(leaf, di);
+ const u32 item_size = btrfs_item_size_nr(leaf, slot);
+ const u32 data_size = sizeof(*di) + name_len + size;
+ struct btrfs_item *item;
+ unsigned long data_ptr;
+ char *ptr;
+
+ if (size > old_data_len) {
+ if (btrfs_leaf_free_space(root, leaf) <
+ (size - old_data_len)) {
+ ret = -ENOSPC;
+ goto out;
+ }
}
- ret = btrfs_delete_one_dir_name(trans, root, path, di);
- if (ret)
- goto out;
+ if (old_data_len + name_len + sizeof(*di) == item_size) {
+ /* No other xattrs packed in the same leaf item. */
+ if (size > old_data_len)
+ btrfs_extend_item(root, path,
+ size - old_data_len);
+ else if (size < old_data_len)
+ btrfs_truncate_item(root, path, data_size, 1);
+ } else {
+ /* There are other xattrs packed in the same item. */
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ if (ret)
+ goto out;
+ btrfs_extend_item(root, path, data_size);
+ }
+ item = btrfs_item_nr(slot);
+ ptr = btrfs_item_ptr(leaf, slot, char);
+ ptr += btrfs_item_size(leaf, item) - data_size;
+ di = (struct btrfs_dir_item *)ptr;
+ btrfs_set_dir_data_len(leaf, di, size);
+ data_ptr = ((unsigned long)(di + 1)) + name_len;
+ write_extent_buffer(leaf, value, data_ptr, size);
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
/*
- * We have a value to set, so go back and try to insert it now.
+ * Insert, and we had space for the xattr, so path->slots[0] is
+ * where our xattr dir_item is and btrfs_insert_xattr_item()
+ * filled it.
*/
- if (value) {
- btrfs_release_path(path);
- goto again;
- }
}
out:
btrfs_free_path(path);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 759fa4e2de8f..fb22fd8d8fb8 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -299,6 +299,8 @@ done:
zlib_inflateEnd(&workspace->strm);
if (data_in)
kunmap(pages_in[page_in_index]);
+ if (!ret)
+ btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset);
return ret;
}
@@ -310,10 +312,14 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
int wbits = MAX_WBITS;
- unsigned long bytes_left = destlen;
+ unsigned long bytes_left;
unsigned long total_out = 0;
+ unsigned long pg_offset = 0;
char *kaddr;
+ destlen = min_t(unsigned long, destlen, PAGE_SIZE);
+ bytes_left = destlen;
+
workspace->strm.next_in = data_in;
workspace->strm.avail_in = srclen;
workspace->strm.total_in = 0;
@@ -341,7 +347,6 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
unsigned long buf_start;
unsigned long buf_offset;
unsigned long bytes;
- unsigned long pg_offset = 0;
ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
if (ret != Z_OK && ret != Z_STREAM_END)
@@ -384,6 +389,17 @@ next:
ret = 0;
zlib_inflateEnd(&workspace->strm);
+
+ /*
+ * this should only happen if zlib returned fewer bytes than we
+ * expected. btrfs_get_block is responsible for zeroing from the
+ * end of the inline extent (destlen) to the end of the page
+ */
+ if (pg_offset < destlen) {
+ kaddr = kmap_atomic(dest_page);
+ memset(kaddr + pg_offset, 0, destlen - pg_offset);
+ kunmap_atomic(kaddr);
+ }
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 6c48f20eddd4..20805db2c987 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -128,21 +128,15 @@ __clear_page_buffers(struct page *page)
page_cache_release(page);
}
-
-static int quiet_error(struct buffer_head *bh)
-{
- if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
- return 0;
- return 1;
-}
-
-
-static void buffer_io_error(struct buffer_head *bh)
+static void buffer_io_error(struct buffer_head *bh, char *msg)
{
char b[BDEVNAME_SIZE];
- printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
+
+ if (!test_bit(BH_Quiet, &bh->b_state))
+ printk_ratelimited(KERN_ERR
+ "Buffer I/O error on dev %s, logical block %llu%s\n",
bdevname(bh->b_bdev, b),
- (unsigned long long)bh->b_blocknr);
+ (unsigned long long)bh->b_blocknr, msg);
}
/*
@@ -177,17 +171,10 @@ EXPORT_SYMBOL(end_buffer_read_sync);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
- char b[BDEVNAME_SIZE];
-
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- if (!quiet_error(bh)) {
- buffer_io_error(bh);
- printk(KERN_WARNING "lost page write due to "
- "I/O error on %s\n",
- bdevname(bh->b_bdev, b));
- }
+ buffer_io_error(bh, ", lost sync page write");
set_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
}
@@ -304,8 +291,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
set_buffer_uptodate(bh);
} else {
clear_buffer_uptodate(bh);
- if (!quiet_error(bh))
- buffer_io_error(bh);
+ buffer_io_error(bh, ", async page read");
SetPageError(page);
}
@@ -353,7 +339,6 @@ still_busy:
*/
void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
- char b[BDEVNAME_SIZE];
unsigned long flags;
struct buffer_head *first;
struct buffer_head *tmp;
@@ -365,12 +350,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- if (!quiet_error(bh)) {
- buffer_io_error(bh);
- printk(KERN_WARNING "lost page write due to "
- "I/O error on %s\n",
- bdevname(bh->b_bdev, b));
- }
+ buffer_io_error(bh, ", lost async page write");
set_bit(AS_EIO, &page->mapping->flags);
set_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index e12f189d539b..7f8e83f9d74e 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -102,8 +102,7 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
struct cachefiles_object *object;
struct rb_node *p;
- _enter(",'%*.*s'",
- dentry->d_name.len, dentry->d_name.len, dentry->d_name.name);
+ _enter(",'%pd'", dentry);
write_lock(&cache->active_lock);
@@ -273,9 +272,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
char nbuffer[8 + 8 + 1];
int ret;
- _enter(",'%*.*s','%*.*s'",
- dir->d_name.len, dir->d_name.len, dir->d_name.name,
- rep->d_name.len, rep->d_name.len, rep->d_name.name);
+ _enter(",'%pd','%pd'", dir, rep);
_debug("remove %p from %p", rep, dir);
@@ -597,8 +594,7 @@ lookup_again:
/* if we've found that the terminal object exists, then we need to
* check its attributes and delete it if it's out of date */
if (!object->new) {
- _debug("validate '%*.*s'",
- next->d_name.len, next->d_name.len, next->d_name.name);
+ _debug("validate '%pd'", next);
ret = cachefiles_check_object_xattr(object, auxdata);
if (ret == -ESTALE) {
@@ -827,8 +823,8 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
unsigned long start;
int ret;
- //_enter(",%*.*s/,%s",
- // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
+ //_enter(",%pd/,%s",
+ // dir, filename);
/* look up the victim */
mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
@@ -910,8 +906,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
struct dentry *victim;
int ret;
- _enter(",%*.*s/,%s",
- dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
+ _enter(",%pd/,%s", dir, filename);
victim = cachefiles_check_active(cache, dir, filename);
if (IS_ERR(victim))
@@ -969,8 +964,8 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
{
struct dentry *victim;
- //_enter(",%*.*s/,%s",
- // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
+ //_enter(",%pd/,%s",
+ // dir, filename);
victim = cachefiles_check_active(cache, dir, filename);
if (IS_ERR(victim))
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index acbc1f094fb1..a8a68745e11d 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -51,9 +51,8 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
}
if (ret != -EEXIST) {
- pr_err("Can't set xattr on %*.*s [%lu] (err %d)\n",
- dentry->d_name.len, dentry->d_name.len,
- dentry->d_name.name, dentry->d_inode->i_ino,
+ pr_err("Can't set xattr on %pd [%lu] (err %d)\n",
+ dentry, dentry->d_inode->i_ino,
-ret);
goto error;
}
@@ -64,9 +63,8 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
if (ret == -ERANGE)
goto bad_type_length;
- pr_err("Can't read xattr on %*.*s [%lu] (err %d)\n",
- dentry->d_name.len, dentry->d_name.len,
- dentry->d_name.name, dentry->d_inode->i_ino,
+ pr_err("Can't read xattr on %pd [%lu] (err %d)\n",
+ dentry, dentry->d_inode->i_ino,
-ret);
goto error;
}
@@ -92,9 +90,8 @@ bad_type_length:
bad_type:
xtype[2] = 0;
- pr_err("Cache object %*.*s [%lu] type %s not %s\n",
- dentry->d_name.len, dentry->d_name.len,
- dentry->d_name.name, dentry->d_inode->i_ino,
+ pr_err("Cache object %pd [%lu] type %s not %s\n",
+ dentry, dentry->d_inode->i_ino,
xtype, type);
ret = -EIO;
goto error;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 18c06bbaf136..f5013d92a7e6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -192,17 +192,30 @@ static int readpage_nounlock(struct file *filp, struct page *page)
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
int err = 0;
+ u64 off = page_offset(page);
u64 len = PAGE_CACHE_SIZE;
- err = ceph_readpage_from_fscache(inode, page);
+ if (off >= i_size_read(inode)) {
+ zero_user_segment(page, err, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ return 0;
+ }
+ /*
+ * Uptodate inline data should have been added into page cache
+ * while getting Fcr caps.
+ */
+ if (ci->i_inline_version != CEPH_INLINE_NONE)
+ return -EINVAL;
+
+ err = ceph_readpage_from_fscache(inode, page);
if (err == 0)
goto out;
dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
- (u64) page_offset(page), &len,
+ off, &len,
ci->i_truncate_seq, ci->i_truncate_size,
&page, 1, 0);
if (err == -ENOENT)
@@ -319,7 +332,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
off, len);
vino = ceph_vino(inode);
req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
- 1, CEPH_OSD_OP_READ,
+ 0, 1, CEPH_OSD_OP_READ,
CEPH_OSD_FLAG_READ, NULL,
ci->i_truncate_seq, ci->i_truncate_size,
false);
@@ -384,6 +397,9 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
int rc = 0;
int max = 0;
+ if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
+ return -EINVAL;
+
rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
&nr_pages);
@@ -673,7 +689,7 @@ static int ceph_writepages_start(struct address_space *mapping,
int rc = 0;
unsigned wsize = 1 << inode->i_blkbits;
struct ceph_osd_request *req = NULL;
- int do_sync;
+ int do_sync = 0;
u64 truncate_size, snap_size;
u32 truncate_seq;
@@ -750,7 +766,6 @@ retry:
last_snapc = snapc;
while (!done && index <= end) {
- int num_ops = do_sync ? 2 : 1;
unsigned i;
int first;
pgoff_t next;
@@ -850,7 +865,8 @@ get_more_pages:
len = wsize;
req = ceph_osdc_new_request(&fsc->client->osdc,
&ci->i_layout, vino,
- offset, &len, num_ops,
+ offset, &len, 0,
+ do_sync ? 2 : 1,
CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_WRITE |
CEPH_OSD_FLAG_ONDISK,
@@ -862,6 +878,9 @@ get_more_pages:
break;
}
+ if (do_sync)
+ osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+
req->r_callback = writepages_finish;
req->r_inode = inode;
@@ -1204,6 +1223,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct inode *inode = file_inode(vma->vm_file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
+ struct page *pinned_page = NULL;
loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT;
int want, got, ret;
@@ -1215,7 +1235,8 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
want = CEPH_CAP_FILE_CACHE;
while (1) {
got = 0;
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
+ -1, &got, &pinned_page);
if (ret == 0)
break;
if (ret != -ERESTARTSYS) {
@@ -1226,12 +1247,54 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));
- ret = filemap_fault(vma, vmf);
+ if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
+ ci->i_inline_version == CEPH_INLINE_NONE)
+ ret = filemap_fault(vma, vmf);
+ else
+ ret = -EAGAIN;
dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
+ if (pinned_page)
+ page_cache_release(pinned_page);
ceph_put_cap_refs(ci, got);
+ if (ret != -EAGAIN)
+ return ret;
+
+ /* read inline data */
+ if (off >= PAGE_CACHE_SIZE) {
+ /* does not support inline data > PAGE_SIZE */
+ ret = VM_FAULT_SIGBUS;
+ } else {
+ int ret1;
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page = find_or_create_page(mapping, 0,
+ mapping_gfp_mask(mapping) &
+ ~__GFP_FS);
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+ ret1 = __ceph_do_getattr(inode, page,
+ CEPH_STAT_CAP_INLINE_DATA, true);
+ if (ret1 < 0 || off >= i_size_read(inode)) {
+ unlock_page(page);
+ page_cache_release(page);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if (ret1 < PAGE_CACHE_SIZE)
+ zero_user_segment(page, ret1, PAGE_CACHE_SIZE);
+ else
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ vmf->page = page;
+ ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
+ }
+out:
+ dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
+ inode, off, (size_t)PAGE_CACHE_SIZE, ret);
return ret;
}
@@ -1250,6 +1313,19 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
size_t len;
int want, got, ret;
+ if (ci->i_inline_version != CEPH_INLINE_NONE) {
+ struct page *locked_page = NULL;
+ if (off == 0) {
+ lock_page(page);
+ locked_page = page;
+ }
+ ret = ceph_uninline_data(vma->vm_file, locked_page);
+ if (locked_page)
+ unlock_page(locked_page);
+ if (ret < 0)
+ return VM_FAULT_SIGBUS;
+ }
+
if (off + PAGE_CACHE_SIZE <= size)
len = PAGE_CACHE_SIZE;
else
@@ -1263,7 +1339,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
want = CEPH_CAP_FILE_BUFFER;
while (1) {
got = 0;
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len);
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
+ &got, NULL);
if (ret == 0)
break;
if (ret != -ERESTARTSYS) {
@@ -1297,11 +1374,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = VM_FAULT_SIGBUS;
}
out:
- if (ret != VM_FAULT_LOCKED) {
+ if (ret != VM_FAULT_LOCKED)
unlock_page(page);
- } else {
+ if (ret == VM_FAULT_LOCKED ||
+ ci->i_inline_version != CEPH_INLINE_NONE) {
int dirty;
spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
@@ -1315,6 +1394,178 @@ out:
return ret;
}
+void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
+ char *data, size_t len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+
+ if (locked_page) {
+ page = locked_page;
+ } else {
+ if (i_size_read(inode) == 0)
+ return;
+ page = find_or_create_page(mapping, 0,
+ mapping_gfp_mask(mapping) & ~__GFP_FS);
+ if (!page)
+ return;
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ return;
+ }
+ }
+
+ dout("fill_inline_data %p %llx.%llx len %lu locked_page %p\n",
+ inode, ceph_vinop(inode), len, locked_page);
+
+ if (len > 0) {
+ void *kaddr = kmap_atomic(page);
+ memcpy(kaddr, data, len);
+ kunmap_atomic(kaddr);
+ }
+
+ if (page != locked_page) {
+ if (len < PAGE_CACHE_SIZE)
+ zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ else
+ flush_dcache_page(page);
+
+ SetPageUptodate(page);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+}
+
+int ceph_uninline_data(struct file *filp, struct page *locked_page)
+{
+ struct inode *inode = file_inode(filp);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_request *req;
+ struct page *page = NULL;
+ u64 len, inline_version;
+ int err = 0;
+ bool from_pagecache = false;
+
+ spin_lock(&ci->i_ceph_lock);
+ inline_version = ci->i_inline_version;
+ spin_unlock(&ci->i_ceph_lock);
+
+ dout("uninline_data %p %llx.%llx inline_version %llu\n",
+ inode, ceph_vinop(inode), inline_version);
+
+ if (inline_version == 1 || /* initial version, no data */
+ inline_version == CEPH_INLINE_NONE)
+ goto out;
+
+ if (locked_page) {
+ page = locked_page;
+ WARN_ON(!PageUptodate(page));
+ } else if (ceph_caps_issued(ci) &
+ (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
+ page = find_get_page(inode->i_mapping, 0);
+ if (page) {
+ if (PageUptodate(page)) {
+ from_pagecache = true;
+ lock_page(page);
+ } else {
+ page_cache_release(page);
+ page = NULL;
+ }
+ }
+ }
+
+ if (page) {
+ len = i_size_read(inode);
+ if (len > PAGE_CACHE_SIZE)
+ len = PAGE_CACHE_SIZE;
+ } else {
+ page = __page_cache_alloc(GFP_NOFS);
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = __ceph_do_getattr(inode, page,
+ CEPH_STAT_CAP_INLINE_DATA, true);
+ if (err < 0) {
+ /* no inline data */
+ if (err == -ENODATA)
+ err = 0;
+ goto out;
+ }
+ len = err;
+ }
+
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ ceph_vino(inode), 0, &len, 0, 1,
+ CEPH_OSD_OP_CREATE,
+ CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+ ci->i_snap_realm->cached_context,
+ 0, 0, false);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+
+ ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+ err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!err)
+ err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_put_request(req);
+ if (err < 0)
+ goto out;
+
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ ceph_vino(inode), 0, &len, 1, 3,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+ ci->i_snap_realm->cached_context,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ false);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+
+ osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
+
+ err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
+ "inline_version", &inline_version,
+ sizeof(inline_version),
+ CEPH_OSD_CMPXATTR_OP_GT,
+ CEPH_OSD_CMPXATTR_MODE_U64);
+ if (err)
+ goto out_put;
+
+ err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
+ "inline_version", &inline_version,
+ sizeof(inline_version), 0, 0);
+ if (err)
+ goto out_put;
+
+ ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+ err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!err)
+ err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+out_put:
+ ceph_osdc_put_request(req);
+ if (err == -ECANCELED)
+ err = 0;
+out:
+ if (page && page != locked_page) {
+ if (from_pagecache) {
+ unlock_page(page);
+ page_cache_release(page);
+ } else
+ __free_pages(page, 0);
+ }
+
+ dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
+ inode, ceph_vinop(inode), inline_version, err);
+ return err;
+}
+
static struct vm_operations_struct ceph_vmops = {
.fault = ceph_filemap_fault,
.page_mkwrite = ceph_page_mkwrite,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 659f2ea9e6f7..b93c631c6c87 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -975,10 +975,12 @@ static int send_cap_msg(struct ceph_mds_session *session,
kuid_t uid, kgid_t gid, umode_t mode,
u64 xattr_version,
struct ceph_buffer *xattrs_buf,
- u64 follows)
+ u64 follows, bool inline_data)
{
struct ceph_mds_caps *fc;
struct ceph_msg *msg;
+ void *p;
+ size_t extra_len;
dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
" seq %u/%u mseq %u follows %lld size %llu/%llu"
@@ -988,7 +990,10 @@ static int send_cap_msg(struct ceph_mds_session *session,
seq, issue_seq, mseq, follows, size, max_size,
xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false);
+ /* flock buffer size + inline version + inline data size */
+ extra_len = 4 + 8 + 4;
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
+ GFP_NOFS, false);
if (!msg)
return -ENOMEM;
@@ -1020,6 +1025,14 @@ static int send_cap_msg(struct ceph_mds_session *session,
fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid));
fc->mode = cpu_to_le32(mode);
+ p = fc + 1;
+ /* flock buffer size */
+ ceph_encode_32(&p, 0);
+ /* inline version */
+ ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE);
+ /* inline data size */
+ ceph_encode_32(&p, 0);
+
fc->xattr_version = cpu_to_le64(xattr_version);
if (xattrs_buf) {
msg->middle = ceph_buffer_get(xattrs_buf);
@@ -1126,6 +1139,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
u64 flush_tid = 0;
int i;
int ret;
+ bool inline_data;
held = cap->issued | cap->implemented;
revoking = cap->implemented & ~cap->issued;
@@ -1209,13 +1223,15 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
xattr_version = ci->i_xattrs.version;
}
+ inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+
spin_unlock(&ci->i_ceph_lock);
ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
size, max_size, &mtime, &atime, time_warp_seq,
uid, gid, mode, xattr_version, xattr_blob,
- follows);
+ follows, inline_data);
if (ret < 0) {
dout("error sending cap msg, must requeue %p\n", inode);
delayed = 1;
@@ -1336,7 +1352,7 @@ retry:
capsnap->time_warp_seq,
capsnap->uid, capsnap->gid, capsnap->mode,
capsnap->xattr_version, capsnap->xattr_blob,
- capsnap->follows);
+ capsnap->follows, capsnap->inline_data);
next_follows = capsnap->follows + 1;
ceph_put_cap_snap(capsnap);
@@ -2057,15 +2073,17 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
* requested from the MDS.
*/
static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
- int *got, loff_t endoff, int *check_max, int *err)
+ loff_t endoff, int *got, struct page **pinned_page,
+ int *check_max, int *err)
{
struct inode *inode = &ci->vfs_inode;
int ret = 0;
- int have, implemented;
+ int have, implemented, _got = 0;
int file_wanted;
dout("get_cap_refs %p need %s want %s\n", inode,
ceph_cap_string(need), ceph_cap_string(want));
+again:
spin_lock(&ci->i_ceph_lock);
/* make sure file is actually open */
@@ -2075,7 +2093,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
ceph_cap_string(need), ceph_cap_string(file_wanted));
*err = -EBADF;
ret = 1;
- goto out;
+ goto out_unlock;
}
/* finish pending truncate */
@@ -2095,7 +2113,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
*check_max = 1;
ret = 1;
}
- goto out;
+ goto out_unlock;
}
/*
* If a sync write is in progress, we must wait, so that we
@@ -2103,7 +2121,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
*/
if (__ceph_have_pending_cap_snap(ci)) {
dout("get_cap_refs %p cap_snap_pending\n", inode);
- goto out;
+ goto out_unlock;
}
}
@@ -2120,18 +2138,50 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
inode, ceph_cap_string(have), ceph_cap_string(not),
ceph_cap_string(revoking));
if ((revoking & not) == 0) {
- *got = need | (have & want);
- __take_cap_refs(ci, *got);
+ _got = need | (have & want);
+ __take_cap_refs(ci, _got);
ret = 1;
}
} else {
dout("get_cap_refs %p have %s needed %s\n", inode,
ceph_cap_string(have), ceph_cap_string(need));
}
-out:
+out_unlock:
spin_unlock(&ci->i_ceph_lock);
+
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ i_size_read(inode) > 0) {
+ int ret1;
+ struct page *page = find_get_page(inode->i_mapping, 0);
+ if (page) {
+ if (PageUptodate(page)) {
+ *pinned_page = page;
+ goto out;
+ }
+ page_cache_release(page);
+ }
+ /*
+ * drop cap refs first because getattr while holding
+ * caps refs can cause deadlock.
+ */
+ ceph_put_cap_refs(ci, _got);
+ _got = 0;
+
+ /* getattr request will bring inline data into page cache */
+ ret1 = __ceph_do_getattr(inode, NULL,
+ CEPH_STAT_CAP_INLINE_DATA, true);
+ if (ret1 >= 0) {
+ ret = 0;
+ goto again;
+ }
+ *err = ret1;
+ ret = 1;
+ }
+out:
dout("get_cap_refs %p ret %d got %s\n", inode,
- ret, ceph_cap_string(*got));
+ ret, ceph_cap_string(_got));
+ *got = _got;
return ret;
}
@@ -2168,8 +2218,8 @@ static void check_max_size(struct inode *inode, loff_t endoff)
* due to a small max_size, make sure we check_max_size (and possibly
* ask the mds) so we don't get hung up indefinitely.
*/
-int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
- loff_t endoff)
+int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
+ loff_t endoff, int *got, struct page **pinned_page)
{
int check_max, ret, err;
@@ -2179,8 +2229,8 @@ retry:
check_max = 0;
err = 0;
ret = wait_event_interruptible(ci->i_cap_wq,
- try_get_cap_refs(ci, need, want,
- got, endoff,
+ try_get_cap_refs(ci, need, want, endoff,
+ got, pinned_page,
&check_max, &err));
if (err)
ret = err;
@@ -2383,6 +2433,8 @@ static void invalidate_aliases(struct inode *inode)
static void handle_cap_grant(struct ceph_mds_client *mdsc,
struct inode *inode, struct ceph_mds_caps *grant,
void *snaptrace, int snaptrace_len,
+ u64 inline_version,
+ void *inline_data, int inline_len,
struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
struct ceph_cap *cap, int issued)
@@ -2403,6 +2455,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
bool queue_invalidate = false;
bool queue_revalidate = false;
bool deleted_inode = false;
+ bool fill_inline = false;
dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
inode, cap, mds, seq, ceph_cap_string(newcaps));
@@ -2576,6 +2629,13 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
}
BUG_ON(cap->issued & ~cap->implemented);
+ if (inline_version > 0 && inline_version >= ci->i_inline_version) {
+ ci->i_inline_version = inline_version;
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)))
+ fill_inline = true;
+ }
+
spin_unlock(&ci->i_ceph_lock);
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
@@ -2589,6 +2649,9 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
wake = true;
}
+ if (fill_inline)
+ ceph_fill_inline_data(inode, NULL, inline_data, inline_len);
+
if (queue_trunc) {
ceph_queue_vmtruncate(inode);
ceph_queue_revalidate(inode);
@@ -2638,7 +2701,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
for (i = 0; i < CEPH_CAP_BITS; i++)
if ((dirty & (1 << i)) &&
- flush_tid == ci->i_cap_flush_tid[i])
+ (u16)flush_tid == ci->i_cap_flush_tid[i])
cleaned |= 1 << i;
dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
@@ -2996,11 +3059,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
u64 cap_id;
u64 size, max_size;
u64 tid;
+ u64 inline_version = 0;
+ void *inline_data = NULL;
+ u32 inline_len = 0;
void *snaptrace;
size_t snaptrace_len;
- void *flock;
- void *end;
- u32 flock_len;
+ void *p, *end;
dout("handle_caps from mds%d\n", mds);
@@ -3021,30 +3085,37 @@ void ceph_handle_caps(struct ceph_mds_session *session,
snaptrace = h + 1;
snaptrace_len = le32_to_cpu(h->snap_trace_len);
+ p = snaptrace + snaptrace_len;
if (le16_to_cpu(msg->hdr.version) >= 2) {
- void *p = snaptrace + snaptrace_len;
+ u32 flock_len;
ceph_decode_32_safe(&p, end, flock_len, bad);
if (p + flock_len > end)
goto bad;
- flock = p;
- } else {
- flock = NULL;
- flock_len = 0;
+ p += flock_len;
}
if (le16_to_cpu(msg->hdr.version) >= 3) {
if (op == CEPH_CAP_OP_IMPORT) {
- void *p = flock + flock_len;
if (p + sizeof(*peer) > end)
goto bad;
peer = p;
+ p += sizeof(*peer);
} else if (op == CEPH_CAP_OP_EXPORT) {
/* recorded in unused fields */
peer = (void *)&h->size;
}
}
+ if (le16_to_cpu(msg->hdr.version) >= 4) {
+ ceph_decode_64_safe(&p, end, inline_version, bad);
+ ceph_decode_32_safe(&p, end, inline_len, bad);
+ if (p + inline_len > end)
+ goto bad;
+ inline_data = p;
+ p += inline_len;
+ }
+
/* lookup ino */
inode = ceph_find_inode(sb, vino);
ci = ceph_inode(inode);
@@ -3085,6 +3156,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
handle_cap_import(mdsc, inode, h, peer, session,
&cap, &issued);
handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len,
+ inline_version, inline_data, inline_len,
msg->middle, session, cap, issued);
goto done_unlocked;
}
@@ -3105,8 +3177,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_GRANT:
__ceph_caps_issued(ci, &issued);
issued |= __ceph_caps_dirty(ci);
- handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle,
- session, cap, issued);
+ handle_cap_grant(mdsc, inode, h, NULL, 0,
+ inline_version, inline_data, inline_len,
+ msg->middle, session, cap, issued);
goto done_unlocked;
case CEPH_CAP_OP_FLUSH_ACK:
@@ -3137,8 +3210,7 @@ flush_cap_releases:
done:
mutex_unlock(&session->s_mutex);
done_unlocked:
- if (inode)
- iput(inode);
+ iput(inode);
return;
bad:
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 5d5a4c8c8496..1b2355109b9f 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -83,10 +83,9 @@ static int mdsc_show(struct seq_file *s, void *p)
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_dentry->d_lock);
- seq_printf(s, " #%llx/%.*s (%s)",
+ seq_printf(s, " #%llx/%pd (%s)",
ceph_ino(req->r_dentry->d_parent->d_inode),
- req->r_dentry->d_name.len,
- req->r_dentry->d_name.name,
+ req->r_dentry,
path ? path : "");
spin_unlock(&req->r_dentry->d_lock);
kfree(path);
@@ -103,11 +102,10 @@ static int mdsc_show(struct seq_file *s, void *p)
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
- seq_printf(s, " #%llx/%.*s (%s)",
+ seq_printf(s, " #%llx/%pd (%s)",
req->r_old_dentry_dir ?
ceph_ino(req->r_old_dentry_dir) : 0,
- req->r_old_dentry->d_name.len,
- req->r_old_dentry->d_name.name,
+ req->r_old_dentry,
path ? path : "");
spin_unlock(&req->r_old_dentry->d_lock);
kfree(path);
@@ -150,8 +148,8 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
spin_lock(&mdsc->dentry_lru_lock);
list_for_each_entry(di, &mdsc->dentry_lru, lru) {
struct dentry *dentry = di->dentry;
- seq_printf(s, "%p %p\t%.*s\n",
- di, dentry, dentry->d_name.len, dentry->d_name.name);
+ seq_printf(s, "%p %p\t%pd\n",
+ di, dentry, dentry);
}
spin_unlock(&mdsc->dentry_lru_lock);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e6d63f8f98c0..c241603764fd 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -111,7 +111,7 @@ static int fpos_cmp(loff_t l, loff_t r)
/*
* When possible, we try to satisfy a readdir by peeking at the
* dcache. We make this work by carefully ordering dentries on
- * d_u.d_child when we initially get results back from the MDS, and
+ * d_child when we initially get results back from the MDS, and
* falling back to a "normal" sync readdir if any dentries in the dir
* are dropped.
*
@@ -123,7 +123,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
u32 shared_gen)
{
struct ceph_file_info *fi = file->private_data;
- struct dentry *parent = file->f_dentry;
+ struct dentry *parent = file->f_path.dentry;
struct inode *dir = parent->d_inode;
struct list_head *p;
struct dentry *dentry, *last;
@@ -147,11 +147,11 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
p = parent->d_subdirs.prev;
dout(" initial p %p/%p\n", p->prev, p->next);
} else {
- p = last->d_u.d_child.prev;
+ p = last->d_child.prev;
}
more:
- dentry = list_entry(p, struct dentry, d_u.d_child);
+ dentry = list_entry(p, struct dentry, d_child);
di = ceph_dentry(dentry);
while (1) {
dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
@@ -168,13 +168,13 @@ more:
ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
fpos_cmp(ctx->pos, di->offset) <= 0)
break;
- dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
- dentry->d_name.len, dentry->d_name.name, di->offset,
+ dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry,
+ dentry, di->offset,
ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
!dentry->d_inode ? " null" : "");
spin_unlock(&dentry->d_lock);
p = p->prev;
- dentry = list_entry(p, struct dentry, d_u.d_child);
+ dentry = list_entry(p, struct dentry, d_child);
di = ceph_dentry(dentry);
}
@@ -183,15 +183,15 @@ more:
spin_unlock(&parent->d_lock);
/* make sure a dentry wasn't dropped while we didn't have parent lock */
- if (!ceph_dir_is_complete(dir)) {
+ if (!ceph_dir_is_complete_ordered(dir)) {
dout(" lost dir complete on %p; falling back to mds\n", dir);
dput(dentry);
err = -EAGAIN;
goto out;
}
- dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
- dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+ dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
+ dentry, dentry, dentry->d_inode);
if (!dir_emit(ctx, dentry->d_name.name,
dentry->d_name.len,
ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
@@ -261,10 +261,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* always start with . and .. */
if (ctx->pos == 0) {
- /* note dir version at start of readdir so we can tell
- * if any dentries get dropped */
- fi->dir_release_count = atomic_read(&ci->i_release_count);
-
dout("readdir off 0 -> '.'\n");
if (!dir_emit(ctx, ".", 1,
ceph_translate_ino(inode->i_sb, inode->i_ino),
@@ -274,7 +270,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
off = 1;
}
if (ctx->pos == 1) {
- ino_t ino = parent_ino(file->f_dentry);
+ ino_t ino = parent_ino(file->f_path.dentry);
dout("readdir off 1 -> '..'\n");
if (!dir_emit(ctx, "..", 2,
ceph_translate_ino(inode->i_sb, ino),
@@ -289,7 +285,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
if ((ctx->pos == 2 || fi->dentry) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
- __ceph_dir_is_complete(ci) &&
+ __ceph_dir_is_complete_ordered(ci) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
u32 shared_gen = ci->i_shared_gen;
spin_unlock(&ci->i_ceph_lock);
@@ -312,6 +308,13 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* proceed with a normal readdir */
+ if (ctx->pos == 2) {
+ /* note dir version at start of readdir so we can tell
+ * if any dentries get dropped */
+ fi->dir_release_count = atomic_read(&ci->i_release_count);
+ fi->dir_ordered_count = ci->i_ordered_count;
+ }
+
more:
/* do we have the correct frag content buffered? */
if (fi->frag != frag || fi->last_readdir == NULL) {
@@ -337,7 +340,7 @@ more:
}
req->r_inode = inode;
ihold(inode);
- req->r_dentry = dget(file->f_dentry);
+ req->r_dentry = dget(file->f_path.dentry);
/* hints to request -> mds selection code */
req->r_direct_mode = USE_AUTH_MDS;
req->r_direct_hash = ceph_frag_value(frag);
@@ -446,8 +449,12 @@ more:
*/
spin_lock(&ci->i_ceph_lock);
if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
- dout(" marking %p complete\n", inode);
- __ceph_dir_set_complete(ci, fi->dir_release_count);
+ if (ci->i_ordered_count == fi->dir_ordered_count)
+ dout(" marking %p complete and ordered\n", inode);
+ else
+ dout(" marking %p complete\n", inode);
+ __ceph_dir_set_complete(ci, fi->dir_release_count,
+ fi->dir_ordered_count);
}
spin_unlock(&ci->i_ceph_lock);
@@ -538,8 +545,8 @@ int ceph_handle_snapdir(struct ceph_mds_request *req,
strcmp(dentry->d_name.name,
fsc->mount_options->snapdir_name) == 0) {
struct inode *inode = ceph_get_snapdir(parent);
- dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
- dentry, dentry->d_name.len, dentry->d_name.name, inode);
+ dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n",
+ dentry, dentry, inode);
BUG_ON(!d_unhashed(dentry));
d_add(dentry, inode);
err = 0;
@@ -603,8 +610,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
int op;
int err;
- dout("lookup %p dentry %p '%.*s'\n",
- dir, dentry, dentry->d_name.len, dentry->d_name.name);
+ dout("lookup %p dentry %p '%pd'\n",
+ dir, dentry, dentry);
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
@@ -774,8 +781,8 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* mkdir .snap/foo is a MKSNAP */
op = CEPH_MDS_OP_MKSNAP;
- dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
- dentry->d_name.len, dentry->d_name.name, dentry);
+ dout("mksnap dir %p snap '%pd' dn %p\n", dir,
+ dentry, dentry);
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
op = CEPH_MDS_OP_MKDIR;
@@ -805,7 +812,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
acls.pagelist = NULL;
}
err = ceph_mdsc_do_request(mdsc, dir, req);
- if (!err && !req->r_reply_info.head->is_dentry)
+ if (!err &&
+ !req->r_reply_info.head->is_target &&
+ !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
ceph_mdsc_put_request(req);
out:
@@ -888,8 +897,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* rmdir .snap/foo is RMSNAP */
- dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
- dentry->d_name.name, dentry);
+ dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry);
op = CEPH_MDS_OP_RMSNAP;
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
dout("unlink/rmdir dir %p dn %p inode %p\n",
@@ -1063,16 +1071,15 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
- dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
- ceph_dentry(dentry)->offset);
+ dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
+ dentry, dentry->d_inode, ceph_dentry(dentry)->offset);
dir = ceph_get_dentry_parent_inode(dentry);
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
- dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
- dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+ dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
+ dentry, dentry->d_inode);
valid = 1;
} else if (dentry->d_inode &&
ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) {
@@ -1265,8 +1272,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
struct ceph_dentry_info *di = ceph_dentry(dn);
struct ceph_mds_client *mdsc;
- dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
- dn->d_name.len, dn->d_name.name);
+ dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn);
mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock);
list_add_tail(&di->lru, &mdsc->dentry_lru);
@@ -1279,8 +1285,8 @@ void ceph_dentry_lru_touch(struct dentry *dn)
struct ceph_dentry_info *di = ceph_dentry(dn);
struct ceph_mds_client *mdsc;
- dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
- dn->d_name.len, dn->d_name.name, di->offset);
+ dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn,
+ di->offset);
mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock);
list_move_tail(&di->lru, &mdsc->dentry_lru);
@@ -1292,8 +1298,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
struct ceph_dentry_info *di = ceph_dentry(dn);
struct ceph_mds_client *mdsc;
- dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
- dn->d_name.len, dn->d_name.name);
+ dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn);
mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock);
list_del_init(&di->lru);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d7e0da8366e6..ce74b394b49d 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -211,7 +211,7 @@ int ceph_open(struct inode *inode, struct file *file)
req->r_num_caps = 1;
if (flags & O_CREAT)
- parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
+ parent_inode = ceph_get_dentry_parent_inode(file->f_path.dentry);
err = ceph_mdsc_do_request(mdsc, parent_inode, req);
iput(parent_inode);
if (!err)
@@ -238,8 +238,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct ceph_acls_info acls = {};
int err;
- dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n",
- dir, dentry, dentry->d_name.len, dentry->d_name.name,
+ dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
+ dir, dentry, dentry,
d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
if (dentry->d_name.len > NAME_MAX)
@@ -333,6 +333,11 @@ int ceph_release(struct inode *inode, struct file *file)
return 0;
}
+enum {
+ CHECK_EOF = 1,
+ READ_INLINE = 2,
+};
+
/*
* Read a range of bytes striped over one or more objects. Iterate over
* objects we stripe over. (That's not atomic, but good enough for now.)
@@ -412,7 +417,7 @@ more:
ret = read;
/* did we bounce off eof? */
if (pos + left > inode->i_size)
- *checkeof = 1;
+ *checkeof = CHECK_EOF;
}
dout("striped_read returns %d\n", ret);
@@ -598,7 +603,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
- vino, pos, &len,
+ vino, pos, &len, 0,
2,/*include a 'startsync' command*/
CEPH_OSD_OP_WRITE, flags, snapc,
ci->i_truncate_seq,
@@ -609,6 +614,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
break;
}
+ osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+
n = iov_iter_get_pages_alloc(from, &pages, len, &start);
if (unlikely(n < 0)) {
ret = n;
@@ -713,7 +720,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
- vino, pos, &len, 1,
+ vino, pos, &len, 0, 1,
CEPH_OSD_OP_WRITE, flags, snapc,
ci->i_truncate_seq,
ci->i_truncate_size,
@@ -803,9 +810,10 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
size_t len = iocb->ki_nbytes;
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct page *pinned_page = NULL;
ssize_t ret;
int want, got = 0;
- int checkeof = 0, read = 0;
+ int retry_op = 0, read = 0;
again:
dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
@@ -815,7 +823,7 @@ again:
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
want = CEPH_CAP_FILE_CACHE;
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
if (ret < 0)
return ret;
@@ -827,8 +835,12 @@ again:
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
ceph_cap_string(got));
- /* hmm, this isn't really async... */
- ret = ceph_sync_read(iocb, to, &checkeof);
+ if (ci->i_inline_version == CEPH_INLINE_NONE) {
+ /* hmm, this isn't really async... */
+ ret = ceph_sync_read(iocb, to, &retry_op);
+ } else {
+ retry_op = READ_INLINE;
+ }
} else {
dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
@@ -838,13 +850,55 @@ again:
}
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
+ if (pinned_page) {
+ page_cache_release(pinned_page);
+ pinned_page = NULL;
+ }
ceph_put_cap_refs(ci, got);
+ if (retry_op && ret >= 0) {
+ int statret;
+ struct page *page = NULL;
+ loff_t i_size;
+ if (retry_op == READ_INLINE) {
+ page = __page_cache_alloc(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ }
- if (checkeof && ret >= 0) {
- int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
+ statret = __ceph_do_getattr(inode, page,
+ CEPH_STAT_CAP_INLINE_DATA, !!page);
+ if (statret < 0) {
+ __free_page(page);
+ if (statret == -ENODATA) {
+ BUG_ON(retry_op != READ_INLINE);
+ goto again;
+ }
+ return statret;
+ }
+
+ i_size = i_size_read(inode);
+ if (retry_op == READ_INLINE) {
+ /* does not support inline data > PAGE_SIZE */
+ if (i_size > PAGE_CACHE_SIZE) {
+ ret = -EIO;
+ } else if (iocb->ki_pos < i_size) {
+ loff_t end = min_t(loff_t, i_size,
+ iocb->ki_pos + len);
+ if (statret < end)
+ zero_user_segment(page, statret, end);
+ ret = copy_page_to_iter(page,
+ iocb->ki_pos & ~PAGE_MASK,
+ end - iocb->ki_pos, to);
+ iocb->ki_pos += ret;
+ } else {
+ ret = 0;
+ }
+ __free_pages(page, 0);
+ return ret;
+ }
/* hit EOF or hole? */
- if (statret == 0 && iocb->ki_pos < inode->i_size &&
+ if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
ret < len) {
dout("sync_read hit hole, ppos %lld < size %lld"
", reading more\n", iocb->ki_pos,
@@ -852,7 +906,7 @@ again:
read += ret;
len -= ret;
- checkeof = 0;
+ retry_op = 0;
goto again;
}
}
@@ -909,6 +963,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (err)
goto out;
+ if (ci->i_inline_version != CEPH_INLINE_NONE) {
+ err = ceph_uninline_data(file, NULL);
+ if (err < 0)
+ goto out;
+ }
+
retry_snap:
if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) {
err = -ENOSPC;
@@ -922,7 +982,8 @@ retry_snap:
else
want = CEPH_CAP_FILE_BUFFER;
got = 0;
- err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count);
+ err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count,
+ &got, NULL);
if (err < 0)
goto out;
@@ -969,6 +1030,7 @@ retry_snap:
if (written >= 0) {
int dirty;
spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
@@ -1111,7 +1173,7 @@ static int ceph_zero_partial_object(struct inode *inode,
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode),
offset, length,
- 1, op,
+ 0, 1, op,
CEPH_OSD_FLAG_WRITE |
CEPH_OSD_FLAG_ONDISK,
NULL, 0, 0, false);
@@ -1214,6 +1276,12 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
}
+ if (ci->i_inline_version != CEPH_INLINE_NONE) {
+ ret = ceph_uninline_data(file, NULL);
+ if (ret < 0)
+ goto unlock;
+ }
+
size = i_size_read(inode);
if (!(mode & FALLOC_FL_KEEP_SIZE))
endoff = offset + length;
@@ -1223,7 +1291,7 @@ static long ceph_fallocate(struct file *file, int mode,
else
want = CEPH_CAP_FILE_BUFFER;
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
if (ret < 0)
goto unlock;
@@ -1240,6 +1308,7 @@ static long ceph_fallocate(struct file *file, int mode,
if (!ret) {
spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 7b6139004401..f61a74115beb 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -387,8 +387,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
spin_lock_init(&ci->i_ceph_lock);
ci->i_version = 0;
+ ci->i_inline_version = 0;
ci->i_time_warp_seq = 0;
ci->i_ceph_flags = 0;
+ ci->i_ordered_count = 0;
atomic_set(&ci->i_release_count, 1);
atomic_set(&ci->i_complete_count, 0);
ci->i_symlink = NULL;
@@ -657,7 +659,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
* Populate an inode based on info from mds. May be called on new or
* existing inodes.
*/
-static int fill_inode(struct inode *inode,
+static int fill_inode(struct inode *inode, struct page *locked_page,
struct ceph_mds_reply_info_in *iinfo,
struct ceph_mds_reply_dirfrag *dirinfo,
struct ceph_mds_session *session,
@@ -675,6 +677,7 @@ static int fill_inode(struct inode *inode,
bool wake = false;
bool queue_trunc = false;
bool new_version = false;
+ bool fill_inline = false;
dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
inode, ceph_vinop(inode), le64_to_cpu(info->version),
@@ -845,7 +848,8 @@ static int fill_inode(struct inode *inode,
(issued & CEPH_CAP_FILE_EXCL) == 0 &&
!__ceph_dir_is_complete(ci)) {
dout(" marking %p complete (empty)\n", inode);
- __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
+ __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count),
+ ci->i_ordered_count);
}
/* were we issued a capability? */
@@ -873,8 +877,23 @@ static int fill_inode(struct inode *inode,
ceph_vinop(inode));
__ceph_get_fmode(ci, cap_fmode);
}
+
+ if (iinfo->inline_version > 0 &&
+ iinfo->inline_version >= ci->i_inline_version) {
+ int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
+ ci->i_inline_version = iinfo->inline_version;
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ (locked_page ||
+ (le32_to_cpu(info->cap.caps) & cache_caps)))
+ fill_inline = true;
+ }
+
spin_unlock(&ci->i_ceph_lock);
+ if (fill_inline)
+ ceph_fill_inline_data(inode, locked_page,
+ iinfo->inline_data, iinfo->inline_len);
+
if (wake)
wake_up_all(&ci->i_cap_wq);
@@ -967,7 +986,7 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
/* dn must be unhashed */
if (!d_unhashed(dn))
d_drop(dn);
- realdn = d_materialise_unique(dn, in);
+ realdn = d_splice_alias(in, dn);
if (IS_ERR(realdn)) {
pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
PTR_ERR(realdn), dn, in, ceph_vinop(in));
@@ -1062,7 +1081,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
struct inode *dir = req->r_locked_dir;
if (dir) {
- err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
+ err = fill_inode(dir, NULL,
+ &rinfo->diri, rinfo->dirfrag,
session, req->r_request_started, -1,
&req->r_caps_reservation);
if (err < 0)
@@ -1132,7 +1152,7 @@ retry_lookup:
}
req->r_target_inode = in;
- err = fill_inode(in, &rinfo->targeti, NULL,
+ err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
session, req->r_request_started,
(!req->r_aborted && rinfo->head->result == 0) ?
req->r_fmode : -1,
@@ -1186,28 +1206,26 @@ retry_lookup:
struct inode *olddir = req->r_old_dentry_dir;
BUG_ON(!olddir);
- dout(" src %p '%.*s' dst %p '%.*s'\n",
+ dout(" src %p '%pd' dst %p '%pd'\n",
+ req->r_old_dentry,
req->r_old_dentry,
- req->r_old_dentry->d_name.len,
- req->r_old_dentry->d_name.name,
- dn, dn->d_name.len, dn->d_name.name);
+ dn, dn);
dout("fill_trace doing d_move %p -> %p\n",
req->r_old_dentry, dn);
d_move(req->r_old_dentry, dn);
- dout(" src %p '%.*s' dst %p '%.*s'\n",
+ dout(" src %p '%pd' dst %p '%pd'\n",
req->r_old_dentry,
- req->r_old_dentry->d_name.len,
- req->r_old_dentry->d_name.name,
- dn, dn->d_name.len, dn->d_name.name);
+ req->r_old_dentry,
+ dn, dn);
/* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn);
/* d_move screws up sibling dentries' offsets */
- ceph_dir_clear_complete(dir);
- ceph_dir_clear_complete(olddir);
+ ceph_dir_clear_ordered(dir);
+ ceph_dir_clear_ordered(olddir);
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset);
@@ -1219,6 +1237,7 @@ retry_lookup:
if (!rinfo->head->is_target) {
dout("fill_trace null dentry\n");
if (dn->d_inode) {
+ ceph_dir_clear_ordered(dir);
dout("d_delete %p\n", dn);
d_delete(dn);
} else {
@@ -1235,7 +1254,7 @@ retry_lookup:
/* attach proper inode */
if (!dn->d_inode) {
- ceph_dir_clear_complete(dir);
+ ceph_dir_clear_ordered(dir);
ihold(in);
dn = splice_dentry(dn, in, &have_lease);
if (IS_ERR(dn)) {
@@ -1265,7 +1284,7 @@ retry_lookup:
BUG_ON(!dir);
BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
dout(" linking snapped dir %p to dn %p\n", in, dn);
- ceph_dir_clear_complete(dir);
+ ceph_dir_clear_ordered(dir);
ihold(in);
dn = splice_dentry(dn, in, NULL);
if (IS_ERR(dn)) {
@@ -1302,7 +1321,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
dout("new_inode badness got %d\n", err);
continue;
}
- rc = fill_inode(in, &rinfo->dir_in[i], NULL, session,
+ rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
req->r_request_started, -1,
&req->r_caps_reservation);
if (rc < 0) {
@@ -1399,7 +1418,7 @@ retry_lookup:
/* reorder parent's d_subdirs */
spin_lock(&parent->d_lock);
spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
- list_move(&dn->d_u.d_child, &parent->d_subdirs);
+ list_move(&dn->d_child, &parent->d_subdirs);
spin_unlock(&dn->d_lock);
spin_unlock(&parent->d_lock);
}
@@ -1418,7 +1437,7 @@ retry_lookup:
}
}
- if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
+ if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
req->r_request_started, -1,
&req->r_caps_reservation) < 0) {
pr_err("fill_inode badness on %p\n", in);
@@ -1901,7 +1920,8 @@ out_put:
* Verify that we have a lease on the given mask. If not,
* do a getattr against an mds.
*/
-int ceph_do_getattr(struct inode *inode, int mask, bool force)
+int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
+ int mask, bool force)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1913,7 +1933,8 @@ int ceph_do_getattr(struct inode *inode, int mask, bool force)
return 0;
}
- dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
+ dout("do_getattr inode %p mask %s mode 0%o\n",
+ inode, ceph_cap_string(mask), inode->i_mode);
if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
return 0;
@@ -1924,7 +1945,19 @@ int ceph_do_getattr(struct inode *inode, int mask, bool force)
ihold(inode);
req->r_num_caps = 1;
req->r_args.getattr.mask = cpu_to_le32(mask);
+ req->r_locked_page = locked_page;
err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (locked_page && err == 0) {
+ u64 inline_version = req->r_reply_info.targeti.inline_version;
+ if (inline_version == 0) {
+ /* the reply is supposed to contain inline data */
+ err = -EINVAL;
+ } else if (inline_version == CEPH_INLINE_NONE) {
+ err = -ENODATA;
+ } else {
+ err = req->r_reply_info.targeti.inline_len;
+ }
+ }
ceph_mdsc_put_request(req);
dout("do_getattr result=%d\n", err);
return err;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index fbc39c47bacd..c35c5c614e38 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -9,6 +9,8 @@
#include <linux/ceph/pagelist.h>
static u64 lock_secret;
+static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req);
static inline u64 secure_addr(void *addr)
{
@@ -40,6 +42,9 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
u64 length = 0;
u64 owner;
+ if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
+ wait = 0;
+
req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -68,6 +73,9 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
+ if (wait)
+ req->r_wait_for_completion = ceph_lock_wait_for_completion;
+
err = ceph_mdsc_do_request(mdsc, inode, req);
if (operation == CEPH_MDS_OP_GETFILELOCK) {
@@ -96,6 +104,52 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
return err;
}
+static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ struct ceph_mds_request *intr_req;
+ struct inode *inode = req->r_inode;
+ int err, lock_type;
+
+ BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
+ if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
+ lock_type = CEPH_LOCK_FCNTL_INTR;
+ else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
+ lock_type = CEPH_LOCK_FLOCK_INTR;
+ else
+ BUG_ON(1);
+ BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);
+
+ err = wait_for_completion_interruptible(&req->r_completion);
+ if (!err)
+ return 0;
+
+ dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
+ req->r_tid);
+
+ intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
+ USE_AUTH_MDS);
+ if (IS_ERR(intr_req))
+ return PTR_ERR(intr_req);
+
+ intr_req->r_inode = inode;
+ ihold(inode);
+ intr_req->r_num_caps = 1;
+
+ intr_req->r_args.filelock_change = req->r_args.filelock_change;
+ intr_req->r_args.filelock_change.rule = lock_type;
+ intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;
+
+ err = ceph_mdsc_do_request(mdsc, inode, intr_req);
+ ceph_mdsc_put_request(intr_req);
+
+ if (err && err != -ERESTARTSYS)
+ return err;
+
+ wait_for_completion(&req->r_completion);
+ return 0;
+}
+
/**
* Attempt to set an fcntl lock.
* For now, this just goes away to the server. Later it may be more awesome.
@@ -143,11 +197,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
err);
}
}
-
- } else if (err == -ERESTARTSYS) {
- dout("undoing lock\n");
- ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
- CEPH_LOCK_UNLOCK, 0, fl);
}
return err;
}
@@ -186,11 +235,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
file, CEPH_LOCK_UNLOCK, 0, fl);
dout("got %d on flock_lock_file_wait, undid lock", err);
}
- } else if (err == -ERESTARTSYS) {
- dout("undoing lock\n");
- ceph_lock_message(CEPH_LOCK_FLOCK,
- CEPH_MDS_OP_SETFILELOCK,
- file, CEPH_LOCK_UNLOCK, 0, fl);
}
return err;
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a92d3f5c6c12..d2171f4a6980 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -89,6 +89,16 @@ static int parse_reply_info_in(void **p, void *end,
ceph_decode_need(p, end, info->xattr_len, bad);
info->xattr_data = *p;
*p += info->xattr_len;
+
+ if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
+ ceph_decode_64_safe(p, end, info->inline_version, bad);
+ ceph_decode_32_safe(p, end, info->inline_len, bad);
+ ceph_decode_need(p, end, info->inline_len, bad);
+ info->inline_data = *p;
+ *p += info->inline_len;
+ } else
+ info->inline_version = CEPH_INLINE_NONE;
+
return 0;
bad:
return err;
@@ -524,8 +534,7 @@ void ceph_mdsc_release_request(struct kref *kref)
}
if (req->r_locked_dir)
ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
- if (req->r_target_inode)
- iput(req->r_target_inode);
+ iput(req->r_target_inode);
if (req->r_dentry)
dput(req->r_dentry);
if (req->r_old_dentry)
@@ -861,8 +870,11 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
/*
* Serialize client metadata into waiting buffer space, using
* the format that userspace expects for map<string, string>
+ *
+ * ClientSession messages with metadata are v2
*/
- msg->hdr.version = 2; /* ClientSession messages with metadata are v2 */
+ msg->hdr.version = cpu_to_le16(2);
+ msg->hdr.compat_version = cpu_to_le16(1);
/* The write pointer, following the session_head structure */
p = msg->front.iov_base + sizeof(*h);
@@ -1066,8 +1078,7 @@ out:
session->s_cap_iterator = NULL;
spin_unlock(&session->s_cap_lock);
- if (last_inode)
- iput(last_inode);
+ iput(last_inode);
if (old_cap)
ceph_put_cap(session->s_mdsc, old_cap);
@@ -1874,7 +1885,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
goto out_free2;
}
- msg->hdr.version = 2;
+ msg->hdr.version = cpu_to_le16(2);
msg->hdr.tid = cpu_to_le64(req->r_tid);
head = msg->front.iov_base;
@@ -2208,6 +2219,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
&req->r_completion, req->r_timeout);
if (err == 0)
err = -EIO;
+ } else if (req->r_wait_for_completion) {
+ err = req->r_wait_for_completion(mdsc, req);
} else {
err = wait_for_completion_killable(&req->r_completion);
}
@@ -3744,6 +3757,20 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
return msg;
}
+static int sign_message(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_auth_handshake *auth = &s->s_auth;
+ return ceph_auth_sign_message(auth, msg);
+}
+
+static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_auth_handshake *auth = &s->s_auth;
+ return ceph_auth_check_message_signature(auth, msg);
+}
+
static const struct ceph_connection_operations mds_con_ops = {
.get = con_get,
.put = con_put,
@@ -3753,6 +3780,8 @@ static const struct ceph_connection_operations mds_con_ops = {
.invalidate_authorizer = invalidate_authorizer,
.peer_reset = peer_reset,
.alloc_msg = mds_alloc_msg,
+ .sign_message = sign_message,
+ .check_message_signature = check_message_signature,
};
/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 3288359353e9..e2817d00f7d9 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -41,6 +41,9 @@ struct ceph_mds_reply_info_in {
char *symlink;
u32 xattr_len;
char *xattr_data;
+ u64 inline_version;
+ u32 inline_len;
+ char *inline_data;
};
/*
@@ -166,6 +169,11 @@ struct ceph_mds_client;
*/
typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
struct ceph_mds_request *req);
+/*
+ * wait for request completion callback
+ */
+typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req);
/*
* an in-flight mds request
@@ -215,6 +223,7 @@ struct ceph_mds_request {
int r_request_release_offset;
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
+ struct page *r_locked_page;
int r_err;
bool r_aborted;
@@ -239,6 +248,7 @@ struct ceph_mds_request {
struct completion r_completion;
struct completion r_safe_completion;
ceph_mds_request_callback_t r_callback;
+ ceph_mds_request_wait_callback_t r_wait_for_completion;
struct list_head r_unsafe_item; /* per-session unsafe list item */
bool r_got_unsafe, r_got_safe, r_got_result;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index f01645a27752..ce35fbd4ba5d 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -288,6 +288,9 @@ static int cmpu64_rev(const void *a, const void *b)
return 0;
}
+
+static struct ceph_snap_context *empty_snapc;
+
/*
* build the snap context for a given realm.
*/
@@ -328,6 +331,12 @@ static int build_snap_context(struct ceph_snap_realm *realm)
return 0;
}
+ if (num == 0 && realm->seq == empty_snapc->seq) {
+ ceph_get_snap_context(empty_snapc);
+ snapc = empty_snapc;
+ goto done;
+ }
+
/* alloc new snap context */
err = -ENOMEM;
if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
@@ -365,8 +374,8 @@ static int build_snap_context(struct ceph_snap_realm *realm)
realm->ino, realm, snapc, snapc->seq,
(unsigned int) snapc->num_snaps);
- if (realm->cached_context)
- ceph_put_snap_context(realm->cached_context);
+done:
+ ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
return 0;
@@ -466,6 +475,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
cap_snap. lucky us. */
dout("queue_cap_snap %p already pending\n", inode);
kfree(capsnap);
+ } else if (ci->i_snap_realm->cached_context == empty_snapc) {
+ dout("queue_cap_snap %p empty snapc\n", inode);
+ kfree(capsnap);
} else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
struct ceph_snap_context *snapc = ci->i_head_snapc;
@@ -504,6 +516,8 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
capsnap->xattr_version = 0;
}
+ capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+
/* dirty page count moved from _head to this cap_snap;
all subsequent writes page dirties occur _after_ this
snapshot. */
@@ -590,15 +604,13 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
if (!inode)
continue;
spin_unlock(&realm->inodes_with_caps_lock);
- if (lastinode)
- iput(lastinode);
+ iput(lastinode);
lastinode = inode;
ceph_queue_cap_snap(ci);
spin_lock(&realm->inodes_with_caps_lock);
}
spin_unlock(&realm->inodes_with_caps_lock);
- if (lastinode)
- iput(lastinode);
+ iput(lastinode);
list_for_each_entry(child, &realm->children, child_item) {
dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n",
@@ -928,5 +940,16 @@ out:
return;
}
+int __init ceph_snap_init(void)
+{
+ empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
+ if (!empty_snapc)
+ return -ENOMEM;
+ empty_snapc->seq = 1;
+ return 0;
+}
-
+void ceph_snap_exit(void)
+{
+ ceph_put_snap_context(empty_snapc);
+}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f6e12377335c..50f06cddc94b 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -515,7 +515,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_fs_client *fsc;
const u64 supported_features =
CEPH_FEATURE_FLOCK |
- CEPH_FEATURE_DIRLAYOUTHASH;
+ CEPH_FEATURE_DIRLAYOUTHASH |
+ CEPH_FEATURE_MDS_INLINE_DATA;
const u64 required_features = 0;
int page_count;
size_t size;
@@ -1017,9 +1018,6 @@ static struct file_system_type ceph_fs_type = {
};
MODULE_ALIAS_FS("ceph");
-#define _STRINGIFY(x) #x
-#define STRINGIFY(x) _STRINGIFY(x)
-
static int __init init_ceph(void)
{
int ret = init_caches();
@@ -1028,15 +1026,20 @@ static int __init init_ceph(void)
ceph_flock_init();
ceph_xattr_init();
+ ret = ceph_snap_init();
+ if (ret)
+ goto out_xattr;
ret = register_filesystem(&ceph_fs_type);
if (ret)
- goto out_icache;
+ goto out_snap;
pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
return 0;
-out_icache:
+out_snap:
+ ceph_snap_exit();
+out_xattr:
ceph_xattr_exit();
destroy_caches();
out:
@@ -1047,6 +1050,7 @@ static void __exit exit_ceph(void)
{
dout("exit_ceph\n");
unregister_filesystem(&ceph_fs_type);
+ ceph_snap_exit();
ceph_xattr_exit();
destroy_caches();
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b82f507979b8..e1aa32d0759d 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -161,6 +161,7 @@ struct ceph_cap_snap {
u64 time_warp_seq;
int writing; /* a sync write is still in progress */
int dirty_pages; /* dirty pages awaiting writeback */
+ bool inline_data;
};
static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
@@ -253,9 +254,11 @@ struct ceph_inode_info {
spinlock_t i_ceph_lock;
u64 i_version;
+ u64 i_inline_version;
u32 i_time_warp_seq;
unsigned i_ceph_flags;
+ int i_ordered_count;
atomic_t i_release_count;
atomic_t i_complete_count;
@@ -434,14 +437,19 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
/*
* Ceph inode.
*/
-#define CEPH_I_NODELAY 4 /* do not delay cap release */
-#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
+#define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */
+#define CEPH_I_NODELAY 4 /* do not delay cap release */
+#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
- int release_count)
+ int release_count, int ordered_count)
{
atomic_set(&ci->i_complete_count, release_count);
+ if (ci->i_ordered_count == ordered_count)
+ ci->i_ceph_flags |= CEPH_I_DIR_ORDERED;
+ else
+ ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
}
static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
@@ -455,16 +463,35 @@ static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
atomic_read(&ci->i_release_count);
}
+static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci)
+{
+ return __ceph_dir_is_complete(ci) &&
+ (ci->i_ceph_flags & CEPH_I_DIR_ORDERED);
+}
+
static inline void ceph_dir_clear_complete(struct inode *inode)
{
__ceph_dir_clear_complete(ceph_inode(inode));
}
-static inline bool ceph_dir_is_complete(struct inode *inode)
+static inline void ceph_dir_clear_ordered(struct inode *inode)
{
- return __ceph_dir_is_complete(ceph_inode(inode));
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ordered_count++;
+ ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
+ spin_unlock(&ci->i_ceph_lock);
}
+static inline bool ceph_dir_is_complete_ordered(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ bool ret;
+ spin_lock(&ci->i_ceph_lock);
+ ret = __ceph_dir_is_complete_ordered(ci);
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
/* find a specific frag @f */
extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
@@ -580,6 +607,7 @@ struct ceph_file_info {
char *last_name; /* last entry in previous chunk */
struct dentry *dentry; /* next dentry (for dcache readdir) */
int dir_release_count;
+ int dir_ordered_count;
/* used for -o dirstat read() on directory thing */
char *dir_info;
@@ -673,6 +701,8 @@ extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+extern int ceph_snap_init(void);
+extern void ceph_snap_exit(void);
/*
* a cap_snap is "pending" if it is still awaiting an in-progress
@@ -715,7 +745,12 @@ extern void ceph_queue_vmtruncate(struct inode *inode);
extern void ceph_queue_invalidate(struct inode *inode);
extern void ceph_queue_writeback(struct inode *inode);
-extern int ceph_do_getattr(struct inode *inode, int mask, bool force);
+extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
+ int mask, bool force);
+static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
+{
+ return __ceph_do_getattr(inode, NULL, mask, force);
+}
extern int ceph_permission(struct inode *inode, int mask);
extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
@@ -830,7 +865,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
int mds, int drop, int unless);
extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
- int *got, loff_t endoff);
+ loff_t endoff, int *got, struct page **pinned_page);
/* for counting open files by mode */
static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
@@ -852,7 +887,9 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode,
int *opened);
extern int ceph_release(struct inode *inode, struct file *filp);
-
+extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
+ char *data, size_t len);
+int ceph_uninline_data(struct file *filp, struct page *locked_page);
/* dir.c */
extern const struct file_operations ceph_dir_fops;
extern const struct inode_operations ceph_dir_iops;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 678b0d2bbbc4..5a492caf34cb 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -854,7 +854,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
struct ceph_pagelist *pagelist = NULL;
int err;
- if (value) {
+ if (size > 0) {
/* copy value into pagelist */
pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
if (!pagelist)
@@ -864,7 +864,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
err = ceph_pagelist_append(pagelist, value, size);
if (err)
goto out;
- } else {
+ } else if (!value) {
flags |= CEPH_XATTR_REMOVE;
}
@@ -1001,6 +1001,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
return generic_setxattr(dentry, name, value, size, flags);
+ if (size == 0)
+ value = ""; /* empty EA, do not remove */
+
return __ceph_setxattr(dentry, name, value, size, flags);
}
diff --git a/fs/char_dev.c b/fs/char_dev.c
index f77f7702fabe..67b2007f10fe 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -117,7 +117,6 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
goto out;
}
major = i;
- ret = major;
}
cd->major = major;
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 44ec72684df5..9c56ef776407 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -34,27 +34,9 @@
void
cifs_dump_mem(char *label, void *data, int length)
{
- int i, j;
- int *intptr = data;
- char *charptr = data;
- char buf[10], line[80];
-
- printk(KERN_DEBUG "%s: dump of %d bytes of data at 0x%p\n",
- label, length, data);
- for (i = 0; i < length; i += 16) {
- line[0] = 0;
- for (j = 0; (j < 4) && (i + j * 4 < length); j++) {
- sprintf(buf, " %08x", intptr[i / 4 + j]);
- strcat(line, buf);
- }
- buf[0] = ' ';
- buf[2] = 0;
- for (j = 0; (j < 16) && (i + j < length); j++) {
- buf[1] = isprint(charptr[i + j]) ? charptr[i + j] : '.';
- strcat(line, buf);
- }
- printk(KERN_DEBUG "%s\n", line);
- }
+ pr_debug("%s: dump of %d bytes of data at 0x%p\n", label, length, data);
+ print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 16, 4,
+ data, length, true);
}
#ifdef CONFIG_CIFS_DEBUG
@@ -68,7 +50,7 @@ void cifs_vfs_err(const char *fmt, ...)
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_ERR "CIFS VFS: %pV", &vaf);
+ pr_err("CIFS VFS: %pV", &vaf);
va_end(args);
}
@@ -274,6 +256,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
char c;
+ bool bv;
int rc;
struct list_head *tmp1, *tmp2, *tmp3;
struct TCP_Server_Info *server;
@@ -284,7 +267,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
if (rc)
return rc;
- if (c == '1' || c == 'y' || c == 'Y' || c == '0') {
+ if (strtobool(&c, &bv) == 0) {
#ifdef CONFIG_CIFS_STATS2
atomic_set(&totBufAllocCount, 0);
atomic_set(&totSmBufAllocCount, 0);
@@ -451,15 +434,14 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
char c;
+ bool bv;
int rc;
rc = get_user(c, buffer);
if (rc)
return rc;
- if (c == '0' || c == 'n' || c == 'N')
- cifsFYI = 0;
- else if (c == '1' || c == 'y' || c == 'Y')
- cifsFYI = 1;
+ if (strtobool(&c, &bv) == 0)
+ cifsFYI = bv;
else if ((c > '1') && (c <= '9'))
cifsFYI = (int) (c - '0'); /* see cifs_debug.h for meanings */
@@ -490,15 +472,18 @@ static ssize_t cifs_linux_ext_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
char c;
+ bool bv;
int rc;
rc = get_user(c, buffer);
if (rc)
return rc;
- if (c == '0' || c == 'n' || c == 'N')
- linuxExtEnabled = 0;
- else if (c == '1' || c == 'y' || c == 'Y')
- linuxExtEnabled = 1;
+
+ rc = strtobool(&c, &bv);
+ if (rc)
+ return rc;
+
+ linuxExtEnabled = bv;
return count;
}
@@ -527,15 +512,18 @@ static ssize_t cifs_lookup_cache_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
char c;
+ bool bv;
int rc;
rc = get_user(c, buffer);
if (rc)
return rc;
- if (c == '0' || c == 'n' || c == 'N')
- lookupCacheEnabled = 0;
- else if (c == '1' || c == 'y' || c == 'Y')
- lookupCacheEnabled = 1;
+
+ rc = strtobool(&c, &bv);
+ if (rc)
+ return rc;
+
+ lookupCacheEnabled = bv;
return count;
}
@@ -564,15 +552,18 @@ static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
char c;
+ bool bv;
int rc;
rc = get_user(c, buffer);
if (rc)
return rc;
- if (c == '0' || c == 'n' || c == 'N')
- traceSMB = 0;
- else if (c == '1' || c == 'y' || c == 'Y')
- traceSMB = 1;
+
+ rc = strtobool(&c, &bv);
+ if (rc)
+ return rc;
+
+ traceSMB = bv;
return count;
}
@@ -630,6 +621,7 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
unsigned int flags;
char flags_string[12];
char c;
+ bool bv;
if ((count < 1) || (count > 11))
return -EINVAL;
@@ -642,11 +634,8 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
if (count < 3) {
/* single char or single char followed by null */
c = flags_string[0];
- if (c == '0' || c == 'n' || c == 'N') {
- global_secflags = CIFSSEC_DEF; /* default */
- return count;
- } else if (c == '1' || c == 'y' || c == 'Y') {
- global_secflags = CIFSSEC_MAX;
+ if (strtobool(&c, &bv) == 0) {
+ global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF;
return count;
} else if (!isdigit(c)) {
cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index c99b40fb609b..f40fbaca1b2a 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -53,13 +53,12 @@ __printf(1, 2) void cifs_vfs_err(const char *fmt, ...);
do { \
if (type == FYI) { \
if (cifsFYI & CIFS_INFO) { \
- printk(KERN_DEBUG "%s: " fmt, \
- __FILE__, ##__VA_ARGS__); \
+ pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__); \
} \
} else if (type == VFS) { \
cifs_vfs_err(fmt, ##__VA_ARGS__); \
} else if (type == NOISY && type != 0) { \
- printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+ pr_debug(fmt, ##__VA_ARGS__); \
} \
} while (0)
@@ -71,7 +70,7 @@ do { \
#define cifs_dbg(type, fmt, ...) \
do { \
if (0) \
- printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+ pr_debug(fmt, ##__VA_ARGS__); \
} while (0)
#endif
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 6d00c419cbae..1ea780bc6376 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -38,7 +38,7 @@ static const struct cifs_sid sid_everyone = {
1, 1, {0, 0, 0, 0, 0, 1}, {0} };
/* security id for Authenticated Users system group */
static const struct cifs_sid sid_authusers = {
- 1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11)} };
+ 1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} };
/* group users */
static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 9d7996e8e793..d72fe37f5420 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -209,8 +209,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len)
{
- struct super_block *sb = file->f_path.dentry->d_sb;
- struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct TCP_Server_Info *server = tcon->ses->server;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 002e0c173939..252f5c15806b 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.05"
+#define CIFS_VERSION "2.06"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 02a33e529904..6e139111fdb2 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1168,6 +1168,12 @@ CIFS_SB(struct super_block *sb)
return sb->s_fs_info;
}
+static inline struct cifs_sb_info *
+CIFS_FILE_SB(struct file *file)
+{
+ return CIFS_SB(file_inode(file)->i_sb);
+}
+
static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb)
{
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 61d00a6e398f..fa13d5e79f64 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2477,14 +2477,14 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
}
parm_data = (struct cifs_posix_lock *)
((char *)&pSMBr->hdr.Protocol + data_offset);
- if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK))
+ if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK))
pLockData->fl_type = F_UNLCK;
else {
if (parm_data->lock_type ==
- __constant_cpu_to_le16(CIFS_RDLCK))
+ cpu_to_le16(CIFS_RDLCK))
pLockData->fl_type = F_RDLCK;
else if (parm_data->lock_type ==
- __constant_cpu_to_le16(CIFS_WRLCK))
+ cpu_to_le16(CIFS_WRLCK))
pLockData->fl_type = F_WRLCK;
pLockData->fl_start = le64_to_cpu(parm_data->start);
@@ -3276,25 +3276,25 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
pSMB->TotalParameterCount = 0;
- pSMB->TotalDataCount = __constant_cpu_to_le32(2);
+ pSMB->TotalDataCount = cpu_to_le32(2);
pSMB->MaxParameterCount = 0;
pSMB->MaxDataCount = 0;
pSMB->MaxSetupCount = 4;
pSMB->Reserved = 0;
pSMB->ParameterOffset = 0;
- pSMB->DataCount = __constant_cpu_to_le32(2);
+ pSMB->DataCount = cpu_to_le32(2);
pSMB->DataOffset =
cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req,
compression_state) - 4); /* 84 */
pSMB->SetupCount = 4;
- pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL);
+ pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL);
pSMB->ParameterCount = 0;
- pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION);
+ pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION);
pSMB->IsFsctl = 1; /* FSCTL */
pSMB->IsRootFlag = 0;
pSMB->Fid = fid; /* file handle always le */
/* 3 byte pad, followed by 2 byte compress state */
- pSMB->ByteCount = __constant_cpu_to_le16(5);
+ pSMB->ByteCount = cpu_to_le16(5);
inc_rfc1001_len(pSMB, 5);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3430,10 +3430,10 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
cifs_acl->version = cpu_to_le16(1);
if (acl_type == ACL_TYPE_ACCESS) {
cifs_acl->access_entry_count = cpu_to_le16(count);
- cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF);
+ cifs_acl->default_entry_count = cpu_to_le16(0xFFFF);
} else if (acl_type == ACL_TYPE_DEFAULT) {
cifs_acl->default_entry_count = cpu_to_le16(count);
- cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF);
+ cifs_acl->access_entry_count = cpu_to_le16(0xFFFF);
} else {
cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
return 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 24fa08d261fb..2a772da16b83 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1466,9 +1466,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->seal = 1;
break;
case Opt_noac:
- printk(KERN_WARNING "CIFS: Mount option noac not "
- "supported. Instead set "
- "/proc/fs/cifs/LookupCacheEnabled to 0\n");
+ pr_warn("CIFS: Mount option noac not supported. Instead set /proc/fs/cifs/LookupCacheEnabled to 0\n");
break;
case Opt_fsc:
#ifndef CONFIG_CIFS_FSCACHE
@@ -1598,7 +1596,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
if (strnlen(string, CIFS_MAX_USERNAME_LEN) >
CIFS_MAX_USERNAME_LEN) {
- printk(KERN_WARNING "CIFS: username too long\n");
+ pr_warn("CIFS: username too long\n");
goto cifs_parse_mount_err;
}
vol->username = kstrdup(string, GFP_KERNEL);
@@ -1662,8 +1660,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
temp_len = strlen(value);
vol->password = kzalloc(temp_len+1, GFP_KERNEL);
if (vol->password == NULL) {
- printk(KERN_WARNING "CIFS: no memory "
- "for password\n");
+ pr_warn("CIFS: no memory for password\n");
goto cifs_parse_mount_err;
}
@@ -1687,8 +1684,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
if (!cifs_convert_address(dstaddr, string,
strlen(string))) {
- printk(KERN_ERR "CIFS: bad ip= option (%s).\n",
- string);
+ pr_err("CIFS: bad ip= option (%s).\n", string);
goto cifs_parse_mount_err;
}
got_ip = true;
@@ -1700,15 +1696,13 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
if (strnlen(string, CIFS_MAX_DOMAINNAME_LEN)
== CIFS_MAX_DOMAINNAME_LEN) {
- printk(KERN_WARNING "CIFS: domain name too"
- " long\n");
+ pr_warn("CIFS: domain name too long\n");
goto cifs_parse_mount_err;
}
vol->domainname = kstrdup(string, GFP_KERNEL);
if (!vol->domainname) {
- printk(KERN_WARNING "CIFS: no memory "
- "for domainname\n");
+ pr_warn("CIFS: no memory for domainname\n");
goto cifs_parse_mount_err;
}
cifs_dbg(FYI, "Domain name set\n");
@@ -1721,8 +1715,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
if (!cifs_convert_address(
(struct sockaddr *)&vol->srcaddr,
string, strlen(string))) {
- printk(KERN_WARNING "CIFS: Could not parse"
- " srcaddr: %s\n", string);
+ pr_warn("CIFS: Could not parse srcaddr: %s\n",
+ string);
goto cifs_parse_mount_err;
}
break;
@@ -1732,8 +1726,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
goto out_nomem;
if (strnlen(string, 1024) >= 65) {
- printk(KERN_WARNING "CIFS: iocharset name "
- "too long.\n");
+ pr_warn("CIFS: iocharset name too long.\n");
goto cifs_parse_mount_err;
}
@@ -1741,8 +1734,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->iocharset = kstrdup(string,
GFP_KERNEL);
if (!vol->iocharset) {
- printk(KERN_WARNING "CIFS: no memory"
- "for charset\n");
+ pr_warn("CIFS: no memory for charset\n");
goto cifs_parse_mount_err;
}
}
@@ -1773,9 +1765,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
* set at top of the function
*/
if (i == RFC1001_NAME_LEN && string[i] != 0)
- printk(KERN_WARNING "CIFS: netbiosname"
- " longer than 15 truncated.\n");
-
+ pr_warn("CIFS: netbiosname longer than 15 truncated.\n");
break;
case Opt_servern:
/* servernetbiosname specified override *SMBSERVER */
@@ -1801,8 +1791,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
/* The string has 16th byte zero still from
set at top of the function */
if (i == RFC1001_NAME_LEN && string[i] != 0)
- printk(KERN_WARNING "CIFS: server net"
- "biosname longer than 15 truncated.\n");
+ pr_warn("CIFS: server netbiosname longer than 15 truncated.\n");
break;
case Opt_ver:
string = match_strdup(args);
@@ -1814,8 +1803,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
break;
}
/* For all other value, error */
- printk(KERN_WARNING "CIFS: Invalid version"
- " specified\n");
+ pr_warn("CIFS: Invalid version specified\n");
goto cifs_parse_mount_err;
case Opt_vers:
string = match_strdup(args);
@@ -1856,7 +1844,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
if (!sloppy && invalid) {
- printk(KERN_ERR "CIFS: Unknown mount option \"%s\"\n", invalid);
+ pr_err("CIFS: Unknown mount option \"%s\"\n", invalid);
goto cifs_parse_mount_err;
}
@@ -1882,8 +1870,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
/* No ip= option specified? Try to get it from UNC */
if (!cifs_convert_address(dstaddr, &vol->UNC[2],
strlen(&vol->UNC[2]))) {
- printk(KERN_ERR "Unable to determine destination "
- "address.\n");
+ pr_err("Unable to determine destination address.\n");
goto cifs_parse_mount_err;
}
}
@@ -1894,20 +1881,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
if (uid_specified)
vol->override_uid = override_uid;
else if (override_uid == 1)
- printk(KERN_NOTICE "CIFS: ignoring forceuid mount option "
- "specified with no uid= option.\n");
+ pr_notice("CIFS: ignoring forceuid mount option specified with no uid= option.\n");
if (gid_specified)
vol->override_gid = override_gid;
else if (override_gid == 1)
- printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
- "specified with no gid= option.\n");
+ pr_notice("CIFS: ignoring forcegid mount option specified with no gid= option.\n");
kfree(mountdata_copy);
return 0;
out_nomem:
- printk(KERN_WARNING "Could not allocate temporary buffer\n");
+ pr_warn("Could not allocate temporary buffer\n");
cifs_parse_mount_err:
kfree(string);
kfree(mountdata_copy);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3e4d00a06c44..96b7e9b7706d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1066,7 +1066,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
max_num = (max_buf - sizeof(struct smb_hdr)) /
sizeof(LOCKING_ANDX_RANGE);
- buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+ buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
if (!buf) {
free_xid(xid);
return -ENOMEM;
@@ -1401,7 +1401,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
max_num = (max_buf - sizeof(struct smb_hdr)) /
sizeof(LOCKING_ANDX_RANGE);
- buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+ buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -1586,7 +1586,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag,
tcon->ses->server);
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ cifs_sb = CIFS_FILE_SB(file);
netfid = cfile->fid.netfid;
cinode = CIFS_I(file_inode(file));
@@ -2305,7 +2305,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
struct cifsFileInfo *smbfile = file->private_data;
- struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
struct inode *inode = file->f_mapping->host;
rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
@@ -2585,7 +2585,7 @@ cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
iov_iter_truncate(from, len);
INIT_LIST_HEAD(&wdata_list);
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ cifs_sb = CIFS_FILE_SB(file);
open_file = file->private_data;
tcon = tlink_tcon(open_file->tlink);
@@ -3010,7 +3010,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
return 0;
INIT_LIST_HEAD(&rdata_list);
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ cifs_sb = CIFS_FILE_SB(file);
open_file = file->private_data;
tcon = tlink_tcon(open_file->tlink);
@@ -3155,7 +3155,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
__u32 pid;
xid = get_xid();
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ cifs_sb = CIFS_FILE_SB(file);
/* FIXME: set up handlers for larger reads and/or convert to async */
rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
@@ -3462,7 +3462,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
int rc;
struct list_head tmplist;
struct cifsFileInfo *open_file = file->private_data;
- struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
struct TCP_Server_Info *server;
pid_t pid;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 197cb503d528..0c3ce464cae4 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -895,7 +895,7 @@ inode_has_hashed_dentries(struct inode *inode)
struct dentry *dentry;
spin_lock(&inode->i_lock);
- hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
spin_unlock(&inode->i_lock);
return true;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index b7415d596dbd..337946355b29 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -513,39 +513,11 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
void
dump_smb(void *buf, int smb_buf_length)
{
- int i, j;
- char debug_line[17];
- unsigned char *buffer = buf;
-
if (traceSMB == 0)
return;
- for (i = 0, j = 0; i < smb_buf_length; i++, j++) {
- if (i % 8 == 0) {
- /* have reached the beginning of line */
- printk(KERN_DEBUG "| ");
- j = 0;
- }
- printk("%0#4x ", buffer[i]);
- debug_line[2 * j] = ' ';
- if (isprint(buffer[i]))
- debug_line[1 + (2 * j)] = buffer[i];
- else
- debug_line[1 + (2 * j)] = '_';
-
- if (i % 8 == 7) {
- /* reached end of line, time to print ascii */
- debug_line[16] = 0;
- printk(" | %s\n", debug_line);
- }
- }
- for (; j < 8; j++) {
- printk(" ");
- debug_line[2 * j] = ' ';
- debug_line[1 + (2 * j)] = ' ';
- }
- printk(" | %s\n", debug_line);
- return;
+ print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 8, 2, buf,
+ smb_buf_length, true);
}
void
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 8fd2a95860ba..8eaf20a80649 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -123,7 +123,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
if (!inode)
goto out;
- alias = d_materialise_unique(dentry, inode);
+ alias = d_splice_alias(inode, dentry);
if (alias && !IS_ERR(alias))
dput(alias);
out:
@@ -261,7 +261,7 @@ initiate_cifs_search(const unsigned int xid, struct file *file)
int rc = 0;
char *full_path = NULL;
struct cifsFileInfo *cifsFile;
- struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
struct tcon_link *tlink = NULL;
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
@@ -561,7 +561,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
loff_t first_entry_in_buffer;
loff_t index_to_find = pos;
struct cifsFileInfo *cfile = file->private_data;
- struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
struct TCP_Server_Info *server = tcon->ses->server;
/* check if index in the buffer */
@@ -679,7 +679,7 @@ static int cifs_filldir(char *find_entry, struct file *file,
char *scratch_buf, unsigned int max_len)
{
struct cifsFileInfo *file_info = file->private_data;
- struct super_block *sb = file->f_path.dentry->d_sb;
+ struct super_block *sb = file_inode(file)->i_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_dirent de = { NULL, };
struct cifs_fattr fattr;
@@ -753,7 +753,7 @@ static int cifs_filldir(char *find_entry, struct file *file,
*/
fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
- cifs_prime_dcache(file->f_dentry, &name, &fattr);
+ cifs_prime_dcache(file->f_path.dentry, &name, &fattr);
ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype);
@@ -794,10 +794,6 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
if it before then restart search
if after then keep searching till find it */
- if (file->private_data == NULL) {
- rc = -EINVAL;
- goto rddir2_exit;
- }
cifsFile = file->private_data;
if (cifsFile->srch_inf.endOfSearch) {
if (cifsFile->srch_inf.emptyDir) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 57db63ff88da..bce6fdcd5d48 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -46,7 +46,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
USHRT_MAX));
pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
- pSMB->req.VcNumber = __constant_cpu_to_le16(1);
+ pSMB->req.VcNumber = cpu_to_le16(1);
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
@@ -1303,6 +1303,11 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+ if (ses->Suid != smb_buf->Uid) {
+ ses->Suid = smb_buf->Uid;
+ cifs_dbg(FYI, "UID changed! new UID = %llu\n", ses->Suid);
+ }
+
bytes_remaining = get_bcc(smb_buf);
bcc_ptr = pByteArea(smb_buf);
blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 45992944e238..7198eac5dddd 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -111,7 +111,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
return -EINVAL;
max_num = max_buf / sizeof(struct smb2_lock_element);
- buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL);
+ buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -247,7 +247,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
}
max_num = max_buf / sizeof(struct smb2_lock_element);
- buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL);
+ buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
if (!buf) {
free_xid(xid);
return -ENOMEM;
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 1a08a34838fc..f1cefc9763ed 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -67,27 +67,27 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
* indexed by command in host byte order
*/
static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
- /* SMB2_NEGOTIATE */ __constant_cpu_to_le16(65),
- /* SMB2_SESSION_SETUP */ __constant_cpu_to_le16(9),
- /* SMB2_LOGOFF */ __constant_cpu_to_le16(4),
- /* SMB2_TREE_CONNECT */ __constant_cpu_to_le16(16),
- /* SMB2_TREE_DISCONNECT */ __constant_cpu_to_le16(4),
- /* SMB2_CREATE */ __constant_cpu_to_le16(89),
- /* SMB2_CLOSE */ __constant_cpu_to_le16(60),
- /* SMB2_FLUSH */ __constant_cpu_to_le16(4),
- /* SMB2_READ */ __constant_cpu_to_le16(17),
- /* SMB2_WRITE */ __constant_cpu_to_le16(17),
- /* SMB2_LOCK */ __constant_cpu_to_le16(4),
- /* SMB2_IOCTL */ __constant_cpu_to_le16(49),
+ /* SMB2_NEGOTIATE */ cpu_to_le16(65),
+ /* SMB2_SESSION_SETUP */ cpu_to_le16(9),
+ /* SMB2_LOGOFF */ cpu_to_le16(4),
+ /* SMB2_TREE_CONNECT */ cpu_to_le16(16),
+ /* SMB2_TREE_DISCONNECT */ cpu_to_le16(4),
+ /* SMB2_CREATE */ cpu_to_le16(89),
+ /* SMB2_CLOSE */ cpu_to_le16(60),
+ /* SMB2_FLUSH */ cpu_to_le16(4),
+ /* SMB2_READ */ cpu_to_le16(17),
+ /* SMB2_WRITE */ cpu_to_le16(17),
+ /* SMB2_LOCK */ cpu_to_le16(4),
+ /* SMB2_IOCTL */ cpu_to_le16(49),
/* BB CHECK this ... not listed in documentation */
- /* SMB2_CANCEL */ __constant_cpu_to_le16(0),
- /* SMB2_ECHO */ __constant_cpu_to_le16(4),
- /* SMB2_QUERY_DIRECTORY */ __constant_cpu_to_le16(9),
- /* SMB2_CHANGE_NOTIFY */ __constant_cpu_to_le16(9),
- /* SMB2_QUERY_INFO */ __constant_cpu_to_le16(9),
- /* SMB2_SET_INFO */ __constant_cpu_to_le16(2),
+ /* SMB2_CANCEL */ cpu_to_le16(0),
+ /* SMB2_ECHO */ cpu_to_le16(4),
+ /* SMB2_QUERY_DIRECTORY */ cpu_to_le16(9),
+ /* SMB2_CHANGE_NOTIFY */ cpu_to_le16(9),
+ /* SMB2_QUERY_INFO */ cpu_to_le16(9),
+ /* SMB2_SET_INFO */ cpu_to_le16(2),
/* BB FIXME can also be 44 for lease break */
- /* SMB2_OPLOCK_BREAK */ __constant_cpu_to_le16(24)
+ /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24)
};
int
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index c5f521bcdee2..93fd0586f9ec 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -600,7 +600,7 @@ smb2_clone_range(const unsigned int xid,
goto cchunk_out;
/* For now array only one chunk long, will make more flexible later */
- pcchunk->ChunkCount = __constant_cpu_to_le32(1);
+ pcchunk->ChunkCount = cpu_to_le32(1);
pcchunk->Reserved = 0;
pcchunk->Reserved2 = 0;
@@ -1102,6 +1102,64 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
return rc;
}
+static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
+ loff_t off, loff_t len, bool keep_size)
+{
+ struct inode *inode;
+ struct cifsInodeInfo *cifsi;
+ struct cifsFileInfo *cfile = file->private_data;
+ long rc = -EOPNOTSUPP;
+ unsigned int xid;
+
+ xid = get_xid();
+
+ inode = cfile->dentry->d_inode;
+ cifsi = CIFS_I(inode);
+
+ /* if file not oplocked can't be sure whether asking to extend size */
+ if (!CIFS_CACHE_READ(cifsi))
+ if (keep_size == false)
+ return -EOPNOTSUPP;
+
+ /*
+ * Files are non-sparse by default so falloc may be a no-op
+ * Must check if file sparse. If not sparse, and not extending
+ * then no need to do anything since file already allocated
+ */
+ if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) {
+ if (keep_size == true)
+ return 0;
+ /* check if extending file */
+ else if (i_size_read(inode) >= off + len)
+ /* not extending file and already not sparse */
+ return 0;
+ /* BB: in future add else clause to extend file */
+ else
+ return -EOPNOTSUPP;
+ }
+
+ if ((keep_size == true) || (i_size_read(inode) >= off + len)) {
+ /*
+ * Check if falloc starts within first few pages of file
+ * and ends within a few pages of the end of file to
+ * ensure that most of file is being forced to be
+ * fallocated now. If so then setting whole file sparse
+ * ie potentially making a few extra pages at the beginning
+ * or end of the file non-sparse via set_sparse is harmless.
+ */
+ if ((off > 8192) || (off + len + 8192 < i_size_read(inode)))
+ return -EOPNOTSUPP;
+
+ rc = smb2_set_sparse(xid, tcon, cfile, inode, false);
+ }
+ /* BB: else ... in future add code to extend file and set sparse */
+
+
+ free_xid(xid);
+ return rc;
+}
+
+
static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode,
loff_t off, loff_t len)
{
@@ -1112,7 +1170,10 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode,
if (mode & FALLOC_FL_KEEP_SIZE)
return smb3_zero_range(file, tcon, off, len, true);
return smb3_zero_range(file, tcon, off, len, false);
- }
+ } else if (mode == FALLOC_FL_KEEP_SIZE)
+ return smb3_simple_falloc(file, tcon, off, len, true);
+ else if (mode == 0)
+ return smb3_simple_falloc(file, tcon, off, len, false);
return -EOPNOTSUPP;
}
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 8f1672bb82d5..3417340bf89e 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -431,8 +431,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
if (rc)
goto neg_exit;
if (blob_length)
- rc = decode_neg_token_init(security_blob, blob_length,
- &server->sec_type);
+ rc = decode_negTokenInit(security_blob, blob_length, server);
if (rc == 1)
rc = 0;
else if (rc == 0) {
@@ -1359,7 +1358,7 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
char *ret_data = NULL;
fsctl_input.CompressionState =
- __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
+ cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
FSCTL_SET_COMPRESSION, true /* is_fsctl */,
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index e3188abdafd0..ce858477002a 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -85,7 +85,7 @@
/* BB FIXME - analyze following length BB */
#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */
-#define SMB2_PROTO_NUMBER __constant_cpu_to_le32(0x424d53fe)
+#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
/*
* SMB2 Header Definition
@@ -96,7 +96,7 @@
*
*/
-#define SMB2_HEADER_STRUCTURE_SIZE __constant_cpu_to_le16(64)
+#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64)
struct smb2_hdr {
__be32 smb2_buf_length; /* big endian on wire */
@@ -137,16 +137,16 @@ struct smb2_transform_hdr {
} __packed;
/* Encryption Algorithms */
-#define SMB2_ENCRYPTION_AES128_CCM __constant_cpu_to_le16(0x0001)
+#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
/*
* SMB2 flag definitions
*/
-#define SMB2_FLAGS_SERVER_TO_REDIR __constant_cpu_to_le32(0x00000001)
-#define SMB2_FLAGS_ASYNC_COMMAND __constant_cpu_to_le32(0x00000002)
-#define SMB2_FLAGS_RELATED_OPERATIONS __constant_cpu_to_le32(0x00000004)
-#define SMB2_FLAGS_SIGNED __constant_cpu_to_le32(0x00000008)
-#define SMB2_FLAGS_DFS_OPERATIONS __constant_cpu_to_le32(0x10000000)
+#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001)
+#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002)
+#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004)
+#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008)
+#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000)
/*
* Definitions for SMB2 Protocol Data Units (network frames)
@@ -157,7 +157,7 @@ struct smb2_transform_hdr {
*
*/
-#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_cpu_to_le16(9)
+#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9)
struct smb2_err_rsp {
struct smb2_hdr hdr;
@@ -502,12 +502,12 @@ struct create_context {
#define SMB2_LEASE_HANDLE_CACHING_HE 0x02
#define SMB2_LEASE_WRITE_CACHING_HE 0x04
-#define SMB2_LEASE_NONE __constant_cpu_to_le32(0x00)
-#define SMB2_LEASE_READ_CACHING __constant_cpu_to_le32(0x01)
-#define SMB2_LEASE_HANDLE_CACHING __constant_cpu_to_le32(0x02)
-#define SMB2_LEASE_WRITE_CACHING __constant_cpu_to_le32(0x04)
+#define SMB2_LEASE_NONE cpu_to_le32(0x00)
+#define SMB2_LEASE_READ_CACHING cpu_to_le32(0x01)
+#define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02)
+#define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04)
-#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS __constant_cpu_to_le32(0x02)
+#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x02)
#define SMB2_LEASE_KEY_SIZE 16
@@ -836,6 +836,25 @@ struct smb2_query_directory_rsp {
#define SMB2_O_INFO_SECURITY 0x03
#define SMB2_O_INFO_QUOTA 0x04
+/* Security info type additionalinfo flags. See MS-SMB2 (2.2.37) or MS-DTYP */
+#define OWNER_SECINFO 0x00000001
+#define GROUP_SECINFO 0x00000002
+#define DACL_SECINFO 0x00000004
+#define SACL_SECINFO 0x00000008
+#define LABEL_SECINFO 0x00000010
+#define ATTRIBUTE_SECINFO 0x00000020
+#define SCOPE_SECINFO 0x00000040
+#define BACKUP_SECINFO 0x00010000
+#define UNPROTECTED_SACL_SECINFO 0x10000000
+#define UNPROTECTED_DACL_SECINFO 0x20000000
+#define PROTECTED_SACL_SECINFO 0x40000000
+#define PROTECTED_DACL_SECINFO 0x80000000
+
+/* Flags used for FileFullEAinfo */
+#define SL_RESTART_SCAN 0x00000001
+#define SL_RETURN_SINGLE_ENTRY 0x00000002
+#define SL_INDEX_SPECIFIED 0x00000004
+
struct smb2_query_info_req {
struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 41 */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 9d087f4e7d4e..126f46b887cc 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -99,9 +99,9 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
something is wrong, unless it is quite a slow link or server */
if ((now - midEntry->when_alloc) > HZ) {
if ((cifsFYI & CIFS_TIMER) && (midEntry->command != command)) {
- printk(KERN_DEBUG " CIFS slow rsp: cmd %d mid %llu",
+ pr_debug(" CIFS slow rsp: cmd %d mid %llu",
midEntry->command, midEntry->mid);
- printk(" A: 0x%lx S: 0x%lx R: 0x%lx\n",
+ pr_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n",
now - midEntry->when_alloc,
now - midEntry->when_sent,
now - midEntry->when_received);
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 278f8fdeb9ef..46ee6f238985 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -92,7 +92,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
struct dentry *de;
spin_lock(&parent->d_lock);
- list_for_each_entry(de, &parent->d_subdirs, d_u.d_child) {
+ list_for_each_entry(de, &parent->d_subdirs, d_child) {
/* don't know what to do with negative dentries */
if (de->d_inode )
coda_flag_inode(de->d_inode, flag);
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 1326d38960db..f1714cfb589c 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -40,12 +40,6 @@ int coda_iscontrol(const char *name, size_t length)
(strncmp(name, CODA_CONTROL, CODA_CONTROLLEN) == 0));
}
-/* recognize /coda inode */
-int coda_isroot(struct inode *i)
-{
- return ( i->i_sb->s_root->d_inode == i );
-}
-
unsigned short coda_flags_to_cflags(unsigned short flags)
{
unsigned short coda_flags = 0;
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index d42b725b1d21..d6f7a76a1f5b 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -52,7 +52,6 @@ int coda_setattr(struct dentry *, struct iattr *);
/* this file: heloers */
char *coda_f2s(struct CodaFid *f);
-int coda_isroot(struct inode *i);
int coda_iscontrol(const char *name, size_t length);
void coda_vattr_to_iattr(struct inode *, struct coda_vattr *);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 9c3dedc000d1..86c893884eb9 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -107,7 +107,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsig
}
/* control object, create inode on the fly */
- if (coda_isroot(dir) && coda_iscontrol(name, length)) {
+ if (is_root_inode(dir) && coda_iscontrol(name, length)) {
inode = coda_cnode_makectl(sb);
type = CODA_NOCACHE;
} else {
@@ -195,7 +195,7 @@ static int coda_create(struct inode *dir, struct dentry *de, umode_t mode, bool
struct CodaFid newfid;
struct coda_vattr attrs;
- if (coda_isroot(dir) && coda_iscontrol(name, length))
+ if (is_root_inode(dir) && coda_iscontrol(name, length))
return -EPERM;
error = venus_create(dir->i_sb, coda_i2f(dir), name, length,
@@ -227,7 +227,7 @@ static int coda_mkdir(struct inode *dir, struct dentry *de, umode_t mode)
int error;
struct CodaFid newfid;
- if (coda_isroot(dir) && coda_iscontrol(name, len))
+ if (is_root_inode(dir) && coda_iscontrol(name, len))
return -EPERM;
attrs.va_mode = mode;
@@ -261,7 +261,7 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
int len = de->d_name.len;
int error;
- if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
+ if (is_root_inode(dir_inode) && coda_iscontrol(name, len))
return -EPERM;
error = venus_link(dir_inode->i_sb, coda_i2f(inode),
@@ -287,7 +287,7 @@ static int coda_symlink(struct inode *dir_inode, struct dentry *de,
int symlen;
int error;
- if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
+ if (is_root_inode(dir_inode) && coda_iscontrol(name, len))
return -EPERM;
symlen = strlen(symname);
@@ -426,7 +426,6 @@ static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx)
struct coda_file_info *cfi;
struct coda_inode_info *cii;
struct file *host_file;
- struct dentry *de;
struct venus_dirent *vdir;
unsigned long vdir_size = offsetof(struct venus_dirent, d_name);
unsigned int type;
@@ -438,8 +437,7 @@ static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx)
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
host_file = cfi->cfi_container;
- de = coda_file->f_path.dentry;
- cii = ITOC(de->d_inode);
+ cii = ITOC(file_inode(coda_file));
vdir = kmalloc(sizeof(*vdir), GFP_KERNEL);
if (!vdir) return -ENOMEM;
@@ -507,7 +505,7 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
return -ECHILD;
inode = de->d_inode;
- if (!inode || coda_isroot(inode))
+ if (!inode || is_root_inode(inode))
goto out;
if (is_bad_inode(inode))
goto bad;
diff --git a/fs/compat.c b/fs/compat.c
index b13df99f3534..6fd272d455e4 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -847,10 +847,12 @@ struct compat_readdir_callback {
int result;
};
-static int compat_fillonedir(void *__buf, const char *name, int namlen,
- loff_t offset, u64 ino, unsigned int d_type)
+static int compat_fillonedir(struct dir_context *ctx, const char *name,
+ int namlen, loff_t offset, u64 ino,
+ unsigned int d_type)
{
- struct compat_readdir_callback *buf = __buf;
+ struct compat_readdir_callback *buf =
+ container_of(ctx, struct compat_readdir_callback, ctx);
struct compat_old_linux_dirent __user *dirent;
compat_ulong_t d_ino;
@@ -915,11 +917,12 @@ struct compat_getdents_callback {
int error;
};
-static int compat_filldir(void *__buf, const char *name, int namlen,
+static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct compat_linux_dirent __user * dirent;
- struct compat_getdents_callback *buf = __buf;
+ struct compat_getdents_callback *buf =
+ container_of(ctx, struct compat_getdents_callback, ctx);
compat_ulong_t d_ino;
int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
namlen + 2, sizeof(compat_long_t));
@@ -1001,11 +1004,13 @@ struct compat_getdents_callback64 {
int error;
};
-static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t offset,
- u64 ino, unsigned int d_type)
+static int compat_filldir64(struct dir_context *ctx, const char *name,
+ int namlen, loff_t offset, u64 ino,
+ unsigned int d_type)
{
struct linux_dirent64 __user *dirent;
- struct compat_getdents_callback64 *buf = __buf;
+ struct compat_getdents_callback64 *buf =
+ container_of(ctx, struct compat_getdents_callback64, ctx);
int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
sizeof(u64));
u64 off;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 668dcabc5695..c9c298bd3058 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -386,7 +386,7 @@ static void remove_dir(struct dentry * d)
if (d->d_inode)
simple_rmdir(parent->d_inode,d);
- pr_debug(" o %s removing done (%d)\n",d->d_name.name, d_count(d));
+ pr_debug(" o %pd removing done (%d)\n", d, d_count(d));
dput(parent);
}
diff --git a/fs/dcache.c b/fs/dcache.c
index d5a23fd0da90..e368d4f412f9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -44,7 +44,7 @@
/*
* Usage:
* dcache->d_inode->i_lock protects:
- * - i_dentry, d_alias, d_inode of aliases
+ * - i_dentry, d_u.d_alias, d_inode of aliases
* dcache_hash_bucket lock protects:
* - the dcache hash table
* s_anon bl list spinlock protects:
@@ -59,7 +59,7 @@
* - d_unhashed()
* - d_parent and d_subdirs
* - childrens' d_child and d_parent
- * - d_alias, d_inode
+ * - d_u.d_alias, d_inode
*
* Ordering:
* dentry->d_inode->i_lock
@@ -252,14 +252,12 @@ static void __d_free(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
- WARN_ON(!hlist_unhashed(&dentry->d_alias));
kmem_cache_free(dentry_cache, dentry);
}
static void __d_free_external(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
- WARN_ON(!hlist_unhashed(&dentry->d_alias));
kfree(external_name(dentry));
kmem_cache_free(dentry_cache, dentry);
}
@@ -271,6 +269,7 @@ static inline int dname_external(const struct dentry *dentry)
static void dentry_free(struct dentry *dentry)
{
+ WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));
if (unlikely(dname_external(dentry))) {
struct external_name *p = external_name(dentry);
if (likely(atomic_dec_and_test(&p->u.count))) {
@@ -311,7 +310,7 @@ static void dentry_iput(struct dentry * dentry)
struct inode *inode = dentry->d_inode;
if (inode) {
dentry->d_inode = NULL;
- hlist_del_init(&dentry->d_alias);
+ hlist_del_init(&dentry->d_u.d_alias);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
if (!inode->i_nlink)
@@ -336,7 +335,7 @@ static void dentry_unlink_inode(struct dentry * dentry)
struct inode *inode = dentry->d_inode;
__d_clear_type(dentry);
dentry->d_inode = NULL;
- hlist_del_init(&dentry->d_alias);
+ hlist_del_init(&dentry->d_u.d_alias);
dentry_rcuwalk_barrier(dentry);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
@@ -496,7 +495,7 @@ static void __dentry_kill(struct dentry *dentry)
}
/* if it was on the hash then remove it */
__d_drop(dentry);
- list_del(&dentry->d_u.d_child);
+ __list_del_entry(&dentry->d_child);
/*
* Inform d_walk() that we are no longer attached to the
* dentry tree
@@ -722,7 +721,7 @@ static struct dentry *__d_find_alias(struct inode *inode)
again:
discon_alias = NULL;
- hlist_for_each_entry(alias, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
spin_lock(&alias->d_lock);
if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
if (IS_ROOT(alias) &&
@@ -772,12 +771,13 @@ void d_prune_aliases(struct inode *inode)
struct dentry *dentry;
restart:
spin_lock(&inode->i_lock);
- hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
spin_lock(&dentry->d_lock);
if (!dentry->d_lockref.count) {
struct dentry *parent = lock_parent(dentry);
if (likely(!dentry->d_lockref.count)) {
__dentry_kill(dentry);
+ dput(parent);
goto restart;
}
if (parent)
@@ -1050,7 +1050,7 @@ repeat:
resume:
while (next != &this_parent->d_subdirs) {
struct list_head *tmp = next;
- struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
+ struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
next = tmp->next;
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
@@ -1082,33 +1082,31 @@ resume:
/*
* All done at this level ... ascend and resume the search.
*/
+ rcu_read_lock();
+ascend:
if (this_parent != parent) {
struct dentry *child = this_parent;
this_parent = child->d_parent;
- rcu_read_lock();
spin_unlock(&child->d_lock);
spin_lock(&this_parent->d_lock);
- /*
- * might go back up the wrong parent if we have had a rename
- * or deletion
- */
- if (this_parent != child->d_parent ||
- (child->d_flags & DCACHE_DENTRY_KILLED) ||
- need_seqretry(&rename_lock, seq)) {
- spin_unlock(&this_parent->d_lock);
- rcu_read_unlock();
+ /* might go back up the wrong parent if we have had a rename. */
+ if (need_seqretry(&rename_lock, seq))
goto rename_retry;
+ next = child->d_child.next;
+ while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)) {
+ if (next == &this_parent->d_subdirs)
+ goto ascend;
+ child = list_entry(next, struct dentry, d_child);
+ next = next->next;
}
rcu_read_unlock();
- next = child->d_u.d_child.next;
goto resume;
}
- if (need_seqretry(&rename_lock, seq)) {
- spin_unlock(&this_parent->d_lock);
+ if (need_seqretry(&rename_lock, seq))
goto rename_retry;
- }
+ rcu_read_unlock();
if (finish)
finish(data);
@@ -1118,6 +1116,9 @@ out_unlock:
return;
rename_retry:
+ spin_unlock(&this_parent->d_lock);
+ rcu_read_unlock();
+ BUG_ON(seq & 1);
if (!retry)
return;
seq = 1;
@@ -1454,8 +1455,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
INIT_HLIST_BL_NODE(&dentry->d_hash);
INIT_LIST_HEAD(&dentry->d_lru);
INIT_LIST_HEAD(&dentry->d_subdirs);
- INIT_HLIST_NODE(&dentry->d_alias);
- INIT_LIST_HEAD(&dentry->d_u.d_child);
+ INIT_HLIST_NODE(&dentry->d_u.d_alias);
+ INIT_LIST_HEAD(&dentry->d_child);
d_set_d_op(dentry, dentry->d_sb->s_d_op);
this_cpu_inc(nr_dentry);
@@ -1485,7 +1486,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
*/
__dget_dlock(parent);
dentry->d_parent = parent;
- list_add(&dentry->d_u.d_child, &parent->d_subdirs);
+ list_add(&dentry->d_child, &parent->d_subdirs);
spin_unlock(&parent->d_lock);
return dentry;
@@ -1578,7 +1579,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
spin_lock(&dentry->d_lock);
__d_set_type(dentry, add_flags);
if (inode)
- hlist_add_head(&dentry->d_alias, &inode->i_dentry);
+ hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
dentry->d_inode = inode;
dentry_rcuwalk_barrier(dentry);
spin_unlock(&dentry->d_lock);
@@ -1602,7 +1603,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
void d_instantiate(struct dentry *entry, struct inode * inode)
{
- BUG_ON(!hlist_unhashed(&entry->d_alias));
+ BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
if (inode)
spin_lock(&inode->i_lock);
__d_instantiate(entry, inode);
@@ -1641,7 +1642,7 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
return NULL;
}
- hlist_for_each_entry(alias, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
/*
* Don't need alias->d_lock here, because aliases with
* d_parent == entry->d_parent are not subject to name or
@@ -1667,7 +1668,7 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
{
struct dentry *result;
- BUG_ON(!hlist_unhashed(&entry->d_alias));
+ BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
if (inode)
spin_lock(&inode->i_lock);
@@ -1698,7 +1699,7 @@ EXPORT_SYMBOL(d_instantiate_unique);
*/
int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode)
{
- BUG_ON(!hlist_unhashed(&entry->d_alias));
+ BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
spin_lock(&inode->i_lock);
if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) {
@@ -1737,7 +1738,7 @@ static struct dentry * __d_find_any_alias(struct inode *inode)
if (hlist_empty(&inode->i_dentry))
return NULL;
- alias = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
+ alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
__dget(alias);
return alias;
}
@@ -1799,7 +1800,7 @@ static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
spin_lock(&tmp->d_lock);
tmp->d_inode = inode;
tmp->d_flags |= add_flags;
- hlist_add_head(&tmp->d_alias, &inode->i_dentry);
+ hlist_add_head(&tmp->d_u.d_alias, &inode->i_dentry);
hlist_bl_lock(&tmp->d_sb->s_anon);
hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
hlist_bl_unlock(&tmp->d_sb->s_anon);
@@ -1888,51 +1889,19 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
* if not go ahead and create it now.
*/
found = d_hash_and_lookup(dentry->d_parent, name);
- if (unlikely(IS_ERR(found)))
- goto err_out;
if (!found) {
new = d_alloc(dentry->d_parent, name);
if (!new) {
found = ERR_PTR(-ENOMEM);
- goto err_out;
- }
-
- found = d_splice_alias(inode, new);
- if (found) {
- dput(new);
- return found;
- }
- return new;
- }
-
- /*
- * If a matching dentry exists, and it's not negative use it.
- *
- * Decrement the reference count to balance the iget() done
- * earlier on.
- */
- if (found->d_inode) {
- if (unlikely(found->d_inode != inode)) {
- /* This can't happen because bad inodes are unhashed. */
- BUG_ON(!is_bad_inode(inode));
- BUG_ON(!is_bad_inode(found->d_inode));
+ } else {
+ found = d_splice_alias(inode, new);
+ if (found) {
+ dput(new);
+ return found;
+ }
+ return new;
}
- iput(inode);
- return found;
}
-
- /*
- * Negative dentry: instantiate it unless the inode is a directory and
- * already has a dentry.
- */
- new = d_splice_alias(inode, found);
- if (new) {
- dput(found);
- found = new;
- }
- return found;
-
-err_out:
iput(inode);
return found;
}
@@ -2234,7 +2203,7 @@ int d_validate(struct dentry *dentry, struct dentry *dparent)
struct dentry *child;
spin_lock(&dparent->d_lock);
- list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) {
+ list_for_each_entry(child, &dparent->d_subdirs, d_child) {
if (dentry == child) {
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
__dget_dlock(dentry);
@@ -2392,6 +2361,8 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
*/
unsigned int i;
BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
+ kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN);
+ kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN);
for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
swap(((long *) &dentry->d_iname)[i],
((long *) &target->d_iname)[i]);
@@ -2525,13 +2496,13 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
/* splicing a tree */
dentry->d_parent = target->d_parent;
target->d_parent = target;
- list_del_init(&target->d_u.d_child);
- list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
+ list_del_init(&target->d_child);
+ list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
} else {
/* swapping two dentries */
swap(dentry->d_parent, target->d_parent);
- list_move(&target->d_u.d_child, &target->d_parent->d_subdirs);
- list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
+ list_move(&target->d_child, &target->d_parent->d_subdirs);
+ list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
if (exchange)
fsnotify_d_move(target);
fsnotify_d_move(dentry);
@@ -2607,11 +2578,11 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
* Note: If ever the locking in lock_rename() changes, then please
* remember to update this too...
*/
-static struct dentry *__d_unalias(struct inode *inode,
+static int __d_unalias(struct inode *inode,
struct dentry *dentry, struct dentry *alias)
{
struct mutex *m1 = NULL, *m2 = NULL;
- struct dentry *ret = ERR_PTR(-EBUSY);
+ int ret = -EBUSY;
/* If alias and dentry share a parent, then no extra locks required */
if (alias->d_parent == dentry->d_parent)
@@ -2626,7 +2597,7 @@ static struct dentry *__d_unalias(struct inode *inode,
m2 = &alias->d_parent->d_inode->i_mutex;
out_unalias:
__d_move(alias, dentry, false);
- ret = alias;
+ ret = 0;
out_err:
spin_unlock(&inode->i_lock);
if (m2)
@@ -2661,128 +2632,57 @@ out_err:
*/
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
- struct dentry *new = NULL;
-
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (inode && S_ISDIR(inode->i_mode)) {
- spin_lock(&inode->i_lock);
- new = __d_find_any_alias(inode);
- if (new) {
- if (!IS_ROOT(new)) {
- spin_unlock(&inode->i_lock);
- dput(new);
- return ERR_PTR(-EIO);
- }
- if (d_ancestor(new, dentry)) {
- spin_unlock(&inode->i_lock);
- dput(new);
- return ERR_PTR(-EIO);
- }
- write_seqlock(&rename_lock);
- __d_move(new, dentry, false);
- write_sequnlock(&rename_lock);
- spin_unlock(&inode->i_lock);
- security_d_instantiate(new, inode);
- iput(inode);
- } else {
- /* already taking inode->i_lock, so d_add() by hand */
- __d_instantiate(dentry, inode);
- spin_unlock(&inode->i_lock);
- security_d_instantiate(dentry, inode);
- d_rehash(dentry);
- }
- } else {
- d_instantiate(dentry, inode);
- if (d_unhashed(dentry))
- d_rehash(dentry);
- }
- return new;
-}
-EXPORT_SYMBOL(d_splice_alias);
-
-/**
- * d_materialise_unique - introduce an inode into the tree
- * @dentry: candidate dentry
- * @inode: inode to bind to the dentry, to which aliases may be attached
- *
- * Introduces an dentry into the tree, substituting an extant disconnected
- * root directory alias in its place if there is one. Caller must hold the
- * i_mutex of the parent directory.
- */
-struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
-{
- struct dentry *actual;
-
BUG_ON(!d_unhashed(dentry));
if (!inode) {
- actual = dentry;
__d_instantiate(dentry, NULL);
- d_rehash(actual);
- goto out_nolock;
+ goto out;
}
-
spin_lock(&inode->i_lock);
-
if (S_ISDIR(inode->i_mode)) {
- struct dentry *alias;
-
- /* Does an aliased dentry already exist? */
- alias = __d_find_alias(inode);
- if (alias) {
- actual = alias;
+ struct dentry *new = __d_find_any_alias(inode);
+ if (unlikely(new)) {
write_seqlock(&rename_lock);
-
- if (d_ancestor(alias, dentry)) {
- /* Check for loops */
- actual = ERR_PTR(-ELOOP);
+ if (unlikely(d_ancestor(new, dentry))) {
+ write_sequnlock(&rename_lock);
spin_unlock(&inode->i_lock);
- } else if (IS_ROOT(alias)) {
- /* Is this an anonymous mountpoint that we
- * could splice into our tree? */
- __d_move(alias, dentry, false);
+ dput(new);
+ new = ERR_PTR(-ELOOP);
+ pr_warn_ratelimited(
+ "VFS: Lookup of '%s' in %s %s"
+ " would have caused loop\n",
+ dentry->d_name.name,
+ inode->i_sb->s_type->name,
+ inode->i_sb->s_id);
+ } else if (!IS_ROOT(new)) {
+ int err = __d_unalias(inode, dentry, new);
write_sequnlock(&rename_lock);
- goto found;
+ if (err) {
+ dput(new);
+ new = ERR_PTR(err);
+ }
} else {
- /* Nope, but we must(!) avoid directory
- * aliasing. This drops inode->i_lock */
- actual = __d_unalias(inode, dentry, alias);
- }
- write_sequnlock(&rename_lock);
- if (IS_ERR(actual)) {
- if (PTR_ERR(actual) == -ELOOP)
- pr_warn_ratelimited(
- "VFS: Lookup of '%s' in %s %s"
- " would have caused loop\n",
- dentry->d_name.name,
- inode->i_sb->s_type->name,
- inode->i_sb->s_id);
- dput(alias);
+ __d_move(new, dentry, false);
+ write_sequnlock(&rename_lock);
+ spin_unlock(&inode->i_lock);
+ security_d_instantiate(new, inode);
}
- goto out_nolock;
+ iput(inode);
+ return new;
}
}
-
- /* Add a unique reference */
- actual = __d_instantiate_unique(dentry, inode);
- if (!actual)
- actual = dentry;
-
- d_rehash(actual);
-found:
+ /* already taking inode->i_lock, so d_add() by hand */
+ __d_instantiate(dentry, inode);
spin_unlock(&inode->i_lock);
-out_nolock:
- if (actual == dentry) {
- security_d_instantiate(dentry, inode);
- return NULL;
- }
-
- iput(inode);
- return actual;
+out:
+ security_d_instantiate(dentry, inode);
+ d_rehash(dentry);
+ return NULL;
}
-EXPORT_SYMBOL_GPL(d_materialise_unique);
+EXPORT_SYMBOL(d_splice_alias);
static int prepend(char **buffer, int *buflen, const char *str, int namelen)
{
@@ -3318,7 +3218,7 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode)
{
inode_dec_link_count(inode);
BUG_ON(dentry->d_name.name != dentry->d_iname ||
- !hlist_unhashed(&dentry->d_alias) ||
+ !hlist_unhashed(&dentry->d_u.d_alias) ||
!d_unlinked(dentry));
spin_lock(&dentry->d_parent->d_lock);
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 76c08c2beb2f..517e64938438 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -22,6 +22,7 @@
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/atomic.h>
+#include <linux/device.h>
static ssize_t default_read_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -692,18 +693,19 @@ EXPORT_SYMBOL_GPL(debugfs_create_u32_array);
* because some peripherals have several blocks of identical registers,
* for example configuration of dma channels
*/
-int debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
- int nregs, void __iomem *base, char *prefix)
+void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
+ int nregs, void __iomem *base, char *prefix)
{
- int i, ret = 0;
+ int i;
for (i = 0; i < nregs; i++, regs++) {
if (prefix)
- ret += seq_printf(s, "%s", prefix);
- ret += seq_printf(s, "%s = 0x%08x\n", regs->name,
- readl(base + regs->offset));
+ seq_printf(s, "%s", prefix);
+ seq_printf(s, "%s = 0x%08x\n", regs->name,
+ readl(base + regs->offset));
+ if (seq_has_overflowed(s))
+ break;
}
- return ret;
}
EXPORT_SYMBOL_GPL(debugfs_print_regs32);
@@ -761,3 +763,56 @@ struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
EXPORT_SYMBOL_GPL(debugfs_create_regset32);
#endif /* CONFIG_HAS_IOMEM */
+
+struct debugfs_devm_entry {
+ int (*read)(struct seq_file *seq, void *data);
+ struct device *dev;
+};
+
+static int debugfs_devm_entry_open(struct inode *inode, struct file *f)
+{
+ struct debugfs_devm_entry *entry = inode->i_private;
+
+ return single_open(f, entry->read, entry->dev);
+}
+
+static const struct file_operations debugfs_devm_entry_ops = {
+ .owner = THIS_MODULE,
+ .open = debugfs_devm_entry_open,
+ .release = single_release,
+ .read = seq_read,
+ .llseek = seq_lseek
+};
+
+/**
+ * debugfs_create_devm_seqfile - create a debugfs file that is bound to device.
+ *
+ * @dev: device related to this debugfs file.
+ * @name: name of the debugfs file.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @read_fn: function pointer called to print the seq_file content.
+ */
+struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name,
+ struct dentry *parent,
+ int (*read_fn)(struct seq_file *s,
+ void *data))
+{
+ struct debugfs_devm_entry *entry;
+
+ if (IS_ERR(parent))
+ return ERR_PTR(-ENOENT);
+
+ entry = devm_kzalloc(dev, sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return ERR_PTR(-ENOMEM);
+
+ entry->read = read_fn;
+ entry->dev = dev;
+
+ return debugfs_create_file(name, S_IRUGO, parent, entry,
+ &debugfs_devm_entry_ops);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile);
+
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 1e3b99d3db0d..05f2960ed7c3 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -553,7 +553,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
* use the d_u.d_child as the rcu head and corrupt this list.
*/
spin_lock(&parent->d_lock);
- list_for_each_entry(child, &parent->d_subdirs, d_u.d_child) {
+ list_for_each_entry(child, &parent->d_subdirs, d_child) {
if (!debugfs_positive(child))
continue;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 1323c568e362..eea64912c9c0 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -48,8 +48,8 @@ static char *print_lockmode(int mode)
}
}
-static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
- struct dlm_rsb *res)
+static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
+ struct dlm_rsb *res)
{
seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
@@ -68,21 +68,17 @@ static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
if (lkb->lkb_wait_type)
seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
- return seq_puts(s, "\n");
+ seq_puts(s, "\n");
}
-static int print_format1(struct dlm_rsb *res, struct seq_file *s)
+static void print_format1(struct dlm_rsb *res, struct seq_file *s)
{
struct dlm_lkb *lkb;
int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
- int rv;
lock_rsb(res);
- rv = seq_printf(s, "\nResource %p Name (len=%d) \"",
- res, res->res_length);
- if (rv)
- goto out;
+ seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
for (i = 0; i < res->res_length; i++) {
if (isprint(res->res_name[i]))
@@ -92,17 +88,16 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s)
}
if (res->res_nodeid > 0)
- rv = seq_printf(s, "\"\nLocal Copy, Master is node %d\n",
- res->res_nodeid);
+ seq_printf(s, "\"\nLocal Copy, Master is node %d\n",
+ res->res_nodeid);
else if (res->res_nodeid == 0)
- rv = seq_puts(s, "\"\nMaster Copy\n");
+ seq_puts(s, "\"\nMaster Copy\n");
else if (res->res_nodeid == -1)
- rv = seq_printf(s, "\"\nLooking up master (lkid %x)\n",
- res->res_first_lkid);
+ seq_printf(s, "\"\nLooking up master (lkid %x)\n",
+ res->res_first_lkid);
else
- rv = seq_printf(s, "\"\nInvalid master %d\n",
- res->res_nodeid);
- if (rv)
+ seq_printf(s, "\"\nInvalid master %d\n", res->res_nodeid);
+ if (seq_has_overflowed(s))
goto out;
/* Print the LVB: */
@@ -116,8 +111,8 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s)
}
if (rsb_flag(res, RSB_VALNOTVALID))
seq_puts(s, " (INVALID)");
- rv = seq_puts(s, "\n");
- if (rv)
+ seq_puts(s, "\n");
+ if (seq_has_overflowed(s))
goto out;
}
@@ -125,32 +120,30 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s)
recover_list = !list_empty(&res->res_recover_list);
if (root_list || recover_list) {
- rv = seq_printf(s, "Recovery: root %d recover %d flags %lx "
- "count %d\n", root_list, recover_list,
- res->res_flags, res->res_recover_locks_count);
- if (rv)
- goto out;
+ seq_printf(s, "Recovery: root %d recover %d flags %lx count %d\n",
+ root_list, recover_list,
+ res->res_flags, res->res_recover_locks_count);
}
/* Print the locks attached to this resource */
seq_puts(s, "Granted Queue\n");
list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) {
- rv = print_format1_lock(s, lkb, res);
- if (rv)
+ print_format1_lock(s, lkb, res);
+ if (seq_has_overflowed(s))
goto out;
}
seq_puts(s, "Conversion Queue\n");
list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) {
- rv = print_format1_lock(s, lkb, res);
- if (rv)
+ print_format1_lock(s, lkb, res);
+ if (seq_has_overflowed(s))
goto out;
}
seq_puts(s, "Waiting Queue\n");
list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) {
- rv = print_format1_lock(s, lkb, res);
- if (rv)
+ print_format1_lock(s, lkb, res);
+ if (seq_has_overflowed(s))
goto out;
}
@@ -159,23 +152,23 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s)
seq_puts(s, "Lookup Queue\n");
list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
- rv = seq_printf(s, "%08x %s", lkb->lkb_id,
- print_lockmode(lkb->lkb_rqmode));
+ seq_printf(s, "%08x %s",
+ lkb->lkb_id, print_lockmode(lkb->lkb_rqmode));
if (lkb->lkb_wait_type)
seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
- rv = seq_puts(s, "\n");
+ seq_puts(s, "\n");
+ if (seq_has_overflowed(s))
+ goto out;
}
out:
unlock_rsb(res);
- return rv;
}
-static int print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
- struct dlm_rsb *r)
+static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
+ struct dlm_rsb *r)
{
u64 xid = 0;
u64 us;
- int rv;
if (lkb->lkb_flags & DLM_IFL_USER) {
if (lkb->lkb_ua)
@@ -188,103 +181,97 @@ static int print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
/* id nodeid remid pid xid exflags flags sts grmode rqmode time_us
r_nodeid r_len r_name */
- rv = seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
- lkb->lkb_id,
- lkb->lkb_nodeid,
- lkb->lkb_remid,
- lkb->lkb_ownpid,
- (unsigned long long)xid,
- lkb->lkb_exflags,
- lkb->lkb_flags,
- lkb->lkb_status,
- lkb->lkb_grmode,
- lkb->lkb_rqmode,
- (unsigned long long)us,
- r->res_nodeid,
- r->res_length,
- r->res_name);
- return rv;
+ seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
+ lkb->lkb_id,
+ lkb->lkb_nodeid,
+ lkb->lkb_remid,
+ lkb->lkb_ownpid,
+ (unsigned long long)xid,
+ lkb->lkb_exflags,
+ lkb->lkb_flags,
+ lkb->lkb_status,
+ lkb->lkb_grmode,
+ lkb->lkb_rqmode,
+ (unsigned long long)us,
+ r->res_nodeid,
+ r->res_length,
+ r->res_name);
}
-static int print_format2(struct dlm_rsb *r, struct seq_file *s)
+static void print_format2(struct dlm_rsb *r, struct seq_file *s)
{
struct dlm_lkb *lkb;
- int rv = 0;
lock_rsb(r);
list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
- rv = print_format2_lock(s, lkb, r);
- if (rv)
+ print_format2_lock(s, lkb, r);
+ if (seq_has_overflowed(s))
goto out;
}
list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
- rv = print_format2_lock(s, lkb, r);
- if (rv)
+ print_format2_lock(s, lkb, r);
+ if (seq_has_overflowed(s))
goto out;
}
list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
- rv = print_format2_lock(s, lkb, r);
- if (rv)
+ print_format2_lock(s, lkb, r);
+ if (seq_has_overflowed(s))
goto out;
}
out:
unlock_rsb(r);
- return rv;
}
-static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
+static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
int rsb_lookup)
{
u64 xid = 0;
- int rv;
if (lkb->lkb_flags & DLM_IFL_USER) {
if (lkb->lkb_ua)
xid = lkb->lkb_ua->xid;
}
- rv = seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
- lkb->lkb_id,
- lkb->lkb_nodeid,
- lkb->lkb_remid,
- lkb->lkb_ownpid,
- (unsigned long long)xid,
- lkb->lkb_exflags,
- lkb->lkb_flags,
- lkb->lkb_status,
- lkb->lkb_grmode,
- lkb->lkb_rqmode,
- lkb->lkb_last_bast.mode,
- rsb_lookup,
- lkb->lkb_wait_type,
- lkb->lkb_lvbseq,
- (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
- (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time));
- return rv;
+ seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
+ lkb->lkb_id,
+ lkb->lkb_nodeid,
+ lkb->lkb_remid,
+ lkb->lkb_ownpid,
+ (unsigned long long)xid,
+ lkb->lkb_exflags,
+ lkb->lkb_flags,
+ lkb->lkb_status,
+ lkb->lkb_grmode,
+ lkb->lkb_rqmode,
+ lkb->lkb_last_bast.mode,
+ rsb_lookup,
+ lkb->lkb_wait_type,
+ lkb->lkb_lvbseq,
+ (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
+ (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time));
}
-static int print_format3(struct dlm_rsb *r, struct seq_file *s)
+static void print_format3(struct dlm_rsb *r, struct seq_file *s)
{
struct dlm_lkb *lkb;
int i, lvblen = r->res_ls->ls_lvblen;
int print_name = 1;
- int rv;
lock_rsb(r);
- rv = seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
- r,
- r->res_nodeid,
- r->res_first_lkid,
- r->res_flags,
- !list_empty(&r->res_root_list),
- !list_empty(&r->res_recover_list),
- r->res_recover_locks_count,
- r->res_length);
- if (rv)
+ seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
+ r,
+ r->res_nodeid,
+ r->res_first_lkid,
+ r->res_flags,
+ !list_empty(&r->res_root_list),
+ !list_empty(&r->res_recover_list),
+ r->res_recover_locks_count,
+ r->res_length);
+ if (seq_has_overflowed(s))
goto out;
for (i = 0; i < r->res_length; i++) {
@@ -292,7 +279,7 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
print_name = 0;
}
- seq_printf(s, "%s", print_name ? "str " : "hex");
+ seq_puts(s, print_name ? "str " : "hex");
for (i = 0; i < r->res_length; i++) {
if (print_name)
@@ -300,8 +287,8 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
else
seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
}
- rv = seq_puts(s, "\n");
- if (rv)
+ seq_puts(s, "\n");
+ if (seq_has_overflowed(s))
goto out;
if (!r->res_lvbptr)
@@ -311,65 +298,62 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
for (i = 0; i < lvblen; i++)
seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
- rv = seq_puts(s, "\n");
- if (rv)
+ seq_puts(s, "\n");
+ if (seq_has_overflowed(s))
goto out;
do_locks:
list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
- rv = print_format3_lock(s, lkb, 0);
- if (rv)
+ print_format3_lock(s, lkb, 0);
+ if (seq_has_overflowed(s))
goto out;
}
list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
- rv = print_format3_lock(s, lkb, 0);
- if (rv)
+ print_format3_lock(s, lkb, 0);
+ if (seq_has_overflowed(s))
goto out;
}
list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
- rv = print_format3_lock(s, lkb, 0);
- if (rv)
+ print_format3_lock(s, lkb, 0);
+ if (seq_has_overflowed(s))
goto out;
}
list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) {
- rv = print_format3_lock(s, lkb, 1);
- if (rv)
+ print_format3_lock(s, lkb, 1);
+ if (seq_has_overflowed(s))
goto out;
}
out:
unlock_rsb(r);
- return rv;
}
-static int print_format4(struct dlm_rsb *r, struct seq_file *s)
+static void print_format4(struct dlm_rsb *r, struct seq_file *s)
{
int our_nodeid = dlm_our_nodeid();
int print_name = 1;
- int i, rv;
+ int i;
lock_rsb(r);
- rv = seq_printf(s, "rsb %p %d %d %d %d %lu %lx %d ",
- r,
- r->res_nodeid,
- r->res_master_nodeid,
- r->res_dir_nodeid,
- our_nodeid,
- r->res_toss_time,
- r->res_flags,
- r->res_length);
- if (rv)
- goto out;
+ seq_printf(s, "rsb %p %d %d %d %d %lu %lx %d ",
+ r,
+ r->res_nodeid,
+ r->res_master_nodeid,
+ r->res_dir_nodeid,
+ our_nodeid,
+ r->res_toss_time,
+ r->res_flags,
+ r->res_length);
for (i = 0; i < r->res_length; i++) {
if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
print_name = 0;
}
- seq_printf(s, "%s", print_name ? "str " : "hex");
+ seq_puts(s, print_name ? "str " : "hex");
for (i = 0; i < r->res_length; i++) {
if (print_name)
@@ -377,10 +361,9 @@ static int print_format4(struct dlm_rsb *r, struct seq_file *s)
else
seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
}
- rv = seq_puts(s, "\n");
- out:
+ seq_puts(s, "\n");
+
unlock_rsb(r);
- return rv;
}
struct rsbtbl_iter {
@@ -390,47 +373,45 @@ struct rsbtbl_iter {
int header;
};
-/* seq_printf returns -1 if the buffer is full, and 0 otherwise.
- If the buffer is full, seq_printf can be called again, but it
- does nothing and just returns -1. So, the these printing routines
- periodically check the return value to avoid wasting too much time
- trying to print to a full buffer. */
+/*
+ * If the buffer is full, seq_printf can be called again, but it
+ * does nothing. So, the these printing routines periodically check
+ * seq_has_overflowed to avoid wasting too much time trying to print to
+ * a full buffer.
+ */
static int table_seq_show(struct seq_file *seq, void *iter_ptr)
{
struct rsbtbl_iter *ri = iter_ptr;
- int rv = 0;
switch (ri->format) {
case 1:
- rv = print_format1(ri->rsb, seq);
+ print_format1(ri->rsb, seq);
break;
case 2:
if (ri->header) {
- seq_printf(seq, "id nodeid remid pid xid exflags "
- "flags sts grmode rqmode time_ms "
- "r_nodeid r_len r_name\n");
+ seq_puts(seq, "id nodeid remid pid xid exflags flags sts grmode rqmode time_ms r_nodeid r_len r_name\n");
ri->header = 0;
}
- rv = print_format2(ri->rsb, seq);
+ print_format2(ri->rsb, seq);
break;
case 3:
if (ri->header) {
- seq_printf(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
+ seq_puts(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
ri->header = 0;
}
- rv = print_format3(ri->rsb, seq);
+ print_format3(ri->rsb, seq);
break;
case 4:
if (ri->header) {
- seq_printf(seq, "version 4 rsb 2\n");
+ seq_puts(seq, "version 4 rsb 2\n");
ri->header = 0;
}
- rv = print_format4(ri->rsb, seq);
+ print_format4(ri->rsb, seq);
break;
}
- return rv;
+ return 0;
}
static const struct seq_operations format1_seq_ops;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 83f3d5520307..35502d4046f5 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -5886,6 +5886,78 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
return error;
}
+/*
+ * The caller asks for an orphan lock on a given resource with a given mode.
+ * If a matching lock exists, it's moved to the owner's list of locks and
+ * the lkid is returned.
+ */
+
+int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, void *name, unsigned int namelen,
+ unsigned long timeout_cs, uint32_t *lkid)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_user_args *ua;
+ int found_other_mode = 0;
+ int found = 0;
+ int rv = 0;
+
+ mutex_lock(&ls->ls_orphans_mutex);
+ list_for_each_entry(lkb, &ls->ls_orphans, lkb_ownqueue) {
+ if (lkb->lkb_resource->res_length != namelen)
+ continue;
+ if (memcmp(lkb->lkb_resource->res_name, name, namelen))
+ continue;
+ if (lkb->lkb_grmode != mode) {
+ found_other_mode = 1;
+ continue;
+ }
+
+ found = 1;
+ list_del_init(&lkb->lkb_ownqueue);
+ lkb->lkb_flags &= ~DLM_IFL_ORPHAN;
+ *lkid = lkb->lkb_id;
+ break;
+ }
+ mutex_unlock(&ls->ls_orphans_mutex);
+
+ if (!found && found_other_mode) {
+ rv = -EAGAIN;
+ goto out;
+ }
+
+ if (!found) {
+ rv = -ENOENT;
+ goto out;
+ }
+
+ lkb->lkb_exflags = flags;
+ lkb->lkb_ownpid = (int) current->pid;
+
+ ua = lkb->lkb_ua;
+
+ ua->proc = ua_tmp->proc;
+ ua->xid = ua_tmp->xid;
+ ua->castparam = ua_tmp->castparam;
+ ua->castaddr = ua_tmp->castaddr;
+ ua->bastparam = ua_tmp->bastparam;
+ ua->bastaddr = ua_tmp->bastaddr;
+ ua->user_lksb = ua_tmp->user_lksb;
+
+ /*
+ * The lkb reference from the ls_orphans list was not
+ * removed above, and is now considered the reference
+ * for the proc locks list.
+ */
+
+ spin_lock(&ua->proc->locks_spin);
+ list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
+ spin_unlock(&ua->proc->locks_spin);
+ out:
+ kfree(ua_tmp);
+ return rv;
+}
+
int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
uint32_t flags, uint32_t lkid, char *lvb_in)
{
@@ -6029,7 +6101,7 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
struct dlm_args args;
int error;
- hold_lkb(lkb);
+ hold_lkb(lkb); /* reference for the ls_orphans list */
mutex_lock(&ls->ls_orphans_mutex);
list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans);
mutex_unlock(&ls->ls_orphans_mutex);
@@ -6217,7 +6289,7 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
{
int error = 0;
- if (nodeid != dlm_our_nodeid()) {
+ if (nodeid && (nodeid != dlm_our_nodeid())) {
error = send_purge(ls, nodeid, pid);
} else {
dlm_lock_recovery(ls);
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 5e0c72e36a9b..ed8ebd3a8593 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -49,6 +49,9 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
unsigned long timeout_cs);
+int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, void *name, unsigned int namelen,
+ unsigned long timeout_cs, uint32_t *lkid);
int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
uint32_t flags, uint32_t lkid, char *lvb_in);
int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 142e21655eed..fb85f32e9eca 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -238,6 +238,7 @@ static int device_user_lock(struct dlm_user_proc *proc,
{
struct dlm_ls *ls;
struct dlm_user_args *ua;
+ uint32_t lkid;
int error = -ENOMEM;
ls = dlm_find_lockspace_local(proc->lockspace);
@@ -260,12 +261,20 @@ static int device_user_lock(struct dlm_user_proc *proc,
ua->bastaddr = params->bastaddr;
ua->xid = params->xid;
- if (params->flags & DLM_LKF_CONVERT)
+ if (params->flags & DLM_LKF_CONVERT) {
error = dlm_user_convert(ls, ua,
params->mode, params->flags,
params->lkid, params->lvb,
(unsigned long) params->timeout);
- else {
+ } else if (params->flags & DLM_LKF_ORPHAN) {
+ error = dlm_user_adopt_orphan(ls, ua,
+ params->mode, params->flags,
+ params->name, params->namelen,
+ (unsigned long) params->timeout,
+ &lkid);
+ if (!error)
+ error = lkid;
+ } else {
error = dlm_user_request(ls, ua,
params->mode, params->flags,
params->name, params->namelen,
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 1de7294aad20..2bc2c87f35e7 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -40,13 +40,14 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
static void drop_slab(void)
{
int nr_objects;
- struct shrink_control shrink = {
- .gfp_mask = GFP_KERNEL,
- };
- nodes_setall(shrink.nodes_to_scan);
do {
- nr_objects = shrink_slab(&shrink, 1000, 1000);
+ int nid;
+
+ nr_objects = 0;
+ for_each_online_node(nid)
+ nr_objects += shrink_node_slabs(GFP_KERNEL, nid,
+ 1000, 1000);
} while (nr_objects > 10);
}
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 31b148f3e772..719e1ce1c609 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1373,7 +1373,7 @@ out:
int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode)
{
struct dentry *lower_dentry =
- ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry;
+ ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_path.dentry;
ssize_t size;
int rc = 0;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 54742f9a67a8..6f4e659f508f 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -75,11 +75,11 @@ struct ecryptfs_getdents_callback {
/* Inspired by generic filldir in fs/readdir.c */
static int
-ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
- loff_t offset, u64 ino, unsigned int d_type)
+ecryptfs_filldir(struct dir_context *ctx, const char *lower_name,
+ int lower_namelen, loff_t offset, u64 ino, unsigned int d_type)
{
struct ecryptfs_getdents_callback *buf =
- (struct ecryptfs_getdents_callback *)dirent;
+ container_of(ctx, struct ecryptfs_getdents_callback, ctx);
size_t name_size;
char *name;
int rc;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 34eb8433d93f..d9eb84bda559 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -576,6 +576,13 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
s->s_blocksize = path.dentry->d_sb->s_blocksize;
s->s_magic = ECRYPTFS_SUPER_MAGIC;
+ s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1;
+
+ rc = -EINVAL;
+ if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+ pr_err("eCryptfs: maximum fs stacking depth exceeded\n");
+ goto out_free;
+ }
inode = ecryptfs_get_inode(path.dentry->d_inode, s);
rc = PTR_ERR(inode);
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 564a1fa34b99..4626976794e7 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -419,7 +419,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
ssize_t size;
void *xattr_virt;
struct dentry *lower_dentry =
- ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry;
+ ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_path.dentry;
struct inode *lower_inode = lower_dentry->d_inode;
int rc;
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index cdb2971192a5..90001da9abfd 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -47,8 +47,8 @@ static ssize_t efivarfs_file_write(struct file *file,
if (bytes == -ENOENT) {
drop_nlink(inode);
- d_delete(file->f_dentry);
- dput(file->f_dentry);
+ d_delete(file->f_path.dentry);
+ dput(file->f_path.dentry);
} else {
mutex_lock(&inode->i_mutex);
i_size_write(inode, datasize + sizeof(attributes));
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 0a48886e069c..6dad1176ec52 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -236,6 +236,7 @@ static void efivarfs_kill_sb(struct super_block *sb)
}
static struct file_system_type efivarfs_type = {
+ .owner = THIS_MODULE,
.name = "efivarfs",
.mount = efivarfs_mount,
.kill_sb = efivarfs_kill_sb,
@@ -244,17 +245,23 @@ static struct file_system_type efivarfs_type = {
static __init int efivarfs_init(void)
{
if (!efi_enabled(EFI_RUNTIME_SERVICES))
- return 0;
+ return -ENODEV;
if (!efivars_kobject())
- return 0;
+ return -ENODEV;
return register_filesystem(&efivarfs_type);
}
+static __exit void efivarfs_exit(void)
+{
+ unregister_filesystem(&efivarfs_type);
+}
+
MODULE_AUTHOR("Matthew Garrett, Jeremy Kerr");
MODULE_DESCRIPTION("EFI Variable Filesystem");
MODULE_LICENSE("GPL");
MODULE_ALIAS_FS("efivarfs");
module_init(efivarfs_init);
+module_exit(efivarfs_exit);
diff --git a/fs/eventfd.c b/fs/eventfd.c
index d6a88e7812f3..4b0a226024fa 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -287,17 +287,14 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
}
#ifdef CONFIG_PROC_FS
-static int eventfd_show_fdinfo(struct seq_file *m, struct file *f)
+static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
struct eventfd_ctx *ctx = f->private_data;
- int ret;
spin_lock_irq(&ctx->wqh.lock);
- ret = seq_printf(m, "eventfd-count: %16llx\n",
- (unsigned long long)ctx->count);
+ seq_printf(m, "eventfd-count: %16llx\n",
+ (unsigned long long)ctx->count);
spin_unlock_irq(&ctx->wqh.lock);
-
- return ret;
}
#endif
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 7bcfff900f05..d77f94491352 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -870,25 +870,22 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
}
#ifdef CONFIG_PROC_FS
-static int ep_show_fdinfo(struct seq_file *m, struct file *f)
+static void ep_show_fdinfo(struct seq_file *m, struct file *f)
{
struct eventpoll *ep = f->private_data;
struct rb_node *rbp;
- int ret = 0;
mutex_lock(&ep->mtx);
for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
- ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
- epi->ffd.fd, epi->event.events,
- (long long)epi->event.data);
- if (ret)
+ seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
+ epi->ffd.fd, epi->event.events,
+ (long long)epi->event.data);
+ if (seq_has_overflowed(m))
break;
}
mutex_unlock(&ep->mtx);
-
- return ret;
}
#endif
diff --git a/fs/exec.c b/fs/exec.c
index 7302b75a9820..ad8798e26be9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -277,6 +277,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
goto err;
mm->stack_vm = mm->total_vm = 1;
+ arch_bprm_mm_init(mm, vma);
up_write(&mm->mmap_sem);
bprm->p = vma->vm_end - sizeof(void *);
return 0;
@@ -747,18 +748,25 @@ EXPORT_SYMBOL(setup_arg_pages);
#endif /* CONFIG_MMU */
-static struct file *do_open_exec(struct filename *name)
+static struct file *do_open_execat(int fd, struct filename *name, int flags)
{
struct file *file;
int err;
- static const struct open_flags open_exec_flags = {
+ struct open_flags open_exec_flags = {
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
.acc_mode = MAY_EXEC | MAY_OPEN,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
};
- file = do_filp_open(AT_FDCWD, name, &open_exec_flags);
+ if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return ERR_PTR(-EINVAL);
+ if (flags & AT_SYMLINK_NOFOLLOW)
+ open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
+ if (flags & AT_EMPTY_PATH)
+ open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
+
+ file = do_filp_open(fd, name, &open_exec_flags);
if (IS_ERR(file))
goto out;
@@ -769,12 +777,13 @@ static struct file *do_open_exec(struct filename *name)
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
goto exit;
- fsnotify_open(file);
-
err = deny_write_access(file);
if (err)
goto exit;
+ if (name->name[0] != '\0')
+ fsnotify_open(file);
+
out:
return file;
@@ -786,7 +795,7 @@ exit:
struct file *open_exec(const char *name)
{
struct filename tmp = { .name = name };
- return do_open_exec(&tmp);
+ return do_open_execat(AT_FDCWD, &tmp, 0);
}
EXPORT_SYMBOL(open_exec);
@@ -1427,10 +1436,12 @@ static int exec_binprm(struct linux_binprm *bprm)
/*
* sys_execve() executes a new program.
*/
-static int do_execve_common(struct filename *filename,
- struct user_arg_ptr argv,
- struct user_arg_ptr envp)
+static int do_execveat_common(int fd, struct filename *filename,
+ struct user_arg_ptr argv,
+ struct user_arg_ptr envp,
+ int flags)
{
+ char *pathbuf = NULL;
struct linux_binprm *bprm;
struct file *file;
struct files_struct *displaced;
@@ -1471,7 +1482,7 @@ static int do_execve_common(struct filename *filename,
check_unsafe_exec(bprm);
current->in_execve = 1;
- file = do_open_exec(filename);
+ file = do_open_execat(fd, filename, flags);
retval = PTR_ERR(file);
if (IS_ERR(file))
goto out_unmark;
@@ -1479,7 +1490,28 @@ static int do_execve_common(struct filename *filename,
sched_exec();
bprm->file = file;
- bprm->filename = bprm->interp = filename->name;
+ if (fd == AT_FDCWD || filename->name[0] == '/') {
+ bprm->filename = filename->name;
+ } else {
+ if (filename->name[0] == '\0')
+ pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd);
+ else
+ pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s",
+ fd, filename->name);
+ if (!pathbuf) {
+ retval = -ENOMEM;
+ goto out_unmark;
+ }
+ /*
+ * Record that a name derived from an O_CLOEXEC fd will be
+ * inaccessible after exec. Relies on having exclusive access to
+ * current->files (due to unshare_files above).
+ */
+ if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
+ bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
+ bprm->filename = pathbuf;
+ }
+ bprm->interp = bprm->filename;
retval = bprm_mm_init(bprm);
if (retval)
@@ -1520,6 +1552,7 @@ static int do_execve_common(struct filename *filename,
acct_update_integrals(current);
task_numa_free(current);
free_bprm(bprm);
+ kfree(pathbuf);
putname(filename);
if (displaced)
put_files_struct(displaced);
@@ -1537,6 +1570,7 @@ out_unmark:
out_free:
free_bprm(bprm);
+ kfree(pathbuf);
out_files:
if (displaced)
@@ -1552,7 +1586,18 @@ int do_execve(struct filename *filename,
{
struct user_arg_ptr argv = { .ptr.native = __argv };
struct user_arg_ptr envp = { .ptr.native = __envp };
- return do_execve_common(filename, argv, envp);
+ return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
+}
+
+int do_execveat(int fd, struct filename *filename,
+ const char __user *const __user *__argv,
+ const char __user *const __user *__envp,
+ int flags)
+{
+ struct user_arg_ptr argv = { .ptr.native = __argv };
+ struct user_arg_ptr envp = { .ptr.native = __envp };
+
+ return do_execveat_common(fd, filename, argv, envp, flags);
}
#ifdef CONFIG_COMPAT
@@ -1568,7 +1613,23 @@ static int compat_do_execve(struct filename *filename,
.is_compat = true,
.ptr.compat = __envp,
};
- return do_execve_common(filename, argv, envp);
+ return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
+}
+
+static int compat_do_execveat(int fd, struct filename *filename,
+ const compat_uptr_t __user *__argv,
+ const compat_uptr_t __user *__envp,
+ int flags)
+{
+ struct user_arg_ptr argv = {
+ .is_compat = true,
+ .ptr.compat = __argv,
+ };
+ struct user_arg_ptr envp = {
+ .is_compat = true,
+ .ptr.compat = __envp,
+ };
+ return do_execveat_common(fd, filename, argv, envp, flags);
}
#endif
@@ -1608,6 +1669,20 @@ SYSCALL_DEFINE3(execve,
{
return do_execve(getname(filename), argv, envp);
}
+
+SYSCALL_DEFINE5(execveat,
+ int, fd, const char __user *, filename,
+ const char __user *const __user *, argv,
+ const char __user *const __user *, envp,
+ int, flags)
+{
+ int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+
+ return do_execveat(fd,
+ getname_flags(filename, lookup_flags, NULL),
+ argv, envp, flags);
+}
+
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
const compat_uptr_t __user *, argv,
@@ -1615,4 +1690,17 @@ COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
{
return compat_do_execve(getname(filename), argv, envp);
}
+
+COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
+ const char __user *, filename,
+ const compat_uptr_t __user *, argv,
+ const compat_uptr_t __user *, envp,
+ int, flags)
+{
+ int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+
+ return compat_do_execveat(fd,
+ getname_flags(filename, lookup_flags, NULL),
+ argv, envp, flags);
+}
#endif
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index b01fbfb51f43..fdfd206c737a 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -50,7 +50,7 @@ find_acceptable_alias(struct dentry *result,
inode = result->d_inode;
spin_lock(&inode->i_lock);
- hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
dget(dentry);
spin_unlock(&inode->i_lock);
if (toput)
@@ -241,10 +241,11 @@ struct getdents_callback {
* A rather strange filldir function to capture
* the name matching the specified inode number.
*/
-static int filldir_one(void * __buf, const char * name, int len,
+static int filldir_one(struct dir_context *ctx, const char *name, int len,
loff_t pos, u64 ino, unsigned int d_type)
{
- struct getdents_callback *buf = __buf;
+ struct getdents_callback *buf =
+ container_of(ctx, struct getdents_callback, ctx);
int result = 0;
buf->sequence++;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index d9a17d0b124d..e4279ead4a05 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -689,6 +689,9 @@ struct ext2_inode_info {
struct mutex truncate_mutex;
struct inode vfs_inode;
struct list_head i_orphan; /* unlinked but open inodes */
+#ifdef CONFIG_QUOTA
+ struct dquot *i_dquot[MAXQUOTAS];
+#endif
};
/*
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 170dc41e8bf4..ae55fddc26a9 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -166,6 +166,10 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
return NULL;
ei->i_block_alloc_info = NULL;
ei->vfs_inode.i_version = 1;
+#ifdef CONFIG_QUOTA
+ memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
+#endif
+
return &ei->vfs_inode;
}
@@ -303,6 +307,10 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
#ifdef CONFIG_QUOTA
static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off);
static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off);
+static struct dquot **ext2_get_dquots(struct inode *inode)
+{
+ return EXT2_I(inode)->i_dquot;
+}
#endif
static const struct super_operations ext2_sops = {
@@ -320,6 +328,7 @@ static const struct super_operations ext2_sops = {
#ifdef CONFIG_QUOTA
.quota_read = ext2_quota_read,
.quota_write = ext2_quota_write,
+ .get_dquots = ext2_get_dquots,
#endif
};
@@ -1090,6 +1099,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_QUOTA
sb->dq_op = &dquot_operations;
sb->s_qcop = &dquot_quotactl_ops;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
#endif
root = ext2_iget(sb, EXT2_ROOT_INO);
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
index fc3cdcf24aed..f483a80b3fe7 100644
--- a/fs/ext3/ext3.h
+++ b/fs/ext3/ext3.h
@@ -615,6 +615,10 @@ struct ext3_inode_info {
atomic_t i_sync_tid;
atomic_t i_datasync_tid;
+#ifdef CONFIG_QUOTA
+ struct dquot *i_dquot[MAXQUOTAS];
+#endif
+
struct inode vfs_inode;
};
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 7015db0bafd1..9b4e7d750d4f 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -485,6 +485,10 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
ei->vfs_inode.i_version = 1;
atomic_set(&ei->i_datasync_tid, 0);
atomic_set(&ei->i_sync_tid, 0);
+#ifdef CONFIG_QUOTA
+ memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
+#endif
+
return &ei->vfs_inode;
}
@@ -764,6 +768,10 @@ static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
static ssize_t ext3_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
+static struct dquot **ext3_get_dquots(struct inode *inode)
+{
+ return EXT3_I(inode)->i_dquot;
+}
static const struct dquot_operations ext3_quota_operations = {
.write_dquot = ext3_write_dquot,
@@ -803,6 +811,7 @@ static const struct super_operations ext3_sops = {
#ifdef CONFIG_QUOTA
.quota_read = ext3_quota_read,
.quota_write = ext3_quota_write,
+ .get_dquots = ext3_get_dquots,
#endif
.bdev_try_to_free_page = bdev_try_to_free_page,
};
@@ -1354,13 +1363,6 @@ set_qf_format:
"not specified.");
return 0;
}
- } else {
- if (sbi->s_jquota_fmt) {
- ext3_msg(sb, KERN_ERR, "error: journaled quota format "
- "specified with no journaling "
- "enabled.");
- return 0;
- }
}
#endif
return 1;
@@ -2008,6 +2010,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
#ifdef CONFIG_QUOTA
sb->s_qcop = &ext3_qctl_operations;
sb->dq_op = &ext3_quota_operations;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
#endif
memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c55a1faaed58..a75fba67bb1f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -158,17 +158,8 @@ struct ext4_allocation_request {
#define EXT4_MAP_MAPPED (1 << BH_Mapped)
#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
-/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
- * ext4_map_blocks wants to know whether or not the underlying cluster has
- * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
- * the requested mapping was from previously mapped (or delayed allocated)
- * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
- * should never appear on buffer_head's state flags.
- */
-#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
- EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
- EXT4_MAP_FROM_CLUSTER)
+ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
@@ -565,10 +556,8 @@ enum {
#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
/* Do not take i_data_sem locking in ext4_map_blocks */
#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
- /* Do not put hole in extent cache */
-#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
/* Convert written extents to unwritten */
-#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200
/*
* The bit position of these flags must not overlap with any of the
@@ -889,10 +878,12 @@ struct ext4_inode_info {
/* extents status tree */
struct ext4_es_tree i_es_tree;
rwlock_t i_es_lock;
- struct list_head i_es_lru;
+ struct list_head i_es_list;
unsigned int i_es_all_nr; /* protected by i_es_lock */
- unsigned int i_es_lru_nr; /* protected by i_es_lock */
- unsigned long i_touch_when; /* jiffies of last accessing */
+ unsigned int i_es_shk_nr; /* protected by i_es_lock */
+ ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for
+ extents to shrink. Protected by
+ i_es_lock */
/* ialloc */
ext4_group_t i_last_alloc_group;
@@ -941,6 +932,10 @@ struct ext4_inode_info {
tid_t i_sync_tid;
tid_t i_datasync_tid;
+#ifdef CONFIG_QUOTA
+ struct dquot *i_dquot[MAXQUOTAS];
+#endif
+
/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
__u32 i_csum_seed;
};
@@ -1333,10 +1328,11 @@ struct ext4_sb_info {
/* Reclaim extents from extent status tree */
struct shrinker s_es_shrinker;
- struct list_head s_es_lru;
+ struct list_head s_es_list; /* List of inodes with reclaimable extents */
+ long s_es_nr_inode;
struct ext4_es_stats s_es_stats;
struct mb_cache *s_mb_cache;
- spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
+ spinlock_t s_es_lock ____cacheline_aligned_in_smp;
/* Ratelimit ext4 messages. */
struct ratelimit_state s_err_ratelimit_state;
@@ -2192,7 +2188,6 @@ extern int ext4_calculate_overhead(struct super_block *sb);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
extern void *ext4_kvzalloc(size_t size, gfp_t flags);
-extern void ext4_kvfree(void *ptr);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
extern const char *ext4_decode_error(struct super_block *sb, int errno,
@@ -2643,7 +2638,7 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
int *retval);
extern int ext4_inline_data_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo,
- int *has_inline);
+ int *has_inline, __u64 start, __u64 len);
extern int ext4_try_to_evict_inline_data(handle_t *handle,
struct inode *inode,
int needed);
@@ -2791,16 +2786,6 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
/*
- * Note that these flags will never ever appear in a buffer_head's state flag.
- * See EXT4_MAP_... to see where this is used.
- */
-enum ext4_state_bits {
- BH_AllocFromCluster /* allocated blocks were part of already
- * allocated cluster. */
- = BH_JBDPrivateStart
-};
-
-/*
* Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
* to mark the bitmap uptodate. We need to also zero-out the bitmap
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 37043d0b2be8..e5d3eadf47b1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2306,16 +2306,16 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
ext4_lblk_t block)
{
int depth = ext_depth(inode);
- unsigned long len = 0;
- ext4_lblk_t lblock = 0;
+ ext4_lblk_t len;
+ ext4_lblk_t lblock;
struct ext4_extent *ex;
+ struct extent_status es;
ex = path[depth].p_ext;
if (ex == NULL) {
- /*
- * there is no extent yet, so gap is [0;-] and we
- * don't cache it
- */
+ /* there is no extent yet, so gap is [0;-] */
+ lblock = 0;
+ len = EXT_MAX_BLOCKS;
ext_debug("cache gap(whole file):");
} else if (block < le32_to_cpu(ex->ee_block)) {
lblock = block;
@@ -2324,9 +2324,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
block,
le32_to_cpu(ex->ee_block),
ext4_ext_get_actual_len(ex));
- if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
- ext4_es_insert_extent(inode, lblock, len, ~0,
- EXTENT_STATUS_HOLE);
} else if (block >= le32_to_cpu(ex->ee_block)
+ ext4_ext_get_actual_len(ex)) {
ext4_lblk_t next;
@@ -2340,14 +2337,19 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
block);
BUG_ON(next == lblock);
len = next - lblock;
- if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
- ext4_es_insert_extent(inode, lblock, len, ~0,
- EXTENT_STATUS_HOLE);
} else {
BUG();
}
- ext_debug(" -> %u:%lu\n", lblock, len);
+ ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es);
+ if (es.es_len) {
+ /* There's delayed extent containing lblock? */
+ if (es.es_lblk <= lblock)
+ return;
+ len = min(es.es_lblk - lblock, len);
+ }
+ ext_debug(" -> %u:%u\n", lblock, len);
+ ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE);
}
/*
@@ -2481,7 +2483,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t from, ext4_lblk_t to)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned short ee_len = ext4_ext_get_actual_len(ex);
+ unsigned short ee_len = ext4_ext_get_actual_len(ex);
ext4_fsblk_t pblk;
int flags = get_default_free_blocks_flags(inode);
@@ -2490,7 +2492,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* at the beginning of the extent. Instead, we make a note
* that we tried freeing the cluster, and check to see if we
* need to free it on a subsequent call to ext4_remove_blocks,
- * or at the end of the ext4_truncate() operation.
+ * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
*/
flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
@@ -2501,8 +2503,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* partial cluster here.
*/
pblk = ext4_ext_pblock(ex) + ee_len - 1;
- if ((*partial_cluster > 0) &&
- (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+ if (*partial_cluster > 0 &&
+ *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, *partial_cluster),
sbi->s_cluster_ratio, flags);
@@ -2528,7 +2530,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
&& to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
ext4_lblk_t num;
- unsigned int unaligned;
+ long long first_cluster;
num = le32_to_cpu(ex->ee_block) + ee_len - from;
pblk = ext4_ext_pblock(ex) + ee_len - num;
@@ -2538,7 +2540,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* used by any other extent (partial_cluster is negative).
*/
if (*partial_cluster < 0 &&
- -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+ *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
ext_debug("free last %u blocks starting %llu partial %lld\n",
@@ -2549,21 +2551,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* beginning of a cluster, and we removed the entire
* extent and the cluster is not used by any other extent,
* save the partial cluster here, since we might need to
- * delete if we determine that the truncate operation has
- * removed all of the blocks in the cluster.
+ * delete if we determine that the truncate or punch hole
+ * operation has removed all of the blocks in the cluster.
+ * If that cluster is used by another extent, preserve its
+ * negative value so it isn't freed later on.
*
- * On the other hand, if we did not manage to free the whole
- * extent, we have to mark the cluster as used (store negative
- * cluster number in partial_cluster).
+ * If the whole extent wasn't freed, we've reached the
+ * start of the truncated/punched region and have finished
+ * removing blocks. If there's a partial cluster here it's
+ * shared with the remainder of the extent and is no longer
+ * a candidate for removal.
*/
- unaligned = EXT4_PBLK_COFF(sbi, pblk);
- if (unaligned && (ee_len == num) &&
- (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
- *partial_cluster = EXT4_B2C(sbi, pblk);
- else if (unaligned)
- *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
- else if (*partial_cluster > 0)
+ if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
+ first_cluster = (long long) EXT4_B2C(sbi, pblk);
+ if (first_cluster != -*partial_cluster)
+ *partial_cluster = first_cluster;
+ } else {
*partial_cluster = 0;
+ }
} else
ext4_error(sbi->s_sb, "strange request: removal(2) "
"%u-%u from %u:%u\n",
@@ -2574,15 +2579,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
/*
* ext4_ext_rm_leaf() Removes the extents associated with the
- * blocks appearing between "start" and "end", and splits the extents
- * if "start" and "end" appear in the same extent
+ * blocks appearing between "start" and "end". Both "start"
+ * and "end" must appear in the same extent or EIO is returned.
*
* @handle: The journal handle
* @inode: The files inode
* @path: The path to the leaf
* @partial_cluster: The cluster which we'll have to free if all extents
- * has been released from it. It gets negative in case
- * that the cluster is still used.
+ * has been released from it. However, if this value is
+ * negative, it's a cluster just to the right of the
+ * punched region and it must not be freed.
* @start: The first block to remove
* @end: The last block to remove
*/
@@ -2621,27 +2627,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
- /*
- * If we're starting with an extent other than the last one in the
- * node, we need to see if it shares a cluster with the extent to
- * the right (towards the end of the file). If its leftmost cluster
- * is this extent's rightmost cluster and it is not cluster aligned,
- * we'll mark it as a partial that is not to be deallocated.
- */
-
- if (ex != EXT_LAST_EXTENT(eh)) {
- ext4_fsblk_t current_pblk, right_pblk;
- long long current_cluster, right_cluster;
-
- current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
- current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
- right_pblk = ext4_ext_pblock(ex + 1);
- right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
- if (current_cluster == right_cluster &&
- EXT4_PBLK_COFF(sbi, right_pblk))
- *partial_cluster = -right_cluster;
- }
-
trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
while (ex >= EXT_FIRST_EXTENT(eh) &&
@@ -2666,14 +2651,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (end < ex_ee_block) {
/*
* We're going to skip this extent and move to another,
- * so if this extent is not cluster aligned we have
- * to mark the current cluster as used to avoid
- * accidentally freeing it later on
+ * so note that its first cluster is in use to avoid
+ * freeing it when removing blocks. Eventually, the
+ * right edge of the truncated/punched region will
+ * be just to the left.
*/
- pblk = ext4_ext_pblock(ex);
- if (EXT4_PBLK_COFF(sbi, pblk))
+ if (sbi->s_cluster_ratio > 1) {
+ pblk = ext4_ext_pblock(ex);
*partial_cluster =
- -((long long)EXT4_B2C(sbi, pblk));
+ -(long long) EXT4_B2C(sbi, pblk);
+ }
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2749,8 +2736,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
sizeof(struct ext4_extent));
}
le16_add_cpu(&eh->eh_entries, -1);
- } else if (*partial_cluster > 0)
- *partial_cluster = 0;
+ }
err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
@@ -2769,20 +2755,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
/*
* If there's a partial cluster and at least one extent remains in
* the leaf, free the partial cluster if it isn't shared with the
- * current extent. If there's a partial cluster and no extents
- * remain in the leaf, it can't be freed here. It can only be
- * freed when it's possible to determine if it's not shared with
- * any other extent - when the next leaf is processed or when space
- * removal is complete.
+ * current extent. If it is shared with the current extent
+ * we zero partial_cluster because we've reached the start of the
+ * truncated/punched region and we're done removing blocks.
*/
- if (*partial_cluster > 0 && eh->eh_entries &&
- (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
- *partial_cluster)) {
- int flags = get_default_free_blocks_flags(inode);
-
- ext4_free_blocks(handle, inode, NULL,
- EXT4_C2B(sbi, *partial_cluster),
- sbi->s_cluster_ratio, flags);
+ if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
+ pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
+ if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(sbi, *partial_cluster),
+ sbi->s_cluster_ratio,
+ get_default_free_blocks_flags(inode));
+ }
*partial_cluster = 0;
}
@@ -2819,7 +2803,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end)
{
- struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int depth = ext_depth(inode);
struct ext4_ext_path *path = NULL;
long long partial_cluster = 0;
@@ -2845,9 +2829,10 @@ again:
*/
if (end < EXT_MAX_BLOCKS - 1) {
struct ext4_extent *ex;
- ext4_lblk_t ee_block;
+ ext4_lblk_t ee_block, ex_end, lblk;
+ ext4_fsblk_t pblk;
- /* find extent for this block */
+ /* find extent for or closest extent to this block */
path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
@@ -2867,6 +2852,7 @@ again:
}
ee_block = le32_to_cpu(ex->ee_block);
+ ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
/*
* See if the last block is inside the extent, if so split
@@ -2874,8 +2860,19 @@ again:
* tail of the first part of the split extent in
* ext4_ext_rm_leaf().
*/
- if (end >= ee_block &&
- end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
+ if (end >= ee_block && end < ex_end) {
+
+ /*
+ * If we're going to split the extent, note that
+ * the cluster containing the block after 'end' is
+ * in use to avoid freeing it when removing blocks.
+ */
+ if (sbi->s_cluster_ratio > 1) {
+ pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
+ partial_cluster =
+ -(long long) EXT4_B2C(sbi, pblk);
+ }
+
/*
* Split the extent in two so that 'end' is the last
* block in the first new extent. Also we should not
@@ -2886,6 +2883,24 @@ again:
end + 1, 1);
if (err < 0)
goto out;
+
+ } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
+ /*
+ * If there's an extent to the right its first cluster
+ * contains the immediate right boundary of the
+ * truncated/punched region. Set partial_cluster to
+ * its negative value so it won't be freed if shared
+ * with the current extent. The end < ee_block case
+ * is handled in ext4_ext_rm_leaf().
+ */
+ lblk = ex_end + 1;
+ err = ext4_ext_search_right(inode, path, &lblk, &pblk,
+ &ex);
+ if (err)
+ goto out;
+ if (pblk)
+ partial_cluster =
+ -(long long) EXT4_B2C(sbi, pblk);
}
}
/*
@@ -2996,16 +3011,18 @@ again:
trace_ext4_ext_remove_space_done(inode, start, end, depth,
partial_cluster, path->p_hdr->eh_entries);
- /* If we still have something in the partial cluster and we have removed
+ /*
+ * If we still have something in the partial cluster and we have removed
* even the first extent, then we should free the blocks in the partial
- * cluster as well. */
- if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
- int flags = get_default_free_blocks_flags(inode);
-
+ * cluster as well. (This code will only run when there are no leaves
+ * to the immediate left of the truncated/punched region.)
+ */
+ if (partial_cluster > 0 && err == 0) {
+ /* don't zero partial_cluster since it's not used afterwards */
ext4_free_blocks(handle, inode, NULL,
- EXT4_C2B(EXT4_SB(sb), partial_cluster),
- EXT4_SB(sb)->s_cluster_ratio, flags);
- partial_cluster = 0;
+ EXT4_C2B(sbi, partial_cluster),
+ sbi->s_cluster_ratio,
+ get_default_free_blocks_flags(inode));
}
/* TODO: flexible tree reduction should be here */
@@ -3603,11 +3620,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
}
}
- allocated = ext4_split_extent(handle, inode, ppath,
- &split_map, split_flag, flags);
- if (allocated < 0)
- err = allocated;
-
+ err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
+ flags);
+ if (err > 0)
+ err = 0;
out:
/* If we have gotten a failure, don't zero out status tree */
if (!err)
@@ -4268,6 +4284,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext4_io_end_t *io = ext4_inode_aio(inode);
ext4_lblk_t cluster_offset;
int set_unwritten = 0;
+ bool map_from_cluster = false;
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
@@ -4344,10 +4361,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
}
}
- if ((sbi->s_cluster_ratio > 1) &&
- ext4_find_delalloc_cluster(inode, map->m_lblk))
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
-
/*
* requested block isn't allocated yet;
* we couldn't try to create block if create flag is zero
@@ -4357,15 +4370,13 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* put just found gap into cache to speed up
* subsequent requests
*/
- if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0)
- ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+ ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
goto out2;
}
/*
* Okay, we need to do block allocation.
*/
- map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
newex.ee_block = cpu_to_le32(map->m_lblk);
cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
@@ -4377,7 +4388,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ map_from_cluster = true;
goto got_allocated_blocks;
}
@@ -4398,7 +4409,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ map_from_cluster = true;
goto got_allocated_blocks;
}
@@ -4524,7 +4535,7 @@ got_allocated_blocks:
*/
reserved_clusters = get_reserved_cluster_alloc(inode,
map->m_lblk, allocated);
- if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
+ if (map_from_cluster) {
if (reserved_clusters) {
/*
* We have clusters reserved for this range.
@@ -4621,7 +4632,6 @@ out2:
trace_ext4_ext_map_blocks_exit(inode, flags, map,
err ? err : allocated);
- ext4_es_lru_add(inode);
return err ? err : allocated;
}
@@ -5141,7 +5151,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ext4_has_inline_data(inode)) {
int has_inline = 1;
- error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
+ error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
+ start, len);
if (has_inline)
return error;
@@ -5155,8 +5166,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
/* fallback to generic here if not in extents fmt */
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- return generic_block_fiemap(inode, fieinfo, start, len,
- ext4_get_block);
+ return __generic_block_fiemap(inode, fieinfo, start, len,
+ ext4_get_block);
if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
return -EBADR;
@@ -5180,7 +5191,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
error = ext4_fill_fiemap_extents(inode, start_blk,
len_blks, fieinfo);
}
- ext4_es_lru_add(inode);
return error;
}
@@ -5240,8 +5250,6 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
return -EIO;
ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
- if (!ex_last)
- return -EIO;
err = ext4_access_path(handle, inode, path + depth);
if (err)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 94e7855ae71b..e04d45733976 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -147,10 +147,9 @@ static struct kmem_cache *ext4_es_cachep;
static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end);
-static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
- int nr_to_scan);
-static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
- struct ext4_inode_info *locked_ei);
+static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
+static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei);
int __init ext4_init_es(void)
{
@@ -298,6 +297,36 @@ out:
trace_ext4_es_find_delayed_extent_range_exit(inode, es);
}
+static void ext4_es_list_add(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (!list_empty(&ei->i_es_list))
+ return;
+
+ spin_lock(&sbi->s_es_lock);
+ if (list_empty(&ei->i_es_list)) {
+ list_add_tail(&ei->i_es_list, &sbi->s_es_list);
+ sbi->s_es_nr_inode++;
+ }
+ spin_unlock(&sbi->s_es_lock);
+}
+
+static void ext4_es_list_del(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ spin_lock(&sbi->s_es_lock);
+ if (!list_empty(&ei->i_es_list)) {
+ list_del_init(&ei->i_es_list);
+ sbi->s_es_nr_inode--;
+ WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
+ }
+ spin_unlock(&sbi->s_es_lock);
+}
+
static struct extent_status *
ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
ext4_fsblk_t pblk)
@@ -314,9 +343,10 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
* We don't count delayed extent because we never try to reclaim them
*/
if (!ext4_es_is_delayed(es)) {
- EXT4_I(inode)->i_es_lru_nr++;
+ if (!EXT4_I(inode)->i_es_shk_nr++)
+ ext4_es_list_add(inode);
percpu_counter_inc(&EXT4_SB(inode->i_sb)->
- s_es_stats.es_stats_lru_cnt);
+ s_es_stats.es_stats_shk_cnt);
}
EXT4_I(inode)->i_es_all_nr++;
@@ -330,12 +360,13 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
EXT4_I(inode)->i_es_all_nr--;
percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
- /* Decrease the lru counter when this es is not delayed */
+ /* Decrease the shrink counter when this es is not delayed */
if (!ext4_es_is_delayed(es)) {
- BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
- EXT4_I(inode)->i_es_lru_nr--;
+ BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
+ if (!--EXT4_I(inode)->i_es_shk_nr)
+ ext4_es_list_del(inode);
percpu_counter_dec(&EXT4_SB(inode->i_sb)->
- s_es_stats.es_stats_lru_cnt);
+ s_es_stats.es_stats_shk_cnt);
}
kmem_cache_free(ext4_es_cachep, es);
@@ -351,7 +382,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
static int ext4_es_can_be_merged(struct extent_status *es1,
struct extent_status *es2)
{
- if (ext4_es_status(es1) != ext4_es_status(es2))
+ if (ext4_es_type(es1) != ext4_es_type(es2))
return 0;
if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
@@ -394,6 +425,8 @@ ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
es1 = rb_entry(node, struct extent_status, rb_node);
if (ext4_es_can_be_merged(es1, es)) {
es1->es_len += es->es_len;
+ if (ext4_es_is_referenced(es))
+ ext4_es_set_referenced(es1);
rb_erase(&es->rb_node, &tree->root);
ext4_es_free_extent(inode, es);
es = es1;
@@ -416,6 +449,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
es1 = rb_entry(node, struct extent_status, rb_node);
if (ext4_es_can_be_merged(es, es1)) {
es->es_len += es1->es_len;
+ if (ext4_es_is_referenced(es1))
+ ext4_es_set_referenced(es);
rb_erase(node, &tree->root);
ext4_es_free_extent(inode, es1);
}
@@ -683,8 +718,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
goto error;
retry:
err = __es_insert_extent(inode, &newes);
- if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
- EXT4_I(inode)))
+ if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
+ 128, EXT4_I(inode)))
goto retry;
if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
err = 0;
@@ -782,6 +817,8 @@ out:
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
+ if (!ext4_es_is_referenced(es))
+ ext4_es_set_referenced(es);
stats->es_stats_cache_hits++;
} else {
stats->es_stats_cache_misses++;
@@ -841,8 +878,8 @@ retry:
es->es_lblk = orig_es.es_lblk;
es->es_len = orig_es.es_len;
if ((err == -ENOMEM) &&
- __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
- EXT4_I(inode)))
+ __es_shrink(EXT4_SB(inode->i_sb),
+ 128, EXT4_I(inode)))
goto retry;
goto out;
}
@@ -914,6 +951,11 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
end = lblk + len - 1;
BUG_ON(end < lblk);
+ /*
+ * ext4_clear_inode() depends on us taking i_es_lock unconditionally
+ * so that we are sure __es_shrink() is done with the inode before it
+ * is reclaimed.
+ */
write_lock(&EXT4_I(inode)->i_es_lock);
err = __es_remove_extent(inode, lblk, end);
write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -921,114 +963,75 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
return err;
}
-static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
- struct list_head *b)
-{
- struct ext4_inode_info *eia, *eib;
- eia = list_entry(a, struct ext4_inode_info, i_es_lru);
- eib = list_entry(b, struct ext4_inode_info, i_es_lru);
-
- if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
- !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
- return 1;
- if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
- ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
- return -1;
- if (eia->i_touch_when == eib->i_touch_when)
- return 0;
- if (time_after(eia->i_touch_when, eib->i_touch_when))
- return 1;
- else
- return -1;
-}
-
-static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
- struct ext4_inode_info *locked_ei)
+static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei)
{
struct ext4_inode_info *ei;
struct ext4_es_stats *es_stats;
- struct list_head *cur, *tmp;
- LIST_HEAD(skipped);
ktime_t start_time;
u64 scan_time;
+ int nr_to_walk;
int nr_shrunk = 0;
- int retried = 0, skip_precached = 1, nr_skipped = 0;
+ int retried = 0, nr_skipped = 0;
es_stats = &sbi->s_es_stats;
start_time = ktime_get();
- spin_lock(&sbi->s_es_lru_lock);
retry:
- list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
- int shrunk;
-
- /*
- * If we have already reclaimed all extents from extent
- * status tree, just stop the loop immediately.
- */
- if (percpu_counter_read_positive(
- &es_stats->es_stats_lru_cnt) == 0)
- break;
-
- ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+ spin_lock(&sbi->s_es_lock);
+ nr_to_walk = sbi->s_es_nr_inode;
+ while (nr_to_walk-- > 0) {
+ if (list_empty(&sbi->s_es_list)) {
+ spin_unlock(&sbi->s_es_lock);
+ goto out;
+ }
+ ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
+ i_es_list);
+ /* Move the inode to the tail */
+ list_move_tail(&ei->i_es_list, &sbi->s_es_list);
/*
- * Skip the inode that is newer than the last_sorted
- * time. Normally we try hard to avoid shrinking
- * precached inodes, but we will as a last resort.
+ * Normally we try hard to avoid shrinking precached inodes,
+ * but we will as a last resort.
*/
- if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
- (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
- EXT4_STATE_EXT_PRECACHED))) {
+ if (!retried && ext4_test_inode_state(&ei->vfs_inode,
+ EXT4_STATE_EXT_PRECACHED)) {
nr_skipped++;
- list_move_tail(cur, &skipped);
continue;
}
- if (ei->i_es_lru_nr == 0 || ei == locked_ei ||
- !write_trylock(&ei->i_es_lock))
+ if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
+ nr_skipped++;
continue;
+ }
+ /*
+ * Now we hold i_es_lock which protects us from inode reclaim
+ * freeing inode under us
+ */
+ spin_unlock(&sbi->s_es_lock);
- shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
- if (ei->i_es_lru_nr == 0)
- list_del_init(&ei->i_es_lru);
+ nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
write_unlock(&ei->i_es_lock);
- nr_shrunk += shrunk;
- nr_to_scan -= shrunk;
- if (nr_to_scan == 0)
- break;
+ if (nr_to_scan <= 0)
+ goto out;
+ spin_lock(&sbi->s_es_lock);
}
-
- /* Move the newer inodes into the tail of the LRU list. */
- list_splice_tail(&skipped, &sbi->s_es_lru);
- INIT_LIST_HEAD(&skipped);
+ spin_unlock(&sbi->s_es_lock);
/*
* If we skipped any inodes, and we weren't able to make any
- * forward progress, sort the list and try again.
+ * forward progress, try again to scan precached inodes.
*/
if ((nr_shrunk == 0) && nr_skipped && !retried) {
retried++;
- list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
- es_stats->es_stats_last_sorted = jiffies;
- ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
- i_es_lru);
- /*
- * If there are no non-precached inodes left on the
- * list, start releasing precached extents.
- */
- if (ext4_test_inode_state(&ei->vfs_inode,
- EXT4_STATE_EXT_PRECACHED))
- skip_precached = 0;
goto retry;
}
- spin_unlock(&sbi->s_es_lru_lock);
-
if (locked_ei && nr_shrunk == 0)
- nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
+ nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
+out:
scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
if (likely(es_stats->es_stats_scan_time))
es_stats->es_stats_scan_time = (scan_time +
@@ -1043,7 +1046,7 @@ retry:
else
es_stats->es_stats_shrunk = nr_shrunk;
- trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached,
+ trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
nr_skipped, retried);
return nr_shrunk;
}
@@ -1055,7 +1058,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
struct ext4_sb_info *sbi;
sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
- nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
+ nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
return nr;
}
@@ -1068,13 +1071,13 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk;
- ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
+ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
if (!nr_to_scan)
return ret;
- nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+ nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
return nr_shrunk;
@@ -1102,28 +1105,24 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
return 0;
/* here we just find an inode that has the max nr. of objects */
- spin_lock(&sbi->s_es_lru_lock);
- list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
+ spin_lock(&sbi->s_es_lock);
+ list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
inode_cnt++;
if (max && max->i_es_all_nr < ei->i_es_all_nr)
max = ei;
else if (!max)
max = ei;
}
- spin_unlock(&sbi->s_es_lru_lock);
+ spin_unlock(&sbi->s_es_lock);
seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
- percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
+ percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
seq_printf(seq, " %lu/%lu cache hits/misses\n",
es_stats->es_stats_cache_hits,
es_stats->es_stats_cache_misses);
- if (es_stats->es_stats_last_sorted != 0)
- seq_printf(seq, " %u ms last sorted interval\n",
- jiffies_to_msecs(jiffies -
- es_stats->es_stats_last_sorted));
if (inode_cnt)
- seq_printf(seq, " %d inodes on lru list\n", inode_cnt);
+ seq_printf(seq, " %d inodes on list\n", inode_cnt);
seq_printf(seq, "average:\n %llu us scan time\n",
div_u64(es_stats->es_stats_scan_time, 1000));
@@ -1132,7 +1131,7 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
seq_printf(seq,
"maximum:\n %lu inode (%u objects, %u reclaimable)\n"
" %llu us max scan time\n",
- max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
+ max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
div_u64(es_stats->es_stats_max_scan_time, 1000));
return 0;
@@ -1181,9 +1180,11 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
int err;
- INIT_LIST_HEAD(&sbi->s_es_lru);
- spin_lock_init(&sbi->s_es_lru_lock);
- sbi->s_es_stats.es_stats_last_sorted = 0;
+ /* Make sure we have enough bits for physical block number */
+ BUILD_BUG_ON(ES_SHIFT < 48);
+ INIT_LIST_HEAD(&sbi->s_es_list);
+ sbi->s_es_nr_inode = 0;
+ spin_lock_init(&sbi->s_es_lock);
sbi->s_es_stats.es_stats_shrunk = 0;
sbi->s_es_stats.es_stats_cache_hits = 0;
sbi->s_es_stats.es_stats_cache_misses = 0;
@@ -1192,7 +1193,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
if (err)
return err;
- err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL);
+ err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
if (err)
goto err1;
@@ -1210,7 +1211,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
return 0;
err2:
- percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
err1:
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
return err;
@@ -1221,71 +1222,83 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
if (sbi->s_proc)
remove_proc_entry("es_shrinker_info", sbi->s_proc);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
- percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
unregister_shrinker(&sbi->s_es_shrinker);
}
-void ext4_es_lru_add(struct inode *inode)
+/*
+ * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
+ * most *nr_to_scan extents, update *nr_to_scan accordingly.
+ *
+ * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
+ * Increment *nr_shrunk by the number of reclaimed extents. Also update
+ * ei->i_es_shrink_lblk to where we should continue scanning.
+ */
+static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
+ int *nr_to_scan, int *nr_shrunk)
{
- struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-
- ei->i_touch_when = jiffies;
-
- if (!list_empty(&ei->i_es_lru))
- return;
+ struct inode *inode = &ei->vfs_inode;
+ struct ext4_es_tree *tree = &ei->i_es_tree;
+ struct extent_status *es;
+ struct rb_node *node;
- spin_lock(&sbi->s_es_lru_lock);
- if (list_empty(&ei->i_es_lru))
- list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
- spin_unlock(&sbi->s_es_lru_lock);
-}
+ es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
+ if (!es)
+ goto out_wrap;
+ node = &es->rb_node;
+ while (*nr_to_scan > 0) {
+ if (es->es_lblk > end) {
+ ei->i_es_shrink_lblk = end + 1;
+ return 0;
+ }
-void ext4_es_lru_del(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ (*nr_to_scan)--;
+ node = rb_next(&es->rb_node);
+ /*
+ * We can't reclaim delayed extent from status tree because
+ * fiemap, bigallic, and seek_data/hole need to use it.
+ */
+ if (ext4_es_is_delayed(es))
+ goto next;
+ if (ext4_es_is_referenced(es)) {
+ ext4_es_clear_referenced(es);
+ goto next;
+ }
- spin_lock(&sbi->s_es_lru_lock);
- if (!list_empty(&ei->i_es_lru))
- list_del_init(&ei->i_es_lru);
- spin_unlock(&sbi->s_es_lru_lock);
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ (*nr_shrunk)++;
+next:
+ if (!node)
+ goto out_wrap;
+ es = rb_entry(node, struct extent_status, rb_node);
+ }
+ ei->i_es_shrink_lblk = es->es_lblk;
+ return 1;
+out_wrap:
+ ei->i_es_shrink_lblk = 0;
+ return 0;
}
-static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
- int nr_to_scan)
+static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
{
struct inode *inode = &ei->vfs_inode;
- struct ext4_es_tree *tree = &ei->i_es_tree;
- struct rb_node *node;
- struct extent_status *es;
- unsigned long nr_shrunk = 0;
+ int nr_shrunk = 0;
+ ext4_lblk_t start = ei->i_es_shrink_lblk;
static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- if (ei->i_es_lru_nr == 0)
+ if (ei->i_es_shk_nr == 0)
return 0;
if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
__ratelimit(&_rs))
ext4_warning(inode->i_sb, "forced shrink of precached extents");
- node = rb_first(&tree->root);
- while (node != NULL) {
- es = rb_entry(node, struct extent_status, rb_node);
- node = rb_next(&es->rb_node);
- /*
- * We can't reclaim delayed extent from status tree because
- * fiemap, bigallic, and seek_data/hole need to use it.
- */
- if (!ext4_es_is_delayed(es)) {
- rb_erase(&es->rb_node, &tree->root);
- ext4_es_free_extent(inode, es);
- nr_shrunk++;
- if (--nr_to_scan == 0)
- break;
- }
- }
- tree->cache_es = NULL;
+ if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
+ start != 0)
+ es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
+
+ ei->i_es_tree.cache_es = NULL;
return nr_shrunk;
}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index efd5f970b501..691b52613ce4 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -29,25 +29,28 @@
/*
* These flags live in the high bits of extent_status.es_pblk
*/
-#define ES_SHIFT 60
-
-#define EXTENT_STATUS_WRITTEN (1 << 3)
-#define EXTENT_STATUS_UNWRITTEN (1 << 2)
-#define EXTENT_STATUS_DELAYED (1 << 1)
-#define EXTENT_STATUS_HOLE (1 << 0)
+enum {
+ ES_WRITTEN_B,
+ ES_UNWRITTEN_B,
+ ES_DELAYED_B,
+ ES_HOLE_B,
+ ES_REFERENCED_B,
+ ES_FLAGS
+};
-#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \
- EXTENT_STATUS_UNWRITTEN | \
- EXTENT_STATUS_DELAYED | \
- EXTENT_STATUS_HOLE)
+#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
+#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)
-#define ES_WRITTEN (1ULL << 63)
-#define ES_UNWRITTEN (1ULL << 62)
-#define ES_DELAYED (1ULL << 61)
-#define ES_HOLE (1ULL << 60)
+#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B)
+#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
+#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B)
+#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B)
+#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B)
-#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \
- ES_DELAYED | ES_HOLE)
+#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
+ EXTENT_STATUS_UNWRITTEN | \
+ EXTENT_STATUS_DELAYED | \
+ EXTENT_STATUS_HOLE) << ES_SHIFT)
struct ext4_sb_info;
struct ext4_extent;
@@ -65,14 +68,13 @@ struct ext4_es_tree {
};
struct ext4_es_stats {
- unsigned long es_stats_last_sorted;
unsigned long es_stats_shrunk;
unsigned long es_stats_cache_hits;
unsigned long es_stats_cache_misses;
u64 es_stats_scan_time;
u64 es_stats_max_scan_time;
struct percpu_counter es_stats_all_cnt;
- struct percpu_counter es_stats_lru_cnt;
+ struct percpu_counter es_stats_shk_cnt;
};
extern int __init ext4_init_es(void);
@@ -93,29 +95,49 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode,
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status *es);
+static inline unsigned int ext4_es_status(struct extent_status *es)
+{
+ return es->es_pblk >> ES_SHIFT;
+}
+
+static inline unsigned int ext4_es_type(struct extent_status *es)
+{
+ return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT;
+}
+
static inline int ext4_es_is_written(struct extent_status *es)
{
- return (es->es_pblk & ES_WRITTEN) != 0;
+ return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0;
}
static inline int ext4_es_is_unwritten(struct extent_status *es)
{
- return (es->es_pblk & ES_UNWRITTEN) != 0;
+ return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0;
}
static inline int ext4_es_is_delayed(struct extent_status *es)
{
- return (es->es_pblk & ES_DELAYED) != 0;
+ return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0;
}
static inline int ext4_es_is_hole(struct extent_status *es)
{
- return (es->es_pblk & ES_HOLE) != 0;
+ return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
}
-static inline unsigned int ext4_es_status(struct extent_status *es)
+static inline void ext4_es_set_referenced(struct extent_status *es)
{
- return es->es_pblk >> ES_SHIFT;
+ es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
+}
+
+static inline void ext4_es_clear_referenced(struct extent_status *es)
+{
+ es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT);
+}
+
+static inline int ext4_es_is_referenced(struct extent_status *es)
+{
+ return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0;
}
static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
@@ -135,23 +157,19 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
static inline void ext4_es_store_status(struct extent_status *es,
unsigned int status)
{
- es->es_pblk = (((ext4_fsblk_t)
- (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
- (es->es_pblk & ~ES_MASK));
+ es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
+ (es->es_pblk & ~ES_MASK);
}
static inline void ext4_es_store_pblock_status(struct extent_status *es,
ext4_fsblk_t pb,
unsigned int status)
{
- es->es_pblk = (((ext4_fsblk_t)
- (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
- (pb & ~ES_MASK));
+ es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
+ (pb & ~ES_MASK);
}
extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
-extern void ext4_es_lru_add(struct inode *inode);
-extern void ext4_es_lru_del(struct inode *inode);
#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index aca7b24a4432..513c12cf444c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -137,10 +137,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
}
+ iocb->private = &overwrite;
if (o_direct) {
blk_start_plug(&plug);
- iocb->private = &overwrite;
/* check whether we do a DIO overwrite or not */
if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
@@ -273,24 +273,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
* we determine this extent as a data or a hole according to whether the
* page cache has data or not.
*/
-static int ext4_find_unwritten_pgoff(struct inode *inode,
- int whence,
- struct ext4_map_blocks *map,
- loff_t *offset)
+static int ext4_find_unwritten_pgoff(struct inode *inode, int whence,
+ loff_t endoff, loff_t *offset)
{
struct pagevec pvec;
- unsigned int blkbits;
pgoff_t index;
pgoff_t end;
- loff_t endoff;
loff_t startoff;
loff_t lastoff;
int found = 0;
- blkbits = inode->i_sb->s_blocksize_bits;
startoff = *offset;
lastoff = startoff;
- endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
+
index = startoff >> PAGE_CACHE_SHIFT;
end = endoff >> PAGE_CACHE_SHIFT;
@@ -408,147 +403,144 @@ out:
static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
- struct ext4_map_blocks map;
- struct extent_status es;
- ext4_lblk_t start, last, end;
- loff_t dataoff, isize;
- int blkbits;
- int ret = 0;
+ struct fiemap_extent_info fie;
+ struct fiemap_extent ext[2];
+ loff_t next;
+ int i, ret = 0;
mutex_lock(&inode->i_mutex);
-
- isize = i_size_read(inode);
- if (offset >= isize) {
+ if (offset >= inode->i_size) {
mutex_unlock(&inode->i_mutex);
return -ENXIO;
}
-
- blkbits = inode->i_sb->s_blocksize_bits;
- start = offset >> blkbits;
- last = start;
- end = isize >> blkbits;
- dataoff = offset;
-
- do {
- map.m_lblk = last;
- map.m_len = end - last + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
- if (last != start)
- dataoff = (loff_t)last << blkbits;
+ fie.fi_flags = 0;
+ fie.fi_extents_max = 2;
+ fie.fi_extents_start = (struct fiemap_extent __user *) &ext;
+ while (1) {
+ mm_segment_t old_fs = get_fs();
+
+ fie.fi_extents_mapped = 0;
+ memset(ext, 0, sizeof(*ext) * fie.fi_extents_max);
+
+ set_fs(get_ds());
+ ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
+ set_fs(old_fs);
+ if (ret)
break;
- }
- /*
- * If there is a delay extent at this offset,
- * it will be as a data.
- */
- ext4_es_find_delayed_extent_range(inode, last, last, &es);
- if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
- if (last != start)
- dataoff = (loff_t)last << blkbits;
+ /* No extents found, EOF */
+ if (!fie.fi_extents_mapped) {
+ ret = -ENXIO;
break;
}
+ for (i = 0; i < fie.fi_extents_mapped; i++) {
+ next = (loff_t)(ext[i].fe_length + ext[i].fe_logical);
- /*
- * If there is a unwritten extent at this offset,
- * it will be as a data or a hole according to page
- * cache that has data or not.
- */
- if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- int unwritten;
- unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
- &map, &dataoff);
- if (unwritten)
- break;
- }
+ if (offset < (loff_t)ext[i].fe_logical)
+ offset = (loff_t)ext[i].fe_logical;
+ /*
+ * If extent is not unwritten, then it contains valid
+ * data, mapped or delayed.
+ */
+ if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN))
+ goto out;
- last++;
- dataoff = (loff_t)last << blkbits;
- } while (last <= end);
+ /*
+ * If there is a unwritten extent at this offset,
+ * it will be as a data or a hole according to page
+ * cache that has data or not.
+ */
+ if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+ next, &offset))
+ goto out;
+ if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) {
+ ret = -ENXIO;
+ goto out;
+ }
+ offset = next;
+ }
+ }
+ if (offset > inode->i_size)
+ offset = inode->i_size;
+out:
mutex_unlock(&inode->i_mutex);
+ if (ret)
+ return ret;
- if (dataoff > isize)
- return -ENXIO;
-
- return vfs_setpos(file, dataoff, maxsize);
+ return vfs_setpos(file, offset, maxsize);
}
/*
- * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
+ * ext4_seek_hole() retrieves the offset for SEEK_HOLE
*/
static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
- struct ext4_map_blocks map;
- struct extent_status es;
- ext4_lblk_t start, last, end;
- loff_t holeoff, isize;
- int blkbits;
- int ret = 0;
+ struct fiemap_extent_info fie;
+ struct fiemap_extent ext[2];
+ loff_t next;
+ int i, ret = 0;
mutex_lock(&inode->i_mutex);
-
- isize = i_size_read(inode);
- if (offset >= isize) {
+ if (offset >= inode->i_size) {
mutex_unlock(&inode->i_mutex);
return -ENXIO;
}
- blkbits = inode->i_sb->s_blocksize_bits;
- start = offset >> blkbits;
- last = start;
- end = isize >> blkbits;
- holeoff = offset;
+ fie.fi_flags = 0;
+ fie.fi_extents_max = 2;
+ fie.fi_extents_start = (struct fiemap_extent __user *)&ext;
+ while (1) {
+ mm_segment_t old_fs = get_fs();
- do {
- map.m_lblk = last;
- map.m_len = end - last + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
- last += ret;
- holeoff = (loff_t)last << blkbits;
- continue;
- }
+ fie.fi_extents_mapped = 0;
+ memset(ext, 0, sizeof(*ext));
- /*
- * If there is a delay extent at this offset,
- * we will skip this extent.
- */
- ext4_es_find_delayed_extent_range(inode, last, last, &es);
- if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
- last = es.es_lblk + es.es_len;
- holeoff = (loff_t)last << blkbits;
- continue;
- }
+ set_fs(get_ds());
+ ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
+ set_fs(old_fs);
+ if (ret)
+ break;
- /*
- * If there is a unwritten extent at this offset,
- * it will be as a data or a hole according to page
- * cache that has data or not.
- */
- if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- int unwritten;
- unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
- &map, &holeoff);
- if (!unwritten) {
- last += ret;
- holeoff = (loff_t)last << blkbits;
+ /* No extents found */
+ if (!fie.fi_extents_mapped)
+ break;
+
+ for (i = 0; i < fie.fi_extents_mapped; i++) {
+ next = (loff_t)(ext[i].fe_logical + ext[i].fe_length);
+ /*
+ * If extent is not unwritten, then it contains valid
+ * data, mapped or delayed.
+ */
+ if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) {
+ if (offset < (loff_t)ext[i].fe_logical)
+ goto out;
+ offset = next;
continue;
}
- }
-
- /* find a hole */
- break;
- } while (last <= end);
+ /*
+ * If there is a unwritten extent at this offset,
+ * it will be as a data or a hole according to page
+ * cache that has data or not.
+ */
+ if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+ next, &offset))
+ goto out;
+ offset = next;
+ if (ext[i].fe_flags & FIEMAP_EXTENT_LAST)
+ goto out;
+ }
+ }
+ if (offset > inode->i_size)
+ offset = inode->i_size;
+out:
mutex_unlock(&inode->i_mutex);
+ if (ret)
+ return ret;
- if (holeoff > isize)
- holeoff = isize;
-
- return vfs_setpos(file, holeoff, maxsize);
+ return vfs_setpos(file, offset, maxsize);
}
/*
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 8012a5daf401..ac644c31ca67 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -887,6 +887,10 @@ got:
struct buffer_head *block_bitmap_bh;
block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (!block_bitmap_bh) {
+ err = -EIO;
+ goto out;
+ }
BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
err = ext4_journal_get_write_access(handle, block_bitmap_bh);
if (err) {
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3ea62695abce..4b143febf21f 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -811,8 +811,11 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
ret = __block_write_begin(page, 0, inline_size,
ext4_da_get_block_prep);
if (ret) {
+ up_read(&EXT4_I(inode)->xattr_sem);
+ unlock_page(page);
+ page_cache_release(page);
ext4_truncate_failed_write(inode);
- goto out;
+ return ret;
}
SetPageDirty(page);
@@ -870,6 +873,12 @@ retry_journal:
goto out_journal;
}
+ /*
+ * We cannot recurse into the filesystem as the transaction
+ * is already started.
+ */
+ flags |= AOP_FLAG_NOFS;
+
if (ret == -ENOSPC) {
ret = ext4_da_convert_inline_data_to_extent(mapping,
inode,
@@ -882,11 +891,6 @@ retry_journal:
goto out;
}
- /*
- * We cannot recurse into the filesystem as the transaction
- * is already started.
- */
- flags |= AOP_FLAG_NOFS;
page = grab_cache_page_write_begin(mapping, 0, flags);
if (!page) {
@@ -1807,11 +1811,12 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
int ext4_inline_data_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo,
- int *has_inline)
+ int *has_inline, __u64 start, __u64 len)
{
__u64 physical = 0;
- __u64 length;
- __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST;
+ __u64 inline_len;
+ __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
+ FIEMAP_EXTENT_LAST;
int error = 0;
struct ext4_iloc iloc;
@@ -1820,6 +1825,13 @@ int ext4_inline_data_fiemap(struct inode *inode,
*has_inline = 0;
goto out;
}
+ inline_len = min_t(size_t, ext4_get_inline_size(inode),
+ i_size_read(inode));
+ if (start >= inline_len)
+ goto out;
+ if (start + len < inline_len)
+ inline_len = start + len;
+ inline_len -= start;
error = ext4_get_inode_loc(inode, &iloc);
if (error)
@@ -1828,11 +1840,10 @@ int ext4_inline_data_fiemap(struct inode *inode,
physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
physical += offsetof(struct ext4_inode, i_block);
- length = i_size_read(inode);
if (physical)
- error = fiemap_fill_next_extent(fieinfo, 0, physical,
- length, flags);
+ error = fiemap_fill_next_extent(fieinfo, start, physical,
+ inline_len, flags);
brelse(iloc.bh);
out:
up_read(&EXT4_I(inode)->xattr_sem);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e9777f93cf05..5653fa42930b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -416,11 +416,6 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
}
if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
up_read((&EXT4_I(inode)->i_data_sem));
- /*
- * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag
- * because it shouldn't be marked in es_map->m_flags.
- */
- map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY);
/*
* We don't check m_len because extent will be collpased in status
@@ -491,7 +486,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
- ext4_es_lru_add(inode);
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -1393,7 +1387,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, iblock, &es)) {
- ext4_es_lru_add(inode);
if (ext4_es_is_hole(&es)) {
retval = 0;
down_read(&EXT4_I(inode)->i_data_sem);
@@ -1434,24 +1427,12 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
* file system block.
*/
down_read(&EXT4_I(inode)->i_data_sem);
- if (ext4_has_inline_data(inode)) {
- /*
- * We will soon create blocks for this page, and let
- * us pretend as if the blocks aren't allocated yet.
- * In case of clusters, we have to handle the work
- * of mapping from cluster so that the reserved space
- * is calculated properly.
- */
- if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
- ext4_find_delalloc_cluster(inode, map->m_lblk))
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ if (ext4_has_inline_data(inode))
retval = 0;
- } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- retval = ext4_ext_map_blocks(NULL, inode, map,
- EXT4_GET_BLOCKS_NO_PUT_HOLE);
+ else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ retval = ext4_ext_map_blocks(NULL, inode, map, 0);
else
- retval = ext4_ind_map_blocks(NULL, inode, map,
- EXT4_GET_BLOCKS_NO_PUT_HOLE);
+ retval = ext4_ind_map_blocks(NULL, inode, map, 0);
add_delayed:
if (retval == 0) {
@@ -1465,7 +1446,8 @@ add_delayed:
* then we don't need to reserve it again. However we still need
* to reserve metadata for every block we're going to write.
*/
- if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
+ if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 ||
+ !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
ret = ext4_da_reserve_space(inode, iblock);
if (ret) {
/* not enough space to reserve */
@@ -1481,11 +1463,6 @@ add_delayed:
goto out_unlock;
}
- /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
- * and it should not appear on the bh->b_state.
- */
- map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
-
map_bh(bh, inode->i_sb, invalid_block);
set_buffer_new(bh);
set_buffer_delay(bh);
@@ -3643,7 +3620,7 @@ out_stop:
* If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above.
* However, if this was a real unlink then we were called by
- * ext4_delete_inode(), and we allow that function to clean up the
+ * ext4_evict_inode(), and we allow that function to clean up the
* orphan info for us.
*/
if (inode->i_nlink)
@@ -4959,7 +4936,12 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (val)
ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
else {
- jbd2_journal_flush(journal);
+ err = jbd2_journal_flush(journal);
+ if (err < 0) {
+ jbd2_journal_unlock_updates(journal);
+ ext4_inode_resume_unlocked_dio(inode);
+ return err;
+ }
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
}
ext4_set_aops(inode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bfda18a15592..f58a0d106726 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -78,8 +78,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
- ext4_es_lru_del(inode1);
- ext4_es_lru_del(inode2);
isize = i_size_read(inode1);
i_size_write(inode1, i_size_read(inode2));
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index dbfe15c2533c..8d1e60214ef0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2358,7 +2358,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
if (sbi->s_group_info) {
memcpy(new_groupinfo, sbi->s_group_info,
sbi->s_group_info_size * sizeof(*sbi->s_group_info));
- ext4_kvfree(sbi->s_group_info);
+ kvfree(sbi->s_group_info);
}
sbi->s_group_info = new_groupinfo;
sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
@@ -2385,7 +2385,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
metalen = sizeof(*meta_group_info) <<
EXT4_DESC_PER_BLOCK_BITS(sb);
- meta_group_info = kmalloc(metalen, GFP_KERNEL);
+ meta_group_info = kmalloc(metalen, GFP_NOFS);
if (meta_group_info == NULL) {
ext4_msg(sb, KERN_ERR, "can't allocate mem "
"for a buddy group");
@@ -2399,7 +2399,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
- meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL);
+ meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
if (meta_group_info[i] == NULL) {
ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
goto exit_group_info;
@@ -2428,7 +2428,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
{
struct buffer_head *bh;
meta_group_info[i]->bb_bitmap =
- kmalloc(sb->s_blocksize, GFP_KERNEL);
+ kmalloc(sb->s_blocksize, GFP_NOFS);
BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
bh = ext4_read_block_bitmap(sb, group);
BUG_ON(bh == NULL);
@@ -2495,7 +2495,7 @@ err_freebuddy:
kfree(sbi->s_group_info[i]);
iput(sbi->s_buddy_cache);
err_freesgi:
- ext4_kvfree(sbi->s_group_info);
+ kvfree(sbi->s_group_info);
return -ENOMEM;
}
@@ -2708,12 +2708,11 @@ int ext4_mb_release(struct super_block *sb)
EXT4_DESC_PER_BLOCK_BITS(sb);
for (i = 0; i < num_meta_group_infos; i++)
kfree(sbi->s_group_info[i]);
- ext4_kvfree(sbi->s_group_info);
+ kvfree(sbi->s_group_info);
}
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
- if (sbi->s_buddy_cache)
- iput(sbi->s_buddy_cache);
+ iput(sbi->s_buddy_cache);
if (sbi->s_mb_stats) {
ext4_msg(sb, KERN_INFO,
"mballoc: %u blocks %u reqs (%u success)",
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a432634f2e6a..3cb267aee802 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -592,7 +592,7 @@ err_out:
/*
* set the i_blocks count to zero
- * so that the ext4_delete_inode does the
+ * so that the ext4_evict_inode() does the
* right job
*
* We don't need to take the i_lock because
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 9f2311bc9c4f..503ea15dc5db 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -273,6 +273,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
int replaced_count = 0;
int from = data_offset_in_page << orig_inode->i_blkbits;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+ struct super_block *sb = orig_inode->i_sb;
/*
* It needs twice the amount of ordinary journal buffers because
@@ -405,10 +406,13 @@ unlock_pages:
page_cache_release(pagep[1]);
stop_journal:
ext4_journal_stop(handle);
+ if (*err == -ENOSPC &&
+ ext4_should_retry_alloc(sb, &retries))
+ goto again;
/* Buffer was busy because probably is pinned to journal transaction,
* force transaction commit may help to free it. */
- if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb,
- &retries))
+ if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
+ jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
goto again;
return replaced_count;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index adb559de23c1..2291923dae4e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1816,31 +1816,39 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
ext4fs_dirhash(name, namelen, &hinfo);
+ memset(frames, 0, sizeof(frames));
frame = frames;
frame->entries = entries;
frame->at = entries;
frame->bh = bh;
bh = bh2;
- ext4_handle_dirty_dx_node(handle, dir, frame->bh);
- ext4_handle_dirty_dirent_node(handle, dir, bh);
+ retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
+ if (retval)
+ goto out_frames;
+ retval = ext4_handle_dirty_dirent_node(handle, dir, bh);
+ if (retval)
+ goto out_frames;
de = do_split(handle,dir, &bh, frame, &hinfo);
if (IS_ERR(de)) {
- /*
- * Even if the block split failed, we have to properly write
- * out all the changes we did so far. Otherwise we can end up
- * with corrupted filesystem.
- */
- ext4_mark_inode_dirty(handle, dir);
- dx_release(frames);
- return PTR_ERR(de);
+ retval = PTR_ERR(de);
+ goto out_frames;
}
dx_release(frames);
retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
brelse(bh);
return retval;
+out_frames:
+ /*
+ * Even if the block split failed, we have to properly write
+ * out all the changes we did so far. Otherwise we can end up
+ * with corrupted filesystem.
+ */
+ ext4_mark_inode_dirty(handle, dir);
+ dx_release(frames);
+ return retval;
}
/*
@@ -2806,7 +2814,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
ext4_orphan_add(handle, inode);
inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
- retval = 0;
end_unlink:
brelse(bh);
@@ -3148,6 +3155,39 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
}
}
+static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent,
+ int credits, handle_t **h)
+{
+ struct inode *wh;
+ handle_t *handle;
+ int retries = 0;
+
+ /*
+ * for inode block, sb block, group summaries,
+ * and inode bitmap
+ */
+ credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) +
+ EXT4_XATTR_TRANS_BLOCKS + 4);
+retry:
+ wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE,
+ &ent->dentry->d_name, 0, NULL,
+ EXT4_HT_DIR, credits);
+
+ handle = ext4_journal_current_handle();
+ if (IS_ERR(wh)) {
+ if (handle)
+ ext4_journal_stop(handle);
+ if (PTR_ERR(wh) == -ENOSPC &&
+ ext4_should_retry_alloc(ent->dir->i_sb, &retries))
+ goto retry;
+ } else {
+ *h = handle;
+ init_special_inode(wh, wh->i_mode, WHITEOUT_DEV);
+ wh->i_op = &ext4_special_inode_operations;
+ }
+ return wh;
+}
+
/*
* Anybody can rename anything with this: the permission checks are left to the
* higher-level routines.
@@ -3157,7 +3197,8 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
* This comes from rename(const char *oldpath, const char *newpath)
*/
static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
handle_t *handle = NULL;
struct ext4_renament old = {
@@ -3172,6 +3213,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
};
int force_reread;
int retval;
+ struct inode *whiteout = NULL;
+ int credits;
+ u8 old_file_type;
dquot_initialize(old.dir);
dquot_initialize(new.dir);
@@ -3210,11 +3254,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
ext4_alloc_da_blocks(old.inode);
- handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
- (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
+ if (!(flags & RENAME_WHITEOUT)) {
+ handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ } else {
+ whiteout = ext4_whiteout_for_rename(&old, credits, &handle);
+ if (IS_ERR(whiteout))
+ return PTR_ERR(whiteout);
+ }
if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
ext4_handle_sync(handle);
@@ -3242,13 +3292,26 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
force_reread = (new.dir->i_ino == old.dir->i_ino &&
ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
+
+ old_file_type = old.de->file_type;
+ if (whiteout) {
+ /*
+ * Do this before adding a new entry, so the old entry is sure
+ * to be still pointing to the valid old entry.
+ */
+ retval = ext4_setent(handle, &old, whiteout->i_ino,
+ EXT4_FT_CHRDEV);
+ if (retval)
+ goto end_rename;
+ ext4_mark_inode_dirty(handle, whiteout);
+ }
if (!new.bh) {
retval = ext4_add_entry(handle, new.dentry, old.inode);
if (retval)
goto end_rename;
} else {
retval = ext4_setent(handle, &new,
- old.inode->i_ino, old.de->file_type);
+ old.inode->i_ino, old_file_type);
if (retval)
goto end_rename;
}
@@ -3263,10 +3326,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
old.inode->i_ctime = ext4_current_time(old.inode);
ext4_mark_inode_dirty(handle, old.inode);
- /*
- * ok, that's it
- */
- ext4_rename_delete(handle, &old, force_reread);
+ if (!whiteout) {
+ /*
+ * ok, that's it
+ */
+ ext4_rename_delete(handle, &old, force_reread);
+ }
if (new.inode) {
ext4_dec_count(handle, new.inode);
@@ -3302,6 +3367,12 @@ end_rename:
brelse(old.dir_bh);
brelse(old.bh);
brelse(new.bh);
+ if (whiteout) {
+ if (retval)
+ drop_nlink(whiteout);
+ unlock_new_inode(whiteout);
+ iput(whiteout);
+ }
if (handle)
ext4_journal_stop(handle);
return retval;
@@ -3434,18 +3505,15 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
- if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
if (flags & RENAME_EXCHANGE) {
return ext4_cross_rename(old_dir, old_dentry,
new_dir, new_dentry);
}
- /*
- * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE"
- * is equivalent to regular rename.
- */
- return ext4_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+ return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}
/*
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index f298c60f907d..bf76f405a5f9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -856,7 +856,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
n_group_desc[gdb_num] = gdb_bh;
EXT4_SB(sb)->s_group_desc = n_group_desc;
EXT4_SB(sb)->s_gdb_count++;
- ext4_kvfree(o_group_desc);
+ kvfree(o_group_desc);
le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
err = ext4_handle_dirty_super(handle, sb);
@@ -866,7 +866,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
return err;
exit_inode:
- ext4_kvfree(n_group_desc);
+ kvfree(n_group_desc);
brelse(iloc.bh);
exit_dind:
brelse(dind);
@@ -909,7 +909,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
n_group_desc[gdb_num] = gdb_bh;
EXT4_SB(sb)->s_group_desc = n_group_desc;
EXT4_SB(sb)->s_gdb_count++;
- ext4_kvfree(o_group_desc);
+ kvfree(o_group_desc);
BUFFER_TRACE(gdb_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb_bh);
if (unlikely(err))
@@ -1081,7 +1081,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
break;
if (meta_bg == 0)
- backup_block = group * bpg + blk_off;
+ backup_block = ((ext4_fsblk_t)group) * bpg + blk_off;
else
backup_block = (ext4_group_first_block_no(sb, group) +
ext4_bg_has_super(sb, group));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1eda6ab0ef9d..43c92b1685cb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -176,15 +176,6 @@ void *ext4_kvzalloc(size_t size, gfp_t flags)
return ret;
}
-void ext4_kvfree(void *ptr)
-{
- if (is_vmalloc_addr(ptr))
- vfree(ptr);
- else
- kfree(ptr);
-
-}
-
ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
struct ext4_group_desc *bg)
{
@@ -811,8 +802,8 @@ static void ext4_put_super(struct super_block *sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
- ext4_kvfree(sbi->s_group_desc);
- ext4_kvfree(sbi->s_flex_groups);
+ kvfree(sbi->s_group_desc);
+ kvfree(sbi->s_flex_groups);
percpu_counter_destroy(&sbi->s_freeclusters_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -880,10 +871,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
spin_lock_init(&ei->i_prealloc_lock);
ext4_es_init_tree(&ei->i_es_tree);
rwlock_init(&ei->i_es_lock);
- INIT_LIST_HEAD(&ei->i_es_lru);
+ INIT_LIST_HEAD(&ei->i_es_list);
ei->i_es_all_nr = 0;
- ei->i_es_lru_nr = 0;
- ei->i_touch_when = 0;
+ ei->i_es_shk_nr = 0;
+ ei->i_es_shrink_lblk = 0;
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0;
@@ -892,6 +883,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
spin_lock_init(&(ei->i_block_reservation_lock));
#ifdef CONFIG_QUOTA
ei->i_reserved_quota = 0;
+ memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
#endif
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
@@ -972,7 +964,6 @@ void ext4_clear_inode(struct inode *inode)
dquot_drop(inode);
ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
- ext4_es_lru_del(inode);
if (EXT4_I(inode)->jinode) {
jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
EXT4_I(inode)->jinode);
@@ -1068,6 +1059,11 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags);
static int ext4_enable_quotas(struct super_block *sb);
+static struct dquot **ext4_get_dquots(struct inode *inode)
+{
+ return EXT4_I(inode)->i_dquot;
+}
+
static const struct dquot_operations ext4_quota_operations = {
.get_reserved_space = ext4_get_reserved_space,
.write_dquot = ext4_write_dquot,
@@ -1117,6 +1113,7 @@ static const struct super_operations ext4_sops = {
#ifdef CONFIG_QUOTA
.quota_read = ext4_quota_read,
.quota_write = ext4_quota_write,
+ .get_dquots = ext4_get_dquots,
#endif
.bdev_try_to_free_page = bdev_try_to_free_page,
};
@@ -1146,7 +1143,7 @@ enum {
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
- Opt_max_dir_size_kb,
+ Opt_max_dir_size_kb, Opt_nojournal_checksum,
};
static const match_table_t tokens = {
@@ -1180,6 +1177,7 @@ static const match_table_t tokens = {
{Opt_journal_dev, "journal_dev=%u"},
{Opt_journal_path, "journal_path=%s"},
{Opt_journal_checksum, "journal_checksum"},
+ {Opt_nojournal_checksum, "nojournal_checksum"},
{Opt_journal_async_commit, "journal_async_commit"},
{Opt_abort, "abort"},
{Opt_data_journal, "data=journal"},
@@ -1361,6 +1359,8 @@ static const struct mount_opts {
MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
MOPT_EXT4_ONLY | MOPT_CLEAR},
+ {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
+ MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
MOPT_EXT4_ONLY | MOPT_SET},
{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
@@ -1702,6 +1702,12 @@ static int parse_options(char *options, struct super_block *sb,
return 0;
}
}
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
+ test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit "
+ "in data=ordered mode");
+ return 0;
+ }
return 1;
}
@@ -1939,7 +1945,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
memcpy(new_groups, sbi->s_flex_groups,
(sbi->s_flex_groups_allocated *
sizeof(struct flex_groups)));
- ext4_kvfree(sbi->s_flex_groups);
+ kvfree(sbi->s_flex_groups);
}
sbi->s_flex_groups = new_groups;
sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
@@ -3310,7 +3316,7 @@ int ext4_calculate_overhead(struct super_block *sb)
struct ext4_super_block *es = sbi->s_es;
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
ext4_fsblk_t overhead = 0;
- char *buf = (char *) get_zeroed_page(GFP_KERNEL);
+ char *buf = (char *) get_zeroed_page(GFP_NOFS);
if (!buf)
return -ENOMEM;
@@ -3338,8 +3344,8 @@ int ext4_calculate_overhead(struct super_block *sb)
memset(buf, 0, PAGE_SIZE);
cond_resched();
}
- /* Add the journal blocks as well */
- if (sbi->s_journal)
+ /* Add the internal journal blocks as well */
+ if (sbi->s_journal && !sbi->journal_bdev)
overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
sbi->s_overhead = overhead;
@@ -3526,6 +3532,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
+ /* don't forget to enable journal_csum when metadata_csum is enabled. */
+ if (ext4_has_metadata_csum(sb))
+ set_opt(sb, JOURNAL_CHECKSUM);
+
if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
set_opt(sb, JOURNAL_DATA);
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
@@ -3928,6 +3938,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_qcop = &ext4_qctl_sysfile_operations;
else
sb->s_qcop = &ext4_qctl_operations;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
#endif
memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
@@ -3943,7 +3954,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
!(sb->s_flags & MS_RDONLY))
if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
- goto failed_mount3;
+ goto failed_mount3a;
/*
* The first inode we look at is the journal inode. Don't try
@@ -3952,7 +3963,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!test_opt(sb, NOLOAD) &&
EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
if (ext4_load_journal(sb, es, journal_devnum))
- goto failed_mount3;
+ goto failed_mount3a;
} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
@@ -4220,7 +4231,7 @@ failed_mount7:
failed_mount6:
ext4_mb_release(sb);
if (sbi->s_flex_groups)
- ext4_kvfree(sbi->s_flex_groups);
+ kvfree(sbi->s_flex_groups);
percpu_counter_destroy(&sbi->s_freeclusters_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -4240,6 +4251,7 @@ failed_mount_wq:
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
+failed_mount3a:
ext4_es_unregister_shrinker(sbi);
failed_mount3:
del_timer_sync(&sbi->s_err_report);
@@ -4248,7 +4260,7 @@ failed_mount3:
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
- ext4_kvfree(sbi->s_group_desc);
+ kvfree(sbi->s_group_desc);
failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
@@ -4841,6 +4853,22 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
+ if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
+ test_opt(sb, JOURNAL_CHECKSUM)) {
+ ext4_msg(sb, KERN_ERR, "changing journal_checksum "
+ "during remount not supported");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+
+ if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
+ test_opt(sb, JOURNAL_CHECKSUM)) {
+ ext4_msg(sb, KERN_ERR, "changing journal_checksum "
+ "during remount not supported");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
if (test_opt2(sb, EXPLICIT_DELALLOC)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 83b9b5a8d112..1ccb26bc2a0b 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -162,7 +162,8 @@ fail:
return ERR_PTR(-EINVAL);
}
-struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
+static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
+ struct page *dpage)
{
int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
void *value = NULL;
@@ -172,12 +173,13 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
if (type == ACL_TYPE_ACCESS)
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
- retval = f2fs_getxattr(inode, name_index, "", NULL, 0);
+ retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage);
if (retval > 0) {
value = kmalloc(retval, GFP_F2FS_ZERO);
if (!value)
return ERR_PTR(-ENOMEM);
- retval = f2fs_getxattr(inode, name_index, "", value, retval);
+ retval = f2fs_getxattr(inode, name_index, "", value,
+ retval, dpage);
}
if (retval > 0)
@@ -194,6 +196,11 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
return acl;
}
+struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
+{
+ return __f2fs_get_acl(inode, type, NULL);
+}
+
static int __f2fs_set_acl(struct inode *inode, int type,
struct posix_acl *acl, struct page *ipage)
{
@@ -229,7 +236,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
if (acl) {
value = f2fs_acl_to_disk(acl, &size);
if (IS_ERR(value)) {
- cond_clear_inode_flag(fi, FI_ACL_MODE);
+ clear_inode_flag(fi, FI_ACL_MODE);
return (int)PTR_ERR(value);
}
}
@@ -240,7 +247,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
if (!error)
set_cached_acl(inode, type, acl);
- cond_clear_inode_flag(fi, FI_ACL_MODE);
+ clear_inode_flag(fi, FI_ACL_MODE);
return error;
}
@@ -249,12 +256,137 @@ int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return __f2fs_set_acl(inode, type, acl, NULL);
}
-int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage)
+/*
+ * Most part of f2fs_acl_clone, f2fs_acl_create_masq, f2fs_acl_create
+ * are copied from posix_acl.c
+ */
+static struct posix_acl *f2fs_acl_clone(const struct posix_acl *acl,
+ gfp_t flags)
+{
+ struct posix_acl *clone = NULL;
+
+ if (acl) {
+ int size = sizeof(struct posix_acl) + acl->a_count *
+ sizeof(struct posix_acl_entry);
+ clone = kmemdup(acl, size, flags);
+ if (clone)
+ atomic_set(&clone->a_refcount, 1);
+ }
+ return clone;
+}
+
+static int f2fs_acl_create_masq(struct posix_acl *acl, umode_t *mode_p)
+{
+ struct posix_acl_entry *pa, *pe;
+ struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
+ umode_t mode = *mode_p;
+ int not_equiv = 0;
+
+ /* assert(atomic_read(acl->a_refcount) == 1); */
+
+ FOREACH_ACL_ENTRY(pa, acl, pe) {
+ switch(pa->e_tag) {
+ case ACL_USER_OBJ:
+ pa->e_perm &= (mode >> 6) | ~S_IRWXO;
+ mode &= (pa->e_perm << 6) | ~S_IRWXU;
+ break;
+
+ case ACL_USER:
+ case ACL_GROUP:
+ not_equiv = 1;
+ break;
+
+ case ACL_GROUP_OBJ:
+ group_obj = pa;
+ break;
+
+ case ACL_OTHER:
+ pa->e_perm &= mode | ~S_IRWXO;
+ mode &= pa->e_perm | ~S_IRWXO;
+ break;
+
+ case ACL_MASK:
+ mask_obj = pa;
+ not_equiv = 1;
+ break;
+
+ default:
+ return -EIO;
+ }
+ }
+
+ if (mask_obj) {
+ mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO;
+ mode &= (mask_obj->e_perm << 3) | ~S_IRWXG;
+ } else {
+ if (!group_obj)
+ return -EIO;
+ group_obj->e_perm &= (mode >> 3) | ~S_IRWXO;
+ mode &= (group_obj->e_perm << 3) | ~S_IRWXG;
+ }
+
+ *mode_p = (*mode_p & ~S_IRWXUGO) | mode;
+ return not_equiv;
+}
+
+static int f2fs_acl_create(struct inode *dir, umode_t *mode,
+ struct posix_acl **default_acl, struct posix_acl **acl,
+ struct page *dpage)
+{
+ struct posix_acl *p;
+ int ret;
+
+ if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
+ goto no_acl;
+
+ p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dpage);
+ if (IS_ERR(p)) {
+ if (p == ERR_PTR(-EOPNOTSUPP))
+ goto apply_umask;
+ return PTR_ERR(p);
+ }
+
+ if (!p)
+ goto apply_umask;
+
+ *acl = f2fs_acl_clone(p, GFP_NOFS);
+ if (!*acl)
+ return -ENOMEM;
+
+ ret = f2fs_acl_create_masq(*acl, mode);
+ if (ret < 0) {
+ posix_acl_release(*acl);
+ return -ENOMEM;
+ }
+
+ if (ret == 0) {
+ posix_acl_release(*acl);
+ *acl = NULL;
+ }
+
+ if (!S_ISDIR(*mode)) {
+ posix_acl_release(p);
+ *default_acl = NULL;
+ } else {
+ *default_acl = p;
+ }
+ return 0;
+
+apply_umask:
+ *mode &= ~current_umask();
+no_acl:
+ *default_acl = NULL;
+ *acl = NULL;
+ return 0;
+}
+
+int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
+ struct page *dpage)
{
- struct posix_acl *default_acl, *acl;
+ struct posix_acl *default_acl = NULL, *acl = NULL;
int error = 0;
- error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage);
if (error)
return error;
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index e0864651cdc1..997ca8edb6cb 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -38,14 +38,15 @@ struct f2fs_acl_header {
extern struct posix_acl *f2fs_get_acl(struct inode *, int);
extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int f2fs_init_acl(struct inode *, struct inode *, struct page *);
+extern int f2fs_init_acl(struct inode *, struct inode *, struct page *,
+ struct page *);
#else
#define f2fs_check_acl NULL
#define f2fs_get_acl NULL
#define f2fs_set_acl NULL
static inline int f2fs_init_acl(struct inode *inode, struct inode *dir,
- struct page *page)
+ struct page *ipage, struct page *dpage)
{
return 0;
}
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index dd10a031c052..e6c271fefaca 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -72,36 +72,36 @@ out:
return page;
}
-struct page *get_meta_page_ra(struct f2fs_sb_info *sbi, pgoff_t index)
-{
- bool readahead = false;
- struct page *page;
-
- page = find_get_page(META_MAPPING(sbi), index);
- if (!page || (page && !PageUptodate(page)))
- readahead = true;
- f2fs_put_page(page, 0);
-
- if (readahead)
- ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
- return get_meta_page(sbi, index);
-}
-
-static inline block_t get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
+static inline bool is_valid_blkaddr(struct f2fs_sb_info *sbi,
+ block_t blkaddr, int type)
{
switch (type) {
case META_NAT:
- return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK;
+ break;
case META_SIT:
- return SIT_BLK_CNT(sbi);
+ if (unlikely(blkaddr >= SIT_BLK_CNT(sbi)))
+ return false;
+ break;
case META_SSA:
+ if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) ||
+ blkaddr < SM_I(sbi)->ssa_blkaddr))
+ return false;
+ break;
case META_CP:
- return 0;
+ if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr ||
+ blkaddr < __start_cp_addr(sbi)))
+ return false;
+ break;
case META_POR:
- return MAX_BLKADDR(sbi);
+ if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
+ blkaddr < MAIN_BLKADDR(sbi)))
+ return false;
+ break;
default:
BUG();
}
+
+ return true;
}
/*
@@ -112,7 +112,6 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
block_t prev_blk_addr = 0;
struct page *page;
block_t blkno = start;
- block_t max_blks = get_max_meta_blks(sbi, type);
struct f2fs_io_info fio = {
.type = META,
@@ -122,18 +121,20 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
for (; nrpages-- > 0; blkno++) {
block_t blk_addr;
+ if (!is_valid_blkaddr(sbi, blkno, type))
+ goto out;
+
switch (type) {
case META_NAT:
- /* get nat block addr */
- if (unlikely(blkno >= max_blks))
+ if (unlikely(blkno >=
+ NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
blkno = 0;
+ /* get nat block addr */
blk_addr = current_nat_addr(sbi,
blkno * NAT_ENTRY_PER_BLOCK);
break;
case META_SIT:
/* get sit block addr */
- if (unlikely(blkno >= max_blks))
- goto out;
blk_addr = current_sit_addr(sbi,
blkno * SIT_ENTRY_PER_BLOCK);
if (blkno != start && prev_blk_addr + 1 != blk_addr)
@@ -143,10 +144,6 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
case META_SSA:
case META_CP:
case META_POR:
- if (unlikely(blkno >= max_blks))
- goto out;
- if (unlikely(blkno < SEG0_BLKADDR(sbi)))
- goto out;
blk_addr = blkno;
break;
default:
@@ -169,6 +166,20 @@ out:
return blkno - start;
}
+void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ struct page *page;
+ bool readahead = false;
+
+ page = find_get_page(META_MAPPING(sbi), index);
+ if (!page || (page && !PageUptodate(page)))
+ readahead = true;
+ f2fs_put_page(page, 0);
+
+ if (readahead)
+ ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
+}
+
static int f2fs_write_meta_page(struct page *page,
struct writeback_control *wbc)
{
@@ -178,7 +189,7 @@ static int f2fs_write_meta_page(struct page *page,
if (unlikely(sbi->por_doing))
goto redirty_out;
- if (wbc->for_reclaim)
+ if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0))
goto redirty_out;
if (unlikely(f2fs_cp_error(sbi)))
goto redirty_out;
@@ -187,6 +198,9 @@ static int f2fs_write_meta_page(struct page *page,
write_meta_page(sbi, page);
dec_page_count(sbi, F2FS_DIRTY_META);
unlock_page(page);
+
+ if (wbc->for_reclaim)
+ f2fs_submit_merged_bio(sbi, META, WRITE);
return 0;
redirty_out:
@@ -298,46 +312,57 @@ const struct address_space_operations f2fs_meta_aops = {
static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
+ struct inode_management *im = &sbi->im[type];
struct ino_entry *e;
retry:
- spin_lock(&sbi->ino_lock[type]);
+ if (radix_tree_preload(GFP_NOFS)) {
+ cond_resched();
+ goto retry;
+ }
+
+ spin_lock(&im->ino_lock);
- e = radix_tree_lookup(&sbi->ino_root[type], ino);
+ e = radix_tree_lookup(&im->ino_root, ino);
if (!e) {
e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC);
if (!e) {
- spin_unlock(&sbi->ino_lock[type]);
+ spin_unlock(&im->ino_lock);
+ radix_tree_preload_end();
goto retry;
}
- if (radix_tree_insert(&sbi->ino_root[type], ino, e)) {
- spin_unlock(&sbi->ino_lock[type]);
+ if (radix_tree_insert(&im->ino_root, ino, e)) {
+ spin_unlock(&im->ino_lock);
kmem_cache_free(ino_entry_slab, e);
+ radix_tree_preload_end();
goto retry;
}
memset(e, 0, sizeof(struct ino_entry));
e->ino = ino;
- list_add_tail(&e->list, &sbi->ino_list[type]);
+ list_add_tail(&e->list, &im->ino_list);
+ if (type != ORPHAN_INO)
+ im->ino_num++;
}
- spin_unlock(&sbi->ino_lock[type]);
+ spin_unlock(&im->ino_lock);
+ radix_tree_preload_end();
}
static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
+ struct inode_management *im = &sbi->im[type];
struct ino_entry *e;
- spin_lock(&sbi->ino_lock[type]);
- e = radix_tree_lookup(&sbi->ino_root[type], ino);
+ spin_lock(&im->ino_lock);
+ e = radix_tree_lookup(&im->ino_root, ino);
if (e) {
list_del(&e->list);
- radix_tree_delete(&sbi->ino_root[type], ino);
- if (type == ORPHAN_INO)
- sbi->n_orphans--;
- spin_unlock(&sbi->ino_lock[type]);
+ radix_tree_delete(&im->ino_root, ino);
+ im->ino_num--;
+ spin_unlock(&im->ino_lock);
kmem_cache_free(ino_entry_slab, e);
return;
}
- spin_unlock(&sbi->ino_lock[type]);
+ spin_unlock(&im->ino_lock);
}
void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
@@ -355,10 +380,12 @@ void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
/* mode should be APPEND_INO or UPDATE_INO */
bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
{
+ struct inode_management *im = &sbi->im[mode];
struct ino_entry *e;
- spin_lock(&sbi->ino_lock[mode]);
- e = radix_tree_lookup(&sbi->ino_root[mode], ino);
- spin_unlock(&sbi->ino_lock[mode]);
+
+ spin_lock(&im->ino_lock);
+ e = radix_tree_lookup(&im->ino_root, ino);
+ spin_unlock(&im->ino_lock);
return e ? true : false;
}
@@ -368,36 +395,42 @@ void release_dirty_inode(struct f2fs_sb_info *sbi)
int i;
for (i = APPEND_INO; i <= UPDATE_INO; i++) {
- spin_lock(&sbi->ino_lock[i]);
- list_for_each_entry_safe(e, tmp, &sbi->ino_list[i], list) {
+ struct inode_management *im = &sbi->im[i];
+
+ spin_lock(&im->ino_lock);
+ list_for_each_entry_safe(e, tmp, &im->ino_list, list) {
list_del(&e->list);
- radix_tree_delete(&sbi->ino_root[i], e->ino);
+ radix_tree_delete(&im->ino_root, e->ino);
kmem_cache_free(ino_entry_slab, e);
+ im->ino_num--;
}
- spin_unlock(&sbi->ino_lock[i]);
+ spin_unlock(&im->ino_lock);
}
}
int acquire_orphan_inode(struct f2fs_sb_info *sbi)
{
+ struct inode_management *im = &sbi->im[ORPHAN_INO];
int err = 0;
- spin_lock(&sbi->ino_lock[ORPHAN_INO]);
- if (unlikely(sbi->n_orphans >= sbi->max_orphans))
+ spin_lock(&im->ino_lock);
+ if (unlikely(im->ino_num >= sbi->max_orphans))
err = -ENOSPC;
else
- sbi->n_orphans++;
- spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
+ im->ino_num++;
+ spin_unlock(&im->ino_lock);
return err;
}
void release_orphan_inode(struct f2fs_sb_info *sbi)
{
- spin_lock(&sbi->ino_lock[ORPHAN_INO]);
- f2fs_bug_on(sbi, sbi->n_orphans == 0);
- sbi->n_orphans--;
- spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
+ struct inode_management *im = &sbi->im[ORPHAN_INO];
+
+ spin_lock(&im->ino_lock);
+ f2fs_bug_on(sbi, im->ino_num == 0);
+ im->ino_num--;
+ spin_unlock(&im->ino_lock);
}
void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -460,17 +493,19 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
struct f2fs_orphan_block *orphan_blk = NULL;
unsigned int nentries = 0;
unsigned short index;
- unsigned short orphan_blocks =
- (unsigned short)GET_ORPHAN_BLOCKS(sbi->n_orphans);
+ unsigned short orphan_blocks;
struct page *page = NULL;
struct ino_entry *orphan = NULL;
+ struct inode_management *im = &sbi->im[ORPHAN_INO];
+
+ orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num);
for (index = 0; index < orphan_blocks; index++)
grab_meta_page(sbi, start_blk + index);
index = 1;
- spin_lock(&sbi->ino_lock[ORPHAN_INO]);
- head = &sbi->ino_list[ORPHAN_INO];
+ spin_lock(&im->ino_lock);
+ head = &im->ino_list;
/* loop for each orphan inode entry and write them in Jornal block */
list_for_each_entry(orphan, head, list) {
@@ -510,7 +545,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
f2fs_put_page(page, 1);
}
- spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
+ spin_unlock(&im->ino_lock);
}
static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
@@ -731,6 +766,9 @@ void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
struct dir_inode_entry *entry;
struct inode *inode;
retry:
+ if (unlikely(f2fs_cp_error(sbi)))
+ return;
+
spin_lock(&sbi->dir_inode_lock);
head = &sbi->dir_inode_list;
@@ -830,6 +868,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
struct f2fs_nm_info *nm_i = NM_I(sbi);
+ unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
nid_t last_nid = nm_i->next_scan_nid;
block_t start_blk;
struct page *cp_page;
@@ -889,7 +928,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
else
clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
- orphan_blocks = GET_ORPHAN_BLOCKS(sbi->n_orphans);
+ orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num);
ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
orphan_blocks);
@@ -905,7 +944,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
orphan_blocks);
}
- if (sbi->n_orphans)
+ if (orphan_num)
set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
else
clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
@@ -940,7 +979,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_put_page(cp_page, 1);
}
- if (sbi->n_orphans) {
+ if (orphan_num) {
write_orphan_inodes(sbi, start_blk);
start_blk += orphan_blocks;
}
@@ -975,6 +1014,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Here, we only have one bio having CP pack */
sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
+ /* wait for previous submitted meta pages writeback */
+ wait_on_all_pages_writeback(sbi);
+
release_dirty_inode(sbi);
if (unlikely(f2fs_cp_error(sbi)))
@@ -1036,9 +1078,12 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi)
int i;
for (i = 0; i < MAX_INO_ENTRY; i++) {
- INIT_RADIX_TREE(&sbi->ino_root[i], GFP_ATOMIC);
- spin_lock_init(&sbi->ino_lock[i]);
- INIT_LIST_HEAD(&sbi->ino_list[i]);
+ struct inode_management *im = &sbi->im[i];
+
+ INIT_RADIX_TREE(&im->ino_root, GFP_ATOMIC);
+ spin_lock_init(&im->ino_lock);
+ INIT_LIST_HEAD(&im->ino_list);
+ im->ino_num = 0;
}
/*
@@ -1047,7 +1092,6 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi)
* orphan entries with the limitation one reserved segment
* for cp pack we can have max 1020*504 orphan entries
*/
- sbi->n_orphans = 0;
sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK;
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 8e58c4cc2cb9..7ec697b37f19 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -61,11 +61,6 @@ static void f2fs_write_end_io(struct bio *bio, int err)
dec_page_count(sbi, F2FS_WRITEBACK);
}
- if (sbi->wait_io) {
- complete(sbi->wait_io);
- sbi->wait_io = NULL;
- }
-
if (!get_pages(sbi, F2FS_WRITEBACK) &&
!list_empty(&sbi->cp_wait.task_list))
wake_up(&sbi->cp_wait);
@@ -95,34 +90,18 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
static void __submit_merged_bio(struct f2fs_bio_info *io)
{
struct f2fs_io_info *fio = &io->fio;
- int rw;
if (!io->bio)
return;
- rw = fio->rw;
-
- if (is_read_io(rw)) {
- trace_f2fs_submit_read_bio(io->sbi->sb, rw,
- fio->type, io->bio);
- submit_bio(rw, io->bio);
- } else {
- trace_f2fs_submit_write_bio(io->sbi->sb, rw,
- fio->type, io->bio);
- /*
- * META_FLUSH is only from the checkpoint procedure, and we
- * should wait this metadata bio for FS consistency.
- */
- if (fio->type == META_FLUSH) {
- DECLARE_COMPLETION_ONSTACK(wait);
- io->sbi->wait_io = &wait;
- submit_bio(rw, io->bio);
- wait_for_completion(&wait);
- } else {
- submit_bio(rw, io->bio);
- }
- }
+ if (is_read_io(fio->rw))
+ trace_f2fs_submit_read_bio(io->sbi->sb, fio->rw,
+ fio->type, io->bio);
+ else
+ trace_f2fs_submit_write_bio(io->sbi->sb, fio->rw,
+ fio->type, io->bio);
+ submit_bio(fio->rw, io->bio);
io->bio = NULL;
}
@@ -257,9 +236,6 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
bool need_put = dn->inode_page ? false : true;
int err;
- /* if inode_page exists, index should be zero */
- f2fs_bug_on(F2FS_I_SB(dn->inode), !need_put && index);
-
err = get_dnode_of_data(dn, index, ALLOC_NODE);
if (err)
return err;
@@ -740,14 +716,14 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
static int f2fs_read_data_page(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
- int ret;
+ int ret = -EAGAIN;
trace_f2fs_readpage(page, DATA);
/* If the file has inline data, try to read it directly */
if (f2fs_has_inline_data(inode))
ret = f2fs_read_inline_data(inode, page);
- else
+ if (ret == -EAGAIN)
ret = mpage_readpage(page, get_data_block);
return ret;
@@ -859,10 +835,11 @@ write:
else if (has_not_enough_free_secs(sbi, 0))
goto redirty_out;
+ err = -EAGAIN;
f2fs_lock_op(sbi);
- if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode))
- err = f2fs_write_inline_data(inode, page, offset);
- else
+ if (f2fs_has_inline_data(inode))
+ err = f2fs_write_inline_data(inode, page);
+ if (err == -EAGAIN)
err = do_write_data_page(page, &fio);
f2fs_unlock_op(sbi);
done:
@@ -951,7 +928,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct page *page;
+ struct page *page, *ipage;
pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
struct dnode_of_data dn;
int err = 0;
@@ -959,45 +936,60 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
trace_f2fs_write_begin(inode, pos, len, flags);
f2fs_balance_fs(sbi);
-repeat:
- err = f2fs_convert_inline_data(inode, pos + len, NULL);
- if (err)
- goto fail;
+ /*
+ * We should check this at this moment to avoid deadlock on inode page
+ * and #0 page. The locking rule for inline_data conversion should be:
+ * lock_page(page #0) -> lock_page(inode_page)
+ */
+ if (index != 0) {
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ goto fail;
+ }
+repeat:
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
err = -ENOMEM;
goto fail;
}
- /* to avoid latency during memory pressure */
- unlock_page(page);
-
*pagep = page;
- if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA)
- goto inline_data;
-
f2fs_lock_op(sbi);
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = f2fs_reserve_block(&dn, index);
- f2fs_unlock_op(sbi);
- if (err) {
- f2fs_put_page(page, 0);
- goto fail;
- }
-inline_data:
- lock_page(page);
- if (unlikely(page->mapping != mapping)) {
- f2fs_put_page(page, 1);
- goto repeat;
+
+ /* check inline_data */
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto unlock_fail;
}
- f2fs_wait_on_page_writeback(page, DATA);
+ set_new_dnode(&dn, inode, ipage, ipage, 0);
+
+ if (f2fs_has_inline_data(inode)) {
+ if (pos + len <= MAX_INLINE_DATA) {
+ read_inline_data(page, ipage);
+ set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+ sync_inode_page(&dn);
+ goto put_next;
+ }
+ err = f2fs_convert_inline_page(&dn, page);
+ if (err)
+ goto put_fail;
+ }
+ err = f2fs_reserve_block(&dn, index);
+ if (err)
+ goto put_fail;
+put_next:
+ f2fs_put_dnode(&dn);
+ f2fs_unlock_op(sbi);
if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
return 0;
+ f2fs_wait_on_page_writeback(page, DATA);
+
if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
unsigned start = pos & (PAGE_CACHE_SIZE - 1);
unsigned end = start + len;
@@ -1010,18 +1002,10 @@ inline_data:
if (dn.data_blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
} else {
- if (f2fs_has_inline_data(inode)) {
- err = f2fs_read_inline_data(inode, page);
- if (err) {
- page_cache_release(page);
- goto fail;
- }
- } else {
- err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
- READ_SYNC);
- if (err)
- goto fail;
- }
+ err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
+ READ_SYNC);
+ if (err)
+ goto fail;
lock_page(page);
if (unlikely(!PageUptodate(page))) {
@@ -1038,6 +1022,12 @@ out:
SetPageUptodate(page);
clear_cold_data(page);
return 0;
+
+put_fail:
+ f2fs_put_dnode(&dn);
+unlock_fail:
+ f2fs_unlock_op(sbi);
+ f2fs_put_page(page, 1);
fail:
f2fs_write_failed(mapping, pos + len);
return err;
@@ -1052,10 +1042,7 @@ static int f2fs_write_end(struct file *file,
trace_f2fs_write_end(inode, pos, len, copied);
- if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
- register_inmem_page(inode, page);
- else
- set_page_dirty(page);
+ set_page_dirty(page);
if (pos + copied > i_size_read(inode)) {
i_size_write(inode, pos + copied);
@@ -1093,9 +1080,12 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
size_t count = iov_iter_count(iter);
int err;
- /* Let buffer I/O handle the inline data case. */
- if (f2fs_has_inline_data(inode))
- return 0;
+ /* we don't need to use inline_data strictly */
+ if (f2fs_has_inline_data(inode)) {
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+ }
if (check_direct_IO(inode, rw, iter, offset))
return 0;
@@ -1119,6 +1109,9 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
return;
+ if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+ invalidate_inmem_page(inode, page);
+
if (PageDirty(page))
inode_dec_dirty_pages(inode);
ClearPagePrivate(page);
@@ -1138,6 +1131,12 @@ static int f2fs_set_data_page_dirty(struct page *page)
trace_f2fs_set_page_dirty(page, DATA);
SetPageUptodate(page);
+
+ if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) {
+ register_inmem_page(inode, page);
+ return 1;
+ }
+
mark_inode_dirty(inode);
if (!PageDirty(page)) {
@@ -1152,9 +1151,12 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
- if (f2fs_has_inline_data(inode))
- return 0;
-
+ /* we don't need to use inline_data strictly */
+ if (f2fs_has_inline_data(inode)) {
+ int err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+ }
return generic_block_bmap(mapping, block, get_data_block);
}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 0a91ab813a9e..91e8f699ab30 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -39,13 +39,15 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
si->ndirty_dirs = sbi->n_dirty_dirs;
si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
+ si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
si->valid_count = valid_user_blocks(sbi);
si->valid_node_count = valid_node_count(sbi);
si->valid_inode_count = valid_inode_count(sbi);
- si->inline_inode = sbi->inline_inode;
+ si->inline_inode = atomic_read(&sbi->inline_inode);
+ si->inline_dir = atomic_read(&sbi->inline_dir);
si->utilization = utilization(sbi);
si->free_segs = free_segments(sbi);
@@ -118,6 +120,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
unsigned npages;
+ int i;
if (si->base_mem)
goto get_cache;
@@ -167,8 +170,9 @@ get_cache:
si->cache_mem += npages << PAGE_CACHE_SHIFT;
npages = META_MAPPING(sbi)->nrpages;
si->cache_mem += npages << PAGE_CACHE_SHIFT;
- si->cache_mem += sbi->n_orphans * sizeof(struct ino_entry);
si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
+ for (i = 0; i <= UPDATE_INO; i++)
+ si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
}
static int stat_show(struct seq_file *s, void *v)
@@ -200,6 +204,8 @@ static int stat_show(struct seq_file *s, void *v)
si->valid_count - si->valid_node_count);
seq_printf(s, " - Inline_data Inode: %u\n",
si->inline_inode);
+ seq_printf(s, " - Inline_dentry Inode: %u\n",
+ si->inline_dir);
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
si->main_area_segs, si->main_area_sections,
si->main_area_zones);
@@ -244,6 +250,8 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
si->hit_ext, si->total_ext);
seq_puts(s, "\nBalancing F2FS Async:\n");
+ seq_printf(s, " - inmem: %4d\n",
+ si->inmem_pages);
seq_printf(s, " - nodes: %4d in %4d\n",
si->ndirty_node, si->node_pages);
seq_printf(s, " - dents: %4d in dirs:%4d\n",
@@ -321,6 +329,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
si->sbi = sbi;
sbi->stat_info = si;
+ atomic_set(&sbi->inline_inode, 0);
+ atomic_set(&sbi->inline_dir, 0);
+
mutex_lock(&f2fs_stat_mutex);
list_add_tail(&si->stat_list, &f2fs_stat_list);
mutex_unlock(&f2fs_stat_mutex);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index b54f87149c09..b1a7d5737cd0 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -37,7 +37,7 @@ static unsigned int bucket_blocks(unsigned int level)
return 4;
}
-static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
+unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
[F2FS_FT_UNKNOWN] = DT_UNKNOWN,
[F2FS_FT_REG_FILE] = DT_REG,
[F2FS_FT_DIR] = DT_DIR,
@@ -59,7 +59,7 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK,
};
-static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
+void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
{
umode_t mode = inode->i_mode;
de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
@@ -90,51 +90,70 @@ static bool early_match_name(size_t namelen, f2fs_hash_t namehash,
}
static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
- struct qstr *name, int *max_slots,
- f2fs_hash_t namehash, struct page **res_page)
+ struct qstr *name, int *max_slots,
+ struct page **res_page)
+{
+ struct f2fs_dentry_block *dentry_blk;
+ struct f2fs_dir_entry *de;
+ struct f2fs_dentry_ptr d;
+
+ dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page);
+
+ make_dentry_ptr(&d, (void *)dentry_blk, 1);
+ de = find_target_dentry(name, max_slots, &d);
+
+ if (de)
+ *res_page = dentry_page;
+ else
+ kunmap(dentry_page);
+
+ /*
+ * For the most part, it should be a bug when name_len is zero.
+ * We stop here for figuring out where the bugs has occurred.
+ */
+ f2fs_bug_on(F2FS_P_SB(dentry_page), d.max < 0);
+ return de;
+}
+
+struct f2fs_dir_entry *find_target_dentry(struct qstr *name, int *max_slots,
+ struct f2fs_dentry_ptr *d)
{
struct f2fs_dir_entry *de;
unsigned long bit_pos = 0;
- struct f2fs_dentry_block *dentry_blk = kmap(dentry_page);
- const void *dentry_bits = &dentry_blk->dentry_bitmap;
+ f2fs_hash_t namehash = f2fs_dentry_hash(name);
int max_len = 0;
- while (bit_pos < NR_DENTRY_IN_BLOCK) {
- if (!test_bit_le(bit_pos, dentry_bits)) {
+ if (max_slots)
+ *max_slots = 0;
+ while (bit_pos < d->max) {
+ if (!test_bit_le(bit_pos, d->bitmap)) {
if (bit_pos == 0)
max_len = 1;
- else if (!test_bit_le(bit_pos - 1, dentry_bits))
+ else if (!test_bit_le(bit_pos - 1, d->bitmap))
max_len++;
bit_pos++;
continue;
}
- de = &dentry_blk->dentry[bit_pos];
- if (early_match_name(name->len, namehash, de)) {
- if (!memcmp(dentry_blk->filename[bit_pos],
- name->name,
- name->len)) {
- *res_page = dentry_page;
- goto found;
- }
- }
- if (max_len > *max_slots) {
+ de = &d->dentry[bit_pos];
+ if (early_match_name(name->len, namehash, de) &&
+ !memcmp(d->filename[bit_pos], name->name, name->len))
+ goto found;
+
+ if (max_slots && *max_slots >= 0 && max_len > *max_slots) {
*max_slots = max_len;
max_len = 0;
}
- /*
- * For the most part, it should be a bug when name_len is zero.
- * We stop here for figuring out where the bugs has occurred.
- */
- f2fs_bug_on(F2FS_P_SB(dentry_page), !de->name_len);
+ /* remain bug on condition */
+ if (unlikely(!de->name_len))
+ d->max = -1;
bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
}
de = NULL;
- kunmap(dentry_page);
found:
- if (max_len > *max_slots)
+ if (max_slots && max_len > *max_slots)
*max_slots = max_len;
return de;
}
@@ -149,7 +168,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
struct page *dentry_page;
struct f2fs_dir_entry *de = NULL;
bool room = false;
- int max_slots = 0;
+ int max_slots;
f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
@@ -168,8 +187,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
continue;
}
- de = find_in_block(dentry_page, name, &max_slots,
- namehash, res_page);
+ de = find_in_block(dentry_page, name, &max_slots, res_page);
if (de)
break;
@@ -201,6 +219,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
unsigned int max_depth;
unsigned int level;
+ if (f2fs_has_inline_dentry(dir))
+ return find_in_inline_dir(dir, child, res_page);
+
if (npages == 0)
return NULL;
@@ -227,6 +248,9 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
struct f2fs_dir_entry *de;
struct f2fs_dentry_block *dentry_blk;
+ if (f2fs_has_inline_dentry(dir))
+ return f2fs_parent_inline_dir(dir, p);
+
page = get_lock_data_page(dir, 0);
if (IS_ERR(page))
return NULL;
@@ -247,7 +271,7 @@ ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
de = f2fs_find_entry(dir, qstr, &page);
if (de) {
res = le32_to_cpu(de->ino);
- kunmap(page);
+ f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
}
@@ -257,11 +281,13 @@ ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
struct page *page, struct inode *inode)
{
+ enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
lock_page(page);
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, type);
de->ino = cpu_to_le32(inode->i_ino);
set_de_type(de, inode);
- kunmap(page);
+ if (!f2fs_has_inline_dentry(dir))
+ kunmap(page);
set_page_dirty(page);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
mark_inode_dirty(dir);
@@ -296,36 +322,48 @@ int update_dent_inode(struct inode *inode, const struct qstr *name)
return 0;
}
-static int make_empty_dir(struct inode *inode,
- struct inode *parent, struct page *page)
+void do_make_empty_dir(struct inode *inode, struct inode *parent,
+ struct f2fs_dentry_ptr *d)
{
- struct page *dentry_page;
- struct f2fs_dentry_block *dentry_blk;
struct f2fs_dir_entry *de;
- dentry_page = get_new_data_page(inode, page, 0, true);
- if (IS_ERR(dentry_page))
- return PTR_ERR(dentry_page);
-
-
- dentry_blk = kmap_atomic(dentry_page);
-
- de = &dentry_blk->dentry[0];
+ de = &d->dentry[0];
de->name_len = cpu_to_le16(1);
de->hash_code = 0;
de->ino = cpu_to_le32(inode->i_ino);
- memcpy(dentry_blk->filename[0], ".", 1);
+ memcpy(d->filename[0], ".", 1);
set_de_type(de, inode);
- de = &dentry_blk->dentry[1];
+ de = &d->dentry[1];
de->hash_code = 0;
de->name_len = cpu_to_le16(2);
de->ino = cpu_to_le32(parent->i_ino);
- memcpy(dentry_blk->filename[1], "..", 2);
+ memcpy(d->filename[1], "..", 2);
set_de_type(de, inode);
- test_and_set_bit_le(0, &dentry_blk->dentry_bitmap);
- test_and_set_bit_le(1, &dentry_blk->dentry_bitmap);
+ test_and_set_bit_le(0, (void *)d->bitmap);
+ test_and_set_bit_le(1, (void *)d->bitmap);
+}
+
+static int make_empty_dir(struct inode *inode,
+ struct inode *parent, struct page *page)
+{
+ struct page *dentry_page;
+ struct f2fs_dentry_block *dentry_blk;
+ struct f2fs_dentry_ptr d;
+
+ if (f2fs_has_inline_dentry(inode))
+ return make_empty_inline_dir(inode, parent, page);
+
+ dentry_page = get_new_data_page(inode, page, 0, true);
+ if (IS_ERR(dentry_page))
+ return PTR_ERR(dentry_page);
+
+ dentry_blk = kmap_atomic(dentry_page);
+
+ make_dentry_ptr(&d, (void *)dentry_blk, 1);
+ do_make_empty_dir(inode, parent, &d);
+
kunmap_atomic(dentry_blk);
set_page_dirty(dentry_page);
@@ -333,8 +371,8 @@ static int make_empty_dir(struct inode *inode,
return 0;
}
-static struct page *init_inode_metadata(struct inode *inode,
- struct inode *dir, const struct qstr *name)
+struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
+ const struct qstr *name, struct page *dpage)
{
struct page *page;
int err;
@@ -350,7 +388,7 @@ static struct page *init_inode_metadata(struct inode *inode,
goto error;
}
- err = f2fs_init_acl(inode, dir, page);
+ err = f2fs_init_acl(inode, dir, page, dpage);
if (err)
goto put_error;
@@ -395,7 +433,7 @@ error:
return ERR_PTR(err);
}
-static void update_parent_metadata(struct inode *dir, struct inode *inode,
+void update_parent_metadata(struct inode *dir, struct inode *inode,
unsigned int current_depth)
{
if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
@@ -417,27 +455,23 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
}
-static int room_for_filename(struct f2fs_dentry_block *dentry_blk, int slots)
+int room_for_filename(const void *bitmap, int slots, int max_slots)
{
int bit_start = 0;
int zero_start, zero_end;
next:
- zero_start = find_next_zero_bit_le(&dentry_blk->dentry_bitmap,
- NR_DENTRY_IN_BLOCK,
- bit_start);
- if (zero_start >= NR_DENTRY_IN_BLOCK)
- return NR_DENTRY_IN_BLOCK;
+ zero_start = find_next_zero_bit_le(bitmap, max_slots, bit_start);
+ if (zero_start >= max_slots)
+ return max_slots;
- zero_end = find_next_bit_le(&dentry_blk->dentry_bitmap,
- NR_DENTRY_IN_BLOCK,
- zero_start);
+ zero_end = find_next_bit_le(bitmap, max_slots, zero_start);
if (zero_end - zero_start >= slots)
return zero_start;
bit_start = zero_end + 1;
- if (zero_end + 1 >= NR_DENTRY_IN_BLOCK)
- return NR_DENTRY_IN_BLOCK;
+ if (zero_end + 1 >= max_slots)
+ return max_slots;
goto next;
}
@@ -463,6 +497,14 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
int err = 0;
int i;
+ if (f2fs_has_inline_dentry(dir)) {
+ err = f2fs_add_inline_entry(dir, name, inode);
+ if (!err || err != -EAGAIN)
+ return err;
+ else
+ err = 0;
+ }
+
dentry_hash = f2fs_dentry_hash(name);
level = 0;
current_depth = F2FS_I(dir)->i_current_depth;
@@ -491,7 +533,8 @@ start:
return PTR_ERR(dentry_page);
dentry_blk = kmap(dentry_page);
- bit_pos = room_for_filename(dentry_blk, slots);
+ bit_pos = room_for_filename(&dentry_blk->dentry_bitmap,
+ slots, NR_DENTRY_IN_BLOCK);
if (bit_pos < NR_DENTRY_IN_BLOCK)
goto add_dentry;
@@ -506,7 +549,7 @@ add_dentry:
f2fs_wait_on_page_writeback(dentry_page, DATA);
down_write(&F2FS_I(inode)->i_sem);
- page = init_inode_metadata(inode, dir, name);
+ page = init_inode_metadata(inode, dir, name, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
@@ -545,7 +588,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
int err = 0;
down_write(&F2FS_I(inode)->i_sem);
- page = init_inode_metadata(inode, dir, NULL);
+ page = init_inode_metadata(inode, dir, NULL, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
@@ -560,26 +603,57 @@ fail:
return err;
}
+void f2fs_drop_nlink(struct inode *dir, struct inode *inode, struct page *page)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+
+ down_write(&F2FS_I(inode)->i_sem);
+
+ if (S_ISDIR(inode->i_mode)) {
+ drop_nlink(dir);
+ if (page)
+ update_inode(dir, page);
+ else
+ update_inode_page(dir);
+ }
+ inode->i_ctime = CURRENT_TIME;
+
+ drop_nlink(inode);
+ if (S_ISDIR(inode->i_mode)) {
+ drop_nlink(inode);
+ i_size_write(inode, 0);
+ }
+ up_write(&F2FS_I(inode)->i_sem);
+ update_inode_page(inode);
+
+ if (inode->i_nlink == 0)
+ add_orphan_inode(sbi, inode->i_ino);
+ else
+ release_orphan_inode(sbi);
+}
+
/*
* It only removes the dentry from the dentry page, corresponding name
* entry in name page does not need to be touched during deletion.
*/
void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
- struct inode *inode)
+ struct inode *dir, struct inode *inode)
{
struct f2fs_dentry_block *dentry_blk;
unsigned int bit_pos;
- struct inode *dir = page->mapping->host;
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
int i;
+ if (f2fs_has_inline_dentry(dir))
+ return f2fs_delete_inline_entry(dentry, page, dir, inode);
+
lock_page(page);
f2fs_wait_on_page_writeback(page, DATA);
dentry_blk = page_address(page);
bit_pos = dentry - dentry_blk->dentry;
for (i = 0; i < slots; i++)
- test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+ clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
/* Let's check and deallocate this dentry page */
bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
@@ -590,29 +664,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
- if (inode) {
- struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
-
- down_write(&F2FS_I(inode)->i_sem);
-
- if (S_ISDIR(inode->i_mode)) {
- drop_nlink(dir);
- update_inode_page(dir);
- }
- inode->i_ctime = CURRENT_TIME;
- drop_nlink(inode);
- if (S_ISDIR(inode->i_mode)) {
- drop_nlink(inode);
- i_size_write(inode, 0);
- }
- up_write(&F2FS_I(inode)->i_sem);
- update_inode_page(inode);
-
- if (inode->i_nlink == 0)
- add_orphan_inode(sbi, inode->i_ino);
- else
- release_orphan_inode(sbi);
- }
+ if (inode)
+ f2fs_drop_nlink(dir, inode, NULL);
if (bit_pos == NR_DENTRY_IN_BLOCK) {
truncate_hole(dir, page->index, page->index + 1);
@@ -628,9 +681,12 @@ bool f2fs_empty_dir(struct inode *dir)
unsigned long bidx;
struct page *dentry_page;
unsigned int bit_pos;
- struct f2fs_dentry_block *dentry_blk;
+ struct f2fs_dentry_block *dentry_blk;
unsigned long nblock = dir_blocks(dir);
+ if (f2fs_has_inline_dentry(dir))
+ return f2fs_empty_inline_dir(dir);
+
for (bidx = 0; bidx < nblock; bidx++) {
dentry_page = get_lock_data_page(dir, bidx);
if (IS_ERR(dentry_page)) {
@@ -640,7 +696,6 @@ bool f2fs_empty_dir(struct inode *dir)
return false;
}
-
dentry_blk = kmap_atomic(dentry_page);
if (bidx == 0)
bit_pos = 2;
@@ -659,19 +714,48 @@ bool f2fs_empty_dir(struct inode *dir)
return true;
}
+bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
+ unsigned int start_pos)
+{
+ unsigned char d_type = DT_UNKNOWN;
+ unsigned int bit_pos;
+ struct f2fs_dir_entry *de = NULL;
+
+ bit_pos = ((unsigned long)ctx->pos % d->max);
+
+ while (bit_pos < d->max) {
+ bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos);
+ if (bit_pos >= d->max)
+ break;
+
+ de = &d->dentry[bit_pos];
+ if (de->file_type < F2FS_FT_MAX)
+ d_type = f2fs_filetype_table[de->file_type];
+ else
+ d_type = DT_UNKNOWN;
+ if (!dir_emit(ctx, d->filename[bit_pos],
+ le16_to_cpu(de->name_len),
+ le32_to_cpu(de->ino), d_type))
+ return true;
+
+ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+ ctx->pos = start_pos + bit_pos;
+ }
+ return false;
+}
+
static int f2fs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
unsigned long npages = dir_blocks(inode);
- unsigned int bit_pos = 0;
struct f2fs_dentry_block *dentry_blk = NULL;
- struct f2fs_dir_entry *de = NULL;
struct page *dentry_page = NULL;
struct file_ra_state *ra = &file->f_ra;
unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
- unsigned char d_type = DT_UNKNOWN;
+ struct f2fs_dentry_ptr d;
- bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
+ if (f2fs_has_inline_dentry(inode))
+ return f2fs_read_inline_dir(file, ctx);
/* readahead for multi pages of dir */
if (npages - n > 1 && !ra_has_index(ra, n))
@@ -684,28 +768,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
continue;
dentry_blk = kmap(dentry_page);
- while (bit_pos < NR_DENTRY_IN_BLOCK) {
- bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
- NR_DENTRY_IN_BLOCK,
- bit_pos);
- if (bit_pos >= NR_DENTRY_IN_BLOCK)
- break;
-
- de = &dentry_blk->dentry[bit_pos];
- if (de->file_type < F2FS_FT_MAX)
- d_type = f2fs_filetype_table[de->file_type];
- else
- d_type = DT_UNKNOWN;
- if (!dir_emit(ctx,
- dentry_blk->filename[bit_pos],
- le16_to_cpu(de->name_len),
- le32_to_cpu(de->ino), d_type))
- goto stop;
- bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
- ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos;
- }
- bit_pos = 0;
+ make_dentry_ptr(&d, (void *)dentry_blk, 1);
+
+ if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK))
+ goto stop;
+
ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 8171e80b2ee9..ec58bb2373fc 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -46,8 +46,10 @@
#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040
#define F2FS_MOUNT_INLINE_XATTR 0x00000080
#define F2FS_MOUNT_INLINE_DATA 0x00000100
-#define F2FS_MOUNT_FLUSH_MERGE 0x00000200
-#define F2FS_MOUNT_NOBARRIER 0x00000400
+#define F2FS_MOUNT_INLINE_DENTRY 0x00000200
+#define F2FS_MOUNT_FLUSH_MERGE 0x00000400
+#define F2FS_MOUNT_NOBARRIER 0x00000800
+#define F2FS_MOUNT_FASTBOOT 0x00001000
#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -211,6 +213,32 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
/*
* For INODE and NODE manager
*/
+/* for directory operations */
+struct f2fs_dentry_ptr {
+ const void *bitmap;
+ struct f2fs_dir_entry *dentry;
+ __u8 (*filename)[F2FS_SLOT_LEN];
+ int max;
+};
+
+static inline void make_dentry_ptr(struct f2fs_dentry_ptr *d,
+ void *src, int type)
+{
+ if (type == 1) {
+ struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src;
+ d->max = NR_DENTRY_IN_BLOCK;
+ d->bitmap = &t->dentry_bitmap;
+ d->dentry = t->dentry;
+ d->filename = t->filename;
+ } else {
+ struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src;
+ d->max = NR_INLINE_DENTRY;
+ d->bitmap = &t->dentry_bitmap;
+ d->dentry = t->dentry;
+ d->filename = t->filename;
+ }
+}
+
/*
* XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1
* as its node offset to distinguish from index node blocks.
@@ -269,6 +297,7 @@ struct f2fs_inode_info {
struct extent_info ext; /* in-memory extent cache entry */
struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */
+ struct radix_tree_root inmem_root; /* radix tree for inmem pages */
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
struct mutex inmem_lock; /* lock for inmemory pages */
};
@@ -303,7 +332,7 @@ struct f2fs_nm_info {
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
struct radix_tree_root nat_set_root;/* root of the nat set cache */
- rwlock_t nat_tree_lock; /* protect nat_tree_lock */
+ struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */
struct list_head nat_entries; /* cached nat entry list (clean) */
unsigned int nat_cnt; /* the # of cached nat entries */
unsigned int dirty_nat_cnt; /* total num of nat entries in set */
@@ -433,6 +462,7 @@ enum count_type {
F2FS_DIRTY_DENTS,
F2FS_DIRTY_NODES,
F2FS_DIRTY_META,
+ F2FS_INMEM_PAGES,
NR_COUNT_TYPE,
};
@@ -470,6 +500,14 @@ struct f2fs_bio_info {
struct rw_semaphore io_rwsem; /* blocking op for bio */
};
+/* for inner inode cache management */
+struct inode_management {
+ struct radix_tree_root ino_root; /* ino entry array */
+ spinlock_t ino_lock; /* for ino entry lock */
+ struct list_head ino_list; /* inode list head */
+ unsigned long ino_num; /* number of entries */
+};
+
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
@@ -488,7 +526,6 @@ struct f2fs_sb_info {
/* for bio operations */
struct f2fs_bio_info read_io; /* for read bios */
struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */
- struct completion *wait_io; /* for completion bios */
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
@@ -500,13 +537,9 @@ struct f2fs_sb_info {
bool por_doing; /* recovery is doing or not */
wait_queue_head_t cp_wait;
- /* for inode management */
- struct radix_tree_root ino_root[MAX_INO_ENTRY]; /* ino entry array */
- spinlock_t ino_lock[MAX_INO_ENTRY]; /* for ino entry lock */
- struct list_head ino_list[MAX_INO_ENTRY]; /* inode list head */
+ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
/* for orphan inode, use 0'th array */
- unsigned int n_orphans; /* # of orphan inodes */
unsigned int max_orphans; /* max orphan inodes */
/* for directory inode management */
@@ -557,7 +590,8 @@ struct f2fs_sb_info {
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
- int inline_inode; /* # of inline_data inodes */
+ atomic_t inline_inode; /* # of inline_data inodes */
+ atomic_t inline_dir; /* # of inline_dentry inodes */
int bg_gc; /* background gc calls */
unsigned int n_dirty_dirs; /* # of dir inodes */
#endif
@@ -988,6 +1022,13 @@ retry:
return entry;
}
+static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
+ unsigned long index, void *item)
+{
+ while (radix_tree_insert(root, index, item))
+ cond_resched();
+}
+
#define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino)
static inline bool IS_INODE(struct page *page)
@@ -1020,7 +1061,7 @@ static inline int f2fs_test_bit(unsigned int nr, char *addr)
return mask & *addr;
}
-static inline int f2fs_set_bit(unsigned int nr, char *addr)
+static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr)
{
int mask;
int ret;
@@ -1032,7 +1073,7 @@ static inline int f2fs_set_bit(unsigned int nr, char *addr)
return ret;
}
-static inline int f2fs_clear_bit(unsigned int nr, char *addr)
+static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr)
{
int mask;
int ret;
@@ -1044,6 +1085,15 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr)
return ret;
}
+static inline void f2fs_change_bit(unsigned int nr, char *addr)
+{
+ int mask;
+
+ addr += (nr >> 3);
+ mask = 1 << (7 - (nr & 0x07));
+ *addr ^= mask;
+}
+
/* used for f2fs_inode_info->flags */
enum {
FI_NEW_INODE, /* indicate newly allocated inode */
@@ -1057,11 +1107,13 @@ enum {
FI_NO_EXTENT, /* not to use the extent cache */
FI_INLINE_XATTR, /* used for inline xattr */
FI_INLINE_DATA, /* used for inline data*/
+ FI_INLINE_DENTRY, /* used for inline dentry */
FI_APPEND_WRITE, /* inode has appended data */
FI_UPDATE_WRITE, /* inode has in-place-update data */
FI_NEED_IPU, /* used for ipu per file */
FI_ATOMIC_FILE, /* indicate atomic file */
FI_VOLATILE_FILE, /* indicate volatile file */
+ FI_DATA_EXIST, /* indicate data exists */
};
static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1087,15 +1139,6 @@ static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode)
set_inode_flag(fi, FI_ACL_MODE);
}
-static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
-{
- if (is_inode_flag_set(fi, FI_ACL_MODE)) {
- clear_inode_flag(fi, FI_ACL_MODE);
- return 1;
- }
- return 0;
-}
-
static inline void get_inline_info(struct f2fs_inode_info *fi,
struct f2fs_inode *ri)
{
@@ -1103,6 +1146,10 @@ static inline void get_inline_info(struct f2fs_inode_info *fi,
set_inode_flag(fi, FI_INLINE_XATTR);
if (ri->i_inline & F2FS_INLINE_DATA)
set_inode_flag(fi, FI_INLINE_DATA);
+ if (ri->i_inline & F2FS_INLINE_DENTRY)
+ set_inode_flag(fi, FI_INLINE_DENTRY);
+ if (ri->i_inline & F2FS_DATA_EXIST)
+ set_inode_flag(fi, FI_DATA_EXIST);
}
static inline void set_raw_inline(struct f2fs_inode_info *fi,
@@ -1114,6 +1161,10 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi,
ri->i_inline |= F2FS_INLINE_XATTR;
if (is_inode_flag_set(fi, FI_INLINE_DATA))
ri->i_inline |= F2FS_INLINE_DATA;
+ if (is_inode_flag_set(fi, FI_INLINE_DENTRY))
+ ri->i_inline |= F2FS_INLINE_DENTRY;
+ if (is_inode_flag_set(fi, FI_DATA_EXIST))
+ ri->i_inline |= F2FS_DATA_EXIST;
}
static inline int f2fs_has_inline_xattr(struct inode *inode)
@@ -1148,6 +1199,17 @@ static inline int f2fs_has_inline_data(struct inode *inode)
return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
}
+static inline void f2fs_clear_inline_inode(struct inode *inode)
+{
+ clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+ clear_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+}
+
+static inline int f2fs_exist_data(struct inode *inode)
+{
+ return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST);
+}
+
static inline bool f2fs_is_atomic_file(struct inode *inode)
{
return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
@@ -1164,6 +1226,23 @@ static inline void *inline_data_addr(struct page *page)
return (void *)&(ri->i_addr[1]);
}
+static inline int f2fs_has_inline_dentry(struct inode *inode)
+{
+ return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY);
+}
+
+static inline void *inline_dentry_addr(struct page *page)
+{
+ struct f2fs_inode *ri = F2FS_INODE(page);
+ return (void *)&(ri->i_addr[1]);
+}
+
+static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page)
+{
+ if (!f2fs_has_inline_dentry(dir))
+ kunmap(page);
+}
+
static inline int f2fs_readonly(struct super_block *sb)
{
return sb->s_flags & MS_RDONLY;
@@ -1224,6 +1303,19 @@ struct dentry *f2fs_get_parent(struct dentry *child);
/*
* dir.c
*/
+extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
+void set_de_type(struct f2fs_dir_entry *, struct inode *);
+struct f2fs_dir_entry *find_target_dentry(struct qstr *, int *,
+ struct f2fs_dentry_ptr *);
+bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
+ unsigned int);
+void do_make_empty_dir(struct inode *, struct inode *,
+ struct f2fs_dentry_ptr *);
+struct page *init_inode_metadata(struct inode *, struct inode *,
+ const struct qstr *, struct page *);
+void update_parent_metadata(struct inode *, struct inode *, unsigned int);
+int room_for_filename(const void *, int, int);
+void f2fs_drop_nlink(struct inode *, struct inode *, struct page *);
struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *,
struct page **);
struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
@@ -1232,7 +1324,8 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
struct page *, struct inode *);
int update_dent_inode(struct inode *, const struct qstr *);
int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
-void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
+void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *,
+ struct inode *);
int f2fs_do_tmpfile(struct inode *, struct inode *);
int f2fs_make_empty(struct inode *, struct inode *);
bool f2fs_empty_dir(struct inode *);
@@ -1296,6 +1389,7 @@ void destroy_node_manager_caches(void);
* segment.c
*/
void register_inmem_page(struct inode *, struct page *);
+void invalidate_inmem_page(struct inode *, struct page *);
void commit_inmem_pages(struct inode *, bool);
void f2fs_balance_fs(struct f2fs_sb_info *);
void f2fs_balance_fs_bg(struct f2fs_sb_info *);
@@ -1337,8 +1431,8 @@ void destroy_segment_manager_caches(void);
*/
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
-struct page *get_meta_page_ra(struct f2fs_sb_info *, pgoff_t);
int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
+void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
@@ -1405,7 +1499,7 @@ struct f2fs_stat_info {
int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
int nats, sits, fnids;
int total_count, utilization;
- int bg_gc, inline_inode;
+ int bg_gc, inline_inode, inline_dir, inmem_pages;
unsigned int valid_count, valid_node_count, valid_inode_count;
unsigned int bimodal, avg_vblocks;
int util_free, util_valid, util_invalid;
@@ -1438,14 +1532,23 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
#define stat_inc_inline_inode(inode) \
do { \
if (f2fs_has_inline_data(inode)) \
- ((F2FS_I_SB(inode))->inline_inode++); \
+ (atomic_inc(&F2FS_I_SB(inode)->inline_inode)); \
} while (0)
#define stat_dec_inline_inode(inode) \
do { \
if (f2fs_has_inline_data(inode)) \
- ((F2FS_I_SB(inode))->inline_inode--); \
+ (atomic_dec(&F2FS_I_SB(inode)->inline_inode)); \
+ } while (0)
+#define stat_inc_inline_dir(inode) \
+ do { \
+ if (f2fs_has_inline_dentry(inode)) \
+ (atomic_inc(&F2FS_I_SB(inode)->inline_dir)); \
+ } while (0)
+#define stat_dec_inline_dir(inode) \
+ do { \
+ if (f2fs_has_inline_dentry(inode)) \
+ (atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \
} while (0)
-
#define stat_inc_seg_type(sbi, curseg) \
((sbi)->segment_count[(curseg)->alloc_type]++)
#define stat_inc_block_count(sbi, curseg) \
@@ -1492,6 +1595,8 @@ void f2fs_destroy_root_stats(void);
#define stat_inc_read_hit(sb)
#define stat_inc_inline_inode(inode)
#define stat_dec_inline_inode(inode)
+#define stat_inc_inline_dir(inode)
+#define stat_dec_inline_dir(inode)
#define stat_inc_seg_type(sbi, curseg)
#define stat_inc_block_count(sbi, curseg)
#define stat_inc_seg_count(si, type)
@@ -1519,9 +1624,20 @@ extern const struct inode_operations f2fs_special_inode_operations;
* inline.c
*/
bool f2fs_may_inline(struct inode *);
+void read_inline_data(struct page *, struct page *);
int f2fs_read_inline_data(struct inode *, struct page *);
-int f2fs_convert_inline_data(struct inode *, pgoff_t, struct page *);
-int f2fs_write_inline_data(struct inode *, struct page *, unsigned int);
-void truncate_inline_data(struct inode *, u64);
+int f2fs_convert_inline_page(struct dnode_of_data *, struct page *);
+int f2fs_convert_inline_inode(struct inode *);
+int f2fs_write_inline_data(struct inode *, struct page *);
+void truncate_inline_data(struct page *, u64);
bool recover_inline_data(struct inode *, struct page *);
+struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *,
+ struct page **);
+struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **);
+int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
+int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *);
+void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
+ struct inode *, struct inode *);
+bool f2fs_empty_inline_dir(struct inode *);
+int f2fs_read_inline_dir(struct file *, struct dir_context *);
#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8e68bb64f835..3c27e0ecb3bc 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -41,18 +41,18 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
sb_start_pagefault(inode->i_sb);
- /* force to convert with normal data indices */
- err = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, page);
- if (err)
- goto out;
+ f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
/* block allocation */
f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_reserve_block(&dn, page->index);
- f2fs_unlock_op(sbi);
- if (err)
+ if (err) {
+ f2fs_unlock_op(sbi);
goto out;
+ }
+ f2fs_put_dnode(&dn);
+ f2fs_unlock_op(sbi);
file_update_time(vma->vm_file);
lock_page(page);
@@ -130,10 +130,45 @@ static inline bool need_do_checkpoint(struct inode *inode)
need_cp = true;
else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
need_cp = true;
+ else if (test_opt(sbi, FASTBOOT))
+ need_cp = true;
+ else if (sbi->active_logs == 2)
+ need_cp = true;
return need_cp;
}
+static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
+ bool ret = false;
+ /* But we need to avoid that there are some inode updates */
+ if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino))
+ ret = true;
+ f2fs_put_page(i, 0);
+ return ret;
+}
+
+static void try_to_fix_pino(struct inode *inode)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ nid_t pino;
+
+ down_write(&fi->i_sem);
+ fi->xattr_ver = 0;
+ if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
+ get_parent_ino(inode, &pino)) {
+ fi->i_pino = pino;
+ file_got_pino(inode);
+ up_write(&fi->i_sem);
+
+ mark_inode_dirty_sync(inode);
+ f2fs_write_inode(inode, NULL);
+ } else {
+ up_write(&fi->i_sem);
+ }
+}
+
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
@@ -164,19 +199,21 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
return ret;
}
+ /* if the inode is dirty, let's recover all the time */
+ if (!datasync && is_inode_flag_set(fi, FI_DIRTY_INODE)) {
+ update_inode_page(inode);
+ goto go_write;
+ }
+
/*
* if there is no written data, don't waste time to write recovery info.
*/
if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
!exist_written_data(sbi, ino, APPEND_INO)) {
- struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
- /* But we need to avoid that there are some inode updates */
- if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) {
- f2fs_put_page(i, 0);
+ /* it may call write_inode just prior to fsync */
+ if (need_inode_page_update(sbi, ino))
goto go_write;
- }
- f2fs_put_page(i, 0);
if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
exist_written_data(sbi, ino, UPDATE_INO))
@@ -196,49 +233,36 @@ go_write:
up_read(&fi->i_sem);
if (need_cp) {
- nid_t pino;
-
/* all the dirty node pages should be flushed for POR */
ret = f2fs_sync_fs(inode->i_sb, 1);
- down_write(&fi->i_sem);
- F2FS_I(inode)->xattr_ver = 0;
- if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
- get_parent_ino(inode, &pino)) {
- F2FS_I(inode)->i_pino = pino;
- file_got_pino(inode);
- up_write(&fi->i_sem);
- mark_inode_dirty_sync(inode);
- ret = f2fs_write_inode(inode, NULL);
- if (ret)
- goto out;
- } else {
- up_write(&fi->i_sem);
- }
- } else {
+ /*
+ * We've secured consistency through sync_fs. Following pino
+ * will be used only for fsynced inodes after checkpoint.
+ */
+ try_to_fix_pino(inode);
+ goto out;
+ }
sync_nodes:
- sync_node_pages(sbi, ino, &wbc);
-
- if (need_inode_block_update(sbi, ino)) {
- mark_inode_dirty_sync(inode);
- ret = f2fs_write_inode(inode, NULL);
- if (ret)
- goto out;
- goto sync_nodes;
- }
+ sync_node_pages(sbi, ino, &wbc);
- ret = wait_on_node_pages_writeback(sbi, ino);
- if (ret)
- goto out;
+ if (need_inode_block_update(sbi, ino)) {
+ mark_inode_dirty_sync(inode);
+ f2fs_write_inode(inode, NULL);
+ goto sync_nodes;
+ }
+
+ ret = wait_on_node_pages_writeback(sbi, ino);
+ if (ret)
+ goto out;
- /* once recovery info is written, don't need to tack this */
- remove_dirty_inode(sbi, ino, APPEND_INO);
- clear_inode_flag(fi, FI_APPEND_WRITE);
+ /* once recovery info is written, don't need to tack this */
+ remove_dirty_inode(sbi, ino, APPEND_INO);
+ clear_inode_flag(fi, FI_APPEND_WRITE);
flush_out:
- remove_dirty_inode(sbi, ino, UPDATE_INO);
- clear_inode_flag(fi, FI_UPDATE_WRITE);
- ret = f2fs_issue_flush(F2FS_I_SB(inode));
- }
+ remove_dirty_inode(sbi, ino, UPDATE_INO);
+ clear_inode_flag(fi, FI_UPDATE_WRITE);
+ ret = f2fs_issue_flush(sbi);
out:
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
return ret;
@@ -296,7 +320,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
goto fail;
/* handle inline data case */
- if (f2fs_has_inline_data(inode)) {
+ if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) {
if (whence == SEEK_HOLE)
data_ofs = isize;
goto found;
@@ -374,6 +398,15 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct inode *inode = file_inode(file);
+
+ /* we don't need to use inline_data strictly */
+ if (f2fs_has_inline_data(inode)) {
+ int err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+ }
+
file_accessed(file);
vma->vm_ops = &f2fs_file_vm_ops;
return 0;
@@ -415,20 +448,17 @@ void truncate_data_blocks(struct dnode_of_data *dn)
truncate_data_blocks_range(dn, ADDRS_PER_BLOCK);
}
-static void truncate_partial_data_page(struct inode *inode, u64 from)
+static int truncate_partial_data_page(struct inode *inode, u64 from)
{
unsigned offset = from & (PAGE_CACHE_SIZE - 1);
struct page *page;
- if (f2fs_has_inline_data(inode))
- return truncate_inline_data(inode, from);
-
if (!offset)
- return;
+ return 0;
page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false);
if (IS_ERR(page))
- return;
+ return 0;
lock_page(page);
if (unlikely(!PageUptodate(page) ||
@@ -438,9 +468,9 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
f2fs_wait_on_page_writeback(page, DATA);
zero_user(page, offset, PAGE_CACHE_SIZE - offset);
set_page_dirty(page);
-
out:
f2fs_put_page(page, 1);
+ return 0;
}
int truncate_blocks(struct inode *inode, u64 from, bool lock)
@@ -450,27 +480,33 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
struct dnode_of_data dn;
pgoff_t free_from;
int count = 0, err = 0;
+ struct page *ipage;
trace_f2fs_truncate_blocks_enter(inode, from);
- if (f2fs_has_inline_data(inode))
- goto done;
-
free_from = (pgoff_t)
- ((from + blocksize - 1) >> (sbi->log_blocksize));
+ ((from + blocksize - 1) >> (sbi->log_blocksize));
if (lock)
f2fs_lock_op(sbi);
- set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto out;
+ }
+
+ if (f2fs_has_inline_data(inode)) {
+ f2fs_put_page(ipage, 1);
+ goto out;
+ }
+
+ set_new_dnode(&dn, inode, ipage, NULL, 0);
err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
if (err) {
if (err == -ENOENT)
goto free_next;
- if (lock)
- f2fs_unlock_op(sbi);
- trace_f2fs_truncate_blocks_exit(inode, err);
- return err;
+ goto out;
}
count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
@@ -486,11 +522,13 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
f2fs_put_dnode(&dn);
free_next:
err = truncate_inode_blocks(inode, free_from);
+out:
if (lock)
f2fs_unlock_op(sbi);
-done:
+
/* lastly zero out the first data page */
- truncate_partial_data_page(inode, from);
+ if (!err)
+ err = truncate_partial_data_page(inode, from);
trace_f2fs_truncate_blocks_exit(inode, err);
return err;
@@ -504,6 +542,12 @@ void f2fs_truncate(struct inode *inode)
trace_f2fs_truncate(inode);
+ /* we should check inline_data size */
+ if (f2fs_has_inline_data(inode) && !f2fs_may_inline(inode)) {
+ if (f2fs_convert_inline_inode(inode))
+ return;
+ }
+
if (!truncate_blocks(inode, i_size_read(inode), true)) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
@@ -561,10 +605,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
return err;
if (attr->ia_valid & ATTR_SIZE) {
- err = f2fs_convert_inline_data(inode, attr->ia_size, NULL);
- if (err)
- return err;
-
if (attr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attr->ia_size);
f2fs_truncate(inode);
@@ -665,9 +705,11 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (offset >= inode->i_size)
return ret;
- ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
- if (ret)
- return ret;
+ if (f2fs_has_inline_data(inode)) {
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
+ }
pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
@@ -721,9 +763,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
if (ret)
return ret;
- ret = f2fs_convert_inline_data(inode, offset + len, NULL);
- if (ret)
- return ret;
+ if (f2fs_has_inline_data(inode)) {
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
+ }
pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
@@ -874,7 +918,15 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
- return f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
+ return f2fs_convert_inline_inode(inode);
+}
+
+static int f2fs_release_file(struct inode *inode, struct file *filp)
+{
+ /* some remained atomic pages should discarded */
+ if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+ commit_inmem_pages(inode, true);
+ return 0;
}
static int f2fs_ioc_commit_atomic_write(struct file *filp)
@@ -908,7 +960,8 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
return -EACCES;
set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
- return 0;
+
+ return f2fs_convert_inline_inode(inode);
}
static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
@@ -985,6 +1038,7 @@ const struct file_operations f2fs_file_operations = {
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.open = generic_file_open,
+ .release = f2fs_release_file,
.mmap = f2fs_file_mmap,
.fsync = f2fs_sync_file,
.fallocate = f2fs_fallocate,
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 2a8f4acdb86b..eec0933a4819 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -96,8 +96,6 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
dev_t dev = sbi->sb->s_bdev->bd_dev;
int err = 0;
- if (!test_opt(sbi, BG_GC))
- goto out;
gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
if (!gc_th) {
err = -ENOMEM;
@@ -340,34 +338,39 @@ static const struct victim_selection default_v_ops = {
.get_victim = get_victim_by_default,
};
-static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
+static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
{
struct inode_entry *ie;
- list_for_each_entry(ie, ilist, list)
- if (ie->inode->i_ino == ino)
- return ie->inode;
+ ie = radix_tree_lookup(&gc_list->iroot, ino);
+ if (ie)
+ return ie->inode;
return NULL;
}
-static void add_gc_inode(struct inode *inode, struct list_head *ilist)
+static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
{
struct inode_entry *new_ie;
- if (inode == find_gc_inode(inode->i_ino, ilist)) {
+ if (inode == find_gc_inode(gc_list, inode->i_ino)) {
iput(inode);
return;
}
-
new_ie = f2fs_kmem_cache_alloc(winode_slab, GFP_NOFS);
new_ie->inode = inode;
- list_add_tail(&new_ie->list, ilist);
+retry:
+ if (radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie)) {
+ cond_resched();
+ goto retry;
+ }
+ list_add_tail(&new_ie->list, &gc_list->ilist);
}
-static void put_gc_inode(struct list_head *ilist)
+static void put_gc_inode(struct gc_inode_list *gc_list)
{
struct inode_entry *ie, *next_ie;
- list_for_each_entry_safe(ie, next_ie, ilist, list) {
+ list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
+ radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
iput(ie->inode);
list_del(&ie->list);
kmem_cache_free(winode_slab, ie);
@@ -553,7 +556,7 @@ out:
* the victim data block is ignored.
*/
static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
- struct list_head *ilist, unsigned int segno, int gc_type)
+ struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
{
struct super_block *sb = sbi->sb;
struct f2fs_summary *entry;
@@ -605,27 +608,27 @@ next_step:
data_page = find_data_page(inode,
start_bidx + ofs_in_node, false);
- if (IS_ERR(data_page))
- goto next_iput;
+ if (IS_ERR(data_page)) {
+ iput(inode);
+ continue;
+ }
f2fs_put_page(data_page, 0);
- add_gc_inode(inode, ilist);
- } else {
- inode = find_gc_inode(dni.ino, ilist);
- if (inode) {
- start_bidx = start_bidx_of_node(nofs,
- F2FS_I(inode));
- data_page = get_lock_data_page(inode,
+ add_gc_inode(gc_list, inode);
+ continue;
+ }
+
+ /* phase 3 */
+ inode = find_gc_inode(gc_list, dni.ino);
+ if (inode) {
+ start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
+ data_page = get_lock_data_page(inode,
start_bidx + ofs_in_node);
- if (IS_ERR(data_page))
- continue;
- move_data_page(inode, data_page, gc_type);
- stat_inc_data_blk_count(sbi, 1);
- }
+ if (IS_ERR(data_page))
+ continue;
+ move_data_page(inode, data_page, gc_type);
+ stat_inc_data_blk_count(sbi, 1);
}
- continue;
-next_iput:
- iput(inode);
}
if (++phase < 4)
@@ -646,18 +649,20 @@ next_iput:
}
static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
- int gc_type, int type)
+ int gc_type)
{
struct sit_info *sit_i = SIT_I(sbi);
int ret;
+
mutex_lock(&sit_i->sentry_lock);
- ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, type, LFS);
+ ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
+ NO_CHECK_TYPE, LFS);
mutex_unlock(&sit_i->sentry_lock);
return ret;
}
static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
- struct list_head *ilist, int gc_type)
+ struct gc_inode_list *gc_list, int gc_type)
{
struct page *sum_page;
struct f2fs_summary_block *sum;
@@ -675,7 +680,7 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
gc_node_segment(sbi, sum->entries, segno, gc_type);
break;
case SUM_TYPE_DATA:
- gc_data_segment(sbi, sum->entries, ilist, segno, gc_type);
+ gc_data_segment(sbi, sum->entries, gc_list, segno, gc_type);
break;
}
blk_finish_plug(&plug);
@@ -688,16 +693,18 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
int f2fs_gc(struct f2fs_sb_info *sbi)
{
- struct list_head ilist;
unsigned int segno, i;
int gc_type = BG_GC;
int nfree = 0;
int ret = -1;
- struct cp_control cpc = {
- .reason = CP_SYNC,
+ struct cp_control cpc;
+ struct gc_inode_list gc_list = {
+ .ilist = LIST_HEAD_INIT(gc_list.ilist),
+ .iroot = RADIX_TREE_INIT(GFP_NOFS),
};
- INIT_LIST_HEAD(&ilist);
+ cpc.reason = test_opt(sbi, FASTBOOT) ? CP_UMOUNT : CP_SYNC;
+
gc_more:
if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
goto stop;
@@ -709,7 +716,7 @@ gc_more:
write_checkpoint(sbi, &cpc);
}
- if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
+ if (!__get_victim(sbi, &segno, gc_type))
goto stop;
ret = 0;
@@ -719,7 +726,7 @@ gc_more:
META_SSA);
for (i = 0; i < sbi->segs_per_sec; i++)
- do_garbage_collect(sbi, segno + i, &ilist, gc_type);
+ do_garbage_collect(sbi, segno + i, &gc_list, gc_type);
if (gc_type == FG_GC) {
sbi->cur_victim_sec = NULL_SEGNO;
@@ -735,7 +742,7 @@ gc_more:
stop:
mutex_unlock(&sbi->gc_mutex);
- put_gc_inode(&ilist);
+ put_gc_inode(&gc_list);
return ret;
}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 16f0b2b22999..6ff7ad38463e 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -40,6 +40,11 @@ struct inode_entry {
struct inode *inode;
};
+struct gc_inode_list {
+ struct list_head ilist;
+ struct radix_tree_root iroot;
+};
+
/*
* inline functions
*/
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 88036fd75797..f2d3c581e776 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -15,35 +15,44 @@
bool f2fs_may_inline(struct inode *inode)
{
- block_t nr_blocks;
- loff_t i_size;
-
if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
return false;
if (f2fs_is_atomic_file(inode))
return false;
- nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
- if (inode->i_blocks > nr_blocks)
+ if (!S_ISREG(inode->i_mode))
return false;
- i_size = i_size_read(inode);
- if (i_size > MAX_INLINE_DATA)
+ if (i_size_read(inode) > MAX_INLINE_DATA)
return false;
return true;
}
-int f2fs_read_inline_data(struct inode *inode, struct page *page)
+void read_inline_data(struct page *page, struct page *ipage)
{
- struct page *ipage;
void *src_addr, *dst_addr;
- if (page->index) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
- goto out;
- }
+ if (PageUptodate(page))
+ return;
+
+ f2fs_bug_on(F2FS_P_SB(page), page->index);
+
+ zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+
+ /* Copy the whole inline data block */
+ src_addr = inline_data_addr(ipage);
+ dst_addr = kmap_atomic(page);
+ memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+ flush_dcache_page(page);
+ kunmap_atomic(dst_addr);
+ SetPageUptodate(page);
+}
+
+int f2fs_read_inline_data(struct inode *inode, struct page *page)
+{
+ struct page *ipage;
ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage)) {
@@ -51,112 +60,116 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
return PTR_ERR(ipage);
}
- zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+ if (!f2fs_has_inline_data(inode)) {
+ f2fs_put_page(ipage, 1);
+ return -EAGAIN;
+ }
- /* Copy the whole inline data block */
- src_addr = inline_data_addr(ipage);
- dst_addr = kmap(page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
- kunmap(page);
- f2fs_put_page(ipage, 1);
+ if (page->index)
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ else
+ read_inline_data(page, ipage);
-out:
SetPageUptodate(page);
+ f2fs_put_page(ipage, 1);
unlock_page(page);
-
return 0;
}
-static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
+int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
{
- int err = 0;
- struct page *ipage;
- struct dnode_of_data dn;
void *src_addr, *dst_addr;
block_t new_blk_addr;
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_io_info fio = {
.type = DATA,
.rw = WRITE_SYNC | REQ_PRIO,
};
+ int dirty, err;
- f2fs_lock_op(sbi);
- ipage = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- err = PTR_ERR(ipage);
- goto out;
- }
+ f2fs_bug_on(F2FS_I_SB(dn->inode), page->index);
- /* someone else converted inline_data already */
- if (!f2fs_has_inline_data(inode))
- goto out;
+ if (!f2fs_exist_data(dn->inode))
+ goto clear_out;
- /*
- * i_addr[0] is not used for inline data,
- * so reserving new block will not destroy inline data
- */
- set_new_dnode(&dn, inode, ipage, NULL, 0);
- err = f2fs_reserve_block(&dn, 0);
+ err = f2fs_reserve_block(dn, 0);
if (err)
- goto out;
+ return err;
f2fs_wait_on_page_writeback(page, DATA);
+
+ if (PageUptodate(page))
+ goto no_update;
+
zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
/* Copy the whole inline data block */
- src_addr = inline_data_addr(ipage);
- dst_addr = kmap(page);
+ src_addr = inline_data_addr(dn->inode_page);
+ dst_addr = kmap_atomic(page);
memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
- kunmap(page);
+ flush_dcache_page(page);
+ kunmap_atomic(dst_addr);
SetPageUptodate(page);
+no_update:
+ /* clear dirty state */
+ dirty = clear_page_dirty_for_io(page);
/* write data page to try to make data consistent */
set_page_writeback(page);
- write_data_page(page, &dn, &new_blk_addr, &fio);
- update_extent_cache(new_blk_addr, &dn);
+
+ write_data_page(page, dn, &new_blk_addr, &fio);
+ update_extent_cache(new_blk_addr, dn);
f2fs_wait_on_page_writeback(page, DATA);
+ if (dirty)
+ inode_dec_dirty_pages(dn->inode);
- /* clear inline data and flag after data writeback */
- zero_user_segment(ipage, INLINE_DATA_OFFSET,
- INLINE_DATA_OFFSET + MAX_INLINE_DATA);
- clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
- stat_dec_inline_inode(inode);
+ /* this converted inline_data should be recovered. */
+ set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE);
- sync_inode_page(&dn);
- f2fs_put_dnode(&dn);
-out:
- f2fs_unlock_op(sbi);
- return err;
+ /* clear inline data and flag after data writeback */
+ truncate_inline_data(dn->inode_page, 0);
+clear_out:
+ stat_dec_inline_inode(dn->inode);
+ f2fs_clear_inline_inode(dn->inode);
+ sync_inode_page(dn);
+ f2fs_put_dnode(dn);
+ return 0;
}
-int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size,
- struct page *page)
+int f2fs_convert_inline_inode(struct inode *inode)
{
- struct page *new_page = page;
- int err;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct dnode_of_data dn;
+ struct page *ipage, *page;
+ int err = 0;
- if (!f2fs_has_inline_data(inode))
- return 0;
- else if (to_size <= MAX_INLINE_DATA)
- return 0;
+ page = grab_cache_page(inode->i_mapping, 0);
+ if (!page)
+ return -ENOMEM;
+
+ f2fs_lock_op(sbi);
- if (!page || page->index != 0) {
- new_page = grab_cache_page(inode->i_mapping, 0);
- if (!new_page)
- return -ENOMEM;
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto out;
}
- err = __f2fs_convert_inline_data(inode, new_page);
- if (!page || page->index != 0)
- f2fs_put_page(new_page, 1);
+ set_new_dnode(&dn, inode, ipage, ipage, 0);
+
+ if (f2fs_has_inline_data(inode))
+ err = f2fs_convert_inline_page(&dn, page);
+
+ f2fs_put_dnode(&dn);
+out:
+ f2fs_unlock_op(sbi);
+
+ f2fs_put_page(page, 1);
return err;
}
-int f2fs_write_inline_data(struct inode *inode,
- struct page *page, unsigned size)
+int f2fs_write_inline_data(struct inode *inode, struct page *page)
{
void *src_addr, *dst_addr;
- struct page *ipage;
struct dnode_of_data dn;
int err;
@@ -164,47 +177,39 @@ int f2fs_write_inline_data(struct inode *inode,
err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
if (err)
return err;
- ipage = dn.inode_page;
- f2fs_wait_on_page_writeback(ipage, NODE);
- zero_user_segment(ipage, INLINE_DATA_OFFSET,
- INLINE_DATA_OFFSET + MAX_INLINE_DATA);
- src_addr = kmap(page);
- dst_addr = inline_data_addr(ipage);
- memcpy(dst_addr, src_addr, size);
- kunmap(page);
-
- /* Release the first data block if it is allocated */
if (!f2fs_has_inline_data(inode)) {
- truncate_data_blocks_range(&dn, 1);
- set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
- stat_inc_inline_inode(inode);
+ f2fs_put_dnode(&dn);
+ return -EAGAIN;
}
+ f2fs_bug_on(F2FS_I_SB(inode), page->index);
+
+ f2fs_wait_on_page_writeback(dn.inode_page, NODE);
+ src_addr = kmap_atomic(page);
+ dst_addr = inline_data_addr(dn.inode_page);
+ memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+ kunmap_atomic(src_addr);
+
set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
+ set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+
sync_inode_page(&dn);
f2fs_put_dnode(&dn);
-
return 0;
}
-void truncate_inline_data(struct inode *inode, u64 from)
+void truncate_inline_data(struct page *ipage, u64 from)
{
- struct page *ipage;
+ void *addr;
if (from >= MAX_INLINE_DATA)
return;
- ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
- if (IS_ERR(ipage))
- return;
-
f2fs_wait_on_page_writeback(ipage, NODE);
- zero_user_segment(ipage, INLINE_DATA_OFFSET + from,
- INLINE_DATA_OFFSET + MAX_INLINE_DATA);
- set_page_dirty(ipage);
- f2fs_put_page(ipage, 1);
+ addr = inline_data_addr(ipage);
+ memset(addr + from, 0, MAX_INLINE_DATA - from);
}
bool recover_inline_data(struct inode *inode, struct page *npage)
@@ -236,6 +241,10 @@ process_inline:
src_addr = inline_data_addr(npage);
dst_addr = inline_data_addr(ipage);
memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+
+ set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+ set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
return true;
@@ -244,16 +253,279 @@ process_inline:
if (f2fs_has_inline_data(inode)) {
ipage = get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- f2fs_wait_on_page_writeback(ipage, NODE);
- zero_user_segment(ipage, INLINE_DATA_OFFSET,
- INLINE_DATA_OFFSET + MAX_INLINE_DATA);
- clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+ truncate_inline_data(ipage, 0);
+ f2fs_clear_inline_inode(inode);
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
truncate_blocks(inode, 0, false);
- set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
goto process_inline;
}
return false;
}
+
+struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
+ struct qstr *name, struct page **res_page)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_inline_dentry *inline_dentry;
+ struct f2fs_dir_entry *de;
+ struct f2fs_dentry_ptr d;
+ struct page *ipage;
+
+ ipage = get_node_page(sbi, dir->i_ino);
+ if (IS_ERR(ipage))
+ return NULL;
+
+ inline_dentry = inline_data_addr(ipage);
+
+ make_dentry_ptr(&d, (void *)inline_dentry, 2);
+ de = find_target_dentry(name, NULL, &d);
+
+ unlock_page(ipage);
+ if (de)
+ *res_page = ipage;
+ else
+ f2fs_put_page(ipage, 0);
+
+ /*
+ * For the most part, it should be a bug when name_len is zero.
+ * We stop here for figuring out where the bugs has occurred.
+ */
+ f2fs_bug_on(sbi, d.max < 0);
+ return de;
+}
+
+struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *dir,
+ struct page **p)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct page *ipage;
+ struct f2fs_dir_entry *de;
+ struct f2fs_inline_dentry *dentry_blk;
+
+ ipage = get_node_page(sbi, dir->i_ino);
+ if (IS_ERR(ipage))
+ return NULL;
+
+ dentry_blk = inline_data_addr(ipage);
+ de = &dentry_blk->dentry[1];
+ *p = ipage;
+ unlock_page(ipage);
+ return de;
+}
+
+int make_empty_inline_dir(struct inode *inode, struct inode *parent,
+ struct page *ipage)
+{
+ struct f2fs_inline_dentry *dentry_blk;
+ struct f2fs_dentry_ptr d;
+
+ dentry_blk = inline_data_addr(ipage);
+
+ make_dentry_ptr(&d, (void *)dentry_blk, 2);
+ do_make_empty_dir(inode, parent, &d);
+
+ set_page_dirty(ipage);
+
+ /* update i_size to MAX_INLINE_DATA */
+ if (i_size_read(inode) < MAX_INLINE_DATA) {
+ i_size_write(inode, MAX_INLINE_DATA);
+ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
+ }
+ return 0;
+}
+
+static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
+ struct f2fs_inline_dentry *inline_dentry)
+{
+ struct page *page;
+ struct dnode_of_data dn;
+ struct f2fs_dentry_block *dentry_blk;
+ int err;
+
+ page = grab_cache_page(dir->i_mapping, 0);
+ if (!page)
+ return -ENOMEM;
+
+ set_new_dnode(&dn, dir, ipage, NULL, 0);
+ err = f2fs_reserve_block(&dn, 0);
+ if (err)
+ goto out;
+
+ f2fs_wait_on_page_writeback(page, DATA);
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+
+ dentry_blk = kmap_atomic(page);
+
+ /* copy data from inline dentry block to new dentry block */
+ memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap,
+ INLINE_DENTRY_BITMAP_SIZE);
+ memcpy(dentry_blk->dentry, inline_dentry->dentry,
+ sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY);
+ memcpy(dentry_blk->filename, inline_dentry->filename,
+ NR_INLINE_DENTRY * F2FS_SLOT_LEN);
+
+ kunmap_atomic(dentry_blk);
+ SetPageUptodate(page);
+ set_page_dirty(page);
+
+ /* clear inline dir and flag after data writeback */
+ truncate_inline_data(ipage, 0);
+
+ stat_dec_inline_dir(dir);
+ clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
+
+ if (i_size_read(dir) < PAGE_CACHE_SIZE) {
+ i_size_write(dir, PAGE_CACHE_SIZE);
+ set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+ }
+
+ sync_inode_page(&dn);
+out:
+ f2fs_put_page(page, 1);
+ return err;
+}
+
+int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
+ struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct page *ipage;
+ unsigned int bit_pos;
+ f2fs_hash_t name_hash;
+ struct f2fs_dir_entry *de;
+ size_t namelen = name->len;
+ struct f2fs_inline_dentry *dentry_blk = NULL;
+ int slots = GET_DENTRY_SLOTS(namelen);
+ struct page *page;
+ int err = 0;
+ int i;
+
+ name_hash = f2fs_dentry_hash(name);
+
+ ipage = get_node_page(sbi, dir->i_ino);
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
+
+ dentry_blk = inline_data_addr(ipage);
+ bit_pos = room_for_filename(&dentry_blk->dentry_bitmap,
+ slots, NR_INLINE_DENTRY);
+ if (bit_pos >= NR_INLINE_DENTRY) {
+ err = f2fs_convert_inline_dir(dir, ipage, dentry_blk);
+ if (!err)
+ err = -EAGAIN;
+ goto out;
+ }
+
+ down_write(&F2FS_I(inode)->i_sem);
+ page = init_inode_metadata(inode, dir, name, ipage);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto fail;
+ }
+
+ f2fs_wait_on_page_writeback(ipage, NODE);
+ de = &dentry_blk->dentry[bit_pos];
+ de->hash_code = name_hash;
+ de->name_len = cpu_to_le16(namelen);
+ memcpy(dentry_blk->filename[bit_pos], name->name, name->len);
+ de->ino = cpu_to_le32(inode->i_ino);
+ set_de_type(de, inode);
+ for (i = 0; i < slots; i++)
+ test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+ set_page_dirty(ipage);
+
+ /* we don't need to mark_inode_dirty now */
+ F2FS_I(inode)->i_pino = dir->i_ino;
+ update_inode(inode, page);
+ f2fs_put_page(page, 1);
+
+ update_parent_metadata(dir, inode, 0);
+fail:
+ up_write(&F2FS_I(inode)->i_sem);
+
+ if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
+ update_inode(dir, ipage);
+ clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+ }
+out:
+ f2fs_put_page(ipage, 1);
+ return err;
+}
+
+void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
+ struct inode *dir, struct inode *inode)
+{
+ struct f2fs_inline_dentry *inline_dentry;
+ int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
+ unsigned int bit_pos;
+ int i;
+
+ lock_page(page);
+ f2fs_wait_on_page_writeback(page, NODE);
+
+ inline_dentry = inline_data_addr(page);
+ bit_pos = dentry - inline_dentry->dentry;
+ for (i = 0; i < slots; i++)
+ test_and_clear_bit_le(bit_pos + i,
+ &inline_dentry->dentry_bitmap);
+
+ set_page_dirty(page);
+
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+
+ if (inode)
+ f2fs_drop_nlink(dir, inode, page);
+
+ f2fs_put_page(page, 1);
+}
+
+bool f2fs_empty_inline_dir(struct inode *dir)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct page *ipage;
+ unsigned int bit_pos = 2;
+ struct f2fs_inline_dentry *dentry_blk;
+
+ ipage = get_node_page(sbi, dir->i_ino);
+ if (IS_ERR(ipage))
+ return false;
+
+ dentry_blk = inline_data_addr(ipage);
+ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+ NR_INLINE_DENTRY,
+ bit_pos);
+
+ f2fs_put_page(ipage, 1);
+
+ if (bit_pos < NR_INLINE_DENTRY)
+ return false;
+
+ return true;
+}
+
+int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ struct f2fs_inline_dentry *inline_dentry = NULL;
+ struct page *ipage = NULL;
+ struct f2fs_dentry_ptr d;
+
+ if (ctx->pos == NR_INLINE_DENTRY)
+ return 0;
+
+ ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
+
+ inline_dentry = inline_data_addr(ipage);
+
+ make_dentry_ptr(&d, (void *)inline_dentry, 2);
+
+ if (!f2fs_fill_dentries(ctx, &d, 0))
+ ctx->pos = NR_INLINE_DENTRY;
+
+ f2fs_put_page(ipage, 1);
+ return 0;
+}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 0deead4505e7..196cc7843aaf 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -67,12 +67,38 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
}
}
+static int __recover_inline_status(struct inode *inode, struct page *ipage)
+{
+ void *inline_data = inline_data_addr(ipage);
+ struct f2fs_inode *ri;
+ void *zbuf;
+
+ zbuf = kzalloc(MAX_INLINE_DATA, GFP_NOFS);
+ if (!zbuf)
+ return -ENOMEM;
+
+ if (!memcmp(zbuf, inline_data, MAX_INLINE_DATA)) {
+ kfree(zbuf);
+ return 0;
+ }
+ kfree(zbuf);
+
+ f2fs_wait_on_page_writeback(ipage, NODE);
+ set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+
+ ri = F2FS_INODE(ipage);
+ set_raw_inline(F2FS_I(inode), ri);
+ set_page_dirty(ipage);
+ return 0;
+}
+
static int do_read_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct page *node_page;
struct f2fs_inode *ri;
+ int err = 0;
/* Check if ino is within scope */
if (check_nid_range(sbi, inode->i_ino)) {
@@ -114,11 +140,19 @@ static int do_read_inode(struct inode *inode)
get_extent_info(&fi->ext, ri->i_ext);
get_inline_info(fi, ri);
+ /* check data exist */
+ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
+ err = __recover_inline_status(inode, node_page);
+
/* get rdev by using inline_info */
__get_inode_rdev(inode, ri);
f2fs_put_page(node_page, 1);
- return 0;
+
+ stat_inc_inline_inode(inode);
+ stat_inc_inline_dir(inode);
+
+ return err;
}
struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
@@ -156,7 +190,7 @@ make_now:
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &f2fs_symlink_inode_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -295,11 +329,12 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_lock_op(sbi);
remove_inode_page(inode);
- stat_dec_inline_inode(inode);
f2fs_unlock_op(sbi);
sb_end_intwrite(inode->i_sb);
no_delete:
+ stat_dec_inline_dir(inode);
+ stat_dec_inline_inode(inode);
invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
if (xnid)
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
@@ -325,8 +360,9 @@ void handle_failed_inode(struct inode *inode)
f2fs_truncate(inode);
remove_inode_page(inode);
- stat_dec_inline_inode(inode);
+ clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+ clear_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
alloc_nid_failed(sbi, inode->i_ino);
f2fs_unlock_op(sbi);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 0d2526e5aa11..547a2deeb1ac 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -54,6 +54,12 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
nid_free = true;
goto out;
}
+
+ if (f2fs_may_inline(inode))
+ set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+ if (test_opt(sbi, INLINE_DENTRY) && S_ISDIR(inode->i_mode))
+ set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
+
trace_f2fs_new_inode(inode, 0);
mark_inode_dirty(inode);
return inode;
@@ -129,8 +135,12 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
alloc_nid_done(sbi, ino);
+ stat_inc_inline_inode(inode);
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
return 0;
out:
handle_failed_inode(inode);
@@ -157,6 +167,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
f2fs_unlock_op(sbi);
d_instantiate(dentry, inode);
+
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
return 0;
out:
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
@@ -187,14 +200,12 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
de = f2fs_find_entry(dir, &dentry->d_name, &page);
if (de) {
nid_t ino = le32_to_cpu(de->ino);
- kunmap(page);
+ f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
inode = f2fs_iget(dir->i_sb, ino);
if (IS_ERR(inode))
return ERR_CAST(inode);
-
- stat_inc_inline_inode(inode);
}
return d_splice_alias(inode, dentry);
@@ -219,15 +230,18 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
err = acquire_orphan_inode(sbi);
if (err) {
f2fs_unlock_op(sbi);
- kunmap(page);
+ f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
goto fail;
}
- f2fs_delete_entry(de, page, inode);
+ f2fs_delete_entry(de, page, dir, inode);
f2fs_unlock_op(sbi);
/* In order to evict this inode, we set it dirty */
mark_inode_dirty(inode);
+
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
fail:
trace_f2fs_unlink_exit(inode, err);
return err;
@@ -261,6 +275,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
return err;
out:
handle_failed_inode(inode);
@@ -291,11 +308,14 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out_fail;
f2fs_unlock_op(sbi);
+ stat_inc_inline_dir(inode);
alloc_nid_done(sbi, inode->i_ino);
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
return 0;
out_fail:
@@ -338,8 +358,12 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
f2fs_unlock_op(sbi);
alloc_nid_done(sbi, inode->i_ino);
+
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
return 0;
out:
handle_failed_inode(inode);
@@ -435,7 +459,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(old_inode);
- f2fs_delete_entry(old_entry, old_page, NULL);
+ f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
if (old_dir_entry) {
if (old_dir != new_dir) {
@@ -443,7 +467,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dir_page, new_dir);
update_inode_page(old_inode);
} else {
- kunmap(old_dir_page);
+ f2fs_dentry_kunmap(old_inode, old_dir_page);
f2fs_put_page(old_dir_page, 0);
}
drop_nlink(old_dir);
@@ -452,19 +476,22 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
f2fs_unlock_op(sbi);
+
+ if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+ f2fs_sync_fs(sbi->sb, 1);
return 0;
put_out_dir:
f2fs_unlock_op(sbi);
- kunmap(new_page);
+ f2fs_dentry_kunmap(new_dir, new_page);
f2fs_put_page(new_page, 0);
out_dir:
if (old_dir_entry) {
- kunmap(old_dir_page);
+ f2fs_dentry_kunmap(old_inode, old_dir_page);
f2fs_put_page(old_dir_page, 0);
}
out_old:
- kunmap(old_page);
+ f2fs_dentry_kunmap(old_dir, old_page);
f2fs_put_page(old_page, 0);
out:
return err;
@@ -588,6 +615,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
update_inode_page(new_dir);
f2fs_unlock_op(sbi);
+
+ if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+ f2fs_sync_fs(sbi->sb, 1);
return 0;
out_undo:
/* Still we may fail to recover name info of f2fs_inode here */
@@ -596,19 +626,19 @@ out_unlock:
f2fs_unlock_op(sbi);
out_new_dir:
if (new_dir_entry) {
- kunmap(new_dir_page);
+ f2fs_dentry_kunmap(new_inode, new_dir_page);
f2fs_put_page(new_dir_page, 0);
}
out_old_dir:
if (old_dir_entry) {
- kunmap(old_dir_page);
+ f2fs_dentry_kunmap(old_inode, old_dir_page);
f2fs_put_page(old_dir_page, 0);
}
out_new:
- kunmap(new_page);
+ f2fs_dentry_kunmap(new_dir, new_page);
f2fs_put_page(new_page, 0);
out_old:
- kunmap(old_page);
+ f2fs_dentry_kunmap(old_dir, old_page);
f2fs_put_page(old_page, 0);
out:
return err;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 44b8afef43d9..f83326ca32ef 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -31,22 +31,38 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct sysinfo val;
+ unsigned long avail_ram;
unsigned long mem_size = 0;
bool res = false;
si_meminfo(&val);
- /* give 25%, 25%, 50% memory for each components respectively */
+
+ /* only uses low memory */
+ avail_ram = val.totalram - val.totalhigh;
+
+ /* give 25%, 25%, 50%, 50% memory for each components respectively */
if (type == FREE_NIDS) {
- mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> 12;
- res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2);
+ mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >>
+ PAGE_CACHE_SHIFT;
+ res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == NAT_ENTRIES) {
- mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> 12;
- res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2);
+ mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
+ PAGE_CACHE_SHIFT;
+ res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == DIRTY_DENTS) {
if (sbi->sb->s_bdi->dirty_exceeded)
return false;
mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
- res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 1);
+ res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+ } else if (type == INO_ENTRIES) {
+ int i;
+
+ if (sbi->sb->s_bdi->dirty_exceeded)
+ return false;
+ for (i = 0; i <= UPDATE_INO; i++)
+ mem_size += (sbi->im[i].ino_num *
+ sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
+ res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
}
return res;
}
@@ -131,7 +147,7 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
if (get_nat_flag(ne, IS_DIRTY))
return;
-retry:
+
head = radix_tree_lookup(&nm_i->nat_set_root, set);
if (!head) {
head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
@@ -140,11 +156,7 @@ retry:
INIT_LIST_HEAD(&head->set_list);
head->set = set;
head->entry_cnt = 0;
-
- if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
- cond_resched();
- goto retry;
- }
+ f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
}
list_move_tail(&ne->list, &head->entry_list);
nm_i->dirty_nat_cnt++;
@@ -155,7 +167,7 @@ retry:
static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
struct nat_entry *ne)
{
- nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
+ nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
struct nat_entry_set *head;
head = radix_tree_lookup(&nm_i->nat_set_root, set);
@@ -180,11 +192,11 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool is_cp = true;
- read_lock(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e && !get_nat_flag(e, IS_CHECKPOINTED))
is_cp = false;
- read_unlock(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
return is_cp;
}
@@ -194,11 +206,11 @@ bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
struct nat_entry *e;
bool fsynced = false;
- read_lock(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ino);
if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
fsynced = true;
- read_unlock(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
return fsynced;
}
@@ -208,13 +220,13 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
struct nat_entry *e;
bool need_update = true;
- read_lock(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ino);
if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
(get_nat_flag(e, IS_CHECKPOINTED) ||
get_nat_flag(e, HAS_FSYNCED_INODE)))
need_update = false;
- read_unlock(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
return need_update;
}
@@ -222,13 +234,8 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
{
struct nat_entry *new;
- new = kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
- if (!new)
- return NULL;
- if (radix_tree_insert(&nm_i->nat_root, nid, new)) {
- kmem_cache_free(nat_entry_slab, new);
- return NULL;
- }
+ new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
+ f2fs_radix_tree_insert(&nm_i->nat_root, nid, new);
memset(new, 0, sizeof(struct nat_entry));
nat_set_nid(new, nid);
nat_reset_flag(new);
@@ -241,18 +248,14 @@ static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
struct f2fs_nat_entry *ne)
{
struct nat_entry *e;
-retry:
- write_lock(&nm_i->nat_tree_lock);
+
+ down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (!e) {
e = grab_nat_entry(nm_i, nid);
- if (!e) {
- write_unlock(&nm_i->nat_tree_lock);
- goto retry;
- }
node_info_from_raw_nat(&e->ni, ne);
}
- write_unlock(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
}
static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
@@ -260,15 +263,11 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
-retry:
- write_lock(&nm_i->nat_tree_lock);
+
+ down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ni->nid);
if (!e) {
e = grab_nat_entry(nm_i, ni->nid);
- if (!e) {
- write_unlock(&nm_i->nat_tree_lock);
- goto retry;
- }
e->ni = *ni;
f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
} else if (new_blkaddr == NEW_ADDR) {
@@ -310,7 +309,7 @@ retry:
set_nat_flag(e, HAS_FSYNCED_INODE, true);
set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
}
- write_unlock(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
}
int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -320,7 +319,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
if (available_free_memory(sbi, NAT_ENTRIES))
return 0;
- write_lock(&nm_i->nat_tree_lock);
+ down_write(&nm_i->nat_tree_lock);
while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
struct nat_entry *ne;
ne = list_first_entry(&nm_i->nat_entries,
@@ -328,7 +327,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
__del_from_nat_cache(nm_i, ne);
nr_shrink--;
}
- write_unlock(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
return nr_shrink;
}
@@ -351,14 +350,14 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
ni->nid = nid;
/* Check nat cache */
- read_lock(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
}
- read_unlock(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
if (e)
return;
@@ -1298,16 +1297,22 @@ static int f2fs_write_node_page(struct page *page,
return 0;
}
- if (wbc->for_reclaim)
- goto redirty_out;
-
- down_read(&sbi->node_write);
+ if (wbc->for_reclaim) {
+ if (!down_read_trylock(&sbi->node_write))
+ goto redirty_out;
+ } else {
+ down_read(&sbi->node_write);
+ }
set_page_writeback(page);
write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page));
dec_page_count(sbi, F2FS_DIRTY_NODES);
up_read(&sbi->node_write);
unlock_page(page);
+
+ if (wbc->for_reclaim)
+ f2fs_submit_merged_bio(sbi, NODE, WRITE);
+
return 0;
redirty_out:
@@ -1410,13 +1415,13 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
if (build) {
/* do not add allocated nids */
- read_lock(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
ne = __lookup_nat_cache(nm_i, nid);
if (ne &&
(!get_nat_flag(ne, IS_CHECKPOINTED) ||
nat_get_blkaddr(ne) != NULL_ADDR))
allocated = true;
- read_unlock(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
if (allocated)
return 0;
}
@@ -1425,15 +1430,22 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
i->nid = nid;
i->state = NID_NEW;
+ if (radix_tree_preload(GFP_NOFS)) {
+ kmem_cache_free(free_nid_slab, i);
+ return 0;
+ }
+
spin_lock(&nm_i->free_nid_list_lock);
if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) {
spin_unlock(&nm_i->free_nid_list_lock);
+ radix_tree_preload_end();
kmem_cache_free(free_nid_slab, i);
return 0;
}
list_add_tail(&i->list, &nm_i->free_nid_list);
nm_i->fcnt++;
spin_unlock(&nm_i->free_nid_list_lock);
+ radix_tree_preload_end();
return 1;
}
@@ -1804,21 +1816,15 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
nid_t nid = le32_to_cpu(nid_in_journal(sum, i));
raw_ne = nat_in_journal(sum, i);
-retry:
- write_lock(&nm_i->nat_tree_lock);
- ne = __lookup_nat_cache(nm_i, nid);
- if (ne)
- goto found;
- ne = grab_nat_entry(nm_i, nid);
+ down_write(&nm_i->nat_tree_lock);
+ ne = __lookup_nat_cache(nm_i, nid);
if (!ne) {
- write_unlock(&nm_i->nat_tree_lock);
- goto retry;
+ ne = grab_nat_entry(nm_i, nid);
+ node_info_from_raw_nat(&ne->ni, &raw_ne);
}
- node_info_from_raw_nat(&ne->ni, &raw_ne);
-found:
__set_nat_cache_dirty(nm_i, ne);
- write_unlock(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
}
update_nats_in_cursum(sum, -i);
mutex_unlock(&curseg->curseg_mutex);
@@ -1889,10 +1895,10 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
}
raw_nat_from_node_info(raw_ne, &ne->ni);
- write_lock(&NM_I(sbi)->nat_tree_lock);
+ down_write(&NM_I(sbi)->nat_tree_lock);
nat_reset_flag(ne);
__clear_nat_cache_dirty(NM_I(sbi), ne);
- write_unlock(&NM_I(sbi)->nat_tree_lock);
+ up_write(&NM_I(sbi)->nat_tree_lock);
if (nat_get_blkaddr(ne) == NULL_ADDR)
add_free_nid(sbi, nid, false);
@@ -1903,10 +1909,10 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
else
f2fs_put_page(page, 1);
- if (!set->entry_cnt) {
- radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
- kmem_cache_free(nat_entry_set_slab, set);
- }
+ f2fs_bug_on(sbi, set->entry_cnt);
+
+ radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
+ kmem_cache_free(nat_entry_set_slab, set);
}
/*
@@ -1923,6 +1929,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
nid_t set_idx = 0;
LIST_HEAD(sets);
+ if (!nm_i->dirty_nat_cnt)
+ return;
/*
* if there are no enough space in journal to store dirty nat
* entries, remove all entries from journal and merge them
@@ -1931,9 +1939,6 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
remove_nats_in_journal(sbi);
- if (!nm_i->dirty_nat_cnt)
- return;
-
while ((found = __gang_lookup_nat_set(nm_i,
set_idx, NATVEC_SIZE, setvec))) {
unsigned idx;
@@ -1973,13 +1978,13 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
- INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
- INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
+ INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
+ INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
INIT_LIST_HEAD(&nm_i->nat_entries);
mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->free_nid_list_lock);
- rwlock_init(&nm_i->nat_tree_lock);
+ init_rwsem(&nm_i->nat_tree_lock);
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
@@ -2035,7 +2040,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
spin_unlock(&nm_i->free_nid_list_lock);
/* destroy nat cache */
- write_lock(&nm_i->nat_tree_lock);
+ down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_cache(nm_i,
nid, NATVEC_SIZE, natvec))) {
unsigned idx;
@@ -2044,7 +2049,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
__del_from_nat_cache(nm_i, natvec[idx]);
}
f2fs_bug_on(sbi, nm_i->nat_cnt);
- write_unlock(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
kfree(nm_i->nat_bitmap);
sbi->nm_info = NULL;
@@ -2061,17 +2066,17 @@ int __init create_node_manager_caches(void)
free_nid_slab = f2fs_kmem_cache_create("free_nid",
sizeof(struct free_nid));
if (!free_nid_slab)
- goto destory_nat_entry;
+ goto destroy_nat_entry;
nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
sizeof(struct nat_entry_set));
if (!nat_entry_set_slab)
- goto destory_free_nid;
+ goto destroy_free_nid;
return 0;
-destory_free_nid:
+destroy_free_nid:
kmem_cache_destroy(free_nid_slab);
-destory_nat_entry:
+destroy_nat_entry:
kmem_cache_destroy(nat_entry_slab);
fail:
return -ENOMEM;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 8d5e6e0dd840..d10b6448a671 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -106,7 +106,8 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
enum mem_type {
FREE_NIDS, /* indicates the free nid list */
NAT_ENTRIES, /* indicates the cached nat entry */
- DIRTY_DENTS /* indicates dirty dentry pages */
+ DIRTY_DENTS, /* indicates dirty dentry pages */
+ INO_ENTRIES, /* indicates inode entries */
};
struct nat_entry_set {
@@ -192,10 +193,7 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
{
unsigned int block_off = NAT_BLOCK_OFFSET(start_nid);
- if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
- f2fs_clear_bit(block_off, nm_i->nat_bitmap);
- else
- f2fs_set_bit(block_off, nm_i->nat_bitmap);
+ f2fs_change_bit(block_off, nm_i->nat_bitmap);
}
static inline void fill_node_footer(struct page *page, nid_t nid,
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index ebd013225788..9160a37e1c7a 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -111,7 +111,7 @@ retry:
iput(einode);
goto out_unmap_put;
}
- f2fs_delete_entry(de, page, einode);
+ f2fs_delete_entry(de, page, dir, einode);
iput(einode);
goto retry;
}
@@ -129,7 +129,7 @@ retry:
goto out;
out_unmap_put:
- kunmap(page);
+ f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
out_err:
iput(dir);
@@ -170,13 +170,15 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+ ra_meta_pages(sbi, blkaddr, 1, META_POR);
+
while (1) {
struct fsync_inode_entry *entry;
if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
return 0;
- page = get_meta_page_ra(sbi, blkaddr);
+ page = get_meta_page(sbi, blkaddr);
if (cp_ver != cpver_of_node(page))
break;
@@ -227,6 +229,8 @@ next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
+
+ ra_meta_pages_cond(sbi, blkaddr);
}
f2fs_put_page(page, 1);
return err;
@@ -436,7 +440,9 @@ static int recover_data(struct f2fs_sb_info *sbi,
if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
break;
- page = get_meta_page_ra(sbi, blkaddr);
+ ra_meta_pages_cond(sbi, blkaddr);
+
+ page = get_meta_page(sbi, blkaddr);
if (cp_ver != cpver_of_node(page)) {
f2fs_put_page(page, 1);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 923cb76fdc46..42607a679923 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -178,17 +178,47 @@ void register_inmem_page(struct inode *inode, struct page *page)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
struct inmem_pages *new;
+ int err;
+
+ SetPagePrivate(page);
new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
/* add atomic page indices to the list */
new->page = page;
INIT_LIST_HEAD(&new->list);
-
+retry:
/* increase reference count with clean state */
mutex_lock(&fi->inmem_lock);
+ err = radix_tree_insert(&fi->inmem_root, page->index, new);
+ if (err == -EEXIST) {
+ mutex_unlock(&fi->inmem_lock);
+ kmem_cache_free(inmem_entry_slab, new);
+ return;
+ } else if (err) {
+ mutex_unlock(&fi->inmem_lock);
+ goto retry;
+ }
get_page(page);
list_add_tail(&new->list, &fi->inmem_pages);
+ inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+ mutex_unlock(&fi->inmem_lock);
+}
+
+void invalidate_inmem_page(struct inode *inode, struct page *page)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct inmem_pages *cur;
+
+ mutex_lock(&fi->inmem_lock);
+ cur = radix_tree_lookup(&fi->inmem_root, page->index);
+ if (cur) {
+ radix_tree_delete(&fi->inmem_root, cur->page->index);
+ f2fs_put_page(cur->page, 0);
+ list_del(&cur->list);
+ kmem_cache_free(inmem_entry_slab, cur);
+ dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+ }
mutex_unlock(&fi->inmem_lock);
}
@@ -203,7 +233,16 @@ void commit_inmem_pages(struct inode *inode, bool abort)
.rw = WRITE_SYNC,
};
- f2fs_balance_fs(sbi);
+ /*
+ * The abort is true only when f2fs_evict_inode is called.
+ * Basically, the f2fs_evict_inode doesn't produce any data writes, so
+ * that we don't need to call f2fs_balance_fs.
+ * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this
+ * inode becomes free by iget_locked in f2fs_iget.
+ */
+ if (!abort)
+ f2fs_balance_fs(sbi);
+
f2fs_lock_op(sbi);
mutex_lock(&fi->inmem_lock);
@@ -216,9 +255,11 @@ void commit_inmem_pages(struct inode *inode, bool abort)
do_write_data_page(cur->page, &fio);
submit_bio = true;
}
+ radix_tree_delete(&fi->inmem_root, cur->page->index);
f2fs_put_page(cur->page, 1);
list_del(&cur->list);
kmem_cache_free(inmem_entry_slab, cur);
+ dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
}
if (submit_bio)
f2fs_submit_merged_bio(sbi, DATA, WRITE);
@@ -248,7 +289,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
{
/* check the # of cached NAT entries and prefree segments */
if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
- excess_prefree_segs(sbi))
+ excess_prefree_segs(sbi) ||
+ available_free_memory(sbi, INO_ENTRIES))
f2fs_sync_fs(sbi->sb, true);
}
@@ -441,10 +483,33 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
}
}
-static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+static void __add_discard_entry(struct f2fs_sb_info *sbi,
+ struct cp_control *cpc, unsigned int start, unsigned int end)
{
struct list_head *head = &SM_I(sbi)->discard_list;
- struct discard_entry *new;
+ struct discard_entry *new, *last;
+
+ if (!list_empty(head)) {
+ last = list_last_entry(head, struct discard_entry, list);
+ if (START_BLOCK(sbi, cpc->trim_start) + start ==
+ last->blkaddr + last->len) {
+ last->len += end - start;
+ goto done;
+ }
+ }
+
+ new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
+ INIT_LIST_HEAD(&new->list);
+ new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
+ new->len = end - start;
+ list_add_tail(&new->list, head);
+done:
+ SM_I(sbi)->nr_discards += end - start;
+ cpc->trimmed += end - start;
+}
+
+static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+{
int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
int max_blocks = sbi->blocks_per_seg;
struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
@@ -473,13 +538,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
}
mutex_unlock(&dirty_i->seglist_lock);
- new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
- INIT_LIST_HEAD(&new->list);
- new->blkaddr = START_BLOCK(sbi, cpc->trim_start);
- new->len = sbi->blocks_per_seg;
- list_add_tail(&new->list, head);
- SM_I(sbi)->nr_discards += sbi->blocks_per_seg;
- cpc->trimmed += sbi->blocks_per_seg;
+ __add_discard_entry(sbi, cpc, 0, sbi->blocks_per_seg);
return;
}
@@ -489,7 +548,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
for (i = 0; i < entries; i++)
- dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
+ dmap[i] = ~(cur_map[i] | ckpt_map[i]);
while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
start = __find_rev_next_bit(dmap, max_blocks, end + 1);
@@ -501,14 +560,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (end - start < cpc->trim_minlen)
continue;
- new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
- INIT_LIST_HEAD(&new->list);
- new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
- new->len = end - start;
- cpc->trimmed += end - start;
-
- list_add_tail(&new->list, head);
- SM_I(sbi)->nr_discards += end - start;
+ __add_discard_entry(sbi, cpc, start, end);
}
}
@@ -620,10 +672,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
/* Update valid block bitmap */
if (del > 0) {
- if (f2fs_set_bit(offset, se->cur_valid_map))
+ if (f2fs_test_and_set_bit(offset, se->cur_valid_map))
f2fs_bug_on(sbi, 1);
} else {
- if (!f2fs_clear_bit(offset, se->cur_valid_map))
+ if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map))
f2fs_bug_on(sbi, 1);
}
if (!f2fs_test_bit(offset, se->ckpt_valid_map))
@@ -1004,6 +1056,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
range->len < sbi->blocksize)
return -EINVAL;
+ cpc.trimmed = 0;
if (end <= MAIN_BLKADDR(sbi))
goto out;
@@ -1015,10 +1068,11 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
cpc.trim_start = start_segno;
cpc.trim_end = end_segno;
cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
- cpc.trimmed = 0;
/* do checkpoint to issue discard commands safely */
+ mutex_lock(&sbi->gc_mutex);
write_checkpoint(sbi, &cpc);
+ mutex_unlock(&sbi->gc_mutex);
out:
range->len = cpc.trimmed << sbi->log_blocksize;
return 0;
@@ -1050,8 +1104,8 @@ static int __get_segment_type_4(struct page *page, enum page_type p_type)
else
return CURSEG_COLD_DATA;
} else {
- if (IS_DNODE(page) && !is_cold_node(page))
- return CURSEG_HOT_NODE;
+ if (IS_DNODE(page) && is_cold_node(page))
+ return CURSEG_WARM_NODE;
else
return CURSEG_COLD_NODE;
}
@@ -1524,17 +1578,7 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
unsigned int segno)
{
- struct sit_info *sit_i = SIT_I(sbi);
- unsigned int offset = SIT_BLOCK_OFFSET(segno);
- block_t blk_addr = sit_i->sit_base_addr + offset;
-
- check_seg_range(sbi, segno);
-
- /* calculate sit block address */
- if (f2fs_test_bit(offset, sit_i->sit_bitmap))
- blk_addr += sit_i->sit_blocks;
-
- return get_meta_page(sbi, blk_addr);
+ return get_meta_page(sbi, current_sit_addr(sbi, segno));
}
static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
@@ -1687,7 +1731,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* #2, flush sit entries to sit page.
*/
list_for_each_entry_safe(ses, tmp, head, set_list) {
- struct page *page;
+ struct page *page = NULL;
struct f2fs_sit_block *raw_sit = NULL;
unsigned int start_segno = ses->start_segno;
unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK,
@@ -2200,7 +2244,7 @@ int __init create_segment_manager_caches(void)
goto fail;
sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
- sizeof(struct nat_entry_set));
+ sizeof(struct sit_entry_set));
if (!sit_entry_set_slab)
goto destory_discard_entry;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 2495bec1c621..7f327c0ba4e3 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -657,10 +657,7 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
{
unsigned int block_off = SIT_BLOCK_OFFSET(start);
- if (f2fs_test_bit(block_off, sit_i->sit_bitmap))
- f2fs_clear_bit(block_off, sit_i->sit_bitmap);
- else
- f2fs_set_bit(block_off, sit_i->sit_bitmap);
+ f2fs_change_bit(block_off, sit_i->sit_bitmap);
}
static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi)
@@ -714,6 +711,9 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
*/
static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
{
+ if (sbi->sb->s_bdi->dirty_exceeded)
+ return 0;
+
if (type == DATA)
return sbi->blocks_per_seg;
else if (type == NODE)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 41d6f700f4ee..f71421d70475 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -51,8 +51,10 @@ enum {
Opt_disable_ext_identify,
Opt_inline_xattr,
Opt_inline_data,
+ Opt_inline_dentry,
Opt_flush_merge,
Opt_nobarrier,
+ Opt_fastboot,
Opt_err,
};
@@ -69,8 +71,10 @@ static match_table_t f2fs_tokens = {
{Opt_disable_ext_identify, "disable_ext_identify"},
{Opt_inline_xattr, "inline_xattr"},
{Opt_inline_data, "inline_data"},
+ {Opt_inline_dentry, "inline_dentry"},
{Opt_flush_merge, "flush_merge"},
{Opt_nobarrier, "nobarrier"},
+ {Opt_fastboot, "fastboot"},
{Opt_err, NULL},
};
@@ -340,12 +344,18 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_inline_data:
set_opt(sbi, INLINE_DATA);
break;
+ case Opt_inline_dentry:
+ set_opt(sbi, INLINE_DENTRY);
+ break;
case Opt_flush_merge:
set_opt(sbi, FLUSH_MERGE);
break;
case Opt_nobarrier:
set_opt(sbi, NOBARRIER);
break;
+ case Opt_fastboot:
+ set_opt(sbi, FASTBOOT);
+ break;
default:
f2fs_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" or missing value",
@@ -373,6 +383,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
fi->i_advise = 0;
rwlock_init(&fi->ext.ext_lock);
init_rwsem(&fi->i_sem);
+ INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS);
INIT_LIST_HEAD(&fi->inmem_pages);
mutex_init(&fi->inmem_lock);
@@ -473,9 +484,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
trace_f2fs_sync_fs(sb, sync);
if (sync) {
- struct cp_control cpc = {
- .reason = CP_SYNC,
- };
+ struct cp_control cpc;
+
+ cpc.reason = test_opt(sbi, FASTBOOT) ? CP_UMOUNT : CP_SYNC;
mutex_lock(&sbi->gc_mutex);
write_checkpoint(sbi, &cpc);
mutex_unlock(&sbi->gc_mutex);
@@ -562,10 +573,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",disable_ext_identify");
if (test_opt(sbi, INLINE_DATA))
seq_puts(seq, ",inline_data");
+ if (test_opt(sbi, INLINE_DENTRY))
+ seq_puts(seq, ",inline_dentry");
if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE))
seq_puts(seq, ",flush_merge");
if (test_opt(sbi, NOBARRIER))
seq_puts(seq, ",nobarrier");
+ if (test_opt(sbi, FASTBOOT))
+ seq_puts(seq, ",fastboot");
seq_printf(seq, ",active_logs=%u", sbi->active_logs);
return 0;
@@ -654,7 +669,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
f2fs_sync_fs(sb, 1);
need_restart_gc = true;
}
- } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) {
+ } else if (!sbi->gc_thread) {
err = start_gc_thread(sbi);
if (err)
goto restore_opts;
@@ -667,7 +682,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
*/
if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
destroy_flush_cmd_control(sbi);
- } else if (test_opt(sbi, FLUSH_MERGE) && !SM_I(sbi)->cmd_control_info) {
+ } else if (!SM_I(sbi)->cmd_control_info) {
err = create_flush_cmd_control(sbi);
if (err)
goto restore_gc;
@@ -922,7 +937,7 @@ retry:
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
{
struct f2fs_sb_info *sbi;
- struct f2fs_super_block *raw_super;
+ struct f2fs_super_block *raw_super = NULL;
struct buffer_head *raw_super_buf;
struct inode *root;
long err = -EINVAL;
@@ -1123,7 +1138,7 @@ try_onemore:
* If filesystem is not mounted as read-only then
* do start the gc_thread.
*/
- if (!f2fs_readonly(sb)) {
+ if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) {
/* After POR, we can run background GC thread.*/
err = start_gc_thread(sbi);
if (err)
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index deca8728117b..5072bf9ae0ef 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -83,7 +83,7 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
}
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_getxattr(dentry->d_inode, type, name, buffer, size);
+ return f2fs_getxattr(dentry->d_inode, type, name, buffer, size, NULL);
}
static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
@@ -398,7 +398,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
}
int f2fs_getxattr(struct inode *inode, int index, const char *name,
- void *buffer, size_t buffer_size)
+ void *buffer, size_t buffer_size, struct page *ipage)
{
struct f2fs_xattr_entry *entry;
void *base_addr;
@@ -412,7 +412,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
if (len > F2FS_NAME_LEN)
return -ERANGE;
- base_addr = read_all_xattrs(inode, NULL);
+ base_addr = read_all_xattrs(inode, ipage);
if (!base_addr)
return -ENOMEM;
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 34ab7dbcf5e3..969d792ca362 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -115,7 +115,8 @@ extern const struct xattr_handler *f2fs_xattr_handlers[];
extern int f2fs_setxattr(struct inode *, int, const char *,
const void *, size_t, struct page *, int);
-extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t);
+extern int f2fs_getxattr(struct inode *, int, const char *, void *,
+ size_t, struct page *);
extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
#else
@@ -126,7 +127,8 @@ static inline int f2fs_setxattr(struct inode *inode, int index,
return -EOPNOTSUPP;
}
static inline int f2fs_getxattr(struct inode *inode, int index,
- const char *name, void *buffer, size_t buffer_size)
+ const char *name, void *buffer,
+ size_t buffer_size, struct page *dpage)
{
return -EOPNOTSUPP;
}
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 3963ede84eb0..c5d6bb939d19 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -702,10 +702,11 @@ static int fat_readdir(struct file *file, struct dir_context *ctx)
}
#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \
-static int func(void *__buf, const char *name, int name_len, \
+static int func(struct dir_context *ctx, const char *name, int name_len, \
loff_t offset, u64 ino, unsigned int d_type) \
{ \
- struct fat_ioctl_filldir_callback *buf = __buf; \
+ struct fat_ioctl_filldir_callback *buf = \
+ container_of(ctx, struct fat_ioctl_filldir_callback, ctx); \
struct dirent_type __user *d1 = buf->dirent; \
struct dirent_type __user *d2 = d1 + 1; \
\
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e0c4ba39a377..64e295e8ff38 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -370,6 +370,7 @@ extern int fat_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
/* fat/inode.c */
+extern int fat_block_truncate_page(struct inode *inode, loff_t from);
extern void fat_attach(struct inode *inode, loff_t i_pos);
extern void fat_detach(struct inode *inode);
extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 85f79a89e747..8429c68e3057 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -443,6 +443,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
+ error = fat_block_truncate_page(inode, attr->ia_size);
+ if (error)
+ goto out;
down_write(&MSDOS_I(inode)->truncate_lock);
truncate_setsize(inode, attr->ia_size);
fat_truncate_blocks(inode, attr->ia_size);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 756aead10d96..7b41a2dcdd76 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -294,6 +294,18 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
return blocknr;
}
+/*
+ * fat_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This is required during truncate to physically zeroout the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ * Also, avoid causing failure from fsx for cases of "data past EOF"
+ */
+int fat_block_truncate_page(struct inode *inode, loff_t from)
+{
+ return block_truncate_page(inode->i_mapping, from, fat_get_block);
+}
+
static const struct address_space_operations fat_aops = {
.readpage = fat_readpage,
.readpages = fat_readpages,
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6df8d3d885e5..b8b92c2f9683 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -736,7 +736,12 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
}
alias = d_find_alias(inode);
- if (alias && !vfat_d_anon_disconn(alias)) {
+ /*
+ * Checking "alias->d_parent == dentry->d_parent" to make sure
+ * FS is not corrupted (especially double linked dir).
+ */
+ if (alias && alias->d_parent == dentry->d_parent &&
+ !vfat_d_anon_disconn(alias)) {
/*
* This inode has non anonymous-DCACHE_DISCONNECTED
* dentry. This means, the user did ->lookup() by an
@@ -755,12 +760,9 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
- dentry->d_time = dentry->d_parent->d_inode->i_version;
- dentry = d_splice_alias(inode, dentry);
- if (dentry)
- dentry->d_time = dentry->d_parent->d_inode->i_version;
- return dentry;
-
+ if (!inode)
+ dentry->d_time = dir->i_version;
+ return d_splice_alias(inode, dentry);
error:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
return ERR_PTR(err);
@@ -793,7 +795,6 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
- dentry->d_time = dentry->d_parent->d_inode->i_version;
d_instantiate(dentry, inode);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -824,6 +825,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
clear_nlink(inode);
inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
fat_detach(inode);
+ dentry->d_time = dir->i_version;
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -849,6 +851,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
clear_nlink(inode);
inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
fat_detach(inode);
+ dentry->d_time = dir->i_version;
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -889,7 +892,6 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
- dentry->d_time = dentry->d_parent->d_inode->i_version;
d_instantiate(dentry, inode);
mutex_unlock(&MSDOS_SB(sb)->s_lock);
diff --git a/fs/file.c b/fs/file.c
index ab3eb6a88239..ee738ea028fa 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -869,7 +869,7 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
struct file *file = fget_raw(fildes);
if (file) {
- ret = get_unused_fd();
+ ret = get_unused_fd_flags(0);
if (ret >= 0)
fd_install(ret, file);
else
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ef9bef118342..2d609a5fbfea 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -479,12 +479,28 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* write_inode()
*/
spin_lock(&inode->i_lock);
- /* Clear I_DIRTY_PAGES if we've written out all dirty pages */
- if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- inode->i_state &= ~I_DIRTY_PAGES;
+
dirty = inode->i_state & I_DIRTY;
- inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ inode->i_state &= ~I_DIRTY;
+
+ /*
+ * Paired with smp_mb() in __mark_inode_dirty(). This allows
+ * __mark_inode_dirty() to test i_state without grabbing i_lock -
+ * either they see the I_DIRTY bits cleared or we see the dirtied
+ * inode.
+ *
+ * I_DIRTY_PAGES is always cleared together above even if @mapping
+ * still has dirty pages. The flag is reinstated after smp_mb() if
+ * necessary. This guarantees that either __mark_inode_dirty()
+ * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
+ */
+ smp_mb();
+
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ inode->i_state |= I_DIRTY_PAGES;
+
spin_unlock(&inode->i_lock);
+
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
int err = write_inode(inode, wbc);
@@ -1148,12 +1164,11 @@ void __mark_inode_dirty(struct inode *inode, int flags)
}
/*
- * make sure that changes are seen by all cpus before we test i_state
- * -- mikulas
+ * Paired with smp_mb() in __writeback_single_inode() for the
+ * following lockless i_state test. See there for details.
*/
smp_mb();
- /* avoid the locking if we can */
if ((inode->i_state & flags) == flags)
return;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 966ace8b243f..28d0c7abba1c 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -415,7 +415,7 @@ err_unlock:
err_region:
unregister_chrdev_region(devt, 1);
err:
- fuse_conn_kill(fc);
+ fuse_abort_conn(fc);
goto out;
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ca887314aba9..ba1107977f2e 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -511,6 +511,35 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
}
EXPORT_SYMBOL_GPL(fuse_request_send);
+ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
+{
+ struct fuse_req *req;
+ ssize_t ret;
+
+ req = fuse_get_req(fc, 0);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ req->in.h.opcode = args->in.h.opcode;
+ req->in.h.nodeid = args->in.h.nodeid;
+ req->in.numargs = args->in.numargs;
+ memcpy(req->in.args, args->in.args,
+ args->in.numargs * sizeof(struct fuse_in_arg));
+ req->out.argvar = args->out.argvar;
+ req->out.numargs = args->out.numargs;
+ memcpy(req->out.args, args->out.args,
+ args->out.numargs * sizeof(struct fuse_arg));
+ fuse_request_send(fc, req);
+ ret = req->out.h.error;
+ if (!ret && args->out.argvar) {
+ BUG_ON(args->out.numargs != 1);
+ ret = req->out.args[0].size;
+ }
+ fuse_put_request(fc, req);
+
+ return ret;
+}
+
static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
struct fuse_req *req)
{
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index dbab798f5caf..252b8a5de8b5 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -145,22 +145,22 @@ static void fuse_invalidate_entry(struct dentry *entry)
fuse_invalidate_entry_cache(entry);
}
-static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_req *req,
+static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
u64 nodeid, struct qstr *name,
struct fuse_entry_out *outarg)
{
memset(outarg, 0, sizeof(struct fuse_entry_out));
- req->in.h.opcode = FUSE_LOOKUP;
- req->in.h.nodeid = nodeid;
- req->in.numargs = 1;
- req->in.args[0].size = name->len + 1;
- req->in.args[0].value = name->name;
- req->out.numargs = 1;
+ args->in.h.opcode = FUSE_LOOKUP;
+ args->in.h.nodeid = nodeid;
+ args->in.numargs = 1;
+ args->in.args[0].size = name->len + 1;
+ args->in.args[0].value = name->name;
+ args->out.numargs = 1;
if (fc->minor < 9)
- req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
+ args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
else
- req->out.args[0].size = sizeof(struct fuse_entry_out);
- req->out.args[0].value = outarg;
+ args->out.args[0].size = sizeof(struct fuse_entry_out);
+ args->out.args[0].value = outarg;
}
u64 fuse_get_attr_version(struct fuse_conn *fc)
@@ -200,9 +200,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
goto invalid;
else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) ||
(flags & LOOKUP_REVAL)) {
- int err;
struct fuse_entry_out outarg;
- struct fuse_req *req;
+ FUSE_ARGS(args);
struct fuse_forget_link *forget;
u64 attr_version;
@@ -215,31 +214,23 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
goto out;
fc = get_fuse_conn(inode);
- req = fuse_get_req_nopages(fc);
- ret = PTR_ERR(req);
- if (IS_ERR(req))
- goto out;
forget = fuse_alloc_forget();
- if (!forget) {
- fuse_put_request(fc, req);
- ret = -ENOMEM;
+ ret = -ENOMEM;
+ if (!forget)
goto out;
- }
attr_version = fuse_get_attr_version(fc);
parent = dget_parent(entry);
- fuse_lookup_init(fc, req, get_node_id(parent->d_inode),
+ fuse_lookup_init(fc, &args, get_node_id(parent->d_inode),
&entry->d_name, &outarg);
- fuse_request_send(fc, req);
+ ret = fuse_simple_request(fc, &args);
dput(parent);
- err = req->out.h.error;
- fuse_put_request(fc, req);
/* Zero nodeid is same as -ENOENT */
- if (!err && !outarg.nodeid)
- err = -ENOENT;
- if (!err) {
+ if (!ret && !outarg.nodeid)
+ ret = -ENOENT;
+ if (!ret) {
fi = get_fuse_inode(inode);
if (outarg.nodeid != get_node_id(inode)) {
fuse_queue_forget(fc, forget, outarg.nodeid, 1);
@@ -250,7 +241,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
spin_unlock(&fc->lock);
}
kfree(forget);
- if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
+ if (ret == -ENOMEM)
+ goto out;
+ if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
goto invalid;
fuse_change_attributes(inode, &outarg.attr,
@@ -296,7 +289,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode)
{
struct fuse_conn *fc = get_fuse_conn_super(sb);
- struct fuse_req *req;
+ FUSE_ARGS(args);
struct fuse_forget_link *forget;
u64 attr_version;
int err;
@@ -306,24 +299,16 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
if (name->len > FUSE_NAME_MAX)
goto out;
- req = fuse_get_req_nopages(fc);
- err = PTR_ERR(req);
- if (IS_ERR(req))
- goto out;
forget = fuse_alloc_forget();
err = -ENOMEM;
- if (!forget) {
- fuse_put_request(fc, req);
+ if (!forget)
goto out;
- }
attr_version = fuse_get_attr_version(fc);
- fuse_lookup_init(fc, req, nodeid, name, outarg);
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ fuse_lookup_init(fc, &args, nodeid, name, outarg);
+ err = fuse_simple_request(fc, &args);
/* Zero nodeid is same as -ENOENT, but with valid timeout */
if (err || !outarg->nodeid)
goto out_put_forget;
@@ -372,7 +357,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
if (inode && get_node_id(inode) == FUSE_ROOT_ID)
goto out_iput;
- newent = d_materialise_unique(entry, inode);
+ newent = d_splice_alias(inode, entry);
err = PTR_ERR(newent);
if (IS_ERR(newent))
goto out_err;
@@ -405,7 +390,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
int err;
struct inode *inode;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req;
+ FUSE_ARGS(args);
struct fuse_forget_link *forget;
struct fuse_create_in inarg;
struct fuse_open_out outopen;
@@ -420,15 +405,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
if (!forget)
goto out_err;
- req = fuse_get_req_nopages(fc);
- err = PTR_ERR(req);
- if (IS_ERR(req))
- goto out_put_forget_req;
-
err = -ENOMEM;
ff = fuse_file_alloc(fc);
if (!ff)
- goto out_put_request;
+ goto out_put_forget_req;
if (!fc->dont_mask)
mode &= ~current_umask();
@@ -439,24 +419,23 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
inarg.flags = flags;
inarg.mode = mode;
inarg.umask = current_umask();
- req->in.h.opcode = FUSE_CREATE;
- req->in.h.nodeid = get_node_id(dir);
- req->in.numargs = 2;
- req->in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) :
+ args.in.h.opcode = FUSE_CREATE;
+ args.in.h.nodeid = get_node_id(dir);
+ args.in.numargs = 2;
+ args.in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) :
sizeof(inarg);
- req->in.args[0].value = &inarg;
- req->in.args[1].size = entry->d_name.len + 1;
- req->in.args[1].value = entry->d_name.name;
- req->out.numargs = 2;
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = entry->d_name.len + 1;
+ args.in.args[1].value = entry->d_name.name;
+ args.out.numargs = 2;
if (fc->minor < 9)
- req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
+ args.out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
else
- req->out.args[0].size = sizeof(outentry);
- req->out.args[0].value = &outentry;
- req->out.args[1].size = sizeof(outopen);
- req->out.args[1].value = &outopen;
- fuse_request_send(fc, req);
- err = req->out.h.error;
+ args.out.args[0].size = sizeof(outentry);
+ args.out.args[0].value = &outentry;
+ args.out.args[1].size = sizeof(outopen);
+ args.out.args[1].value = &outopen;
+ err = fuse_simple_request(fc, &args);
if (err)
goto out_free_ff;
@@ -464,7 +443,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid))
goto out_free_ff;
- fuse_put_request(fc, req);
ff->fh = outopen.fh;
ff->nodeid = outentry.nodeid;
ff->open_flags = outopen.open_flags;
@@ -492,8 +470,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
out_free_ff:
fuse_file_free(ff);
-out_put_request:
- fuse_put_request(fc, req);
out_put_forget_req:
kfree(forget);
out_err:
@@ -547,7 +523,7 @@ no_open:
/*
* Code shared between mknod, mkdir, symlink and link
*/
-static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
+static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
struct inode *dir, struct dentry *entry,
umode_t mode)
{
@@ -557,22 +533,18 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
struct fuse_forget_link *forget;
forget = fuse_alloc_forget();
- if (!forget) {
- fuse_put_request(fc, req);
+ if (!forget)
return -ENOMEM;
- }
memset(&outarg, 0, sizeof(outarg));
- req->in.h.nodeid = get_node_id(dir);
- req->out.numargs = 1;
+ args->in.h.nodeid = get_node_id(dir);
+ args->out.numargs = 1;
if (fc->minor < 9)
- req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
+ args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
else
- req->out.args[0].size = sizeof(outarg);
- req->out.args[0].value = &outarg;
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ args->out.args[0].size = sizeof(outarg);
+ args->out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, args);
if (err)
goto out_put_forget_req;
@@ -609,9 +581,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
{
struct fuse_mknod_in inarg;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ FUSE_ARGS(args);
if (!fc->dont_mask)
mode &= ~current_umask();
@@ -620,14 +590,14 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
inarg.mode = mode;
inarg.rdev = new_encode_dev(rdev);
inarg.umask = current_umask();
- req->in.h.opcode = FUSE_MKNOD;
- req->in.numargs = 2;
- req->in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE :
+ args.in.h.opcode = FUSE_MKNOD;
+ args.in.numargs = 2;
+ args.in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE :
sizeof(inarg);
- req->in.args[0].value = &inarg;
- req->in.args[1].size = entry->d_name.len + 1;
- req->in.args[1].value = entry->d_name.name;
- return create_new_entry(fc, req, dir, entry, mode);
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = entry->d_name.len + 1;
+ args.in.args[1].value = entry->d_name.name;
+ return create_new_entry(fc, &args, dir, entry, mode);
}
static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
@@ -640,9 +610,7 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
{
struct fuse_mkdir_in inarg;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ FUSE_ARGS(args);
if (!fc->dont_mask)
mode &= ~current_umask();
@@ -650,13 +618,13 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
memset(&inarg, 0, sizeof(inarg));
inarg.mode = mode;
inarg.umask = current_umask();
- req->in.h.opcode = FUSE_MKDIR;
- req->in.numargs = 2;
- req->in.args[0].size = sizeof(inarg);
- req->in.args[0].value = &inarg;
- req->in.args[1].size = entry->d_name.len + 1;
- req->in.args[1].value = entry->d_name.name;
- return create_new_entry(fc, req, dir, entry, S_IFDIR);
+ args.in.h.opcode = FUSE_MKDIR;
+ args.in.numargs = 2;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = entry->d_name.len + 1;
+ args.in.args[1].value = entry->d_name.name;
+ return create_new_entry(fc, &args, dir, entry, S_IFDIR);
}
static int fuse_symlink(struct inode *dir, struct dentry *entry,
@@ -664,17 +632,15 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
{
struct fuse_conn *fc = get_fuse_conn(dir);
unsigned len = strlen(link) + 1;
- struct fuse_req *req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ FUSE_ARGS(args);
- req->in.h.opcode = FUSE_SYMLINK;
- req->in.numargs = 2;
- req->in.args[0].size = entry->d_name.len + 1;
- req->in.args[0].value = entry->d_name.name;
- req->in.args[1].size = len;
- req->in.args[1].value = link;
- return create_new_entry(fc, req, dir, entry, S_IFLNK);
+ args.in.h.opcode = FUSE_SYMLINK;
+ args.in.numargs = 2;
+ args.in.args[0].size = entry->d_name.len + 1;
+ args.in.args[0].value = entry->d_name.name;
+ args.in.args[1].size = len;
+ args.in.args[1].value = link;
+ return create_new_entry(fc, &args, dir, entry, S_IFLNK);
}
static inline void fuse_update_ctime(struct inode *inode)
@@ -689,18 +655,14 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
{
int err;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- req->in.h.opcode = FUSE_UNLINK;
- req->in.h.nodeid = get_node_id(dir);
- req->in.numargs = 1;
- req->in.args[0].size = entry->d_name.len + 1;
- req->in.args[0].value = entry->d_name.name;
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ FUSE_ARGS(args);
+
+ args.in.h.opcode = FUSE_UNLINK;
+ args.in.h.nodeid = get_node_id(dir);
+ args.in.numargs = 1;
+ args.in.args[0].size = entry->d_name.len + 1;
+ args.in.args[0].value = entry->d_name.name;
+ err = fuse_simple_request(fc, &args);
if (!err) {
struct inode *inode = entry->d_inode;
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -729,18 +691,14 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
{
int err;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- req->in.h.opcode = FUSE_RMDIR;
- req->in.h.nodeid = get_node_id(dir);
- req->in.numargs = 1;
- req->in.args[0].size = entry->d_name.len + 1;
- req->in.args[0].value = entry->d_name.name;
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ FUSE_ARGS(args);
+
+ args.in.h.opcode = FUSE_RMDIR;
+ args.in.h.nodeid = get_node_id(dir);
+ args.in.numargs = 1;
+ args.in.args[0].size = entry->d_name.len + 1;
+ args.in.args[0].value = entry->d_name.name;
+ err = fuse_simple_request(fc, &args);
if (!err) {
clear_nlink(entry->d_inode);
fuse_invalidate_attr(dir);
@@ -757,27 +715,21 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
int err;
struct fuse_rename2_in inarg;
struct fuse_conn *fc = get_fuse_conn(olddir);
- struct fuse_req *req;
-
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ FUSE_ARGS(args);
memset(&inarg, 0, argsize);
inarg.newdir = get_node_id(newdir);
inarg.flags = flags;
- req->in.h.opcode = opcode;
- req->in.h.nodeid = get_node_id(olddir);
- req->in.numargs = 3;
- req->in.args[0].size = argsize;
- req->in.args[0].value = &inarg;
- req->in.args[1].size = oldent->d_name.len + 1;
- req->in.args[1].value = oldent->d_name.name;
- req->in.args[2].size = newent->d_name.len + 1;
- req->in.args[2].value = newent->d_name.name;
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ args.in.h.opcode = opcode;
+ args.in.h.nodeid = get_node_id(olddir);
+ args.in.numargs = 3;
+ args.in.args[0].size = argsize;
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = oldent->d_name.len + 1;
+ args.in.args[1].value = oldent->d_name.name;
+ args.in.args[2].size = newent->d_name.len + 1;
+ args.in.args[2].value = newent->d_name.name;
+ err = fuse_simple_request(fc, &args);
if (!err) {
/* ctime changes */
fuse_invalidate_attr(oldent->d_inode);
@@ -849,19 +801,17 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
struct fuse_link_in inarg;
struct inode *inode = entry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ FUSE_ARGS(args);
memset(&inarg, 0, sizeof(inarg));
inarg.oldnodeid = get_node_id(inode);
- req->in.h.opcode = FUSE_LINK;
- req->in.numargs = 2;
- req->in.args[0].size = sizeof(inarg);
- req->in.args[0].value = &inarg;
- req->in.args[1].size = newent->d_name.len + 1;
- req->in.args[1].value = newent->d_name.name;
- err = create_new_entry(fc, req, newdir, newent, inode->i_mode);
+ args.in.h.opcode = FUSE_LINK;
+ args.in.numargs = 2;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = newent->d_name.len + 1;
+ args.in.args[1].value = newent->d_name.name;
+ err = create_new_entry(fc, &args, newdir, newent, inode->i_mode);
/* Contrary to "normal" filesystems it can happen that link
makes two "logical" inodes point to the same "physical"
inode. We invalidate the attributes of the old one, so it
@@ -929,13 +879,9 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
struct fuse_getattr_in inarg;
struct fuse_attr_out outarg;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req;
+ FUSE_ARGS(args);
u64 attr_version;
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
attr_version = fuse_get_attr_version(fc);
memset(&inarg, 0, sizeof(inarg));
@@ -947,20 +893,18 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
inarg.getattr_flags |= FUSE_GETATTR_FH;
inarg.fh = ff->fh;
}
- req->in.h.opcode = FUSE_GETATTR;
- req->in.h.nodeid = get_node_id(inode);
- req->in.numargs = 1;
- req->in.args[0].size = sizeof(inarg);
- req->in.args[0].value = &inarg;
- req->out.numargs = 1;
+ args.in.h.opcode = FUSE_GETATTR;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.out.numargs = 1;
if (fc->minor < 9)
- req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
+ args.out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
else
- req->out.args[0].size = sizeof(outarg);
- req->out.args[0].value = &outarg;
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
if (!err) {
if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
make_bad_inode(inode);
@@ -1102,7 +1046,7 @@ int fuse_allow_current_process(struct fuse_conn *fc)
static int fuse_access(struct inode *inode, int mask)
{
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req;
+ FUSE_ARGS(args);
struct fuse_access_in inarg;
int err;
@@ -1111,20 +1055,14 @@ static int fuse_access(struct inode *inode, int mask)
if (fc->no_access)
return 0;
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
memset(&inarg, 0, sizeof(inarg));
inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC);
- req->in.h.opcode = FUSE_ACCESS;
- req->in.h.nodeid = get_node_id(inode);
- req->in.numargs = 1;
- req->in.args[0].size = sizeof(inarg);
- req->in.args[0].value = &inarg;
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ args.in.h.opcode = FUSE_ACCESS;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ err = fuse_simple_request(fc, &args);
if (err == -ENOSYS) {
fc->no_access = 1;
err = 0;
@@ -1320,7 +1258,7 @@ static int fuse_direntplus_link(struct file *file,
if (!inode)
goto out;
- alias = d_materialise_unique(dentry, inode);
+ alias = d_splice_alias(inode, dentry);
err = PTR_ERR(alias);
if (IS_ERR(alias))
goto out;
@@ -1445,31 +1383,27 @@ static char *read_link(struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req = fuse_get_req_nopages(fc);
+ FUSE_ARGS(args);
char *link;
-
- if (IS_ERR(req))
- return ERR_CAST(req);
+ ssize_t ret;
link = (char *) __get_free_page(GFP_KERNEL);
- if (!link) {
- link = ERR_PTR(-ENOMEM);
- goto out;
- }
- req->in.h.opcode = FUSE_READLINK;
- req->in.h.nodeid = get_node_id(inode);
- req->out.argvar = 1;
- req->out.numargs = 1;
- req->out.args[0].size = PAGE_SIZE - 1;
- req->out.args[0].value = link;
- fuse_request_send(fc, req);
- if (req->out.h.error) {
+ if (!link)
+ return ERR_PTR(-ENOMEM);
+
+ args.in.h.opcode = FUSE_READLINK;
+ args.in.h.nodeid = get_node_id(inode);
+ args.out.argvar = 1;
+ args.out.numargs = 1;
+ args.out.args[0].size = PAGE_SIZE - 1;
+ args.out.args[0].value = link;
+ ret = fuse_simple_request(fc, &args);
+ if (ret < 0) {
free_page((unsigned long) link);
- link = ERR_PTR(req->out.h.error);
- } else
- link[req->out.args[0].size] = '\0';
- out:
- fuse_put_request(fc, req);
+ link = ERR_PTR(ret);
+ } else {
+ link[ret] = '\0';
+ }
fuse_invalidate_atime(inode);
return link;
}
@@ -1629,22 +1563,22 @@ void fuse_release_nowrite(struct inode *inode)
spin_unlock(&fc->lock);
}
-static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req,
+static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
struct inode *inode,
struct fuse_setattr_in *inarg_p,
struct fuse_attr_out *outarg_p)
{
- req->in.h.opcode = FUSE_SETATTR;
- req->in.h.nodeid = get_node_id(inode);
- req->in.numargs = 1;
- req->in.args[0].size = sizeof(*inarg_p);
- req->in.args[0].value = inarg_p;
- req->out.numargs = 1;
+ args->in.h.opcode = FUSE_SETATTR;
+ args->in.h.nodeid = get_node_id(inode);
+ args->in.numargs = 1;
+ args->in.args[0].size = sizeof(*inarg_p);
+ args->in.args[0].value = inarg_p;
+ args->out.numargs = 1;
if (fc->minor < 9)
- req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
+ args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
else
- req->out.args[0].size = sizeof(*outarg_p);
- req->out.args[0].value = outarg_p;
+ args->out.args[0].size = sizeof(*outarg_p);
+ args->out.args[0].value = outarg_p;
}
/*
@@ -1653,14 +1587,9 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req,
int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
{
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req;
+ FUSE_ARGS(args);
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
- int err;
-
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
@@ -1677,12 +1606,9 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
inarg.valid |= FATTR_FH;
inarg.fh = ff->fh;
}
- fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
- return err;
+ return fuse_simple_request(fc, &args);
}
/*
@@ -1698,7 +1624,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- struct fuse_req *req;
+ FUSE_ARGS(args);
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
bool is_truncate = false;
@@ -1723,10 +1649,6 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
if (attr->ia_valid & ATTR_SIZE)
is_truncate = true;
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
if (is_truncate) {
fuse_set_nowrite(inode);
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
@@ -1747,10 +1669,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
inarg.valid |= FATTR_LOCKOWNER;
inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
}
- fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
+ err = fuse_simple_request(fc, &args);
if (err) {
if (err == -EINTR)
fuse_invalidate_attr(inode);
@@ -1837,32 +1757,26 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
{
struct inode *inode = entry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req;
+ FUSE_ARGS(args);
struct fuse_setxattr_in inarg;
int err;
if (fc->no_setxattr)
return -EOPNOTSUPP;
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
memset(&inarg, 0, sizeof(inarg));
inarg.size = size;
inarg.flags = flags;
- req->in.h.opcode = FUSE_SETXATTR;
- req->in.h.nodeid = get_node_id(inode);
- req->in.numargs = 3;
- req->in.args[0].size = sizeof(inarg);
- req->in.args[0].value = &inarg;
- req->in.args[1].size = strlen(name) + 1;
- req->in.args[1].value = name;
- req->in.args[2].size = size;
- req->in.args[2].value = value;
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ args.in.h.opcode = FUSE_SETXATTR;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 3;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = strlen(name) + 1;
+ args.in.args[1].value = name;
+ args.in.args[2].size = size;
+ args.in.args[2].value = value;
+ err = fuse_simple_request(fc, &args);
if (err == -ENOSYS) {
fc->no_setxattr = 1;
err = -EOPNOTSUPP;
@@ -1879,7 +1793,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
{
struct inode *inode = entry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req;
+ FUSE_ARGS(args);
struct fuse_getxattr_in inarg;
struct fuse_getxattr_out outarg;
ssize_t ret;
@@ -1887,40 +1801,32 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
if (fc->no_getxattr)
return -EOPNOTSUPP;
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
memset(&inarg, 0, sizeof(inarg));
inarg.size = size;
- req->in.h.opcode = FUSE_GETXATTR;
- req->in.h.nodeid = get_node_id(inode);
- req->in.numargs = 2;
- req->in.args[0].size = sizeof(inarg);
- req->in.args[0].value = &inarg;
- req->in.args[1].size = strlen(name) + 1;
- req->in.args[1].value = name;
+ args.in.h.opcode = FUSE_GETXATTR;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 2;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = strlen(name) + 1;
+ args.in.args[1].value = name;
/* This is really two different operations rolled into one */
- req->out.numargs = 1;
+ args.out.numargs = 1;
if (size) {
- req->out.argvar = 1;
- req->out.args[0].size = size;
- req->out.args[0].value = value;
+ args.out.argvar = 1;
+ args.out.args[0].size = size;
+ args.out.args[0].value = value;
} else {
- req->out.args[0].size = sizeof(outarg);
- req->out.args[0].value = &outarg;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
}
- fuse_request_send(fc, req);
- ret = req->out.h.error;
- if (!ret)
- ret = size ? req->out.args[0].size : outarg.size;
- else {
- if (ret == -ENOSYS) {
- fc->no_getxattr = 1;
- ret = -EOPNOTSUPP;
- }
+ ret = fuse_simple_request(fc, &args);
+ if (!ret && !size)
+ ret = outarg.size;
+ if (ret == -ENOSYS) {
+ fc->no_getxattr = 1;
+ ret = -EOPNOTSUPP;
}
- fuse_put_request(fc, req);
return ret;
}
@@ -1928,7 +1834,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
{
struct inode *inode = entry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req;
+ FUSE_ARGS(args);
struct fuse_getxattr_in inarg;
struct fuse_getxattr_out outarg;
ssize_t ret;
@@ -1939,38 +1845,30 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
if (fc->no_listxattr)
return -EOPNOTSUPP;
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
memset(&inarg, 0, sizeof(inarg));
inarg.size = size;
- req->in.h.opcode = FUSE_LISTXATTR;
- req->in.h.nodeid = get_node_id(inode);
- req->in.numargs = 1;
- req->in.args[0].size = sizeof(inarg);
- req->in.args[0].value = &inarg;
+ args.in.h.opcode = FUSE_LISTXATTR;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
/* This is really two different operations rolled into one */
- req->out.numargs = 1;
+ args.out.numargs = 1;
if (size) {
- req->out.argvar = 1;
- req->out.args[0].size = size;
- req->out.args[0].value = list;
+ args.out.argvar = 1;
+ args.out.args[0].size = size;
+ args.out.args[0].value = list;
} else {
- req->out.args[0].size = sizeof(outarg);
- req->out.args[0].value = &outarg;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
}
- fuse_request_send(fc, req);
- ret = req->out.h.error;
- if (!ret)
- ret = size ? req->out.args[0].size : outarg.size;
- else {
- if (ret == -ENOSYS) {
- fc->no_listxattr = 1;
- ret = -EOPNOTSUPP;
- }
+ ret = fuse_simple_request(fc, &args);
+ if (!ret && !size)
+ ret = outarg.size;
+ if (ret == -ENOSYS) {
+ fc->no_listxattr = 1;
+ ret = -EOPNOTSUPP;
}
- fuse_put_request(fc, req);
return ret;
}
@@ -1978,24 +1876,18 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
{
struct inode *inode = entry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req;
+ FUSE_ARGS(args);
int err;
if (fc->no_removexattr)
return -EOPNOTSUPP;
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- req->in.h.opcode = FUSE_REMOVEXATTR;
- req->in.h.nodeid = get_node_id(inode);
- req->in.numargs = 1;
- req->in.args[0].size = strlen(name) + 1;
- req->in.args[0].value = name;
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ args.in.h.opcode = FUSE_REMOVEXATTR;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = strlen(name) + 1;
+ args.in.args[0].value = name;
+ err = fuse_simple_request(fc, &args);
if (err == -ENOSYS) {
fc->no_removexattr = 1;
err = -EOPNOTSUPP;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index caa8d95b24e8..760b2c552197 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -24,30 +24,22 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
int opcode, struct fuse_open_out *outargp)
{
struct fuse_open_in inarg;
- struct fuse_req *req;
- int err;
-
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ FUSE_ARGS(args);
memset(&inarg, 0, sizeof(inarg));
inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
if (!fc->atomic_o_trunc)
inarg.flags &= ~O_TRUNC;
- req->in.h.opcode = opcode;
- req->in.h.nodeid = nodeid;
- req->in.numargs = 1;
- req->in.args[0].size = sizeof(inarg);
- req->in.args[0].value = &inarg;
- req->out.numargs = 1;
- req->out.args[0].size = sizeof(*outargp);
- req->out.args[0].value = outargp;
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ args.in.h.opcode = opcode;
+ args.in.h.nodeid = nodeid;
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(*outargp);
+ args.out.args[0].value = outargp;
- return err;
+ return fuse_simple_request(fc, &args);
}
struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
@@ -89,37 +81,9 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff)
return ff;
}
-static void fuse_release_async(struct work_struct *work)
-{
- struct fuse_req *req;
- struct fuse_conn *fc;
- struct path path;
-
- req = container_of(work, struct fuse_req, misc.release.work);
- path = req->misc.release.path;
- fc = get_fuse_conn(path.dentry->d_inode);
-
- fuse_put_request(fc, req);
- path_put(&path);
-}
-
static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
{
- if (fc->destroy_req) {
- /*
- * If this is a fuseblk mount, then it's possible that
- * releasing the path will result in releasing the
- * super block and sending the DESTROY request. If
- * the server is single threaded, this would hang.
- * For this reason do the path_put() in a separate
- * thread.
- */
- atomic_inc(&req->count);
- INIT_WORK(&req->misc.release.work, fuse_release_async);
- schedule_work(&req->misc.release.work);
- } else {
- path_put(&req->misc.release.path);
- }
+ iput(req->misc.release.inode);
}
static void fuse_file_put(struct fuse_file *ff, bool sync)
@@ -133,12 +97,12 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
* implement 'open'
*/
req->background = 0;
- path_put(&req->misc.release.path);
+ iput(req->misc.release.inode);
fuse_put_request(ff->fc, req);
} else if (sync) {
req->background = 0;
fuse_request_send(ff->fc, req);
- path_put(&req->misc.release.path);
+ iput(req->misc.release.inode);
fuse_put_request(ff->fc, req);
} else {
req->end = fuse_release_end;
@@ -297,9 +261,8 @@ void fuse_release_common(struct file *file, int opcode)
inarg->lock_owner = fuse_lock_owner_id(ff->fc,
(fl_owner_t) file);
}
- /* Hold vfsmount and dentry until release is finished */
- path_get(&file->f_path);
- req->misc.release.path = file->f_path;
+ /* Hold inode until release is finished */
+ req->misc.release.inode = igrab(file_inode(file));
/*
* Normally this will send the RELEASE request, however if
@@ -480,7 +443,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
struct inode *inode = file->f_mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_file *ff = file->private_data;
- struct fuse_req *req;
+ FUSE_ARGS(args);
struct fuse_fsync_in inarg;
int err;
@@ -506,23 +469,15 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
goto out;
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req)) {
- err = PTR_ERR(req);
- goto out;
- }
-
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
inarg.fsync_flags = datasync ? 1 : 0;
- req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
- req->in.h.nodeid = get_node_id(inode);
- req->in.numargs = 1;
- req->in.args[0].size = sizeof(inarg);
- req->in.args[0].value = &inarg;
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ args.in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ err = fuse_simple_request(fc, &args);
if (err == -ENOSYS) {
if (isdir)
fc->no_fsyncdir = 1;
@@ -1988,7 +1943,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- struct fuse_conn *fc = get_fuse_conn(file->f_dentry->d_inode);
+ struct fuse_conn *fc = get_fuse_conn(file_inode(file));
struct page *page;
loff_t fsize;
int err = -ENOMEM;
@@ -2156,49 +2111,44 @@ static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
return 0;
}
-static void fuse_lk_fill(struct fuse_req *req, struct file *file,
+static void fuse_lk_fill(struct fuse_args *args, struct file *file,
const struct file_lock *fl, int opcode, pid_t pid,
- int flock)
+ int flock, struct fuse_lk_in *inarg)
{
struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_file *ff = file->private_data;
- struct fuse_lk_in *arg = &req->misc.lk_in;
-
- arg->fh = ff->fh;
- arg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
- arg->lk.start = fl->fl_start;
- arg->lk.end = fl->fl_end;
- arg->lk.type = fl->fl_type;
- arg->lk.pid = pid;
+
+ memset(inarg, 0, sizeof(*inarg));
+ inarg->fh = ff->fh;
+ inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
+ inarg->lk.start = fl->fl_start;
+ inarg->lk.end = fl->fl_end;
+ inarg->lk.type = fl->fl_type;
+ inarg->lk.pid = pid;
if (flock)
- arg->lk_flags |= FUSE_LK_FLOCK;
- req->in.h.opcode = opcode;
- req->in.h.nodeid = get_node_id(inode);
- req->in.numargs = 1;
- req->in.args[0].size = sizeof(*arg);
- req->in.args[0].value = arg;
+ inarg->lk_flags |= FUSE_LK_FLOCK;
+ args->in.h.opcode = opcode;
+ args->in.h.nodeid = get_node_id(inode);
+ args->in.numargs = 1;
+ args->in.args[0].size = sizeof(*inarg);
+ args->in.args[0].value = inarg;
}
static int fuse_getlk(struct file *file, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req;
+ FUSE_ARGS(args);
+ struct fuse_lk_in inarg;
struct fuse_lk_out outarg;
int err;
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- fuse_lk_fill(req, file, fl, FUSE_GETLK, 0, 0);
- req->out.numargs = 1;
- req->out.args[0].size = sizeof(outarg);
- req->out.args[0].value = &outarg;
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
if (!err)
err = convert_fuse_file_lock(&outarg.lk, fl);
@@ -2209,7 +2159,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
{
struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req;
+ FUSE_ARGS(args);
+ struct fuse_lk_in inarg;
int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
int err;
@@ -2223,17 +2174,13 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
if (fl->fl_flags & FL_CLOSE)
return 0;
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg);
+ err = fuse_simple_request(fc, &args);
- fuse_lk_fill(req, file, fl, opcode, pid, flock);
- fuse_request_send(fc, req);
- err = req->out.h.error;
/* locking is restartable */
if (err == -EINTR)
err = -ERESTARTSYS;
- fuse_put_request(fc, req);
+
return err;
}
@@ -2283,7 +2230,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req;
+ FUSE_ARGS(args);
struct fuse_bmap_in inarg;
struct fuse_bmap_out outarg;
int err;
@@ -2291,24 +2238,18 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
if (!inode->i_sb->s_bdev || fc->no_bmap)
return 0;
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return 0;
-
memset(&inarg, 0, sizeof(inarg));
inarg.block = block;
inarg.blocksize = inode->i_sb->s_blocksize;
- req->in.h.opcode = FUSE_BMAP;
- req->in.h.nodeid = get_node_id(inode);
- req->in.numargs = 1;
- req->in.args[0].size = sizeof(inarg);
- req->in.args[0].value = &inarg;
- req->out.numargs = 1;
- req->out.args[0].size = sizeof(outarg);
- req->out.args[0].value = &outarg;
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ args.in.h.opcode = FUSE_BMAP;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
if (err == -ENOSYS)
fc->no_bmap = 1;
@@ -2776,7 +2717,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait)
struct fuse_conn *fc = ff->fc;
struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
struct fuse_poll_out outarg;
- struct fuse_req *req;
+ FUSE_ARGS(args);
int err;
if (fc->no_poll)
@@ -2794,21 +2735,15 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait)
fuse_register_polled_file(fc, ff);
}
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return POLLERR;
-
- req->in.h.opcode = FUSE_POLL;
- req->in.h.nodeid = ff->nodeid;
- req->in.numargs = 1;
- req->in.args[0].size = sizeof(inarg);
- req->in.args[0].value = &inarg;
- req->out.numargs = 1;
- req->out.args[0].size = sizeof(outarg);
- req->out.args[0].value = &outarg;
- fuse_request_send(fc, req);
- err = req->out.h.error;
- fuse_put_request(fc, req);
+ args.in.h.opcode = FUSE_POLL;
+ args.in.h.nodeid = ff->nodeid;
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
if (!err)
return outarg.revents;
@@ -2949,10 +2884,10 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
loff_t length)
{
struct fuse_file *ff = file->private_data;
- struct inode *inode = file->f_inode;
+ struct inode *inode = file_inode(file);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = ff->fc;
- struct fuse_req *req;
+ FUSE_ARGS(args);
struct fuse_fallocate_in inarg = {
.fh = ff->fh,
.offset = offset,
@@ -2985,25 +2920,16 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
if (!(mode & FALLOC_FL_KEEP_SIZE))
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req)) {
- err = PTR_ERR(req);
- goto out;
- }
-
- req->in.h.opcode = FUSE_FALLOCATE;
- req->in.h.nodeid = ff->nodeid;
- req->in.numargs = 1;
- req->in.args[0].size = sizeof(inarg);
- req->in.args[0].value = &inarg;
- fuse_request_send(fc, req);
- err = req->out.h.error;
+ args.in.h.opcode = FUSE_FALLOCATE;
+ args.in.h.nodeid = ff->nodeid;
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ err = fuse_simple_request(fc, &args);
if (err == -ENOSYS) {
fc->no_fallocate = 1;
err = -EOPNOTSUPP;
}
- fuse_put_request(fc, req);
-
if (err)
goto out;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e8e47a6ab518..e0fc6725d1d0 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -213,7 +213,7 @@ struct fuse_out {
unsigned numargs;
/** Array of arguments */
- struct fuse_arg args[3];
+ struct fuse_arg args[2];
};
/** FUSE page descriptor */
@@ -222,6 +222,25 @@ struct fuse_page_desc {
unsigned int offset;
};
+struct fuse_args {
+ struct {
+ struct {
+ uint32_t opcode;
+ uint64_t nodeid;
+ } h;
+ unsigned numargs;
+ struct fuse_in_arg args[3];
+
+ } in;
+ struct {
+ unsigned argvar:1;
+ unsigned numargs;
+ struct fuse_arg args[2];
+ } out;
+};
+
+#define FUSE_ARGS(args) struct fuse_args args = {}
+
/** The request state */
enum fuse_req_state {
FUSE_REQ_INIT = 0,
@@ -305,11 +324,8 @@ struct fuse_req {
/** Data for asynchronous requests */
union {
struct {
- union {
- struct fuse_release_in in;
- struct work_struct work;
- };
- struct path path;
+ struct fuse_release_in in;
+ struct inode *inode;
} release;
struct fuse_init_in init_in;
struct fuse_init_out init_out;
@@ -324,7 +340,6 @@ struct fuse_req {
struct fuse_req *next;
} write;
struct fuse_notify_retrieve_in retrieve_in;
- struct fuse_lk_in lk_in;
} misc;
/** page vector */
@@ -754,15 +769,6 @@ struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
void __fuse_get_request(struct fuse_req *req);
/**
- * Get a request, may fail with -ENOMEM,
- * useful for callers who doesn't use req->pages[]
- */
-static inline struct fuse_req *fuse_get_req_nopages(struct fuse_conn *fc)
-{
- return fuse_get_req(fc, 0);
-}
-
-/**
* Gets a requests for a file operation, always succeeds
*/
struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
@@ -780,6 +786,11 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
/**
+ * Simple request sending that does request allocation and freeing
+ */
+ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args);
+
+/**
* Send a request in the background
*/
void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
@@ -804,8 +815,6 @@ void fuse_invalidate_atime(struct inode *inode);
*/
struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
-void fuse_conn_kill(struct fuse_conn *fc);
-
/**
* Initialize fuse_conn
*/
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 03246cd9d47a..6749109f255d 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -376,28 +376,13 @@ static void fuse_bdi_destroy(struct fuse_conn *fc)
bdi_destroy(&fc->bdi);
}
-void fuse_conn_kill(struct fuse_conn *fc)
-{
- spin_lock(&fc->lock);
- fc->connected = 0;
- fc->blocked = 0;
- fc->initialized = 1;
- spin_unlock(&fc->lock);
- /* Flush all readers on this fs */
- kill_fasync(&fc->fasync, SIGIO, POLL_IN);
- wake_up_all(&fc->waitq);
- wake_up_all(&fc->blocked_waitq);
- wake_up_all(&fc->reserved_req_waitq);
-}
-EXPORT_SYMBOL_GPL(fuse_conn_kill);
-
static void fuse_put_super(struct super_block *sb)
{
struct fuse_conn *fc = get_fuse_conn_super(sb);
fuse_send_destroy(fc);
- fuse_conn_kill(fc);
+ fuse_abort_conn(fc);
mutex_lock(&fuse_mutex);
list_del(&fc->entry);
fuse_ctl_remove_conn(fc);
@@ -425,7 +410,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct fuse_conn *fc = get_fuse_conn_super(sb);
- struct fuse_req *req;
+ FUSE_ARGS(args);
struct fuse_statfs_out outarg;
int err;
@@ -434,23 +419,17 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
memset(&outarg, 0, sizeof(outarg));
- req->in.numargs = 0;
- req->in.h.opcode = FUSE_STATFS;
- req->in.h.nodeid = get_node_id(dentry->d_inode);
- req->out.numargs = 1;
- req->out.args[0].size =
+ args.in.numargs = 0;
+ args.in.h.opcode = FUSE_STATFS;
+ args.in.h.nodeid = get_node_id(dentry->d_inode);
+ args.out.numargs = 1;
+ args.out.args[0].size =
fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
- req->out.args[0].value = &outarg;
- fuse_request_send(fc, req);
- err = req->out.h.error;
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
if (!err)
convert_fuse_statfs(buf, &outarg.st);
- fuse_put_request(fc, req);
return err;
}
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 5d4261ff5d23..c5a34f09e228 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -365,23 +365,17 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
ret = gfs2_dir_read_data(ip, hc, hsize);
if (ret < 0) {
- if (is_vmalloc_addr(hc))
- vfree(hc);
- else
- kfree(hc);
+ kvfree(hc);
return ERR_PTR(ret);
}
spin_lock(&inode->i_lock);
- if (ip->i_hash_cache) {
- if (is_vmalloc_addr(hc))
- vfree(hc);
- else
- kfree(hc);
- } else {
+ if (likely(!ip->i_hash_cache)) {
ip->i_hash_cache = hc;
+ hc = NULL;
}
spin_unlock(&inode->i_lock);
+ kvfree(hc);
return ip->i_hash_cache;
}
@@ -396,10 +390,7 @@ void gfs2_dir_hash_inval(struct gfs2_inode *ip)
{
__be64 *hc = ip->i_hash_cache;
ip->i_hash_cache = NULL;
- if (is_vmalloc_addr(hc))
- vfree(hc);
- else
- kfree(hc);
+ kvfree(hc);
}
static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
@@ -1168,10 +1159,7 @@ fail:
gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
out_kfree:
- if (is_vmalloc_addr(hc2))
- vfree(hc2);
- else
- kfree(hc2);
+ kvfree(hc2);
return error;
}
@@ -1302,14 +1290,6 @@ static void *gfs2_alloc_sort_buffer(unsigned size)
return ptr;
}
-static void gfs2_free_sort_buffer(void *ptr)
-{
- if (is_vmalloc_addr(ptr))
- vfree(ptr);
- else
- kfree(ptr);
-}
-
static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
int *copied, unsigned *depth,
u64 leaf_no)
@@ -1393,7 +1373,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
out_free:
for(i = 0; i < leaf; i++)
brelse(larr[i]);
- gfs2_free_sort_buffer(larr);
+ kvfree(larr);
out:
return error;
}
@@ -2004,10 +1984,7 @@ out_rlist:
gfs2_rlist_free(&rlist);
gfs2_quota_unhold(dip);
out:
- if (is_vmalloc_addr(ht))
- vfree(ht);
- else
- kfree(ht);
+ kvfree(ht);
return error;
}
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 8b9b3775e2e7..c41d255b6a7b 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -69,10 +69,12 @@ struct get_name_filldir {
char *name;
};
-static int get_name_filldir(void *opaque, const char *name, int length,
- loff_t offset, u64 inum, unsigned int type)
+static int get_name_filldir(struct dir_context *ctx, const char *name,
+ int length, loff_t offset, u64 inum,
+ unsigned int type)
{
- struct get_name_filldir *gnfd = opaque;
+ struct get_name_filldir *gnfd =
+ container_of(ctx, struct get_name_filldir, ctx);
if (inum != gnfd->inum.no_addr)
return 0;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 80dd44dca028..6e600abf694a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -337,7 +337,8 @@ static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size)
size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift;
int hint = min_t(size_t, INT_MAX, blks);
- atomic_set(&ip->i_res->rs_sizehint, hint);
+ if (hint > atomic_read(&ip->i_res->rs_sizehint))
+ atomic_set(&ip->i_res->rs_sizehint, hint);
}
/**
@@ -728,7 +729,6 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
struct gfs2_inode *ip = GFS2_I(inode);
struct buffer_head *dibh;
int error;
- loff_t size = len;
unsigned int nr_blks;
sector_t lblock = offset >> inode->i_blkbits;
@@ -762,11 +762,6 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
goto out;
}
}
- if (offset + size > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE))
- i_size_write(inode, offset + size);
-
- mark_inode_dirty(inode);
-
out:
brelse(dibh);
return error;
@@ -796,8 +791,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
}
}
-static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
- loff_t len)
+static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -811,14 +805,9 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
loff_t max_chunk_size = UINT_MAX & bsize_mask;
- struct gfs2_holder gh;
next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
- /* We only support the FALLOC_FL_KEEP_SIZE mode */
- if (mode & ~FALLOC_FL_KEEP_SIZE)
- return -EOPNOTSUPP;
-
offset &= bsize_mask;
len = next - offset;
@@ -829,17 +818,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
if (bytes == 0)
bytes = sdp->sd_sb.sb_bsize;
- error = gfs2_rs_alloc(ip);
- if (error)
- return error;
-
- mutex_lock(&inode->i_mutex);
-
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
- error = gfs2_glock_nq(&gh);
- if (unlikely(error))
- goto out_uninit;
-
gfs2_size_hint(file, offset, len);
while (len > 0) {
@@ -852,8 +830,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
}
error = gfs2_quota_lock_check(ip);
if (error)
- goto out_unlock;
-
+ return error;
retry:
gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
@@ -895,20 +872,64 @@ retry:
gfs2_quota_unlock(ip);
}
- if (error == 0)
- error = generic_write_sync(file, pos, count);
- goto out_unlock;
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) {
+ i_size_write(inode, pos + count);
+ /* Marks the inode as dirty */
+ file_update_time(file);
+ }
+
+ return generic_write_sync(file, pos, count);
out_trans_fail:
gfs2_inplace_release(ip);
out_qunlock:
gfs2_quota_unlock(ip);
+ return error;
+}
+
+static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
+
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&inode->i_mutex);
+
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ ret = gfs2_glock_nq(&gh);
+ if (ret)
+ goto out_uninit;
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ (offset + len) > inode->i_size) {
+ ret = inode_newsize_ok(inode, offset + len);
+ if (ret)
+ goto out_unlock;
+ }
+
+ ret = get_write_access(inode);
+ if (ret)
+ goto out_unlock;
+
+ ret = gfs2_rs_alloc(ip);
+ if (ret)
+ goto out_putw;
+
+ ret = __gfs2_fallocate(file, mode, offset, len);
+ if (ret)
+ gfs2_rs_deltree(ip->i_res);
+out_putw:
+ put_write_access(inode);
out_unlock:
gfs2_glock_dq(&gh);
out_uninit:
gfs2_holder_uninit(&gh);
mutex_unlock(&inode->i_mutex);
- return error;
+ return ret;
}
#ifdef CONFIG_GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8f0c19d1d943..a23524aa3eac 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -836,8 +836,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
gh->gh_flags = flags;
gh->gh_iflags = 0;
gh->gh_ip = _RET_IP_;
- if (gh->gh_owner_pid)
- put_pid(gh->gh_owner_pid);
+ put_pid(gh->gh_owner_pid);
gh->gh_owner_pid = get_pid(task_pid(current));
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 1cc0bba6313f..fe91951c3361 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -28,6 +28,8 @@
#include "trans.h"
#include "dir.h"
+struct workqueue_struct *gfs2_freeze_wq;
+
static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
{
fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n",
@@ -94,11 +96,8 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
* on the stack */
tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
tr.tr_ip = _RET_IP_;
- sb_start_intwrite(sdp->sd_vfs);
- if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) {
- sb_end_intwrite(sdp->sd_vfs);
+ if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0)
return;
- }
WARN_ON_ONCE(current->journal_info);
current->journal_info = &tr;
@@ -469,20 +468,19 @@ static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
static void freeze_go_sync(struct gfs2_glock *gl)
{
+ int error = 0;
struct gfs2_sbd *sdp = gl->gl_sbd;
- DEFINE_WAIT(wait);
if (gl->gl_state == LM_ST_SHARED &&
test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
- atomic_set(&sdp->sd_log_freeze, 1);
- wake_up(&sdp->sd_logd_waitq);
- do {
- prepare_to_wait(&sdp->sd_log_frozen_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (atomic_read(&sdp->sd_log_freeze))
- io_schedule();
- } while(atomic_read(&sdp->sd_log_freeze));
- finish_wait(&sdp->sd_log_frozen_wait, &wait);
+ atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);
+ error = freeze_super(sdp->sd_vfs);
+ if (error) {
+ printk(KERN_INFO "GFS2: couldn't freeze filesystem: %d\n", error);
+ gfs2_assert_withdraw(sdp, 0);
+ }
+ queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work);
+ gfs2_log_flush(sdp, NULL, FREEZE_FLUSH);
}
}
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index 7455d2629bcb..8ed1857c1a8d 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -12,6 +12,8 @@
#include "incore.h"
+extern struct workqueue_struct *gfs2_freeze_wq;
+
extern const struct gfs2_glock_operations gfs2_meta_glops;
extern const struct gfs2_glock_operations gfs2_inode_glops;
extern const struct gfs2_glock_operations gfs2_rgrp_glops;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 39e7e9959b74..7a2dbbc0d634 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -97,6 +97,7 @@ struct gfs2_rgrpd {
#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
+#define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */
#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */
spinlock_t rd_rsspin; /* protects reservation related vars */
struct rb_root rd_rstree; /* multi-block reservation tree */
@@ -587,6 +588,12 @@ enum {
SDF_SKIP_DLM_UNLOCK = 8,
};
+enum gfs2_freeze_state {
+ SFS_UNFROZEN = 0,
+ SFS_STARTING_FREEZE = 1,
+ SFS_FROZEN = 2,
+};
+
#define GFS2_FSNAME_LEN 256
struct gfs2_inum_host {
@@ -684,6 +691,7 @@ struct gfs2_sbd {
struct gfs2_holder sd_live_gh;
struct gfs2_glock *sd_rename_gl;
struct gfs2_glock *sd_freeze_gl;
+ struct work_struct sd_freeze_work;
wait_queue_head_t sd_glock_wait;
atomic_t sd_glock_disposal;
struct completion sd_locking_init;
@@ -788,6 +796,9 @@ struct gfs2_sbd {
wait_queue_head_t sd_log_flush_wait;
int sd_log_error;
+ atomic_t sd_reserving_log;
+ wait_queue_head_t sd_reserving_log_wait;
+
unsigned int sd_log_flush_head;
u64 sd_log_flush_wrapped;
@@ -797,12 +808,8 @@ struct gfs2_sbd {
/* For quiescing the filesystem */
struct gfs2_holder sd_freeze_gh;
- struct gfs2_holder sd_freeze_root_gh;
- struct gfs2_holder sd_thaw_gh;
- atomic_t sd_log_freeze;
- atomic_t sd_frozen_root;
- wait_queue_head_t sd_frozen_root_wait;
- wait_queue_head_t sd_log_frozen_wait;
+ atomic_t sd_freeze_state;
+ struct mutex sd_freeze_mutex;
char sd_fsname[GFS2_FSNAME_LEN];
char sd_table_name[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c4ed823d150e..9054002ebe70 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -596,7 +596,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_glock *io_gl;
- struct dentry *d;
int error, free_vfs_inode = 0;
u32 aflags = 0;
unsigned blocks = 1;
@@ -624,22 +623,18 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
error = PTR_ERR(inode);
if (!IS_ERR(inode)) {
- d = d_splice_alias(inode, dentry);
- error = PTR_ERR(d);
- if (IS_ERR(d)) {
- inode = ERR_CAST(d);
+ if (S_ISDIR(inode->i_mode)) {
+ iput(inode);
+ inode = ERR_PTR(-EISDIR);
goto fail_gunlock;
}
+ d_instantiate(dentry, inode);
error = 0;
if (file) {
- if (S_ISREG(inode->i_mode)) {
- WARN_ON(d != NULL);
+ if (S_ISREG(inode->i_mode))
error = finish_open(file, dentry, gfs2_open_common, opened);
- } else {
- error = finish_no_open(file, d);
- }
- } else {
- dput(d);
+ else
+ error = finish_no_open(file, NULL);
}
gfs2_glock_dq_uninit(ghs);
return error;
@@ -1045,11 +1040,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
if (error)
return error;
- error = gfs2_dir_check(&dip->i_inode, name, ip);
- if (error)
- return error;
-
- return 0;
+ return gfs2_dir_check(&dip->i_inode, name, ip);
}
/**
@@ -1254,11 +1245,8 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
if (d != NULL)
dentry = d;
if (dentry->d_inode) {
- if (!(*opened & FILE_OPENED)) {
- if (d == NULL)
- dget(dentry);
- return finish_no_open(file, dentry);
- }
+ if (!(*opened & FILE_OPENED))
+ return finish_no_open(file, d);
dput(d);
return 0;
}
@@ -1622,26 +1610,18 @@ int gfs2_permission(struct inode *inode, int mask)
{
struct gfs2_inode *ip;
struct gfs2_holder i_gh;
- struct gfs2_sbd *sdp = GFS2_SB(inode);
int error;
int unlock = 0;
- int frozen_root = 0;
ip = GFS2_I(inode);
if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
- if (unlikely(gfs2_glock_is_held_excl(sdp->sd_freeze_gl) &&
- inode == sdp->sd_root_dir->d_inode &&
- atomic_inc_not_zero(&sdp->sd_frozen_root)))
- frozen_root = 1;
- else {
- if (mask & MAY_NOT_BLOCK)
- return -ECHILD;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
- if (error)
- return error;
- unlock = 1;
- }
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ if (error)
+ return error;
+ unlock = 1;
}
if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
@@ -1650,8 +1630,6 @@ int gfs2_permission(struct inode *inode, int mask)
error = generic_permission(inode, mask);
if (unlock)
gfs2_glock_dq_uninit(&i_gh);
- else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root))
- wake_up(&sdp->sd_frozen_root_wait);
return error;
}
@@ -1824,29 +1802,19 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct inode *inode = dentry->d_inode;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
- struct gfs2_sbd *sdp = GFS2_SB(inode);
int error;
int unlock = 0;
- int frozen_root = 0;
if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
- if (unlikely(gfs2_glock_is_held_excl(sdp->sd_freeze_gl) &&
- inode == sdp->sd_root_dir->d_inode &&
- atomic_inc_not_zero(&sdp->sd_frozen_root)))
- frozen_root = 1;
- else {
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
- if (error)
- return error;
- unlock = 1;
- }
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+ if (error)
+ return error;
+ unlock = 1;
}
generic_fillattr(inode, stat);
if (unlock)
gfs2_glock_dq_uninit(&gh);
- else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root))
- wake_up(&sdp->sd_frozen_root_wait);
return 0;
}
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 3966fadbcebd..536e7a6252cd 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -339,6 +339,7 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
{
+ int ret = 0;
unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize);
unsigned wanted = blks + reserved_blks;
DEFINE_WAIT(wait);
@@ -362,9 +363,13 @@ retry:
} while(free_blocks <= wanted);
finish_wait(&sdp->sd_log_waitq, &wait);
}
+ atomic_inc(&sdp->sd_reserving_log);
if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
- free_blocks - blks) != free_blocks)
+ free_blocks - blks) != free_blocks) {
+ if (atomic_dec_and_test(&sdp->sd_reserving_log))
+ wake_up(&sdp->sd_reserving_log_wait);
goto retry;
+ }
trace_gfs2_log_blocks(sdp, -blks);
/*
@@ -377,9 +382,11 @@ retry:
down_read(&sdp->sd_log_flush_lock);
if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) {
gfs2_log_release(sdp, blks);
- return -EROFS;
+ ret = -EROFS;
}
- return 0;
+ if (atomic_dec_and_test(&sdp->sd_reserving_log))
+ wake_up(&sdp->sd_reserving_log_wait);
+ return ret;
}
/**
@@ -652,9 +659,12 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
u32 hash;
int rw = WRITE_FLUSH_FUA | REQ_META;
struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
+ enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
lh = page_address(page);
clear_page(lh);
+ gfs2_assert_withdraw(sdp, (state != SFS_FROZEN));
+
tail = current_tail(sdp);
lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -695,6 +705,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
enum gfs2_flush_type type)
{
struct gfs2_trans *tr;
+ enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
down_write(&sdp->sd_log_flush_lock);
@@ -713,8 +724,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
INIT_LIST_HEAD(&tr->tr_ail1_list);
INIT_LIST_HEAD(&tr->tr_ail2_list);
tr->tr_first = sdp->sd_log_flush_head;
+ if (unlikely (state == SFS_FROZEN))
+ gfs2_assert_withdraw(sdp, !tr->tr_num_buf_new && !tr->tr_num_databuf_new);
}
+ if (unlikely(state == SFS_FROZEN))
+ gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
gfs2_assert_withdraw(sdp,
sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
@@ -745,8 +760,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
spin_unlock(&sdp->sd_ail_lock);
gfs2_log_unlock(sdp);
- if (atomic_read(&sdp->sd_log_freeze))
- type = FREEZE_FLUSH;
if (type != NORMAL_FLUSH) {
if (!sdp->sd_log_idle) {
for (;;) {
@@ -763,21 +776,8 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
}
if (type == SHUTDOWN_FLUSH || type == FREEZE_FLUSH)
gfs2_log_shutdown(sdp);
- if (type == FREEZE_FLUSH) {
- int error;
-
- atomic_set(&sdp->sd_log_freeze, 0);
- wake_up(&sdp->sd_log_frozen_wait);
- error = gfs2_glock_nq_init(sdp->sd_freeze_gl,
- LM_ST_SHARED, 0,
- &sdp->sd_thaw_gh);
- if (error) {
- printk(KERN_INFO "GFS2: couln't get freeze lock : %d\n", error);
- gfs2_assert_withdraw(sdp, 0);
- }
- else
- gfs2_glock_dq_uninit(&sdp->sd_thaw_gh);
- }
+ if (type == FREEZE_FLUSH)
+ atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
}
trace_gfs2_log_flush(sdp, 0);
@@ -888,7 +888,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
{
- return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1) || atomic_read(&sdp->sd_log_freeze));
+ return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
}
static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 82b6ac829656..241a399bf83d 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -30,6 +30,7 @@
#include "quota.h"
#include "recovery.h"
#include "dir.h"
+#include "glops.h"
struct workqueue_struct *gfs2_control_wq;
@@ -161,9 +162,14 @@ static int __init init_gfs2_fs(void)
if (!gfs2_control_wq)
goto fail_recovery;
+ gfs2_freeze_wq = alloc_workqueue("freeze_workqueue", 0, 0);
+
+ if (!gfs2_freeze_wq)
+ goto fail_control;
+
gfs2_page_pool = mempool_create_page_pool(64, 0);
if (!gfs2_page_pool)
- goto fail_control;
+ goto fail_freeze;
gfs2_register_debugfs();
@@ -171,6 +177,8 @@ static int __init init_gfs2_fs(void)
return 0;
+fail_freeze:
+ destroy_workqueue(gfs2_freeze_wq);
fail_control:
destroy_workqueue(gfs2_control_wq);
fail_recovery:
@@ -224,6 +232,7 @@ static void __exit exit_gfs2_fs(void)
unregister_filesystem(&gfs2meta_fs_type);
destroy_workqueue(gfs_recovery_wq);
destroy_workqueue(gfs2_control_wq);
+ destroy_workqueue(gfs2_freeze_wq);
list_lru_destroy(&gfs2_qd_lru);
rcu_barrier();
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index d3eae244076e..8633ad328ee2 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -129,11 +129,11 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
init_rwsem(&sdp->sd_log_flush_lock);
atomic_set(&sdp->sd_log_in_flight, 0);
+ atomic_set(&sdp->sd_reserving_log, 0);
+ init_waitqueue_head(&sdp->sd_reserving_log_wait);
init_waitqueue_head(&sdp->sd_log_flush_wait);
- init_waitqueue_head(&sdp->sd_log_frozen_wait);
- atomic_set(&sdp->sd_log_freeze, 0);
- atomic_set(&sdp->sd_frozen_root, 0);
- init_waitqueue_head(&sdp->sd_frozen_root_wait);
+ atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
+ mutex_init(&sdp->sd_freeze_mutex);
return sdp;
}
@@ -760,15 +760,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
gfs2_glock_dq_uninit(&ji_gh);
jindex = 0;
- if (!sdp->sd_args.ar_spectator) {
- error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
- &sdp->sd_thaw_gh);
- if (error) {
- fs_err(sdp, "can't acquire freeze glock: %d\n", error);
- goto fail_jinode_gh;
- }
- }
- gfs2_glock_dq_uninit(&sdp->sd_thaw_gh);
+ INIT_WORK(&sdp->sd_freeze_work, gfs2_freeze_func);
return 0;
fail_jinode_gh:
@@ -1082,6 +1074,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
sb->s_export_op = &gfs2_export_ops;
sb->s_xattr = gfs2_xattr_handlers;
sb->s_qcop = &gfs2_quotactl_ops;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
sb->s_time_gran = 1;
sb->s_maxbytes = MAX_LFS_FILESIZE;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 64b29f7f6b4c..c8b148bbdc8b 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1360,13 +1360,8 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
- if (sdp->sd_quota_bitmap) {
- if (is_vmalloc_addr(sdp->sd_quota_bitmap))
- vfree(sdp->sd_quota_bitmap);
- else
- kfree(sdp->sd_quota_bitmap);
- sdp->sd_quota_bitmap = NULL;
- }
+ kvfree(sdp->sd_quota_bitmap);
+ sdp->sd_quota_bitmap = NULL;
}
static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7474c413ffd1..9150207f365c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -936,7 +936,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
- rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+ rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
if (rgd->rd_data > sdp->sd_max_rg_data)
sdp->sd_max_rg_data = rgd->rd_data;
spin_lock(&sdp->sd_rindex_spin);
@@ -955,6 +955,36 @@ fail:
}
/**
+ * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
+ * @sdp: the GFS2 superblock
+ *
+ * The purpose of this function is to select a subset of the resource groups
+ * and mark them as PREFERRED. We do it in such a way that each node prefers
+ * to use a unique set of rgrps to minimize glock contention.
+ */
+static void set_rgrp_preferences(struct gfs2_sbd *sdp)
+{
+ struct gfs2_rgrpd *rgd, *first;
+ int i;
+
+ /* Skip an initial number of rgrps, based on this node's journal ID.
+ That should start each node out on its own set. */
+ rgd = gfs2_rgrpd_get_first(sdp);
+ for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
+ rgd = gfs2_rgrpd_get_next(rgd);
+ first = rgd;
+
+ do {
+ rgd->rd_flags |= GFS2_RDF_PREFERRED;
+ for (i = 0; i < sdp->sd_journals; i++) {
+ rgd = gfs2_rgrpd_get_next(rgd);
+ if (rgd == first)
+ break;
+ }
+ } while (rgd != first);
+}
+
+/**
* gfs2_ri_update - Pull in a new resource index from the disk
* @ip: pointer to the rindex inode
*
@@ -973,6 +1003,8 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
if (error < 0)
return error;
+ set_rgrp_preferences(sdp);
+
sdp->sd_rindex_uptodate = 1;
return 0;
}
@@ -1891,6 +1923,25 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b
}
/**
+ * fast_to_acquire - determine if a resource group will be fast to acquire
+ *
+ * If this is one of our preferred rgrps, it should be quicker to acquire,
+ * because we tried to set ourselves up as dlm lock master.
+ */
+static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
+{
+ struct gfs2_glock *gl = rgd->rd_gl;
+
+ if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
+ !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
+ !test_bit(GLF_DEMOTE, &gl->gl_flags))
+ return 1;
+ if (rgd->rd_flags & GFS2_RDF_PREFERRED)
+ return 1;
+ return 0;
+}
+
+/**
* gfs2_inplace_reserve - Reserve space in the filesystem
* @ip: the inode to reserve space for
* @ap: the allocation parameters
@@ -1932,10 +1983,15 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
rg_locked = 0;
if (skip && skip--)
goto next_rgrp;
- if (!gfs2_rs_active(rs) && (loops < 2) &&
- gfs2_rgrp_used_recently(rs, 1000) &&
- gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
- goto next_rgrp;
+ if (!gfs2_rs_active(rs)) {
+ if (loops == 0 &&
+ !fast_to_acquire(rs->rs_rbm.rgd))
+ goto next_rgrp;
+ if ((loops < 2) &&
+ gfs2_rgrp_used_recently(rs, 1000) &&
+ gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
+ goto next_rgrp;
+ }
error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
LM_ST_EXCLUSIVE, flags,
&rs->rs_rgd_gh);
@@ -2195,6 +2251,9 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip,
trace_gfs2_rs(rs, TRACE_RS_CLAIM);
if (rs->rs_free && !ret)
goto out;
+ /* We used up our block reservation, so we should
+ reserve more blocks next time. */
+ atomic_add(RGRP_RSRV_ADDBLKS, &rs->rs_sizehint);
}
__rs_deltree(rs);
}
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 5d8f085f7ade..b104f4af3afd 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -20,6 +20,7 @@
*/
#define RGRP_RSRV_MINBYTES 8
#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
+#define RGRP_RSRV_ADDBLKS 64
struct gfs2_rgrpd;
struct gfs2_sbd;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index a346f56c4c6d..5b327f837de7 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -26,6 +26,7 @@
#include <linux/wait.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
+#include <linux/kernel.h>
#include "gfs2.h"
#include "incore.h"
@@ -399,7 +400,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
{
struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
struct gfs2_glock *j_gl = ip->i_gl;
- struct gfs2_holder thaw_gh;
+ struct gfs2_holder freeze_gh;
struct gfs2_log_header_host head;
int error;
@@ -408,7 +409,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
return error;
error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
- &thaw_gh);
+ &freeze_gh);
if (error)
goto fail_threads;
@@ -434,13 +435,13 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
- gfs2_glock_dq_uninit(&thaw_gh);
+ gfs2_glock_dq_uninit(&freeze_gh);
return 0;
fail:
- thaw_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_uninit(&thaw_gh);
+ freeze_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq_uninit(&freeze_gh);
fail_threads:
kthread_stop(sdp->sd_quotad_process);
kthread_stop(sdp->sd_logd_process);
@@ -580,14 +581,15 @@ int gfs2_statfs_sync(struct super_block *sb, int type)
struct buffer_head *m_bh, *l_bh;
int error;
+ sb_start_write(sb);
error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
&gh);
if (error)
- return error;
+ goto out;
error = gfs2_meta_inode_buffer(m_ip, &m_bh);
if (error)
- goto out;
+ goto out_unlock;
spin_lock(&sdp->sd_statfs_spin);
gfs2_statfs_change_in(m_sc, m_bh->b_data +
@@ -615,8 +617,10 @@ out_bh2:
brelse(l_bh);
out_bh:
brelse(m_bh);
-out:
+out_unlock:
gfs2_glock_dq_uninit(&gh);
+out:
+ sb_end_write(sb);
return error;
}
@@ -643,14 +647,8 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
struct lfcc *lfcc;
LIST_HEAD(list);
struct gfs2_log_header_host lh;
- struct gfs2_inode *dip = GFS2_I(sdp->sd_root_dir->d_inode);
int error;
- error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0,
- &sdp->sd_freeze_root_gh);
- if (error)
- return error;
- atomic_set(&sdp->sd_frozen_root, 1);
list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
if (!lfcc) {
@@ -692,11 +690,6 @@ out:
gfs2_glock_dq_uninit(&lfcc->gh);
kfree(lfcc);
}
- if (error) {
- atomic_dec(&sdp->sd_frozen_root);
- wait_event(sdp->sd_frozen_root_wait, atomic_read(&sdp->sd_frozen_root) == 0);
- gfs2_glock_dq_uninit(&sdp->sd_freeze_root_gh);
- }
return error;
}
@@ -834,18 +827,14 @@ out:
static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
{
- struct gfs2_holder thaw_gh;
+ struct gfs2_holder freeze_gh;
int error;
error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE,
- &thaw_gh);
+ &freeze_gh);
if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
return error;
- down_write(&sdp->sd_log_flush_lock);
- clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
- up_write(&sdp->sd_log_flush_lock);
-
kthread_stop(sdp->sd_quotad_process);
kthread_stop(sdp->sd_logd_process);
@@ -853,11 +842,16 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
gfs2_quota_sync(sdp->sd_vfs, 0);
gfs2_statfs_sync(sdp->sd_vfs, 0);
+ down_write(&sdp->sd_log_flush_lock);
+ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+ up_write(&sdp->sd_log_flush_lock);
+
gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH);
+ wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0);
gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
- if (thaw_gh.gh_gl)
- gfs2_glock_dq_uninit(&thaw_gh);
+ if (freeze_gh.gh_gl)
+ gfs2_glock_dq_uninit(&freeze_gh);
gfs2_quota_cleanup(sdp);
@@ -943,11 +937,41 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
struct gfs2_sbd *sdp = sb->s_fs_info;
gfs2_quota_sync(sb, -1);
- if (wait && sdp && !atomic_read(&sdp->sd_log_freeze))
+ if (wait && sdp)
gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
return 0;
}
+void gfs2_freeze_func(struct work_struct *work)
+{
+ int error;
+ struct gfs2_holder freeze_gh;
+ struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_freeze_work);
+ struct super_block *sb = sdp->sd_vfs;
+
+ atomic_inc(&sb->s_active);
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
+ &freeze_gh);
+ if (error) {
+ printk(KERN_INFO "GFS2: couln't get freeze lock : %d\n", error);
+ gfs2_assert_withdraw(sdp, 0);
+ }
+ else {
+ atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
+ error = thaw_super(sb);
+ if (error) {
+ printk(KERN_INFO "GFS2: couldn't thaw filesystem: %d\n",
+ error);
+ gfs2_assert_withdraw(sdp, 0);
+ }
+ if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+ freeze_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq_uninit(&freeze_gh);
+ }
+ deactivate_super(sb);
+ return;
+}
+
/**
* gfs2_freeze - prevent further writes to the filesystem
* @sb: the VFS structure for the filesystem
@@ -957,10 +981,16 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
static int gfs2_freeze(struct super_block *sb)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
- int error;
+ int error = 0;
- if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
- return -EINVAL;
+ mutex_lock(&sdp->sd_freeze_mutex);
+ if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN)
+ goto out;
+
+ if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
+ error = -EINVAL;
+ goto out;
+ }
for (;;) {
error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
@@ -980,7 +1010,10 @@ static int gfs2_freeze(struct super_block *sb)
fs_err(sdp, "retrying...\n");
msleep(1000);
}
- return 0;
+ error = 0;
+out:
+ mutex_unlock(&sdp->sd_freeze_mutex);
+ return error;
}
/**
@@ -993,10 +1026,15 @@ static int gfs2_unfreeze(struct super_block *sb)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
+ mutex_lock(&sdp->sd_freeze_mutex);
+ if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN ||
+ sdp->sd_freeze_gh.gh_gl == NULL) {
+ mutex_unlock(&sdp->sd_freeze_mutex);
+ return 0;
+ }
+
gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
- atomic_dec(&sdp->sd_frozen_root);
- wait_event(sdp->sd_frozen_root_wait, atomic_read(&sdp->sd_frozen_root) == 0);
- gfs2_glock_dq_uninit(&sdp->sd_freeze_root_gh);
+ mutex_unlock(&sdp->sd_freeze_mutex);
return 0;
}
@@ -1618,8 +1656,8 @@ const struct super_operations gfs2_super_ops = {
.evict_inode = gfs2_evict_inode,
.put_super = gfs2_put_super,
.sync_fs = gfs2_sync_fs,
- .freeze_fs = gfs2_freeze,
- .unfreeze_fs = gfs2_unfreeze,
+ .freeze_super = gfs2_freeze,
+ .thaw_super = gfs2_unfreeze,
.statfs = gfs2_statfs,
.remount_fs = gfs2_remount_fs,
.drop_inode = gfs2_drop_inode,
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 90e3322ffa10..73c97dccae21 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -45,6 +45,7 @@ extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
struct buffer_head *l_bh);
extern int gfs2_statfs_sync(struct super_block *sb, int type);
+extern void gfs2_freeze_func(struct work_struct *work);
extern struct file_system_type gfs2_fs_type;
extern struct file_system_type gfs2meta_fs_type;
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 42bfd3361979..88bff2430669 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -89,14 +89,17 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr = current->journal_info;
s64 nbuf;
+ int alloced = tr->tr_alloced;
+
BUG_ON(!tr);
current->journal_info = NULL;
if (!tr->tr_touched) {
gfs2_log_release(sdp, tr->tr_reserved);
- if (tr->tr_alloced)
+ if (alloced) {
kfree(tr);
- sb_end_intwrite(sdp->sd_vfs);
+ sb_end_intwrite(sdp->sd_vfs);
+ }
return;
}
@@ -109,13 +112,14 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
gfs2_print_trans(tr);
gfs2_log_commit(sdp, tr);
- if (tr->tr_alloced && !tr->tr_attached)
+ if (alloced && !tr->tr_attached)
kfree(tr);
up_read(&sdp->sd_log_flush_lock);
if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
- sb_end_intwrite(sdp->sd_vfs);
+ if (alloced)
+ sb_end_intwrite(sdp->sd_vfs);
}
static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
@@ -192,6 +196,7 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
struct gfs2_meta_header *mh;
struct gfs2_trans *tr;
+ enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
tr = current->journal_info;
tr->tr_touched = 1;
@@ -205,6 +210,10 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
(unsigned long long)bd->bd_bh->b_blocknr);
BUG();
}
+ if (unlikely(state == SFS_FROZEN)) {
+ printk(KERN_INFO "GFS2:adding buf while frozen\n");
+ gfs2_assert_withdraw(sdp, 0);
+ }
gfs2_pin(sdp, bd->bd_bh);
mh->__pad0 = cpu_to_be64(0);
mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index ff0316b925a5..db458ee3a546 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -162,14 +162,16 @@ err2:
*/
int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2)
{
- int retval;
+ __be32 k1p, k2p;
- retval = be32_to_cpu(key1->cat.ParID) - be32_to_cpu(key2->cat.ParID);
- if (!retval)
- retval = hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len,
- key2->cat.CName.name, key2->cat.CName.len);
+ k1p = key1->cat.ParID;
+ k2p = key2->cat.ParID;
- return retval;
+ if (k1p != k2p)
+ return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1;
+
+ return hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len,
+ key2->cat.CName.name, key2->cat.CName.len);
}
/* Try to get a catalog entry for given catalog id */
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 32602c667b4a..7892e6fddb66 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -38,21 +38,30 @@ int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1,
return hfsplus_strcmp(&k1->cat.name, &k2->cat.name);
}
-void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key,
- u32 parent, struct qstr *str)
+/* Generates key for catalog file/folders record. */
+int hfsplus_cat_build_key(struct super_block *sb,
+ hfsplus_btree_key *key, u32 parent, struct qstr *str)
{
- int len;
+ int len, err;
key->cat.parent = cpu_to_be32(parent);
- if (str) {
- hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN,
- str->name, str->len);
- len = be16_to_cpu(key->cat.name.length);
- } else {
- key->cat.name.length = 0;
- len = 0;
- }
+ err = hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN,
+ str->name, str->len);
+ if (unlikely(err < 0))
+ return err;
+
+ len = be16_to_cpu(key->cat.name.length);
key->key_len = cpu_to_be16(6 + 2 * len);
+ return 0;
+}
+
+/* Generates key for catalog thread record. */
+void hfsplus_cat_build_key_with_cnid(struct super_block *sb,
+ hfsplus_btree_key *key, u32 parent)
+{
+ key->cat.parent = cpu_to_be32(parent);
+ key->cat.name.length = 0;
+ key->key_len = cpu_to_be16(6);
}
static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
@@ -167,11 +176,16 @@ static int hfsplus_fill_cat_thread(struct super_block *sb,
hfsplus_cat_entry *entry, int type,
u32 parentid, struct qstr *str)
{
+ int err;
+
entry->type = cpu_to_be16(type);
entry->thread.reserved = 0;
entry->thread.parentID = cpu_to_be32(parentid);
- hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN,
+ err = hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN,
str->name, str->len);
+ if (unlikely(err < 0))
+ return err;
+
return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2;
}
@@ -183,7 +197,7 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
int err;
u16 type;
- hfsplus_cat_build_key(sb, fd->search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd->search_key, cnid);
err = hfs_brec_read(fd, &tmp, sizeof(hfsplus_cat_entry));
if (err)
return err;
@@ -250,11 +264,16 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
if (err)
return err;
- hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
entry_size = hfsplus_fill_cat_thread(sb, &entry,
S_ISDIR(inode->i_mode) ?
HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD,
dir->i_ino, str);
+ if (unlikely(entry_size < 0)) {
+ err = entry_size;
+ goto err2;
+ }
+
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
if (!err)
@@ -265,7 +284,10 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
if (err)
goto err2;
- hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+ err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+ if (unlikely(err))
+ goto err1;
+
entry_size = hfsplus_cat_build_record(&entry, cnid, inode);
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
@@ -288,7 +310,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
return 0;
err1:
- hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
if (!hfs_brec_find(&fd, hfs_find_rec_by_key))
hfs_brec_remove(&fd);
err2:
@@ -313,7 +335,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
if (!str) {
int len;
- hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -329,7 +351,9 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
off + 2, len);
fd.search_key->key_len = cpu_to_be16(6 + len);
} else
- hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+ err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
+ if (unlikely(err))
+ goto out;
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
@@ -360,7 +384,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
if (err)
goto out;
- hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -405,7 +429,11 @@ int hfsplus_rename_cat(u32 cnid,
dst_fd = src_fd;
/* find the old dir entry and read the data */
- hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
+ err = hfsplus_cat_build_key(sb, src_fd.search_key,
+ src_dir->i_ino, src_name);
+ if (unlikely(err))
+ goto out;
+
err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -419,7 +447,11 @@ int hfsplus_rename_cat(u32 cnid,
type = be16_to_cpu(entry.type);
/* create new dir entry with the data from the old entry */
- hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name);
+ err = hfsplus_cat_build_key(sb, dst_fd.search_key,
+ dst_dir->i_ino, dst_name);
+ if (unlikely(err))
+ goto out;
+
err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
if (!err)
@@ -436,7 +468,11 @@ int hfsplus_rename_cat(u32 cnid,
dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
/* finally remove the old entry */
- hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
+ err = hfsplus_cat_build_key(sb, src_fd.search_key,
+ src_dir->i_ino, src_name);
+ if (unlikely(err))
+ goto out;
+
err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -449,7 +485,7 @@ int hfsplus_rename_cat(u32 cnid,
src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
/* remove old thread entry */
- hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid);
err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -459,9 +495,14 @@ int hfsplus_rename_cat(u32 cnid,
goto out;
/* create new thread entry */
- hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, dst_fd.search_key, cnid);
entry_size = hfsplus_fill_cat_thread(sb, &entry, type,
dst_dir->i_ino, dst_name);
+ if (unlikely(entry_size < 0)) {
+ err = entry_size;
+ goto out;
+ }
+
err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
if (!err)
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 610a3260bef1..435bea231cc6 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -44,7 +44,10 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
if (err)
return ERR_PTR(err);
- hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
+ err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino,
+ &dentry->d_name);
+ if (unlikely(err < 0))
+ goto fail;
again:
err = hfs_brec_read(&fd, &entry, sizeof(entry));
if (err) {
@@ -97,9 +100,11 @@ again:
be32_to_cpu(entry.file.permissions.dev);
str.len = sprintf(name, "iNode%d", linkid);
str.name = name;
- hfsplus_cat_build_key(sb, fd.search_key,
+ err = hfsplus_cat_build_key(sb, fd.search_key,
HFSPLUS_SB(sb)->hidden_dir->i_ino,
&str);
+ if (unlikely(err < 0))
+ goto fail;
goto again;
}
} else if (!dentry->d_fsdata)
@@ -145,7 +150,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
err = -ENOMEM;
goto out;
}
- hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
+ hfsplus_cat_build_key_with_cnid(sb, fd.search_key, inode->i_ino);
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
goto out;
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index eb5e059f481a..b0441d65fa54 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -443,8 +443,10 @@ int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1,
const hfsplus_btree_key *k2);
int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1,
const hfsplus_btree_key *k2);
-void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key,
+int hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key,
u32 parent, struct qstr *str);
+void hfsplus_cat_build_key_with_cnid(struct super_block *sb,
+ hfsplus_btree_key *key, u32 parent);
void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
int hfsplus_find_cat(struct super_block *sb, u32 cnid,
struct hfs_find_data *fd);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 4cf2024b87da..593af2fdcc2d 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -515,7 +515,9 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
err = hfs_find_init(sbi->cat_tree, &fd);
if (err)
goto out_put_root;
- hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
+ err = hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
+ if (unlikely(err < 0))
+ goto out_put_root;
if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
hfs_find_exit(&fd);
if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 4338ff32959d..5f2755117ce7 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -548,10 +548,11 @@ struct hppfs_dirent {
struct dentry *dentry;
};
-static int hppfs_filldir(void *d, const char *name, int size,
+static int hppfs_filldir(struct dir_context *ctx, const char *name, int size,
loff_t offset, u64 inode, unsigned int type)
{
- struct hppfs_dirent *dirent = d;
+ struct hppfs_dirent *dirent =
+ container_of(ctx, struct hppfs_dirent, ctx);
if (file_removed(dirent->dentry, name))
return 0;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 1e2872b25343..5eba47f593f8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
pgoff = offset >> PAGE_SHIFT;
i_size_write(inode, offset);
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
truncate_hugepages(inode, offset);
return 0;
}
@@ -472,12 +472,12 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
}
/*
- * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never
+ * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
* be taken from reclaim -- unlike regular filesystems. This needs an
* annotation because huge_pmd_share() does an allocation under
- * i_mmap_mutex.
+ * i_mmap_rwsem.
*/
-static struct lock_class_key hugetlbfs_i_mmap_mutex_key;
+static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
static struct inode *hugetlbfs_get_inode(struct super_block *sb,
struct inode *dir,
@@ -495,8 +495,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
struct hugetlbfs_inode_info *info;
inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
- lockdep_set_class(&inode->i_mapping->i_mmap_mutex,
- &hugetlbfs_i_mmap_mutex_key);
+ lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
+ &hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/inode.c b/fs/inode.c
index 26753ba7b6d6..aa149e7262ac 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -114,6 +114,11 @@ int proc_nr_inodes(struct ctl_table *table, int write,
}
#endif
+static int no_open(struct inode *inode, struct file *file)
+{
+ return -ENXIO;
+}
+
/**
* inode_init_always - perform inode structure intialisation
* @sb: superblock inode belongs to
@@ -125,7 +130,7 @@ int proc_nr_inodes(struct ctl_table *table, int write,
int inode_init_always(struct super_block *sb, struct inode *inode)
{
static const struct inode_operations empty_iops;
- static const struct file_operations empty_fops;
+ static const struct file_operations no_open_fops = {.open = no_open};
struct address_space *const mapping = &inode->i_data;
inode->i_sb = sb;
@@ -133,7 +138,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_flags = 0;
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
- inode->i_fop = &empty_fops;
+ inode->i_fop = &no_open_fops;
inode->__i_nlink = 1;
inode->i_opflags = 0;
i_uid_write(inode, 0);
@@ -143,9 +148,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_blocks = 0;
inode->i_bytes = 0;
inode->i_generation = 0;
-#ifdef CONFIG_QUOTA
- memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
-#endif
inode->i_pipe = NULL;
inode->i_bdev = NULL;
inode->i_cdev = NULL;
@@ -349,7 +351,7 @@ void address_space_init_once(struct address_space *mapping)
memset(mapping, 0, sizeof(*mapping));
INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
spin_lock_init(&mapping->tree_lock);
- mutex_init(&mapping->i_mmap_mutex);
+ init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
mapping->i_mmap = RB_ROOT;
@@ -1801,7 +1803,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
} else if (S_ISFIFO(mode))
inode->i_fop = &pipefifo_fops;
else if (S_ISSOCK(mode))
- inode->i_fop = &bad_sock_fops;
+ ; /* leave it no_open_fops */
else
printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
" inode %s:%lu\n", mode, inode->i_sb->s_id,
diff --git a/fs/internal.h b/fs/internal.h
index 9477f8f6aefc..e9a61fe67575 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -47,7 +47,6 @@ extern void __init chrdev_init(void);
/*
* namei.c
*/
-extern int __inode_permission(struct inode *, int);
extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
const char *, unsigned int, struct path *);
@@ -139,12 +138,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
/*
- * splice.c
- */
-extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
- loff_t *opos, size_t len, unsigned int flags);
-
-/*
* pipe.c
*/
extern const struct file_operations pipefifo_fops;
@@ -154,3 +147,8 @@ extern const struct file_operations pipefifo_fops;
*/
extern void sb_pin_kill(struct super_block *sb);
extern void mnt_pin_kill(struct mount *m);
+
+/*
+ * fs/nsfs.c
+ */
+extern struct dentry_operations ns_dentry_operations;
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 8ac3fad36192..214c3c11fbc2 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -443,7 +443,7 @@ int ioctl_preallocate(struct file *filp, void __user *argp)
return -EINVAL;
}
- return do_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
+ return vfs_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
}
static int file_ioctl(struct file *filp, unsigned int cmd,
@@ -518,10 +518,12 @@ static int ioctl_fsfreeze(struct file *filp)
return -EPERM;
/* If filesystem doesn't support freeze feature, return. */
- if (sb->s_op->freeze_fs == NULL)
+ if (sb->s_op->freeze_fs == NULL && sb->s_op->freeze_super == NULL)
return -EOPNOTSUPP;
/* Freeze */
+ if (sb->s_op->freeze_super)
+ return sb->s_op->freeze_super(sb);
return freeze_super(sb);
}
@@ -533,6 +535,8 @@ static int ioctl_fsthaw(struct file *filp)
return -EPERM;
/* Thaw */
+ if (sb->s_op->thaw_super)
+ return sb->s_op->thaw_super(sb);
return thaw_super(sb);
}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 881b3bd0143f..d67a16f2a45d 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -29,13 +29,9 @@
#define BEQUIET
static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
-static int isofs_hash(const struct dentry *parent, struct qstr *qstr);
static int isofs_dentry_cmpi(const struct dentry *parent,
const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
-static int isofs_dentry_cmp(const struct dentry *parent,
- const struct dentry *dentry,
- unsigned int len, const char *str, const struct qstr *name);
#ifdef CONFIG_JOLIET
static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr);
@@ -135,10 +131,6 @@ static const struct super_operations isofs_sops = {
static const struct dentry_operations isofs_dentry_ops[] = {
{
- .d_hash = isofs_hash,
- .d_compare = isofs_dentry_cmp,
- },
- {
.d_hash = isofs_hashi,
.d_compare = isofs_dentry_cmpi,
},
@@ -182,27 +174,6 @@ struct iso9660_options{
* Compute the hash for the isofs name corresponding to the dentry.
*/
static int
-isofs_hash_common(struct qstr *qstr, int ms)
-{
- const char *name;
- int len;
-
- len = qstr->len;
- name = qstr->name;
- if (ms) {
- while (len && name[len-1] == '.')
- len--;
- }
-
- qstr->hash = full_name_hash(name, len);
-
- return 0;
-}
-
-/*
- * Compute the hash for the isofs name corresponding to the dentry.
- */
-static int
isofs_hashi_common(struct qstr *qstr, int ms)
{
const char *name;
@@ -258,32 +229,40 @@ static int isofs_dentry_cmp_common(
}
static int
-isofs_hash(const struct dentry *dentry, struct qstr *qstr)
-{
- return isofs_hash_common(qstr, 0);
-}
-
-static int
isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
{
return isofs_hashi_common(qstr, 0);
}
static int
-isofs_dentry_cmp(const struct dentry *parent, const struct dentry *dentry,
+isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
- return isofs_dentry_cmp_common(len, str, name, 0, 0);
+ return isofs_dentry_cmp_common(len, str, name, 0, 1);
}
+#ifdef CONFIG_JOLIET
+/*
+ * Compute the hash for the isofs name corresponding to the dentry.
+ */
static int
-isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
- unsigned int len, const char *str, const struct qstr *name)
+isofs_hash_common(struct qstr *qstr, int ms)
{
- return isofs_dentry_cmp_common(len, str, name, 0, 1);
+ const char *name;
+ int len;
+
+ len = qstr->len;
+ name = qstr->name;
+ if (ms) {
+ while (len && name[len-1] == '.')
+ len--;
+ }
+
+ qstr->hash = full_name_hash(name, len);
+
+ return 0;
}
-#ifdef CONFIG_JOLIET
static int
isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
{
@@ -930,7 +909,8 @@ root_found:
if (opt.check == 'r')
table++;
- s->s_d_op = &isofs_dentry_ops[table];
+ if (table)
+ s->s_d_op = &isofs_dentry_ops[table - 1];
/* get the root dentry */
s->s_root = d_make_root(inode);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 95295640d9c8..7b543e6b6526 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -18,25 +18,10 @@ static int
isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
{
struct qstr qstr;
-
- if (!compare)
- return 1;
-
- /* check special "." and ".." files */
- if (dlen == 1) {
- /* "." */
- if (compare[0] == 0) {
- if (!dentry->d_name.len)
- return 0;
- compare = ".";
- } else if (compare[0] == 1) {
- compare = "..";
- dlen = 2;
- }
- }
-
qstr.name = compare;
qstr.len = dlen;
+ if (likely(!dentry->d_op))
+ return dentry->d_name.len != dlen || memcmp(dentry->d_name.name, compare, dlen);
return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
}
@@ -146,7 +131,8 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
(!(de->flags[-sbi->s_high_sierra] & 1))) &&
(sbi->s_showassoc ||
(!(de->flags[-sbi->s_high_sierra] & 4)))) {
- match = (isofs_cmp(dentry, dpnt, dlen) == 0);
+ if (dpnt && (dlen > 1 || dpnt[0] > 1))
+ match = (isofs_cmp(dentry, dpnt, dlen) == 0);
}
if (match) {
isofs_normalize_block_and_offset(de,
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index f488bbae541a..bb63254ed848 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -30,6 +30,7 @@ struct rock_state {
int cont_size;
int cont_extent;
int cont_offset;
+ int cont_loops;
struct inode *inode;
};
@@ -73,6 +74,9 @@ static void init_rock_state(struct rock_state *rs, struct inode *inode)
rs->inode = inode;
}
+/* Maximum number of Rock Ridge continuation entries */
+#define RR_MAX_CE_ENTRIES 32
+
/*
* Returns 0 if the caller should continue scanning, 1 if the scan must end
* and -ve on error.
@@ -105,6 +109,8 @@ static int rock_continue(struct rock_state *rs)
goto out;
}
ret = -EIO;
+ if (++rs->cont_loops >= RR_MAX_CE_ENTRIES)
+ goto out;
bh = sb_bread(rs->inode->i_sb, rs->cont_extent);
if (bh) {
memcpy(rs->buffer, bh->b_data + rs->cont_offset,
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index aab8549591e7..c46a79adb6ad 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1373,8 +1373,7 @@ int journal_destroy(journal_t *journal)
}
mutex_unlock(&journal->j_checkpoint_mutex);
- if (journal->j_inode)
- iput(journal->j_inode);
+ iput(journal->j_inode);
if (journal->j_revoke)
journal_destroy_revoke(journal);
kfree(journal->j_wbuf);
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 8898bbd2b61e..dcead636c33b 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -93,6 +93,7 @@
#include <linux/bio.h>
#endif
#include <linux/log2.h>
+#include <linux/hash.h>
static struct kmem_cache *revoke_record_cache;
static struct kmem_cache *revoke_table_cache;
@@ -129,15 +130,11 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
/* Utility functions to maintain the revoke table */
-/* Borrowed from buffer.c: this is a tried and tested block hash function */
static inline int hash(journal_t *journal, unsigned int block)
{
struct jbd_revoke_table_s *table = journal->j_revoke;
- int hash_shift = table->hash_shift;
- return ((block << (hash_shift - 6)) ^
- (block >> 13) ^
- (block << (hash_shift - 12))) & (table->hash_size - 1);
+ return hash_32(block, table->hash_shift);
}
static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e4dc74713a43..b96bd8076b70 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1714,8 +1714,7 @@ int jbd2_journal_destroy(journal_t *journal)
if (journal->j_proc_entry)
jbd2_stats_proc_exit(journal);
- if (journal->j_inode)
- iput(journal->j_inode);
+ iput(journal->j_inode);
if (journal->j_revoke)
jbd2_journal_destroy_revoke(journal);
if (journal->j_chksum_driver)
@@ -1853,13 +1852,12 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
journal->j_chksum_driver = NULL;
return 0;
}
- }
- /* Precompute checksum seed for all metadata */
- if (jbd2_journal_has_csum_v2or3(journal))
+ /* Precompute checksum seed for all metadata */
journal->j_csum_seed = jbd2_chksum(journal, ~0,
sb->s_uuid,
sizeof(sb->s_uuid));
+ }
}
/* If enabling v1 checksums, downgrade superblock */
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index d5e95a175c92..c6cbaef2bda1 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -92,6 +92,7 @@
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/log2.h>
+#include <linux/hash.h>
#endif
static struct kmem_cache *jbd2_revoke_record_cache;
@@ -130,16 +131,9 @@ static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
/* Utility functions to maintain the revoke table */
-/* Borrowed from buffer.c: this is a tried and tested block hash function */
static inline int hash(journal_t *journal, unsigned long long block)
{
- struct jbd2_revoke_table_s *table = journal->j_revoke;
- int hash_shift = table->hash_shift;
- int hash = (int)block ^ (int)((block >> 31) >> 1);
-
- return ((hash << (hash_shift - 6)) ^
- (hash >> 13) ^
- (hash << (hash_shift - 12))) & (table->hash_size - 1);
+ return hash_64(block, journal->j_revoke->hash_shift);
}
static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 386303dca382..dddbde4f56f4 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -224,7 +224,7 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw));
- /* If a node has zero dsize, we only have to keep if it if it might be the
+ /* If a node has zero dsize, we only have to keep it if it might be the
node with highest version -- i.e. the one which will end up as f->metadata.
Note that such nodes won't be REF_UNCHECKED since there are no data to
check anyway. */
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index c522d098bb4f..bc5385471a6e 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -844,6 +844,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
/* Write out summary information - called from jffs2_do_reserve_space */
int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
+ __must_hold(&c->erase_completion_block)
{
int datasize, infosize, padsize;
struct jffs2_eraseblock *jeb;
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index cf47f09e8ac8..fa7e795bd8ae 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -94,6 +94,9 @@ struct jfs_inode_info {
unchar _inline_ea[128]; /* 128: inline extended attr */
} link;
} u;
+#ifdef CONFIG_QUOTA
+ struct dquot *i_dquot[MAXQUOTAS];
+#endif
u32 dev; /* will die when we get wide dev_t */
struct inode vfs_inode;
};
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index d59c7defb1ef..38fdc533f4ec 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -84,7 +84,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode,
struct inode *iplist[2];
struct tblock *tblk;
- jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name);
+ jfs_info("jfs_create: dip:0x%p name:%pd", dip, dentry);
dquot_initialize(dip);
@@ -216,7 +216,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
struct inode *iplist[2];
struct tblock *tblk;
- jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name);
+ jfs_info("jfs_mkdir: dip:0x%p name:%pd", dip, dentry);
dquot_initialize(dip);
@@ -352,7 +352,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
struct inode *iplist[2];
struct tblock *tblk;
- jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
+ jfs_info("jfs_rmdir: dip:0x%p name:%pd", dip, dentry);
/* Init inode for quota operations. */
dquot_initialize(dip);
@@ -480,7 +480,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
s64 new_size = 0;
int commit_flag;
- jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name);
+ jfs_info("jfs_unlink: dip:0x%p name:%pd", dip, dentry);
/* Init inode for quota operations. */
dquot_initialize(dip);
@@ -797,8 +797,7 @@ static int jfs_link(struct dentry *old_dentry,
struct btstack btstack;
struct inode *iplist[2];
- jfs_info("jfs_link: %s %s", old_dentry->d_name.name,
- dentry->d_name.name);
+ jfs_info("jfs_link: %pd %pd", old_dentry, dentry);
dquot_initialize(dir);
@@ -1082,8 +1081,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
int commit_flag;
- jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
- new_dentry->d_name.name);
+ jfs_info("jfs_rename: %pd %pd", old_dentry, new_dentry);
dquot_initialize(old_dir);
dquot_initialize(new_dir);
@@ -1355,7 +1353,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
- jfs_info("jfs_mknod: %s", dentry->d_name.name);
+ jfs_info("jfs_mknod: %pd", dentry);
dquot_initialize(dir);
@@ -1444,7 +1442,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, unsig
struct component_name key;
int rc;
- jfs_info("jfs_lookup: name = %s", dentry->d_name.name);
+ jfs_info("jfs_lookup: name = %pd", dentry);
if ((rc = get_UCSname(&key, dentry)))
return ERR_PTR(rc);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 93e897e588a8..16c3a9556634 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -117,6 +117,9 @@ static struct inode *jfs_alloc_inode(struct super_block *sb)
jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS);
if (!jfs_inode)
return NULL;
+#ifdef CONFIG_QUOTA
+ memset(&jfs_inode->i_dquot, 0, sizeof(jfs_inode->i_dquot));
+#endif
return &jfs_inode->vfs_inode;
}
@@ -537,6 +540,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_QUOTA
sb->dq_op = &dquot_operations;
sb->s_qcop = &dquot_quotactl_ops;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
#endif
/*
@@ -836,6 +840,10 @@ out:
return len - towrite;
}
+static struct dquot **jfs_get_dquots(struct inode *inode)
+{
+ return JFS_IP(inode)->i_dquot;
+}
#endif
static const struct super_operations jfs_super_operations = {
@@ -854,6 +862,7 @@ static const struct super_operations jfs_super_operations = {
#ifdef CONFIG_QUOTA
.quota_read = jfs_quota_read,
.quota_write = jfs_quota_write,
+ .get_dquots = jfs_get_dquots,
#endif
};
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 1c771931bb60..37989f02a226 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -807,7 +807,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
}
/* instantiate and hash dentry */
- ret = d_materialise_unique(dentry, inode);
+ ret = d_splice_alias(inode, dentry);
out_unlock:
mutex_unlock(&kernfs_mutex);
return ret;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 4429d6d9217f..697390ea47b8 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -106,7 +106,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
const struct kernfs_ops *ops;
/*
- * @of->mutex nests outside active ref and is just to ensure that
+ * @of->mutex nests outside active ref and is primarily to ensure that
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
@@ -189,13 +189,16 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
const struct kernfs_ops *ops;
char *buf;
- buf = kmalloc(len, GFP_KERNEL);
+ buf = of->prealloc_buf;
+ if (!buf)
+ buf = kmalloc(len, GFP_KERNEL);
if (!buf)
return -ENOMEM;
/*
- * @of->mutex nests outside active ref and is just to ensure that
- * the ops aren't called concurrently for the same open file.
+ * @of->mutex nests outside active ref and is used both to ensure that
+ * the ops aren't called concurrently for the same open file, and
+ * to provide exclusive access to ->prealloc_buf (when that exists).
*/
mutex_lock(&of->mutex);
if (!kernfs_get_active(of->kn)) {
@@ -210,21 +213,22 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
else
len = -EINVAL;
- kernfs_put_active(of->kn);
- mutex_unlock(&of->mutex);
-
if (len < 0)
- goto out_free;
+ goto out_unlock;
if (copy_to_user(user_buf, buf, len)) {
len = -EFAULT;
- goto out_free;
+ goto out_unlock;
}
*ppos += len;
+ out_unlock:
+ kernfs_put_active(of->kn);
+ mutex_unlock(&of->mutex);
out_free:
- kfree(buf);
+ if (buf != of->prealloc_buf)
+ kfree(buf);
return len;
}
@@ -278,19 +282,16 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
len = min_t(size_t, count, PAGE_SIZE);
}
- buf = kmalloc(len + 1, GFP_KERNEL);
+ buf = of->prealloc_buf;
+ if (!buf)
+ buf = kmalloc(len + 1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
- if (copy_from_user(buf, user_buf, len)) {
- len = -EFAULT;
- goto out_free;
- }
- buf[len] = '\0'; /* guarantee string termination */
-
/*
- * @of->mutex nests outside active ref and is just to ensure that
- * the ops aren't called concurrently for the same open file.
+ * @of->mutex nests outside active ref and is used both to ensure that
+ * the ops aren't called concurrently for the same open file, and
+ * to provide exclusive access to ->prealloc_buf (when that exists).
*/
mutex_lock(&of->mutex);
if (!kernfs_get_active(of->kn)) {
@@ -299,19 +300,27 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
goto out_free;
}
+ if (copy_from_user(buf, user_buf, len)) {
+ len = -EFAULT;
+ goto out_unlock;
+ }
+ buf[len] = '\0'; /* guarantee string termination */
+
ops = kernfs_ops(of->kn);
if (ops->write)
len = ops->write(of, buf, len, *ppos);
else
len = -EINVAL;
- kernfs_put_active(of->kn);
- mutex_unlock(&of->mutex);
-
if (len > 0)
*ppos += len;
+
+out_unlock:
+ kernfs_put_active(of->kn);
+ mutex_unlock(&of->mutex);
out_free:
- kfree(buf);
+ if (buf != of->prealloc_buf)
+ kfree(buf);
return len;
}
@@ -685,6 +694,22 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
*/
of->atomic_write_len = ops->atomic_write_len;
+ error = -EINVAL;
+ /*
+ * ->seq_show is incompatible with ->prealloc,
+ * as seq_read does its own allocation.
+ * ->read must be used instead.
+ */
+ if (ops->prealloc && ops->seq_show)
+ goto err_free;
+ if (ops->prealloc) {
+ int len = of->atomic_write_len ?: PAGE_SIZE;
+ of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL);
+ error = -ENOMEM;
+ if (!of->prealloc_buf)
+ goto err_free;
+ }
+
/*
* Always instantiate seq_file even if read access doesn't use
* seq_file or is not requested. This unifies private data access
@@ -715,6 +740,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
err_close:
seq_release(inode, file);
err_free:
+ kfree(of->prealloc_buf);
kfree(of);
err_out:
kernfs_put_active(kn);
@@ -728,6 +754,7 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
kernfs_put_open_node(kn, of);
seq_release(inode, filp);
+ kfree(of->prealloc_buf);
kfree(of);
return 0;
diff --git a/fs/libfs.c b/fs/libfs.c
index 171d2846f2a3..005843ce5dbd 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -114,18 +114,18 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
spin_lock(&dentry->d_lock);
/* d_lock not required for cursor */
- list_del(&cursor->d_u.d_child);
+ list_del(&cursor->d_child);
p = dentry->d_subdirs.next;
while (n && p != &dentry->d_subdirs) {
struct dentry *next;
- next = list_entry(p, struct dentry, d_u.d_child);
+ next = list_entry(p, struct dentry, d_child);
spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(next))
n--;
spin_unlock(&next->d_lock);
p = p->next;
}
- list_add_tail(&cursor->d_u.d_child, p);
+ list_add_tail(&cursor->d_child, p);
spin_unlock(&dentry->d_lock);
}
}
@@ -150,7 +150,7 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
struct dentry *cursor = file->private_data;
- struct list_head *p, *q = &cursor->d_u.d_child;
+ struct list_head *p, *q = &cursor->d_child;
if (!dir_emit_dots(file, ctx))
return 0;
@@ -159,7 +159,7 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
list_move(q, &dentry->d_subdirs);
for (p = q->next; p != &dentry->d_subdirs; p = p->next) {
- struct dentry *next = list_entry(p, struct dentry, d_u.d_child);
+ struct dentry *next = list_entry(p, struct dentry, d_child);
spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
if (!simple_positive(next)) {
spin_unlock(&next->d_lock);
@@ -287,7 +287,7 @@ int simple_empty(struct dentry *dentry)
int ret = 0;
spin_lock(&dentry->d_lock);
- list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
+ list_for_each_entry(child, &dentry->d_subdirs, d_child) {
spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(child)) {
spin_unlock(&child->d_lock);
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 9106f42c472c..1cc6ec51e6b1 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -214,7 +214,7 @@ int nsm_monitor(const struct nlm_host *host)
if (unlikely(res.status != 0))
status = -EIO;
if (unlikely(status < 0)) {
- printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name);
+ pr_notice_ratelimited("lockd: cannot monitor %s\n", nsm->sm_name);
return status;
}
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index d1bb7ecfd201..e94c887da2d7 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -350,7 +350,7 @@ static struct svc_serv *lockd_create_svc(void)
printk(KERN_WARNING
"lockd_up: no pid, %d users??\n", nlmsvc_users);
- serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
+ serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
return ERR_PTR(-ENOMEM);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 13db95f54176..56598742dde4 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -53,7 +53,7 @@ static const struct rpc_call_ops nlmsvc_grant_ops;
static LIST_HEAD(nlm_blocked);
static DEFINE_SPINLOCK(nlm_blocked_lock);
-#ifdef LOCKD_DEBUG
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
{
/*
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index b6f3b84b6e99..d12ff4e2dbe7 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -408,7 +408,7 @@ nlmsvc_match_sb(void *datap, struct nlm_file *file)
{
struct super_block *sb = datap;
- return sb == file->f_file->f_path.dentry->d_sb;
+ return sb == file_inode(file->f_file)->i_sb;
}
/**
diff --git a/fs/mount.h b/fs/mount.h
index f82c62840905..0ad6f760ce52 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -1,10 +1,11 @@
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
+#include <linux/ns_common.h>
struct mnt_namespace {
atomic_t count;
- unsigned int proc_inum;
+ struct ns_common ns;
struct mount * root;
struct list_head list;
struct user_namespace *user_ns;
diff --git a/fs/namei.c b/fs/namei.c
index 43927d14db67..bc35b02883bb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -130,7 +130,7 @@ void final_putname(struct filename *name)
#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
-static struct filename *
+struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
struct filename *result, *err;
@@ -416,6 +416,7 @@ int __inode_permission(struct inode *inode, int mask)
return security_inode_permission(inode, mask);
}
+EXPORT_SYMBOL(__inode_permission);
/**
* sb_permission - Check superblock-level permissions
@@ -486,6 +487,19 @@ void path_put(const struct path *path)
}
EXPORT_SYMBOL(path_put);
+struct nameidata {
+ struct path path;
+ struct qstr last;
+ struct path root;
+ struct inode *inode; /* path.dentry.d_inode */
+ unsigned int flags;
+ unsigned seq, m_seq;
+ int last_type;
+ unsigned depth;
+ struct file *base;
+ char *saved_names[MAX_NESTED_LINKS + 1];
+};
+
/*
* Path walking has 2 modes, rcu-walk and ref-walk (see
* Documentation/filesystems/path-lookup.txt). In situations when we can't
@@ -694,6 +708,18 @@ void nd_jump_link(struct nameidata *nd, struct path *path)
nd->flags |= LOOKUP_JUMPED;
}
+void nd_set_link(struct nameidata *nd, char *path)
+{
+ nd->saved_names[nd->depth] = path;
+}
+EXPORT_SYMBOL(nd_set_link);
+
+char *nd_get_link(struct nameidata *nd)
+{
+ return nd->saved_names[nd->depth];
+}
+EXPORT_SYMBOL(nd_get_link);
+
static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
{
struct inode *inode = link->dentry->d_inode;
@@ -1820,13 +1846,14 @@ static int link_path_walk(const char *name, struct nameidata *nd)
}
static int path_init(int dfd, const char *name, unsigned int flags,
- struct nameidata *nd, struct file **fp)
+ struct nameidata *nd)
{
int retval = 0;
nd->last_type = LAST_ROOT; /* if there are only slashes... */
- nd->flags = flags | LOOKUP_JUMPED;
+ nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
nd->depth = 0;
+ nd->base = NULL;
if (flags & LOOKUP_ROOT) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
@@ -1846,7 +1873,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
} else {
path_get(&nd->path);
}
- return 0;
+ goto done;
}
nd->root.mnt = NULL;
@@ -1896,7 +1923,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
nd->path = f.file->f_path;
if (flags & LOOKUP_RCU) {
if (f.flags & FDPUT_FPUT)
- *fp = f.file;
+ nd->base = f.file;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
rcu_read_lock();
} else {
@@ -1907,13 +1934,26 @@ static int path_init(int dfd, const char *name, unsigned int flags,
nd->inode = nd->path.dentry->d_inode;
if (!(flags & LOOKUP_RCU))
- return 0;
+ goto done;
if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
- return 0;
+ goto done;
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
rcu_read_unlock();
return -ECHILD;
+done:
+ current->total_link_count = 0;
+ return link_path_walk(name, nd);
+}
+
+static void path_cleanup(struct nameidata *nd)
+{
+ if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+ path_put(&nd->root);
+ nd->root.mnt = NULL;
+ }
+ if (unlikely(nd->base))
+ fput(nd->base);
}
static inline int lookup_last(struct nameidata *nd, struct path *path)
@@ -1929,7 +1969,6 @@ static inline int lookup_last(struct nameidata *nd, struct path *path)
static int path_lookupat(int dfd, const char *name,
unsigned int flags, struct nameidata *nd)
{
- struct file *base = NULL;
struct path path;
int err;
@@ -1947,14 +1986,7 @@ static int path_lookupat(int dfd, const char *name,
* be handled by restarting a traditional ref-walk (which will always
* be able to complete).
*/
- err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
-
- if (unlikely(err))
- goto out;
-
- current->total_link_count = 0;
- err = link_path_walk(name, nd);
-
+ err = path_init(dfd, name, flags, nd);
if (!err && !(flags & LOOKUP_PARENT)) {
err = lookup_last(nd, &path);
while (err > 0) {
@@ -1982,14 +2014,7 @@ static int path_lookupat(int dfd, const char *name,
}
}
-out:
- if (base)
- fput(base);
-
- if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
- path_put(&nd->root);
- nd->root.mnt = NULL;
- }
+ path_cleanup(nd);
return err;
}
@@ -2296,19 +2321,13 @@ out:
static int
path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags)
{
- struct file *base = NULL;
struct nameidata nd;
int err;
- err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base);
+ err = path_init(dfd, name, flags, &nd);
if (unlikely(err))
goto out;
- current->total_link_count = 0;
- err = link_path_walk(name, &nd);
- if (err)
- goto out;
-
err = mountpoint_last(&nd, path);
while (err > 0) {
void *cookie;
@@ -2324,12 +2343,7 @@ path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags
put_link(&nd, &link, cookie);
}
out:
- if (base)
- fput(base);
-
- if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
- path_put(&nd.root);
-
+ path_cleanup(&nd);
return err;
}
@@ -2383,22 +2397,17 @@ kern_path_mountpoint(int dfd, const char *name, struct path *path,
}
EXPORT_SYMBOL(kern_path_mountpoint);
-/*
- * It's inline, so penalty for filesystems that don't use sticky bit is
- * minimal.
- */
-static inline int check_sticky(struct inode *dir, struct inode *inode)
+int __check_sticky(struct inode *dir, struct inode *inode)
{
kuid_t fsuid = current_fsuid();
- if (!(dir->i_mode & S_ISVTX))
- return 0;
if (uid_eq(inode->i_uid, fsuid))
return 0;
if (uid_eq(dir->i_uid, fsuid))
return 0;
return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
}
+EXPORT_SYMBOL(__check_sticky);
/*
* Check whether we can remove a link victim from directory dir, check
@@ -2501,7 +2510,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
}
mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
+ mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2);
return NULL;
}
EXPORT_SYMBOL(lock_rename);
@@ -3064,9 +3073,12 @@ finish_open_created:
error = may_open(&nd->path, acc_mode, open_flag);
if (error)
goto out;
- file->f_path.mnt = nd->path.mnt;
- error = finish_open(file, nd->path.dentry, NULL, opened);
- if (error) {
+
+ BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
+ error = vfs_open(&nd->path, file, current_cred());
+ if (!error) {
+ *opened |= FILE_OPENED;
+ } else {
if (error == -EOPENSTALE)
goto stale_open;
goto out;
@@ -3155,7 +3167,8 @@ static int do_tmpfile(int dfd, struct filename *pathname,
if (error)
goto out2;
audit_inode(pathname, nd->path.dentry, 0);
- error = may_open(&nd->path, op->acc_mode, op->open_flag);
+ /* Don't check for other permissions, the inode was just created */
+ error = may_open(&nd->path, MAY_OPEN, op->open_flag);
if (error)
goto out2;
file->f_path.mnt = nd->path.mnt;
@@ -3181,7 +3194,6 @@ out:
static struct file *path_openat(int dfd, struct filename *pathname,
struct nameidata *nd, const struct open_flags *op, int flags)
{
- struct file *base = NULL;
struct file *file;
struct path path;
int opened = 0;
@@ -3198,12 +3210,7 @@ static struct file *path_openat(int dfd, struct filename *pathname,
goto out;
}
- error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
- if (unlikely(error))
- goto out;
-
- current->total_link_count = 0;
- error = link_path_walk(pathname->name, nd);
+ error = path_init(dfd, pathname->name, flags, nd);
if (unlikely(error))
goto out;
@@ -3229,10 +3236,7 @@ static struct file *path_openat(int dfd, struct filename *pathname,
put_link(nd, &link, cookie);
}
out:
- if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
- path_put(&nd->root);
- if (base)
- fput(base);
+ path_cleanup(nd);
if (!(opened & FILE_OPENED)) {
BUG_ON(!error);
put_filp(file);
@@ -4210,12 +4214,16 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
bool should_retry = false;
int error;
- if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
- if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE))
+ if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
+ (flags & RENAME_EXCHANGE))
return -EINVAL;
+ if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
+ return -EPERM;
+
retry:
from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
if (IS_ERR(from)) {
@@ -4347,6 +4355,20 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}
+int vfs_whiteout(struct inode *dir, struct dentry *dentry)
+{
+ int error = may_create(dir, dentry);
+ if (error)
+ return error;
+
+ if (!dir->i_op->mknod)
+ return -EPERM;
+
+ return dir->i_op->mknod(dir, dentry,
+ S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+}
+EXPORT_SYMBOL(vfs_whiteout);
+
int readlink_copy(char __user *buffer, int buflen, const char *link)
{
int len = PTR_ERR(link);
diff --git a/fs/namespace.c b/fs/namespace.c
index fbba8b17330d..cd1e9681a0cf 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -963,7 +963,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
}
/* Don't allow unprivileged users to reveal what is under a mount */
- if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
+ if ((flag & CL_UNPRIVILEGED) &&
+ (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire)))
mnt->mnt.mnt_flags |= MNT_LOCKED;
atomic_inc(&sb->s_active);
@@ -1369,6 +1370,8 @@ void umount_tree(struct mount *mnt, int how)
}
if (last) {
last->mnt_hash.next = unmounted.first;
+ if (unmounted.first)
+ unmounted.first->pprev = &last->mnt_hash.next;
unmounted.first = tmp_list.first;
unmounted.first->pprev = &unmounted.first;
}
@@ -1544,6 +1547,9 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
goto dput_and_out;
if (mnt->mnt.mnt_flags & MNT_LOCKED)
goto dput_and_out;
+ retval = -EPERM;
+ if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
+ goto dput_and_out;
retval = do_umount(mnt, flags);
dput_and_out:
@@ -1569,17 +1575,13 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
static bool is_mnt_ns_file(struct dentry *dentry)
{
/* Is this a proxy for a mount namespace? */
- struct inode *inode = dentry->d_inode;
- struct proc_ns *ei;
-
- if (!proc_ns_inode(inode))
- return false;
-
- ei = get_proc_ns(inode);
- if (ei->ns_ops != &mntns_operations)
- return false;
+ return dentry->d_op == &ns_dentry_operations &&
+ dentry->d_fsdata == &mntns_operations;
+}
- return true;
+struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct mnt_namespace, ns);
}
static bool mnt_ns_loop(struct dentry *dentry)
@@ -1591,7 +1593,7 @@ static bool mnt_ns_loop(struct dentry *dentry)
if (!is_mnt_ns_file(dentry))
return false;
- mnt_ns = get_proc_ns(dentry->d_inode)->ns;
+ mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
}
@@ -1610,7 +1612,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
if (IS_ERR(q))
return q;
- q->mnt.mnt_flags &= ~MNT_LOCKED;
q->mnt_mountpoint = mnt->mnt_mountpoint;
p = mnt;
@@ -1686,6 +1687,33 @@ void drop_collected_mounts(struct vfsmount *mnt)
namespace_unlock();
}
+/**
+ * clone_private_mount - create a private clone of a path
+ *
+ * This creates a new vfsmount, which will be the clone of @path. The new will
+ * not be attached anywhere in the namespace and will be private (i.e. changes
+ * to the originating mount won't be propagated into this).
+ *
+ * Release with mntput().
+ */
+struct vfsmount *clone_private_mount(struct path *path)
+{
+ struct mount *old_mnt = real_mount(path->mnt);
+ struct mount *new_mnt;
+
+ if (IS_MNT_UNBINDABLE(old_mnt))
+ return ERR_PTR(-EINVAL);
+
+ down_read(&namespace_sem);
+ new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
+ up_read(&namespace_sem);
+ if (IS_ERR(new_mnt))
+ return ERR_CAST(new_mnt);
+
+ return &new_mnt->mnt;
+}
+EXPORT_SYMBOL_GPL(clone_private_mount);
+
int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
struct vfsmount *root)
{
@@ -1993,7 +2021,10 @@ static int do_loopback(struct path *path, const char *old_name,
if (IS_MNT_UNBINDABLE(old))
goto out2;
- if (!check_mnt(parent) || !check_mnt(old))
+ if (!check_mnt(parent))
+ goto out2;
+
+ if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
goto out2;
if (!recurse && has_locked_children(old, old_path.dentry))
@@ -2071,7 +2102,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
}
if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
!(mnt_flags & MNT_NODEV)) {
- return -EPERM;
+ /* Was the nodev implicitly added in mount? */
+ if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
+ !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
+ mnt_flags |= MNT_NODEV;
+ } else {
+ return -EPERM;
+ }
}
if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
!(mnt_flags & MNT_NOSUID)) {
@@ -2613,7 +2650,7 @@ dput_out:
static void free_mnt_ns(struct mnt_namespace *ns)
{
- proc_free_inum(ns->proc_inum);
+ ns_free_inum(&ns->ns);
put_user_ns(ns->user_ns);
kfree(ns);
}
@@ -2635,11 +2672,12 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
if (!new_ns)
return ERR_PTR(-ENOMEM);
- ret = proc_alloc_inum(&new_ns->proc_inum);
+ ret = ns_alloc_inum(&new_ns->ns);
if (ret) {
kfree(new_ns);
return ERR_PTR(ret);
}
+ new_ns->ns.ops = &mntns_operations;
new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
atomic_set(&new_ns->count, 1);
new_ns->root = NULL;
@@ -2931,6 +2969,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
/* mount new_root on / */
attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
touch_mnt_namespace(current->nsproxy->mnt_ns);
+ /* A moved mount should not expire automatically */
+ list_del_init(&new_mnt->mnt_expire);
unlock_mount_hash();
chroot_fs_refs(&root, &new);
put_mountpoint(root_mp);
@@ -2975,6 +3015,7 @@ static void __init init_mount_tree(void)
root.mnt = mnt;
root.dentry = mnt->mnt_root;
+ mnt->mnt_flags |= MNT_LOCKED;
set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);
@@ -3117,31 +3158,31 @@ found:
return visible;
}
-static void *mntns_get(struct task_struct *task)
+static struct ns_common *mntns_get(struct task_struct *task)
{
- struct mnt_namespace *ns = NULL;
+ struct ns_common *ns = NULL;
struct nsproxy *nsproxy;
task_lock(task);
nsproxy = task->nsproxy;
if (nsproxy) {
- ns = nsproxy->mnt_ns;
- get_mnt_ns(ns);
+ ns = &nsproxy->mnt_ns->ns;
+ get_mnt_ns(to_mnt_ns(ns));
}
task_unlock(task);
return ns;
}
-static void mntns_put(void *ns)
+static void mntns_put(struct ns_common *ns)
{
- put_mnt_ns(ns);
+ put_mnt_ns(to_mnt_ns(ns));
}
-static int mntns_install(struct nsproxy *nsproxy, void *ns)
+static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
struct fs_struct *fs = current->fs;
- struct mnt_namespace *mnt_ns = ns;
+ struct mnt_namespace *mnt_ns = to_mnt_ns(ns);
struct path root;
if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
@@ -3171,17 +3212,10 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns)
return 0;
}
-static unsigned int mntns_inum(void *ns)
-{
- struct mnt_namespace *mnt_ns = ns;
- return mnt_ns->proc_inum;
-}
-
const struct proc_ns_operations mntns_operations = {
.name = "mnt",
.type = CLONE_NEWNS,
.get = mntns_get,
.put = mntns_put,
.install = mntns_install,
- .inum = mntns_inum,
};
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 7cb751dfbeef..008960101520 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -198,8 +198,8 @@ ncp_single_volume(struct ncp_server *server)
static inline int ncp_is_server_root(struct inode *inode)
{
- return (!ncp_single_volume(NCP_SERVER(inode)) &&
- inode == inode->i_sb->s_root->d_inode);
+ return !ncp_single_volume(NCP_SERVER(inode)) &&
+ is_root_inode(inode);
}
@@ -403,7 +403,7 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
/* If a pointer is invalid, we search the dentry. */
spin_lock(&parent->d_lock);
- list_for_each_entry(dent, &parent->d_subdirs, d_u.d_child) {
+ list_for_each_entry(dent, &parent->d_subdirs, d_child) {
if ((unsigned long)dent->d_fsdata == fpos) {
if (dent->d_inode)
dget(dent);
@@ -685,8 +685,7 @@ static void
ncp_read_volume_list(struct file *file, struct dir_context *ctx,
struct ncp_cache_control *ctl)
{
- struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ncp_server *server = NCP_SERVER(inode);
struct ncp_volume_info info;
struct ncp_entry_info entry;
@@ -721,8 +720,7 @@ static void
ncp_do_readdir(struct file *file, struct dir_context *ctx,
struct ncp_cache_control *ctl)
{
- struct dentry *dentry = file->f_path.dentry;
- struct inode *dir = dentry->d_inode;
+ struct inode *dir = file_inode(file);
struct ncp_server *server = NCP_SERVER(dir);
struct nw_search_sequence seq;
struct ncp_entry_info entry;
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 77640a8bfb87..1dd7007f974d 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -100,8 +100,7 @@ out:
static ssize_t
ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
- struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file_inode(file);
size_t already_read = 0;
off_t pos;
size_t bufsize;
@@ -109,7 +108,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
void* freepage;
size_t freelen;
- ncp_dbg(1, "enter %pd2\n", dentry);
+ ncp_dbg(1, "enter %pD2\n", file);
pos = *ppos;
@@ -167,7 +166,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
file_accessed(file);
- ncp_dbg(1, "exit %pd2\n", dentry);
+ ncp_dbg(1, "exit %pD2\n", file);
outrel:
ncp_inode_close(inode);
return already_read ? already_read : error;
@@ -176,15 +175,14 @@ outrel:
static ssize_t
ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
- struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file_inode(file);
size_t already_written = 0;
off_t pos;
size_t bufsize;
int errno;
void* bouncebuffer;
- ncp_dbg(1, "enter %pd2\n", dentry);
+ ncp_dbg(1, "enter %pD2\n", file);
if ((ssize_t) count < 0)
return -EINVAL;
pos = *ppos;
@@ -263,7 +261,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
i_size_write(inode, pos);
mutex_unlock(&inode->i_mutex);
}
- ncp_dbg(1, "exit %pd2\n", dentry);
+ ncp_dbg(1, "exit %pD2\n", file);
outrel:
ncp_inode_close(inode);
return already_written ? already_written : errno;
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index d5659d96ee7f..cf7e043a9447 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -447,7 +447,6 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
result = -EIO;
}
}
- result = 0;
}
mutex_unlock(&server->root_setup_lock);
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index b359d12eb359..33b873b259a8 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -30,9 +30,7 @@
static int ncp_file_mmap_fault(struct vm_area_struct *area,
struct vm_fault *vmf)
{
- struct file *file = area->vm_file;
- struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file_inode(area->vm_file);
char *pg_addr;
unsigned int already_read;
unsigned int count;
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 52cb19d66ecb..b785f74bfe3c 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -191,7 +191,7 @@ ncp_renew_dentries(struct dentry *parent)
struct dentry *dentry;
spin_lock(&parent->d_lock);
- list_for_each_entry(dentry, &parent->d_subdirs, d_u.d_child) {
+ list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
if (dentry->d_fsdata == NULL)
ncp_age_dentry(server, dentry);
else
@@ -207,7 +207,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
struct dentry *dentry;
spin_lock(&parent->d_lock);
- list_for_each_entry(dentry, &parent->d_subdirs, d_u.d_child) {
+ list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
dentry->d_fsdata = NULL;
ncp_age_dentry(server, dentry);
}
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 5228f201d3d5..77fec6a55f57 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -378,7 +378,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
loff_t offset = header->args.offset;
size_t count = header->args.count;
struct page **pages = header->args.pages;
- int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+ int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
unsigned int pg_len;
struct blk_plug plug;
int i;
@@ -812,7 +812,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
/* Optimize common case that writes from 0 to end of file */
end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
- if (end != NFS_I(inode)->npages) {
+ if (end != inode->i_mapping->nrpages) {
rcu_read_lock();
end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
rcu_read_unlock();
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index e966c023b1b7..dbe5839cdeba 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -65,17 +65,18 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+ mutex_lock(&nn->bl_mutex);
bl_pipe_msg.bl_wq = &nn->bl_wq;
b->simple.len += 4; /* single volume */
if (b->simple.len > PAGE_SIZE)
- return -EIO;
+ goto out_unlock;
memset(msg, 0, sizeof(*msg));
msg->len = sizeof(*bl_msg) + b->simple.len;
msg->data = kzalloc(msg->len, gfp_mask);
if (!msg->data)
- goto out;
+ goto out_free_data;
bl_msg = msg->data;
bl_msg->type = BL_DEVICE_MOUNT,
@@ -87,7 +88,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
if (rc < 0) {
remove_wait_queue(&nn->bl_wq, &wq);
- goto out;
+ goto out_free_data;
}
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -97,19 +98,21 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
if (reply->status != BL_DEVICE_REQUEST_PROC) {
printk(KERN_WARNING "%s failed to decode device: %d\n",
__func__, reply->status);
- goto out;
+ goto out_free_data;
}
dev = MKDEV(reply->major, reply->minor);
-out:
+out_free_data:
kfree(msg->data);
+out_unlock:
+ mutex_unlock(&nn->bl_mutex);
return dev;
}
static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
size_t mlen)
{
- struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
+ struct nfs_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
nfs_net_id);
if (mlen != sizeof (struct bl_dev_msg))
@@ -232,6 +235,7 @@ static int nfs4blocklayout_net_init(struct net *net)
struct nfs_net *nn = net_generic(net, nfs_net_id);
struct dentry *dentry;
+ mutex_init(&nn->bl_mutex);
init_waitqueue_head(&nn->bl_wq);
nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
if (IS_ERR(nn->bl_device_pipe))
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 73466b934090..e36a9d78ea49 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -49,7 +49,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
goto out_iput;
res->size = i_size_read(inode);
res->change_attr = delegation->change_attr;
- if (nfsi->npages != 0)
+ if (nfsi->nrequests != 0)
res->change_attr++;
res->ctime = inode->i_ctime;
res->mtime = inode->i_mtime;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5853f53db732..7f3f60641344 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -125,6 +125,8 @@ again:
continue;
if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
continue;
+ if (!nfs4_valid_open_stateid(state))
+ continue;
if (!nfs4_stateid_match(&state->stateid, stateid))
continue;
get_nfs_open_context(ctx);
@@ -193,7 +195,11 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
{
int res = 0;
- res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync);
+ if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ res = nfs4_proc_delegreturn(inode,
+ delegation->cred,
+ &delegation->stateid,
+ issync);
nfs_free_delegation(delegation);
return res;
}
@@ -380,11 +386,13 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
{
struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
struct nfs_inode *nfsi = NFS_I(inode);
- int err;
+ int err = 0;
if (delegation == NULL)
return 0;
do {
+ if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ break;
err = nfs_delegation_claim_opens(inode, &delegation->stateid);
if (!issync || err != -EAGAIN)
break;
@@ -605,10 +613,23 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl
rcu_read_unlock();
}
+static void nfs_revoke_delegation(struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation != NULL) {
+ set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+ nfs_mark_return_delegation(NFS_SERVER(inode), delegation);
+ }
+ rcu_read_unlock();
+}
+
void nfs_remove_bad_delegation(struct inode *inode)
{
struct nfs_delegation *delegation;
+ nfs_revoke_delegation(inode);
delegation = nfs_inode_detach_delegation(inode);
if (delegation) {
nfs_inode_find_state_and_recover(inode, &delegation->stateid);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 5c1cce39297f..e3c20a3ccc93 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -31,6 +31,7 @@ enum {
NFS_DELEGATION_RETURN_IF_CLOSED,
NFS_DELEGATION_REFERENCED,
NFS_DELEGATION_RETURNING,
+ NFS_DELEGATION_REVOKED,
};
int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 06e8cfcbb670..9b0c55cb2a2e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -133,7 +133,7 @@ out:
static int
nfs_closedir(struct inode *inode, struct file *filp)
{
- put_nfs_open_dir_context(filp->f_path.dentry->d_inode, filp->private_data);
+ put_nfs_open_dir_context(file_inode(filp), filp->private_data);
return 0;
}
@@ -499,7 +499,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
if (IS_ERR(inode))
goto out;
- alias = d_materialise_unique(dentry, inode);
+ alias = d_splice_alias(inode, dentry);
if (IS_ERR(alias))
goto out;
else if (alias) {
@@ -1393,7 +1393,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
nfs_advise_use_readdirplus(dir);
no_entry:
- res = d_materialise_unique(dentry, inode);
+ res = d_splice_alias(inode, dentry);
if (res != NULL) {
if (IS_ERR(res))
goto out_unblock_sillyrename;
@@ -1527,6 +1527,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
case -ENOENT:
d_drop(dentry);
d_add(dentry, NULL);
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
break;
case -EISDIR:
case -ENOTDIR:
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 20cffc830468..10bf07280f4a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -266,6 +266,7 @@ static void nfs_direct_req_free(struct kref *kref)
{
struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
+ nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
if (dreq->l_ctx != NULL)
nfs_put_lock_context(dreq->l_ctx);
if (dreq->ctx != NULL)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 46fab1cb455a..7afb52f6a25a 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -145,9 +145,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (state == NULL)
- break;
- nfs_remove_bad_delegation(state->inode);
case -NFS4ERR_OPENMODE:
if (state == NULL)
break;
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 9bb806a76d99..bfecac781f19 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -204,8 +204,7 @@ destroy_ds(struct nfs4_pnfs_ds *ds)
ifdebug(FACILITY)
print_ds(ds);
- if (ds->ds_clp)
- nfs_put_client(ds->ds_clp);
+ nfs_put_client(ds->ds_clp);
while (!list_empty(&ds->ds_addrs)) {
da = list_first_entry(&ds->ds_addrs,
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 3ef01f0ba0bc..d63bea8bbfbb 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -269,8 +269,8 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp)
if (!fscache_maybe_release_page(cookie, page, gfp))
return 0;
- nfs_add_fscache_stats(page->mapping->host,
- NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+ nfs_inc_fscache_stats(page->mapping->host,
+ NFSIOS_FSCACHE_PAGES_UNCACHED);
}
return 1;
@@ -293,8 +293,8 @@ void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
BUG_ON(!PageLocked(page));
fscache_uncache_page(cookie, page);
- nfs_add_fscache_stats(page->mapping->host,
- NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+ nfs_inc_fscache_stats(page->mapping->host,
+ NFSIOS_FSCACHE_PAGES_UNCACHED);
}
/*
@@ -343,19 +343,19 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
case 0: /* read BIO submitted (page in fscache) */
dfprintk(FSCACHE,
"NFS: readpage_from_fscache: BIO submitted\n");
- nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1);
+ nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK);
return ret;
case -ENOBUFS: /* inode not in cache */
case -ENODATA: /* page not in cache */
- nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
+ nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
dfprintk(FSCACHE,
"NFS: readpage_from_fscache %d\n", ret);
return 1;
default:
dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret);
- nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
+ nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
}
return ret;
}
@@ -429,11 +429,11 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
if (ret != 0) {
fscache_uncache_page(nfs_i_fscache(inode), page);
- nfs_add_fscache_stats(inode,
- NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
- nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+ nfs_inc_fscache_stats(inode,
+ NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL);
+ nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED);
} else {
- nfs_add_fscache_stats(inode,
- NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1);
+ nfs_inc_fscache_stats(inode,
+ NFSIOS_FSCACHE_PAGES_WRITTEN_OK);
}
}
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 880618a8b048..9ac3846cb59e 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -51,14 +51,14 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
/*
* Ensure that this dentry is invisible to d_find_alias().
* Otherwise, it may be spliced into the tree by
- * d_materialise_unique if a parent directory from the same
+ * d_splice_alias if a parent directory from the same
* filesystem gets mounted at a later time.
* This again causes shrink_dcache_for_umount_subtree() to
* Oops, since the test for IS_ROOT() will fail.
*/
spin_lock(&sb->s_root->d_inode->i_lock);
spin_lock(&sb->s_root->d_lock);
- hlist_del_init(&sb->s_root->d_alias);
+ hlist_del_init(&sb->s_root->d_u.d_alias);
spin_unlock(&sb->s_root->d_lock);
spin_unlock(&sb->s_root->d_inode->i_lock);
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6388a59f2add..4bffe637ea32 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -192,6 +192,7 @@ void nfs_zap_caches(struct inode *inode)
nfs_zap_caches_locked(inode);
spin_unlock(&inode->i_lock);
}
+EXPORT_SYMBOL_GPL(nfs_zap_caches);
void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
{
@@ -626,7 +627,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
- int err;
+ int err = 0;
trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */
@@ -1149,7 +1150,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
&& i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
- && nfsi->npages == 0) {
+ && nfsi->nrequests == 0) {
i_size_write(inode, nfs_size_to_loff_t(fattr->size));
ret |= NFS_INO_INVALID_ATTR;
}
@@ -1192,7 +1193,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
cur_size = i_size_read(inode);
new_isize = nfs_size_to_loff_t(fattr->size);
- if (cur_size != new_isize && nfsi->npages == 0)
+ if (cur_size != new_isize && nfsi->nrequests == 0)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
}
@@ -1619,7 +1620,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (new_isize != cur_isize) {
/* Do we perhaps have any outstanding writes, or has
* the file grown beyond our last write? */
- if ((nfsi->npages == 0) || new_isize > cur_isize) {
+ if ((nfsi->nrequests == 0) || new_isize > cur_isize) {
i_size_write(inode, new_isize);
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
invalid &= ~NFS_INO_REVAL_PAGECACHE;
@@ -1784,7 +1785,7 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
INIT_LIST_HEAD(&nfsi->commit_info.list);
- nfsi->npages = 0;
+ nfsi->nrequests = 0;
nfsi->commit_info.ncommit = 0;
atomic_set(&nfsi->commit_info.rpcs_out, 0);
atomic_set(&nfsi->silly_count, 1);
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index c5832487c456..0cb806fbd4c4 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -55,6 +55,11 @@ static inline void nfs_add_fscache_stats(struct inode *inode,
{
this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
}
+static inline void nfs_inc_fscache_stats(struct inode *inode,
+ enum nfs_stat_fscachecounters stat)
+{
+ this_cpu_inc(NFS_SERVER(inode)->io_stats->fscache[stat]);
+}
#endif
static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index ef221fb8a183..f0e06e4acbef 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -19,6 +19,7 @@ struct nfs_net {
struct rpc_pipe *bl_device_pipe;
struct bl_dev_msg bl_mount_reply;
wait_queue_head_t bl_wq;
+ struct mutex bl_mutex;
struct list_head nfs_client_list;
struct list_head nfs_volume_list;
#if IS_ENABLED(CONFIG_NFS_V4)
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index d10333a197bf..7afb8947dfdf 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -6,6 +6,8 @@
#define __LINUX_FS_NFS_NFS4_2_H
/* nfs4.2proc.c */
+int nfs42_proc_allocate(struct file *, loff_t, loff_t);
+int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
/* nfs4.2xdr.h */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 0886f1db5917..cb170722769c 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -32,6 +32,81 @@ static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
return ret;
}
+static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
+ loff_t offset, loff_t len)
+{
+ struct inode *inode = file_inode(filep);
+ struct nfs42_falloc_args args = {
+ .falloc_fh = NFS_FH(inode),
+ .falloc_offset = offset,
+ .falloc_length = len,
+ };
+ struct nfs42_falloc_res res;
+ struct nfs_server *server = NFS_SERVER(inode);
+ int status;
+
+ msg->rpc_argp = &args;
+ msg->rpc_resp = &res;
+
+ status = nfs42_set_rw_stateid(&args.falloc_stateid, filep, FMODE_WRITE);
+ if (status)
+ return status;
+
+ return nfs4_call_sync(server->client, server, msg,
+ &args.seq_args, &res.seq_res, 0);
+}
+
+static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
+ loff_t offset, loff_t len)
+{
+ struct nfs_server *server = NFS_SERVER(file_inode(filep));
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = _nfs42_proc_fallocate(msg, filep, offset, len);
+ if (err == -ENOTSUPP)
+ return -EOPNOTSUPP;
+ err = nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+
+ return err;
+}
+
+int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE],
+ };
+ struct inode *inode = file_inode(filep);
+ int err;
+
+ if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
+ return -EOPNOTSUPP;
+
+ err = nfs42_proc_fallocate(&msg, filep, offset, len);
+ if (err == -EOPNOTSUPP)
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;
+ return err;
+}
+
+int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DEALLOCATE],
+ };
+ struct inode *inode = file_inode(filep);
+ int err;
+
+ if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
+ return -EOPNOTSUPP;
+
+ err = nfs42_proc_fallocate(&msg, filep, offset, len);
+ if (err == -EOPNOTSUPP)
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
+ return err;
+}
+
loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
{
struct inode *inode = file_inode(filep);
@@ -50,7 +125,7 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
struct nfs_server *server = NFS_SERVER(inode);
int status;
- if (!(server->caps & NFS_CAP_SEEK))
+ if (!nfs_server_capable(inode, NFS_CAP_SEEK))
return -ENOTSUPP;
status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ);
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index c90469b604b8..038a7e1521fa 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -4,6 +4,15 @@
#ifndef __LINUX_FS_NFS_NFS4_2XDR_H
#define __LINUX_FS_NFS_NFS4_2XDR_H
+#define encode_fallocate_maxsz (encode_stateid_maxsz + \
+ 2 /* offset */ + \
+ 2 /* length */)
+#define encode_allocate_maxsz (op_encode_hdr_maxsz + \
+ encode_fallocate_maxsz)
+#define decode_allocate_maxsz (op_decode_hdr_maxsz)
+#define encode_deallocate_maxsz (op_encode_hdr_maxsz + \
+ encode_fallocate_maxsz)
+#define decode_deallocate_maxsz (op_decode_hdr_maxsz)
#define encode_seek_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + \
2 /* offset */ + \
@@ -14,6 +23,18 @@
2 /* offset */ + \
2 /* length */)
+#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ encode_allocate_maxsz)
+#define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+ decode_allocate_maxsz)
+#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ encode_deallocate_maxsz)
+#define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+ decode_deallocate_maxsz)
#define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
encode_seek_maxsz)
@@ -22,6 +43,30 @@
decode_seek_maxsz)
+static void encode_fallocate(struct xdr_stream *xdr,
+ struct nfs42_falloc_args *args)
+{
+ encode_nfs4_stateid(xdr, &args->falloc_stateid);
+ encode_uint64(xdr, args->falloc_offset);
+ encode_uint64(xdr, args->falloc_length);
+}
+
+static void encode_allocate(struct xdr_stream *xdr,
+ struct nfs42_falloc_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_ALLOCATE, decode_allocate_maxsz, hdr);
+ encode_fallocate(xdr, args);
+}
+
+static void encode_deallocate(struct xdr_stream *xdr,
+ struct nfs42_falloc_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_DEALLOCATE, decode_deallocate_maxsz, hdr);
+ encode_fallocate(xdr, args);
+}
+
static void encode_seek(struct xdr_stream *xdr,
struct nfs42_seek_args *args,
struct compound_hdr *hdr)
@@ -33,6 +78,42 @@ static void encode_seek(struct xdr_stream *xdr,
}
/*
+ * Encode ALLOCATE request
+ */
+static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs42_falloc_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->falloc_fh, &hdr);
+ encode_allocate(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode DEALLOCATE request
+ */
+static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs42_falloc_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->falloc_fh, &hdr);
+ encode_deallocate(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
* Encode SEEK request
*/
static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
@@ -50,6 +131,16 @@ static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
encode_nops(&hdr);
}
+static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
+{
+ return decode_op_hdr(xdr, OP_ALLOCATE);
+}
+
+static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
+{
+ return decode_op_hdr(xdr, OP_DEALLOCATE);
+}
+
static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
{
int status;
@@ -73,6 +164,54 @@ out_overflow:
}
/*
+ * Decode ALLOCATE request
+ */
+static int nfs4_xdr_dec_allocate(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs42_falloc_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_allocate(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode DEALLOCATE request
+ */
+static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs42_falloc_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_deallocate(xdr, res);
+out:
+ return status;
+}
+
+/*
* Decode SEEK request
*/
static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index be6cac37ea10..a08178764cf9 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -226,6 +226,7 @@ int nfs4_replace_transport(struct nfs_server *server,
const struct nfs4_fs_locations *locations);
/* nfs4proc.c */
+extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *);
extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
struct rpc_message *, struct nfs4_sequence_args *,
struct nfs4_sequence_res *, int);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index ffdb28d86cf8..03311259b0c4 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -241,28 +241,25 @@ void nfs4_free_client(struct nfs_client *clp)
*/
static int nfs4_init_callback(struct nfs_client *clp)
{
+ struct rpc_xprt *xprt;
int error;
- if (clp->rpc_ops->version == 4) {
- struct rpc_xprt *xprt;
+ xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt);
- xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt);
-
- if (nfs4_has_session(clp)) {
- error = xprt_setup_backchannel(xprt,
- NFS41_BC_MIN_CALLBACKS);
- if (error < 0)
- return error;
- }
-
- error = nfs_callback_up(clp->cl_mvops->minor_version, xprt);
- if (error < 0) {
- dprintk("%s: failed to start callback. Error = %d\n",
- __func__, error);
+ if (nfs4_has_session(clp)) {
+ error = xprt_setup_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
+ if (error < 0)
return error;
- }
- __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
}
+
+ error = nfs_callback_up(clp->cl_mvops->minor_version, xprt);
+ if (error < 0) {
+ dprintk("%s: failed to start callback. Error = %d\n",
+ __func__, error);
+ return error;
+ }
+ __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
+
return 0;
}
@@ -498,8 +495,7 @@ int nfs40_walk_client_list(struct nfs_client *new,
atomic_inc(&pos->cl_count);
spin_unlock(&nn->nfs_client_lock);
- if (prev)
- nfs_put_client(prev);
+ nfs_put_client(prev);
prev = pos;
status = nfs_wait_client_init_complete(pos);
@@ -517,8 +513,7 @@ int nfs40_walk_client_list(struct nfs_client *new,
atomic_inc(&pos->cl_count);
spin_unlock(&nn->nfs_client_lock);
- if (prev)
- nfs_put_client(prev);
+ nfs_put_client(prev);
prev = pos;
status = nfs4_proc_setclientid_confirm(pos, &clid, cred);
@@ -549,8 +544,7 @@ int nfs40_walk_client_list(struct nfs_client *new,
/* No match found. The server lost our clientid */
out:
- if (prev)
- nfs_put_client(prev);
+ nfs_put_client(prev);
dprintk("NFS: <-- %s status = %d\n", __func__, status);
return status;
}
@@ -641,8 +635,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
atomic_inc(&pos->cl_count);
spin_unlock(&nn->nfs_client_lock);
- if (prev)
- nfs_put_client(prev);
+ nfs_put_client(prev);
prev = pos;
status = nfs_wait_client_init_complete(pos);
@@ -675,8 +668,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
/* No matching nfs_client found. */
spin_unlock(&nn->nfs_client_lock);
dprintk("NFS: <-- %s status = %d\n", __func__, status);
- if (prev)
- nfs_put_client(prev);
+ nfs_put_client(prev);
return status;
}
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index c51fb4db9bfe..8b46389c4c5b 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -3,6 +3,8 @@
*
* Copyright (C) 1992 Rick Sladkey
*/
+#include <linux/fs.h>
+#include <linux/falloc.h>
#include <linux/nfs_fs.h>
#include "internal.h"
#include "fscache.h"
@@ -134,6 +136,32 @@ static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
return nfs_file_llseek(filep, offset, whence);
}
}
+
+static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t len)
+{
+ struct inode *inode = file_inode(filep);
+ long ret;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if ((mode != 0) && (mode != (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)))
+ return -EOPNOTSUPP;
+
+ ret = inode_newsize_ok(inode, offset + len);
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&inode->i_mutex);
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ ret = nfs42_proc_deallocate(filep, offset, len);
+ else
+ ret = nfs42_proc_allocate(filep, offset, len);
+ mutex_unlock(&inode->i_mutex);
+
+ nfs_zap_caches(inode);
+ return ret;
+}
#endif /* CONFIG_NFS_V4_2 */
const struct file_operations nfs4_file_operations = {
@@ -155,6 +183,9 @@ const struct file_operations nfs4_file_operations = {
.flock = nfs_flock,
.splice_read = nfs_file_splice_read,
.splice_write = iter_file_splice_write,
+#ifdef CONFIG_NFS_V4_2
+ .fallocate = nfs42_fallocate,
+#endif /* CONFIG_NFS_V4_2 */
.check_flags = nfs_check_flags,
.setlease = simple_nosetlease,
};
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 405bd95c1f58..e7f8d5ff2581 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -158,8 +158,6 @@ static int nfs4_map_errors(int err)
return -EACCES;
case -NFS4ERR_MINOR_VERS_MISMATCH:
return -EPROTONOSUPPORT;
- case -NFS4ERR_ACCESS:
- return -EACCES;
case -NFS4ERR_FILE_OPEN:
return -EBUSY;
default:
@@ -344,7 +342,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
/* This is the error handling routine for processes that are allowed
* to sleep.
*/
-static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_state *state = exception->state;
@@ -370,11 +368,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) {
- nfs_remove_bad_delegation(inode);
- exception->retry = 1;
- break;
- }
if (state == NULL)
break;
ret = nfs4_schedule_stateid_recovery(server, state);
@@ -1654,7 +1647,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
nfs_inode_find_state_and_recover(state->inode,
stateid);
nfs4_schedule_stateid_recovery(server, state);
- return 0;
+ return -EAGAIN;
case -NFS4ERR_DELAY:
case -NFS4ERR_GRACE:
set_bit(NFS_DELEGATED_STATE, &state->flags);
@@ -2109,46 +2102,60 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
return ret;
}
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
+{
+ nfs_remove_bad_delegation(state->inode);
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+ write_sequnlock(&state->seqlock);
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+}
+
+static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
+{
+ if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
+ nfs_finish_clear_delegation_stateid(state);
+}
+
+static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ /* NFSv4.0 doesn't allow for delegation recovery on open expire */
+ nfs40_clear_delegation_stateid(state);
+ return nfs4_open_expired(sp, state);
+}
+
#if defined(CONFIG_NFS_V4_1)
-static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
+static void nfs41_check_delegation_stateid(struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(state->inode);
- nfs4_stateid *stateid = &state->stateid;
+ nfs4_stateid stateid;
struct nfs_delegation *delegation;
- struct rpc_cred *cred = NULL;
- int status = -NFS4ERR_BAD_STATEID;
-
- /* If a state reset has been done, test_stateid is unneeded */
- if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
- return;
+ struct rpc_cred *cred;
+ int status;
/* Get the delegation credential for use by test/free_stateid */
rcu_read_lock();
delegation = rcu_dereference(NFS_I(state->inode)->delegation);
- if (delegation != NULL &&
- nfs4_stateid_match(&delegation->stateid, stateid)) {
- cred = get_rpccred(delegation->cred);
- rcu_read_unlock();
- status = nfs41_test_stateid(server, stateid, cred);
- trace_nfs4_test_delegation_stateid(state, NULL, status);
- } else
+ if (delegation == NULL) {
rcu_read_unlock();
+ return;
+ }
+
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
+ cred = get_rpccred(delegation->cred);
+ rcu_read_unlock();
+ status = nfs41_test_stateid(server, &stateid, cred);
+ trace_nfs4_test_delegation_stateid(state, NULL, status);
if (status != NFS_OK) {
/* Free the stateid unless the server explicitly
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
- nfs41_free_stateid(server, stateid, cred);
- nfs_remove_bad_delegation(state->inode);
-
- write_seqlock(&state->seqlock);
- nfs4_stateid_copy(&state->stateid, &state->open_stateid);
- write_sequnlock(&state->seqlock);
- clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ nfs41_free_stateid(server, &stateid, cred);
+ nfs_finish_clear_delegation_stateid(state);
}
- if (cred != NULL)
- put_rpccred(cred);
+ put_rpccred(cred);
}
/**
@@ -2192,7 +2199,7 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
{
int status;
- nfs41_clear_delegation_stateid(state);
+ nfs41_check_delegation_stateid(state);
status = nfs41_check_open_stateid(state);
if (status != NFS_OK)
status = nfs4_open_expired(sp, state);
@@ -2231,19 +2238,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
ret = _nfs4_proc_open(opendata);
- if (ret != 0) {
- if (ret == -ENOENT) {
- dentry = opendata->dentry;
- if (dentry->d_inode)
- d_delete(dentry);
- else if (d_unhashed(dentry))
- d_add(dentry, NULL);
-
- nfs_set_verifier(dentry,
- nfs_save_change_attribute(opendata->dir->d_inode));
- }
+ if (ret != 0)
goto out;
- }
state = nfs4_opendata_to_nfs4_state(opendata);
ret = PTR_ERR(state);
@@ -4841,9 +4837,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (state == NULL)
- break;
- nfs_remove_bad_delegation(state->inode);
case -NFS4ERR_OPENMODE:
if (state == NULL)
break;
@@ -7709,6 +7702,9 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
dprintk("--> %s\n", __func__);
+ /* nfs4_layoutget_release calls pnfs_put_layout_hdr */
+ pnfs_get_layout_hdr(NFS_I(inode)->layout);
+
lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
if (!lgp->args.layout.pages) {
nfs4_layoutget_release(lgp);
@@ -7721,9 +7717,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
lgp->res.seq_res.sr_slot = NULL;
nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
- /* nfs4_layoutget_release calls pnfs_put_layout_hdr */
- pnfs_get_layout_hdr(NFS_I(inode)->layout);
-
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return ERR_CAST(task);
@@ -8341,7 +8334,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
.state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
- .recover_open = nfs4_open_expired,
+ .recover_open = nfs40_open_expired,
.recover_lock = nfs4_lock_expired,
.establish_clid = nfs4_init_clientid,
};
@@ -8408,8 +8401,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
| NFS_CAP_CHANGE_ATTR
| NFS_CAP_POSIX_LOCK
| NFS_CAP_STATEID_NFSV41
- | NFS_CAP_ATOMIC_OPEN_V1
- | NFS_CAP_SEEK,
+ | NFS_CAP_ATOMIC_OPEN_V1,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
@@ -8431,7 +8423,10 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_CHANGE_ATTR
| NFS_CAP_POSIX_LOCK
| NFS_CAP_STATEID_NFSV41
- | NFS_CAP_ATOMIC_OPEN_V1,
+ | NFS_CAP_ATOMIC_OPEN_V1
+ | NFS_CAP_ALLOCATE
+ | NFS_CAP_DEALLOCATE
+ | NFS_CAP_SEEK,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 206c08a60c7f..cb4376b78ed9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -141,13 +141,15 @@ static int nfs4_stat_to_errno(int);
XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \
1 /* sc_prog */ + \
- XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
- XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
+ 1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
+ 1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
1) /* sc_cb_ident */
#define decode_setclientid_maxsz \
(op_decode_hdr_maxsz + \
- 2 + \
- 1024) /* large value for CLID_INUSE */
+ 2 /* clientid */ + \
+ XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
+ 1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
+ 1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN))
#define encode_setclientid_confirm_maxsz \
(op_encode_hdr_maxsz + \
3 + (NFS4_VERIFIER_SIZE >> 2))
@@ -7394,6 +7396,8 @@ struct rpc_procinfo nfs4_procedures[] = {
#endif /* CONFIG_NFS_V4_1 */
#ifdef CONFIG_NFS_V4_2
PROC(SEEK, enc_seek, dec_seek),
+ PROC(ALLOCATE, enc_allocate, dec_allocate),
+ PROC(DEALLOCATE, enc_deallocate, dec_deallocate),
#endif /* CONFIG_NFS_V4_2 */
};
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ed0db61f8543..2b5e769beb16 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -258,6 +258,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
static inline void
nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
{
+ struct inode *inode;
WARN_ON_ONCE(prev == req);
if (!prev) {
@@ -276,12 +277,16 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
* nfs_page_group_destroy is called */
kref_get(&req->wb_head->wb_kref);
- /* grab extra ref if head request has extra ref from
- * the write/commit path to handle handoff between write
- * and commit lists */
+ /* grab extra ref and bump the request count if head request
+ * has extra ref from the write/commit path to handle handoff
+ * between write and commit lists. */
if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) {
+ inode = page_file_mapping(req->wb_page)->host;
set_bit(PG_INODE_REF, &req->wb_flags);
kref_get(&req->wb_kref);
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->nrequests++;
+ spin_unlock(&inode->i_lock);
}
}
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index beff2769c5c5..c91a4799c562 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -269,7 +269,7 @@ int nfs_readpage(struct file *file, struct page *page)
dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
page, PAGE_CACHE_SIZE, page_file_index(page));
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
- nfs_add_stats(inode, NFSIOS_READPAGES, 1);
+ nfs_inc_stats(inode, NFSIOS_READPAGES);
/*
* Try to flush any pending writes to the file..
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 12493846a2d3..af3af685a9e3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -575,7 +575,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
int ret;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
- nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
+ nfs_inc_stats(inode, NFSIOS_WRITEPAGES);
nfs_pageio_cond_complete(pgio, page_file_index(page));
ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
@@ -670,7 +670,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
nfs_lock_request(req);
spin_lock(&inode->i_lock);
- if (!nfsi->npages && NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
+ if (!nfsi->nrequests &&
+ NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
inode->i_version++;
/*
* Swap-space should not get truncated. Hence no need to plug the race
@@ -681,9 +682,11 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
SetPagePrivate(req->wb_page);
set_page_private(req->wb_page, (unsigned long)req);
}
- nfsi->npages++;
+ nfsi->nrequests++;
/* this a head request for a page group - mark it as having an
- * extra reference so sub groups can follow suit */
+ * extra reference so sub groups can follow suit.
+ * This flag also informs pgio layer when to bump nrequests when
+ * adding subrequests. */
WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags));
kref_get(&req->wb_kref);
spin_unlock(&inode->i_lock);
@@ -709,14 +712,16 @@ static void nfs_inode_remove_request(struct nfs_page *req)
wake_up_page(head->wb_page, PG_private);
clear_bit(PG_MAPPED, &head->wb_flags);
}
- nfsi->npages--;
+ nfsi->nrequests--;
+ spin_unlock(&inode->i_lock);
+ } else {
+ spin_lock(&inode->i_lock);
+ nfsi->nrequests--;
spin_unlock(&inode->i_lock);
}
if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
nfs_release_request(req);
- else
- WARN_ON_ONCE(1);
}
static void
@@ -1737,7 +1742,7 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
/* Don't commit yet if this is a non-blocking flush and there
* are a lot of outstanding writes for this mapping.
*/
- if (nfsi->commit_info.ncommit <= (nfsi->npages >> 1))
+ if (nfsi->commit_info.ncommit <= (nfsi->nrequests >> 1))
goto out_mark_dirty;
/* don't wait for the COMMIT response */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index ed2b1151b171..7cbdf1b2e4ab 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -774,8 +774,12 @@ static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
{
if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
- dprintk("%s slot is busy\n", __func__);
- return false;
+ /* Race breaker */
+ if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
+ dprintk("%s slot is busy\n", __func__);
+ return false;
+ }
+ rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
}
return true;
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index cdeb3cfd6f32..ac71d13c69ef 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -33,6 +33,7 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/file.h>
+#include <linux/falloc.h>
#include <linux/slab.h>
#include "idmap.h"
@@ -772,7 +773,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* the client wants us to do more in this compound:
*/
if (!nfsd4_last_compound_op(rqstp))
- rqstp->rq_splice_ok = false;
+ clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
/* check stateid */
if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
@@ -1014,6 +1015,44 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
static __be32
+nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_fallocate *fallocate, int flags)
+{
+ __be32 status = nfserr_notsupp;
+ struct file *file;
+
+ status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
+ &fallocate->falloc_stateid,
+ WR_STATE, &file);
+ if (status != nfs_ok) {
+ dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n");
+ return status;
+ }
+
+ status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file,
+ fallocate->falloc_offset,
+ fallocate->falloc_length,
+ flags);
+ fput(file);
+ return status;
+}
+
+static __be32
+nfsd4_allocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_fallocate *fallocate)
+{
+ return nfsd4_fallocate(rqstp, cstate, fallocate, 0);
+}
+
+static __be32
+nfsd4_deallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_fallocate *fallocate)
+{
+ return nfsd4_fallocate(rqstp, cstate, fallocate,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE);
+}
+
+static __be32
nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_seek *seek)
{
@@ -1272,7 +1311,8 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp)
*/
if (argp->opcnt == resp->opcnt)
return false;
-
+ if (next->opnum == OP_ILLEGAL)
+ return false;
nextd = OPDESC(next);
/*
* Rest of 2.6.3.1.1: certain operations will return WRONGSEC
@@ -1330,7 +1370,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
* Don't use the deferral mechanism for NFSv4; compounds make it
* too hard to avoid non-idempotency problems.
*/
- rqstp->rq_usedeferral = false;
+ clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
/*
* According to RFC3010, this takes precedence over all other errors.
@@ -1446,7 +1486,7 @@ encode_op:
BUG_ON(cstate->replay_owner);
out:
/* Reset deferral mechanism for RPC deferrals */
- rqstp->rq_usedeferral = true;
+ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
dprintk("nfsv4 compound returned %d\n", ntohl(status));
return status;
}
@@ -1589,7 +1629,8 @@ static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op
static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
struct nfsd4_op *op)
{
- return NFS4_MAX_SESSIONID_LEN + 20;
+ return (op_encode_hdr_size
+ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32);
}
static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
@@ -1893,6 +1934,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_func = (nfsd4op_func)nfsd4_sequence,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
.op_name = "OP_SEQUENCE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_sequence_rsize,
},
[OP_DESTROY_CLIENTID] = {
.op_func = (nfsd4op_func)nfsd4_destroy_clientid,
@@ -1926,6 +1968,18 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
/* NFSv4.2 operations */
+ [OP_ALLOCATE] = {
+ .op_func = (nfsd4op_func)nfsd4_allocate,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_ALLOCATE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
+ },
+ [OP_DEALLOCATE] = {
+ .op_func = (nfsd4op_func)nfsd4_deallocate,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_DEALLOCATE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
+ },
[OP_SEEK] = {
.op_func = (nfsd4op_func)nfsd4_seek,
.op_name = "OP_SEEK",
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index a25490ae6c62..cc6a76072009 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -245,10 +245,11 @@ struct nfs4_dir_ctx {
};
static int
-nfsd4_build_namelist(void *arg, const char *name, int namlen,
+nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
- struct nfs4_dir_ctx *ctx = arg;
+ struct nfs4_dir_ctx *ctx =
+ container_of(__ctx, struct nfs4_dir_ctx, ctx);
struct name_list *entry;
if (namlen != HEXDIR_LEN - 1)
@@ -704,7 +705,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
struct cld_upcall *tmp, *cup;
struct cld_msg __user *cmsg = (struct cld_msg __user *)src;
uint32_t xid;
- struct nfsd_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
+ struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
nfsd_net_id);
struct cld_net *cn = nn->cld_net;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e9c3afe4b5d3..3550a9c87616 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -41,7 +41,7 @@
#include <linux/ratelimit.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/addr.h>
-#include <linux/hash.h>
+#include <linux/jhash.h>
#include "xdr4.h"
#include "xdr4cb.h"
#include "vfs.h"
@@ -275,9 +275,11 @@ opaque_hashval(const void *ptr, int nbytes)
return x;
}
-static void nfsd4_free_file(struct nfs4_file *f)
+static void nfsd4_free_file_rcu(struct rcu_head *rcu)
{
- kmem_cache_free(file_slab, f);
+ struct nfs4_file *fp = container_of(rcu, struct nfs4_file, fi_rcu);
+
+ kmem_cache_free(file_slab, fp);
}
static inline void
@@ -286,9 +288,10 @@ put_nfs4_file(struct nfs4_file *fi)
might_lock(&state_lock);
if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) {
- hlist_del(&fi->fi_hash);
+ hlist_del_rcu(&fi->fi_hash);
spin_unlock(&state_lock);
- nfsd4_free_file(fi);
+ WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
+ call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
}
}
@@ -594,7 +597,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
}
spin_unlock(&blocked_delegations_lock);
}
- hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
+ hash = jhash(&fh->fh_base, fh->fh_size, 0);
if (test_bit(hash&255, bd->set[0]) &&
test_bit((hash>>8)&255, bd->set[0]) &&
test_bit((hash>>16)&255, bd->set[0]))
@@ -613,7 +616,7 @@ static void block_delegations(struct knfsd_fh *fh)
u32 hash;
struct bloom_pair *bd = &blocked_delegations;
- hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
+ hash = jhash(&fh->fh_base, fh->fh_size, 0);
spin_lock(&blocked_delegations_lock);
__set_bit(hash&255, bd->set[bd->new]);
@@ -1440,7 +1443,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
list_add(&new->se_perclnt, &clp->cl_sessions);
spin_unlock(&clp->cl_lock);
- if (cses->flags & SESSION4_BACK_CHAN) {
+ {
struct sockaddr *sa = svc_addr(rqstp);
/*
* This is a little silly; with sessions there's no real
@@ -1711,15 +1714,14 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)
return 0;
}
-static long long
+static int
compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2)
{
- long long res;
-
- res = o1->len - o2->len;
- if (res)
- return res;
- return (long long)memcmp(o1->data, o2->data, o1->len);
+ if (o1->len < o2->len)
+ return -1;
+ if (o1->len > o2->len)
+ return 1;
+ return memcmp(o1->data, o2->data, o1->len);
}
static int same_name(const char *n1, const char *n2)
@@ -1907,7 +1909,7 @@ add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root)
static struct nfs4_client *
find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root)
{
- long long cmp;
+ int cmp;
struct rb_node *node = root->rb_node;
struct nfs4_client *clp;
@@ -3057,10 +3059,9 @@ static struct nfs4_file *nfsd4_alloc_file(void)
}
/* OPEN Share state helper functions */
-static void nfsd4_init_file(struct nfs4_file *fp, struct knfsd_fh *fh)
+static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
+ struct nfs4_file *fp)
{
- unsigned int hashval = file_hashval(fh);
-
lockdep_assert_held(&state_lock);
atomic_set(&fp->fi_ref, 1);
@@ -3073,7 +3074,7 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct knfsd_fh *fh)
fp->fi_share_deny = 0;
memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
memset(fp->fi_access, 0, sizeof(fp->fi_access));
- hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]);
+ hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
}
void
@@ -3294,17 +3295,14 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
/* search file_hashtbl[] for file */
static struct nfs4_file *
-find_file_locked(struct knfsd_fh *fh)
+find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
{
- unsigned int hashval = file_hashval(fh);
struct nfs4_file *fp;
- lockdep_assert_held(&state_lock);
-
- hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
+ hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) {
if (nfsd_fh_match(&fp->fi_fhandle, fh)) {
- get_nfs4_file(fp);
- return fp;
+ if (atomic_inc_not_zero(&fp->fi_ref))
+ return fp;
}
}
return NULL;
@@ -3314,10 +3312,11 @@ static struct nfs4_file *
find_file(struct knfsd_fh *fh)
{
struct nfs4_file *fp;
+ unsigned int hashval = file_hashval(fh);
- spin_lock(&state_lock);
- fp = find_file_locked(fh);
- spin_unlock(&state_lock);
+ rcu_read_lock();
+ fp = find_file_locked(fh, hashval);
+ rcu_read_unlock();
return fp;
}
@@ -3325,11 +3324,18 @@ static struct nfs4_file *
find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh)
{
struct nfs4_file *fp;
+ unsigned int hashval = file_hashval(fh);
+
+ rcu_read_lock();
+ fp = find_file_locked(fh, hashval);
+ rcu_read_unlock();
+ if (fp)
+ return fp;
spin_lock(&state_lock);
- fp = find_file_locked(fh);
- if (fp == NULL) {
- nfsd4_init_file(new, fh);
+ fp = find_file_locked(fh, hashval);
+ if (likely(fp == NULL)) {
+ nfsd4_init_file(fh, hashval, new);
fp = new;
}
spin_unlock(&state_lock);
@@ -4127,7 +4133,7 @@ void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
nfs4_put_stateowner(so);
}
if (open->op_file)
- nfsd4_free_file(open->op_file);
+ kmem_cache_free(file_slab, open->op_file);
if (open->op_stp)
nfs4_put_stid(&open->op_stp->st_stid);
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index eeea7a90eb87..15f7b73e0c0f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1514,6 +1514,23 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
}
static __be32
+nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
+ struct nfsd4_fallocate *fallocate)
+{
+ DECODE_HEAD;
+
+ status = nfsd4_decode_stateid(argp, &fallocate->falloc_stateid);
+ if (status)
+ return status;
+
+ READ_BUF(16);
+ p = xdr_decode_hyper(p, &fallocate->falloc_offset);
+ xdr_decode_hyper(p, &fallocate->falloc_length);
+
+ DECODE_TAIL;
+}
+
+static __be32
nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
{
DECODE_HEAD;
@@ -1604,10 +1621,10 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
/* new operations for NFSv4.2 */
- [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate,
[OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate,
[OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -1714,7 +1731,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
- argp->rqstp->rq_splice_ok = false;
+ clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
DECODE_TAIL;
}
@@ -1795,9 +1812,12 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep,
}
else
end++;
+ if (found_esc)
+ end = next;
+
str = end;
}
- pathlen = htonl(xdr->buf->len - pathlen_offset);
+ pathlen = htonl(count);
write_bytes_to_xdr_buf(xdr->buf, pathlen_offset, &pathlen, 4);
return 0;
}
@@ -1886,7 +1906,7 @@ static __be32 nfsd4_encode_path(struct xdr_stream *xdr,
goto out_free;
}
p = xdr_encode_opaque(p, dentry->d_name.name, len);
- dprintk("/%s", dentry->d_name.name);
+ dprintk("/%pd", dentry);
spin_unlock(&dentry->d_lock);
dput(dentry);
ncomponents--;
@@ -3236,10 +3256,10 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */
if (!p) {
- WARN_ON_ONCE(resp->rqstp->rq_splice_ok);
+ WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags));
return nfserr_resource;
}
- if (resp->xdr.buf->page_len && resp->rqstp->rq_splice_ok) {
+ if (resp->xdr.buf->page_len && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) {
WARN_ON_ONCE(1);
return nfserr_resource;
}
@@ -3256,7 +3276,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
goto err_truncate;
}
- if (file->f_op->splice_read && resp->rqstp->rq_splice_ok)
+ if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
err = nfsd4_encode_splice_read(resp, read, file, maxcount);
else
err = nfsd4_encode_readv(resp, read, file, maxcount);
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 122f69185ef5..83a9694ec485 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -490,7 +490,7 @@ found_entry:
/* From the hall of fame of impractical attacks:
* Is this a user who tries to snoop on the cache? */
rtn = RC_DOIT;
- if (!rqstp->rq_secure && rp->c_secure)
+ if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && rp->c_secure)
goto out;
/* Compose RPC reply header */
@@ -579,7 +579,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
spin_lock(&b->cache_lock);
drc_mem_usage += bufsize;
lru_put_end(b, rp);
- rp->c_secure = rqstp->rq_secure;
+ rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags);
rp->c_type = cachetype;
rp->c_state = RC_DONE;
spin_unlock(&b->cache_lock);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index ca73ca79a0ee..19ace74d35f6 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -231,6 +231,10 @@ static struct file_operations reply_cache_stats_operations = {
* payload - write methods
*/
+static inline struct net *netns(struct file *file)
+{
+ return file_inode(file)->i_sb->s_fs_info;
+}
/**
* write_unlock_ip - Release all locks used by a client
@@ -252,7 +256,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
struct sockaddr *sap = (struct sockaddr *)&address;
size_t salen = sizeof(address);
char *fo_path;
- struct net *net = file->f_dentry->d_sb->s_fs_info;
+ struct net *net = netns(file);
/* sanity check */
if (size == 0)
@@ -350,7 +354,6 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
int len;
struct auth_domain *dom;
struct knfsd_fh fh;
- struct net *net = file->f_dentry->d_sb->s_fs_info;
if (size == 0)
return -EINVAL;
@@ -385,7 +388,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
if (!dom)
return -ENOMEM;
- len = exp_rootfh(net, dom, path, &fh, maxsize);
+ len = exp_rootfh(netns(file), dom, path, &fh, maxsize);
auth_domain_put(dom);
if (len)
return len;
@@ -429,7 +432,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
{
char *mesg = buf;
int rv;
- struct net *net = file->f_dentry->d_sb->s_fs_info;
+ struct net *net = netns(file);
if (size > 0) {
int newthreads;
@@ -480,7 +483,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
int len;
int npools;
int *nthreads;
- struct net *net = file->f_dentry->d_sb->s_fs_info;
+ struct net *net = netns(file);
mutex_lock(&nfsd_mutex);
npools = nfsd_nrpools(net);
@@ -543,8 +546,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
unsigned minor;
ssize_t tlen = 0;
char *sep;
- struct net *net = file->f_dentry->d_sb->s_fs_info;
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
if (size>0) {
if (nn->nfsd_serv)
@@ -606,7 +608,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
num);
sep = " ";
- if (len > remaining)
+ if (len >= remaining)
break;
remaining -= len;
buf += len;
@@ -621,7 +623,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
'+' : '-',
minor);
- if (len > remaining)
+ if (len >= remaining)
break;
remaining -= len;
buf += len;
@@ -629,7 +631,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
}
len = snprintf(buf, remaining, "\n");
- if (len > remaining)
+ if (len >= remaining)
return -EINVAL;
return tlen + len;
}
@@ -830,10 +832,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size,
static ssize_t write_ports(struct file *file, char *buf, size_t size)
{
ssize_t rv;
- struct net *net = file->f_dentry->d_sb->s_fs_info;
mutex_lock(&nfsd_mutex);
- rv = __write_ports(file, buf, size, net);
+ rv = __write_ports(file, buf, size, netns(file));
mutex_unlock(&nfsd_mutex);
return rv;
}
@@ -865,8 +866,7 @@ int nfsd_max_blksize;
static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
{
char *mesg = buf;
- struct net *net = file->f_dentry->d_sb->s_fs_info;
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
if (size > 0) {
int bsize;
@@ -915,8 +915,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
{
char *mesg = buf;
- struct net *net = file->f_dentry->d_sb->s_fs_info;
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
unsigned int maxconn = nn->max_connections;
if (size > 0) {
@@ -997,8 +996,7 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size,
*/
static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
{
- struct net *net = file->f_dentry->d_sb->s_fs_info;
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn);
}
@@ -1014,8 +1012,7 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
*/
static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
{
- struct net *net = file->f_dentry->d_sb->s_fs_info;
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn);
}
@@ -1071,8 +1068,7 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
{
ssize_t rv;
- struct net *net = file->f_dentry->d_sb->s_fs_info;
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
mutex_lock(&nfsd_mutex);
rv = __write_recoverydir(file, buf, size, nn);
@@ -1102,8 +1098,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
*/
static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
{
- struct net *net = file->f_dentry->d_sb->s_fs_info;
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
if (size > 0) {
switch(buf[0]) {
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 747f3b95bd11..33a46a8dfaf7 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -335,12 +335,15 @@ void nfsd_lockd_shutdown(void);
(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
-#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
- (NFSD4_1_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SECURITY_LABEL)
+#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL
#else
-#define NFSD4_2_SUPPORTED_ATTRS_WORD2 0
+#define NFSD4_2_SECURITY_ATTRS 0
#endif
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
+ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \
+ NFSD4_2_SECURITY_ATTRS)
+
static inline u32 nfsd_suppattrs0(u32 minorversion)
{
return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 88026fc6a981..965b478d50fc 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -86,7 +86,7 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
int flags = nfsexp_flags(rqstp, exp);
/* Check if the request originated from a secure port. */
- if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) {
+ if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && !(flags & NFSEXP_INSECURE_PORT)) {
RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
dprintk("nfsd: request from insecure port %s!\n",
svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 752d56bbe0ba..314f5c8f8f1a 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -692,7 +692,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
/* Now call the procedure handler, and encode NFS status. */
nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
nfserr = map_new_errors(rqstp->rq_vers, nfserr);
- if (nfserr == nfserr_dropit || rqstp->rq_dropme) {
+ if (nfserr == nfserr_dropit || test_bit(RQ_DROPME, &rqstp->rq_flags)) {
dprintk("nfsd: Dropping request; may be revisited later\n");
nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
return 0;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 2712042a66b1..9d3be371240a 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -463,17 +463,24 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
/*
* nfs4_file: a file opened by some number of (open) nfs4_stateowners.
*
- * These objects are global. nfsd only keeps one instance of a nfs4_file per
- * inode (though it may keep multiple file descriptors open per inode). These
- * are tracked in the file_hashtbl which is protected by the state_lock
- * spinlock.
+ * These objects are global. nfsd keeps one instance of a nfs4_file per
+ * filehandle (though it may keep multiple file descriptors for each). Each
+ * inode can have multiple filehandles associated with it, so there is
+ * (potentially) a many to one relationship between this struct and struct
+ * inode.
+ *
+ * These are hashed by filehandle in the file_hashtbl, which is protected by
+ * the global state_lock spinlock.
*/
struct nfs4_file {
atomic_t fi_ref;
spinlock_t fi_lock;
- struct hlist_node fi_hash; /* hash by "struct inode *" */
+ struct hlist_node fi_hash; /* hash on fi_fhandle */
struct list_head fi_stateids;
- struct list_head fi_delegations;
+ union {
+ struct list_head fi_delegations;
+ struct rcu_head fi_rcu;
+ };
/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
struct file * fi_fds[3];
/*
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 989129e2d6ea..5685c679dd93 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -16,6 +16,7 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/splice.h>
+#include <linux/falloc.h>
#include <linux/fcntl.h>
#include <linux/namei.h>
#include <linux/delay.h>
@@ -533,6 +534,26 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
#endif
+__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file *file, loff_t offset, loff_t len,
+ int flags)
+{
+ __be32 err;
+ int error;
+
+ if (!S_ISREG(file_inode(file)->i_mode))
+ return nfserr_inval;
+
+ err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, NFSD_MAY_WRITE);
+ if (err)
+ return err;
+
+ error = vfs_fallocate(file, flags, offset, len);
+ if (!error)
+ error = commit_metadata(fhp);
+
+ return nfserrno(error);
+}
#endif /* defined(CONFIG_NFSD_V4) */
#ifdef CONFIG_NFSD_V3
@@ -881,7 +902,7 @@ static __be32
nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file,
loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
{
- if (file->f_op->splice_read && rqstp->rq_splice_ok)
+ if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
return nfsd_splice_read(rqstp, file, offset, count);
else
return nfsd_readv(file, offset, vec, vlen, count);
@@ -930,7 +951,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
unsigned long *cnt, int *stablep)
{
struct svc_export *exp;
- struct dentry *dentry;
struct inode *inode;
mm_segment_t oldfs;
__be32 err = 0;
@@ -938,9 +958,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
int stable = *stablep;
int use_wgather;
loff_t pos = offset;
+ loff_t end = LLONG_MAX;
unsigned int pflags = current->flags;
- if (rqstp->rq_local)
+ if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
/*
* We want less throttling in balance_dirty_pages()
* and shrink_inactive_list() so that nfs to
@@ -949,8 +970,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
*/
current->flags |= PF_LESS_THROTTLE;
- dentry = file->f_path.dentry;
- inode = dentry->d_inode;
+ inode = file_inode(file);
exp = fhp->fh_export;
use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
@@ -969,10 +989,13 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
fsnotify_modify(file);
if (stable) {
- if (use_wgather)
+ if (use_wgather) {
host_err = wait_for_concurrent_writes(file);
- else
- host_err = vfs_fsync_range(file, offset, offset+*cnt, 0);
+ } else {
+ if (*cnt)
+ end = offset + *cnt - 1;
+ host_err = vfs_fsync_range(file, offset, end, 0);
+ }
}
out_nfserr:
@@ -981,7 +1004,7 @@ out_nfserr:
err = 0;
else
err = nfserrno(host_err);
- if (rqstp->rq_local)
+ if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
tsk_restore_flags(current, pflags, PF_LESS_THROTTLE);
return err;
}
@@ -1819,10 +1842,12 @@ struct readdir_data {
int full;
};
-static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
- loff_t offset, u64 ino, unsigned int d_type)
+static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
+ int namlen, loff_t offset, u64 ino,
+ unsigned int d_type)
{
- struct readdir_data *buf = __buf;
+ struct readdir_data *buf =
+ container_of(ctx, struct readdir_data, ctx);
struct buffered_dirent *de = (void *)(buf->dirent + buf->used);
unsigned int reclen;
@@ -1842,7 +1867,7 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
return 0;
}
-static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
+static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
struct readdir_cd *cdp, loff_t *offsetp)
{
struct buffered_dirent *de;
@@ -1926,7 +1951,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
*/
__be32
nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
- struct readdir_cd *cdp, filldir_t func)
+ struct readdir_cd *cdp, nfsd_filldir_t func)
{
__be32 err;
struct file *file;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index c2ff3f14e5f6..2050cb016998 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -36,7 +36,7 @@
/*
* Callback function for readdir
*/
-typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
+typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
/* nfsd/vfs.c */
int nfsd_racache_init(int);
@@ -54,6 +54,8 @@ int nfsd_mountpoint(struct dentry *, struct svc_export *);
#ifdef CONFIG_NFSD_V4
__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
struct xdr_netobj *);
+__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
+ struct file *, loff_t, loff_t, int);
#endif /* CONFIG_NFSD_V4 */
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
@@ -95,7 +97,7 @@ __be32 nfsd_rename(struct svc_rqst *,
__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
char *name, int len);
__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
- loff_t *, struct readdir_cd *, filldir_t);
+ loff_t *, struct readdir_cd *, nfsd_filldir_t);
__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
struct kstatfs *, int access);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 5720e9457f33..90a5925bd6ab 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,13 @@ struct nfsd4_reclaim_complete {
u32 rca_one_fs;
};
+struct nfsd4_fallocate {
+ /* request */
+ stateid_t falloc_stateid;
+ loff_t falloc_offset;
+ u64 falloc_length;
+};
+
struct nfsd4_seek {
/* request */
stateid_t seek_stateid;
@@ -486,6 +493,8 @@ struct nfsd4_op {
struct nfsd4_free_stateid free_stateid;
/* NFSv4.2 */
+ struct nfsd4_fallocate allocate;
+ struct nfsd4_fallocate deallocate;
struct nfsd4_seek seek;
} u;
struct nfs4_replay * replay;
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index e9e3325f29f3..3a03e0aea1fb 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -39,21 +39,15 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
struct the_nilfs *nilfs;
struct inode *inode = file->f_mapping->host;
- int err;
-
- err = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (err)
- return err;
- mutex_lock(&inode->i_mutex);
+ int err = 0;
if (nilfs_inode_dirty(inode)) {
if (datasync)
err = nilfs_construct_dsync_segment(inode->i_sb, inode,
- 0, LLONG_MAX);
+ start, end);
else
err = nilfs_construct_segment(inode->i_sb);
}
- mutex_unlock(&inode->i_mutex);
nilfs = inode->i_sb->s_fs_info;
if (!err)
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index e1fa69b341b9..8b5969538f39 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -49,6 +49,8 @@ struct nilfs_iget_args {
int for_gc;
};
+static int nilfs_iget_test(struct inode *inode, void *opaque);
+
void nilfs_inode_add_blocks(struct inode *inode, int n)
{
struct nilfs_root *root = NILFS_I(inode)->i_root;
@@ -348,6 +350,17 @@ const struct address_space_operations nilfs_aops = {
.is_partially_uptodate = block_is_partially_uptodate,
};
+static int nilfs_insert_inode_locked(struct inode *inode,
+ struct nilfs_root *root,
+ unsigned long ino)
+{
+ struct nilfs_iget_args args = {
+ .ino = ino, .root = root, .cno = 0, .for_gc = 0
+ };
+
+ return insert_inode_locked4(inode, ino, nilfs_iget_test, &args);
+}
+
struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
{
struct super_block *sb = dir->i_sb;
@@ -383,7 +396,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
err = nilfs_bmap_read(ii->i_bmap, NULL);
if (err < 0)
- goto failed_bmap;
+ goto failed_after_creation;
set_bit(NILFS_I_BMAP, &ii->i_state);
/* No lock is needed; iget() ensures it. */
@@ -399,21 +412,24 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
spin_lock(&nilfs->ns_next_gen_lock);
inode->i_generation = nilfs->ns_next_generation++;
spin_unlock(&nilfs->ns_next_gen_lock);
- insert_inode_hash(inode);
+ if (nilfs_insert_inode_locked(inode, root, ino) < 0) {
+ err = -EIO;
+ goto failed_after_creation;
+ }
err = nilfs_init_acl(inode, dir);
if (unlikely(err))
- goto failed_acl; /* never occur. When supporting
+ goto failed_after_creation; /* never occur. When supporting
nilfs_init_acl(), proper cancellation of
above jobs should be considered */
return inode;
- failed_acl:
- failed_bmap:
+ failed_after_creation:
clear_nlink(inode);
+ unlock_new_inode(inode);
iput(inode); /* raw_inode will be deleted through
- generic_delete_inode() */
+ nilfs_evict_inode() */
goto failed;
failed_ifile_create_inode:
@@ -461,8 +477,8 @@ int nilfs_read_inode_common(struct inode *inode,
inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
- if (inode->i_nlink == 0 && inode->i_mode == 0)
- return -EINVAL; /* this inode is deleted */
+ if (inode->i_nlink == 0)
+ return -ESTALE; /* this inode is deleted */
inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
ii->i_flags = le32_to_cpu(raw_inode->i_flags);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 9de78f08989e..0f84b257932c 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -51,9 +51,11 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
int err = nilfs_add_link(dentry, inode);
if (!err) {
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
return 0;
}
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput(inode);
return err;
}
@@ -182,6 +184,7 @@ out:
out_fail:
drop_nlink(inode);
nilfs_mark_inode_dirty(inode);
+ unlock_new_inode(inode);
iput(inode);
goto out;
}
@@ -201,11 +204,15 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
inode_inc_link_count(inode);
ihold(inode);
- err = nilfs_add_nondir(dentry, inode);
- if (!err)
+ err = nilfs_add_link(dentry, inode);
+ if (!err) {
+ d_instantiate(dentry, inode);
err = nilfs_transaction_commit(dir->i_sb);
- else
+ } else {
+ inode_dec_link_count(inode);
+ iput(inode);
nilfs_transaction_abort(dir->i_sb);
+ }
return err;
}
@@ -243,6 +250,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
nilfs_mark_inode_dirty(inode);
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
out:
if (!err)
err = nilfs_transaction_commit(dir->i_sb);
@@ -255,6 +263,7 @@ out_fail:
drop_nlink(inode);
drop_nlink(inode);
nilfs_mark_inode_dirty(inode);
+ unlock_new_inode(inode);
iput(inode);
out_dir:
drop_nlink(dir);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 9da25fe9ea61..69bd801afb53 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -808,8 +808,7 @@ void nilfs_put_root(struct nilfs_root *root)
spin_lock(&nilfs->ns_cptree_lock);
rb_erase(&root->rb_node, &nilfs->ns_cptree);
spin_unlock(&nilfs->ns_cptree_lock);
- if (root->ifile)
- iput(root->ifile);
+ iput(root->ifile);
kfree(root);
}
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index caaaf9dfe353..44523f4a6084 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -69,8 +69,8 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
if (old_mask == new_mask)
return;
- if (fsn_mark->i.inode)
- fsnotify_recalc_inode_mask(fsn_mark->i.inode);
+ if (fsn_mark->inode)
+ fsnotify_recalc_inode_mask(fsn_mark->inode);
}
/*
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 9d7e2b9659cb..58b7cdb63da9 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -20,25 +20,24 @@
#if defined(CONFIG_INOTIFY_USER) || defined(CONFIG_FANOTIFY)
-static int show_fdinfo(struct seq_file *m, struct file *f,
- int (*show)(struct seq_file *m, struct fsnotify_mark *mark))
+static void show_fdinfo(struct seq_file *m, struct file *f,
+ void (*show)(struct seq_file *m,
+ struct fsnotify_mark *mark))
{
struct fsnotify_group *group = f->private_data;
struct fsnotify_mark *mark;
- int ret = 0;
mutex_lock(&group->mark_mutex);
list_for_each_entry(mark, &group->marks_list, g_list) {
- ret = show(m, mark);
- if (ret)
+ show(m, mark);
+ if (seq_has_overflowed(m))
break;
}
mutex_unlock(&group->mark_mutex);
- return ret;
}
#if defined(CONFIG_EXPORTFS)
-static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
+static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
{
struct {
struct file_handle handle;
@@ -52,98 +51,85 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
if ((ret == FILEID_INVALID) || (ret < 0)) {
WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
- return 0;
+ return;
}
f.handle.handle_type = ret;
f.handle.handle_bytes = size * sizeof(u32);
- ret = seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:",
- f.handle.handle_bytes, f.handle.handle_type);
+ seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:",
+ f.handle.handle_bytes, f.handle.handle_type);
for (i = 0; i < f.handle.handle_bytes; i++)
- ret |= seq_printf(m, "%02x", (int)f.handle.f_handle[i]);
-
- return ret;
+ seq_printf(m, "%02x", (int)f.handle.f_handle[i]);
}
#else
-static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
+static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
{
- return 0;
}
#endif
#ifdef CONFIG_INOTIFY_USER
-static int inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
+static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
{
struct inotify_inode_mark *inode_mark;
struct inode *inode;
- int ret = 0;
if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE)))
- return 0;
+ return;
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
- inode = igrab(mark->i.inode);
+ inode = igrab(mark->inode);
if (inode) {
- ret = seq_printf(m, "inotify wd:%x ino:%lx sdev:%x "
- "mask:%x ignored_mask:%x ",
- inode_mark->wd, inode->i_ino,
- inode->i_sb->s_dev,
- mark->mask, mark->ignored_mask);
- ret |= show_mark_fhandle(m, inode);
- ret |= seq_putc(m, '\n');
+ seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
+ inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
+ mark->mask, mark->ignored_mask);
+ show_mark_fhandle(m, inode);
+ seq_putc(m, '\n');
iput(inode);
}
-
- return ret;
}
-int inotify_show_fdinfo(struct seq_file *m, struct file *f)
+void inotify_show_fdinfo(struct seq_file *m, struct file *f)
{
- return show_fdinfo(m, f, inotify_fdinfo);
+ show_fdinfo(m, f, inotify_fdinfo);
}
#endif /* CONFIG_INOTIFY_USER */
#ifdef CONFIG_FANOTIFY
-static int fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
+static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
{
unsigned int mflags = 0;
struct inode *inode;
- int ret = 0;
if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE))
- return 0;
+ return;
if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
- inode = igrab(mark->i.inode);
+ inode = igrab(mark->inode);
if (!inode)
- goto out;
- ret = seq_printf(m, "fanotify ino:%lx sdev:%x "
- "mflags:%x mask:%x ignored_mask:%x ",
- inode->i_ino, inode->i_sb->s_dev,
- mflags, mark->mask, mark->ignored_mask);
- ret |= show_mark_fhandle(m, inode);
- ret |= seq_putc(m, '\n');
+ return;
+ seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
+ inode->i_ino, inode->i_sb->s_dev,
+ mflags, mark->mask, mark->ignored_mask);
+ show_mark_fhandle(m, inode);
+ seq_putc(m, '\n');
iput(inode);
} else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) {
- struct mount *mnt = real_mount(mark->m.mnt);
+ struct mount *mnt = real_mount(mark->mnt);
- ret = seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x "
- "ignored_mask:%x\n", mnt->mnt_id, mflags,
- mark->mask, mark->ignored_mask);
+ seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
+ mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
}
-out:
- return ret;
}
-int fanotify_show_fdinfo(struct seq_file *m, struct file *f)
+void fanotify_show_fdinfo(struct seq_file *m, struct file *f)
{
struct fsnotify_group *group = f->private_data;
unsigned int flags = 0;
@@ -169,7 +155,7 @@ int fanotify_show_fdinfo(struct seq_file *m, struct file *f)
seq_printf(m, "fanotify flags:%x event-flags:%x\n",
flags, group->fanotify_data.f_flags);
- return show_fdinfo(m, f, fanotify_fdinfo);
+ show_fdinfo(m, f, fanotify_fdinfo);
}
#endif /* CONFIG_FANOTIFY */
diff --git a/fs/notify/fdinfo.h b/fs/notify/fdinfo.h
index 556afda990e9..9664c4904d6b 100644
--- a/fs/notify/fdinfo.h
+++ b/fs/notify/fdinfo.h
@@ -10,11 +10,11 @@ struct file;
#ifdef CONFIG_PROC_FS
#ifdef CONFIG_INOTIFY_USER
-extern int inotify_show_fdinfo(struct seq_file *m, struct file *f);
+void inotify_show_fdinfo(struct seq_file *m, struct file *f);
#endif
#ifdef CONFIG_FANOTIFY
-extern int fanotify_show_fdinfo(struct seq_file *m, struct file *f);
+void fanotify_show_fdinfo(struct seq_file *m, struct file *f);
#endif
#else /* CONFIG_PROC_FS */
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 9d3e9c50066a..dd3fb0b17be7 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -63,14 +63,14 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
spin_lock(&inode->i_lock);
/* run all of the dentries associated with this inode. Since this is a
* directory, there damn well better only be one item on this list */
- hlist_for_each_entry(alias, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
struct dentry *child;
/* run all of the children of the original inode and fix their
* d_flags to indicate parental interest (their parent is the
* original inode) */
spin_lock(&alias->d_lock);
- list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
+ list_for_each_entry(child, &alias->d_subdirs, d_child) {
if (!child->d_inode)
continue;
@@ -229,36 +229,42 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
&fsnotify_mark_srcu);
}
+ /*
+ * We need to merge inode & vfsmount mark lists so that inode mark
+ * ignore masks are properly reflected for mount mark notifications.
+ * That's why this traversal is so complicated...
+ */
while (inode_node || vfsmount_node) {
- inode_group = vfsmount_group = NULL;
+ inode_group = NULL;
+ inode_mark = NULL;
+ vfsmount_group = NULL;
+ vfsmount_mark = NULL;
if (inode_node) {
inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
- struct fsnotify_mark, i.i_list);
+ struct fsnotify_mark, obj_list);
inode_group = inode_mark->group;
}
if (vfsmount_node) {
vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
- struct fsnotify_mark, m.m_list);
+ struct fsnotify_mark, obj_list);
vfsmount_group = vfsmount_mark->group;
}
- if (inode_group > vfsmount_group) {
- /* handle inode */
- ret = send_to_group(to_tell, inode_mark, NULL, mask,
- data, data_is, cookie, file_name);
- /* we didn't use the vfsmount_mark */
- vfsmount_group = NULL;
- } else if (vfsmount_group > inode_group) {
- ret = send_to_group(to_tell, NULL, vfsmount_mark, mask,
- data, data_is, cookie, file_name);
- inode_group = NULL;
- } else {
- ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
- mask, data, data_is, cookie,
- file_name);
+ if (inode_group && vfsmount_group) {
+ int cmp = fsnotify_compare_groups(inode_group,
+ vfsmount_group);
+ if (cmp > 0) {
+ inode_group = NULL;
+ inode_mark = NULL;
+ } else if (cmp < 0) {
+ vfsmount_group = NULL;
+ vfsmount_mark = NULL;
+ }
}
+ ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask,
+ data, data_is, cookie, file_name);
if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
goto out;
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 9c0898c4cfe1..13a00be516d2 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -12,8 +12,19 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group);
/* protects reads of inode and vfsmount marks list */
extern struct srcu_struct fsnotify_mark_srcu;
+/* Calculate mask of events for a list of marks */
+extern u32 fsnotify_recalc_mask(struct hlist_head *head);
+
+/* compare two groups for sorting of marks lists */
+extern int fsnotify_compare_groups(struct fsnotify_group *a,
+ struct fsnotify_group *b);
+
extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
__u32 mask);
+/* Add mark to a proper place in mark list */
+extern int fsnotify_add_mark_list(struct hlist_head *head,
+ struct fsnotify_mark *mark,
+ int allow_dups);
/* add a mark to an inode */
extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group, struct inode *inode,
@@ -27,6 +38,11 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
/* inode specific destruction of a mark */
extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
+/* Destroy all marks in the given list */
+extern void fsnotify_destroy_marks(struct list_head *to_free);
+/* Find mark belonging to given group in the list of marks */
+extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
+ struct fsnotify_group *group);
/* run the list of all marks associated with inode and flag them to be freed */
extern void fsnotify_clear_marks_by_inode(struct inode *inode);
/* run the list of all marks associated with vfsmount and flag them to be freed */
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 9ce062218de9..3daf513ee99e 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -31,28 +31,13 @@
#include "../internal.h"
/*
- * Recalculate the mask of events relevant to a given inode locked.
- */
-static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
-{
- struct fsnotify_mark *mark;
- __u32 new_mask = 0;
-
- assert_spin_locked(&inode->i_lock);
-
- hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list)
- new_mask |= mark->mask;
- inode->i_fsnotify_mask = new_mask;
-}
-
-/*
* Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
* any notifier is interested in hearing for this inode.
*/
void fsnotify_recalc_inode_mask(struct inode *inode)
{
spin_lock(&inode->i_lock);
- fsnotify_recalc_inode_mask_locked(inode);
+ inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
spin_unlock(&inode->i_lock);
__fsnotify_update_child_dentry_flags(inode);
@@ -60,23 +45,22 @@ void fsnotify_recalc_inode_mask(struct inode *inode)
void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
{
- struct inode *inode = mark->i.inode;
+ struct inode *inode = mark->inode;
BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
assert_spin_locked(&mark->lock);
spin_lock(&inode->i_lock);
- hlist_del_init_rcu(&mark->i.i_list);
- mark->i.inode = NULL;
+ hlist_del_init_rcu(&mark->obj_list);
+ mark->inode = NULL;
/*
* this mark is now off the inode->i_fsnotify_marks list and we
* hold the inode->i_lock, so this is the perfect time to update the
* inode->i_fsnotify_mask
*/
- fsnotify_recalc_inode_mask_locked(inode);
-
+ inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
spin_unlock(&inode->i_lock);
}
@@ -85,30 +69,19 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
*/
void fsnotify_clear_marks_by_inode(struct inode *inode)
{
- struct fsnotify_mark *mark, *lmark;
+ struct fsnotify_mark *mark;
struct hlist_node *n;
LIST_HEAD(free_list);
spin_lock(&inode->i_lock);
- hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, i.i_list) {
- list_add(&mark->i.free_i_list, &free_list);
- hlist_del_init_rcu(&mark->i.i_list);
+ hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) {
+ list_add(&mark->free_list, &free_list);
+ hlist_del_init_rcu(&mark->obj_list);
fsnotify_get_mark(mark);
}
spin_unlock(&inode->i_lock);
- list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) {
- struct fsnotify_group *group;
-
- spin_lock(&mark->lock);
- fsnotify_get_group(mark->group);
- group = mark->group;
- spin_unlock(&mark->lock);
-
- fsnotify_destroy_mark(mark, group);
- fsnotify_put_mark(mark);
- fsnotify_put_group(group);
- }
+ fsnotify_destroy_marks(&free_list);
}
/*
@@ -123,34 +96,13 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
* given a group and inode, find the mark associated with that combination.
* if found take a reference to that mark and return it, else return NULL
*/
-static struct fsnotify_mark *fsnotify_find_inode_mark_locked(
- struct fsnotify_group *group,
- struct inode *inode)
-{
- struct fsnotify_mark *mark;
-
- assert_spin_locked(&inode->i_lock);
-
- hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) {
- if (mark->group == group) {
- fsnotify_get_mark(mark);
- return mark;
- }
- }
- return NULL;
-}
-
-/*
- * given a group and inode, find the mark associated with that combination.
- * if found take a reference to that mark and return it, else return NULL
- */
struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
struct inode *inode)
{
struct fsnotify_mark *mark;
spin_lock(&inode->i_lock);
- mark = fsnotify_find_inode_mark_locked(group, inode);
+ mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
spin_unlock(&inode->i_lock);
return mark;
@@ -168,10 +120,10 @@ void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
assert_spin_locked(&mark->lock);
if (mask &&
- mark->i.inode &&
+ mark->inode &&
!(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) {
mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED;
- inode = igrab(mark->i.inode);
+ inode = igrab(mark->inode);
/*
* we shouldn't be able to get here if the inode wasn't
* already safely held in memory. But bug in case it
@@ -192,8 +144,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group, struct inode *inode,
int allow_dups)
{
- struct fsnotify_mark *lmark, *last = NULL;
- int ret = 0;
+ int ret;
mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
@@ -201,40 +152,10 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
assert_spin_locked(&mark->lock);
spin_lock(&inode->i_lock);
-
- mark->i.inode = inode;
-
- /* is mark the first mark? */
- if (hlist_empty(&inode->i_fsnotify_marks)) {
- hlist_add_head_rcu(&mark->i.i_list, &inode->i_fsnotify_marks);
- goto out;
- }
-
- /* should mark be in the middle of the current list? */
- hlist_for_each_entry(lmark, &inode->i_fsnotify_marks, i.i_list) {
- last = lmark;
-
- if ((lmark->group == group) && !allow_dups) {
- ret = -EEXIST;
- goto out;
- }
-
- if (mark->group->priority < lmark->group->priority)
- continue;
-
- if ((mark->group->priority == lmark->group->priority) &&
- (mark->group < lmark->group))
- continue;
-
- hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
- goto out;
- }
-
- BUG_ON(last == NULL);
- /* mark should be the last entry. last is the current last entry */
- hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list);
-out:
- fsnotify_recalc_inode_mask_locked(inode);
+ mark->inode = inode;
+ ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark,
+ allow_dups);
+ inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
spin_unlock(&inode->i_lock);
return ret;
@@ -288,20 +209,25 @@ void fsnotify_unmount_inodes(struct list_head *list)
spin_unlock(&inode->i_lock);
/* In case the dropping of a reference would nuke next_i. */
- if ((&next_i->i_sb_list != list) &&
- atomic_read(&next_i->i_count)) {
+ while (&next_i->i_sb_list != list) {
spin_lock(&next_i->i_lock);
- if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
+ if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
+ atomic_read(&next_i->i_count)) {
__iget(next_i);
need_iput = next_i;
+ spin_unlock(&next_i->i_lock);
+ break;
}
spin_unlock(&next_i->i_lock);
+ next_i = list_entry(next_i->i_sb_list.next,
+ struct inode, i_sb_list);
}
/*
- * We can safely drop inode_sb_list_lock here because we hold
- * references on both inode and next_i. Also no new inodes
- * will be added since the umount has begun.
+ * We can safely drop inode_sb_list_lock here because either
+ * we actually hold references on both inode and next_i or
+ * end of list. Also no new inodes will be added since the
+ * umount has begun.
*/
spin_unlock(&inode_sb_list_lock);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 7d888d77d59a..2cd900c2c737 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -156,7 +156,7 @@ static int idr_callback(int id, void *p, void *data)
*/
if (fsn_mark)
printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n",
- fsn_mark->group, fsn_mark->i.inode, i_mark->wd);
+ fsn_mark->group, fsn_mark->inode, i_mark->wd);
return 0;
}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index daf76652fe58..450648697433 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -227,14 +227,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
struct fsnotify_event *kevent;
char __user *start;
int ret;
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
start = buf;
group = file->private_data;
+ add_wait_queue(&group->notification_waitq, &wait);
while (1) {
- prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
-
mutex_lock(&group->notification_mutex);
kevent = get_one_event(group, count);
mutex_unlock(&group->notification_mutex);
@@ -264,10 +263,10 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
if (start != buf)
break;
- schedule();
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
+ remove_wait_queue(&group->notification_waitq, &wait);
- finish_wait(&group->notification_waitq, &wait);
if (start != buf && ret != -EFAULT)
ret = buf - start;
return ret;
@@ -434,7 +433,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
if (wd == -1) {
WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
" i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
- i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
+ i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
goto out;
}
@@ -443,7 +442,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
if (unlikely(!found_i_mark)) {
WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
" i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
- i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
+ i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
goto out;
}
@@ -457,9 +456,9 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
"mark->inode=%p found_i_mark=%p found_i_mark->wd=%d "
"found_i_mark->group=%p found_i_mark->inode=%p\n",
__func__, i_mark, i_mark->wd, i_mark->fsn_mark.group,
- i_mark->fsn_mark.i.inode, found_i_mark, found_i_mark->wd,
+ i_mark->fsn_mark.inode, found_i_mark, found_i_mark->wd,
found_i_mark->fsn_mark.group,
- found_i_mark->fsn_mark.i.inode);
+ found_i_mark->fsn_mark.inode);
goto out;
}
@@ -471,7 +470,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) {
printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
" i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
- i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
+ i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
/* we can't really recover with bad ref cnting.. */
BUG();
}
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d90deaa08e78..92e48c70f0f0 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -110,6 +110,17 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
}
}
+/* Calculate mask of events for a list of marks */
+u32 fsnotify_recalc_mask(struct hlist_head *head)
+{
+ u32 new_mask = 0;
+ struct fsnotify_mark *mark;
+
+ hlist_for_each_entry(mark, head, obj_list)
+ new_mask |= mark->mask;
+ return new_mask;
+}
+
/*
* Any time a mark is getting freed we end up here.
* The caller had better be holding a reference to this mark so we don't actually
@@ -133,7 +144,7 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
- inode = mark->i.inode;
+ inode = mark->inode;
fsnotify_destroy_inode_mark(mark);
} else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
fsnotify_destroy_vfsmount_mark(mark);
@@ -150,7 +161,7 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
mutex_unlock(&group->mark_mutex);
spin_lock(&destroy_lock);
- list_add(&mark->destroy_list, &destroy_list);
+ list_add(&mark->g_list, &destroy_list);
spin_unlock(&destroy_lock);
wake_up(&destroy_waitq);
/*
@@ -192,6 +203,27 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
mutex_unlock(&group->mark_mutex);
}
+/*
+ * Destroy all marks in the given list. The marks must be already detached from
+ * the original inode / vfsmount.
+ */
+void fsnotify_destroy_marks(struct list_head *to_free)
+{
+ struct fsnotify_mark *mark, *lmark;
+ struct fsnotify_group *group;
+
+ list_for_each_entry_safe(mark, lmark, to_free, free_list) {
+ spin_lock(&mark->lock);
+ fsnotify_get_group(mark->group);
+ group = mark->group;
+ spin_unlock(&mark->lock);
+
+ fsnotify_destroy_mark(mark, group);
+ fsnotify_put_mark(mark);
+ fsnotify_put_group(group);
+ }
+}
+
void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
{
assert_spin_locked(&mark->lock);
@@ -210,6 +242,75 @@ void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mas
}
/*
+ * Sorting function for lists of fsnotify marks.
+ *
+ * Fanotify supports different notification classes (reflected as priority of
+ * notification group). Events shall be passed to notification groups in
+ * decreasing priority order. To achieve this marks in notification lists for
+ * inodes and vfsmounts are sorted so that priorities of corresponding groups
+ * are descending.
+ *
+ * Furthermore correct handling of the ignore mask requires processing inode
+ * and vfsmount marks of each group together. Using the group address as
+ * further sort criterion provides a unique sorting order and thus we can
+ * merge inode and vfsmount lists of marks in linear time and find groups
+ * present in both lists.
+ *
+ * A return value of 1 signifies that b has priority over a.
+ * A return value of 0 signifies that the two marks have to be handled together.
+ * A return value of -1 signifies that a has priority over b.
+ */
+int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
+{
+ if (a == b)
+ return 0;
+ if (!a)
+ return 1;
+ if (!b)
+ return -1;
+ if (a->priority < b->priority)
+ return 1;
+ if (a->priority > b->priority)
+ return -1;
+ if (a < b)
+ return 1;
+ return -1;
+}
+
+/* Add mark into proper place in given list of marks */
+int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark,
+ int allow_dups)
+{
+ struct fsnotify_mark *lmark, *last = NULL;
+ int cmp;
+
+ /* is mark the first mark? */
+ if (hlist_empty(head)) {
+ hlist_add_head_rcu(&mark->obj_list, head);
+ return 0;
+ }
+
+ /* should mark be in the middle of the current list? */
+ hlist_for_each_entry(lmark, head, obj_list) {
+ last = lmark;
+
+ if ((lmark->group == mark->group) && !allow_dups)
+ return -EEXIST;
+
+ cmp = fsnotify_compare_groups(lmark->group, mark->group);
+ if (cmp >= 0) {
+ hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
+ return 0;
+ }
+ }
+
+ BUG_ON(last == NULL);
+ /* mark should be the last entry. last is the current last entry */
+ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
+ return 0;
+}
+
+/*
* Attach an initialized mark to a given group and fs object.
* These marks may be used for the fsnotify backend to determine which
* event types should be delivered to which group.
@@ -269,7 +370,7 @@ err:
spin_unlock(&mark->lock);
spin_lock(&destroy_lock);
- list_add(&mark->destroy_list, &destroy_list);
+ list_add(&mark->g_list, &destroy_list);
spin_unlock(&destroy_lock);
wake_up(&destroy_waitq);
@@ -287,6 +388,24 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
}
/*
+ * Given a list of marks, find the mark associated with given group. If found
+ * take a reference to that mark and return it, else return NULL.
+ */
+struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
+ struct fsnotify_group *group)
+{
+ struct fsnotify_mark *mark;
+
+ hlist_for_each_entry(mark, head, obj_list) {
+ if (mark->group == group) {
+ fsnotify_get_mark(mark);
+ return mark;
+ }
+ }
+ return NULL;
+}
+
+/*
* clear any marks in a group in which mark->flags & flags is true
*/
void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
@@ -316,8 +435,8 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
{
assert_spin_locked(&old->lock);
- new->i.inode = old->i.inode;
- new->m.mnt = old->m.mnt;
+ new->inode = old->inode;
+ new->mnt = old->mnt;
if (old->group)
fsnotify_get_group(old->group);
new->group = old->group;
@@ -350,8 +469,8 @@ static int fsnotify_mark_destroy(void *ignored)
synchronize_srcu(&fsnotify_mark_srcu);
- list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) {
- list_del_init(&mark->destroy_list);
+ list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
+ list_del_init(&mark->g_list);
fsnotify_put_mark(mark);
}
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index ac851e8376b1..326b148e623c 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -32,31 +32,20 @@
void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
{
- struct fsnotify_mark *mark, *lmark;
+ struct fsnotify_mark *mark;
struct hlist_node *n;
struct mount *m = real_mount(mnt);
LIST_HEAD(free_list);
spin_lock(&mnt->mnt_root->d_lock);
- hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, m.m_list) {
- list_add(&mark->m.free_m_list, &free_list);
- hlist_del_init_rcu(&mark->m.m_list);
+ hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) {
+ list_add(&mark->free_list, &free_list);
+ hlist_del_init_rcu(&mark->obj_list);
fsnotify_get_mark(mark);
}
spin_unlock(&mnt->mnt_root->d_lock);
- list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) {
- struct fsnotify_group *group;
-
- spin_lock(&mark->lock);
- fsnotify_get_group(mark->group);
- group = mark->group;
- spin_unlock(&mark->lock);
-
- fsnotify_destroy_mark(mark, group);
- fsnotify_put_mark(mark);
- fsnotify_put_group(group);
- }
+ fsnotify_destroy_marks(&free_list);
}
void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
@@ -65,66 +54,35 @@ void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
}
/*
- * Recalculate the mask of events relevant to a given vfsmount locked.
- */
-static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt)
-{
- struct mount *m = real_mount(mnt);
- struct fsnotify_mark *mark;
- __u32 new_mask = 0;
-
- assert_spin_locked(&mnt->mnt_root->d_lock);
-
- hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list)
- new_mask |= mark->mask;
- m->mnt_fsnotify_mask = new_mask;
-}
-
-/*
* Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types
* any notifier is interested in hearing for this mount point
*/
void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
{
+ struct mount *m = real_mount(mnt);
+
spin_lock(&mnt->mnt_root->d_lock);
- fsnotify_recalc_vfsmount_mask_locked(mnt);
+ m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
spin_unlock(&mnt->mnt_root->d_lock);
}
void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
{
- struct vfsmount *mnt = mark->m.mnt;
+ struct vfsmount *mnt = mark->mnt;
+ struct mount *m = real_mount(mnt);
BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
assert_spin_locked(&mark->lock);
spin_lock(&mnt->mnt_root->d_lock);
- hlist_del_init_rcu(&mark->m.m_list);
- mark->m.mnt = NULL;
-
- fsnotify_recalc_vfsmount_mask_locked(mnt);
+ hlist_del_init_rcu(&mark->obj_list);
+ mark->mnt = NULL;
+ m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
spin_unlock(&mnt->mnt_root->d_lock);
}
-static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group,
- struct vfsmount *mnt)
-{
- struct mount *m = real_mount(mnt);
- struct fsnotify_mark *mark;
-
- assert_spin_locked(&mnt->mnt_root->d_lock);
-
- hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) {
- if (mark->group == group) {
- fsnotify_get_mark(mark);
- return mark;
- }
- }
- return NULL;
-}
-
/*
* given a group and vfsmount, find the mark associated with that combination.
* if found take a reference to that mark and return it, else return NULL
@@ -132,10 +90,11 @@ static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_
struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
struct vfsmount *mnt)
{
+ struct mount *m = real_mount(mnt);
struct fsnotify_mark *mark;
spin_lock(&mnt->mnt_root->d_lock);
- mark = fsnotify_find_vfsmount_mark_locked(group, mnt);
+ mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group);
spin_unlock(&mnt->mnt_root->d_lock);
return mark;
@@ -151,8 +110,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
int allow_dups)
{
struct mount *m = real_mount(mnt);
- struct fsnotify_mark *lmark, *last = NULL;
- int ret = 0;
+ int ret;
mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
@@ -160,40 +118,9 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
assert_spin_locked(&mark->lock);
spin_lock(&mnt->mnt_root->d_lock);
-
- mark->m.mnt = mnt;
-
- /* is mark the first mark? */
- if (hlist_empty(&m->mnt_fsnotify_marks)) {
- hlist_add_head_rcu(&mark->m.m_list, &m->mnt_fsnotify_marks);
- goto out;
- }
-
- /* should mark be in the middle of the current list? */
- hlist_for_each_entry(lmark, &m->mnt_fsnotify_marks, m.m_list) {
- last = lmark;
-
- if ((lmark->group == group) && !allow_dups) {
- ret = -EEXIST;
- goto out;
- }
-
- if (mark->group->priority < lmark->group->priority)
- continue;
-
- if ((mark->group->priority == lmark->group->priority) &&
- (mark->group < lmark->group))
- continue;
-
- hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
- goto out;
- }
-
- BUG_ON(last == NULL);
- /* mark should be the last entry. last is the current last entry */
- hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list);
-out:
- fsnotify_recalc_vfsmount_mask_locked(mnt);
+ mark->mnt = mnt;
+ ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups);
+ m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
spin_unlock(&mnt->mnt_root->d_lock);
return ret;
diff --git a/fs/nsfs.c b/fs/nsfs.c
new file mode 100644
index 000000000000..af1b24fa899d
--- /dev/null
+++ b/fs/nsfs.c
@@ -0,0 +1,161 @@
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/proc_ns.h>
+#include <linux/magic.h>
+#include <linux/ktime.h>
+
+static struct vfsmount *nsfs_mnt;
+
+static const struct file_operations ns_file_operations = {
+ .llseek = no_llseek,
+};
+
+static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+ struct inode *inode = dentry->d_inode;
+ const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+
+ return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
+ ns_ops->name, inode->i_ino);
+}
+
+static void ns_prune_dentry(struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ if (inode) {
+ struct ns_common *ns = inode->i_private;
+ atomic_long_set(&ns->stashed, 0);
+ }
+}
+
+const struct dentry_operations ns_dentry_operations =
+{
+ .d_prune = ns_prune_dentry,
+ .d_delete = always_delete_dentry,
+ .d_dname = ns_dname,
+};
+
+static void nsfs_evict(struct inode *inode)
+{
+ struct ns_common *ns = inode->i_private;
+ clear_inode(inode);
+ ns->ops->put(ns);
+}
+
+void *ns_get_path(struct path *path, struct task_struct *task,
+ const struct proc_ns_operations *ns_ops)
+{
+ struct vfsmount *mnt = mntget(nsfs_mnt);
+ struct qstr qname = { .name = "", };
+ struct dentry *dentry;
+ struct inode *inode;
+ struct ns_common *ns;
+ unsigned long d;
+
+again:
+ ns = ns_ops->get(task);
+ if (!ns) {
+ mntput(mnt);
+ return ERR_PTR(-ENOENT);
+ }
+ rcu_read_lock();
+ d = atomic_long_read(&ns->stashed);
+ if (!d)
+ goto slow;
+ dentry = (struct dentry *)d;
+ if (!lockref_get_not_dead(&dentry->d_lockref))
+ goto slow;
+ rcu_read_unlock();
+ ns_ops->put(ns);
+got_it:
+ path->mnt = mnt;
+ path->dentry = dentry;
+ return NULL;
+slow:
+ rcu_read_unlock();
+ inode = new_inode_pseudo(mnt->mnt_sb);
+ if (!inode) {
+ ns_ops->put(ns);
+ mntput(mnt);
+ return ERR_PTR(-ENOMEM);
+ }
+ inode->i_ino = ns->inum;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_flags |= S_IMMUTABLE;
+ inode->i_mode = S_IFREG | S_IRUGO;
+ inode->i_fop = &ns_file_operations;
+ inode->i_private = ns;
+
+ dentry = d_alloc_pseudo(mnt->mnt_sb, &qname);
+ if (!dentry) {
+ iput(inode);
+ mntput(mnt);
+ return ERR_PTR(-ENOMEM);
+ }
+ d_instantiate(dentry, inode);
+ dentry->d_fsdata = (void *)ns_ops;
+ d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
+ if (d) {
+ d_delete(dentry); /* make sure ->d_prune() does nothing */
+ dput(dentry);
+ cpu_relax();
+ goto again;
+ }
+ goto got_it;
+}
+
+int ns_get_name(char *buf, size_t size, struct task_struct *task,
+ const struct proc_ns_operations *ns_ops)
+{
+ struct ns_common *ns;
+ int res = -ENOENT;
+ ns = ns_ops->get(task);
+ if (ns) {
+ res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum);
+ ns_ops->put(ns);
+ }
+ return res;
+}
+
+struct file *proc_ns_fget(int fd)
+{
+ struct file *file;
+
+ file = fget(fd);
+ if (!file)
+ return ERR_PTR(-EBADF);
+
+ if (file->f_op != &ns_file_operations)
+ goto out_invalid;
+
+ return file;
+
+out_invalid:
+ fput(file);
+ return ERR_PTR(-EINVAL);
+}
+
+static const struct super_operations nsfs_ops = {
+ .statfs = simple_statfs,
+ .evict_inode = nsfs_evict,
+};
+static struct dentry *nsfs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ return mount_pseudo(fs_type, "nsfs:", &nsfs_ops,
+ &ns_dentry_operations, NSFS_MAGIC);
+}
+static struct file_system_type nsfs = {
+ .name = "nsfs",
+ .mount = nsfs_mount,
+ .kill_sb = kill_anon_super,
+};
+
+void __init nsfs_init(void)
+{
+ nsfs_mnt = kern_mount(&nsfs);
+ if (IS_ERR(nsfs_mnt))
+ panic("can't set nsfs up\n");
+ nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER;
+}
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 436f36037e09..b3973c2fd190 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -111,8 +111,8 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
unsigned long dent_ino;
int uname_len;
- ntfs_debug("Looking up %s in directory inode 0x%lx.",
- dent->d_name.name, dir_ino->i_ino);
+ ntfs_debug("Looking up %pd in directory inode 0x%lx.",
+ dent, dir_ino->i_ino);
/* Convert the name of the dentry to Unicode. */
uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len,
&uname);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a93bf9892256..fcae9ef1a328 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5662,7 +5662,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 cpos, u32 phys_cpos, u32 len, int flags,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- u64 refcount_loc)
+ u64 refcount_loc, bool refcount_tree_locked)
{
int ret, credits = 0, extra_blocks = 0;
u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
@@ -5676,11 +5676,13 @@ int ocfs2_remove_btree_range(struct inode *inode,
BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
OCFS2_HAS_REFCOUNT_FL));
- ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
- &ref_tree, NULL);
- if (ret) {
- mlog_errno(ret);
- goto bail;
+ if (!refcount_tree_locked) {
+ ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+ &ref_tree, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto bail;
+ }
}
ret = ocfs2_prepare_refcount_change_for_del(inode,
@@ -7021,6 +7023,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
struct ocfs2_extent_tree et;
struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
ocfs2_init_dealloc_ctxt(&dealloc);
@@ -7130,9 +7133,18 @@ start:
phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+ if ((flags & OCFS2_EXT_REFCOUNTED) && trunc_len && !ref_tree) {
+ status = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+ &ref_tree, NULL);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
phys_cpos, trunc_len, flags, &dealloc,
- refcount_loc);
+ refcount_loc, true);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -7147,6 +7159,8 @@ start:
goto start;
bail:
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
ocfs2_schedule_truncate_log_flush(osb, 1);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index ca381c584127..fb09b97db162 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -142,7 +142,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 cpos, u32 phys_cpos, u32 len, int flags,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- u64 refcount_loc);
+ u64 refcount_loc, bool refcount_tree_locked);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct ocfs2_extent_tree *et);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1ef547e49373..46d93e941f3d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -894,7 +894,7 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
}
}
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
{
int i;
@@ -915,7 +915,11 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
page_cache_release(wc->w_target_page);
}
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
+}
+static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+{
+ ocfs2_unlock_pages(wc);
brelse(wc->w_di_bh);
kfree(wc);
}
@@ -1251,7 +1255,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
NULL);
if (ret < 0) {
- ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, "
+ mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
"at logical block %llu",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)v_blkno);
@@ -2042,11 +2046,19 @@ out_write_size:
ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, wc->w_di_bh);
+ /* unlock pages before dealloc since it needs acquiring j_trans_barrier
+ * lock, or it will cause a deadlock since journal commit threads holds
+ * this lock and will ask for the page lock when flushing the data.
+ * put it here to preserve the unlock order.
+ */
+ ocfs2_unlock_pages(wc);
+
ocfs2_commit_trans(osb, handle);
ocfs2_run_deallocs(osb, &wc->w_dealloc);
- ocfs2_free_write_ctxt(wc);
+ brelse(wc->w_di_bh);
+ kfree(wc);
return copied;
}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index eb9d48746ab4..16eff45727ee 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1127,10 +1127,10 @@ static int o2hb_thread(void *data)
elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
mlog(ML_HEARTBEAT,
- "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
+ "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n",
before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
- elapsed_msec);
+ elapsed_msec, ret);
if (!kthread_should_stop() &&
elapsed_msec < reg->hr_timeout_ms) {
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 97de0fbd9f78..2e355e0f8335 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -925,7 +925,7 @@ static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
size_t veclen, size_t total)
{
int ret;
- struct msghdr msg;
+ struct msghdr msg = {.msg_flags = 0,};
if (sock == NULL) {
ret = -EINVAL;
@@ -1736,7 +1736,7 @@ static void o2net_connect_expired(struct work_struct *work)
o2net_idle_timeout() / 1000,
o2net_idle_timeout() % 1000);
- o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
+ o2net_set_nn_state(nn, NULL, 0, 0);
}
spin_unlock(&nn->nn_lock);
}
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index e2e05a106beb..4fda7a5f3088 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -172,7 +172,7 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
struct dentry *dentry;
spin_lock(&inode->i_lock);
- hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
spin_lock(&dentry->d_lock);
if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
trace_ocfs2_find_local_alias(dentry->d_name.len,
@@ -251,8 +251,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
if (dl) {
mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
- " \"%.*s\": old parent: %llu, new: %llu\n",
- dentry->d_name.len, dentry->d_name.name,
+ " \"%pd\": old parent: %llu, new: %llu\n",
+ dentry,
(unsigned long long)parent_blkno,
(unsigned long long)dl->dl_parent_blkno);
return 0;
@@ -277,8 +277,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
- " \"%.*s\": old parent: %llu, new: %llu\n",
- dentry->d_name.len, dentry->d_name.name,
+ " \"%pd\": old parent: %llu, new: %llu\n",
+ dentry,
(unsigned long long)parent_blkno,
(unsigned long long)dl->dl_parent_blkno);
@@ -406,17 +406,15 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
if (inode)
ino = (unsigned long long)OCFS2_I(inode)->ip_blkno;
mlog(ML_ERROR, "Dentry is missing cluster lock. "
- "inode: %llu, d_flags: 0x%x, d_name: %.*s\n",
- ino, dentry->d_flags, dentry->d_name.len,
- dentry->d_name.name);
+ "inode: %llu, d_flags: 0x%x, d_name: %pd\n",
+ ino, dentry->d_flags, dentry);
}
goto out;
}
- mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n",
- dentry->d_name.len, dentry->d_name.name,
- dl->dl_count);
+ mlog_bug_on_msg(dl->dl_count == 0, "dentry: %pd, count: %u\n",
+ dentry, dl->dl_count);
ocfs2_dentry_lock_put(OCFS2_SB(dentry->d_sb), dl);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 0717662b4aef..319e786175af 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -744,7 +744,7 @@ restart:
if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
/* read error, skip block & hope for the best.
* ocfs2_read_dir_block() has released the bh. */
- ocfs2_error(dir->i_sb, "reading directory %llu, "
+ mlog(ML_ERROR, "reading directory %llu, "
"offset %lu\n",
(unsigned long long)OCFS2_I(dir)->ip_blkno,
block);
@@ -2073,10 +2073,12 @@ struct ocfs2_empty_dir_priv {
unsigned seen_other;
unsigned dx_dir;
};
-static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
- loff_t pos, u64 ino, unsigned type)
+static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
+ int name_len, loff_t pos, u64 ino,
+ unsigned type)
{
- struct ocfs2_empty_dir_priv *p = priv;
+ struct ocfs2_empty_dir_priv *p =
+ container_of(ctx, struct ocfs2_empty_dir_priv, ctx);
/*
* Check the positions of "." and ".." records to be sure
@@ -4477,7 +4479,7 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
- &dealloc, 0);
+ &dealloc, 0, false);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 02d315fef432..50a59d2337b2 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -877,7 +877,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
* to be put in someone's domain map.
* Also, explicitly disallow joining at certain troublesome
* times (ie. during recovery). */
- if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
+ if (dlm->dlm_state != DLM_CTXT_LEAVING) {
int bit = query->node_idx;
spin_lock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 215e41abf101..a6944b25fd5b 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -695,14 +695,6 @@ void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
res->inflight_assert_workers);
}
-static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res)
-{
- spin_lock(&res->spinlock);
- __dlm_lockres_grab_inflight_worker(dlm, res);
- spin_unlock(&res->spinlock);
-}
-
static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
@@ -1460,6 +1452,18 @@ way_up_top:
/* take care of the easy cases up front */
spin_lock(&res->spinlock);
+
+ /*
+ * Right after dlm spinlock was released, dlm_thread could have
+ * purged the lockres. Check if lockres got unhashed. If so
+ * start over.
+ */
+ if (hlist_unhashed(&res->hash_node)) {
+ spin_unlock(&res->spinlock);
+ dlm_lockres_put(res);
+ goto way_up_top;
+ }
+
if (res->state & (DLM_LOCK_RES_RECOVERING|
DLM_LOCK_RES_MIGRATING)) {
spin_unlock(&res->spinlock);
@@ -1634,6 +1638,7 @@ send_response:
}
mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
dlm->node_num, res->lockname.len, res->lockname.name);
+ spin_lock(&res->spinlock);
ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
DLM_ASSERT_MASTER_MLE_CLEANUP);
if (ret < 0) {
@@ -1641,7 +1646,8 @@ send_response:
response = DLM_MASTER_RESP_ERROR;
dlm_lockres_put(res);
} else
- dlm_lockres_grab_inflight_worker(dlm, res);
+ __dlm_lockres_grab_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
} else {
if (res)
dlm_lockres_put(res);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 3365839d2971..79b5af5e6a7b 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1656,14 +1656,18 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
req.namelen = res->lockname.len;
memcpy(req.name, res->lockname.name, res->lockname.len);
+resend:
ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key,
&req, sizeof(req), nodenum, &status);
- /* XXX: negative status not handled properly here. */
if (ret < 0)
mlog(ML_ERROR, "Error %d when sending message %u (key "
"0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
dlm->key, nodenum);
- else {
+ else if (status == -ENOMEM) {
+ mlog_errno(status);
+ msleep(50);
+ goto resend;
+ } else {
BUG_ON(status < 0);
BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
*real_master = (u8) (status & 0xff);
@@ -1705,9 +1709,13 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
int ret = dlm_dispatch_assert_master(dlm, res,
0, 0, flags);
if (ret < 0) {
- mlog_errno(-ENOMEM);
- /* retry!? */
- BUG();
+ mlog_errno(ret);
+ spin_unlock(&res->spinlock);
+ dlm_lockres_put(res);
+ spin_unlock(&dlm->spinlock);
+ dlm_put(dlm);
+ /* sender will take care of this and retry */
+ return ret;
} else
__dlm_lockres_grab_inflight_worker(dlm, res);
spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 09b7d9dac71d..57c40e34f56f 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -565,8 +565,8 @@ static int dlmfs_unlink(struct inode *dir,
* to acquire a lock, this basically destroys our lockres. */
status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
if (status < 0) {
- mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n",
- dentry->d_name.len, dentry->d_name.name, status);
+ mlog(ML_ERROR, "unlink %pd, error %d from destroy\n",
+ dentry, status);
goto bail;
}
status = simple_unlink(dir, dentry);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 21262f2b1654..1c423af04c69 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -861,8 +861,13 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
* We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
* the OCFS2_LOCK_BUSY flag to prevent the dc thread from
* downconverting the lock before the upconvert has fully completed.
+ * Do not prevent the dc thread from downconverting if NONBLOCK lock
+ * had already returned.
*/
- lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+ if (!(lockres->l_flags & OCFS2_LOCK_NONBLOCK_FINISHED))
+ lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+ else
+ lockres_clear_flags(lockres, OCFS2_LOCK_NONBLOCK_FINISHED);
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
}
@@ -1324,13 +1329,12 @@ static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
/* returns 0 if the mw that was removed was already satisfied, -EBUSY
* if the mask still hadn't reached its goal */
-static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
+static int __lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
struct ocfs2_mask_waiter *mw)
{
- unsigned long flags;
int ret = 0;
- spin_lock_irqsave(&lockres->l_lock, flags);
+ assert_spin_locked(&lockres->l_lock);
if (!list_empty(&mw->mw_item)) {
if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
ret = -EBUSY;
@@ -1338,6 +1342,18 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
list_del_init(&mw->mw_item);
init_completion(&mw->mw_complete);
}
+
+ return ret;
+}
+
+static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
+ struct ocfs2_mask_waiter *mw)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ ret = __lockres_remove_mask_waiter(lockres, mw);
spin_unlock_irqrestore(&lockres->l_lock, flags);
return ret;
@@ -1373,6 +1389,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
unsigned long flags;
unsigned int gen;
int noqueue_attempted = 0;
+ int dlm_locked = 0;
ocfs2_init_mask_waiter(&mw);
@@ -1481,6 +1498,7 @@ again:
ocfs2_recover_from_dlm_error(lockres, 1);
goto out;
}
+ dlm_locked = 1;
mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
lockres->l_name);
@@ -1514,10 +1532,17 @@ out:
if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
wait = 0;
- if (lockres_remove_mask_waiter(lockres, &mw))
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ if (__lockres_remove_mask_waiter(lockres, &mw)) {
+ if (dlm_locked)
+ lockres_or_flags(lockres,
+ OCFS2_LOCK_NONBLOCK_FINISHED);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
ret = -EAGAIN;
- else
+ } else {
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
goto again;
+ }
}
if (wait) {
ret = ocfs2_wait_for_mask(&mw);
@@ -3725,8 +3750,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
break;
spin_unlock(&dentry_attach_lock);
- mlog(0, "d_delete(%.*s);\n", dentry->d_name.len,
- dentry->d_name.name);
+ mlog(0, "d_delete(%pd);\n", dentry);
/*
* The following dcache calls may do an
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 324dc93ac896..3950693dd0f6 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1803,7 +1803,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
phys_cpos, trunc_len, flags,
- &dealloc, refcount_loc);
+ &dealloc, refcount_loc, false);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -2381,9 +2381,7 @@ out_dio:
if (ret < 0)
written = ret;
- if (!ret && ((old_size != i_size_read(inode)) ||
- (old_clusters != OCFS2_I(inode)->ip_clusters) ||
- has_refcount)) {
+ if (!ret) {
ret = jbd2_journal_force_commit(osb->journal->j_journal);
if (ret < 0)
written = ret;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 437de7f768c6..c8b25de9efbb 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -540,8 +540,7 @@ bail:
if (status < 0)
make_bad_inode(inode);
- if (args && bh)
- brelse(bh);
+ brelse(bh);
return status;
}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index a9b76de46047..ca3431ee7f24 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -80,6 +80,8 @@ struct ocfs2_inode_info
*/
tid_t i_sync_tid;
tid_t i_datasync_tid;
+
+ struct dquot *i_dquot[MAXQUOTAS];
};
/*
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 4b0c68849b36..4f502382180f 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1982,10 +1982,12 @@ struct ocfs2_orphan_filldir_priv {
struct ocfs2_super *osb;
};
-static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
- loff_t pos, u64 ino, unsigned type)
+static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
+ int name_len, loff_t pos, u64 ino,
+ unsigned type)
{
- struct ocfs2_orphan_filldir_priv *p = priv;
+ struct ocfs2_orphan_filldir_priv *p =
+ container_of(ctx, struct ocfs2_orphan_filldir_priv, ctx);
struct inode *iter;
if (name_len == 1 && !strncmp(".", name, 1))
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 74caffeeee1d..56a768d06aa6 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -904,9 +904,6 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
struct buffer_head *di_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if (!inode)
- return -ENOENT;
-
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8add6f1030d7..b931e04e3388 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -158,7 +158,7 @@ bail_add:
* NOTE: This dentry already has ->d_op set from
* ocfs2_get_parent() and ocfs2_get_dentry()
*/
- if (ret)
+ if (!IS_ERR_OR_NULL(ret))
dentry = ret;
status = ocfs2_dentry_attach_lock(dentry, inode,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index bbec539230fd..7d6b7d090452 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -144,6 +144,12 @@ enum ocfs2_unlock_action {
* before the upconvert
* has completed */
+#define OCFS2_LOCK_NONBLOCK_FINISHED (0x00001000) /* NONBLOCK cluster
+ * lock has already
+ * returned, do not block
+ * dc thread from
+ * downconverting */
+
struct ocfs2_lock_res_ops;
typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index a88b2a4fcc85..d5493e361a38 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -306,7 +306,7 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
assert_spin_locked(&osb->osb_lock);
BUG_ON(slot_num < 0);
- BUG_ON(slot_num > osb->max_slots);
+ BUG_ON(slot_num >= osb->max_slots);
if (!si->si_slots[slot_num].sl_valid)
return -ENOENT;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 93c85bc745e1..83723179e1ec 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -143,6 +143,11 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
static int ocfs2_enable_quotas(struct ocfs2_super *osb);
static void ocfs2_disable_quotas(struct ocfs2_super *osb);
+static struct dquot **ocfs2_get_dquots(struct inode *inode)
+{
+ return OCFS2_I(inode)->i_dquot;
+}
+
static const struct super_operations ocfs2_sops = {
.statfs = ocfs2_statfs,
.alloc_inode = ocfs2_alloc_inode,
@@ -155,6 +160,7 @@ static const struct super_operations ocfs2_sops = {
.show_options = ocfs2_show_options,
.quota_read = ocfs2_quota_read,
.quota_write = ocfs2_quota_write,
+ .get_dquots = ocfs2_get_dquots,
};
enum {
@@ -563,6 +569,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
oi->i_sync_tid = 0;
oi->i_datasync_tid = 0;
+ memset(&oi->i_dquot, 0, sizeof(oi->i_dquot));
jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
return &oi->vfs_inode;
@@ -1622,8 +1629,9 @@ static int __init ocfs2_init(void)
ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
if (!ocfs2_debugfs_root) {
- status = -EFAULT;
+ status = -ENOMEM;
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
+ goto out4;
}
ocfs2_set_locking_protocol();
@@ -2073,6 +2081,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_export_op = &ocfs2_export_ops;
sb->s_qcop = &ocfs2_quotactl_ops;
sb->dq_op = &ocfs2_quota_operations;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
sb->s_xattr = ocfs2_xattr_handlers;
sb->s_time_gran = 1;
sb->s_flags |= MS_NOATIME;
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 016f01df3825..662f8dee149f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1284,7 +1284,7 @@ int ocfs2_xattr_get_nolock(struct inode *inode,
return -EOPNOTSUPP;
if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
- ret = -ENODATA;
+ return -ENODATA;
xis.inode_bh = xbs.inode_bh = di_bh;
di = (struct ocfs2_dinode *)di_bh->b_data;
diff --git a/fs/open.c b/fs/open.c
index d6fd3acde134..813be037b412 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -222,7 +222,7 @@ SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
#endif /* BITS_PER_LONG == 32 */
-int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
+int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
long ret;
@@ -295,9 +295,21 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
sb_start_write(inode->i_sb);
ret = file->f_op->fallocate(file, mode, offset, len);
+
+ /*
+ * Create inotify and fanotify events.
+ *
+ * To keep the logic simple always create events if fallocate succeeds.
+ * This implies that events are even created if the file size remains
+ * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
+ */
+ if (ret == 0)
+ fsnotify_modify(file);
+
sb_end_write(inode->i_sb);
return ret;
}
+EXPORT_SYMBOL_GPL(vfs_fallocate);
SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
{
@@ -305,7 +317,7 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
int error = -EBADF;
if (f.file) {
- error = do_fallocate(f.file, mode, offset, len);
+ error = vfs_fallocate(f.file, mode, offset, len);
fdput(f);
}
return error;
@@ -516,7 +528,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
int err = -EBADF;
if (f.file) {
- audit_inode(NULL, f.file->f_path.dentry, 0);
+ audit_file(f.file);
err = chmod_common(&f.file->f_path, mode);
fdput(f);
}
@@ -642,7 +654,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
error = mnt_want_write_file(f.file);
if (error)
goto out_fput;
- audit_inode(NULL, f.file->f_path.dentry, 0);
+ audit_file(f.file);
error = chown_common(&f.file->f_path, user, group);
mnt_drop_write_file(f.file);
out_fput:
@@ -823,8 +835,7 @@ struct file *dentry_open(const struct path *path, int flags,
f = get_empty_filp();
if (!IS_ERR(f)) {
f->f_flags = flags;
- f->f_path = *path;
- error = do_dentry_open(f, NULL, cred);
+ error = vfs_open(path, f, cred);
if (!error) {
/* from now on we need fput() to dispose of f */
error = open_check_o_direct(f);
@@ -841,6 +852,26 @@ struct file *dentry_open(const struct path *path, int flags,
}
EXPORT_SYMBOL(dentry_open);
+/**
+ * vfs_open - open the file at the given path
+ * @path: path to open
+ * @filp: newly allocated file with f_flag initialized
+ * @cred: credentials to use
+ */
+int vfs_open(const struct path *path, struct file *filp,
+ const struct cred *cred)
+{
+ struct inode *inode = path->dentry->d_inode;
+
+ if (inode->i_op->dentry_open)
+ return inode->i_op->dentry_open(path->dentry, filp, cred);
+ else {
+ filp->f_path = *path;
+ return do_dentry_open(filp, NULL, cred);
+ }
+}
+EXPORT_SYMBOL(vfs_open);
+
static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
{
int lookup_flags = 0;
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
new file mode 100644
index 000000000000..34355818a2e0
--- /dev/null
+++ b/fs/overlayfs/Kconfig
@@ -0,0 +1,10 @@
+config OVERLAY_FS
+ tristate "Overlay filesystem support"
+ help
+ An overlay filesystem combines two filesystems - an 'upper' filesystem
+ and a 'lower' filesystem. When a name exists in both filesystems, the
+ object in the 'upper' filesystem is visible while the object in the
+ 'lower' filesystem is either hidden or, in the case of directories,
+ merged with the 'upper' object.
+
+ For more information see Documentation/filesystems/overlayfs.txt
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile
new file mode 100644
index 000000000000..900daed3e91d
--- /dev/null
+++ b/fs/overlayfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the overlay filesystem.
+#
+
+obj-$(CONFIG_OVERLAY_FS) += overlay.o
+
+overlay-objs := super.o inode.o dir.o readdir.o copy_up.o
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
new file mode 100644
index 000000000000..ea10a8719107
--- /dev/null
+++ b/fs/overlayfs/copy_up.c
@@ -0,0 +1,414 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/splice.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include "overlayfs.h"
+
+#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
+
+int ovl_copy_xattr(struct dentry *old, struct dentry *new)
+{
+ ssize_t list_size, size;
+ char *buf, *name, *value;
+ int error;
+
+ if (!old->d_inode->i_op->getxattr ||
+ !new->d_inode->i_op->getxattr)
+ return 0;
+
+ list_size = vfs_listxattr(old, NULL, 0);
+ if (list_size <= 0) {
+ if (list_size == -EOPNOTSUPP)
+ return 0;
+ return list_size;
+ }
+
+ buf = kzalloc(list_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ error = -ENOMEM;
+ value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
+ if (!value)
+ goto out;
+
+ list_size = vfs_listxattr(old, buf, list_size);
+ if (list_size <= 0) {
+ error = list_size;
+ goto out_free_value;
+ }
+
+ for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
+ size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
+ if (size <= 0) {
+ error = size;
+ goto out_free_value;
+ }
+ error = vfs_setxattr(new, name, value, size, 0);
+ if (error)
+ goto out_free_value;
+ }
+
+out_free_value:
+ kfree(value);
+out:
+ kfree(buf);
+ return error;
+}
+
+static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
+{
+ struct file *old_file;
+ struct file *new_file;
+ loff_t old_pos = 0;
+ loff_t new_pos = 0;
+ int error = 0;
+
+ if (len == 0)
+ return 0;
+
+ old_file = ovl_path_open(old, O_RDONLY);
+ if (IS_ERR(old_file))
+ return PTR_ERR(old_file);
+
+ new_file = ovl_path_open(new, O_WRONLY);
+ if (IS_ERR(new_file)) {
+ error = PTR_ERR(new_file);
+ goto out_fput;
+ }
+
+ /* FIXME: copy up sparse files efficiently */
+ while (len) {
+ size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
+ long bytes;
+
+ if (len < this_len)
+ this_len = len;
+
+ if (signal_pending_state(TASK_KILLABLE, current)) {
+ error = -EINTR;
+ break;
+ }
+
+ bytes = do_splice_direct(old_file, &old_pos,
+ new_file, &new_pos,
+ this_len, SPLICE_F_MOVE);
+ if (bytes <= 0) {
+ error = bytes;
+ break;
+ }
+ WARN_ON(old_pos != new_pos);
+
+ len -= bytes;
+ }
+
+ fput(new_file);
+out_fput:
+ fput(old_file);
+ return error;
+}
+
+static char *ovl_read_symlink(struct dentry *realdentry)
+{
+ int res;
+ char *buf;
+ struct inode *inode = realdentry->d_inode;
+ mm_segment_t old_fs;
+
+ res = -EINVAL;
+ if (!inode->i_op->readlink)
+ goto err;
+
+ res = -ENOMEM;
+ buf = (char *) __get_free_page(GFP_KERNEL);
+ if (!buf)
+ goto err;
+
+ old_fs = get_fs();
+ set_fs(get_ds());
+ /* The cast to a user pointer is valid due to the set_fs() */
+ res = inode->i_op->readlink(realdentry,
+ (char __user *)buf, PAGE_SIZE - 1);
+ set_fs(old_fs);
+ if (res < 0) {
+ free_page((unsigned long) buf);
+ goto err;
+ }
+ buf[res] = '\0';
+
+ return buf;
+
+err:
+ return ERR_PTR(res);
+}
+
+static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
+{
+ struct iattr attr = {
+ .ia_valid =
+ ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
+ .ia_atime = stat->atime,
+ .ia_mtime = stat->mtime,
+ };
+
+ return notify_change(upperdentry, &attr, NULL);
+}
+
+int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
+{
+ int err = 0;
+
+ if (!S_ISLNK(stat->mode)) {
+ struct iattr attr = {
+ .ia_valid = ATTR_MODE,
+ .ia_mode = stat->mode,
+ };
+ err = notify_change(upperdentry, &attr, NULL);
+ }
+ if (!err) {
+ struct iattr attr = {
+ .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = stat->uid,
+ .ia_gid = stat->gid,
+ };
+ err = notify_change(upperdentry, &attr, NULL);
+ }
+ if (!err)
+ ovl_set_timestamps(upperdentry, stat);
+
+ return err;
+
+}
+
+static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
+ struct dentry *dentry, struct path *lowerpath,
+ struct kstat *stat, struct iattr *attr,
+ const char *link)
+{
+ struct inode *wdir = workdir->d_inode;
+ struct inode *udir = upperdir->d_inode;
+ struct dentry *newdentry = NULL;
+ struct dentry *upper = NULL;
+ umode_t mode = stat->mode;
+ int err;
+
+ newdentry = ovl_lookup_temp(workdir, dentry);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out;
+
+ upper = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto out1;
+
+ /* Can't properly set mode on creation because of the umask */
+ stat->mode &= S_IFMT;
+ err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
+ stat->mode = mode;
+ if (err)
+ goto out2;
+
+ if (S_ISREG(stat->mode)) {
+ struct path upperpath;
+ ovl_path_upper(dentry, &upperpath);
+ BUG_ON(upperpath.dentry != NULL);
+ upperpath.dentry = newdentry;
+
+ err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
+ if (err)
+ goto out_cleanup;
+ }
+
+ err = ovl_copy_xattr(lowerpath->dentry, newdentry);
+ if (err)
+ goto out_cleanup;
+
+ mutex_lock(&newdentry->d_inode->i_mutex);
+ err = ovl_set_attr(newdentry, stat);
+ if (!err && attr)
+ err = notify_change(newdentry, attr, NULL);
+ mutex_unlock(&newdentry->d_inode->i_mutex);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
+ if (err)
+ goto out_cleanup;
+
+ ovl_dentry_update(dentry, newdentry);
+ newdentry = NULL;
+
+ /*
+ * Non-directores become opaque when copied up.
+ */
+ if (!S_ISDIR(stat->mode))
+ ovl_dentry_set_opaque(dentry, true);
+out2:
+ dput(upper);
+out1:
+ dput(newdentry);
+out:
+ return err;
+
+out_cleanup:
+ ovl_cleanup(wdir, newdentry);
+ goto out;
+}
+
+/*
+ * Copy up a single dentry
+ *
+ * Directory renames only allowed on "pure upper" (already created on
+ * upper filesystem, never copied up). Directories which are on lower or
+ * are merged may not be renamed. For these -EXDEV is returned and
+ * userspace has to deal with it. This means, when copying up a
+ * directory we can rely on it and ancestors being stable.
+ *
+ * Non-directory renames start with copy up of source if necessary. The
+ * actual rename will only proceed once the copy up was successful. Copy
+ * up uses upper parent i_mutex for exclusion. Since rename can change
+ * d_parent it is possible that the copy up will lock the old parent. At
+ * that point the file will have already been copied up anyway.
+ */
+int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
+ struct path *lowerpath, struct kstat *stat,
+ struct iattr *attr)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ int err;
+ struct kstat pstat;
+ struct path parentpath;
+ struct dentry *upperdir;
+ struct dentry *upperdentry;
+ const struct cred *old_cred;
+ struct cred *override_cred;
+ char *link = NULL;
+
+ ovl_path_upper(parent, &parentpath);
+ upperdir = parentpath.dentry;
+
+ err = vfs_getattr(&parentpath, &pstat);
+ if (err)
+ return err;
+
+ if (S_ISLNK(stat->mode)) {
+ link = ovl_read_symlink(lowerpath->dentry);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+ }
+
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_free_link;
+
+ override_cred->fsuid = stat->uid;
+ override_cred->fsgid = stat->gid;
+ /*
+ * CAP_SYS_ADMIN for copying up extended attributes
+ * CAP_DAC_OVERRIDE for create
+ * CAP_FOWNER for chmod, timestamp update
+ * CAP_FSETID for chmod
+ * CAP_CHOWN for chown
+ * CAP_MKNOD for mknod
+ */
+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
+ cap_raise(override_cred->cap_effective, CAP_FSETID);
+ cap_raise(override_cred->cap_effective, CAP_CHOWN);
+ cap_raise(override_cred->cap_effective, CAP_MKNOD);
+ old_cred = override_creds(override_cred);
+
+ err = -EIO;
+ if (lock_rename(workdir, upperdir) != NULL) {
+ pr_err("overlayfs: failed to lock workdir+upperdir\n");
+ goto out_unlock;
+ }
+ upperdentry = ovl_dentry_upper(dentry);
+ if (upperdentry) {
+ unlock_rename(workdir, upperdir);
+ err = 0;
+ /* Raced with another copy-up? Do the setattr here */
+ if (attr) {
+ mutex_lock(&upperdentry->d_inode->i_mutex);
+ err = notify_change(upperdentry, attr, NULL);
+ mutex_unlock(&upperdentry->d_inode->i_mutex);
+ }
+ goto out_put_cred;
+ }
+
+ err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
+ stat, attr, link);
+ if (!err) {
+ /* Restore timestamps on parent (best effort) */
+ ovl_set_timestamps(upperdir, &pstat);
+ }
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out_put_cred:
+ revert_creds(old_cred);
+ put_cred(override_cred);
+
+out_free_link:
+ if (link)
+ free_page((unsigned long) link);
+
+ return err;
+}
+
+int ovl_copy_up(struct dentry *dentry)
+{
+ int err;
+
+ err = 0;
+ while (!err) {
+ struct dentry *next;
+ struct dentry *parent;
+ struct path lowerpath;
+ struct kstat stat;
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ if (type != OVL_PATH_LOWER)
+ break;
+
+ next = dget(dentry);
+ /* find the topmost dentry not yet copied up */
+ for (;;) {
+ parent = dget_parent(next);
+
+ type = ovl_path_type(parent);
+ if (type != OVL_PATH_LOWER)
+ break;
+
+ dput(next);
+ next = parent;
+ }
+
+ ovl_path_lower(next, &lowerpath);
+ err = vfs_getattr(&lowerpath, &stat);
+ if (!err)
+ err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
+
+ dput(parent);
+ dput(next);
+ }
+
+ return err;
+}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
new file mode 100644
index 000000000000..8ffc4b980f1b
--- /dev/null
+++ b/fs/overlayfs/dir.c
@@ -0,0 +1,928 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include "overlayfs.h"
+
+void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
+{
+ int err;
+
+ dget(wdentry);
+ if (S_ISDIR(wdentry->d_inode->i_mode))
+ err = ovl_do_rmdir(wdir, wdentry);
+ else
+ err = ovl_do_unlink(wdir, wdentry);
+ dput(wdentry);
+
+ if (err) {
+ pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
+ wdentry, err);
+ }
+}
+
+struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
+{
+ struct dentry *temp;
+ char name[20];
+
+ snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
+
+ temp = lookup_one_len(name, workdir, strlen(name));
+ if (!IS_ERR(temp) && temp->d_inode) {
+ pr_err("overlayfs: workdir/%s already exists\n", name);
+ dput(temp);
+ temp = ERR_PTR(-EIO);
+ }
+
+ return temp;
+}
+
+/* caller holds i_mutex on workdir */
+static struct dentry *ovl_whiteout(struct dentry *workdir,
+ struct dentry *dentry)
+{
+ int err;
+ struct dentry *whiteout;
+ struct inode *wdir = workdir->d_inode;
+
+ whiteout = ovl_lookup_temp(workdir, dentry);
+ if (IS_ERR(whiteout))
+ return whiteout;
+
+ err = ovl_do_whiteout(wdir, whiteout);
+ if (err) {
+ dput(whiteout);
+ whiteout = ERR_PTR(err);
+ }
+
+ return whiteout;
+}
+
+int ovl_create_real(struct inode *dir, struct dentry *newdentry,
+ struct kstat *stat, const char *link,
+ struct dentry *hardlink, bool debug)
+{
+ int err;
+
+ if (newdentry->d_inode)
+ return -ESTALE;
+
+ if (hardlink) {
+ err = ovl_do_link(hardlink, dir, newdentry, debug);
+ } else {
+ switch (stat->mode & S_IFMT) {
+ case S_IFREG:
+ err = ovl_do_create(dir, newdentry, stat->mode, debug);
+ break;
+
+ case S_IFDIR:
+ err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
+ break;
+
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ err = ovl_do_mknod(dir, newdentry,
+ stat->mode, stat->rdev, debug);
+ break;
+
+ case S_IFLNK:
+ err = ovl_do_symlink(dir, newdentry, link, debug);
+ break;
+
+ default:
+ err = -EPERM;
+ }
+ }
+ if (!err && WARN_ON(!newdentry->d_inode)) {
+ /*
+ * Not quite sure if non-instantiated dentry is legal or not.
+ * VFS doesn't seem to care so check and warn here.
+ */
+ err = -ENOENT;
+ }
+ return err;
+}
+
+static int ovl_set_opaque(struct dentry *upperdentry)
+{
+ return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0);
+}
+
+static void ovl_remove_opaque(struct dentry *upperdentry)
+{
+ int err;
+
+ err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr);
+ if (err) {
+ pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
+ upperdentry->d_name.name, err);
+ }
+}
+
+static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ int err;
+ enum ovl_path_type type;
+ struct path realpath;
+
+ type = ovl_path_real(dentry, &realpath);
+ err = vfs_getattr(&realpath, stat);
+ if (err)
+ return err;
+
+ stat->dev = dentry->d_sb->s_dev;
+ stat->ino = dentry->d_inode->i_ino;
+
+ /*
+ * It's probably not worth it to count subdirs to get the
+ * correct link count. nlink=1 seems to pacify 'find' and
+ * other utilities.
+ */
+ if (type == OVL_PATH_MERGE)
+ stat->nlink = 1;
+
+ return 0;
+}
+
+static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
+ struct kstat *stat, const char *link,
+ struct dentry *hardlink)
+{
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct dentry *newdentry;
+ int err;
+
+ mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
+ newdentry = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_unlock;
+ err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
+ if (err)
+ goto out_dput;
+
+ ovl_dentry_version_inc(dentry->d_parent);
+ ovl_dentry_update(dentry, newdentry);
+ ovl_copyattr(newdentry->d_inode, inode);
+ d_instantiate(dentry, inode);
+ newdentry = NULL;
+out_dput:
+ dput(newdentry);
+out_unlock:
+ mutex_unlock(&udir->i_mutex);
+ return err;
+}
+
+static int ovl_lock_rename_workdir(struct dentry *workdir,
+ struct dentry *upperdir)
+{
+ /* Workdir should not be the same as upperdir */
+ if (workdir == upperdir)
+ goto err;
+
+ /* Workdir should not be subdir of upperdir and vice versa */
+ if (lock_rename(workdir, upperdir) != NULL)
+ goto err_unlock;
+
+ return 0;
+
+err_unlock:
+ unlock_rename(workdir, upperdir);
+err:
+ pr_err("overlayfs: failed to lock workdir+upperdir\n");
+ return -EIO;
+}
+
+static struct dentry *ovl_clear_empty(struct dentry *dentry,
+ struct list_head *list)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *wdir = workdir->d_inode;
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct path upperpath;
+ struct dentry *upper;
+ struct dentry *opaquedir;
+ struct kstat stat;
+ int err;
+
+ err = ovl_lock_rename_workdir(workdir, upperdir);
+ if (err)
+ goto out;
+
+ ovl_path_upper(dentry, &upperpath);
+ err = vfs_getattr(&upperpath, &stat);
+ if (err)
+ goto out_unlock;
+
+ err = -ESTALE;
+ if (!S_ISDIR(stat.mode))
+ goto out_unlock;
+ upper = upperpath.dentry;
+ if (upper->d_parent->d_inode != udir)
+ goto out_unlock;
+
+ opaquedir = ovl_lookup_temp(workdir, dentry);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir))
+ goto out_unlock;
+
+ err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
+ if (err)
+ goto out_dput;
+
+ err = ovl_copy_xattr(upper, opaquedir);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_set_opaque(opaquedir);
+ if (err)
+ goto out_cleanup;
+
+ mutex_lock(&opaquedir->d_inode->i_mutex);
+ err = ovl_set_attr(opaquedir, &stat);
+ mutex_unlock(&opaquedir->d_inode->i_mutex);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
+ if (err)
+ goto out_cleanup;
+
+ ovl_cleanup_whiteouts(upper, list);
+ ovl_cleanup(wdir, upper);
+ unlock_rename(workdir, upperdir);
+
+ /* dentry's upper doesn't match now, get rid of it */
+ d_drop(dentry);
+
+ return opaquedir;
+
+out_cleanup:
+ ovl_cleanup(wdir, opaquedir);
+out_dput:
+ dput(opaquedir);
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out:
+ return ERR_PTR(err);
+}
+
+static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
+{
+ int err;
+ struct dentry *ret = NULL;
+ LIST_HEAD(list);
+
+ err = ovl_check_empty_dir(dentry, &list);
+ if (err)
+ ret = ERR_PTR(err);
+ else {
+ /*
+ * If no upperdentry then skip clearing whiteouts.
+ *
+ * Can race with copy-up, since we don't hold the upperdir
+ * mutex. Doesn't matter, since copy-up can't create a
+ * non-empty directory from an empty one.
+ */
+ if (ovl_dentry_upper(dentry))
+ ret = ovl_clear_empty(dentry, &list);
+ }
+
+ ovl_cache_free(&list);
+
+ return ret;
+}
+
+static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
+ struct kstat *stat, const char *link,
+ struct dentry *hardlink)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *wdir = workdir->d_inode;
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct dentry *upper;
+ struct dentry *newdentry;
+ int err;
+
+ err = ovl_lock_rename_workdir(workdir, upperdir);
+ if (err)
+ goto out;
+
+ newdentry = ovl_lookup_temp(workdir, dentry);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_unlock;
+
+ upper = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto out_dput;
+
+ err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
+ if (err)
+ goto out_dput2;
+
+ if (S_ISDIR(stat->mode)) {
+ err = ovl_set_opaque(newdentry);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_do_rename(wdir, newdentry, udir, upper,
+ RENAME_EXCHANGE);
+ if (err)
+ goto out_cleanup;
+
+ ovl_cleanup(wdir, upper);
+ } else {
+ err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
+ if (err)
+ goto out_cleanup;
+ }
+ ovl_dentry_version_inc(dentry->d_parent);
+ ovl_dentry_update(dentry, newdentry);
+ ovl_copyattr(newdentry->d_inode, inode);
+ d_instantiate(dentry, inode);
+ newdentry = NULL;
+out_dput2:
+ dput(upper);
+out_dput:
+ dput(newdentry);
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out:
+ return err;
+
+out_cleanup:
+ ovl_cleanup(wdir, newdentry);
+ goto out_dput2;
+}
+
+static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
+ const char *link, struct dentry *hardlink)
+{
+ int err;
+ struct inode *inode;
+ struct kstat stat = {
+ .mode = mode,
+ .rdev = rdev,
+ };
+
+ err = -ENOMEM;
+ inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
+ if (!inode)
+ goto out;
+
+ err = ovl_copy_up(dentry->d_parent);
+ if (err)
+ goto out_iput;
+
+ if (!ovl_dentry_is_opaque(dentry)) {
+ err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
+ } else {
+ const struct cred *old_cred;
+ struct cred *override_cred;
+
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_iput;
+
+ /*
+ * CAP_SYS_ADMIN for setting opaque xattr
+ * CAP_DAC_OVERRIDE for create in workdir, rename
+ * CAP_FOWNER for removing whiteout from sticky dir
+ */
+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
+ old_cred = override_creds(override_cred);
+
+ err = ovl_create_over_whiteout(dentry, inode, &stat, link,
+ hardlink);
+
+ revert_creds(old_cred);
+ put_cred(override_cred);
+ }
+
+ if (!err)
+ inode = NULL;
+out_iput:
+ iput(inode);
+out:
+ return err;
+}
+
+static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
+ const char *link)
+{
+ int err;
+
+ err = ovl_want_write(dentry);
+ if (!err) {
+ err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
+ ovl_drop_write(dentry);
+ }
+
+ return err;
+}
+
+static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
+{
+ return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
+}
+
+static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
+}
+
+static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+ dev_t rdev)
+{
+ /* Don't allow creation of "whiteout" on overlay */
+ if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
+ return -EPERM;
+
+ return ovl_create_object(dentry, mode, rdev, NULL);
+}
+
+static int ovl_symlink(struct inode *dir, struct dentry *dentry,
+ const char *link)
+{
+ return ovl_create_object(dentry, S_IFLNK, 0, link);
+}
+
+static int ovl_link(struct dentry *old, struct inode *newdir,
+ struct dentry *new)
+{
+ int err;
+ struct dentry *upper;
+
+ err = ovl_want_write(old);
+ if (err)
+ goto out;
+
+ err = ovl_copy_up(old);
+ if (err)
+ goto out_drop_write;
+
+ upper = ovl_dentry_upper(old);
+ err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
+
+out_drop_write:
+ ovl_drop_write(old);
+out:
+ return err;
+}
+
+static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *wdir = workdir->d_inode;
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct dentry *whiteout;
+ struct dentry *upper;
+ struct dentry *opaquedir = NULL;
+ int err;
+
+ if (is_dir) {
+ opaquedir = ovl_check_empty_and_clear(dentry);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir))
+ goto out;
+ }
+
+ err = ovl_lock_rename_workdir(workdir, upperdir);
+ if (err)
+ goto out_dput;
+
+ whiteout = ovl_whiteout(workdir, dentry);
+ err = PTR_ERR(whiteout);
+ if (IS_ERR(whiteout))
+ goto out_unlock;
+
+ upper = ovl_dentry_upper(dentry);
+ if (!upper) {
+ upper = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto kill_whiteout;
+
+ err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
+ dput(upper);
+ if (err)
+ goto kill_whiteout;
+ } else {
+ int flags = 0;
+
+ if (opaquedir)
+ upper = opaquedir;
+ err = -ESTALE;
+ if (upper->d_parent != upperdir)
+ goto kill_whiteout;
+
+ if (is_dir)
+ flags |= RENAME_EXCHANGE;
+
+ err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
+ if (err)
+ goto kill_whiteout;
+
+ if (is_dir)
+ ovl_cleanup(wdir, upper);
+ }
+ ovl_dentry_version_inc(dentry->d_parent);
+out_d_drop:
+ d_drop(dentry);
+ dput(whiteout);
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out_dput:
+ dput(opaquedir);
+out:
+ return err;
+
+kill_whiteout:
+ ovl_cleanup(wdir, whiteout);
+ goto out_d_drop;
+}
+
+static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
+{
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *dir = upperdir->d_inode;
+ struct dentry *upper = ovl_dentry_upper(dentry);
+ int err;
+
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ err = -ESTALE;
+ if (upper->d_parent == upperdir) {
+ /* Don't let d_delete() think it can reset d_inode */
+ dget(upper);
+ if (is_dir)
+ err = vfs_rmdir(dir, upper);
+ else
+ err = vfs_unlink(dir, upper, NULL);
+ dput(upper);
+ ovl_dentry_version_inc(dentry->d_parent);
+ }
+
+ /*
+ * Keeping this dentry hashed would mean having to release
+ * upperpath/lowerpath, which could only be done if we are the
+ * sole user of this dentry. Too tricky... Just unhash for
+ * now.
+ */
+ d_drop(dentry);
+ mutex_unlock(&dir->i_mutex);
+
+ return err;
+}
+
+static inline int ovl_check_sticky(struct dentry *dentry)
+{
+ struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
+ struct inode *inode = ovl_dentry_real(dentry)->d_inode;
+
+ if (check_sticky(dir, inode))
+ return -EPERM;
+
+ return 0;
+}
+
+static int ovl_do_remove(struct dentry *dentry, bool is_dir)
+{
+ enum ovl_path_type type;
+ int err;
+
+ err = ovl_check_sticky(dentry);
+ if (err)
+ goto out;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ err = ovl_copy_up(dentry->d_parent);
+ if (err)
+ goto out_drop_write;
+
+ type = ovl_path_type(dentry);
+ if (type == OVL_PATH_PURE_UPPER) {
+ err = ovl_remove_upper(dentry, is_dir);
+ } else {
+ const struct cred *old_cred;
+ struct cred *override_cred;
+
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_drop_write;
+
+ /*
+ * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
+ * CAP_DAC_OVERRIDE for create in workdir, rename
+ * CAP_FOWNER for removing whiteout from sticky dir
+ * CAP_FSETID for chmod of opaque dir
+ * CAP_CHOWN for chown of opaque dir
+ */
+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
+ cap_raise(override_cred->cap_effective, CAP_FSETID);
+ cap_raise(override_cred->cap_effective, CAP_CHOWN);
+ old_cred = override_creds(override_cred);
+
+ err = ovl_remove_and_whiteout(dentry, is_dir);
+
+ revert_creds(old_cred);
+ put_cred(override_cred);
+ }
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static int ovl_unlink(struct inode *dir, struct dentry *dentry)
+{
+ return ovl_do_remove(dentry, false);
+}
+
+static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ return ovl_do_remove(dentry, true);
+}
+
+static int ovl_rename2(struct inode *olddir, struct dentry *old,
+ struct inode *newdir, struct dentry *new,
+ unsigned int flags)
+{
+ int err;
+ enum ovl_path_type old_type;
+ enum ovl_path_type new_type;
+ struct dentry *old_upperdir;
+ struct dentry *new_upperdir;
+ struct dentry *olddentry;
+ struct dentry *newdentry;
+ struct dentry *trap;
+ bool old_opaque;
+ bool new_opaque;
+ bool new_create = false;
+ bool cleanup_whiteout = false;
+ bool overwrite = !(flags & RENAME_EXCHANGE);
+ bool is_dir = S_ISDIR(old->d_inode->i_mode);
+ bool new_is_dir = false;
+ struct dentry *opaquedir = NULL;
+ const struct cred *old_cred = NULL;
+ struct cred *override_cred = NULL;
+
+ err = -EINVAL;
+ if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
+ goto out;
+
+ flags &= ~RENAME_NOREPLACE;
+
+ err = ovl_check_sticky(old);
+ if (err)
+ goto out;
+
+ /* Don't copy up directory trees */
+ old_type = ovl_path_type(old);
+ err = -EXDEV;
+ if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir)
+ goto out;
+
+ if (new->d_inode) {
+ err = ovl_check_sticky(new);
+ if (err)
+ goto out;
+
+ if (S_ISDIR(new->d_inode->i_mode))
+ new_is_dir = true;
+
+ new_type = ovl_path_type(new);
+ err = -EXDEV;
+ if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir)
+ goto out;
+
+ err = 0;
+ if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) {
+ if (ovl_dentry_lower(old)->d_inode ==
+ ovl_dentry_lower(new)->d_inode)
+ goto out;
+ }
+ if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) {
+ if (ovl_dentry_upper(old)->d_inode ==
+ ovl_dentry_upper(new)->d_inode)
+ goto out;
+ }
+ } else {
+ if (ovl_dentry_is_opaque(new))
+ new_type = OVL_PATH_UPPER;
+ else
+ new_type = OVL_PATH_PURE_UPPER;
+ }
+
+ err = ovl_want_write(old);
+ if (err)
+ goto out;
+
+ err = ovl_copy_up(old);
+ if (err)
+ goto out_drop_write;
+
+ err = ovl_copy_up(new->d_parent);
+ if (err)
+ goto out_drop_write;
+ if (!overwrite) {
+ err = ovl_copy_up(new);
+ if (err)
+ goto out_drop_write;
+ }
+
+ old_opaque = old_type != OVL_PATH_PURE_UPPER;
+ new_opaque = new_type != OVL_PATH_PURE_UPPER;
+
+ if (old_opaque || new_opaque) {
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_drop_write;
+
+ /*
+ * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
+ * CAP_DAC_OVERRIDE for create in workdir
+ * CAP_FOWNER for removing whiteout from sticky dir
+ * CAP_FSETID for chmod of opaque dir
+ * CAP_CHOWN for chown of opaque dir
+ */
+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
+ cap_raise(override_cred->cap_effective, CAP_FSETID);
+ cap_raise(override_cred->cap_effective, CAP_CHOWN);
+ old_cred = override_creds(override_cred);
+ }
+
+ if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) {
+ opaquedir = ovl_check_empty_and_clear(new);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir)) {
+ opaquedir = NULL;
+ goto out_revert_creds;
+ }
+ }
+
+ if (overwrite) {
+ if (old_opaque) {
+ if (new->d_inode || !new_opaque) {
+ /* Whiteout source */
+ flags |= RENAME_WHITEOUT;
+ } else {
+ /* Switch whiteouts */
+ flags |= RENAME_EXCHANGE;
+ }
+ } else if (is_dir && !new->d_inode && new_opaque) {
+ flags |= RENAME_EXCHANGE;
+ cleanup_whiteout = true;
+ }
+ }
+
+ old_upperdir = ovl_dentry_upper(old->d_parent);
+ new_upperdir = ovl_dentry_upper(new->d_parent);
+
+ trap = lock_rename(new_upperdir, old_upperdir);
+
+ olddentry = ovl_dentry_upper(old);
+ newdentry = ovl_dentry_upper(new);
+ if (newdentry) {
+ if (opaquedir) {
+ newdentry = opaquedir;
+ opaquedir = NULL;
+ } else {
+ dget(newdentry);
+ }
+ } else {
+ new_create = true;
+ newdentry = lookup_one_len(new->d_name.name, new_upperdir,
+ new->d_name.len);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_unlock;
+ }
+
+ err = -ESTALE;
+ if (olddentry->d_parent != old_upperdir)
+ goto out_dput;
+ if (newdentry->d_parent != new_upperdir)
+ goto out_dput;
+ if (olddentry == trap)
+ goto out_dput;
+ if (newdentry == trap)
+ goto out_dput;
+
+ if (is_dir && !old_opaque && new_opaque) {
+ err = ovl_set_opaque(olddentry);
+ if (err)
+ goto out_dput;
+ }
+ if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
+ err = ovl_set_opaque(newdentry);
+ if (err)
+ goto out_dput;
+ }
+
+ if (old_opaque || new_opaque) {
+ err = ovl_do_rename(old_upperdir->d_inode, olddentry,
+ new_upperdir->d_inode, newdentry,
+ flags);
+ } else {
+ /* No debug for the plain case */
+ BUG_ON(flags & ~RENAME_EXCHANGE);
+ err = vfs_rename(old_upperdir->d_inode, olddentry,
+ new_upperdir->d_inode, newdentry,
+ NULL, flags);
+ }
+
+ if (err) {
+ if (is_dir && !old_opaque && new_opaque)
+ ovl_remove_opaque(olddentry);
+ if (!overwrite && new_is_dir && old_opaque && !new_opaque)
+ ovl_remove_opaque(newdentry);
+ goto out_dput;
+ }
+
+ if (is_dir && old_opaque && !new_opaque)
+ ovl_remove_opaque(olddentry);
+ if (!overwrite && new_is_dir && !old_opaque && new_opaque)
+ ovl_remove_opaque(newdentry);
+
+ if (old_opaque != new_opaque) {
+ ovl_dentry_set_opaque(old, new_opaque);
+ if (!overwrite)
+ ovl_dentry_set_opaque(new, old_opaque);
+ }
+
+ if (cleanup_whiteout)
+ ovl_cleanup(old_upperdir->d_inode, newdentry);
+
+ ovl_dentry_version_inc(old->d_parent);
+ ovl_dentry_version_inc(new->d_parent);
+
+out_dput:
+ dput(newdentry);
+out_unlock:
+ unlock_rename(new_upperdir, old_upperdir);
+out_revert_creds:
+ if (old_opaque || new_opaque) {
+ revert_creds(old_cred);
+ put_cred(override_cred);
+ }
+out_drop_write:
+ ovl_drop_write(old);
+out:
+ dput(opaquedir);
+ return err;
+}
+
+const struct inode_operations ovl_dir_inode_operations = {
+ .lookup = ovl_lookup,
+ .mkdir = ovl_mkdir,
+ .symlink = ovl_symlink,
+ .unlink = ovl_unlink,
+ .rmdir = ovl_rmdir,
+ .rename2 = ovl_rename2,
+ .link = ovl_link,
+ .setattr = ovl_setattr,
+ .create = ovl_create,
+ .mknod = ovl_mknod,
+ .permission = ovl_permission,
+ .getattr = ovl_dir_getattr,
+ .setxattr = ovl_setxattr,
+ .getxattr = ovl_getxattr,
+ .listxattr = ovl_listxattr,
+ .removexattr = ovl_removexattr,
+};
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
new file mode 100644
index 000000000000..07d74b24913b
--- /dev/null
+++ b/fs/overlayfs/inode.c
@@ -0,0 +1,434 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include "overlayfs.h"
+
+static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
+ bool no_data)
+{
+ int err;
+ struct dentry *parent;
+ struct kstat stat;
+ struct path lowerpath;
+
+ parent = dget_parent(dentry);
+ err = ovl_copy_up(parent);
+ if (err)
+ goto out_dput_parent;
+
+ ovl_path_lower(dentry, &lowerpath);
+ err = vfs_getattr(&lowerpath, &stat);
+ if (err)
+ goto out_dput_parent;
+
+ if (no_data)
+ stat.size = 0;
+
+ err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
+
+out_dput_parent:
+ dput(parent);
+ return err;
+}
+
+int ovl_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ int err;
+ struct dentry *upperdentry;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ upperdentry = ovl_dentry_upper(dentry);
+ if (upperdentry) {
+ mutex_lock(&upperdentry->d_inode->i_mutex);
+ err = notify_change(upperdentry, attr, NULL);
+ mutex_unlock(&upperdentry->d_inode->i_mutex);
+ } else {
+ err = ovl_copy_up_last(dentry, attr, false);
+ }
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct path realpath;
+
+ ovl_path_real(dentry, &realpath);
+ return vfs_getattr(&realpath, stat);
+}
+
+int ovl_permission(struct inode *inode, int mask)
+{
+ struct ovl_entry *oe;
+ struct dentry *alias = NULL;
+ struct inode *realinode;
+ struct dentry *realdentry;
+ bool is_upper;
+ int err;
+
+ if (S_ISDIR(inode->i_mode)) {
+ oe = inode->i_private;
+ } else if (mask & MAY_NOT_BLOCK) {
+ return -ECHILD;
+ } else {
+ /*
+ * For non-directories find an alias and get the info
+ * from there.
+ */
+ alias = d_find_any_alias(inode);
+ if (WARN_ON(!alias))
+ return -ENOENT;
+
+ oe = alias->d_fsdata;
+ }
+
+ realdentry = ovl_entry_real(oe, &is_upper);
+
+ /* Careful in RCU walk mode */
+ realinode = ACCESS_ONCE(realdentry->d_inode);
+ if (!realinode) {
+ WARN_ON(!(mask & MAY_NOT_BLOCK));
+ err = -ENOENT;
+ goto out_dput;
+ }
+
+ if (mask & MAY_WRITE) {
+ umode_t mode = realinode->i_mode;
+
+ /*
+ * Writes will always be redirected to upper layer, so
+ * ignore lower layer being read-only.
+ *
+ * If the overlay itself is read-only then proceed
+ * with the permission check, don't return EROFS.
+ * This will only happen if this is the lower layer of
+ * another overlayfs.
+ *
+ * If upper fs becomes read-only after the overlay was
+ * constructed return EROFS to prevent modification of
+ * upper layer.
+ */
+ err = -EROFS;
+ if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
+ goto out_dput;
+ }
+
+ err = __inode_permission(realinode, mask);
+out_dput:
+ dput(alias);
+ return err;
+}
+
+
+struct ovl_link_data {
+ struct dentry *realdentry;
+ void *cookie;
+};
+
+static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ void *ret;
+ struct dentry *realdentry;
+ struct inode *realinode;
+
+ realdentry = ovl_dentry_real(dentry);
+ realinode = realdentry->d_inode;
+
+ if (WARN_ON(!realinode->i_op->follow_link))
+ return ERR_PTR(-EPERM);
+
+ ret = realinode->i_op->follow_link(realdentry, nd);
+ if (IS_ERR(ret))
+ return ret;
+
+ if (realinode->i_op->put_link) {
+ struct ovl_link_data *data;
+
+ data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
+ if (!data) {
+ realinode->i_op->put_link(realdentry, nd, ret);
+ return ERR_PTR(-ENOMEM);
+ }
+ data->realdentry = realdentry;
+ data->cookie = ret;
+
+ return data;
+ } else {
+ return NULL;
+ }
+}
+
+static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
+{
+ struct inode *realinode;
+ struct ovl_link_data *data = c;
+
+ if (!data)
+ return;
+
+ realinode = data->realdentry->d_inode;
+ realinode->i_op->put_link(data->realdentry, nd, data->cookie);
+ kfree(data);
+}
+
+static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+{
+ struct path realpath;
+ struct inode *realinode;
+
+ ovl_path_real(dentry, &realpath);
+ realinode = realpath.dentry->d_inode;
+
+ if (!realinode->i_op->readlink)
+ return -EINVAL;
+
+ touch_atime(&realpath);
+
+ return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
+}
+
+
+static bool ovl_is_private_xattr(const char *name)
+{
+ return strncmp(name, "trusted.overlay.", 14) == 0;
+}
+
+int ovl_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ int err;
+ struct dentry *upperdentry;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ err = -EPERM;
+ if (ovl_is_private_xattr(name))
+ goto out_drop_write;
+
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out_drop_write;
+
+ upperdentry = ovl_dentry_upper(dentry);
+ err = vfs_setxattr(upperdentry, name, value, size, flags);
+
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static bool ovl_need_xattr_filter(struct dentry *dentry,
+ enum ovl_path_type type)
+{
+ return type == OVL_PATH_UPPER && S_ISDIR(dentry->d_inode->i_mode);
+}
+
+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
+ void *value, size_t size)
+{
+ struct path realpath;
+ enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+
+ if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
+ return -ENODATA;
+
+ return vfs_getxattr(realpath.dentry, name, value, size);
+}
+
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ struct path realpath;
+ enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+ ssize_t res;
+ int off;
+
+ res = vfs_listxattr(realpath.dentry, list, size);
+ if (res <= 0 || size == 0)
+ return res;
+
+ if (!ovl_need_xattr_filter(dentry, type))
+ return res;
+
+ /* filter out private xattrs */
+ for (off = 0; off < res;) {
+ char *s = list + off;
+ size_t slen = strlen(s) + 1;
+
+ BUG_ON(off + slen > res);
+
+ if (ovl_is_private_xattr(s)) {
+ res -= slen;
+ memmove(s, s + slen, res - off);
+ } else {
+ off += slen;
+ }
+ }
+
+ return res;
+}
+
+int ovl_removexattr(struct dentry *dentry, const char *name)
+{
+ int err;
+ struct path realpath;
+ enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ err = -ENODATA;
+ if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
+ goto out_drop_write;
+
+ if (type == OVL_PATH_LOWER) {
+ err = vfs_getxattr(realpath.dentry, name, NULL, 0);
+ if (err < 0)
+ goto out_drop_write;
+
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out_drop_write;
+
+ ovl_path_upper(dentry, &realpath);
+ }
+
+ err = vfs_removexattr(realpath.dentry, name);
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
+ struct dentry *realdentry)
+{
+ if (type != OVL_PATH_LOWER)
+ return false;
+
+ if (special_file(realdentry->d_inode->i_mode))
+ return false;
+
+ if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
+ return false;
+
+ return true;
+}
+
+static int ovl_dentry_open(struct dentry *dentry, struct file *file,
+ const struct cred *cred)
+{
+ int err;
+ struct path realpath;
+ enum ovl_path_type type;
+ bool want_write = false;
+
+ type = ovl_path_real(dentry, &realpath);
+ if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
+ want_write = true;
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ if (file->f_flags & O_TRUNC)
+ err = ovl_copy_up_last(dentry, NULL, true);
+ else
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out_drop_write;
+
+ ovl_path_upper(dentry, &realpath);
+ }
+
+ err = vfs_open(&realpath, file, cred);
+out_drop_write:
+ if (want_write)
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static const struct inode_operations ovl_file_inode_operations = {
+ .setattr = ovl_setattr,
+ .permission = ovl_permission,
+ .getattr = ovl_getattr,
+ .setxattr = ovl_setxattr,
+ .getxattr = ovl_getxattr,
+ .listxattr = ovl_listxattr,
+ .removexattr = ovl_removexattr,
+ .dentry_open = ovl_dentry_open,
+};
+
+static const struct inode_operations ovl_symlink_inode_operations = {
+ .setattr = ovl_setattr,
+ .follow_link = ovl_follow_link,
+ .put_link = ovl_put_link,
+ .readlink = ovl_readlink,
+ .getattr = ovl_getattr,
+ .setxattr = ovl_setxattr,
+ .getxattr = ovl_getxattr,
+ .listxattr = ovl_listxattr,
+ .removexattr = ovl_removexattr,
+};
+
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
+ struct ovl_entry *oe)
+{
+ struct inode *inode;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return NULL;
+
+ mode &= S_IFMT;
+
+ inode->i_ino = get_next_ino();
+ inode->i_mode = mode;
+ inode->i_flags |= S_NOATIME | S_NOCMTIME;
+
+ switch (mode) {
+ case S_IFDIR:
+ inode->i_private = oe;
+ inode->i_op = &ovl_dir_inode_operations;
+ inode->i_fop = &ovl_dir_operations;
+ break;
+
+ case S_IFLNK:
+ inode->i_op = &ovl_symlink_inode_operations;
+ break;
+
+ case S_IFREG:
+ case S_IFSOCK:
+ case S_IFBLK:
+ case S_IFCHR:
+ case S_IFIFO:
+ inode->i_op = &ovl_file_inode_operations;
+ break;
+
+ default:
+ WARN(1, "illegal file type: %i\n", mode);
+ iput(inode);
+ inode = NULL;
+ }
+
+ return inode;
+
+}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
new file mode 100644
index 000000000000..814bed33dd07
--- /dev/null
+++ b/fs/overlayfs/overlayfs.h
@@ -0,0 +1,191 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+
+struct ovl_entry;
+
+enum ovl_path_type {
+ OVL_PATH_PURE_UPPER,
+ OVL_PATH_UPPER,
+ OVL_PATH_MERGE,
+ OVL_PATH_LOWER,
+};
+
+extern const char *ovl_opaque_xattr;
+
+static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ int err = vfs_rmdir(dir, dentry);
+ pr_debug("rmdir(%pd2) = %i\n", dentry, err);
+ return err;
+}
+
+static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
+{
+ int err = vfs_unlink(dir, dentry, NULL);
+ pr_debug("unlink(%pd2) = %i\n", dentry, err);
+ return err;
+}
+
+static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry, bool debug)
+{
+ int err = vfs_link(old_dentry, dir, new_dentry, NULL);
+ if (debug) {
+ pr_debug("link(%pd2, %pd2) = %i\n",
+ old_dentry, new_dentry, err);
+ }
+ return err;
+}
+
+static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool debug)
+{
+ int err = vfs_create(dir, dentry, mode, true);
+ if (debug)
+ pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
+ return err;
+}
+
+static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool debug)
+{
+ int err = vfs_mkdir(dir, dentry, mode);
+ if (debug)
+ pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
+ return err;
+}
+
+static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t dev, bool debug)
+{
+ int err = vfs_mknod(dir, dentry, mode, dev);
+ if (debug) {
+ pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
+ dentry, mode, dev, err);
+ }
+ return err;
+}
+
+static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
+ const char *oldname, bool debug)
+{
+ int err = vfs_symlink(dir, dentry, oldname);
+ if (debug)
+ pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
+ return err;
+}
+
+static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ int err = vfs_setxattr(dentry, name, value, size, flags);
+ pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
+ dentry, name, (int) size, (char *) value, flags, err);
+ return err;
+}
+
+static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
+{
+ int err = vfs_removexattr(dentry, name);
+ pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
+ return err;
+}
+
+static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
+ struct inode *newdir, struct dentry *newdentry,
+ unsigned int flags)
+{
+ int err;
+
+ pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
+ olddentry, newdentry, flags);
+
+ err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
+
+ if (err) {
+ pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
+ olddentry, newdentry, err);
+ }
+ return err;
+}
+
+static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
+{
+ int err = vfs_whiteout(dir, dentry);
+ pr_debug("whiteout(%pd2) = %i\n", dentry, err);
+ return err;
+}
+
+enum ovl_path_type ovl_path_type(struct dentry *dentry);
+u64 ovl_dentry_version_get(struct dentry *dentry);
+void ovl_dentry_version_inc(struct dentry *dentry);
+void ovl_path_upper(struct dentry *dentry, struct path *path);
+void ovl_path_lower(struct dentry *dentry, struct path *path);
+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
+struct dentry *ovl_dentry_upper(struct dentry *dentry);
+struct dentry *ovl_dentry_lower(struct dentry *dentry);
+struct dentry *ovl_dentry_real(struct dentry *dentry);
+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
+struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
+void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
+struct dentry *ovl_workdir(struct dentry *dentry);
+int ovl_want_write(struct dentry *dentry);
+void ovl_drop_write(struct dentry *dentry);
+bool ovl_dentry_is_opaque(struct dentry *dentry);
+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
+bool ovl_is_whiteout(struct dentry *dentry);
+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags);
+struct file *ovl_path_open(struct path *path, int flags);
+
+struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
+ struct kstat *stat, const char *link);
+
+/* readdir.c */
+extern const struct file_operations ovl_dir_operations;
+int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
+void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
+void ovl_cache_free(struct list_head *list);
+
+/* inode.c */
+int ovl_setattr(struct dentry *dentry, struct iattr *attr);
+int ovl_permission(struct inode *inode, int mask);
+int ovl_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags);
+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
+ void *value, size_t size);
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
+int ovl_removexattr(struct dentry *dentry, const char *name);
+
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
+ struct ovl_entry *oe);
+static inline void ovl_copyattr(struct inode *from, struct inode *to)
+{
+ to->i_uid = from->i_uid;
+ to->i_gid = from->i_gid;
+}
+
+/* dir.c */
+extern const struct inode_operations ovl_dir_inode_operations;
+struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
+int ovl_create_real(struct inode *dir, struct dentry *newdentry,
+ struct kstat *stat, const char *link,
+ struct dentry *hardlink, bool debug);
+void ovl_cleanup(struct inode *dir, struct dentry *dentry);
+
+/* copy_up.c */
+int ovl_copy_up(struct dentry *dentry);
+int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
+ struct path *lowerpath, struct kstat *stat,
+ struct iattr *attr);
+int ovl_copy_xattr(struct dentry *old, struct dentry *new);
+int ovl_set_attr(struct dentry *upper, struct kstat *stat);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
new file mode 100644
index 000000000000..c0205990a9f5
--- /dev/null
+++ b/fs/overlayfs/readdir.c
@@ -0,0 +1,588 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/xattr.h>
+#include <linux/rbtree.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include "overlayfs.h"
+
+struct ovl_cache_entry {
+ unsigned int len;
+ unsigned int type;
+ u64 ino;
+ struct list_head l_node;
+ struct rb_node node;
+ bool is_whiteout;
+ bool is_cursor;
+ char name[];
+};
+
+struct ovl_dir_cache {
+ long refcount;
+ u64 version;
+ struct list_head entries;
+};
+
+struct ovl_readdir_data {
+ struct dir_context ctx;
+ bool is_merge;
+ struct rb_root root;
+ struct list_head *list;
+ struct list_head middle;
+ int count;
+ int err;
+};
+
+struct ovl_dir_file {
+ bool is_real;
+ bool is_upper;
+ struct ovl_dir_cache *cache;
+ struct ovl_cache_entry cursor;
+ struct file *realfile;
+ struct file *upperfile;
+};
+
+static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
+{
+ return container_of(n, struct ovl_cache_entry, node);
+}
+
+static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
+ const char *name, int len)
+{
+ struct rb_node *node = root->rb_node;
+ int cmp;
+
+ while (node) {
+ struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
+
+ cmp = strncmp(name, p->name, len);
+ if (cmp > 0)
+ node = p->node.rb_right;
+ else if (cmp < 0 || len < p->len)
+ node = p->node.rb_left;
+ else
+ return p;
+ }
+
+ return NULL;
+}
+
+static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len,
+ u64 ino, unsigned int d_type)
+{
+ struct ovl_cache_entry *p;
+ size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
+
+ p = kmalloc(size, GFP_KERNEL);
+ if (p) {
+ memcpy(p->name, name, len);
+ p->name[len] = '\0';
+ p->len = len;
+ p->type = d_type;
+ p->ino = ino;
+ p->is_whiteout = false;
+ p->is_cursor = false;
+ }
+
+ return p;
+}
+
+static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
+ const char *name, int len, u64 ino,
+ unsigned int d_type)
+{
+ struct rb_node **newp = &rdd->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct ovl_cache_entry *p;
+
+ while (*newp) {
+ int cmp;
+ struct ovl_cache_entry *tmp;
+
+ parent = *newp;
+ tmp = ovl_cache_entry_from_node(*newp);
+ cmp = strncmp(name, tmp->name, len);
+ if (cmp > 0)
+ newp = &tmp->node.rb_right;
+ else if (cmp < 0 || len < tmp->len)
+ newp = &tmp->node.rb_left;
+ else
+ return 0;
+ }
+
+ p = ovl_cache_entry_new(name, len, ino, d_type);
+ if (p == NULL)
+ return -ENOMEM;
+
+ list_add_tail(&p->l_node, rdd->list);
+ rb_link_node(&p->node, parent, newp);
+ rb_insert_color(&p->node, &rdd->root);
+
+ return 0;
+}
+
+static int ovl_fill_lower(struct ovl_readdir_data *rdd,
+ const char *name, int namelen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct ovl_cache_entry *p;
+
+ p = ovl_cache_entry_find(&rdd->root, name, namelen);
+ if (p) {
+ list_move_tail(&p->l_node, &rdd->middle);
+ } else {
+ p = ovl_cache_entry_new(name, namelen, ino, d_type);
+ if (p == NULL)
+ rdd->err = -ENOMEM;
+ else
+ list_add_tail(&p->l_node, &rdd->middle);
+ }
+
+ return rdd->err;
+}
+
+void ovl_cache_free(struct list_head *list)
+{
+ struct ovl_cache_entry *p;
+ struct ovl_cache_entry *n;
+
+ list_for_each_entry_safe(p, n, list, l_node)
+ kfree(p);
+
+ INIT_LIST_HEAD(list);
+}
+
+static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
+{
+ struct ovl_dir_cache *cache = od->cache;
+
+ list_del_init(&od->cursor.l_node);
+ WARN_ON(cache->refcount <= 0);
+ cache->refcount--;
+ if (!cache->refcount) {
+ if (ovl_dir_cache(dentry) == cache)
+ ovl_set_dir_cache(dentry, NULL);
+
+ ovl_cache_free(&cache->entries);
+ kfree(cache);
+ }
+}
+
+static int ovl_fill_merge(struct dir_context *ctx, const char *name,
+ int namelen, loff_t offset, u64 ino,
+ unsigned int d_type)
+{
+ struct ovl_readdir_data *rdd =
+ container_of(ctx, struct ovl_readdir_data, ctx);
+
+ rdd->count++;
+ if (!rdd->is_merge)
+ return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
+ else
+ return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
+}
+
+static inline int ovl_dir_read(struct path *realpath,
+ struct ovl_readdir_data *rdd)
+{
+ struct file *realfile;
+ int err;
+
+ realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
+ if (IS_ERR(realfile))
+ return PTR_ERR(realfile);
+
+ rdd->ctx.pos = 0;
+ do {
+ rdd->count = 0;
+ rdd->err = 0;
+ err = iterate_dir(realfile, &rdd->ctx);
+ if (err >= 0)
+ err = rdd->err;
+ } while (!err && rdd->count);
+ fput(realfile);
+
+ return err;
+}
+
+static void ovl_dir_reset(struct file *file)
+{
+ struct ovl_dir_file *od = file->private_data;
+ struct ovl_dir_cache *cache = od->cache;
+ struct dentry *dentry = file->f_path.dentry;
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ if (cache && ovl_dentry_version_get(dentry) != cache->version) {
+ ovl_cache_put(od, dentry);
+ od->cache = NULL;
+ }
+ WARN_ON(!od->is_real && type != OVL_PATH_MERGE);
+ if (od->is_real && type == OVL_PATH_MERGE)
+ od->is_real = false;
+}
+
+static int ovl_dir_mark_whiteouts(struct dentry *dir,
+ struct ovl_readdir_data *rdd)
+{
+ struct ovl_cache_entry *p;
+ struct dentry *dentry;
+ const struct cred *old_cred;
+ struct cred *override_cred;
+
+ override_cred = prepare_creds();
+ if (!override_cred) {
+ ovl_cache_free(rdd->list);
+ return -ENOMEM;
+ }
+
+ /*
+ * CAP_DAC_OVERRIDE for lookup
+ */
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ old_cred = override_creds(override_cred);
+
+ mutex_lock(&dir->d_inode->i_mutex);
+ list_for_each_entry(p, rdd->list, l_node) {
+ if (p->is_cursor)
+ continue;
+
+ if (p->type != DT_CHR)
+ continue;
+
+ dentry = lookup_one_len(p->name, dir, p->len);
+ if (IS_ERR(dentry))
+ continue;
+
+ p->is_whiteout = ovl_is_whiteout(dentry);
+ dput(dentry);
+ }
+ mutex_unlock(&dir->d_inode->i_mutex);
+
+ revert_creds(old_cred);
+ put_cred(override_cred);
+
+ return 0;
+}
+
+static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
+{
+ int err;
+ struct path lowerpath;
+ struct path upperpath;
+ struct ovl_readdir_data rdd = {
+ .ctx.actor = ovl_fill_merge,
+ .list = list,
+ .root = RB_ROOT,
+ .is_merge = false,
+ };
+
+ ovl_path_lower(dentry, &lowerpath);
+ ovl_path_upper(dentry, &upperpath);
+
+ if (upperpath.dentry) {
+ err = ovl_dir_read(&upperpath, &rdd);
+ if (err)
+ goto out;
+
+ if (lowerpath.dentry) {
+ err = ovl_dir_mark_whiteouts(upperpath.dentry, &rdd);
+ if (err)
+ goto out;
+ }
+ }
+ if (lowerpath.dentry) {
+ /*
+ * Insert lowerpath entries before upperpath ones, this allows
+ * offsets to be reasonably constant
+ */
+ list_add(&rdd.middle, rdd.list);
+ rdd.is_merge = true;
+ err = ovl_dir_read(&lowerpath, &rdd);
+ list_del(&rdd.middle);
+ }
+out:
+ return err;
+}
+
+static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
+{
+ struct ovl_cache_entry *p;
+ loff_t off = 0;
+
+ list_for_each_entry(p, &od->cache->entries, l_node) {
+ if (p->is_cursor)
+ continue;
+ if (off >= pos)
+ break;
+ off++;
+ }
+ list_move_tail(&od->cursor.l_node, &p->l_node);
+}
+
+static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
+{
+ int res;
+ struct ovl_dir_cache *cache;
+
+ cache = ovl_dir_cache(dentry);
+ if (cache && ovl_dentry_version_get(dentry) == cache->version) {
+ cache->refcount++;
+ return cache;
+ }
+ ovl_set_dir_cache(dentry, NULL);
+
+ cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
+ if (!cache)
+ return ERR_PTR(-ENOMEM);
+
+ cache->refcount = 1;
+ INIT_LIST_HEAD(&cache->entries);
+
+ res = ovl_dir_read_merged(dentry, &cache->entries);
+ if (res) {
+ ovl_cache_free(&cache->entries);
+ kfree(cache);
+ return ERR_PTR(res);
+ }
+
+ cache->version = ovl_dentry_version_get(dentry);
+ ovl_set_dir_cache(dentry, cache);
+
+ return cache;
+}
+
+static int ovl_iterate(struct file *file, struct dir_context *ctx)
+{
+ struct ovl_dir_file *od = file->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+
+ if (!ctx->pos)
+ ovl_dir_reset(file);
+
+ if (od->is_real)
+ return iterate_dir(od->realfile, ctx);
+
+ if (!od->cache) {
+ struct ovl_dir_cache *cache;
+
+ cache = ovl_cache_get(dentry);
+ if (IS_ERR(cache))
+ return PTR_ERR(cache);
+
+ od->cache = cache;
+ ovl_seek_cursor(od, ctx->pos);
+ }
+
+ while (od->cursor.l_node.next != &od->cache->entries) {
+ struct ovl_cache_entry *p;
+
+ p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node);
+ /* Skip cursors */
+ if (!p->is_cursor) {
+ if (!p->is_whiteout) {
+ if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
+ break;
+ }
+ ctx->pos++;
+ }
+ list_move(&od->cursor.l_node, &p->l_node);
+ }
+ return 0;
+}
+
+static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t res;
+ struct ovl_dir_file *od = file->private_data;
+
+ mutex_lock(&file_inode(file)->i_mutex);
+ if (!file->f_pos)
+ ovl_dir_reset(file);
+
+ if (od->is_real) {
+ res = vfs_llseek(od->realfile, offset, origin);
+ file->f_pos = od->realfile->f_pos;
+ } else {
+ res = -EINVAL;
+
+ switch (origin) {
+ case SEEK_CUR:
+ offset += file->f_pos;
+ break;
+ case SEEK_SET:
+ break;
+ default:
+ goto out_unlock;
+ }
+ if (offset < 0)
+ goto out_unlock;
+
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ if (od->cache)
+ ovl_seek_cursor(od, offset);
+ }
+ res = offset;
+ }
+out_unlock:
+ mutex_unlock(&file_inode(file)->i_mutex);
+
+ return res;
+}
+
+static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
+{
+ struct ovl_dir_file *od = file->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+ struct file *realfile = od->realfile;
+
+ /*
+ * Need to check if we started out being a lower dir, but got copied up
+ */
+ if (!od->is_upper && ovl_path_type(dentry) != OVL_PATH_LOWER) {
+ struct inode *inode = file_inode(file);
+
+ realfile = lockless_dereference(od->upperfile);
+ if (!realfile) {
+ struct path upperpath;
+
+ ovl_path_upper(dentry, &upperpath);
+ realfile = ovl_path_open(&upperpath, O_RDONLY);
+ smp_mb__before_spinlock();
+ mutex_lock(&inode->i_mutex);
+ if (!od->upperfile) {
+ if (IS_ERR(realfile)) {
+ mutex_unlock(&inode->i_mutex);
+ return PTR_ERR(realfile);
+ }
+ od->upperfile = realfile;
+ } else {
+ /* somebody has beaten us to it */
+ if (!IS_ERR(realfile))
+ fput(realfile);
+ realfile = od->upperfile;
+ }
+ mutex_unlock(&inode->i_mutex);
+ }
+ }
+
+ return vfs_fsync_range(realfile, start, end, datasync);
+}
+
+static int ovl_dir_release(struct inode *inode, struct file *file)
+{
+ struct ovl_dir_file *od = file->private_data;
+
+ if (od->cache) {
+ mutex_lock(&inode->i_mutex);
+ ovl_cache_put(od, file->f_path.dentry);
+ mutex_unlock(&inode->i_mutex);
+ }
+ fput(od->realfile);
+ if (od->upperfile)
+ fput(od->upperfile);
+ kfree(od);
+
+ return 0;
+}
+
+static int ovl_dir_open(struct inode *inode, struct file *file)
+{
+ struct path realpath;
+ struct file *realfile;
+ struct ovl_dir_file *od;
+ enum ovl_path_type type;
+
+ od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
+ if (!od)
+ return -ENOMEM;
+
+ type = ovl_path_real(file->f_path.dentry, &realpath);
+ realfile = ovl_path_open(&realpath, file->f_flags);
+ if (IS_ERR(realfile)) {
+ kfree(od);
+ return PTR_ERR(realfile);
+ }
+ INIT_LIST_HEAD(&od->cursor.l_node);
+ od->realfile = realfile;
+ od->is_real = (type != OVL_PATH_MERGE);
+ od->is_upper = (type != OVL_PATH_LOWER);
+ od->cursor.is_cursor = true;
+ file->private_data = od;
+
+ return 0;
+}
+
+const struct file_operations ovl_dir_operations = {
+ .read = generic_read_dir,
+ .open = ovl_dir_open,
+ .iterate = ovl_iterate,
+ .llseek = ovl_dir_llseek,
+ .fsync = ovl_dir_fsync,
+ .release = ovl_dir_release,
+};
+
+int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
+{
+ int err;
+ struct ovl_cache_entry *p;
+
+ err = ovl_dir_read_merged(dentry, list);
+ if (err)
+ return err;
+
+ err = 0;
+
+ list_for_each_entry(p, list, l_node) {
+ if (p->is_whiteout)
+ continue;
+
+ if (p->name[0] == '.') {
+ if (p->len == 1)
+ continue;
+ if (p->len == 2 && p->name[1] == '.')
+ continue;
+ }
+ err = -ENOTEMPTY;
+ break;
+ }
+
+ return err;
+}
+
+void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
+{
+ struct ovl_cache_entry *p;
+
+ mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
+ list_for_each_entry(p, list, l_node) {
+ struct dentry *dentry;
+
+ if (!p->is_whiteout)
+ continue;
+
+ dentry = lookup_one_len(p->name, upper, p->len);
+ if (IS_ERR(dentry)) {
+ pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
+ upper->d_name.name, p->len, p->name,
+ (int) PTR_ERR(dentry));
+ continue;
+ }
+ ovl_cleanup(upper->d_inode, dentry);
+ dput(dentry);
+ }
+ mutex_unlock(&upper->d_inode->i_mutex);
+}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
new file mode 100644
index 000000000000..f16d318b71f8
--- /dev/null
+++ b/fs/overlayfs/super.c
@@ -0,0 +1,833 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/statfs.h>
+#include <linux/seq_file.h>
+#include "overlayfs.h"
+
+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
+MODULE_DESCRIPTION("Overlay filesystem");
+MODULE_LICENSE("GPL");
+
+#define OVERLAYFS_SUPER_MAGIC 0x794c7630
+
+struct ovl_config {
+ char *lowerdir;
+ char *upperdir;
+ char *workdir;
+};
+
+/* private information held for overlayfs's superblock */
+struct ovl_fs {
+ struct vfsmount *upper_mnt;
+ struct vfsmount *lower_mnt;
+ struct dentry *workdir;
+ long lower_namelen;
+ /* pathnames of lower and upper dirs, for show_options */
+ struct ovl_config config;
+};
+
+struct ovl_dir_cache;
+
+/* private information held for every overlayfs dentry */
+struct ovl_entry {
+ struct dentry *__upperdentry;
+ struct dentry *lowerdentry;
+ struct ovl_dir_cache *cache;
+ union {
+ struct {
+ u64 version;
+ bool opaque;
+ };
+ struct rcu_head rcu;
+ };
+};
+
+const char *ovl_opaque_xattr = "trusted.overlay.opaque";
+
+
+enum ovl_path_type ovl_path_type(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ if (oe->__upperdentry) {
+ if (oe->lowerdentry) {
+ if (S_ISDIR(dentry->d_inode->i_mode))
+ return OVL_PATH_MERGE;
+ else
+ return OVL_PATH_UPPER;
+ } else {
+ if (oe->opaque)
+ return OVL_PATH_UPPER;
+ else
+ return OVL_PATH_PURE_UPPER;
+ }
+ } else {
+ return OVL_PATH_LOWER;
+ }
+}
+
+static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
+{
+ return lockless_dereference(oe->__upperdentry);
+}
+
+void ovl_path_upper(struct dentry *dentry, struct path *path)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ path->mnt = ofs->upper_mnt;
+ path->dentry = ovl_upperdentry_dereference(oe);
+}
+
+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
+{
+
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ if (type == OVL_PATH_LOWER)
+ ovl_path_lower(dentry, path);
+ else
+ ovl_path_upper(dentry, path);
+
+ return type;
+}
+
+struct dentry *ovl_dentry_upper(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return ovl_upperdentry_dereference(oe);
+}
+
+struct dentry *ovl_dentry_lower(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return oe->lowerdentry;
+}
+
+struct dentry *ovl_dentry_real(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ struct dentry *realdentry;
+
+ realdentry = ovl_upperdentry_dereference(oe);
+ if (!realdentry)
+ realdentry = oe->lowerdentry;
+
+ return realdentry;
+}
+
+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
+{
+ struct dentry *realdentry;
+
+ realdentry = ovl_upperdentry_dereference(oe);
+ if (realdentry) {
+ *is_upper = true;
+ } else {
+ realdentry = oe->lowerdentry;
+ *is_upper = false;
+ }
+ return realdentry;
+}
+
+struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return oe->cache;
+}
+
+void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ oe->cache = cache;
+}
+
+void ovl_path_lower(struct dentry *dentry, struct path *path)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ path->mnt = ofs->lower_mnt;
+ path->dentry = oe->lowerdentry;
+}
+
+int ovl_want_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ return mnt_want_write(ofs->upper_mnt);
+}
+
+void ovl_drop_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ mnt_drop_write(ofs->upper_mnt);
+}
+
+struct dentry *ovl_workdir(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ return ofs->workdir;
+}
+
+bool ovl_dentry_is_opaque(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ return oe->opaque;
+}
+
+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ oe->opaque = opaque;
+}
+
+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
+ WARN_ON(oe->__upperdentry);
+ BUG_ON(!upperdentry->d_inode);
+ /*
+ * Make sure upperdentry is consistent before making it visible to
+ * ovl_upperdentry_dereference().
+ */
+ smp_wmb();
+ oe->__upperdentry = upperdentry;
+}
+
+void ovl_dentry_version_inc(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+ oe->version++;
+}
+
+u64 ovl_dentry_version_get(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+ return oe->version;
+}
+
+bool ovl_is_whiteout(struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+
+ return inode && IS_WHITEOUT(inode);
+}
+
+static bool ovl_is_opaquedir(struct dentry *dentry)
+{
+ int res;
+ char val;
+ struct inode *inode = dentry->d_inode;
+
+ if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr)
+ return false;
+
+ res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1);
+ if (res == 1 && val == 'y')
+ return true;
+
+ return false;
+}
+
+static void ovl_dentry_release(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ if (oe) {
+ dput(oe->__upperdentry);
+ dput(oe->lowerdentry);
+ kfree_rcu(oe, rcu);
+ }
+}
+
+static const struct dentry_operations ovl_dentry_operations = {
+ .d_release = ovl_dentry_release,
+};
+
+static struct ovl_entry *ovl_alloc_entry(void)
+{
+ return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL);
+}
+
+static inline struct dentry *ovl_lookup_real(struct dentry *dir,
+ struct qstr *name)
+{
+ struct dentry *dentry;
+
+ mutex_lock(&dir->d_inode->i_mutex);
+ dentry = lookup_one_len(name->name, dir, name->len);
+ mutex_unlock(&dir->d_inode->i_mutex);
+
+ if (IS_ERR(dentry)) {
+ if (PTR_ERR(dentry) == -ENOENT)
+ dentry = NULL;
+ } else if (!dentry->d_inode) {
+ dput(dentry);
+ dentry = NULL;
+ }
+ return dentry;
+}
+
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct ovl_entry *oe;
+ struct dentry *upperdir;
+ struct dentry *lowerdir;
+ struct dentry *upperdentry = NULL;
+ struct dentry *lowerdentry = NULL;
+ struct inode *inode = NULL;
+ int err;
+
+ err = -ENOMEM;
+ oe = ovl_alloc_entry();
+ if (!oe)
+ goto out;
+
+ upperdir = ovl_dentry_upper(dentry->d_parent);
+ lowerdir = ovl_dentry_lower(dentry->d_parent);
+
+ if (upperdir) {
+ upperdentry = ovl_lookup_real(upperdir, &dentry->d_name);
+ err = PTR_ERR(upperdentry);
+ if (IS_ERR(upperdentry))
+ goto out_put_dir;
+
+ if (lowerdir && upperdentry) {
+ if (ovl_is_whiteout(upperdentry)) {
+ dput(upperdentry);
+ upperdentry = NULL;
+ oe->opaque = true;
+ } else if (ovl_is_opaquedir(upperdentry)) {
+ oe->opaque = true;
+ }
+ }
+ }
+ if (lowerdir && !oe->opaque) {
+ lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name);
+ err = PTR_ERR(lowerdentry);
+ if (IS_ERR(lowerdentry))
+ goto out_dput_upper;
+ }
+
+ if (lowerdentry && upperdentry &&
+ (!S_ISDIR(upperdentry->d_inode->i_mode) ||
+ !S_ISDIR(lowerdentry->d_inode->i_mode))) {
+ dput(lowerdentry);
+ lowerdentry = NULL;
+ oe->opaque = true;
+ }
+
+ if (lowerdentry || upperdentry) {
+ struct dentry *realdentry;
+
+ realdentry = upperdentry ? upperdentry : lowerdentry;
+ err = -ENOMEM;
+ inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
+ oe);
+ if (!inode)
+ goto out_dput;
+ ovl_copyattr(realdentry->d_inode, inode);
+ }
+
+ oe->__upperdentry = upperdentry;
+ oe->lowerdentry = lowerdentry;
+
+ dentry->d_fsdata = oe;
+ d_add(dentry, inode);
+
+ return NULL;
+
+out_dput:
+ dput(lowerdentry);
+out_dput_upper:
+ dput(upperdentry);
+out_put_dir:
+ kfree(oe);
+out:
+ return ERR_PTR(err);
+}
+
+struct file *ovl_path_open(struct path *path, int flags)
+{
+ return dentry_open(path, flags, current_cred());
+}
+
+static void ovl_put_super(struct super_block *sb)
+{
+ struct ovl_fs *ufs = sb->s_fs_info;
+
+ dput(ufs->workdir);
+ mntput(ufs->upper_mnt);
+ mntput(ufs->lower_mnt);
+
+ kfree(ufs->config.lowerdir);
+ kfree(ufs->config.upperdir);
+ kfree(ufs->config.workdir);
+ kfree(ufs);
+}
+
+/**
+ * ovl_statfs
+ * @sb: The overlayfs super block
+ * @buf: The struct kstatfs to fill in with stats
+ *
+ * Get the filesystem statistics. As writes always target the upper layer
+ * filesystem pass the statfs to the same filesystem.
+ */
+static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct dentry *root_dentry = dentry->d_sb->s_root;
+ struct path path;
+ int err;
+
+ ovl_path_upper(root_dentry, &path);
+
+ err = vfs_statfs(&path, buf);
+ if (!err) {
+ buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen);
+ buf->f_type = OVERLAYFS_SUPER_MAGIC;
+ }
+
+ return err;
+}
+
+/**
+ * ovl_show_options
+ *
+ * Prints the mount options for a given superblock.
+ * Returns zero; does not fail.
+ */
+static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct ovl_fs *ufs = sb->s_fs_info;
+
+ seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
+ seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
+ seq_printf(m, ",workdir=%s", ufs->config.workdir);
+ return 0;
+}
+
+static const struct super_operations ovl_super_operations = {
+ .put_super = ovl_put_super,
+ .statfs = ovl_statfs,
+ .show_options = ovl_show_options,
+};
+
+enum {
+ OPT_LOWERDIR,
+ OPT_UPPERDIR,
+ OPT_WORKDIR,
+ OPT_ERR,
+};
+
+static const match_table_t ovl_tokens = {
+ {OPT_LOWERDIR, "lowerdir=%s"},
+ {OPT_UPPERDIR, "upperdir=%s"},
+ {OPT_WORKDIR, "workdir=%s"},
+ {OPT_ERR, NULL}
+};
+
+static char *ovl_next_opt(char **s)
+{
+ char *sbegin = *s;
+ char *p;
+
+ if (sbegin == NULL)
+ return NULL;
+
+ for (p = sbegin; *p; p++) {
+ if (*p == '\\') {
+ p++;
+ if (!*p)
+ break;
+ } else if (*p == ',') {
+ *p = '\0';
+ *s = p + 1;
+ return sbegin;
+ }
+ }
+ *s = NULL;
+ return sbegin;
+}
+
+static int ovl_parse_opt(char *opt, struct ovl_config *config)
+{
+ char *p;
+
+ while ((p = ovl_next_opt(&opt)) != NULL) {
+ int token;
+ substring_t args[MAX_OPT_ARGS];
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, ovl_tokens, args);
+ switch (token) {
+ case OPT_UPPERDIR:
+ kfree(config->upperdir);
+ config->upperdir = match_strdup(&args[0]);
+ if (!config->upperdir)
+ return -ENOMEM;
+ break;
+
+ case OPT_LOWERDIR:
+ kfree(config->lowerdir);
+ config->lowerdir = match_strdup(&args[0]);
+ if (!config->lowerdir)
+ return -ENOMEM;
+ break;
+
+ case OPT_WORKDIR:
+ kfree(config->workdir);
+ config->workdir = match_strdup(&args[0]);
+ if (!config->workdir)
+ return -ENOMEM;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+#define OVL_WORKDIR_NAME "work"
+
+static struct dentry *ovl_workdir_create(struct vfsmount *mnt,
+ struct dentry *dentry)
+{
+ struct inode *dir = dentry->d_inode;
+ struct dentry *work;
+ int err;
+ bool retried = false;
+
+ err = mnt_want_write(mnt);
+ if (err)
+ return ERR_PTR(err);
+
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+retry:
+ work = lookup_one_len(OVL_WORKDIR_NAME, dentry,
+ strlen(OVL_WORKDIR_NAME));
+
+ if (!IS_ERR(work)) {
+ struct kstat stat = {
+ .mode = S_IFDIR | 0,
+ };
+
+ if (work->d_inode) {
+ err = -EEXIST;
+ if (retried)
+ goto out_dput;
+
+ retried = true;
+ ovl_cleanup(dir, work);
+ dput(work);
+ goto retry;
+ }
+
+ err = ovl_create_real(dir, work, &stat, NULL, NULL, true);
+ if (err)
+ goto out_dput;
+ }
+out_unlock:
+ mutex_unlock(&dir->i_mutex);
+ mnt_drop_write(mnt);
+
+ return work;
+
+out_dput:
+ dput(work);
+ work = ERR_PTR(err);
+ goto out_unlock;
+}
+
+static void ovl_unescape(char *s)
+{
+ char *d = s;
+
+ for (;; s++, d++) {
+ if (*s == '\\')
+ s++;
+ *d = *s;
+ if (!*s)
+ break;
+ }
+}
+
+static int ovl_mount_dir(const char *name, struct path *path)
+{
+ int err;
+ char *tmp = kstrdup(name, GFP_KERNEL);
+
+ if (!tmp)
+ return -ENOMEM;
+
+ ovl_unescape(tmp);
+ err = kern_path(tmp, LOOKUP_FOLLOW, path);
+ if (err) {
+ pr_err("overlayfs: failed to resolve '%s': %i\n", tmp, err);
+ err = -EINVAL;
+ }
+ kfree(tmp);
+ return err;
+}
+
+static bool ovl_is_allowed_fs_type(struct dentry *root)
+{
+ const struct dentry_operations *dop = root->d_op;
+
+ /*
+ * We don't support:
+ * - automount filesystems
+ * - filesystems with revalidate (FIXME for lower layer)
+ * - filesystems with case insensitive names
+ */
+ if (dop &&
+ (dop->d_manage || dop->d_automount ||
+ dop->d_revalidate || dop->d_weak_revalidate ||
+ dop->d_compare || dop->d_hash)) {
+ return false;
+ }
+ return true;
+}
+
+/* Workdir should not be subdir of upperdir and vice versa */
+static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
+{
+ bool ok = false;
+
+ if (workdir != upperdir) {
+ ok = (lock_rename(workdir, upperdir) == NULL);
+ unlock_rename(workdir, upperdir);
+ }
+ return ok;
+}
+
+static int ovl_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct path lowerpath;
+ struct path upperpath;
+ struct path workpath;
+ struct inode *root_inode;
+ struct dentry *root_dentry;
+ struct ovl_entry *oe;
+ struct ovl_fs *ufs;
+ struct kstatfs statfs;
+ int err;
+
+ err = -ENOMEM;
+ ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
+ if (!ufs)
+ goto out;
+
+ err = ovl_parse_opt((char *) data, &ufs->config);
+ if (err)
+ goto out_free_config;
+
+ /* FIXME: workdir is not needed for a R/O mount */
+ err = -EINVAL;
+ if (!ufs->config.upperdir || !ufs->config.lowerdir ||
+ !ufs->config.workdir) {
+ pr_err("overlayfs: missing upperdir or lowerdir or workdir\n");
+ goto out_free_config;
+ }
+
+ err = -ENOMEM;
+ oe = ovl_alloc_entry();
+ if (oe == NULL)
+ goto out_free_config;
+
+ err = ovl_mount_dir(ufs->config.upperdir, &upperpath);
+ if (err)
+ goto out_free_oe;
+
+ err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath);
+ if (err)
+ goto out_put_upperpath;
+
+ err = ovl_mount_dir(ufs->config.workdir, &workpath);
+ if (err)
+ goto out_put_lowerpath;
+
+ err = -EINVAL;
+ if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) ||
+ !S_ISDIR(lowerpath.dentry->d_inode->i_mode) ||
+ !S_ISDIR(workpath.dentry->d_inode->i_mode)) {
+ pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n");
+ goto out_put_workpath;
+ }
+
+ if (upperpath.mnt != workpath.mnt) {
+ pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
+ goto out_put_workpath;
+ }
+ if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) {
+ pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
+ goto out_put_workpath;
+ }
+
+ if (!ovl_is_allowed_fs_type(upperpath.dentry)) {
+ pr_err("overlayfs: filesystem of upperdir is not supported\n");
+ goto out_put_workpath;
+ }
+
+ if (!ovl_is_allowed_fs_type(lowerpath.dentry)) {
+ pr_err("overlayfs: filesystem of lowerdir is not supported\n");
+ goto out_put_workpath;
+ }
+
+ err = vfs_statfs(&lowerpath, &statfs);
+ if (err) {
+ pr_err("overlayfs: statfs failed on lowerpath\n");
+ goto out_put_workpath;
+ }
+ ufs->lower_namelen = statfs.f_namelen;
+
+ sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth,
+ lowerpath.mnt->mnt_sb->s_stack_depth) + 1;
+
+ err = -EINVAL;
+ if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+ pr_err("overlayfs: maximum fs stacking depth exceeded\n");
+ goto out_put_workpath;
+ }
+
+ ufs->upper_mnt = clone_private_mount(&upperpath);
+ err = PTR_ERR(ufs->upper_mnt);
+ if (IS_ERR(ufs->upper_mnt)) {
+ pr_err("overlayfs: failed to clone upperpath\n");
+ goto out_put_workpath;
+ }
+
+ ufs->lower_mnt = clone_private_mount(&lowerpath);
+ err = PTR_ERR(ufs->lower_mnt);
+ if (IS_ERR(ufs->lower_mnt)) {
+ pr_err("overlayfs: failed to clone lowerpath\n");
+ goto out_put_upper_mnt;
+ }
+
+ ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
+ err = PTR_ERR(ufs->workdir);
+ if (IS_ERR(ufs->workdir)) {
+ pr_err("overlayfs: failed to create directory %s/%s\n",
+ ufs->config.workdir, OVL_WORKDIR_NAME);
+ goto out_put_lower_mnt;
+ }
+
+ /*
+ * Make lower_mnt R/O. That way fchmod/fchown on lower file
+ * will fail instead of modifying lower fs.
+ */
+ ufs->lower_mnt->mnt_flags |= MNT_READONLY;
+
+ /* If the upper fs is r/o, we mark overlayfs r/o too */
+ if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)
+ sb->s_flags |= MS_RDONLY;
+
+ sb->s_d_op = &ovl_dentry_operations;
+
+ err = -ENOMEM;
+ root_inode = ovl_new_inode(sb, S_IFDIR, oe);
+ if (!root_inode)
+ goto out_put_workdir;
+
+ root_dentry = d_make_root(root_inode);
+ if (!root_dentry)
+ goto out_put_workdir;
+
+ mntput(upperpath.mnt);
+ mntput(lowerpath.mnt);
+ path_put(&workpath);
+
+ oe->__upperdentry = upperpath.dentry;
+ oe->lowerdentry = lowerpath.dentry;
+
+ root_dentry->d_fsdata = oe;
+
+ sb->s_magic = OVERLAYFS_SUPER_MAGIC;
+ sb->s_op = &ovl_super_operations;
+ sb->s_root = root_dentry;
+ sb->s_fs_info = ufs;
+
+ return 0;
+
+out_put_workdir:
+ dput(ufs->workdir);
+out_put_lower_mnt:
+ mntput(ufs->lower_mnt);
+out_put_upper_mnt:
+ mntput(ufs->upper_mnt);
+out_put_workpath:
+ path_put(&workpath);
+out_put_lowerpath:
+ path_put(&lowerpath);
+out_put_upperpath:
+ path_put(&upperpath);
+out_free_oe:
+ kfree(oe);
+out_free_config:
+ kfree(ufs->config.lowerdir);
+ kfree(ufs->config.upperdir);
+ kfree(ufs->config.workdir);
+ kfree(ufs);
+out:
+ return err;
+}
+
+static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *raw_data)
+{
+ return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
+}
+
+static struct file_system_type ovl_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "overlay",
+ .mount = ovl_mount,
+ .kill_sb = kill_anon_super,
+};
+MODULE_ALIAS_FS("overlay");
+
+static int __init ovl_init(void)
+{
+ return register_filesystem(&ovl_fs_type);
+}
+
+static void __exit ovl_exit(void)
+{
+ unregister_filesystem(&ovl_fs_type);
+}
+
+module_init(ovl_init);
+module_exit(ovl_exit);
diff --git a/fs/pnode.c b/fs/pnode.c
index aae331a5d03b..260ac8f898a4 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -242,6 +242,7 @@ static int propagate_one(struct mount *m)
child = copy_tree(last_source, last_source->mnt.mnt_root, type);
if (IS_ERR(child))
return PTR_ERR(child);
+ child->mnt.mnt_flags &= ~MNT_LOCKED;
mnt_set_mountpoint(m, mp, child);
last_dest = m;
last_source = child;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index cd3653e4f35c..bd117d065b82 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -157,20 +157,29 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct user_namespace *user_ns = seq_user_ns(m);
struct group_info *group_info;
int g;
- struct fdtable *fdt = NULL;
+ struct task_struct *tracer;
const struct cred *cred;
- pid_t ppid, tpid;
+ pid_t ppid, tpid = 0, tgid, ngid;
+ unsigned int max_fds = 0;
rcu_read_lock();
ppid = pid_alive(p) ?
task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
- tpid = 0;
- if (pid_alive(p)) {
- struct task_struct *tracer = ptrace_parent(p);
- if (tracer)
- tpid = task_pid_nr_ns(tracer, ns);
- }
+
+ tracer = ptrace_parent(p);
+ if (tracer)
+ tpid = task_pid_nr_ns(tracer, ns);
+
+ tgid = task_tgid_nr_ns(p, ns);
+ ngid = task_numa_group_id(p);
cred = get_task_cred(p);
+
+ task_lock(p);
+ if (p->files)
+ max_fds = files_fdtable(p->files)->max_fds;
+ task_unlock(p);
+ rcu_read_unlock();
+
seq_printf(m,
"State:\t%s\n"
"Tgid:\t%d\n"
@@ -179,12 +188,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
"PPid:\t%d\n"
"TracerPid:\t%d\n"
"Uid:\t%d\t%d\t%d\t%d\n"
- "Gid:\t%d\t%d\t%d\t%d\n",
+ "Gid:\t%d\t%d\t%d\t%d\n"
+ "FDSize:\t%d\nGroups:\t",
get_task_state(p),
- task_tgid_nr_ns(p, ns),
- task_numa_group_id(p),
- pid_nr_ns(pid, ns),
- ppid, tpid,
+ tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid,
from_kuid_munged(user_ns, cred->uid),
from_kuid_munged(user_ns, cred->euid),
from_kuid_munged(user_ns, cred->suid),
@@ -192,20 +199,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
from_kgid_munged(user_ns, cred->gid),
from_kgid_munged(user_ns, cred->egid),
from_kgid_munged(user_ns, cred->sgid),
- from_kgid_munged(user_ns, cred->fsgid));
-
- task_lock(p);
- if (p->files)
- fdt = files_fdtable(p->files);
- seq_printf(m,
- "FDSize:\t%d\n"
- "Groups:\t",
- fdt ? fdt->max_fds : 0);
- rcu_read_unlock();
+ from_kgid_munged(user_ns, cred->fsgid),
+ max_fds);
group_info = cred->group_info;
- task_unlock(p);
-
for (g = 0; g < group_info->ngroups; g++)
seq_printf(m, "%d ",
from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 772efa45a452..3f3d7aeb0712 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2464,6 +2464,57 @@ static const struct file_operations proc_projid_map_operations = {
.llseek = seq_lseek,
.release = proc_id_map_release,
};
+
+static int proc_setgroups_open(struct inode *inode, struct file *file)
+{
+ struct user_namespace *ns = NULL;
+ struct task_struct *task;
+ int ret;
+
+ ret = -ESRCH;
+ task = get_proc_task(inode);
+ if (task) {
+ rcu_read_lock();
+ ns = get_user_ns(task_cred_xxx(task, user_ns));
+ rcu_read_unlock();
+ put_task_struct(task);
+ }
+ if (!ns)
+ goto err;
+
+ if (file->f_mode & FMODE_WRITE) {
+ ret = -EACCES;
+ if (!ns_capable(ns, CAP_SYS_ADMIN))
+ goto err_put_ns;
+ }
+
+ ret = single_open(file, &proc_setgroups_show, ns);
+ if (ret)
+ goto err_put_ns;
+
+ return 0;
+err_put_ns:
+ put_user_ns(ns);
+err:
+ return ret;
+}
+
+static int proc_setgroups_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ int ret = single_release(inode, file);
+ put_user_ns(ns);
+ return ret;
+}
+
+static const struct file_operations proc_setgroups_operations = {
+ .open = proc_setgroups_open,
+ .write = proc_setgroups_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_setgroups_release,
+};
#endif /* CONFIG_USER_NS */
static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
@@ -2572,6 +2623,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+ REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#ifdef CONFIG_CHECKPOINT_RESTORE
REG("timers", S_IRUGO, proc_timers_operations),
@@ -2618,6 +2670,9 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
dput(dentry);
}
+ if (pid == tgid)
+ return;
+
name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%d", tgid);
leader = d_hash_and_lookup(mnt->mnt_root, &name);
@@ -2789,7 +2844,7 @@ retry:
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
struct tgid_iter iter;
- struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info;
+ struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info;
loff_t pos = ctx->pos;
if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
@@ -2913,6 +2968,7 @@ static const struct pid_entry tid_base_stuff[] = {
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+ REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
};
@@ -3095,7 +3151,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
/* f_version caches the tgid value that the last readdir call couldn't
* return. lseek aka telldir automagically resets f_version to 0.
*/
- ns = file->f_dentry->d_sb->s_fs_info;
+ ns = inode->i_sb->s_fs_info;
tid = (int)file->f_version;
file->f_version = 0;
for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index e11d7c590bb0..8e5ad83b629a 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -53,7 +53,8 @@ static int seq_show(struct seq_file *m, void *v)
(long long)file->f_pos, f_flags,
real_mount(file->f_path.mnt)->mnt_id);
if (file->f_op->show_fdinfo)
- ret = file->f_op->show_fdinfo(m, file);
+ file->f_op->show_fdinfo(m, file);
+ ret = seq_has_overflowed(m);
fput(file);
}
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 317b72641ebf..7fea13229f33 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -31,9 +31,73 @@ static DEFINE_SPINLOCK(proc_subdir_lock);
static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
{
- if (de->namelen != len)
- return 0;
- return !memcmp(name, de->name, len);
+ if (len < de->namelen)
+ return -1;
+ if (len > de->namelen)
+ return 1;
+
+ return memcmp(name, de->name, len);
+}
+
+static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir)
+{
+ return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry,
+ subdir_node);
+}
+
+static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
+{
+ return rb_entry_safe(rb_next(&dir->subdir_node), struct proc_dir_entry,
+ subdir_node);
+}
+
+static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
+ const char *name,
+ unsigned int len)
+{
+ struct rb_node *node = dir->subdir.rb_node;
+
+ while (node) {
+ struct proc_dir_entry *de = container_of(node,
+ struct proc_dir_entry,
+ subdir_node);
+ int result = proc_match(len, name, de);
+
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return de;
+ }
+ return NULL;
+}
+
+static bool pde_subdir_insert(struct proc_dir_entry *dir,
+ struct proc_dir_entry *de)
+{
+ struct rb_root *root = &dir->subdir;
+ struct rb_node **new = &root->rb_node, *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*new) {
+ struct proc_dir_entry *this =
+ container_of(*new, struct proc_dir_entry, subdir_node);
+ int result = proc_match(de->namelen, de->name, this);
+
+ parent = *new;
+ if (result < 0)
+ new = &(*new)->rb_left;
+ else if (result > 0)
+ new = &(*new)->rb_right;
+ else
+ return false;
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&de->subdir_node, parent, new);
+ rb_insert_color(&de->subdir_node, root);
+ return true;
}
static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
@@ -92,10 +156,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
break;
len = next - cp;
- for (de = de->subdir; de ; de = de->next) {
- if (proc_match(len, cp, de))
- break;
- }
+ de = pde_subdir_find(de, cp, len);
if (!de) {
WARN(1, "name '%s'\n", name);
return -ENOENT;
@@ -183,19 +244,16 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
struct inode *inode;
spin_lock(&proc_subdir_lock);
- for (de = de->subdir; de ; de = de->next) {
- if (de->namelen != dentry->d_name.len)
- continue;
- if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
- pde_get(de);
- spin_unlock(&proc_subdir_lock);
- inode = proc_get_inode(dir->i_sb, de);
- if (!inode)
- return ERR_PTR(-ENOMEM);
- d_set_d_op(dentry, &simple_dentry_operations);
- d_add(dentry, inode);
- return NULL;
- }
+ de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
+ if (de) {
+ pde_get(de);
+ spin_unlock(&proc_subdir_lock);
+ inode = proc_get_inode(dir->i_sb, de);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ d_set_d_op(dentry, &simple_dentry_operations);
+ d_add(dentry, inode);
+ return NULL;
}
spin_unlock(&proc_subdir_lock);
return ERR_PTR(-ENOENT);
@@ -225,7 +283,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
return 0;
spin_lock(&proc_subdir_lock);
- de = de->subdir;
+ de = pde_subdir_first(de);
i = ctx->pos - 2;
for (;;) {
if (!de) {
@@ -234,7 +292,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
}
if (!i)
break;
- de = de->next;
+ de = pde_subdir_next(de);
i--;
}
@@ -249,7 +307,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
}
spin_lock(&proc_subdir_lock);
ctx->pos++;
- next = de->next;
+ next = pde_subdir_next(de);
pde_put(de);
de = next;
} while (de);
@@ -286,9 +344,8 @@ static const struct inode_operations proc_dir_inode_operations = {
static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
{
- struct proc_dir_entry *tmp;
int ret;
-
+
ret = proc_alloc_inum(&dp->low_ino);
if (ret)
return ret;
@@ -304,21 +361,21 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
dp->proc_iops = &proc_file_inode_operations;
} else {
WARN_ON(1);
+ proc_free_inum(dp->low_ino);
return -EINVAL;
}
spin_lock(&proc_subdir_lock);
-
- for (tmp = dir->subdir; tmp; tmp = tmp->next)
- if (strcmp(tmp->name, dp->name) == 0) {
- WARN(1, "proc_dir_entry '%s/%s' already registered\n",
- dir->name, dp->name);
- break;
- }
-
- dp->next = dir->subdir;
dp->parent = dir;
- dir->subdir = dp;
+ if (pde_subdir_insert(dir, dp) == false) {
+ WARN(1, "proc_dir_entry '%s/%s' already registered\n",
+ dir->name, dp->name);
+ spin_unlock(&proc_subdir_lock);
+ if (S_ISDIR(dp->mode))
+ dir->nlink--;
+ proc_free_inum(dp->low_ino);
+ return -EEXIST;
+ }
spin_unlock(&proc_subdir_lock);
return 0;
@@ -354,6 +411,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
ent->namelen = qstr.len;
ent->mode = mode;
ent->nlink = nlink;
+ ent->subdir = RB_ROOT;
atomic_set(&ent->count, 1);
spin_lock_init(&ent->pde_unload_lock);
INIT_LIST_HEAD(&ent->pde_openers);
@@ -485,7 +543,6 @@ void pde_put(struct proc_dir_entry *pde)
*/
void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
{
- struct proc_dir_entry **p;
struct proc_dir_entry *de = NULL;
const char *fn = name;
unsigned int len;
@@ -497,14 +554,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
}
len = strlen(fn);
- for (p = &parent->subdir; *p; p=&(*p)->next ) {
- if (proc_match(len, fn, *p)) {
- de = *p;
- *p = de->next;
- de->next = NULL;
- break;
- }
- }
+ de = pde_subdir_find(parent, fn, len);
+ if (de)
+ rb_erase(&de->subdir_node, &parent->subdir);
spin_unlock(&proc_subdir_lock);
if (!de) {
WARN(1, "name '%s'\n", name);
@@ -516,16 +568,15 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
if (S_ISDIR(de->mode))
parent->nlink--;
de->nlink = 0;
- WARN(de->subdir, "%s: removing non-empty directory "
- "'%s/%s', leaking at least '%s'\n", __func__,
- de->parent->name, de->name, de->subdir->name);
+ WARN(pde_subdir_first(de),
+ "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n",
+ __func__, de->parent->name, de->name, pde_subdir_first(de)->name);
pde_put(de);
}
EXPORT_SYMBOL(remove_proc_entry);
int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
{
- struct proc_dir_entry **p;
struct proc_dir_entry *root = NULL, *de, *next;
const char *fn = name;
unsigned int len;
@@ -537,24 +588,18 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
}
len = strlen(fn);
- for (p = &parent->subdir; *p; p=&(*p)->next ) {
- if (proc_match(len, fn, *p)) {
- root = *p;
- *p = root->next;
- root->next = NULL;
- break;
- }
- }
+ root = pde_subdir_find(parent, fn, len);
if (!root) {
spin_unlock(&proc_subdir_lock);
return -ENOENT;
}
+ rb_erase(&root->subdir_node, &parent->subdir);
+
de = root;
while (1) {
- next = de->subdir;
+ next = pde_subdir_first(de);
if (next) {
- de->subdir = next->next;
- next->next = NULL;
+ rb_erase(&next->subdir_node, &de->subdir);
de = next;
continue;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 333080d7a671..8420a2f80811 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -32,8 +32,6 @@ static void proc_evict_inode(struct inode *inode)
{
struct proc_dir_entry *de;
struct ctl_table_header *head;
- const struct proc_ns_operations *ns_ops;
- void *ns;
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
@@ -50,11 +48,6 @@ static void proc_evict_inode(struct inode *inode)
RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
sysctl_head_put(head);
}
- /* Release any associated namespace */
- ns_ops = PROC_I(inode)->ns.ns_ops;
- ns = PROC_I(inode)->ns.ns;
- if (ns_ops && ns)
- ns_ops->put(ns);
}
static struct kmem_cache * proc_inode_cachep;
@@ -73,8 +66,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei->pde = NULL;
ei->sysctl = NULL;
ei->sysctl_entry = NULL;
- ei->ns.ns = NULL;
- ei->ns.ns_ops = NULL;
+ ei->ns_ops = NULL;
inode = &ei->vfs_inode;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
return inode;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa7a0ee182e1..6fcdba573e0f 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -24,10 +24,9 @@ struct mempolicy;
* tree) of these proc_dir_entries, so that we can dynamically
* add new files to /proc.
*
- * The "next" pointer creates a linked list of one /proc directory,
- * while parent/subdir create the directory structure (every
- * /proc file has a parent, but "subdir" is NULL for all
- * non-directory entries).
+ * parent/subdir are used for the directory structure (every /proc file has a
+ * parent, but "subdir" is empty for all non-directory entries).
+ * subdir_node is used to build the rb tree "subdir" of the parent.
*/
struct proc_dir_entry {
unsigned int low_ino;
@@ -38,7 +37,9 @@ struct proc_dir_entry {
loff_t size;
const struct inode_operations *proc_iops;
const struct file_operations *proc_fops;
- struct proc_dir_entry *next, *parent, *subdir;
+ struct proc_dir_entry *parent;
+ struct rb_root subdir;
+ struct rb_node subdir_node;
void *data;
atomic_t count; /* use count */
atomic_t in_use; /* number of callers into module in progress; */
@@ -64,7 +65,7 @@ struct proc_inode {
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
struct ctl_table *sysctl_entry;
- struct proc_ns ns;
+ const struct proc_ns_operations *ns_ops;
struct inode vfs_inode;
};
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index aa1eee06420f..d3ebf2e61853 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -12,6 +12,9 @@
#include <linux/vmstat.h>
#include <linux/atomic.h>
#include <linux/vmalloc.h>
+#ifdef CONFIG_CMA
+#include <linux/cma.h>
+#endif
#include <asm/page.h>
#include <asm/pgtable.h>
#include "internal.h"
@@ -138,6 +141,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"AnonHugePages: %8lu kB\n"
#endif
+#ifdef CONFIG_CMA
+ "CmaTotal: %8lu kB\n"
+ "CmaFree: %8lu kB\n"
+#endif
,
K(i.totalram),
K(i.freeram),
@@ -187,12 +194,16 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
vmi.used >> 10,
vmi.largest_chunk >> 10
#ifdef CONFIG_MEMORY_FAILURE
- ,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
+ , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+ , K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
HPAGE_PMD_NR)
#endif
+#ifdef CONFIG_CMA
+ , K(totalcma_pages)
+ , K(global_page_state(NR_FREE_CMA_PAGES))
+#endif
);
hugetlb_report_meminfo(m);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 89026095f2b5..c9eac4563fa8 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -1,10 +1,6 @@
#include <linux/proc_fs.h>
#include <linux/nsproxy.h>
-#include <linux/sched.h>
#include <linux/ptrace.h>
-#include <linux/fs_struct.h>
-#include <linux/mount.h>
-#include <linux/path.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/utsname.h>
@@ -34,138 +30,45 @@ static const struct proc_ns_operations *ns_entries[] = {
&mntns_operations,
};
-static const struct file_operations ns_file_operations = {
- .llseek = no_llseek,
-};
-
-static const struct inode_operations ns_inode_operations = {
- .setattr = proc_setattr,
-};
-
-static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
-{
- struct inode *inode = dentry->d_inode;
- const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops;
-
- return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
- ns_ops->name, inode->i_ino);
-}
-
-const struct dentry_operations ns_dentry_operations =
-{
- .d_delete = always_delete_dentry,
- .d_dname = ns_dname,
-};
-
-static struct dentry *proc_ns_get_dentry(struct super_block *sb,
- struct task_struct *task, const struct proc_ns_operations *ns_ops)
-{
- struct dentry *dentry, *result;
- struct inode *inode;
- struct proc_inode *ei;
- struct qstr qname = { .name = "", };
- void *ns;
-
- ns = ns_ops->get(task);
- if (!ns)
- return ERR_PTR(-ENOENT);
-
- dentry = d_alloc_pseudo(sb, &qname);
- if (!dentry) {
- ns_ops->put(ns);
- return ERR_PTR(-ENOMEM);
- }
-
- inode = iget_locked(sb, ns_ops->inum(ns));
- if (!inode) {
- dput(dentry);
- ns_ops->put(ns);
- return ERR_PTR(-ENOMEM);
- }
-
- ei = PROC_I(inode);
- if (inode->i_state & I_NEW) {
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
- inode->i_op = &ns_inode_operations;
- inode->i_mode = S_IFREG | S_IRUGO;
- inode->i_fop = &ns_file_operations;
- ei->ns.ns_ops = ns_ops;
- ei->ns.ns = ns;
- unlock_new_inode(inode);
- } else {
- ns_ops->put(ns);
- }
-
- d_set_d_op(dentry, &ns_dentry_operations);
- result = d_instantiate_unique(dentry, inode);
- if (result) {
- dput(dentry);
- dentry = result;
- }
-
- return dentry;
-}
-
static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
- struct super_block *sb = inode->i_sb;
- struct proc_inode *ei = PROC_I(inode);
+ const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
struct task_struct *task;
struct path ns_path;
void *error = ERR_PTR(-EACCES);
task = get_proc_task(inode);
if (!task)
- goto out;
+ return error;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
- goto out_put_task;
-
- ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops);
- if (IS_ERR(ns_path.dentry)) {
- error = ERR_CAST(ns_path.dentry);
- goto out_put_task;
+ if (ptrace_may_access(task, PTRACE_MODE_READ)) {
+ error = ns_get_path(&ns_path, task, ns_ops);
+ if (!error)
+ nd_jump_link(nd, &ns_path);
}
-
- ns_path.mnt = mntget(nd->path.mnt);
- nd_jump_link(nd, &ns_path);
- error = NULL;
-
-out_put_task:
put_task_struct(task);
-out:
return error;
}
static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
struct inode *inode = dentry->d_inode;
- struct proc_inode *ei = PROC_I(inode);
- const struct proc_ns_operations *ns_ops = ei->ns.ns_ops;
+ const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
struct task_struct *task;
- void *ns;
char name[50];
int res = -EACCES;
task = get_proc_task(inode);
if (!task)
- goto out;
-
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
- goto out_put_task;
+ return res;
- res = -ENOENT;
- ns = ns_ops->get(task);
- if (!ns)
- goto out_put_task;
-
- snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
- res = readlink_copy(buffer, buflen, name);
- ns_ops->put(ns);
-out_put_task:
+ if (ptrace_may_access(task, PTRACE_MODE_READ)) {
+ res = ns_get_name(name, sizeof(name), task, ns_ops);
+ if (res >= 0)
+ res = readlink_copy(buffer, buflen, name);
+ }
put_task_struct(task);
-out:
return res;
}
@@ -189,7 +92,7 @@ static int proc_ns_instantiate(struct inode *dir,
ei = PROC_I(inode);
inode->i_mode = S_IFLNK|S_IRWXUGO;
inode->i_op = &proc_ns_link_inode_operations;
- ei->ns.ns_ops = ns_ops;
+ ei->ns_ops = ns_ops;
d_set_d_op(dentry, &pid_dentry_operations);
d_add(dentry, inode);
@@ -267,31 +170,3 @@ const struct inode_operations proc_ns_dir_inode_operations = {
.getattr = pid_getattr,
.setattr = proc_setattr,
};
-
-struct file *proc_ns_fget(int fd)
-{
- struct file *file;
-
- file = fget(fd);
- if (!file)
- return ERR_PTR(-EBADF);
-
- if (file->f_op != &ns_file_operations)
- goto out_invalid;
-
- return file;
-
-out_invalid:
- fput(file);
- return ERR_PTR(-EINVAL);
-}
-
-struct proc_ns *get_proc_ns(struct inode *inode)
-{
- return &PROC_I(inode)->ns;
-}
-
-bool proc_ns_inode(struct inode *inode)
-{
- return inode->i_fop == &ns_file_operations;
-}
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index a63af3e0a612..1bde894bc624 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -192,6 +192,7 @@ static __net_init int proc_net_ns_init(struct net *net)
if (!netd)
goto out;
+ netd->subdir = RB_ROOT;
netd->data = net;
netd->nlink = 2;
netd->namelen = 3;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 094e44d4a6be..e74ac9f1a2c0 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -251,6 +251,7 @@ struct proc_dir_entry proc_root = {
.proc_iops = &proc_root_inode_operations,
.proc_fops = &proc_root_operations,
.parent = &proc_root,
+ .subdir = RB_ROOT,
.name = "/proc",
};
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index bf2d03f8fd3e..510413eb25b8 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -159,7 +159,7 @@ static int show_stat(struct seq_file *p, void *v)
/* sum again ? it could be updated? */
for_each_irq_nr(j)
- seq_put_decimal_ull(p, ' ', kstat_irqs(j));
+ seq_put_decimal_ull(p, ' ', kstat_irqs_usr(j));
seq_printf(p,
"\nctxt %llu\n"
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4e0388cffe3d..246eae84b13b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -447,59 +447,92 @@ struct mem_size_stats {
u64 pss;
};
+static void smaps_account(struct mem_size_stats *mss, struct page *page,
+ unsigned long size, bool young, bool dirty)
+{
+ int mapcount;
+
+ if (PageAnon(page))
+ mss->anonymous += size;
-static void smaps_pte_entry(pte_t ptent, unsigned long addr,
- unsigned long ptent_size, struct mm_walk *walk)
+ mss->resident += size;
+ /* Accumulate the size in pages that have been accessed. */
+ if (young || PageReferenced(page))
+ mss->referenced += size;
+ mapcount = page_mapcount(page);
+ if (mapcount >= 2) {
+ u64 pss_delta;
+
+ if (dirty || PageDirty(page))
+ mss->shared_dirty += size;
+ else
+ mss->shared_clean += size;
+ pss_delta = (u64)size << PSS_SHIFT;
+ do_div(pss_delta, mapcount);
+ mss->pss += pss_delta;
+ } else {
+ if (dirty || PageDirty(page))
+ mss->private_dirty += size;
+ else
+ mss->private_clean += size;
+ mss->pss += (u64)size << PSS_SHIFT;
+ }
+}
+
+static void smaps_pte_entry(pte_t *pte, unsigned long addr,
+ struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = mss->vma;
pgoff_t pgoff = linear_page_index(vma, addr);
struct page *page = NULL;
- int mapcount;
- if (pte_present(ptent)) {
- page = vm_normal_page(vma, addr, ptent);
- } else if (is_swap_pte(ptent)) {
- swp_entry_t swpent = pte_to_swp_entry(ptent);
+ if (pte_present(*pte)) {
+ page = vm_normal_page(vma, addr, *pte);
+ } else if (is_swap_pte(*pte)) {
+ swp_entry_t swpent = pte_to_swp_entry(*pte);
if (!non_swap_entry(swpent))
- mss->swap += ptent_size;
+ mss->swap += PAGE_SIZE;
else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
- } else if (pte_file(ptent)) {
- if (pte_to_pgoff(ptent) != pgoff)
- mss->nonlinear += ptent_size;
+ } else if (pte_file(*pte)) {
+ if (pte_to_pgoff(*pte) != pgoff)
+ mss->nonlinear += PAGE_SIZE;
}
if (!page)
return;
- if (PageAnon(page))
- mss->anonymous += ptent_size;
-
if (page->index != pgoff)
- mss->nonlinear += ptent_size;
+ mss->nonlinear += PAGE_SIZE;
- mss->resident += ptent_size;
- /* Accumulate the size in pages that have been accessed. */
- if (pte_young(ptent) || PageReferenced(page))
- mss->referenced += ptent_size;
- mapcount = page_mapcount(page);
- if (mapcount >= 2) {
- if (pte_dirty(ptent) || PageDirty(page))
- mss->shared_dirty += ptent_size;
- else
- mss->shared_clean += ptent_size;
- mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
- } else {
- if (pte_dirty(ptent) || PageDirty(page))
- mss->private_dirty += ptent_size;
- else
- mss->private_clean += ptent_size;
- mss->pss += (ptent_size << PSS_SHIFT);
- }
+ smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = mss->vma;
+ struct page *page;
+
+ /* FOLL_DUMP will return -EFAULT on huge zero page */
+ page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+ if (IS_ERR_OR_NULL(page))
+ return;
+ mss->anonymous_thp += HPAGE_PMD_SIZE;
+ smaps_account(mss, page, HPAGE_PMD_SIZE,
+ pmd_young(*pmd), pmd_dirty(*pmd));
+}
+#else
+static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
+ struct mm_walk *walk)
+{
+}
+#endif
+
static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
@@ -509,9 +542,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
spinlock_t *ptl;
if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
+ smaps_pmd_entry(pmd, addr, walk);
spin_unlock(ptl);
- mss->anonymous_thp += HPAGE_PMD_SIZE;
return 0;
}
@@ -524,7 +556,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
*/
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
- smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
+ smaps_pte_entry(pte, addr, walk);
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
return 0;
@@ -552,6 +584,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_GROWSDOWN)] = "gd",
[ilog2(VM_PFNMAP)] = "pf",
[ilog2(VM_DENYWRITE)] = "dw",
+#ifdef CONFIG_X86_INTEL_MPX
+ [ilog2(VM_MPX)] = "mp",
+#endif
[ilog2(VM_LOCKED)] = "lo",
[ilog2(VM_IO)] = "io",
[ilog2(VM_SEQ_READ)] = "sr",
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index fafb7a02a5d6..50416602774d 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -36,6 +36,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/uaccess.h>
+#include <linux/syslog.h>
#include "internal.h"
@@ -120,6 +121,18 @@ static const struct seq_operations pstore_ftrace_seq_ops = {
.show = pstore_ftrace_seq_show,
};
+static int pstore_check_syslog_permissions(struct pstore_private *ps)
+{
+ switch (ps->type) {
+ case PSTORE_TYPE_DMESG:
+ case PSTORE_TYPE_CONSOLE:
+ return check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+ SYSLOG_FROM_READER);
+ default:
+ return 0;
+ }
+}
+
static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
size_t count, loff_t *ppos)
{
@@ -138,6 +151,10 @@ static int pstore_file_open(struct inode *inode, struct file *file)
int err;
const struct seq_operations *sops = NULL;
+ err = pstore_check_syslog_permissions(ps);
+ if (err)
+ return err;
+
if (ps->type == PSTORE_TYPE_FTRACE)
sops = &pstore_ftrace_seq_ops;
@@ -174,6 +191,11 @@ static const struct file_operations pstore_file_operations = {
static int pstore_unlink(struct inode *dir, struct dentry *dentry)
{
struct pstore_private *p = dentry->d_inode->i_private;
+ int err;
+
+ err = pstore_check_syslog_permissions(p);
+ if (err)
+ return err;
if (p->psi->erase)
p->psi->erase(p->type, p->id, p->count,
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 3b5744306ed8..8613e5b35c22 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -61,6 +61,11 @@ module_param(mem_size, ulong, 0400);
MODULE_PARM_DESC(mem_size,
"size of reserved RAM used to store oops/panic logs");
+static unsigned int mem_type;
+module_param(mem_type, uint, 0600);
+MODULE_PARM_DESC(mem_type,
+ "set to 1 to try to use unbuffered memory (default 0)");
+
static int dump_oops = 1;
module_param(dump_oops, int, 0600);
MODULE_PARM_DESC(dump_oops,
@@ -79,6 +84,7 @@ struct ramoops_context {
struct persistent_ram_zone *fprz;
phys_addr_t phys_addr;
unsigned long size;
+ unsigned int memtype;
size_t record_size;
size_t console_size;
size_t ftrace_size;
@@ -135,25 +141,27 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
return prz;
}
-static void ramoops_read_kmsg_hdr(char *buffer, struct timespec *time,
+static int ramoops_read_kmsg_hdr(char *buffer, struct timespec *time,
bool *compressed)
{
char data_type;
+ int header_length = 0;
- if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n",
- &time->tv_sec, &time->tv_nsec, &data_type) == 3) {
+ if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n%n", &time->tv_sec,
+ &time->tv_nsec, &data_type, &header_length) == 3) {
if (data_type == 'C')
*compressed = true;
else
*compressed = false;
- } else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu\n",
- &time->tv_sec, &time->tv_nsec) == 2) {
+ } else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu\n%n",
+ &time->tv_sec, &time->tv_nsec, &header_length) == 2) {
*compressed = false;
} else {
time->tv_sec = 0;
time->tv_nsec = 0;
*compressed = false;
}
+ return header_length;
}
static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
@@ -165,6 +173,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
ssize_t ecc_notice_size;
struct ramoops_context *cxt = psi->data;
struct persistent_ram_zone *prz;
+ int header_length;
prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
cxt->max_dump_cnt, id, type,
@@ -178,7 +187,13 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
if (!prz)
return 0;
+ if (!persistent_ram_old(prz))
+ return 0;
+
size = persistent_ram_old_size(prz);
+ header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), time,
+ compressed);
+ size -= header_length;
/* ECC correction notice */
ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
@@ -187,8 +202,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
if (*buf == NULL)
return -ENOMEM;
- memcpy(*buf, persistent_ram_old(prz), size);
- ramoops_read_kmsg_hdr(*buf, time, compressed);
+ memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size);
persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1);
return size + ecc_notice_size;
@@ -358,7 +372,8 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
size_t sz = cxt->record_size;
cxt->przs[i] = persistent_ram_new(*paddr, sz, 0,
- &cxt->ecc_info);
+ &cxt->ecc_info,
+ cxt->memtype);
if (IS_ERR(cxt->przs[i])) {
err = PTR_ERR(cxt->przs[i]);
dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
@@ -388,7 +403,7 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
return -ENOMEM;
}
- *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info);
+ *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype);
if (IS_ERR(*prz)) {
int err = PTR_ERR(*prz);
@@ -435,6 +450,7 @@ static int ramoops_probe(struct platform_device *pdev)
cxt->size = pdata->mem_size;
cxt->phys_addr = pdata->mem_address;
+ cxt->memtype = pdata->mem_type;
cxt->record_size = pdata->record_size;
cxt->console_size = pdata->console_size;
cxt->ftrace_size = pdata->ftrace_size;
@@ -545,7 +561,6 @@ static struct platform_driver ramoops_driver = {
.remove = __exit_p(ramoops_remove),
.driver = {
.name = "ramoops",
- .owner = THIS_MODULE,
},
};
@@ -564,6 +579,7 @@ static void ramoops_register_dummy(void)
dummy_data->mem_size = mem_size;
dummy_data->mem_address = mem_address;
+ dummy_data->mem_type = 0;
dummy_data->record_size = record_size;
dummy_data->console_size = ramoops_console_size;
dummy_data->ftrace_size = ramoops_ftrace_size;
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 9d7b9a83699e..76c3f80efdfa 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -380,7 +380,8 @@ void persistent_ram_zap(struct persistent_ram_zone *prz)
persistent_ram_update_header_ecc(prz);
}
-static void *persistent_ram_vmap(phys_addr_t start, size_t size)
+static void *persistent_ram_vmap(phys_addr_t start, size_t size,
+ unsigned int memtype)
{
struct page **pages;
phys_addr_t page_start;
@@ -392,7 +393,10 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size)
page_start = start - offset_in_page(start);
page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE);
- prot = pgprot_noncached(PAGE_KERNEL);
+ if (memtype)
+ prot = pgprot_noncached(PAGE_KERNEL);
+ else
+ prot = pgprot_writecombine(PAGE_KERNEL);
pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
if (!pages) {
@@ -411,8 +415,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size)
return vaddr;
}
-static void *persistent_ram_iomap(phys_addr_t start, size_t size)
+static void *persistent_ram_iomap(phys_addr_t start, size_t size,
+ unsigned int memtype)
{
+ void *va;
+
if (!request_mem_region(start, size, "persistent_ram")) {
pr_err("request mem region (0x%llx@0x%llx) failed\n",
(unsigned long long)size, (unsigned long long)start);
@@ -422,19 +429,24 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size)
buffer_start_add = buffer_start_add_locked;
buffer_size_add = buffer_size_add_locked;
- return ioremap(start, size);
+ if (memtype)
+ va = ioremap(start, size);
+ else
+ va = ioremap_wc(start, size);
+
+ return va;
}
static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
- struct persistent_ram_zone *prz)
+ struct persistent_ram_zone *prz, int memtype)
{
prz->paddr = start;
prz->size = size;
if (pfn_valid(start >> PAGE_SHIFT))
- prz->vaddr = persistent_ram_vmap(start, size);
+ prz->vaddr = persistent_ram_vmap(start, size, memtype);
else
- prz->vaddr = persistent_ram_iomap(start, size);
+ prz->vaddr = persistent_ram_iomap(start, size, memtype);
if (!prz->vaddr) {
pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__,
@@ -500,7 +512,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
}
struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
- u32 sig, struct persistent_ram_ecc_info *ecc_info)
+ u32 sig, struct persistent_ram_ecc_info *ecc_info,
+ unsigned int memtype)
{
struct persistent_ram_zone *prz;
int ret = -ENOMEM;
@@ -511,7 +524,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
goto err;
}
- ret = persistent_ram_buffer_map(start, size, prz);
+ ret = persistent_ram_buffer_map(start, size, prz, memtype);
if (ret)
goto err;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 8b663b2d9562..8f0acef3d184 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -634,7 +634,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
dqstats_inc(DQST_LOOKUPS);
err = sb->dq_op->write_dquot(dquot);
if (!ret && err)
- err = ret;
+ ret = err;
dqput(dquot);
spin_lock(&dq_list_lock);
}
@@ -893,6 +893,11 @@ out:
}
EXPORT_SYMBOL(dqget);
+static inline struct dquot **i_dquot(struct inode *inode)
+{
+ return inode->i_sb->s_op->get_dquots(inode);
+}
+
static int dqinit_needed(struct inode *inode, int type)
{
int cnt;
@@ -900,9 +905,9 @@ static int dqinit_needed(struct inode *inode, int type)
if (IS_NOQUOTA(inode))
return 0;
if (type != -1)
- return !inode->i_dquot[type];
+ return !i_dquot(inode)[type];
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
- if (!inode->i_dquot[cnt])
+ if (!i_dquot(inode)[cnt])
return 1;
return 0;
}
@@ -965,9 +970,9 @@ static void add_dquot_ref(struct super_block *sb, int type)
static void remove_inode_dquot_ref(struct inode *inode, int type,
struct list_head *tofree_head)
{
- struct dquot *dquot = inode->i_dquot[type];
+ struct dquot *dquot = i_dquot(inode)[type];
- inode->i_dquot[type] = NULL;
+ i_dquot(inode)[type] = NULL;
if (!dquot)
return;
@@ -1402,7 +1407,7 @@ static void __dquot_initialize(struct inode *inode, int type)
* we check it without locking here to avoid unnecessary
* dqget()/dqput() calls.
*/
- if (inode->i_dquot[cnt])
+ if (i_dquot(inode)[cnt])
continue;
init_needed = 1;
@@ -1433,8 +1438,8 @@ static void __dquot_initialize(struct inode *inode, int type)
/* We could race with quotaon or dqget() could have failed */
if (!got[cnt])
continue;
- if (!inode->i_dquot[cnt]) {
- inode->i_dquot[cnt] = got[cnt];
+ if (!i_dquot(inode)[cnt]) {
+ i_dquot(inode)[cnt] = got[cnt];
got[cnt] = NULL;
/*
* Make quota reservation system happy if someone
@@ -1442,7 +1447,7 @@ static void __dquot_initialize(struct inode *inode, int type)
*/
rsv = inode_get_rsv_space(inode);
if (unlikely(rsv))
- dquot_resv_space(inode->i_dquot[cnt], rsv);
+ dquot_resv_space(i_dquot(inode)[cnt], rsv);
}
}
out_err:
@@ -1472,8 +1477,8 @@ static void __dquot_drop(struct inode *inode)
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- put[cnt] = inode->i_dquot[cnt];
- inode->i_dquot[cnt] = NULL;
+ put[cnt] = i_dquot(inode)[cnt];
+ i_dquot(inode)[cnt] = NULL;
}
spin_unlock(&dq_data_lock);
dqput_all(put);
@@ -1494,7 +1499,7 @@ void dquot_drop(struct inode *inode)
* add quota pointers back anyway.
*/
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (inode->i_dquot[cnt])
+ if (i_dquot(inode)[cnt])
break;
}
@@ -1595,7 +1600,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
{
int cnt, ret = 0, index;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot **dquots = inode->i_dquot;
+ struct dquot **dquots = i_dquot(inode);
int reserve = flags & DQUOT_SPACE_RESERVE;
if (!dquot_active(inode)) {
@@ -1643,11 +1648,11 @@ EXPORT_SYMBOL(__dquot_alloc_space);
/*
* This operation can block, but only after everything is updated
*/
-int dquot_alloc_inode(const struct inode *inode)
+int dquot_alloc_inode(struct inode *inode)
{
int cnt, ret = 0, index;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot * const *dquots = inode->i_dquot;
+ struct dquot * const *dquots = i_dquot(inode);
if (!dquot_active(inode))
return 0;
@@ -1696,14 +1701,14 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
spin_lock(&dq_data_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (inode->i_dquot[cnt])
- dquot_claim_reserved_space(inode->i_dquot[cnt],
+ if (i_dquot(inode)[cnt])
+ dquot_claim_reserved_space(i_dquot(inode)[cnt],
number);
}
/* Update inode bytes */
inode_claim_rsv_space(inode, number);
spin_unlock(&dq_data_lock);
- mark_all_dquot_dirty(inode->i_dquot);
+ mark_all_dquot_dirty(i_dquot(inode));
srcu_read_unlock(&dquot_srcu, index);
return 0;
}
@@ -1725,14 +1730,14 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
spin_lock(&dq_data_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (inode->i_dquot[cnt])
- dquot_reclaim_reserved_space(inode->i_dquot[cnt],
+ if (i_dquot(inode)[cnt])
+ dquot_reclaim_reserved_space(i_dquot(inode)[cnt],
number);
}
/* Update inode bytes */
inode_reclaim_rsv_space(inode, number);
spin_unlock(&dq_data_lock);
- mark_all_dquot_dirty(inode->i_dquot);
+ mark_all_dquot_dirty(i_dquot(inode));
srcu_read_unlock(&dquot_srcu, index);
return;
}
@@ -1745,7 +1750,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
{
unsigned int cnt;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot **dquots = inode->i_dquot;
+ struct dquot **dquots = i_dquot(inode);
int reserve = flags & DQUOT_SPACE_RESERVE, index;
if (!dquot_active(inode)) {
@@ -1784,11 +1789,11 @@ EXPORT_SYMBOL(__dquot_free_space);
/*
* This operation can block, but only after everything is updated
*/
-void dquot_free_inode(const struct inode *inode)
+void dquot_free_inode(struct inode *inode)
{
unsigned int cnt;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot * const *dquots = inode->i_dquot;
+ struct dquot * const *dquots = i_dquot(inode);
int index;
if (!dquot_active(inode))
@@ -1865,7 +1870,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
if (!sb_has_quota_active(inode->i_sb, cnt))
continue;
is_valid[cnt] = 1;
- transfer_from[cnt] = inode->i_dquot[cnt];
+ transfer_from[cnt] = i_dquot(inode)[cnt];
ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]);
if (ret)
goto over_quota;
@@ -1901,7 +1906,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
dquot_incr_space(transfer_to[cnt], cur_space);
dquot_resv_space(transfer_to[cnt], rsv_space);
- inode->i_dquot[cnt] = transfer_to[cnt];
+ i_dquot(inode)[cnt] = transfer_to[cnt];
}
spin_unlock(&dq_data_lock);
@@ -2743,8 +2748,8 @@ static int __init dquot_init(void)
for (i = 0; i < nr_hash; i++)
INIT_HLIST_HEAD(dquot_hash + i);
- printk("Dquot-cache hash table entries: %ld (order %ld, %ld bytes)\n",
- nr_hash, order, (PAGE_SIZE << order));
+ pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
+ " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));
register_shrinker(&dqcache_shrinker);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 75621649dbd7..2aa4151f99d2 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -47,8 +47,11 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
static void quota_sync_one(struct super_block *sb, void *arg)
{
- if (sb->s_qcop && sb->s_qcop->quota_sync)
- sb->s_qcop->quota_sync(sb, *(int *)arg);
+ int type = *(int *)arg;
+
+ if (sb->s_qcop && sb->s_qcop->quota_sync &&
+ (sb->s_quota_types & (1 << type)))
+ sb->s_qcop->quota_sync(sb, type);
}
static int quota_sync_all(int type)
@@ -297,8 +300,14 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS))
return -EINVAL;
+ /*
+ * Quota not supported on this fs? Check this before s_quota_types
+ * since they needn't be set if quota is not supported at all.
+ */
if (!sb->s_qcop)
return -ENOSYS;
+ if (!(sb->s_quota_types & (1 << type)))
+ return -EINVAL;
ret = check_quotactl_permission(sb, type, cmd, id);
if (ret < 0)
diff --git a/fs/read_write.c b/fs/read_write.c
index 7d9318c3d43c..c0805c93b6fa 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -412,6 +412,23 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p
EXPORT_SYMBOL(new_sync_read);
+ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
+ loff_t *pos)
+{
+ ssize_t ret;
+
+ if (file->f_op->read)
+ ret = file->f_op->read(file, buf, count, pos);
+ else if (file->f_op->aio_read)
+ ret = do_sync_read(file, buf, count, pos);
+ else if (file->f_op->read_iter)
+ ret = new_sync_read(file, buf, count, pos);
+ else
+ ret = -EINVAL;
+
+ return ret;
+}
+
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
@@ -426,12 +443,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
ret = rw_verify_area(READ, file, pos, count);
if (ret >= 0) {
count = ret;
- if (file->f_op->read)
- ret = file->f_op->read(file, buf, count, pos);
- else if (file->f_op->aio_read)
- ret = do_sync_read(file, buf, count, pos);
- else
- ret = new_sync_read(file, buf, count, pos);
+ ret = __vfs_read(file, buf, count, pos);
if (ret > 0) {
fsnotify_access(file);
add_rchar(current, ret);
diff --git a/fs/readdir.c b/fs/readdir.c
index 33fd92208cb7..ced679179cac 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -74,10 +74,11 @@ struct readdir_callback {
int result;
};
-static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset,
- u64 ino, unsigned int d_type)
+static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type)
{
- struct readdir_callback *buf = (struct readdir_callback *) __buf;
+ struct readdir_callback *buf =
+ container_of(ctx, struct readdir_callback, ctx);
struct old_linux_dirent __user * dirent;
unsigned long d_ino;
@@ -148,11 +149,12 @@ struct getdents_callback {
int error;
};
-static int filldir(void * __buf, const char * name, int namlen, loff_t offset,
- u64 ino, unsigned int d_type)
+static int filldir(struct dir_context *ctx, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type)
{
struct linux_dirent __user * dirent;
- struct getdents_callback * buf = (struct getdents_callback *) __buf;
+ struct getdents_callback *buf =
+ container_of(ctx, struct getdents_callback, ctx);
unsigned long d_ino;
int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
sizeof(long));
@@ -232,11 +234,12 @@ struct getdents_callback64 {
int error;
};
-static int filldir64(void * __buf, const char * name, int namlen, loff_t offset,
- u64 ino, unsigned int d_type)
+static int filldir64(struct dir_context *ctx, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type)
{
struct linux_dirent64 __user *dirent;
- struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf;
+ struct getdents_callback64 *buf =
+ container_of(ctx, struct getdents_callback64, ctx);
int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
sizeof(u64));
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index d571e173a990..9d6486d416a3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2772,7 +2772,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
if (journal_init_dev(sb, journal, j_dev_name) != 0) {
reiserfs_warning(sb, "sh-462",
- "unable to initialize jornal device");
+ "unable to initialize journal device");
goto free_and_return;
}
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 1894d96ccb7c..bb79cddf0a1f 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -97,6 +97,10 @@ struct reiserfs_inode_info {
#ifdef CONFIG_REISERFS_FS_XATTR
struct rw_semaphore i_xattr_sem;
#endif
+#ifdef CONFIG_QUOTA
+ struct dquot *i_dquot[MAXQUOTAS];
+#endif
+
struct inode vfs_inode;
};
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f1376c92cf74..71fbbe3e2dab 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -594,6 +594,10 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
return NULL;
atomic_set(&ei->openers, 0);
mutex_init(&ei->tailpack);
+#ifdef CONFIG_QUOTA
+ memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
+#endif
+
return &ei->vfs_inode;
}
@@ -750,6 +754,11 @@ static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
size_t, loff_t);
static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t,
loff_t);
+
+static struct dquot **reiserfs_get_dquots(struct inode *inode)
+{
+ return REISERFS_I(inode)->i_dquot;
+}
#endif
static const struct super_operations reiserfs_sops = {
@@ -768,6 +777,7 @@ static const struct super_operations reiserfs_sops = {
#ifdef CONFIG_QUOTA
.quota_read = reiserfs_quota_read,
.quota_write = reiserfs_quota_write,
+ .get_dquots = reiserfs_get_dquots,
#endif
};
@@ -1633,6 +1643,7 @@ static int read_super_block(struct super_block *s, int offset)
#ifdef CONFIG_QUOTA
s->s_qcop = &reiserfs_qctl_operations;
s->dq_op = &reiserfs_quota_operations;
+ s->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
#endif
/*
@@ -2161,6 +2172,9 @@ error_unlocked:
reiserfs_write_unlock(s);
}
+ if (sbi->commit_wq)
+ destroy_workqueue(sbi->commit_wq);
+
cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
reiserfs_free_bitmap_cache(s);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 7c36898af402..04b06146bae2 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -188,10 +188,11 @@ struct reiserfs_dentry_buf {
};
static int
-fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset,
- u64 ino, unsigned int d_type)
+fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
+ loff_t offset, u64 ino, unsigned int d_type)
{
- struct reiserfs_dentry_buf *dbuf = buf;
+ struct reiserfs_dentry_buf *dbuf =
+ container_of(ctx, struct reiserfs_dentry_buf, ctx);
struct dentry *dentry;
WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex));
@@ -209,9 +210,9 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset,
} else if (!dentry->d_inode) {
/* A directory entry exists, but no file? */
reiserfs_error(dentry->d_sb, "xattr-20003",
- "Corrupted directory: xattr %s listed but "
- "not found for file %s.\n",
- dentry->d_name.name, dbuf->xadir->d_name.name);
+ "Corrupted directory: xattr %pd listed but "
+ "not found for file %pd.\n",
+ dentry, dbuf->xadir);
dput(dentry);
return -EIO;
}
@@ -824,10 +825,12 @@ struct listxattr_buf {
struct dentry *dentry;
};
-static int listxattr_filler(void *buf, const char *name, int namelen,
- loff_t offset, u64 ino, unsigned int d_type)
+static int listxattr_filler(struct dir_context *ctx, const char *name,
+ int namelen, loff_t offset, u64 ino,
+ unsigned int d_type)
{
- struct listxattr_buf *b = (struct listxattr_buf *)buf;
+ struct listxattr_buf *b =
+ container_of(ctx, struct listxattr_buf, ctx);
size_t size;
if (name[0] != '.' ||
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 3857b720cb1b..dbf3a59c86bb 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -16,17 +16,6 @@
#include <asm/uaccess.h>
#include <asm/page.h>
-
-/*
- * seq_files have a buffer which can may overflow. When this happens a larger
- * buffer is reallocated and all the data will be printed again.
- * The overflow state is true when m->count == m->size.
- */
-static bool seq_overflow(struct seq_file *m)
-{
- return m->count == m->size;
-}
-
static void seq_set_overflow(struct seq_file *m)
{
m->count = m->size;
@@ -36,7 +25,11 @@ static void *seq_buf_alloc(unsigned long size)
{
void *buf;
- buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ /*
+ * __GFP_NORETRY to avoid oom-killings with high-order allocations -
+ * it's better to fall back to vmalloc() than to kill things.
+ */
+ buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
if (!buf && size > PAGE_SIZE)
buf = vmalloc(size);
return buf;
@@ -124,7 +117,7 @@ static int traverse(struct seq_file *m, loff_t offset)
error = 0;
m->count = 0;
}
- if (seq_overflow(m))
+ if (seq_has_overflowed(m))
goto Eoverflow;
if (pos + m->count > offset) {
m->from = offset - pos;
@@ -267,7 +260,7 @@ Fill:
break;
}
err = m->op->show(m, p);
- if (seq_overflow(m) || err) {
+ if (seq_has_overflowed(m) || err) {
m->count = offs;
if (likely(err <= 0))
break;
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 424b7b65321f..7e412ad74836 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -230,7 +230,7 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
}
#ifdef CONFIG_PROC_FS
-static int signalfd_show_fdinfo(struct seq_file *m, struct file *f)
+static void signalfd_show_fdinfo(struct seq_file *m, struct file *f)
{
struct signalfd_ctx *ctx = f->private_data;
sigset_t sigmask;
@@ -238,8 +238,6 @@ static int signalfd_show_fdinfo(struct seq_file *m, struct file *f)
sigmask = ctx->sigmask;
signotset(&sigmask);
render_sigset_t(m, "sigmask:\t", &sigmask);
-
- return 0;
}
#endif
diff --git a/fs/splice.c b/fs/splice.c
index f5cb9ba84510..75c6058eabf2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1330,6 +1330,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
return ret;
}
+EXPORT_SYMBOL(do_splice_direct);
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_inode_info *opipe,
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index b6fa8657dcbc..ffb093e72b6c 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -120,6 +120,21 @@ config SQUASHFS_ZLIB
If unsure, say Y.
+config SQUASHFS_LZ4
+ bool "Include support for LZ4 compressed file systems"
+ depends on SQUASHFS
+ select LZ4_DECOMPRESS
+ help
+ Saying Y here includes support for reading Squashfs file systems
+ compressed with LZ4 compression. LZ4 compression is mainly
+ aimed at embedded systems with slower CPUs where the overheads
+ of zlib are too high.
+
+ LZ4 is not the standard compression used in Squashfs and so most
+ file systems will be readable without selecting this option.
+
+ If unsure, say N.
+
config SQUASHFS_LZO
bool "Include support for LZO compressed file systems"
depends on SQUASHFS
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 4132520b4ff2..246a6f329d89 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -11,6 +11,7 @@ squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
+squashfs-$(CONFIG_SQUASHFS_LZ4) += lz4_wrapper.o
squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
squashfs-$(CONFIG_SQUASHFS_ZLIB) += zlib_wrapper.o
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index ac22fe73b0ad..e9034bf6e5ae 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -41,6 +41,12 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
};
+#ifndef CONFIG_SQUASHFS_LZ4
+static const struct squashfs_decompressor squashfs_lz4_comp_ops = {
+ NULL, NULL, NULL, NULL, LZ4_COMPRESSION, "lz4", 0
+};
+#endif
+
#ifndef CONFIG_SQUASHFS_LZO
static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
@@ -65,6 +71,7 @@ static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
static const struct squashfs_decompressor *decompressor[] = {
&squashfs_zlib_comp_ops,
+ &squashfs_lz4_comp_ops,
&squashfs_lzo_comp_ops,
&squashfs_xz_comp_ops,
&squashfs_lzma_unsupported_comp_ops,
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index af0985321808..a25713c031a5 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -46,6 +46,10 @@ static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk,
extern const struct squashfs_decompressor squashfs_xz_comp_ops;
#endif
+#ifdef CONFIG_SQUASHFS_LZ4
+extern const struct squashfs_decompressor squashfs_lz4_comp_ops;
+#endif
+
#ifdef CONFIG_SQUASHFS_LZO
extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
#endif
diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c
new file mode 100644
index 000000000000..c31e2bc9c081
--- /dev/null
+++ b/fs/squashfs/lz4_wrapper.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2013, 2014
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/lz4.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs.h"
+#include "decompressor.h"
+#include "page_actor.h"
+
+#define LZ4_LEGACY 1
+
+struct lz4_comp_opts {
+ __le32 version;
+ __le32 flags;
+};
+
+struct squashfs_lz4 {
+ void *input;
+ void *output;
+};
+
+
+static void *lz4_comp_opts(struct squashfs_sb_info *msblk,
+ void *buff, int len)
+{
+ struct lz4_comp_opts *comp_opts = buff;
+
+ /* LZ4 compressed filesystems always have compression options */
+ if (comp_opts == NULL || len < sizeof(*comp_opts))
+ return ERR_PTR(-EIO);
+
+ if (le32_to_cpu(comp_opts->version) != LZ4_LEGACY) {
+ /* LZ4 format currently used by the kernel is the 'legacy'
+ * format */
+ ERROR("Unknown LZ4 version\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ return NULL;
+}
+
+
+static void *lz4_init(struct squashfs_sb_info *msblk, void *buff)
+{
+ int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
+ struct squashfs_lz4 *stream;
+
+ stream = kzalloc(sizeof(*stream), GFP_KERNEL);
+ if (stream == NULL)
+ goto failed;
+ stream->input = vmalloc(block_size);
+ if (stream->input == NULL)
+ goto failed2;
+ stream->output = vmalloc(block_size);
+ if (stream->output == NULL)
+ goto failed3;
+
+ return stream;
+
+failed3:
+ vfree(stream->input);
+failed2:
+ kfree(stream);
+failed:
+ ERROR("Failed to initialise LZ4 decompressor\n");
+ return ERR_PTR(-ENOMEM);
+}
+
+
+static void lz4_free(void *strm)
+{
+ struct squashfs_lz4 *stream = strm;
+
+ if (stream) {
+ vfree(stream->input);
+ vfree(stream->output);
+ }
+ kfree(stream);
+}
+
+
+static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
+ struct buffer_head **bh, int b, int offset, int length,
+ struct squashfs_page_actor *output)
+{
+ struct squashfs_lz4 *stream = strm;
+ void *buff = stream->input, *data;
+ int avail, i, bytes = length, res;
+ size_t dest_len = output->length;
+
+ for (i = 0; i < b; i++) {
+ avail = min(bytes, msblk->devblksize - offset);
+ memcpy(buff, bh[i]->b_data + offset, avail);
+ buff += avail;
+ bytes -= avail;
+ offset = 0;
+ put_bh(bh[i]);
+ }
+
+ res = lz4_decompress_unknownoutputsize(stream->input, length,
+ stream->output, &dest_len);
+ if (res)
+ return -EIO;
+
+ bytes = dest_len;
+ data = squashfs_first_page(output);
+ buff = stream->output;
+ while (data) {
+ if (bytes <= PAGE_CACHE_SIZE) {
+ memcpy(data, buff, bytes);
+ break;
+ }
+ memcpy(data, buff, PAGE_CACHE_SIZE);
+ buff += PAGE_CACHE_SIZE;
+ bytes -= PAGE_CACHE_SIZE;
+ data = squashfs_next_page(output);
+ }
+ squashfs_finish_page(output);
+
+ return dest_len;
+}
+
+const struct squashfs_decompressor squashfs_lz4_comp_ops = {
+ .init = lz4_init,
+ .comp_opts = lz4_comp_opts,
+ .free = lz4_free,
+ .decompress = lz4_uncompress,
+ .id = LZ4_COMPRESSION,
+ .name = "lz4",
+ .supported = 1
+};
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 4b2beda49498..506f4ba5b983 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -240,6 +240,7 @@ struct meta_index {
#define LZMA_COMPRESSION 2
#define LZO_COMPRESSION 3
#define XZ_COMPRESSION 4
+#define LZ4_COMPRESSION 5
struct squashfs_super_block {
__le32 s_magic;
diff --git a/fs/sync.c b/fs/sync.c
index bdc729d80e5e..01d9f18a70b5 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -154,7 +154,7 @@ SYSCALL_DEFINE1(syncfs, int, fd)
if (!f.file)
return -EBADF;
- sb = f.file->f_dentry->d_sb;
+ sb = f.file->f_path.dentry->d_sb;
down_read(&sb->s_umount);
ret = sync_filesystem(sb);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e9ef59b3abb1..dfe928a9540f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -102,6 +102,22 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
return battr->read(of->file, kobj, battr, buf, pos, count);
}
+/* kernfs read callback for regular sysfs files with pre-alloc */
+static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
+ size_t count, loff_t pos)
+{
+ const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
+ struct kobject *kobj = of->kn->parent->priv;
+
+ /*
+ * If buf != of->prealloc_buf, we don't know how
+ * large it is, so cannot safely pass it to ->show
+ */
+ if (pos || WARN_ON_ONCE(buf != of->prealloc_buf))
+ return 0;
+ return ops->show(kobj, of->kn->priv, buf);
+}
+
/* kernfs write callback for regular sysfs files */
static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
size_t count, loff_t pos)
@@ -125,7 +141,7 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
if (size) {
if (size <= pos)
- return 0;
+ return -EFBIG;
count = min_t(ssize_t, count, size - pos);
}
if (!count)
@@ -184,6 +200,22 @@ static const struct kernfs_ops sysfs_file_kfops_rw = {
.write = sysfs_kf_write,
};
+static const struct kernfs_ops sysfs_prealloc_kfops_ro = {
+ .read = sysfs_kf_read,
+ .prealloc = true,
+};
+
+static const struct kernfs_ops sysfs_prealloc_kfops_wo = {
+ .write = sysfs_kf_write,
+ .prealloc = true,
+};
+
+static const struct kernfs_ops sysfs_prealloc_kfops_rw = {
+ .read = sysfs_kf_read,
+ .write = sysfs_kf_write,
+ .prealloc = true,
+};
+
static const struct kernfs_ops sysfs_bin_kfops_ro = {
.read = sysfs_kf_bin_read,
};
@@ -222,13 +254,22 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
kobject_name(kobj)))
return -EINVAL;
- if (sysfs_ops->show && sysfs_ops->store)
- ops = &sysfs_file_kfops_rw;
- else if (sysfs_ops->show)
- ops = &sysfs_file_kfops_ro;
- else if (sysfs_ops->store)
- ops = &sysfs_file_kfops_wo;
- else
+ if (sysfs_ops->show && sysfs_ops->store) {
+ if (mode & SYSFS_PREALLOC)
+ ops = &sysfs_prealloc_kfops_rw;
+ else
+ ops = &sysfs_file_kfops_rw;
+ } else if (sysfs_ops->show) {
+ if (mode & SYSFS_PREALLOC)
+ ops = &sysfs_prealloc_kfops_ro;
+ else
+ ops = &sysfs_file_kfops_ro;
+ } else if (sysfs_ops->store) {
+ if (mode & SYSFS_PREALLOC)
+ ops = &sysfs_prealloc_kfops_wo;
+ else
+ ops = &sysfs_file_kfops_wo;
+ } else
ops = &sysfs_file_kfops_empty;
size = PAGE_SIZE;
@@ -253,7 +294,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
if (!attr->ignore_lockdep)
key = attr->key ?: (struct lock_class_key *)&attr->skey;
#endif
- kn = __kernfs_create_file(parent, attr->name, mode, size, ops,
+ kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,
(void *)attr, ns, true, key);
if (IS_ERR(kn)) {
if (PTR_ERR(kn) == -EEXIST)
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b46ffa94372a..b94fa6c3c6eb 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -288,7 +288,7 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
}
#ifdef CONFIG_PROC_FS
-static int timerfd_show(struct seq_file *m, struct file *file)
+static void timerfd_show(struct seq_file *m, struct file *file)
{
struct timerfd_ctx *ctx = file->private_data;
struct itimerspec t;
@@ -298,18 +298,19 @@ static int timerfd_show(struct seq_file *m, struct file *file)
t.it_interval = ktime_to_timespec(ctx->tintv);
spin_unlock_irq(&ctx->wqh.lock);
- return seq_printf(m,
- "clockid: %d\n"
- "ticks: %llu\n"
- "settime flags: 0%o\n"
- "it_value: (%llu, %llu)\n"
- "it_interval: (%llu, %llu)\n",
- ctx->clockid, (unsigned long long)ctx->ticks,
- ctx->settime_flags,
- (unsigned long long)t.it_value.tv_sec,
- (unsigned long long)t.it_value.tv_nsec,
- (unsigned long long)t.it_interval.tv_sec,
- (unsigned long long)t.it_interval.tv_nsec);
+ seq_printf(m,
+ "clockid: %d\n"
+ "ticks: %llu\n"
+ "settime flags: 0%o\n"
+ "it_value: (%llu, %llu)\n"
+ "it_interval: (%llu, %llu)\n",
+ ctx->clockid,
+ (unsigned long long)ctx->ticks,
+ ctx->settime_flags,
+ (unsigned long long)t.it_value.tv_sec,
+ (unsigned long long)t.it_value.tv_nsec,
+ (unsigned long long)t.it_interval.tv_sec,
+ (unsigned long long)t.it_interval.tv_nsec);
}
#else
#define timerfd_show NULL
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index b5b593c45270..538519ee37d9 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -262,6 +262,7 @@ static int write_begin_slow(struct address_space *mapping,
if (err) {
unlock_page(page);
page_cache_release(page);
+ ubifs_release_budget(c, &req);
return err;
}
}
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index fb166e204441..f6ac3f29323c 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -571,7 +571,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
aligned_dlen = ALIGN(dlen, 8);
aligned_ilen = ALIGN(ilen, 8);
+
len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ;
+ /* Make sure to also account for extended attributes */
+ len += host_ui->data_len;
+
dent = kmalloc(len, GFP_NOFS);
if (!dent)
return -ENOMEM;
@@ -648,7 +652,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
ino_key_init(c, &ino_key, dir->i_ino);
ino_offs += aligned_ilen;
- err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ);
+ err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs,
+ UBIFS_INO_NODE_SZ + host_ui->data_len);
if (err)
goto out_ro;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e229315bbf7a..3ccb2f11fc76 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -2082,12 +2082,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
mutex_init(&sbi->s_alloc_mutex);
if (!udf_parse_options((char *)options, &uopt, false))
- goto error_out;
+ goto parse_options_failure;
if (uopt.flags & (1 << UDF_FLAG_UTF8) &&
uopt.flags & (1 << UDF_FLAG_NLS_MAP)) {
udf_err(sb, "utf8 cannot be combined with iocharset\n");
- goto error_out;
+ goto parse_options_failure;
}
#ifdef CONFIG_UDF_NLS
if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) {
@@ -2237,8 +2237,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
return 0;
error_out:
- if (sbi->s_vat_inode)
- iput(sbi->s_vat_inode);
+ iput(sbi->s_vat_inode);
+parse_options_failure:
#ifdef CONFIG_UDF_NLS
if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
unload_nls(sbi->s_nls_map);
@@ -2291,8 +2291,7 @@ static void udf_put_super(struct super_block *sb)
sbi = UDF_SB(sb);
- if (sbi->s_vat_inode)
- iput(sbi->s_vat_inode);
+ iput(sbi->s_vat_inode);
#ifdef CONFIG_UDF_NLS
if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
unload_nls(sbi->s_nls_map);
diff --git a/fs/xattr.c b/fs/xattr.c
index 64e83efb742d..4ef698549e31 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -405,16 +405,14 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
const void __user *,value, size_t, size, int, flags)
{
struct fd f = fdget(fd);
- struct dentry *dentry;
int error = -EBADF;
if (!f.file)
return error;
- dentry = f.file->f_path.dentry;
- audit_inode(NULL, dentry, 0);
+ audit_file(f.file);
error = mnt_want_write_file(f.file);
if (!error) {
- error = setxattr(dentry, name, value, size, flags);
+ error = setxattr(f.file->f_path.dentry, name, value, size, flags);
mnt_drop_write_file(f.file);
}
fdput(f);
@@ -509,7 +507,7 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
if (!f.file)
return error;
- audit_inode(NULL, f.file->f_path.dentry, 0);
+ audit_file(f.file);
error = getxattr(f.file->f_path.dentry, name, value, size);
fdput(f);
return error;
@@ -590,7 +588,7 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
if (!f.file)
return error;
- audit_inode(NULL, f.file->f_path.dentry, 0);
+ audit_file(f.file);
error = listxattr(f.file->f_path.dentry, list, size);
fdput(f);
return error;
@@ -651,16 +649,14 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
{
struct fd f = fdget(fd);
- struct dentry *dentry;
int error = -EBADF;
if (!f.file)
return error;
- dentry = f.file->f_path.dentry;
- audit_inode(NULL, dentry, 0);
+ audit_file(f.file);
error = mnt_want_write_file(f.file);
if (!error) {
- error = removexattr(dentry, name);
+ error = removexattr(f.file->f_path.dentry, name);
mnt_drop_write_file(f.file);
}
fdput(f);
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
deleted file mode 100644
index 6e247a99f5db..000000000000
--- a/fs/xfs/libxfs/xfs_ag.h
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_AG_H__
-#define __XFS_AG_H__
-
-/*
- * Allocation group header
- * This is divided into three structures, placed in sequential 512-byte
- * buffers after a copy of the superblock (also in a 512-byte buffer).
- */
-
-struct xfs_buf;
-struct xfs_mount;
-struct xfs_trans;
-
-#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */
-#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */
-#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */
-#define XFS_AGF_VERSION 1
-#define XFS_AGI_VERSION 1
-
-#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION)
-#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
-
-/*
- * Btree number 0 is bno, 1 is cnt. This value gives the size of the
- * arrays below.
- */
-#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1)
-
-/*
- * The second word of agf_levels in the first a.g. overlaps the EFS
- * superblock's magic number. Since the magic numbers valid for EFS
- * are > 64k, our value cannot be confused for an EFS superblock's.
- */
-
-typedef struct xfs_agf {
- /*
- * Common allocation group header information
- */
- __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */
- __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */
- __be32 agf_seqno; /* sequence # starting from 0 */
- __be32 agf_length; /* size in blocks of a.g. */
- /*
- * Freespace information
- */
- __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
- __be32 agf_spare0; /* spare field */
- __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
- __be32 agf_spare1; /* spare field */
-
- __be32 agf_flfirst; /* first freelist block's index */
- __be32 agf_fllast; /* last freelist block's index */
- __be32 agf_flcount; /* count of blocks in freelist */
- __be32 agf_freeblks; /* total free blocks */
-
- __be32 agf_longest; /* longest free space */
- __be32 agf_btreeblks; /* # of blocks held in AGF btrees */
- uuid_t agf_uuid; /* uuid of filesystem */
-
- /*
- * reserve some contiguous space for future logged fields before we add
- * the unlogged fields. This makes the range logging via flags and
- * structure offsets much simpler.
- */
- __be64 agf_spare64[16];
-
- /* unlogged fields, written during buffer writeback. */
- __be64 agf_lsn; /* last write sequence */
- __be32 agf_crc; /* crc of agf sector */
- __be32 agf_spare2;
-
- /* structure must be padded to 64 bit alignment */
-} xfs_agf_t;
-
-#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
-
-#define XFS_AGF_MAGICNUM 0x00000001
-#define XFS_AGF_VERSIONNUM 0x00000002
-#define XFS_AGF_SEQNO 0x00000004
-#define XFS_AGF_LENGTH 0x00000008
-#define XFS_AGF_ROOTS 0x00000010
-#define XFS_AGF_LEVELS 0x00000020
-#define XFS_AGF_FLFIRST 0x00000040
-#define XFS_AGF_FLLAST 0x00000080
-#define XFS_AGF_FLCOUNT 0x00000100
-#define XFS_AGF_FREEBLKS 0x00000200
-#define XFS_AGF_LONGEST 0x00000400
-#define XFS_AGF_BTREEBLKS 0x00000800
-#define XFS_AGF_UUID 0x00001000
-#define XFS_AGF_NUM_BITS 13
-#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
-
-#define XFS_AGF_FLAGS \
- { XFS_AGF_MAGICNUM, "MAGICNUM" }, \
- { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \
- { XFS_AGF_SEQNO, "SEQNO" }, \
- { XFS_AGF_LENGTH, "LENGTH" }, \
- { XFS_AGF_ROOTS, "ROOTS" }, \
- { XFS_AGF_LEVELS, "LEVELS" }, \
- { XFS_AGF_FLFIRST, "FLFIRST" }, \
- { XFS_AGF_FLLAST, "FLLAST" }, \
- { XFS_AGF_FLCOUNT, "FLCOUNT" }, \
- { XFS_AGF_FREEBLKS, "FREEBLKS" }, \
- { XFS_AGF_LONGEST, "LONGEST" }, \
- { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
- { XFS_AGF_UUID, "UUID" }
-
-/* disk block (xfs_daddr_t) in the AG */
-#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
-#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
-#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr))
-
-extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
-
-/*
- * Size of the unlinked inode hash table in the agi.
- */
-#define XFS_AGI_UNLINKED_BUCKETS 64
-
-typedef struct xfs_agi {
- /*
- * Common allocation group header information
- */
- __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */
- __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */
- __be32 agi_seqno; /* sequence # starting from 0 */
- __be32 agi_length; /* size in blocks of a.g. */
- /*
- * Inode information
- * Inodes are mapped by interpreting the inode number, so no
- * mapping data is needed here.
- */
- __be32 agi_count; /* count of allocated inodes */
- __be32 agi_root; /* root of inode btree */
- __be32 agi_level; /* levels in inode btree */
- __be32 agi_freecount; /* number of free inodes */
-
- __be32 agi_newino; /* new inode just allocated */
- __be32 agi_dirino; /* last directory inode chunk */
- /*
- * Hash table of inodes which have been unlinked but are
- * still being referenced.
- */
- __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
- /*
- * This marks the end of logging region 1 and start of logging region 2.
- */
- uuid_t agi_uuid; /* uuid of filesystem */
- __be32 agi_crc; /* crc of agi sector */
- __be32 agi_pad32;
- __be64 agi_lsn; /* last write sequence */
-
- __be32 agi_free_root; /* root of the free inode btree */
- __be32 agi_free_level;/* levels in free inode btree */
-
- /* structure must be padded to 64 bit alignment */
-} xfs_agi_t;
-
-#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
-
-#define XFS_AGI_MAGICNUM (1 << 0)
-#define XFS_AGI_VERSIONNUM (1 << 1)
-#define XFS_AGI_SEQNO (1 << 2)
-#define XFS_AGI_LENGTH (1 << 3)
-#define XFS_AGI_COUNT (1 << 4)
-#define XFS_AGI_ROOT (1 << 5)
-#define XFS_AGI_LEVEL (1 << 6)
-#define XFS_AGI_FREECOUNT (1 << 7)
-#define XFS_AGI_NEWINO (1 << 8)
-#define XFS_AGI_DIRINO (1 << 9)
-#define XFS_AGI_UNLINKED (1 << 10)
-#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
-#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
-#define XFS_AGI_FREE_ROOT (1 << 11)
-#define XFS_AGI_FREE_LEVEL (1 << 12)
-#define XFS_AGI_NUM_BITS_R2 13
-
-/* disk block (xfs_daddr_t) in the AG */
-#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
-#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
-#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr))
-
-extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, struct xfs_buf **bpp);
-
-/*
- * The third a.g. block contains the a.g. freelist, an array
- * of block pointers to blocks owned by the allocation btree code.
- */
-#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
-#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
-#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
-
-#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
- (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
- &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
- (__be32 *)(bp)->b_addr)
-
-/*
- * Size of the AGFL. For CRC-enabled filesystes we steal a couple of
- * slots in the beginning of the block for a proper header with the
- * location information and CRC.
- */
-#define XFS_AGFL_SIZE(mp) \
- (((mp)->m_sb.sb_sectsize - \
- (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
- sizeof(struct xfs_agfl) : 0)) / \
- sizeof(xfs_agblock_t))
-
-typedef struct xfs_agfl {
- __be32 agfl_magicnum;
- __be32 agfl_seqno;
- uuid_t agfl_uuid;
- __be64 agfl_lsn;
- __be32 agfl_crc;
- __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
-} xfs_agfl_t;
-
-#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
-
-/*
- * tags for inode radix tree
- */
-#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
- in xfs_inode_ag_iterator */
-#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
-#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
-
-#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
-#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
- (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
-#define XFS_MIN_FREELIST(a,mp) \
- (XFS_MIN_FREELIST_RAW( \
- be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
- be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
-#define XFS_MIN_FREELIST_PAG(pag,mp) \
- (XFS_MIN_FREELIST_RAW( \
- (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
- (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
-
-#define XFS_AGB_TO_FSB(mp,agno,agbno) \
- (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
-#define XFS_FSB_TO_AGNO(mp,fsbno) \
- ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
-#define XFS_FSB_TO_AGBNO(mp,fsbno) \
- ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
-#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
- ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
- (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
-#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
-
-/*
- * For checking for bad ranges of xfs_daddr_t's, covering multiple
- * allocation groups or a single xfs_daddr_t that's a superblock copy.
- */
-#define XFS_AG_CHECK_DADDR(mp,d,len) \
- ((len) == 1 ? \
- ASSERT((d) == XFS_SB_DADDR || \
- xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
- ASSERT(xfs_daddr_to_agno(mp, d) == \
- xfs_daddr_to_agno(mp, (d) + (len) - 1)))
-
-#endif /* __XFS_AG_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index eff34218f405..a6fbf4472017 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -23,7 +23,6 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index feacb061bab7..d1b4b6a5c894 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -231,4 +231,7 @@ xfs_alloc_get_rec(
xfs_extlen_t *len, /* output: length of extent */
int *stat); /* output: success/failure */
+int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index e0e83e24d3ef..59d521c09a17 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -22,7 +22,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 353fb425faef..0a472fbe06d4 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -22,8 +22,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -42,7 +40,6 @@
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
-#include "xfs_dinode.h"
/*
* xfs_attr.c
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index b1f73dbbf3d8..5d38e8b8a913 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -24,7 +24,6 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -41,7 +40,6 @@
#include "xfs_trace.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
-#include "xfs_dinode.h"
#include "xfs_dir2.h"
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 7510ab8058a4..20de88d1bf86 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -23,8 +23,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 79c981984dca..b5eb4743f75a 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -22,9 +22,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_inum.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -46,7 +44,6 @@
#include "xfs_trace.h"
#include "xfs_symlink.h"
#include "xfs_attr_leaf.h"
-#include "xfs_dinode.h"
#include "xfs_filestream.h"
@@ -5450,13 +5447,11 @@ xfs_bmse_merge(
struct xfs_btree_cur *cur,
int *logflags) /* output */
{
- struct xfs_ifork *ifp;
struct xfs_bmbt_irec got;
struct xfs_bmbt_irec left;
xfs_filblks_t blockcount;
int error, i;
- ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_bmbt_get_all(gotp, &got);
xfs_bmbt_get_all(leftp, &left);
blockcount = left.br_blockcount + got.br_blockcount;
@@ -5489,32 +5484,25 @@ xfs_bmse_merge(
error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
got.br_blockcount, &i);
if (error)
- goto out_error;
- XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
error = xfs_btree_delete(cur, &i);
if (error)
- goto out_error;
- XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
/* lookup and update size of the previous extent */
error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
left.br_blockcount, &i);
if (error)
- goto out_error;
- XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
left.br_blockcount = blockcount;
- error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock,
- left.br_blockcount, left.br_state);
- if (error)
- goto out_error;
-
- return 0;
-
-out_error:
- return error;
+ return xfs_bmbt_update(cur, left.br_startoff, left.br_startblock,
+ left.br_blockcount, left.br_state);
}
/*
@@ -5544,35 +5532,29 @@ xfs_bmse_shift_one(
startoff = got.br_startoff - offset_shift_fsb;
/* delalloc extents should be prevented by caller */
- XFS_WANT_CORRUPTED_GOTO(!isnullstartblock(got.br_startblock),
- out_error);
+ XFS_WANT_CORRUPTED_RETURN(!isnullstartblock(got.br_startblock));
/*
- * If this is the first extent in the file, make sure there's enough
- * room at the start of the file and jump right to the shift as there's
- * no left extent to merge.
+ * Check for merge if we've got an extent to the left, otherwise make
+ * sure there's enough room at the start of the file for the shift.
*/
- if (*current_ext == 0) {
- if (got.br_startoff < offset_shift_fsb)
- return -EINVAL;
- goto shift_extent;
- }
+ if (*current_ext) {
+ /* grab the left extent and check for a large enough hole */
+ leftp = xfs_iext_get_ext(ifp, *current_ext - 1);
+ xfs_bmbt_get_all(leftp, &left);
- /* grab the left extent and check for a large enough hole */
- leftp = xfs_iext_get_ext(ifp, *current_ext - 1);
- xfs_bmbt_get_all(leftp, &left);
+ if (startoff < left.br_startoff + left.br_blockcount)
+ return -EINVAL;
- if (startoff < left.br_startoff + left.br_blockcount)
+ /* check whether to merge the extent or shift it down */
+ if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) {
+ return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
+ *current_ext, gotp, leftp, cur,
+ logflags);
+ }
+ } else if (got.br_startoff < offset_shift_fsb)
return -EINVAL;
- /* check whether to merge the extent or shift it down */
- if (!xfs_bmse_can_merge(&left, &got, offset_shift_fsb))
- goto shift_extent;
-
- return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, *current_ext,
- gotp, leftp, cur, logflags);
-
-shift_extent:
/*
* Increment the extent index for the next iteration, update the start
* offset of the in-core extent and update the btree if applicable.
@@ -5589,18 +5571,11 @@ shift_extent:
got.br_blockcount, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
got.br_startoff = startoff;
- error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
+ return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
got.br_blockcount, got.br_state);
- if (error)
- return error;
-
- return 0;
-
-out_error:
- return error;
}
/*
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index fba753308f31..2c44c8e50782 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -22,8 +22,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
@@ -36,7 +34,6 @@
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
-#include "xfs_dinode.h"
/*
* Determine the extent state.
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 8fe6a93ff473..81cad433df85 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -22,8 +22,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index fd827530afec..9cb0115c6bd1 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -23,8 +23,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -514,7 +512,6 @@ xfs_da3_root_split(
struct xfs_buf *bp;
struct xfs_inode *dp;
struct xfs_trans *tp;
- struct xfs_mount *mp;
struct xfs_dir2_leaf *leaf;
xfs_dablk_t blkno;
int level;
@@ -534,7 +531,6 @@ xfs_da3_root_split(
dp = args->dp;
tp = args->trans;
- mp = state->mp;
error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
if (error)
return error;
@@ -2342,14 +2338,12 @@ xfs_da_shrink_inode(
xfs_inode_t *dp;
int done, error, w, count;
xfs_trans_t *tp;
- xfs_mount_t *mp;
trace_xfs_da_shrink_inode(args);
dp = args->dp;
w = args->whichfork;
tp = args->trans;
- mp = dp->i_mount;
count = args->geo->fsbcount;
for (;;) {
/*
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index 7e42fdfd2f1d..9d624a622946 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -22,8 +22,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
diff --git a/fs/xfs/libxfs/xfs_dinode.h b/fs/xfs/libxfs/xfs_dinode.h
deleted file mode 100644
index 623bbe8fd921..000000000000
--- a/fs/xfs/libxfs/xfs_dinode.h
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DINODE_H__
-#define __XFS_DINODE_H__
-
-#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
-#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3)
-
-typedef struct xfs_timestamp {
- __be32 t_sec; /* timestamp seconds */
- __be32 t_nsec; /* timestamp nanoseconds */
-} xfs_timestamp_t;
-
-/*
- * On-disk inode structure.
- *
- * This is just the header or "dinode core", the inode is expanded to fill a
- * variable size the leftover area split into a data and an attribute fork.
- * The format of the data and attribute fork depends on the format of the
- * inode as indicated by di_format and di_aformat. To access the data and
- * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
- * below.
- *
- * There is a very similar struct icdinode in xfs_inode which matches the
- * layout of the first 96 bytes of this structure, but is kept in native
- * format instead of big endian.
- *
- * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
- * padding field for v3 inodes.
- */
-typedef struct xfs_dinode {
- __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
- __be16 di_mode; /* mode and type of file */
- __u8 di_version; /* inode version */
- __u8 di_format; /* format of di_c data */
- __be16 di_onlink; /* old number of links to file */
- __be32 di_uid; /* owner's user id */
- __be32 di_gid; /* owner's group id */
- __be32 di_nlink; /* number of links to file */
- __be16 di_projid_lo; /* lower part of owner's project id */
- __be16 di_projid_hi; /* higher part owner's project id */
- __u8 di_pad[6]; /* unused, zeroed space */
- __be16 di_flushiter; /* incremented on flush */
- xfs_timestamp_t di_atime; /* time last accessed */
- xfs_timestamp_t di_mtime; /* time last modified */
- xfs_timestamp_t di_ctime; /* time created/inode modified */
- __be64 di_size; /* number of bytes in file */
- __be64 di_nblocks; /* # of direct & btree blocks used */
- __be32 di_extsize; /* basic/minimum extent size for file */
- __be32 di_nextents; /* number of extents in data fork */
- __be16 di_anextents; /* number of extents in attribute fork*/
- __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
- __s8 di_aformat; /* format of attr fork's data */
- __be32 di_dmevmask; /* DMIG event mask */
- __be16 di_dmstate; /* DMIG state info */
- __be16 di_flags; /* random flags, XFS_DIFLAG_... */
- __be32 di_gen; /* generation number */
-
- /* di_next_unlinked is the only non-core field in the old dinode */
- __be32 di_next_unlinked;/* agi unlinked list ptr */
-
- /* start of the extended dinode, writable fields */
- __le32 di_crc; /* CRC of the inode */
- __be64 di_changecount; /* number of attribute changes */
- __be64 di_lsn; /* flush sequence */
- __be64 di_flags2; /* more random flags */
- __u8 di_pad2[16]; /* more padding for future expansion */
-
- /* fields only written to during inode creation */
- xfs_timestamp_t di_crtime; /* time created */
- __be64 di_ino; /* inode number */
- uuid_t di_uuid; /* UUID of the filesystem */
-
- /* structure must be padded to 64 bit alignment */
-} xfs_dinode_t;
-
-#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
-
-#define DI_MAX_FLUSH 0xffff
-
-/*
- * Size of the core inode on disk. Version 1 and 2 inodes have
- * the same size, but version 3 has grown a few additional fields.
- */
-static inline uint xfs_dinode_size(int version)
-{
- if (version == 3)
- return sizeof(struct xfs_dinode);
- return offsetof(struct xfs_dinode, di_crc);
-}
-
-/*
- * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
- * Since the pathconf interface is signed, we use 2^31 - 1 instead.
- * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
- */
-#define XFS_MAXLINK ((1U << 31) - 1U)
-#define XFS_MAXLINK_1 65535U
-
-/*
- * Values for di_format
- */
-typedef enum xfs_dinode_fmt {
- XFS_DINODE_FMT_DEV, /* xfs_dev_t */
- XFS_DINODE_FMT_LOCAL, /* bulk data */
- XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
- XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
- XFS_DINODE_FMT_UUID /* uuid_t */
-} xfs_dinode_fmt_t;
-
-/*
- * Inode minimum and maximum sizes.
- */
-#define XFS_DINODE_MIN_LOG 8
-#define XFS_DINODE_MAX_LOG 11
-#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG)
-#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG)
-
-/*
- * Inode size for given fs.
- */
-#define XFS_LITINO(mp, version) \
- ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
-
-/*
- * Inode data & attribute fork sizes, per inode.
- */
-#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0)
-#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
-
-#define XFS_DFORK_DSIZE(dip,mp) \
- (XFS_DFORK_Q(dip) ? \
- XFS_DFORK_BOFF(dip) : \
- XFS_LITINO(mp, (dip)->di_version))
-#define XFS_DFORK_ASIZE(dip,mp) \
- (XFS_DFORK_Q(dip) ? \
- XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
- 0)
-#define XFS_DFORK_SIZE(dip,mp,w) \
- ((w) == XFS_DATA_FORK ? \
- XFS_DFORK_DSIZE(dip, mp) : \
- XFS_DFORK_ASIZE(dip, mp))
-
-/*
- * Return pointers to the data or attribute forks.
- */
-#define XFS_DFORK_DPTR(dip) \
- ((char *)dip + xfs_dinode_size(dip->di_version))
-#define XFS_DFORK_APTR(dip) \
- (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
-#define XFS_DFORK_PTR(dip,w) \
- ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
-
-#define XFS_DFORK_FORMAT(dip,w) \
- ((w) == XFS_DATA_FORK ? \
- (dip)->di_format : \
- (dip)->di_aformat)
-#define XFS_DFORK_NEXTENTS(dip,w) \
- ((w) == XFS_DATA_FORK ? \
- be32_to_cpu((dip)->di_nextents) : \
- be16_to_cpu((dip)->di_anextents))
-
-#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr))
-
-/*
- * For block and character special files the 32bit dev_t is stored at the
- * beginning of the data fork.
- */
-static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
-{
- return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
-}
-
-static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
-{
- *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
-}
-
-/*
- * Values for di_flags
- * There should be a one-to-one correspondence between these flags and the
- * XFS_XFLAG_s.
- */
-#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */
-#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */
-#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */
-#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */
-#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */
-#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */
-#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */
-#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */
-#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */
-#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */
-#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */
-#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */
-#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
-#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */
-#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */
-#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
-#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
-#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
-#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT)
-#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT)
-#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT)
-#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT)
-#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT)
-#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT)
-#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT)
-#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
-#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT)
-#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
-#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT)
-#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT)
-
-#ifdef CONFIG_XFS_RT
-#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
-#else
-#define XFS_IS_REALTIME_INODE(ip) (0)
-#endif
-
-#define XFS_DIFLAG_ANY \
- (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
- XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
- XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
- XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
- XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
-
-#endif /* __XFS_DINODE_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 7075aaf131f4..a69fb3a1e161 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -20,9 +20,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -34,10 +31,25 @@
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_dinode.h"
struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
+/*
+ * @mode, if set, indicates that the type field needs to be set up.
+ * This uses the transformation from file mode to DT_* as defined in linux/fs.h
+ * for file type specification. This will be propagated into the directory
+ * structure if appropriate for the given operation and filesystem config.
+ */
+const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
+ [0] = XFS_DIR3_FT_UNKNOWN,
+ [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK,
+};
/*
* ASCII case-insensitive (ie. A-Z) support for directories that was
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 4dff261e6ed5..e55353651f5b 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -32,6 +32,12 @@ struct xfs_dir2_data_unused;
extern struct xfs_name xfs_name_dotdot;
/*
+ * directory filetype conversion tables.
+ */
+#define S_SHIFT 12
+extern const unsigned char xfs_mode_to_ftype[];
+
+/*
* directory operations vector for encode/decode routines
*/
struct xfs_dir_ops {
@@ -177,4 +183,138 @@ extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
+/*
+ * Directory offset/block conversion functions.
+ *
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
+
+/*
+ * Convert dataptr to byte in file space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
+{
+ return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
+}
+
+/*
+ * Convert byte in file space to dataptr. It had better be aligned.
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
+{
+ return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
+}
+
+/*
+ * Convert byte in space to (DB) block
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_db_t)(by >> geo->blklog);
+}
+
+/*
+ * Convert dataptr to a block number
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert byte in space to offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
+}
+
+/*
+ * Convert dataptr to a byte offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert block and offset to byte in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return ((xfs_dir2_off_t)db << geo->blklog) + o;
+}
+
+/*
+ * Convert block (DB) to block (dablk)
+ */
+static inline xfs_dablk_t
+xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert byte in space to (DA) block
+ */
+static inline xfs_dablk_t
+xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
+}
+
+/*
+ * Convert block and offset to dataptr
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
+}
+
+/*
+ * Convert block (dablk) to block (DB)
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+ return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert block (dablk) to byte offset in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+ return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
+}
+
+/*
+ * Directory tail pointer accessor functions. Based on block geometry.
+ */
+static inline struct xfs_dir2_block_tail *
+xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
+{
+ return ((struct xfs_dir2_block_tail *)
+ ((char *)hdr + geo->blksize)) - 1;
+}
+
+static inline struct xfs_dir2_leaf_tail *
+xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
+{
+ return (struct xfs_dir2_leaf_tail *)
+ ((char *)lp + geo->blksize -
+ sizeof(struct xfs_dir2_leaf_tail));
+}
+
#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 9628ceccfa02..9354e190b82e 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -36,7 +34,6 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
-#include "xfs_dinode.h"
/*
* Local function prototypes.
@@ -353,7 +350,6 @@ xfs_dir2_block_addname(
int low; /* low index for binary srch */
int lowstale; /* low stale index */
int mid=0; /* midpoint for binary srch */
- xfs_mount_t *mp; /* filesystem mount point */
int needlog; /* need to log header */
int needscan; /* need to rescan freespace */
__be16 *tagp; /* pointer to tag value */
@@ -363,7 +359,6 @@ xfs_dir2_block_addname(
dp = args->dp;
tp = args->trans;
- mp = dp->i_mount;
/* Read the (one and only) directory block into bp. */
error = xfs_dir3_block_read(tp, dp, &bp);
@@ -618,7 +613,6 @@ xfs_dir2_block_lookup(
xfs_inode_t *dp; /* incore inode */
int ent; /* entry index */
int error; /* error return value */
- xfs_mount_t *mp; /* filesystem mount point */
trace_xfs_dir2_block_lookup(args);
@@ -629,7 +623,6 @@ xfs_dir2_block_lookup(
if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
return error;
dp = args->dp;
- mp = dp->i_mount;
hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
btp = xfs_dir2_block_tail_p(args->geo, hdr);
@@ -770,7 +763,6 @@ xfs_dir2_block_removename(
xfs_inode_t *dp; /* incore inode */
int ent; /* block leaf entry index */
int error; /* error return value */
- xfs_mount_t *mp; /* filesystem mount point */
int needlog; /* need to log block header */
int needscan; /* need to fixup bestfree */
xfs_dir2_sf_hdr_t sfh; /* shortform header */
@@ -788,7 +780,6 @@ xfs_dir2_block_removename(
}
dp = args->dp;
tp = args->trans;
- mp = dp->i_mount;
hdr = bp->b_addr;
btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
@@ -852,7 +843,6 @@ xfs_dir2_block_replace(
xfs_inode_t *dp; /* incore inode */
int ent; /* leaf entry index */
int error; /* error return value */
- xfs_mount_t *mp; /* filesystem mount point */
trace_xfs_dir2_block_replace(args);
@@ -864,7 +854,6 @@ xfs_dir2_block_replace(
return error;
}
dp = args->dp;
- mp = dp->i_mount;
hdr = bp->b_addr;
btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index fdd803fecb8e..5ff31be9b1cd 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index a19174eb3cb2..106119955400 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -384,7 +382,6 @@ xfs_dir2_block_to_leaf(
xfs_dir2_db_t ldb; /* leaf block's bno */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */
- xfs_mount_t *mp; /* filesystem mount point */
int needlog; /* need to log block header */
int needscan; /* need to rescan bestfree */
xfs_trans_t *tp; /* transaction pointer */
@@ -395,7 +392,6 @@ xfs_dir2_block_to_leaf(
trace_xfs_dir2_block_to_leaf(args);
dp = args->dp;
- mp = dp->i_mount;
tp = args->trans;
/*
* Add the leaf block to the inode.
@@ -626,7 +622,6 @@ xfs_dir2_leaf_addname(
int lfloghigh; /* high leaf logging index */
int lowstale; /* index of prev stale leaf */
xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
- xfs_mount_t *mp; /* filesystem mount point */
int needbytes; /* leaf block bytes needed */
int needlog; /* need to log data header */
int needscan; /* need to rescan data free */
@@ -641,7 +636,6 @@ xfs_dir2_leaf_addname(
dp = args->dp;
tp = args->trans;
- mp = dp->i_mount;
error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
if (error)
@@ -1356,11 +1350,9 @@ xfs_dir2_leaf_removename(
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- xfs_mount_t *mp; /* filesystem mount point */
int needlog; /* need to log data header */
int needscan; /* need to rescan data frees */
xfs_dir2_data_off_t oldbest; /* old value of best free */
- xfs_trans_t *tp; /* transaction pointer */
struct xfs_dir2_data_free *bf; /* bestfree table */
struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
@@ -1374,8 +1366,6 @@ xfs_dir2_leaf_removename(
return error;
}
dp = args->dp;
- tp = args->trans;
- mp = dp->i_mount;
leaf = lbp->b_addr;
hdr = dbp->b_addr;
xfs_dir3_data_check(dp, dbp);
@@ -1607,11 +1597,9 @@ xfs_dir2_leaf_trim_data(
int error; /* error return value */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- xfs_mount_t *mp; /* filesystem mount point */
xfs_trans_t *tp; /* transaction pointer */
dp = args->dp;
- mp = dp->i_mount;
tp = args->trans;
/*
* Read the offending data block. We need its buffer.
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 2ae6ac2c11ae..41b80d3d3877 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -297,7 +295,6 @@ xfs_dir2_leaf_to_node(
int i; /* leaf freespace index */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- xfs_mount_t *mp; /* filesystem mount point */
int n; /* count of live freespc ents */
xfs_dir2_data_off_t off; /* freespace entry value */
__be16 *to; /* pointer to freespace entry */
@@ -307,7 +304,6 @@ xfs_dir2_leaf_to_node(
trace_xfs_dir2_leaf_to_node(args);
dp = args->dp;
- mp = dp->i_mount;
tp = args->trans;
/*
* Add a freespace block to the directory.
@@ -387,16 +383,12 @@ xfs_dir2_leafn_add(
int lfloghigh; /* high leaf entry logging */
int lfloglow; /* low leaf entry logging */
int lowstale; /* previous stale entry */
- xfs_mount_t *mp; /* filesystem mount point */
- xfs_trans_t *tp; /* transaction pointer */
struct xfs_dir3_icleaf_hdr leafhdr;
struct xfs_dir2_leaf_entry *ents;
trace_xfs_dir2_leafn_add(args, index);
dp = args->dp;
- mp = dp->i_mount;
- tp = args->trans;
leaf = bp->b_addr;
dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
ents = dp->d_ops->leaf_ents_p(leaf);
@@ -1170,7 +1162,6 @@ xfs_dir2_leafn_remove(
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
int longest; /* longest data free entry */
int off; /* data block entry offset */
- xfs_mount_t *mp; /* filesystem mount point */
int needlog; /* need to log data header */
int needscan; /* need to rescan data frees */
xfs_trans_t *tp; /* transaction pointer */
@@ -1182,7 +1173,6 @@ xfs_dir2_leafn_remove(
dp = args->dp;
tp = args->trans;
- mp = dp->i_mount;
leaf = bp->b_addr;
dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
ents = dp->d_ops->leaf_ents_p(leaf);
@@ -1323,7 +1313,6 @@ xfs_dir2_leafn_split(
xfs_da_args_t *args; /* operation arguments */
xfs_dablk_t blkno; /* new leaf block number */
int error; /* error return value */
- xfs_mount_t *mp; /* filesystem mount point */
struct xfs_inode *dp;
/*
@@ -1331,7 +1320,6 @@ xfs_dir2_leafn_split(
*/
args = state->args;
dp = args->dp;
- mp = dp->i_mount;
ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
error = xfs_da_grow_inode(args, &blkno);
if (error) {
@@ -2231,12 +2219,10 @@ xfs_dir2_node_trim_free(
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
xfs_dir2_free_t *free; /* freespace structure */
- xfs_mount_t *mp; /* filesystem mount point */
xfs_trans_t *tp; /* transaction pointer */
struct xfs_dir3_icfree_hdr freehdr;
dp = args->dp;
- mp = dp->i_mount;
tp = args->trans;
/*
* Read the freespace block.
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 27ce0794d196..ef9f6ead96a4 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -20,140 +20,6 @@
struct dir_context;
-/*
- * Directory offset/block conversion functions.
- *
- * DB blocks here are logical directory block numbers, not filesystem blocks.
- */
-
-/*
- * Convert dataptr to byte in file space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
-{
- return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
-}
-
-/*
- * Convert byte in file space to dataptr. It had better be aligned.
- */
-static inline xfs_dir2_dataptr_t
-xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
-{
- return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
-}
-
-/*
- * Convert byte in space to (DB) block
- */
-static inline xfs_dir2_db_t
-xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
-{
- return (xfs_dir2_db_t)(by >> geo->blklog);
-}
-
-/*
- * Convert dataptr to a block number
- */
-static inline xfs_dir2_db_t
-xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
-{
- return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
-}
-
-/*
- * Convert byte in space to offset in a block
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
-{
- return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
-}
-
-/*
- * Convert dataptr to a byte offset in a block
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
-{
- return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
-}
-
-/*
- * Convert block and offset to byte in space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
- xfs_dir2_data_aoff_t o)
-{
- return ((xfs_dir2_off_t)db << geo->blklog) + o;
-}
-
-/*
- * Convert block (DB) to block (dablk)
- */
-static inline xfs_dablk_t
-xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
- return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
-}
-
-/*
- * Convert byte in space to (DA) block
- */
-static inline xfs_dablk_t
-xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
-{
- return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
-}
-
-/*
- * Convert block and offset to dataptr
- */
-static inline xfs_dir2_dataptr_t
-xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
- xfs_dir2_data_aoff_t o)
-{
- return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
-}
-
-/*
- * Convert block (dablk) to block (DB)
- */
-static inline xfs_dir2_db_t
-xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
-{
- return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
-}
-
-/*
- * Convert block (dablk) to byte offset in space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
-{
- return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
-}
-
-/*
- * Directory tail pointer accessor functions. Based on block geometry.
- */
-static inline struct xfs_dir2_block_tail *
-xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
-{
- return ((struct xfs_dir2_block_tail *)
- ((char *)hdr + geo->blksize)) - 1;
-}
-
-static inline struct xfs_dir2_leaf_tail *
-xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
-{
- return (struct xfs_dir2_leaf_tail *)
- ((char *)lp + geo->blksize -
- sizeof(struct xfs_dir2_leaf_tail));
-}
-
/* xfs_dir2.c */
extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
@@ -161,12 +27,6 @@ extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
const unsigned char *name, int len);
-#define S_SHIFT 12
-extern const unsigned char xfs_mode_to_ftype[];
-
-extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
- __uint8_t filetype);
-
/* xfs_dir2_block.c */
extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 5079e051ef08..974d62e677f4 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -20,8 +20,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -32,7 +30,6 @@
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_trace.h"
-#include "xfs_dinode.h"
/*
* Prototypes for internal functions.
@@ -455,13 +452,11 @@ xfs_dir2_sf_addname_hard(
xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */
xfs_dir2_sf_entry_t *sfep; /* entry in new dir */
xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */
- struct xfs_mount *mp;
/*
* Copy the old directory to the stack buffer.
*/
dp = args->dp;
- mp = dp->i_mount;
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
old_isize = (int)dp->i_d.di_size;
@@ -542,7 +537,6 @@ xfs_dir2_sf_addname_pick(
xfs_inode_t *dp; /* incore directory inode */
int holefit; /* found hole it will fit in */
int i; /* entry number */
- xfs_mount_t *mp; /* filesystem mount point */
xfs_dir2_data_aoff_t offset; /* data block offset */
xfs_dir2_sf_entry_t *sfep; /* shortform entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
@@ -550,7 +544,6 @@ xfs_dir2_sf_addname_pick(
int used; /* data bytes used */
dp = args->dp;
- mp = dp->i_mount;
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
size = dp->d_ops->data_entsize(args->namelen);
@@ -616,10 +609,8 @@ xfs_dir2_sf_check(
int offset; /* data offset */
xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
- struct xfs_mount *mp;
dp = args->dp;
- mp = dp->i_mount;
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
offset = dp->d_ops->data_first_offset;
@@ -1016,12 +1007,10 @@ xfs_dir2_sf_toino4(
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* new sf entry */
xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
- struct xfs_mount *mp;
trace_xfs_dir2_sf_toino4(args);
dp = args->dp;
- mp = dp->i_mount;
/*
* Copy the old directory to the buffer.
@@ -1094,12 +1083,10 @@ xfs_dir2_sf_toino8(
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* new sf entry */
xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
- struct xfs_mount *mp;
trace_xfs_dir2_sf_toino8(args);
dp = args->dp;
- mp = dp->i_mount;
/*
* Copy the old directory to the buffer.
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index bb969337efc8..6fbf2d853a54 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -22,8 +22,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_quota.h"
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 7e42bba9a420..fbd6da263571 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -34,6 +34,1077 @@ struct xfs_buf;
struct xfs_ifork;
/*
+ * Super block
+ * Fits into a sector-sized buffer at address 0 of each allocation group.
+ * Only the first of these is ever updated except during growfs.
+ */
+#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */
+#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */
+#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */
+#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */
+#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */
+#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */
+#define XFS_SB_VERSION_NUMBITS 0x000f
+#define XFS_SB_VERSION_ALLFBITS 0xfff0
+#define XFS_SB_VERSION_ATTRBIT 0x0010
+#define XFS_SB_VERSION_NLINKBIT 0x0020
+#define XFS_SB_VERSION_QUOTABIT 0x0040
+#define XFS_SB_VERSION_ALIGNBIT 0x0080
+#define XFS_SB_VERSION_DALIGNBIT 0x0100
+#define XFS_SB_VERSION_SHAREDBIT 0x0200
+#define XFS_SB_VERSION_LOGV2BIT 0x0400
+#define XFS_SB_VERSION_SECTORBIT 0x0800
+#define XFS_SB_VERSION_EXTFLGBIT 0x1000
+#define XFS_SB_VERSION_DIRV2BIT 0x2000
+#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */
+#define XFS_SB_VERSION_MOREBITSBIT 0x8000
+
+/*
+ * Supported feature bit list is just all bits in the versionnum field because
+ * we've used them all up and understand them all. Except, of course, for the
+ * shared superblock bit, which nobody knows what it does and so is unsupported.
+ */
+#define XFS_SB_VERSION_OKBITS \
+ ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
+ ~XFS_SB_VERSION_SHAREDBIT)
+
+/*
+ * There are two words to hold XFS "feature" bits: the original
+ * word, sb_versionnum, and sb_features2. Whenever a bit is set in
+ * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
+ *
+ * These defines represent bits in sb_features2.
+ */
+#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
+#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
+#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
+#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
+#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
+#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
+#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
+#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */
+
+#define XFS_SB_VERSION2_OKBITS \
+ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
+ XFS_SB_VERSION2_ATTR2BIT | \
+ XFS_SB_VERSION2_PROJID32BIT | \
+ XFS_SB_VERSION2_FTYPE)
+
+/*
+ * Superblock - in core version. Must match the ondisk version below.
+ * Must be padded to 64 bit alignment.
+ */
+typedef struct xfs_sb {
+ __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
+ __uint32_t sb_blocksize; /* logical block size, bytes */
+ xfs_rfsblock_t sb_dblocks; /* number of data blocks */
+ xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
+ xfs_rtblock_t sb_rextents; /* number of realtime extents */
+ uuid_t sb_uuid; /* file system unique id */
+ xfs_fsblock_t sb_logstart; /* starting block of log if internal */
+ xfs_ino_t sb_rootino; /* root inode number */
+ xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
+ xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */
+ xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */
+ xfs_agblock_t sb_agblocks; /* size of an allocation group */
+ xfs_agnumber_t sb_agcount; /* number of allocation groups */
+ xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */
+ xfs_extlen_t sb_logblocks; /* number of log blocks */
+ __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */
+ __uint16_t sb_sectsize; /* volume sector size, bytes */
+ __uint16_t sb_inodesize; /* inode size, bytes */
+ __uint16_t sb_inopblock; /* inodes per block */
+ char sb_fname[12]; /* file system name */
+ __uint8_t sb_blocklog; /* log2 of sb_blocksize */
+ __uint8_t sb_sectlog; /* log2 of sb_sectsize */
+ __uint8_t sb_inodelog; /* log2 of sb_inodesize */
+ __uint8_t sb_inopblog; /* log2 of sb_inopblock */
+ __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */
+ __uint8_t sb_rextslog; /* log2 of sb_rextents */
+ __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */
+ __uint8_t sb_imax_pct; /* max % of fs for inode space */
+ /* statistics */
+ /*
+ * These fields must remain contiguous. If you really
+ * want to change their layout, make sure you fix the
+ * code in xfs_trans_apply_sb_deltas().
+ */
+ __uint64_t sb_icount; /* allocated inodes */
+ __uint64_t sb_ifree; /* free inodes */
+ __uint64_t sb_fdblocks; /* free data blocks */
+ __uint64_t sb_frextents; /* free realtime extents */
+ /*
+ * End contiguous fields.
+ */
+ xfs_ino_t sb_uquotino; /* user quota inode */
+ xfs_ino_t sb_gquotino; /* group quota inode */
+ __uint16_t sb_qflags; /* quota flags */
+ __uint8_t sb_flags; /* misc. flags */
+ __uint8_t sb_shared_vn; /* shared version number */
+ xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */
+ __uint32_t sb_unit; /* stripe or raid unit */
+ __uint32_t sb_width; /* stripe or raid width */
+ __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */
+ __uint8_t sb_logsectlog; /* log2 of the log sector size */
+ __uint16_t sb_logsectsize; /* sector size for the log, bytes */
+ __uint32_t sb_logsunit; /* stripe unit size for the log */
+ __uint32_t sb_features2; /* additional feature bits */
+
+ /*
+ * bad features2 field as a result of failing to pad the sb
+ * structure to 64 bits. Some machines will be using this field
+ * for features2 bits. Easiest just to mark it bad and not use
+ * it for anything else.
+ */
+ __uint32_t sb_bad_features2;
+
+ /* version 5 superblock fields start here */
+
+ /* feature masks */
+ __uint32_t sb_features_compat;
+ __uint32_t sb_features_ro_compat;
+ __uint32_t sb_features_incompat;
+ __uint32_t sb_features_log_incompat;
+
+ __uint32_t sb_crc; /* superblock crc */
+ __uint32_t sb_pad;
+
+ xfs_ino_t sb_pquotino; /* project quota inode */
+ xfs_lsn_t sb_lsn; /* last write sequence */
+
+ /* must be padded to 64 bit alignment */
+} xfs_sb_t;
+
+#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
+
+/*
+ * Superblock - on disk version. Must match the in core version above.
+ * Must be padded to 64 bit alignment.
+ */
+typedef struct xfs_dsb {
+ __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */
+ __be32 sb_blocksize; /* logical block size, bytes */
+ __be64 sb_dblocks; /* number of data blocks */
+ __be64 sb_rblocks; /* number of realtime blocks */
+ __be64 sb_rextents; /* number of realtime extents */
+ uuid_t sb_uuid; /* file system unique id */
+ __be64 sb_logstart; /* starting block of log if internal */
+ __be64 sb_rootino; /* root inode number */
+ __be64 sb_rbmino; /* bitmap inode for realtime extents */
+ __be64 sb_rsumino; /* summary inode for rt bitmap */
+ __be32 sb_rextsize; /* realtime extent size, blocks */
+ __be32 sb_agblocks; /* size of an allocation group */
+ __be32 sb_agcount; /* number of allocation groups */
+ __be32 sb_rbmblocks; /* number of rt bitmap blocks */
+ __be32 sb_logblocks; /* number of log blocks */
+ __be16 sb_versionnum; /* header version == XFS_SB_VERSION */
+ __be16 sb_sectsize; /* volume sector size, bytes */
+ __be16 sb_inodesize; /* inode size, bytes */
+ __be16 sb_inopblock; /* inodes per block */
+ char sb_fname[12]; /* file system name */
+ __u8 sb_blocklog; /* log2 of sb_blocksize */
+ __u8 sb_sectlog; /* log2 of sb_sectsize */
+ __u8 sb_inodelog; /* log2 of sb_inodesize */
+ __u8 sb_inopblog; /* log2 of sb_inopblock */
+ __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */
+ __u8 sb_rextslog; /* log2 of sb_rextents */
+ __u8 sb_inprogress; /* mkfs is in progress, don't mount */
+ __u8 sb_imax_pct; /* max % of fs for inode space */
+ /* statistics */
+ /*
+ * These fields must remain contiguous. If you really
+ * want to change their layout, make sure you fix the
+ * code in xfs_trans_apply_sb_deltas().
+ */
+ __be64 sb_icount; /* allocated inodes */
+ __be64 sb_ifree; /* free inodes */
+ __be64 sb_fdblocks; /* free data blocks */
+ __be64 sb_frextents; /* free realtime extents */
+ /*
+ * End contiguous fields.
+ */
+ __be64 sb_uquotino; /* user quota inode */
+ __be64 sb_gquotino; /* group quota inode */
+ __be16 sb_qflags; /* quota flags */
+ __u8 sb_flags; /* misc. flags */
+ __u8 sb_shared_vn; /* shared version number */
+ __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */
+ __be32 sb_unit; /* stripe or raid unit */
+ __be32 sb_width; /* stripe or raid width */
+ __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */
+ __u8 sb_logsectlog; /* log2 of the log sector size */
+ __be16 sb_logsectsize; /* sector size for the log, bytes */
+ __be32 sb_logsunit; /* stripe unit size for the log */
+ __be32 sb_features2; /* additional feature bits */
+ /*
+ * bad features2 field as a result of failing to pad the sb
+ * structure to 64 bits. Some machines will be using this field
+ * for features2 bits. Easiest just to mark it bad and not use
+ * it for anything else.
+ */
+ __be32 sb_bad_features2;
+
+ /* version 5 superblock fields start here */
+
+ /* feature masks */
+ __be32 sb_features_compat;
+ __be32 sb_features_ro_compat;
+ __be32 sb_features_incompat;
+ __be32 sb_features_log_incompat;
+
+ __le32 sb_crc; /* superblock crc */
+ __be32 sb_pad;
+
+ __be64 sb_pquotino; /* project quota inode */
+ __be64 sb_lsn; /* last write sequence */
+
+ /* must be padded to 64 bit alignment */
+} xfs_dsb_t;
+
+/*
+ * Sequence number values for the fields.
+ */
+typedef enum {
+ XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
+ XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
+ XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
+ XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
+ XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
+ XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
+ XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
+ XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
+ XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
+ XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
+ XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
+ XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
+ XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
+ XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
+ XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
+ XFS_SBS_PQUOTINO, XFS_SBS_LSN,
+ XFS_SBS_FIELDCOUNT
+} xfs_sb_field_t;
+
+/*
+ * Mask values, defined based on the xfs_sb_field_t values.
+ * Only define the ones we're using.
+ */
+#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x)
+#define XFS_SB_UUID XFS_SB_MVAL(UUID)
+#define XFS_SB_FNAME XFS_SB_MVAL(FNAME)
+#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO)
+#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO)
+#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO)
+#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM)
+#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO)
+#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO)
+#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS)
+#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN)
+#define XFS_SB_UNIT XFS_SB_MVAL(UNIT)
+#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH)
+#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT)
+#define XFS_SB_IFREE XFS_SB_MVAL(IFREE)
+#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
+#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2)
+#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2)
+#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
+#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
+#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
+#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
+#define XFS_SB_CRC XFS_SB_MVAL(CRC)
+#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO)
+#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
+#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
+#define XFS_SB_MOD_BITS \
+ (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
+ XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
+ XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
+ XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
+ XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
+ XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
+ XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
+
+
+/*
+ * Misc. Flags - warning - these will be cleared by xfs_repair unless
+ * a feature bit is set when the flag is used.
+ */
+#define XFS_SBF_NOFLAGS 0x00 /* no flags set */
+#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */
+
+/*
+ * define max. shared version we can interoperate with
+ */
+#define XFS_SB_MAX_SHARED_VN 0
+
+#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
+
+/*
+ * The first XFS version we support is a v4 superblock with V2 directories.
+ */
+static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
+{
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
+ return false;
+
+ /* check for unknown features in the fs */
+ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+ ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+ (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+ return false;
+
+ return true;
+}
+
+static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
+{
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+ return true;
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+ return xfs_sb_good_v4_features(sbp);
+ return false;
+}
+
+/*
+ * Detect a mismatched features2 field. Older kernels read/wrote
+ * this into the wrong slot, so to be safe we keep them in sync.
+ */
+static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
+{
+ return sbp->sb_bad_features2 != sbp->sb_features2;
+}
+
+static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
+{
+ return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
+}
+
+static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
+{
+ sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
+}
+
+static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
+{
+ return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
+}
+
+static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
+{
+ sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
+}
+
+static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
+{
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
+}
+
+static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
+{
+ return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
+}
+
+static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
+}
+
+static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
+}
+
+static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
+{
+ return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
+}
+
+static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
+{
+ return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
+}
+
+static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
+}
+
+/*
+ * sb_features2 bit version macros.
+ */
+static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
+{
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+ (xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
+}
+
+static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
+{
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+ (xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
+}
+
+static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
+{
+ sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+ sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
+ sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
+}
+
+static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
+{
+ sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
+ sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
+ if (!sbp->sb_features2)
+ sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
+}
+
+static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
+{
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+ (xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
+}
+
+static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
+{
+ sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+ sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
+ sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
+}
+
+/*
+ * Extended v5 superblock feature masks. These are to be used for new v5
+ * superblock features only.
+ *
+ * Compat features are new features that old kernels will not notice or affect
+ * and so can mount read-write without issues.
+ *
+ * RO-Compat (read only) are features that old kernels can read but will break
+ * if they write. Hence only read-only mounts of such filesystems are allowed on
+ * kernels that don't support the feature bit.
+ *
+ * InCompat features are features which old kernels will not understand and so
+ * must not mount.
+ *
+ * Log-InCompat features are for changes to log formats or new transactions that
+ * can't be replayed on older kernels. The fields are set when the filesystem is
+ * mounted, and a clean unmount clears the fields.
+ */
+#define XFS_SB_FEAT_COMPAT_ALL 0
+#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL
+static inline bool
+xfs_sb_has_compat_feature(
+ struct xfs_sb *sbp,
+ __uint32_t feature)
+{
+ return (sbp->sb_features_compat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
+#define XFS_SB_FEAT_RO_COMPAT_ALL \
+ (XFS_SB_FEAT_RO_COMPAT_FINOBT)
+#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
+static inline bool
+xfs_sb_has_ro_compat_feature(
+ struct xfs_sb *sbp,
+ __uint32_t feature)
+{
+ return (sbp->sb_features_ro_compat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_ALL \
+ (XFS_SB_FEAT_INCOMPAT_FTYPE)
+
+#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
+static inline bool
+xfs_sb_has_incompat_feature(
+ struct xfs_sb *sbp,
+ __uint32_t feature)
+{
+ return (sbp->sb_features_incompat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
+static inline bool
+xfs_sb_has_incompat_log_feature(
+ struct xfs_sb *sbp,
+ __uint32_t feature)
+{
+ return (sbp->sb_features_log_incompat & feature) != 0;
+}
+
+/*
+ * V5 superblock specific feature checks
+ */
+static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
+{
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
+ (xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
+}
+
+static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
+{
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
+ (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
+}
+
+/*
+ * end of superblock version macros
+ */
+
+static inline bool
+xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+{
+ return (ino == sbp->sb_uquotino ||
+ ino == sbp->sb_gquotino ||
+ ino == sbp->sb_pquotino);
+}
+
+#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
+#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
+#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr))
+
+#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
+#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
+ xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
+#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \
+ XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
+
+/*
+ * File system sector to basic block conversions.
+ */
+#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log)
+
+/*
+ * File system block to basic block conversions.
+ */
+#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
+#define XFS_BB_TO_FSB(mp,bb) \
+ (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
+#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
+
+/*
+ * File system block to byte conversions.
+ */
+#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSB(mp,b) \
+ ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
+
+/*
+ * Allocation group header
+ *
+ * This is divided into three structures, placed in sequential 512-byte
+ * buffers after a copy of the superblock (also in a 512-byte buffer).
+ */
+#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */
+#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */
+#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */
+#define XFS_AGF_VERSION 1
+#define XFS_AGI_VERSION 1
+
+#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION)
+#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
+
+/*
+ * Btree number 0 is bno, 1 is cnt. This value gives the size of the
+ * arrays below.
+ */
+#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1)
+
+/*
+ * The second word of agf_levels in the first a.g. overlaps the EFS
+ * superblock's magic number. Since the magic numbers valid for EFS
+ * are > 64k, our value cannot be confused for an EFS superblock's.
+ */
+
+typedef struct xfs_agf {
+ /*
+ * Common allocation group header information
+ */
+ __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */
+ __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */
+ __be32 agf_seqno; /* sequence # starting from 0 */
+ __be32 agf_length; /* size in blocks of a.g. */
+ /*
+ * Freespace information
+ */
+ __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
+ __be32 agf_spare0; /* spare field */
+ __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
+ __be32 agf_spare1; /* spare field */
+
+ __be32 agf_flfirst; /* first freelist block's index */
+ __be32 agf_fllast; /* last freelist block's index */
+ __be32 agf_flcount; /* count of blocks in freelist */
+ __be32 agf_freeblks; /* total free blocks */
+
+ __be32 agf_longest; /* longest free space */
+ __be32 agf_btreeblks; /* # of blocks held in AGF btrees */
+ uuid_t agf_uuid; /* uuid of filesystem */
+
+ /*
+ * reserve some contiguous space for future logged fields before we add
+ * the unlogged fields. This makes the range logging via flags and
+ * structure offsets much simpler.
+ */
+ __be64 agf_spare64[16];
+
+ /* unlogged fields, written during buffer writeback. */
+ __be64 agf_lsn; /* last write sequence */
+ __be32 agf_crc; /* crc of agf sector */
+ __be32 agf_spare2;
+
+ /* structure must be padded to 64 bit alignment */
+} xfs_agf_t;
+
+#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
+
+#define XFS_AGF_MAGICNUM 0x00000001
+#define XFS_AGF_VERSIONNUM 0x00000002
+#define XFS_AGF_SEQNO 0x00000004
+#define XFS_AGF_LENGTH 0x00000008
+#define XFS_AGF_ROOTS 0x00000010
+#define XFS_AGF_LEVELS 0x00000020
+#define XFS_AGF_FLFIRST 0x00000040
+#define XFS_AGF_FLLAST 0x00000080
+#define XFS_AGF_FLCOUNT 0x00000100
+#define XFS_AGF_FREEBLKS 0x00000200
+#define XFS_AGF_LONGEST 0x00000400
+#define XFS_AGF_BTREEBLKS 0x00000800
+#define XFS_AGF_UUID 0x00001000
+#define XFS_AGF_NUM_BITS 13
+#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
+
+#define XFS_AGF_FLAGS \
+ { XFS_AGF_MAGICNUM, "MAGICNUM" }, \
+ { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \
+ { XFS_AGF_SEQNO, "SEQNO" }, \
+ { XFS_AGF_LENGTH, "LENGTH" }, \
+ { XFS_AGF_ROOTS, "ROOTS" }, \
+ { XFS_AGF_LEVELS, "LEVELS" }, \
+ { XFS_AGF_FLFIRST, "FLFIRST" }, \
+ { XFS_AGF_FLLAST, "FLLAST" }, \
+ { XFS_AGF_FLCOUNT, "FLCOUNT" }, \
+ { XFS_AGF_FREEBLKS, "FREEBLKS" }, \
+ { XFS_AGF_LONGEST, "LONGEST" }, \
+ { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
+ { XFS_AGF_UUID, "UUID" }
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
+#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
+#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr))
+
+/*
+ * Size of the unlinked inode hash table in the agi.
+ */
+#define XFS_AGI_UNLINKED_BUCKETS 64
+
+typedef struct xfs_agi {
+ /*
+ * Common allocation group header information
+ */
+ __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */
+ __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */
+ __be32 agi_seqno; /* sequence # starting from 0 */
+ __be32 agi_length; /* size in blocks of a.g. */
+ /*
+ * Inode information
+ * Inodes are mapped by interpreting the inode number, so no
+ * mapping data is needed here.
+ */
+ __be32 agi_count; /* count of allocated inodes */
+ __be32 agi_root; /* root of inode btree */
+ __be32 agi_level; /* levels in inode btree */
+ __be32 agi_freecount; /* number of free inodes */
+
+ __be32 agi_newino; /* new inode just allocated */
+ __be32 agi_dirino; /* last directory inode chunk */
+ /*
+ * Hash table of inodes which have been unlinked but are
+ * still being referenced.
+ */
+ __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+ /*
+ * This marks the end of logging region 1 and start of logging region 2.
+ */
+ uuid_t agi_uuid; /* uuid of filesystem */
+ __be32 agi_crc; /* crc of agi sector */
+ __be32 agi_pad32;
+ __be64 agi_lsn; /* last write sequence */
+
+ __be32 agi_free_root; /* root of the free inode btree */
+ __be32 agi_free_level;/* levels in free inode btree */
+
+ /* structure must be padded to 64 bit alignment */
+} xfs_agi_t;
+
+#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
+
+#define XFS_AGI_MAGICNUM (1 << 0)
+#define XFS_AGI_VERSIONNUM (1 << 1)
+#define XFS_AGI_SEQNO (1 << 2)
+#define XFS_AGI_LENGTH (1 << 3)
+#define XFS_AGI_COUNT (1 << 4)
+#define XFS_AGI_ROOT (1 << 5)
+#define XFS_AGI_LEVEL (1 << 6)
+#define XFS_AGI_FREECOUNT (1 << 7)
+#define XFS_AGI_NEWINO (1 << 8)
+#define XFS_AGI_DIRINO (1 << 9)
+#define XFS_AGI_UNLINKED (1 << 10)
+#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
+#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
+#define XFS_AGI_FREE_ROOT (1 << 11)
+#define XFS_AGI_FREE_LEVEL (1 << 12)
+#define XFS_AGI_NUM_BITS_R2 13
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
+#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
+#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr))
+
+/*
+ * The third a.g. block contains the a.g. freelist, an array
+ * of block pointers to blocks owned by the allocation btree code.
+ */
+#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
+#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
+#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
+
+#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
+ (__be32 *)(bp)->b_addr)
+
+/*
+ * Size of the AGFL. For CRC-enabled filesystes we steal a couple of
+ * slots in the beginning of the block for a proper header with the
+ * location information and CRC.
+ */
+#define XFS_AGFL_SIZE(mp) \
+ (((mp)->m_sb.sb_sectsize - \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ sizeof(struct xfs_agfl) : 0)) / \
+ sizeof(xfs_agblock_t))
+
+typedef struct xfs_agfl {
+ __be32 agfl_magicnum;
+ __be32 agfl_seqno;
+ uuid_t agfl_uuid;
+ __be64 agfl_lsn;
+ __be32 agfl_crc;
+ __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
+} xfs_agfl_t;
+
+#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
+
+
+#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
+#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
+ (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
+#define XFS_MIN_FREELIST(a,mp) \
+ (XFS_MIN_FREELIST_RAW( \
+ be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
+ be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
+#define XFS_MIN_FREELIST_PAG(pag,mp) \
+ (XFS_MIN_FREELIST_RAW( \
+ (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
+ (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
+
+#define XFS_AGB_TO_FSB(mp,agno,agbno) \
+ (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
+#define XFS_FSB_TO_AGNO(mp,fsbno) \
+ ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
+#define XFS_FSB_TO_AGBNO(mp,fsbno) \
+ ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
+#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
+ ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
+ (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
+#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
+
+/*
+ * For checking for bad ranges of xfs_daddr_t's, covering multiple
+ * allocation groups or a single xfs_daddr_t that's a superblock copy.
+ */
+#define XFS_AG_CHECK_DADDR(mp,d,len) \
+ ((len) == 1 ? \
+ ASSERT((d) == XFS_SB_DADDR || \
+ xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
+ ASSERT(xfs_daddr_to_agno(mp, d) == \
+ xfs_daddr_to_agno(mp, (d) + (len) - 1)))
+
+typedef struct xfs_timestamp {
+ __be32 t_sec; /* timestamp seconds */
+ __be32 t_nsec; /* timestamp nanoseconds */
+} xfs_timestamp_t;
+
+/*
+ * On-disk inode structure.
+ *
+ * This is just the header or "dinode core", the inode is expanded to fill a
+ * variable size the leftover area split into a data and an attribute fork.
+ * The format of the data and attribute fork depends on the format of the
+ * inode as indicated by di_format and di_aformat. To access the data and
+ * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
+ * below.
+ *
+ * There is a very similar struct icdinode in xfs_inode which matches the
+ * layout of the first 96 bytes of this structure, but is kept in native
+ * format instead of big endian.
+ *
+ * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
+ * padding field for v3 inodes.
+ */
+#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
+#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3)
+typedef struct xfs_dinode {
+ __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
+ __be16 di_mode; /* mode and type of file */
+ __u8 di_version; /* inode version */
+ __u8 di_format; /* format of di_c data */
+ __be16 di_onlink; /* old number of links to file */
+ __be32 di_uid; /* owner's user id */
+ __be32 di_gid; /* owner's group id */
+ __be32 di_nlink; /* number of links to file */
+ __be16 di_projid_lo; /* lower part of owner's project id */
+ __be16 di_projid_hi; /* higher part owner's project id */
+ __u8 di_pad[6]; /* unused, zeroed space */
+ __be16 di_flushiter; /* incremented on flush */
+ xfs_timestamp_t di_atime; /* time last accessed */
+ xfs_timestamp_t di_mtime; /* time last modified */
+ xfs_timestamp_t di_ctime; /* time created/inode modified */
+ __be64 di_size; /* number of bytes in file */
+ __be64 di_nblocks; /* # of direct & btree blocks used */
+ __be32 di_extsize; /* basic/minimum extent size for file */
+ __be32 di_nextents; /* number of extents in data fork */
+ __be16 di_anextents; /* number of extents in attribute fork*/
+ __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
+ __s8 di_aformat; /* format of attr fork's data */
+ __be32 di_dmevmask; /* DMIG event mask */
+ __be16 di_dmstate; /* DMIG state info */
+ __be16 di_flags; /* random flags, XFS_DIFLAG_... */
+ __be32 di_gen; /* generation number */
+
+ /* di_next_unlinked is the only non-core field in the old dinode */
+ __be32 di_next_unlinked;/* agi unlinked list ptr */
+
+ /* start of the extended dinode, writable fields */
+ __le32 di_crc; /* CRC of the inode */
+ __be64 di_changecount; /* number of attribute changes */
+ __be64 di_lsn; /* flush sequence */
+ __be64 di_flags2; /* more random flags */
+ __u8 di_pad2[16]; /* more padding for future expansion */
+
+ /* fields only written to during inode creation */
+ xfs_timestamp_t di_crtime; /* time created */
+ __be64 di_ino; /* inode number */
+ uuid_t di_uuid; /* UUID of the filesystem */
+
+ /* structure must be padded to 64 bit alignment */
+} xfs_dinode_t;
+
+#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
+
+#define DI_MAX_FLUSH 0xffff
+
+/*
+ * Size of the core inode on disk. Version 1 and 2 inodes have
+ * the same size, but version 3 has grown a few additional fields.
+ */
+static inline uint xfs_dinode_size(int version)
+{
+ if (version == 3)
+ return sizeof(struct xfs_dinode);
+ return offsetof(struct xfs_dinode, di_crc);
+}
+
+/*
+ * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
+ * Since the pathconf interface is signed, we use 2^31 - 1 instead.
+ * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
+ */
+#define XFS_MAXLINK ((1U << 31) - 1U)
+#define XFS_MAXLINK_1 65535U
+
+/*
+ * Values for di_format
+ */
+typedef enum xfs_dinode_fmt {
+ XFS_DINODE_FMT_DEV, /* xfs_dev_t */
+ XFS_DINODE_FMT_LOCAL, /* bulk data */
+ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
+ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
+ XFS_DINODE_FMT_UUID /* uuid_t */
+} xfs_dinode_fmt_t;
+
+/*
+ * Inode minimum and maximum sizes.
+ */
+#define XFS_DINODE_MIN_LOG 8
+#define XFS_DINODE_MAX_LOG 11
+#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG)
+#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG)
+
+/*
+ * Inode size for given fs.
+ */
+#define XFS_LITINO(mp, version) \
+ ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
+
+/*
+ * Inode data & attribute fork sizes, per inode.
+ */
+#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0)
+#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
+
+#define XFS_DFORK_DSIZE(dip,mp) \
+ (XFS_DFORK_Q(dip) ? \
+ XFS_DFORK_BOFF(dip) : \
+ XFS_LITINO(mp, (dip)->di_version))
+#define XFS_DFORK_ASIZE(dip,mp) \
+ (XFS_DFORK_Q(dip) ? \
+ XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
+ 0)
+#define XFS_DFORK_SIZE(dip,mp,w) \
+ ((w) == XFS_DATA_FORK ? \
+ XFS_DFORK_DSIZE(dip, mp) : \
+ XFS_DFORK_ASIZE(dip, mp))
+
+/*
+ * Return pointers to the data or attribute forks.
+ */
+#define XFS_DFORK_DPTR(dip) \
+ ((char *)dip + xfs_dinode_size(dip->di_version))
+#define XFS_DFORK_APTR(dip) \
+ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
+#define XFS_DFORK_PTR(dip,w) \
+ ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
+
+#define XFS_DFORK_FORMAT(dip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ (dip)->di_format : \
+ (dip)->di_aformat)
+#define XFS_DFORK_NEXTENTS(dip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ be32_to_cpu((dip)->di_nextents) : \
+ be16_to_cpu((dip)->di_anextents))
+
+/*
+ * For block and character special files the 32bit dev_t is stored at the
+ * beginning of the data fork.
+ */
+static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
+{
+ return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
+}
+
+static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
+{
+ *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
+}
+
+/*
+ * Values for di_flags
+ * There should be a one-to-one correspondence between these flags and the
+ * XFS_XFLAG_s.
+ */
+#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */
+#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */
+#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */
+#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */
+#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */
+#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */
+#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */
+#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */
+#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */
+#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */
+#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */
+#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */
+#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
+#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */
+#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */
+#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
+#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
+#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
+#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT)
+#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT)
+#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT)
+#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT)
+#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT)
+#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT)
+#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT)
+#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
+#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT)
+#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
+#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT)
+#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT)
+
+#define XFS_DIFLAG_ANY \
+ (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
+ XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
+ XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
+ XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
+ XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
+
+/*
+ * Inode number format:
+ * low inopblog bits - offset in block
+ * next agblklog bits - block number in ag
+ * next agno_log bits - ag number
+ * high agno_log-agblklog-inopblog bits - 0
+ */
+#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1)
+#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog
+#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog
+#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log
+#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log
+#define XFS_INO_BITS(mp) \
+ XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
+#define XFS_INO_TO_AGNO(mp,i) \
+ ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
+#define XFS_INO_TO_AGINO(mp,i) \
+ ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
+#define XFS_INO_TO_AGBNO(mp,i) \
+ (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
+ XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
+#define XFS_INO_TO_OFFSET(mp,i) \
+ ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#define XFS_INO_TO_FSB(mp,i) \
+ XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
+#define XFS_AGINO_TO_INO(mp,a,i) \
+ (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
+#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp))
+#define XFS_AGINO_TO_OFFSET(mp,i) \
+ ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#define XFS_OFFBNO_TO_AGINO(mp,b,o) \
+ ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
+
+#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
+#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL))
+
+/*
* RealTime Device format definitions
*/
@@ -413,4 +1484,40 @@ struct xfs_btree_block {
#define XFS_BTREE_LBLOCK_CRC_OFF \
offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
+/*
+ * On-disk XFS access control list structure.
+ */
+struct xfs_acl_entry {
+ __be32 ae_tag;
+ __be32 ae_id;
+ __be16 ae_perm;
+ __be16 ae_pad; /* fill the implicit hole in the structure */
+};
+
+struct xfs_acl {
+ __be32 acl_cnt;
+ struct xfs_acl_entry acl_entry[0];
+};
+
+/*
+ * The number of ACL entries allowed is defined by the on-disk format.
+ * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
+ * limited only by the maximum size of the xattr that stores the information.
+ */
+#define XFS_ACL_MAX_ENTRIES(mp) \
+ (xfs_sb_version_hascrc(&mp->m_sb) \
+ ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+ sizeof(struct xfs_acl_entry) \
+ : 25)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+ (sizeof(struct xfs_acl) + \
+ sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
+
+/* On-disk XFS extended attribute names */
+#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
+#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
+
#endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 23dcb72fc5e6..116ef1ddb3e3 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -22,9 +22,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_inum.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
@@ -39,7 +37,6 @@
#include "xfs_buf_item.h"
#include "xfs_icreate_item.h"
#include "xfs_icache.h"
-#include "xfs_dinode.h"
#include "xfs_trace.h"
@@ -48,12 +45,12 @@
*/
static inline int
xfs_ialloc_cluster_alignment(
- xfs_alloc_arg_t *args)
+ struct xfs_mount *mp)
{
- if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
- args->mp->m_sb.sb_inoalignmt >=
- XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
- return args->mp->m_sb.sb_inoalignmt;
+ if (xfs_sb_version_hasalign(&mp->m_sb) &&
+ mp->m_sb.sb_inoalignmt >=
+ XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
+ return mp->m_sb.sb_inoalignmt;
return 1;
}
@@ -412,7 +409,7 @@ xfs_ialloc_ag_alloc(
* but not to use them in the actual exact allocation.
*/
args.alignment = 1;
- args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
+ args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1;
/* Allow space for the inode btree to split. */
args.minleft = args.mp->m_in_maxlevels - 1;
@@ -448,7 +445,7 @@ xfs_ialloc_ag_alloc(
args.alignment = args.mp->m_dalign;
isaligned = 1;
} else
- args.alignment = xfs_ialloc_cluster_alignment(&args);
+ args.alignment = xfs_ialloc_cluster_alignment(args.mp);
/*
* Need to figure out where to allocate the inode blocks.
* Ideally they should be spaced out through the a.g.
@@ -477,7 +474,7 @@ xfs_ialloc_ag_alloc(
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.agbno = be32_to_cpu(agi->agi_root);
args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
- args.alignment = xfs_ialloc_cluster_alignment(&args);
+ args.alignment = xfs_ialloc_cluster_alignment(args.mp);
if ((error = xfs_alloc_vextent(&args)))
return error;
}
@@ -632,10 +629,24 @@ xfs_ialloc_ag_select(
}
/*
- * Is there enough free space for the file plus a block of
- * inodes? (if we need to allocate some)?
+ * Check that there is enough free space for the file plus a
+ * chunk of inodes if we need to allocate some. If this is the
+ * first pass across the AGs, take into account the potential
+ * space needed for alignment of inode chunks when checking the
+ * longest contiguous free space in the AG - this prevents us
+ * from getting ENOSPC because we have free space larger than
+ * m_ialloc_blks but alignment constraints prevent us from using
+ * it.
+ *
+ * If we can't find an AG with space for full alignment slack to
+ * be taken into account, we must be near ENOSPC in all AGs.
+ * Hence we don't include alignment for the second pass and so
+ * if we fail allocation due to alignment issues then it is most
+ * likely a real ENOSPC condition.
*/
ineed = mp->m_ialloc_blks;
+ if (flags && ineed > 1)
+ ineed += xfs_ialloc_cluster_alignment(mp);
longest = pag->pagf_longest;
if (!longest)
longest = pag->pagf_flcount > 0;
@@ -1137,11 +1148,7 @@ xfs_dialloc_ag_update_inobt(
XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
(rec.ir_freecount == frec->ir_freecount));
- error = xfs_inobt_update(cur, &rec);
- if (error)
- return error;
-
- return 0;
+ return xfs_inobt_update(cur, &rec);
}
/*
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 95ad1c002d60..100007d56449 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -160,4 +160,8 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_agblock_t agbno,
xfs_agblock_t length, unsigned int gen);
+int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno, struct xfs_buf **bpp);
+
+
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index c9b06f30fe86..964c465ca69c 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -22,8 +22,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index f18fd2da49f7..002b6b3a1988 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_error.h"
@@ -30,7 +28,6 @@
#include "xfs_icache.h"
#include "xfs_trans.h"
#include "xfs_ialloc.h"
-#include "xfs_dinode.h"
/*
* Check that none of the inode's in the buffer have a next
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 6a00f7fed69d..0defbd02f62d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -22,9 +22,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
@@ -34,7 +31,6 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
kmem_zone_t *xfs_ifork_zone;
diff --git a/fs/xfs/libxfs/xfs_inum.h b/fs/xfs/libxfs/xfs_inum.h
deleted file mode 100644
index 4ff2278e147a..000000000000
--- a/fs/xfs/libxfs/xfs_inum.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_INUM_H__
-#define __XFS_INUM_H__
-
-/*
- * Inode number format:
- * low inopblog bits - offset in block
- * next agblklog bits - block number in ag
- * next agno_log bits - ag number
- * high agno_log-agblklog-inopblog bits - 0
- */
-
-struct xfs_mount;
-
-#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1)
-#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog
-#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog
-#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log
-#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log
-#define XFS_INO_BITS(mp) \
- XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
-#define XFS_INO_TO_AGNO(mp,i) \
- ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
-#define XFS_INO_TO_AGINO(mp,i) \
- ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
-#define XFS_INO_TO_AGBNO(mp,i) \
- (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
- XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
-#define XFS_INO_TO_OFFSET(mp,i) \
- ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
-#define XFS_INO_TO_FSB(mp,i) \
- XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
-#define XFS_AGINO_TO_INO(mp,a,i) \
- (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
-#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp))
-#define XFS_AGINO_TO_OFFSET(mp,i) \
- ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
-#define XFS_OFFBNO_TO_AGINO(mp,b,o) \
- ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
-
-#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
-#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL))
-
-#endif /* __XFS_INUM_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index aff12f2d4428..265314690415 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -361,7 +361,7 @@ typedef struct xfs_ictimestamp {
/*
* NOTE: This structure must be kept identical to struct xfs_dinode
- * in xfs_dinode.h except for the endianness annotations.
+ * except for the endianness annotations.
*/
typedef struct xfs_icdinode {
__uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index ee7e0e80246b..c10597973333 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_ag.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_trans_space.h"
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 7c818f1e4484..9b59ffa1fc19 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -22,8 +22,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
@@ -36,7 +34,6 @@
#include "xfs_trace.h"
#include "xfs_buf.h"
#include "xfs_icache.h"
-#include "xfs_dinode.h"
#include "xfs_rtalloc.h"
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 5f902fa7913f..752915fa775a 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -23,7 +23,6 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
@@ -33,7 +32,6 @@
#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
-#include "xfs_dinode.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 2e739708afd3..8eb1c54bafbf 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -19,590 +19,6 @@
#define __XFS_SB_H__
/*
- * Super block
- * Fits into a sector-sized buffer at address 0 of each allocation group.
- * Only the first of these is ever updated except during growfs.
- */
-
-struct xfs_buf;
-struct xfs_mount;
-struct xfs_trans;
-
-#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */
-#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */
-#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */
-#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */
-#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */
-#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */
-#define XFS_SB_VERSION_NUMBITS 0x000f
-#define XFS_SB_VERSION_ALLFBITS 0xfff0
-#define XFS_SB_VERSION_ATTRBIT 0x0010
-#define XFS_SB_VERSION_NLINKBIT 0x0020
-#define XFS_SB_VERSION_QUOTABIT 0x0040
-#define XFS_SB_VERSION_ALIGNBIT 0x0080
-#define XFS_SB_VERSION_DALIGNBIT 0x0100
-#define XFS_SB_VERSION_SHAREDBIT 0x0200
-#define XFS_SB_VERSION_LOGV2BIT 0x0400
-#define XFS_SB_VERSION_SECTORBIT 0x0800
-#define XFS_SB_VERSION_EXTFLGBIT 0x1000
-#define XFS_SB_VERSION_DIRV2BIT 0x2000
-#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */
-#define XFS_SB_VERSION_MOREBITSBIT 0x8000
-
-/*
- * Supported feature bit list is just all bits in the versionnum field because
- * we've used them all up and understand them all. Except, of course, for the
- * shared superblock bit, which nobody knows what it does and so is unsupported.
- */
-#define XFS_SB_VERSION_OKBITS \
- ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
- ~XFS_SB_VERSION_SHAREDBIT)
-
-/*
- * There are two words to hold XFS "feature" bits: the original
- * word, sb_versionnum, and sb_features2. Whenever a bit is set in
- * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
- *
- * These defines represent bits in sb_features2.
- */
-#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
-#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
-#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
-#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
-#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
-#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
-#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
-#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */
-
-#define XFS_SB_VERSION2_OKBITS \
- (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
- XFS_SB_VERSION2_ATTR2BIT | \
- XFS_SB_VERSION2_PROJID32BIT | \
- XFS_SB_VERSION2_FTYPE)
-
-/*
- * Superblock - in core version. Must match the ondisk version below.
- * Must be padded to 64 bit alignment.
- */
-typedef struct xfs_sb {
- __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
- __uint32_t sb_blocksize; /* logical block size, bytes */
- xfs_rfsblock_t sb_dblocks; /* number of data blocks */
- xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
- xfs_rtblock_t sb_rextents; /* number of realtime extents */
- uuid_t sb_uuid; /* file system unique id */
- xfs_fsblock_t sb_logstart; /* starting block of log if internal */
- xfs_ino_t sb_rootino; /* root inode number */
- xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
- xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */
- xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */
- xfs_agblock_t sb_agblocks; /* size of an allocation group */
- xfs_agnumber_t sb_agcount; /* number of allocation groups */
- xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */
- xfs_extlen_t sb_logblocks; /* number of log blocks */
- __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */
- __uint16_t sb_sectsize; /* volume sector size, bytes */
- __uint16_t sb_inodesize; /* inode size, bytes */
- __uint16_t sb_inopblock; /* inodes per block */
- char sb_fname[12]; /* file system name */
- __uint8_t sb_blocklog; /* log2 of sb_blocksize */
- __uint8_t sb_sectlog; /* log2 of sb_sectsize */
- __uint8_t sb_inodelog; /* log2 of sb_inodesize */
- __uint8_t sb_inopblog; /* log2 of sb_inopblock */
- __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */
- __uint8_t sb_rextslog; /* log2 of sb_rextents */
- __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */
- __uint8_t sb_imax_pct; /* max % of fs for inode space */
- /* statistics */
- /*
- * These fields must remain contiguous. If you really
- * want to change their layout, make sure you fix the
- * code in xfs_trans_apply_sb_deltas().
- */
- __uint64_t sb_icount; /* allocated inodes */
- __uint64_t sb_ifree; /* free inodes */
- __uint64_t sb_fdblocks; /* free data blocks */
- __uint64_t sb_frextents; /* free realtime extents */
- /*
- * End contiguous fields.
- */
- xfs_ino_t sb_uquotino; /* user quota inode */
- xfs_ino_t sb_gquotino; /* group quota inode */
- __uint16_t sb_qflags; /* quota flags */
- __uint8_t sb_flags; /* misc. flags */
- __uint8_t sb_shared_vn; /* shared version number */
- xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */
- __uint32_t sb_unit; /* stripe or raid unit */
- __uint32_t sb_width; /* stripe or raid width */
- __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */
- __uint8_t sb_logsectlog; /* log2 of the log sector size */
- __uint16_t sb_logsectsize; /* sector size for the log, bytes */
- __uint32_t sb_logsunit; /* stripe unit size for the log */
- __uint32_t sb_features2; /* additional feature bits */
-
- /*
- * bad features2 field as a result of failing to pad the sb
- * structure to 64 bits. Some machines will be using this field
- * for features2 bits. Easiest just to mark it bad and not use
- * it for anything else.
- */
- __uint32_t sb_bad_features2;
-
- /* version 5 superblock fields start here */
-
- /* feature masks */
- __uint32_t sb_features_compat;
- __uint32_t sb_features_ro_compat;
- __uint32_t sb_features_incompat;
- __uint32_t sb_features_log_incompat;
-
- __uint32_t sb_crc; /* superblock crc */
- __uint32_t sb_pad;
-
- xfs_ino_t sb_pquotino; /* project quota inode */
- xfs_lsn_t sb_lsn; /* last write sequence */
-
- /* must be padded to 64 bit alignment */
-} xfs_sb_t;
-
-#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
-
-/*
- * Superblock - on disk version. Must match the in core version above.
- * Must be padded to 64 bit alignment.
- */
-typedef struct xfs_dsb {
- __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */
- __be32 sb_blocksize; /* logical block size, bytes */
- __be64 sb_dblocks; /* number of data blocks */
- __be64 sb_rblocks; /* number of realtime blocks */
- __be64 sb_rextents; /* number of realtime extents */
- uuid_t sb_uuid; /* file system unique id */
- __be64 sb_logstart; /* starting block of log if internal */
- __be64 sb_rootino; /* root inode number */
- __be64 sb_rbmino; /* bitmap inode for realtime extents */
- __be64 sb_rsumino; /* summary inode for rt bitmap */
- __be32 sb_rextsize; /* realtime extent size, blocks */
- __be32 sb_agblocks; /* size of an allocation group */
- __be32 sb_agcount; /* number of allocation groups */
- __be32 sb_rbmblocks; /* number of rt bitmap blocks */
- __be32 sb_logblocks; /* number of log blocks */
- __be16 sb_versionnum; /* header version == XFS_SB_VERSION */
- __be16 sb_sectsize; /* volume sector size, bytes */
- __be16 sb_inodesize; /* inode size, bytes */
- __be16 sb_inopblock; /* inodes per block */
- char sb_fname[12]; /* file system name */
- __u8 sb_blocklog; /* log2 of sb_blocksize */
- __u8 sb_sectlog; /* log2 of sb_sectsize */
- __u8 sb_inodelog; /* log2 of sb_inodesize */
- __u8 sb_inopblog; /* log2 of sb_inopblock */
- __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */
- __u8 sb_rextslog; /* log2 of sb_rextents */
- __u8 sb_inprogress; /* mkfs is in progress, don't mount */
- __u8 sb_imax_pct; /* max % of fs for inode space */
- /* statistics */
- /*
- * These fields must remain contiguous. If you really
- * want to change their layout, make sure you fix the
- * code in xfs_trans_apply_sb_deltas().
- */
- __be64 sb_icount; /* allocated inodes */
- __be64 sb_ifree; /* free inodes */
- __be64 sb_fdblocks; /* free data blocks */
- __be64 sb_frextents; /* free realtime extents */
- /*
- * End contiguous fields.
- */
- __be64 sb_uquotino; /* user quota inode */
- __be64 sb_gquotino; /* group quota inode */
- __be16 sb_qflags; /* quota flags */
- __u8 sb_flags; /* misc. flags */
- __u8 sb_shared_vn; /* shared version number */
- __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */
- __be32 sb_unit; /* stripe or raid unit */
- __be32 sb_width; /* stripe or raid width */
- __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */
- __u8 sb_logsectlog; /* log2 of the log sector size */
- __be16 sb_logsectsize; /* sector size for the log, bytes */
- __be32 sb_logsunit; /* stripe unit size for the log */
- __be32 sb_features2; /* additional feature bits */
- /*
- * bad features2 field as a result of failing to pad the sb
- * structure to 64 bits. Some machines will be using this field
- * for features2 bits. Easiest just to mark it bad and not use
- * it for anything else.
- */
- __be32 sb_bad_features2;
-
- /* version 5 superblock fields start here */
-
- /* feature masks */
- __be32 sb_features_compat;
- __be32 sb_features_ro_compat;
- __be32 sb_features_incompat;
- __be32 sb_features_log_incompat;
-
- __le32 sb_crc; /* superblock crc */
- __be32 sb_pad;
-
- __be64 sb_pquotino; /* project quota inode */
- __be64 sb_lsn; /* last write sequence */
-
- /* must be padded to 64 bit alignment */
-} xfs_dsb_t;
-
-/*
- * Sequence number values for the fields.
- */
-typedef enum {
- XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
- XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
- XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
- XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
- XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
- XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
- XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
- XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
- XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
- XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
- XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
- XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
- XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
- XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
- XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
- XFS_SBS_PQUOTINO, XFS_SBS_LSN,
- XFS_SBS_FIELDCOUNT
-} xfs_sb_field_t;
-
-/*
- * Mask values, defined based on the xfs_sb_field_t values.
- * Only define the ones we're using.
- */
-#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x)
-#define XFS_SB_UUID XFS_SB_MVAL(UUID)
-#define XFS_SB_FNAME XFS_SB_MVAL(FNAME)
-#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO)
-#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO)
-#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO)
-#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM)
-#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO)
-#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO)
-#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS)
-#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN)
-#define XFS_SB_UNIT XFS_SB_MVAL(UNIT)
-#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH)
-#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT)
-#define XFS_SB_IFREE XFS_SB_MVAL(IFREE)
-#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
-#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2)
-#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2)
-#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
-#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
-#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
-#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
-#define XFS_SB_CRC XFS_SB_MVAL(CRC)
-#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO)
-#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
-#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
-#define XFS_SB_MOD_BITS \
- (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
- XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
- XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
- XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
- XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
- XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
- XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
-
-
-/*
- * Misc. Flags - warning - these will be cleared by xfs_repair unless
- * a feature bit is set when the flag is used.
- */
-#define XFS_SBF_NOFLAGS 0x00 /* no flags set */
-#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */
-
-/*
- * define max. shared version we can interoperate with
- */
-#define XFS_SB_MAX_SHARED_VN 0
-
-#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
-
-/*
- * The first XFS version we support is a v4 superblock with V2 directories.
- */
-static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
-{
- if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
- return false;
-
- /* check for unknown features in the fs */
- if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
- ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
- (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
- return false;
-
- return true;
-}
-
-static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
-{
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
- return true;
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
- return xfs_sb_good_v4_features(sbp);
- return false;
-}
-
-/*
- * Detect a mismatched features2 field. Older kernels read/wrote
- * this into the wrong slot, so to be safe we keep them in sync.
- */
-static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
-{
- return sbp->sb_bad_features2 != sbp->sb_features2;
-}
-
-static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
-{
- return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
-}
-
-static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
-{
- sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
-}
-
-static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
-{
- return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
-}
-
-static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
-{
- sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
-}
-
-static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
- (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
-}
-
-static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
-{
- return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
-}
-
-static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
- (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
-}
-
-static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
- (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
-}
-
-static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
-{
- return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
-}
-
-static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
-{
- return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
-}
-
-static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
- (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
-}
-
-/*
- * sb_features2 bit version macros.
- */
-static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
-}
-
-static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
-}
-
-static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
-{
- sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
- sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
- sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
-}
-
-static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
-{
- sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
- sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
- if (!sbp->sb_features2)
- sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
-}
-
-static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
-}
-
-static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
-{
- sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
- sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
- sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
-}
-
-/*
- * Extended v5 superblock feature masks. These are to be used for new v5
- * superblock features only.
- *
- * Compat features are new features that old kernels will not notice or affect
- * and so can mount read-write without issues.
- *
- * RO-Compat (read only) are features that old kernels can read but will break
- * if they write. Hence only read-only mounts of such filesystems are allowed on
- * kernels that don't support the feature bit.
- *
- * InCompat features are features which old kernels will not understand and so
- * must not mount.
- *
- * Log-InCompat features are for changes to log formats or new transactions that
- * can't be replayed on older kernels. The fields are set when the filesystem is
- * mounted, and a clean unmount clears the fields.
- */
-#define XFS_SB_FEAT_COMPAT_ALL 0
-#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL
-static inline bool
-xfs_sb_has_compat_feature(
- struct xfs_sb *sbp,
- __uint32_t feature)
-{
- return (sbp->sb_features_compat & feature) != 0;
-}
-
-#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
-#define XFS_SB_FEAT_RO_COMPAT_ALL \
- (XFS_SB_FEAT_RO_COMPAT_FINOBT)
-#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
-static inline bool
-xfs_sb_has_ro_compat_feature(
- struct xfs_sb *sbp,
- __uint32_t feature)
-{
- return (sbp->sb_features_ro_compat & feature) != 0;
-}
-
-#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
-#define XFS_SB_FEAT_INCOMPAT_ALL \
- (XFS_SB_FEAT_INCOMPAT_FTYPE)
-
-#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
-static inline bool
-xfs_sb_has_incompat_feature(
- struct xfs_sb *sbp,
- __uint32_t feature)
-{
- return (sbp->sb_features_incompat & feature) != 0;
-}
-
-#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
-#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
-static inline bool
-xfs_sb_has_incompat_log_feature(
- struct xfs_sb *sbp,
- __uint32_t feature)
-{
- return (sbp->sb_features_log_incompat & feature) != 0;
-}
-
-/*
- * V5 superblock specific feature checks
- */
-static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
-}
-
-static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
-}
-
-static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
- xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
- (xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
-}
-
-static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
- (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
-}
-
-/*
- * end of superblock version macros
- */
-
-static inline bool
-xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
-{
- return (ino == sbp->sb_uquotino ||
- ino == sbp->sb_gquotino ||
- ino == sbp->sb_pquotino);
-}
-
-#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
-#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
-#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr))
-
-#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
-#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
- xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
-#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \
- XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
-
-/*
- * File system sector to basic block conversions.
- */
-#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log)
-
-/*
- * File system block to basic block conversions.
- */
-#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
-#define XFS_BB_TO_FSB(mp,bb) \
- (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
-#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
-
-/*
- * File system block to byte conversions.
- */
-#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
-#define XFS_B_TO_FSB(mp,b) \
- ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
-#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
-#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
-
-/*
* perag get/put wrappers for ref counting
*/
extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 5782f037eab4..c80c5236c3da 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -22,8 +22,6 @@
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
-#include "xfs_ag.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index f2bda7c76b8a..6c1330f29050 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -22,8 +22,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index a65fa5dde6e9..4b641676f258 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -19,8 +19,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_ag.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_acl.h"
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 5dc163744511..3841b07f27bf 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -22,42 +22,6 @@ struct inode;
struct posix_acl;
struct xfs_inode;
-#define XFS_ACL_NOT_PRESENT (-1)
-
-/* On-disk XFS access control list structure */
-struct xfs_acl_entry {
- __be32 ae_tag;
- __be32 ae_id;
- __be16 ae_perm;
- __be16 ae_pad; /* fill the implicit hole in the structure */
-};
-
-struct xfs_acl {
- __be32 acl_cnt;
- struct xfs_acl_entry acl_entry[0];
-};
-
-/*
- * The number of ACL entries allowed is defined by the on-disk format.
- * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
- * limited only by the maximum size of the xattr that stores the information.
- */
-#define XFS_ACL_MAX_ENTRIES(mp) \
- (xfs_sb_version_hascrc(&mp->m_sb) \
- ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
- sizeof(struct xfs_acl_entry) \
- : 25)
-
-#define XFS_ACL_MAX_SIZE(mp) \
- (sizeof(struct xfs_acl) + \
- sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
-
-/* On-disk XFS extended attribute names */
-#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
-#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
-#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
-#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
-
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f5b2453a43b2..18e2f3bbae5e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -20,8 +20,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
@@ -33,7 +31,6 @@
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include <linux/aio.h>
#include <linux/gfp.h>
#include <linux/mpage.h>
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index aa2a8b1838a2..83af4c149635 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -23,8 +23,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -39,7 +37,6 @@
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
-#include "xfs_dinode.h"
#include "xfs_dir2.h"
/*
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 62db83ab6cbc..a43d370d2c58 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -22,8 +22,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -39,7 +37,6 @@
#include "xfs_trace.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
-#include "xfs_dinode.h"
#include "xfs_dir2.h"
STATIC int
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 92e8f99a5857..22a5dcb70b32 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -23,8 +23,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_inode.h"
@@ -42,7 +40,6 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_log.h"
-#include "xfs_dinode.h"
/* Kernel only BMAP related definitions and functions */
@@ -1338,7 +1335,10 @@ xfs_free_file_space(
goto out;
}
-
+/*
+ * Preallocate and zero a range of a file. This mechanism has the allocation
+ * semantics of fallocate and in addition converts data in the range to zeroes.
+ */
int
xfs_zero_file_space(
struct xfs_inode *ip,
@@ -1346,65 +1346,30 @@ xfs_zero_file_space(
xfs_off_t len)
{
struct xfs_mount *mp = ip->i_mount;
- uint granularity;
- xfs_off_t start_boundary;
- xfs_off_t end_boundary;
+ uint blksize;
int error;
trace_xfs_zero_file_space(ip);
- granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+ blksize = 1 << mp->m_sb.sb_blocklog;
/*
- * Round the range of extents we are going to convert inwards. If the
- * offset is aligned, then it doesn't get changed so we zero from the
- * start of the block offset points to.
+ * Punch a hole and prealloc the range. We use hole punch rather than
+ * unwritten extent conversion for two reasons:
+ *
+ * 1.) Hole punch handles partial block zeroing for us.
+ *
+ * 2.) If prealloc returns ENOSPC, the file range is still zero-valued
+ * by virtue of the hole punch.
*/
- start_boundary = round_up(offset, granularity);
- end_boundary = round_down(offset + len, granularity);
-
- ASSERT(start_boundary >= offset);
- ASSERT(end_boundary <= offset + len);
-
- if (start_boundary < end_boundary - 1) {
- /*
- * Writeback the range to ensure any inode size updates due to
- * appending writes make it to disk (otherwise we could just
- * punch out the delalloc blocks).
- */
- error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- start_boundary, end_boundary - 1);
- if (error)
- goto out;
- truncate_pagecache_range(VFS_I(ip), start_boundary,
- end_boundary - 1);
-
- /* convert the blocks */
- error = xfs_alloc_file_space(ip, start_boundary,
- end_boundary - start_boundary - 1,
- XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT);
- if (error)
- goto out;
-
- /* We've handled the interior of the range, now for the edges */
- if (start_boundary != offset) {
- error = xfs_iozero(ip, offset, start_boundary - offset);
- if (error)
- goto out;
- }
-
- if (end_boundary != offset + len)
- error = xfs_iozero(ip, end_boundary,
- offset + len - end_boundary);
-
- } else {
- /*
- * It's either a sub-granularity range or the range spanned lies
- * partially across two adjacent blocks.
- */
- error = xfs_iozero(ip, offset, len);
- }
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ goto out;
+ error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+ round_up(offset + len, blksize) -
+ round_down(offset, blksize),
+ XFS_BMAPI_PREALLOC);
out:
return error;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 24b4ebea0d4d..bb502a391792 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -34,18 +34,16 @@
#include <linux/backing-dev.h>
#include <linux/freezer.h>
+#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
#include "xfs_log.h"
static kmem_zone_t *xfs_buf_zone;
-static struct workqueue_struct *xfslogd_workqueue;
-
#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
@@ -463,7 +461,7 @@ _xfs_buf_find(
* have to check that the buffer falls within the filesystem bounds.
*/
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
- if (blkno >= eofs) {
+ if (blkno < 0 || blkno >= eofs) {
/*
* XXX (dgc): we should really be returning -EFSCORRUPTED here,
* but none of the higher level infrastructure supports
@@ -1043,7 +1041,7 @@ xfs_buf_ioend_work(
struct work_struct *work)
{
struct xfs_buf *bp =
- container_of(work, xfs_buf_t, b_iodone_work);
+ container_of(work, xfs_buf_t, b_ioend_work);
xfs_buf_ioend(bp);
}
@@ -1052,8 +1050,8 @@ void
xfs_buf_ioend_async(
struct xfs_buf *bp)
{
- INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work);
- queue_work(xfslogd_workqueue, &bp->b_iodone_work);
+ INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
+ queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
}
void
@@ -1222,6 +1220,13 @@ _xfs_buf_ioapply(
*/
bp->b_error = 0;
+ /*
+ * Initialize the I/O completion workqueue if we haven't yet or the
+ * submitter has not opted to specify a custom one.
+ */
+ if (!bp->b_ioend_wq)
+ bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;
+
if (bp->b_flags & XBF_WRITE) {
if (bp->b_flags & XBF_SYNCIO)
rw = WRITE_SYNC;
@@ -1882,15 +1887,8 @@ xfs_buf_init(void)
if (!xfs_buf_zone)
goto out;
- xfslogd_workqueue = alloc_workqueue("xfslogd",
- WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 1);
- if (!xfslogd_workqueue)
- goto out_free_buf_zone;
-
return 0;
- out_free_buf_zone:
- kmem_zone_destroy(xfs_buf_zone);
out:
return -ENOMEM;
}
@@ -1898,6 +1896,5 @@ xfs_buf_init(void)
void
xfs_buf_terminate(void)
{
- destroy_workqueue(xfslogd_workqueue);
kmem_zone_destroy(xfs_buf_zone);
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 82002c00af90..75ff5d5a7d2e 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -164,7 +164,8 @@ typedef struct xfs_buf {
struct xfs_perag *b_pag; /* contains rbtree root */
xfs_buftarg_t *b_target; /* buffer target (device) */
void *b_addr; /* virtual address of buffer */
- struct work_struct b_iodone_work;
+ struct work_struct b_ioend_work;
+ struct workqueue_struct *b_ioend_wq; /* I/O completion wq */
xfs_buf_iodone_t b_iodone; /* I/O completion function */
struct completion b_iowait; /* queue for I/O waiters */
void *b_fspriv;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f15969543326..3f9bd58edec7 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -17,11 +17,11 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index f1b69edcdf31..098cd78fe708 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -22,8 +22,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -34,7 +32,6 @@
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_trans.h"
-#include "xfs_dinode.h"
/*
* Directory file type support functions
@@ -44,7 +41,7 @@ static unsigned char xfs_dir3_filetype_table[] = {
DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
};
-unsigned char
+static unsigned char
xfs_dir3_get_dtype(
struct xfs_mount *mp,
__uint8_t filetype)
@@ -57,22 +54,6 @@ xfs_dir3_get_dtype(
return xfs_dir3_filetype_table[filetype];
}
-/*
- * @mode, if set, indicates that the type field needs to be set up.
- * This uses the transformation from file mode to DT_* as defined in linux/fs.h
- * for file type specification. This will be propagated into the directory
- * structure if appropriate for the given operation and filesystem config.
- */
-const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
- [0] = XFS_DIR3_FT_UNKNOWN,
- [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR,
- [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK,
-};
STATIC int
xfs_dir2_sf_getdents(
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 13d08a1b390e..799e5a2d334d 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -20,7 +20,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_quota.h"
#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 63c2de49f61d..02c01bbbc789 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -22,8 +22,6 @@
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index f33fbaaa4d8a..814cff94e78f 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -20,8 +20,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_quota.h"
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index b92fd7bc49e3..3ee186ac1093 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -20,8 +20,6 @@
#include "xfs_fs.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_error.h"
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 5a6bd5d8779a..5eb4a14e0a0f 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -19,10 +19,9 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_export.h"
#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index fd22f69049d4..c263e079273e 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -24,7 +24,6 @@
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index c4327419dc5c..cb7fe64cdbfa 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -17,10 +17,9 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index eb596b419942..13e974e6a889 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -37,7 +35,6 @@
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
-#include "xfs_dinode.h"
#include "xfs_icache.h"
#include <linux/aio.h>
@@ -933,7 +930,6 @@ xfs_file_readdir(
{
struct inode *inode = file_inode(file);
xfs_inode_t *ip = XFS_I(inode);
- int error;
size_t bufsize;
/*
@@ -950,10 +946,7 @@ xfs_file_readdir(
*/
bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
- error = xfs_readdir(ip, ctx, bufsize);
- if (error)
- return error;
- return 0;
+ return xfs_readdir(ip, ctx, bufsize);
}
STATIC int
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index e92730c1d3ca..a2e86e8a0fea 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -20,16 +20,13 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_ag.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_inum.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_alloc.h"
#include "xfs_mru_cache.h"
-#include "xfs_dinode.h"
#include "xfs_filestream.h"
#include "xfs_trace.h"
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index c05ac8b70fa9..fdc64220fcb0 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -22,7 +22,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -40,7 +39,6 @@
#include "xfs_rtalloc.h"
#include "xfs_trace.h"
#include "xfs_log.h"
-#include "xfs_dinode.h"
#include "xfs_filestream.h"
/*
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index b45f7b27b5df..9771b7ef62ed 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -20,9 +20,7 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_inum.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_error.h"
@@ -65,6 +63,7 @@ xfs_inode_alloc(
return NULL;
}
+ XFS_STATS_INC(vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(!xfs_isiflocked(ip));
@@ -130,6 +129,7 @@ xfs_inode_free(
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!xfs_isiflocked(ip));
+ XFS_STATS_DEC(vn_active);
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 46748b86b12f..62f1f91c32cb 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -34,6 +34,14 @@ struct xfs_eofblocks {
#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
/*
+ * tags for inode radix tree
+ */
+#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
+ in xfs_inode_ag_iterator */
+#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
+#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
+
+/*
* Flags for xfs_iget()
*/
#define XFS_IGET_CREATE 0x1
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 7e4549233251..d45ca72af6fb 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -18,11 +18,10 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
+#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8ed049d1e332..41f804e740d7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -23,9 +23,7 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_inum.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_da_format.h"
@@ -1082,7 +1080,7 @@ xfs_create(
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
struct xfs_dquot *pdqp = NULL;
- struct xfs_trans_res tres;
+ struct xfs_trans_res *tres;
uint resblks;
trace_xfs_create(dp, name);
@@ -1105,13 +1103,11 @@ xfs_create(
if (is_dir) {
rdev = 0;
resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
- tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres;
- tres.tr_logcount = XFS_MKDIR_LOG_COUNT;
+ tres = &M_RES(mp)->tr_mkdir;
tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
} else {
resblks = XFS_CREATE_SPACE_RES(mp, name->len);
- tres.tr_logres = M_RES(mp)->tr_create.tr_logres;
- tres.tr_logcount = XFS_CREATE_LOG_COUNT;
+ tres = &M_RES(mp)->tr_create;
tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
}
@@ -1123,17 +1119,16 @@ xfs_create(
* the case we'll drop the one we have and get a more
* appropriate transaction later.
*/
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- error = xfs_trans_reserve(tp, &tres, resblks, 0);
+ error = xfs_trans_reserve(tp, tres, resblks, 0);
if (error == -ENOSPC) {
/* flush outstanding delalloc blocks and retry */
xfs_flush_inodes(mp);
- error = xfs_trans_reserve(tp, &tres, resblks, 0);
+ error = xfs_trans_reserve(tp, tres, resblks, 0);
}
if (error == -ENOSPC) {
/* No space at all so try a "no-allocation" reservation */
resblks = 0;
- error = xfs_trans_reserve(tp, &tres, 0, 0);
+ error = xfs_trans_reserve(tp, tres, 0, 0);
}
if (error) {
cancel_flags = 0;
@@ -2488,9 +2483,7 @@ xfs_remove(
xfs_fsblock_t first_block;
int cancel_flags;
int committed;
- int link_zero;
uint resblks;
- uint log_count;
trace_xfs_remove(dp, name);
@@ -2505,13 +2498,10 @@ xfs_remove(
if (error)
goto std_return;
- if (is_dir) {
+ if (is_dir)
tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
- log_count = XFS_DEFAULT_LOG_COUNT;
- } else {
+ else
tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
- log_count = XFS_REMOVE_LOG_COUNT;
- }
cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
/*
@@ -2579,9 +2569,6 @@ xfs_remove(
if (error)
goto out_trans_cancel;
- /* Determine if this is the last link while the inode is locked */
- link_zero = (ip->i_d.di_nlink == 0);
-
xfs_bmap_init(&free_list, &first_block);
error = xfs_dir_removename(tp, dp, name, ip->i_ino,
&first_block, &free_list, resblks);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 9af2882e1f4c..4ed2ba9342dc 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -20,7 +20,6 @@
#include "xfs_inode_buf.h"
#include "xfs_inode_fork.h"
-#include "xfs_dinode.h"
/*
* Kernel only inode definitions
@@ -324,7 +323,6 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
(((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
((pip)->i_d.di_mode & S_ISGID))
-
int xfs_release(struct xfs_inode *ip);
void xfs_inactive(struct xfs_inode *ip);
int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 63de0b0acc32..bf13a5a7e2f4 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -20,8 +20,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
@@ -29,7 +27,6 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans_priv.h"
-#include "xfs_dinode.h"
#include "xfs_log.h"
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 24c926b6fe85..a1831980a68e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_ioctl.h"
@@ -40,7 +38,6 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
-#include "xfs_dinode.h"
#include "xfs_trans.h"
#include <linux/capability.h>
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 94ce027e28e3..ec6772866f3d 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -25,8 +25,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_itable.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index afcf3c926565..c980e2a5086b 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
@@ -38,7 +36,6 @@
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
-#include "xfs_dinode.h"
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
@@ -52,7 +49,6 @@ xfs_iomap_eof_align_last_fsb(
xfs_extlen_t extsize,
xfs_fileoff_t *last_fsb)
{
- xfs_fileoff_t new_last_fsb = 0;
xfs_extlen_t align = 0;
int eof, error;
@@ -70,8 +66,8 @@ xfs_iomap_eof_align_last_fsb(
else if (mp->m_dalign)
align = mp->m_dalign;
- if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align))
- new_last_fsb = roundup_64(*last_fsb, align);
+ if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align))
+ align = 0;
}
/*
@@ -79,14 +75,14 @@ xfs_iomap_eof_align_last_fsb(
* (when file on a real-time subvolume or has di_extsize hint).
*/
if (extsize) {
- if (new_last_fsb)
- align = roundup_64(new_last_fsb, extsize);
+ if (align)
+ align = roundup_64(align, extsize);
else
align = extsize;
- new_last_fsb = roundup_64(*last_fsb, align);
}
- if (new_last_fsb) {
+ if (align) {
+ xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align);
error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
if (error)
return error;
@@ -264,7 +260,6 @@ xfs_iomap_eof_want_preallocate(
{
xfs_fileoff_t start_fsb;
xfs_filblks_t count_fsb;
- xfs_fsblock_t firstblock;
int n, error, imaps;
int found_delalloc = 0;
@@ -289,7 +284,6 @@ xfs_iomap_eof_want_preallocate(
count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
while (count_fsb > 0) {
imaps = nimaps;
- firstblock = NULLFSBLOCK;
error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
0);
if (error)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ec6dcdc181ee..c50311cae1b1 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_inode.h"
@@ -37,8 +35,7 @@
#include "xfs_icache.h"
#include "xfs_symlink.h"
#include "xfs_da_btree.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_dinode.h"
+#include "xfs_dir2.h"
#include "xfs_trans_space.h"
#include <linux/capability.h>
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f1deb961a296..82e314258f73 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -21,9 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
@@ -33,7 +30,6 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
-#include "xfs_dinode.h"
STATIC int
xfs_internal_inum(
@@ -236,8 +232,10 @@ xfs_bulkstat_grab_ichunk(
XFS_WANT_CORRUPTED_RETURN(stat == 1);
/* Check if the record contains the inode in request */
- if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
- return -EINVAL;
+ if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
+ *icount = 0;
+ return 0;
+ }
idx = agino - irec->ir_startino + 1;
if (idx < XFS_INODES_PER_CHUNK &&
@@ -262,75 +260,76 @@ xfs_bulkstat_grab_ichunk(
#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size)
+struct xfs_bulkstat_agichunk {
+ char __user **ac_ubuffer;/* pointer into user's buffer */
+ int ac_ubleft; /* bytes left in user's buffer */
+ int ac_ubelem; /* spaces used in user's buffer */
+};
+
/*
* Process inodes in chunk with a pointer to a formatter function
* that will iget the inode and fill in the appropriate structure.
*/
-int
+static int
xfs_bulkstat_ag_ichunk(
struct xfs_mount *mp,
xfs_agnumber_t agno,
struct xfs_inobt_rec_incore *irbp,
bulkstat_one_pf formatter,
size_t statstruct_size,
- struct xfs_bulkstat_agichunk *acp)
+ struct xfs_bulkstat_agichunk *acp,
+ xfs_agino_t *last_agino)
{
- xfs_ino_t lastino = acp->ac_lastino;
char __user **ubufp = acp->ac_ubuffer;
- int ubleft = acp->ac_ubleft;
- int ubelem = acp->ac_ubelem;
- int chunkidx, clustidx;
+ int chunkidx;
int error = 0;
- xfs_agino_t agino;
+ xfs_agino_t agino = irbp->ir_startino;
- for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
- XFS_BULKSTAT_UBLEFT(ubleft) &&
- irbp->ir_freecount < XFS_INODES_PER_CHUNK;
- chunkidx++, clustidx++, agino++) {
- int fmterror; /* bulkstat formatter result */
+ for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK;
+ chunkidx++, agino++) {
+ int fmterror;
int ubused;
- xfs_ino_t ino = XFS_AGINO_TO_INO(mp, agno, agino);
- ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
+ /* inode won't fit in buffer, we are done */
+ if (acp->ac_ubleft < statstruct_size)
+ break;
/* Skip if this inode is free */
- if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
- lastino = ino;
+ if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free)
continue;
- }
-
- /*
- * Count used inodes as free so we can tell when the
- * chunk is used up.
- */
- irbp->ir_freecount++;
/* Get the inode and fill in a single buffer */
ubused = statstruct_size;
- error = formatter(mp, ino, *ubufp, ubleft, &ubused, &fmterror);
- if (fmterror == BULKSTAT_RV_NOTHING) {
- if (error && error != -ENOENT && error != -EINVAL) {
- ubleft = 0;
- break;
- }
- lastino = ino;
- continue;
- }
- if (fmterror == BULKSTAT_RV_GIVEUP) {
- ubleft = 0;
+ error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino),
+ *ubufp, acp->ac_ubleft, &ubused, &fmterror);
+
+ if (fmterror == BULKSTAT_RV_GIVEUP ||
+ (error && error != -ENOENT && error != -EINVAL)) {
+ acp->ac_ubleft = 0;
ASSERT(error);
break;
}
- if (*ubufp)
- *ubufp += ubused;
- ubleft -= ubused;
- ubelem++;
- lastino = ino;
+
+ /* be careful not to leak error if at end of chunk */
+ if (fmterror == BULKSTAT_RV_NOTHING || error) {
+ error = 0;
+ continue;
+ }
+
+ *ubufp += ubused;
+ acp->ac_ubleft -= ubused;
+ acp->ac_ubelem++;
}
- acp->ac_lastino = lastino;
- acp->ac_ubleft = ubleft;
- acp->ac_ubelem = ubelem;
+ /*
+ * Post-update *last_agino. At this point, agino will always point one
+ * inode past the last inode we processed successfully. Hence we
+ * substract that inode when setting the *last_agino cursor so that we
+ * return the correct cookie to userspace. On the next bulkstat call,
+ * the inode under the lastino cookie will be skipped as we have already
+ * processed it here.
+ */
+ *last_agino = agino - 1;
return error;
}
@@ -349,49 +348,36 @@ xfs_bulkstat(
int *done) /* 1 if there are more stats to get */
{
xfs_buf_t *agbp; /* agi header buffer */
- xfs_agi_t *agi; /* agi header data */
xfs_agino_t agino; /* inode # in allocation group */
xfs_agnumber_t agno; /* allocation group number */
xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
- int end_of_ag; /* set if we've seen the ag end */
- int error; /* error code */
- int fmterror;/* bulkstat formatter result */
- int i; /* loop index */
- int icount; /* count of inodes good in irbuf */
size_t irbsize; /* size of irec buffer in bytes */
- xfs_ino_t ino; /* inode number (filesystem) */
- xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */
xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
- xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */
- xfs_ino_t lastino; /* last inode number returned */
int nirbuf; /* size of irbuf */
- int rval; /* return value error code */
- int tmp; /* result value from btree calls */
int ubcount; /* size of user's buffer */
- int ubleft; /* bytes left in user's buffer */
- char __user *ubufp; /* pointer into user's buffer */
- int ubelem; /* spaces used in user's buffer */
+ struct xfs_bulkstat_agichunk ac;
+ int error = 0;
/*
* Get the last inode value, see if there's nothing to do.
*/
- ino = (xfs_ino_t)*lastinop;
- lastino = ino;
- agno = XFS_INO_TO_AGNO(mp, ino);
- agino = XFS_INO_TO_AGINO(mp, ino);
+ agno = XFS_INO_TO_AGNO(mp, *lastinop);
+ agino = XFS_INO_TO_AGINO(mp, *lastinop);
if (agno >= mp->m_sb.sb_agcount ||
- ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) {
*done = 1;
*ubcountp = 0;
return 0;
}
ubcount = *ubcountp; /* statstruct's */
- ubleft = ubcount * statstruct_size; /* bytes */
- *ubcountp = ubelem = 0;
+ ac.ac_ubuffer = &ubuffer;
+ ac.ac_ubleft = ubcount * statstruct_size; /* bytes */;
+ ac.ac_ubelem = 0;
+
+ *ubcountp = 0;
*done = 0;
- fmterror = 0;
- ubufp = ubuffer;
+
irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
if (!irbuf)
return -ENOMEM;
@@ -402,22 +388,21 @@ xfs_bulkstat(
* Loop over the allocation groups, starting from the last
* inode returned; 0 means start of the allocation group.
*/
- rval = 0;
- while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
- cond_resched();
+ while (agno < mp->m_sb.sb_agcount) {
+ struct xfs_inobt_rec_incore *irbp = irbuf;
+ struct xfs_inobt_rec_incore *irbufend = irbuf + nirbuf;
+ bool end_of_ag = false;
+ int icount = 0;
+ int stat;
+
error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
if (error)
break;
- agi = XFS_BUF_TO_AGI(agbp);
/*
* Allocate and initialize a btree cursor for ialloc btree.
*/
cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
XFS_BTNUM_INO);
- irbp = irbuf;
- irbufend = irbuf + nirbuf;
- end_of_ag = 0;
- icount = 0;
if (agino > 0) {
/*
* In the middle of an allocation group, we need to get
@@ -427,22 +412,23 @@ xfs_bulkstat(
error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
if (error)
- break;
+ goto del_cursor;
if (icount) {
irbp->ir_startino = r.ir_startino;
irbp->ir_freecount = r.ir_freecount;
irbp->ir_free = r.ir_free;
irbp++;
- agino = r.ir_startino + XFS_INODES_PER_CHUNK;
}
/* Increment to the next record */
- error = xfs_btree_increment(cur, 0, &tmp);
+ error = xfs_btree_increment(cur, 0, &stat);
} else {
/* Start of ag. Lookup the first inode chunk */
- error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
+ }
+ if (error || stat == 0) {
+ end_of_ag = true;
+ goto del_cursor;
}
- if (error)
- break;
/*
* Loop through inode btree records in this ag,
@@ -451,10 +437,10 @@ xfs_bulkstat(
while (irbp < irbufend && icount < ubcount) {
struct xfs_inobt_rec_incore r;
- error = xfs_inobt_get_rec(cur, &r, &i);
- if (error || i == 0) {
- end_of_ag = 1;
- break;
+ error = xfs_inobt_get_rec(cur, &r, &stat);
+ if (error || stat == 0) {
+ end_of_ag = true;
+ goto del_cursor;
}
/*
@@ -469,77 +455,79 @@ xfs_bulkstat(
irbp++;
icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
}
- /*
- * Set agino to after this chunk and bump the cursor.
- */
- agino = r.ir_startino + XFS_INODES_PER_CHUNK;
- error = xfs_btree_increment(cur, 0, &tmp);
+ error = xfs_btree_increment(cur, 0, &stat);
+ if (error || stat == 0) {
+ end_of_ag = true;
+ goto del_cursor;
+ }
cond_resched();
}
+
/*
- * Drop the btree buffers and the agi buffer.
- * We can't hold any of the locks these represent
- * when calling iget.
+ * Drop the btree buffers and the agi buffer as we can't hold any
+ * of the locks these represent when calling iget. If there is a
+ * pending error, then we are done.
*/
+del_cursor:
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
xfs_buf_relse(agbp);
+ if (error)
+ break;
/*
- * Now format all the good inodes into the user's buffer.
+ * Now format all the good inodes into the user's buffer. The
+ * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer
+ * for the next loop iteration.
*/
irbufend = irbp;
for (irbp = irbuf;
- irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) {
- struct xfs_bulkstat_agichunk ac;
-
- ac.ac_lastino = lastino;
- ac.ac_ubuffer = &ubuffer;
- ac.ac_ubleft = ubleft;
- ac.ac_ubelem = ubelem;
+ irbp < irbufend && ac.ac_ubleft >= statstruct_size;
+ irbp++) {
error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
- formatter, statstruct_size, &ac);
+ formatter, statstruct_size, &ac,
+ &agino);
if (error)
- rval = error;
-
- lastino = ac.ac_lastino;
- ubleft = ac.ac_ubleft;
- ubelem = ac.ac_ubelem;
+ break;
cond_resched();
}
+
/*
- * Set up for the next loop iteration.
+ * If we've run out of space or had a formatting error, we
+ * are now done
*/
- if (XFS_BULKSTAT_UBLEFT(ubleft)) {
- if (end_of_ag) {
- agno++;
- agino = 0;
- } else
- agino = XFS_INO_TO_AGINO(mp, lastino);
- } else
+ if (ac.ac_ubleft < statstruct_size || error)
break;
+
+ if (end_of_ag) {
+ agno++;
+ agino = 0;
+ }
}
/*
* Done, we're either out of filesystem or space to put the data.
*/
kmem_free(irbuf);
- *ubcountp = ubelem;
+ *ubcountp = ac.ac_ubelem;
+
/*
- * Found some inodes, return them now and return the error next time.
+ * We found some inodes, so clear the error status and return them.
+ * The lastino pointer will point directly at the inode that triggered
+ * any error that occurred, so on the next call the error will be
+ * triggered again and propagated to userspace as there will be no
+ * formatted inodes in the buffer.
*/
- if (ubelem)
- rval = 0;
- if (agno >= mp->m_sb.sb_agcount) {
- /*
- * If we ran out of filesystem, mark lastino as off
- * the end of the filesystem, so the next call
- * will return immediately.
- */
- *lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0);
+ if (ac.ac_ubelem)
+ error = 0;
+
+ /*
+ * If we ran out of filesystem, lastino will point off the end of
+ * the filesystem so the next call will return immediately.
+ */
+ *lastinop = XFS_AGINO_TO_INO(mp, agno, agino);
+ if (agno >= mp->m_sb.sb_agcount)
*done = 1;
- } else
- *lastinop = (xfs_ino_t)lastino;
- return rval;
+ return error;
}
int
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index aaed08022eb9..6ea8b3912fa4 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -30,22 +30,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
int *ubused,
int *stat);
-struct xfs_bulkstat_agichunk {
- xfs_ino_t ac_lastino; /* last inode returned */
- char __user **ac_ubuffer;/* pointer into user's buffer */
- int ac_ubleft; /* bytes left in user's buffer */
- int ac_ubelem; /* spaces used in user's buffer */
-};
-
-int
-xfs_bulkstat_ag_ichunk(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- struct xfs_inobt_rec_incore *irbp,
- bulkstat_one_pf formatter,
- size_t statstruct_size,
- struct xfs_bulkstat_agichunk *acp);
-
/*
* Values for stat return value.
*/
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 6a51619d8690..c31d2c2eadc4 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -384,4 +384,10 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
#endif /* XFS_WARN */
#endif /* DEBUG */
+#ifdef CONFIG_XFS_RT
+#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
+#else
+#define XFS_IS_REALTIME_INODE(ip) (0)
+#endif
+
#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index fe88ef67f93a..e408bf5a3ff7 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_trans.h"
@@ -1031,7 +1029,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
struct xlog *log = mp->m_log;
int needed = 0;
- if (!xfs_fs_writable(mp))
+ if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
return 0;
if (!xlog_cil_empty(log))
@@ -1808,6 +1806,8 @@ xlog_sync(
XFS_BUF_ZEROFLAGS(bp);
XFS_BUF_ASYNC(bp);
bp->b_flags |= XBF_SYNCIO;
+ /* use high priority completion wq */
+ bp->b_ioend_wq = log->l_mp->m_log_workqueue;
if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
bp->b_flags |= XBF_FUA;
@@ -1856,6 +1856,8 @@ xlog_sync(
bp->b_flags |= XBF_SYNCIO;
if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
bp->b_flags |= XBF_FUA;
+ /* use high priority completion wq */
+ bp->b_ioend_wq = log->l_mp->m_log_workqueue;
ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index f506c457011e..45cc0ce18adf 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -17,11 +17,10 @@
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 00cd7f3a8f59..a5a945fc3bdc 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -22,11 +22,10 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_inum.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_log.h"
@@ -42,7 +41,6 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_error.h"
#include "xfs_dir2.h"
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 63ca2f0420b1..d8b67547ab34 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -17,10 +17,9 @@
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 51435dbce9c4..d3d38836f87f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -22,11 +22,10 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_inum.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_ialloc.h"
@@ -41,7 +40,6 @@
#include "xfs_fsops.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
-#include "xfs_dinode.h"
#include "xfs_sysfs.h"
@@ -1074,11 +1072,23 @@ xfs_unmountfs(
xfs_sysfs_del(&mp->m_kobj);
}
-int
-xfs_fs_writable(xfs_mount_t *mp)
+/*
+ * Determine whether modifications can proceed. The caller specifies the minimum
+ * freeze level for which modifications should not be allowed. This allows
+ * certain operations to proceed while the freeze sequence is in progress, if
+ * necessary.
+ */
+bool
+xfs_fs_writable(
+ struct xfs_mount *mp,
+ int level)
{
- return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) ||
- (mp->m_flags & XFS_MOUNT_RDONLY));
+ ASSERT(level > SB_UNFROZEN);
+ if ((mp->m_super->s_writers.frozen >= level) ||
+ XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY))
+ return false;
+
+ return true;
}
/*
@@ -1086,9 +1096,9 @@ xfs_fs_writable(xfs_mount_t *mp)
*
* Sync the superblock counters to disk.
*
- * Note this code can be called during the process of freezing, so
- * we may need to use the transaction allocator which does not
- * block when the transaction subsystem is in its frozen state.
+ * Note this code can be called during the process of freezing, so we use the
+ * transaction allocator that does not block when the transaction subsystem is
+ * in its frozen state.
*/
int
xfs_log_sbcount(xfs_mount_t *mp)
@@ -1096,7 +1106,8 @@ xfs_log_sbcount(xfs_mount_t *mp)
xfs_trans_t *tp;
int error;
- if (!xfs_fs_writable(mp))
+ /* allow this to proceed during the freeze sequence... */
+ if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
return 0;
xfs_icsb_sync_counters(mp, 0);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b0447c86e7e2..22ccf69d4d3c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -168,6 +168,7 @@ typedef struct xfs_mount {
/* low free space thresholds */
struct xfs_kobj m_kobj;
+ struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_data_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
struct workqueue_struct *m_cil_workqueue;
@@ -320,10 +321,7 @@ typedef struct xfs_mod_sb {
/*
* Per-ag incore structure, copies of information in agf and agi, to improve the
- * performance of allocation group selection. This is defined for the kernel
- * only, and hence is defined here instead of in xfs_ag.h. You need the struct
- * xfs_mount to be defined to look up a xfs_perag anyway (via mp->m_perag_tree),
- * so this doesn't introduce any strange header file dependencies.
+ * performance of allocation group selection.
*/
typedef struct xfs_perag {
struct xfs_mount *pag_mount; /* owner filesystem */
@@ -384,7 +382,7 @@ extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
-extern int xfs_fs_writable(xfs_mount_t *);
+extern bool xfs_fs_writable(struct xfs_mount *mp, int level);
extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index d68f23021af3..79fb19dd9c83 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -23,7 +23,6 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
@@ -38,7 +37,6 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_cksum.h"
-#include "xfs_dinode.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -1749,23 +1747,21 @@ xfs_qm_vop_dqalloc(
xfs_iunlock(ip, lockflags);
if (O_udqpp)
*O_udqpp = uq;
- else if (uq)
+ else
xfs_qm_dqrele(uq);
if (O_gdqpp)
*O_gdqpp = gq;
- else if (gq)
+ else
xfs_qm_dqrele(gq);
if (O_pdqpp)
*O_pdqpp = pq;
- else if (pq)
+ else
xfs_qm_dqrele(pq);
return 0;
error_rele:
- if (gq)
- xfs_qm_dqrele(gq);
- if (uq)
- xfs_qm_dqrele(uq);
+ xfs_qm_dqrele(gq);
+ xfs_qm_dqrele(uq);
return error;
}
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 2c61e61b0205..3e52d5de7ae1 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -20,8 +20,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 80f2d77d929a..74fca68e43b6 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -26,7 +26,6 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
@@ -784,19 +783,21 @@ xfs_qm_log_quotaoff(
{
xfs_trans_t *tp;
int error;
- xfs_qoff_logitem_t *qoffi=NULL;
- uint oldsbqflag=0;
+ xfs_qoff_logitem_t *qoffi;
+
+ *qoffstartp = NULL;
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
- if (error)
- goto error0;
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto out;
+ }
qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
xfs_trans_log_quotaoff_item(tp, qoffi);
spin_lock(&mp->m_sb_lock);
- oldsbqflag = mp->m_sb.sb_qflags;
mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
spin_unlock(&mp->m_sb_lock);
@@ -809,19 +810,11 @@ xfs_qm_log_quotaoff(
*/
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp, 0);
+ if (error)
+ goto out;
-error0:
- if (error) {
- xfs_trans_cancel(tp, 0);
- /*
- * No one else is modifying sb_qflags, so this is OK.
- * We still hold the quotaofflock.
- */
- spin_lock(&mp->m_sb_lock);
- mp->m_sb.sb_qflags = oldsbqflag;
- spin_unlock(&mp->m_sb_lock);
- }
*qoffstartp = qoffi;
+out:
return error;
}
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index b238027df987..7542bbeca6a1 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -19,8 +19,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_quota.h"
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index e1175ea9b551..f2079b6911cc 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -22,8 +22,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
@@ -36,7 +34,6 @@
#include "xfs_trace.h"
#include "xfs_buf.h"
#include "xfs_icache.h"
-#include "xfs_dinode.h"
#include "xfs_rtalloc.h"
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 9f622feda6a4..19cbda196369 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -21,9 +21,7 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_inum.h"
#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_inode.h"
@@ -44,7 +42,6 @@
#include "xfs_icache.h"
#include "xfs_trace.h"
#include "xfs_icreate_item.h"
-#include "xfs_dinode.h"
#include "xfs_filestream.h"
#include "xfs_quota.h"
#include "xfs_sysfs.h"
@@ -796,8 +793,7 @@ xfs_open_devices(
out_free_ddev_targ:
xfs_free_buftarg(mp, mp->m_ddev_targp);
out_close_rtdev:
- if (rtdev)
- xfs_blkdev_put(rtdev);
+ xfs_blkdev_put(rtdev);
out_close_logdev:
if (logdev && logdev != ddev)
xfs_blkdev_put(logdev);
@@ -842,10 +838,15 @@ STATIC int
xfs_init_mount_workqueues(
struct xfs_mount *mp)
{
+ mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_fsname);
+ if (!mp->m_buf_workqueue)
+ goto out;
+
mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_data_workqueue)
- goto out;
+ goto out_destroy_buf;
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
@@ -863,7 +864,7 @@ xfs_init_mount_workqueues(
goto out_destroy_cil;
mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
- WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
if (!mp->m_log_workqueue)
goto out_destroy_reclaim;
@@ -884,6 +885,8 @@ out_destroy_unwritten:
destroy_workqueue(mp->m_unwritten_workqueue);
out_destroy_data_iodone_queue:
destroy_workqueue(mp->m_data_workqueue);
+out_destroy_buf:
+ destroy_workqueue(mp->m_buf_workqueue);
out:
return -ENOMEM;
}
@@ -898,6 +901,7 @@ xfs_destroy_mount_workqueues(
destroy_workqueue(mp->m_cil_workqueue);
destroy_workqueue(mp->m_data_workqueue);
destroy_workqueue(mp->m_unwritten_workqueue);
+ destroy_workqueue(mp->m_buf_workqueue);
}
/*
@@ -1000,7 +1004,6 @@ xfs_fs_evict_inode(
clear_inode(inode);
XFS_STATS_INC(vn_rele);
XFS_STATS_INC(vn_remove);
- XFS_STATS_DEC(vn_active);
xfs_inactive(ip);
}
@@ -1425,6 +1428,7 @@ xfs_fs_fill_super(
sb->s_export_op = &xfs_export_operations;
#ifdef CONFIG_XFS_QUOTA
sb->s_qcop = &xfs_quotactl_operations;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
sb->s_op = &xfs_super_operations;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 02ae62a998e0..25791df6f638 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -23,8 +23,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
@@ -42,7 +40,6 @@
#include "xfs_symlink.h"
#include "xfs_trans.h"
#include "xfs_log.h"
-#include "xfs_dinode.h"
/* ----- Kernel only functions below ----- */
STATIC int
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 1e85bcd0e418..13a029806805 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 30e8e3410955..fa3135b9bf04 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -22,8 +22,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_extent_busy.h"
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 859482f53b5a..573aefb5a573 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -18,10 +18,9 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index e2b2216b1635..0a4d4ab6d9a9 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
@@ -229,13 +227,6 @@ xfs_trans_getsb(xfs_trans_t *tp,
return bp;
}
-#ifdef DEBUG
-xfs_buftarg_t *xfs_error_target;
-int xfs_do_error;
-int xfs_req_num;
-int xfs_error_mod = 33;
-#endif
-
/*
* Get and lock the buffer for the caller if it is not already
* locked within the given transaction. If it has not yet been
@@ -257,46 +248,11 @@ xfs_trans_read_buf_map(
struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
- xfs_buf_t *bp;
- xfs_buf_log_item_t *bip;
+ struct xfs_buf *bp = NULL;
+ struct xfs_buf_log_item *bip;
int error;
*bpp = NULL;
- if (!tp) {
- bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
- if (!bp)
- return (flags & XBF_TRYLOCK) ?
- -EAGAIN : -ENOMEM;
-
- if (bp->b_error) {
- error = bp->b_error;
- xfs_buf_ioerror_alert(bp, __func__);
- XFS_BUF_UNDONE(bp);
- xfs_buf_stale(bp);
- xfs_buf_relse(bp);
-
- /* bad CRC means corrupted metadata */
- if (error == -EFSBADCRC)
- error = -EFSCORRUPTED;
- return error;
- }
-#ifdef DEBUG
- if (xfs_do_error) {
- if (xfs_error_target == target) {
- if (((xfs_req_num++) % xfs_error_mod) == 0) {
- xfs_buf_relse(bp);
- xfs_debug(mp, "Returning error!");
- return -EIO;
- }
- }
- }
-#endif
- if (XFS_FORCED_SHUTDOWN(mp))
- goto shutdown_abort;
- *bpp = bp;
- return 0;
- }
-
/*
* If we find the buffer in the cache with this transaction
* pointer in its b_fsprivate2 field, then we know we already
@@ -305,49 +261,24 @@ xfs_trans_read_buf_map(
* If the buffer is not yet read in, then we read it in, increment
* the lock recursion count, and return it to the caller.
*/
- bp = xfs_trans_buf_item_match(tp, target, map, nmaps);
- if (bp != NULL) {
+ if (tp)
+ bp = xfs_trans_buf_item_match(tp, target, map, nmaps);
+ if (bp) {
ASSERT(xfs_buf_islocked(bp));
ASSERT(bp->b_transp == tp);
ASSERT(bp->b_fspriv != NULL);
ASSERT(!bp->b_error);
- if (!(XFS_BUF_ISDONE(bp))) {
- trace_xfs_trans_read_buf_io(bp, _RET_IP_);
- ASSERT(!XFS_BUF_ISASYNC(bp));
- ASSERT(bp->b_iodone == NULL);
- XFS_BUF_READ(bp);
- bp->b_ops = ops;
-
- error = xfs_buf_submit_wait(bp);
- if (error) {
- if (!XFS_FORCED_SHUTDOWN(mp))
- xfs_buf_ioerror_alert(bp, __func__);
- xfs_buf_relse(bp);
- /*
- * We can gracefully recover from most read
- * errors. Ones we can't are those that happen
- * after the transaction's already dirty.
- */
- if (tp->t_flags & XFS_TRANS_DIRTY)
- xfs_force_shutdown(tp->t_mountp,
- SHUTDOWN_META_IO_ERROR);
- /* bad CRC means corrupted metadata */
- if (error == -EFSBADCRC)
- error = -EFSCORRUPTED;
- return error;
- }
- }
+ ASSERT(bp->b_flags & XBF_DONE);
+
/*
* We never locked this buf ourselves, so we shouldn't
* brelse it either. Just get out.
*/
if (XFS_FORCED_SHUTDOWN(mp)) {
trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
- *bpp = NULL;
return -EIO;
}
-
bip = bp->b_fspriv;
bip->bli_recur++;
@@ -358,17 +289,29 @@ xfs_trans_read_buf_map(
}
bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
- if (bp == NULL) {
- *bpp = NULL;
- return (flags & XBF_TRYLOCK) ?
- 0 : -ENOMEM;
+ if (!bp) {
+ if (!(flags & XBF_TRYLOCK))
+ return -ENOMEM;
+ return tp ? 0 : -EAGAIN;
}
+
+ /*
+ * If we've had a read error, then the contents of the buffer are
+ * invalid and should not be used. To ensure that a followup read tries
+ * to pull the buffer from disk again, we clear the XBF_DONE flag and
+ * mark the buffer stale. This ensures that anyone who has a current
+ * reference to the buffer will interpret it's contents correctly and
+ * future cache lookups will also treat it as an empty, uninitialised
+ * buffer.
+ */
if (bp->b_error) {
error = bp->b_error;
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xfs_buf_ioerror_alert(bp, __func__);
+ bp->b_flags &= ~XBF_DONE;
xfs_buf_stale(bp);
- XFS_BUF_DONE(bp);
- xfs_buf_ioerror_alert(bp, __func__);
- if (tp->t_flags & XFS_TRANS_DIRTY)
+
+ if (tp && (tp->t_flags & XFS_TRANS_DIRTY))
xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
xfs_buf_relse(bp);
@@ -377,33 +320,19 @@ xfs_trans_read_buf_map(
error = -EFSCORRUPTED;
return error;
}
-#ifdef DEBUG
- if (xfs_do_error && !(tp->t_flags & XFS_TRANS_DIRTY)) {
- if (xfs_error_target == target) {
- if (((xfs_req_num++) % xfs_error_mod) == 0) {
- xfs_force_shutdown(tp->t_mountp,
- SHUTDOWN_META_IO_ERROR);
- xfs_buf_relse(bp);
- xfs_debug(mp, "Returning trans error!");
- return -EIO;
- }
- }
+
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_buf_relse(bp);
+ trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
+ return -EIO;
}
-#endif
- if (XFS_FORCED_SHUTDOWN(mp))
- goto shutdown_abort;
- _xfs_trans_bjoin(tp, bp, 1);
+ if (tp)
+ _xfs_trans_bjoin(tp, bp, 1);
trace_xfs_trans_read_buf(bp->b_fspriv);
-
*bpp = bp;
return 0;
-shutdown_abort:
- trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
- xfs_buf_relse(bp);
- *bpp = NULL;
- return -EIO;
}
/*
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 846e061c2e98..76a16df55ef7 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_error.h"
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 47978ba89dae..284397dd7990 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -18,10 +18,9 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
+#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index cdb4d86520e1..17280cd71934 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -21,8 +21,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 93455b998041..69f6e475de97 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -20,8 +20,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_inode.h"