summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/9p/v9fs.h31
-rw-r--r--fs/9p/v9fs_vfs.h11
-rw-r--r--fs/9p/vfs_dir.c4
-rw-r--r--fs/9p/vfs_file.c40
-rw-r--r--fs/9p/vfs_inode.c166
-rw-r--r--fs/9p/vfs_inode_dotl.c200
-rw-r--r--fs/9p/vfs_super.c45
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/Makefile3
-rw-r--r--fs/adfs/super.c2
-rw-r--r--fs/affs/super.c3
-rw-r--r--fs/afs/dir.c10
-rw-r--r--fs/afs/flock.c60
-rw-r--r--fs/afs/rotate.c21
-rw-r--r--fs/afs/validation.c16
-rw-r--r--fs/aio.c37
-rw-r--r--fs/attr.c7
-rw-r--r--fs/backing-file.c4
-rw-r--r--fs/bcachefs/Makefile7
-rw-r--r--fs/bcachefs/acl.c30
-rw-r--r--fs/bcachefs/alloc_background.c281
-rw-r--r--fs/bcachefs/alloc_background.h1
-rw-r--r--fs/bcachefs/alloc_foreground.c27
-rw-r--r--fs/bcachefs/alloc_types.h3
-rw-r--r--fs/bcachefs/backpointers.c337
-rw-r--r--fs/bcachefs/backpointers.h41
-rw-r--r--fs/bcachefs/bbpos_types.h2
-rw-r--r--fs/bcachefs/bcachefs.h33
-rw-r--r--fs/bcachefs/bcachefs_format.h77
-rw-r--r--fs/bcachefs/bkey.h211
-rw-r--r--fs/bcachefs/bkey_methods.c8
-rw-r--r--fs/bcachefs/bkey_types.h213
-rw-r--r--fs/bcachefs/bset.c14
-rw-r--r--fs/bcachefs/bset.h2
-rw-r--r--fs/bcachefs/btree_cache.c109
-rw-r--r--fs/bcachefs/btree_gc.c681
-rw-r--r--fs/bcachefs/btree_io.c65
-rw-r--r--fs/bcachefs/btree_iter.c72
-rw-r--r--fs/bcachefs/btree_iter.h11
-rw-r--r--fs/bcachefs/btree_journal_iter.c279
-rw-r--r--fs/bcachefs/btree_journal_iter.h22
-rw-r--r--fs/bcachefs/btree_key_cache.c14
-rw-r--r--fs/bcachefs/btree_locking.c31
-rw-r--r--fs/bcachefs/btree_node_scan.c519
-rw-r--r--fs/bcachefs/btree_node_scan.h11
-rw-r--r--fs/bcachefs/btree_node_scan_types.h30
-rw-r--r--fs/bcachefs/btree_trans_commit.c39
-rw-r--r--fs/bcachefs/btree_types.h23
-rw-r--r--fs/bcachefs/btree_update.c29
-rw-r--r--fs/bcachefs/btree_update.h3
-rw-r--r--fs/bcachefs/btree_update_interior.c494
-rw-r--r--fs/bcachefs/btree_update_interior.h34
-rw-r--r--fs/bcachefs/btree_write_buffer.c34
-rw-r--r--fs/bcachefs/buckets.c50
-rw-r--r--fs/bcachefs/buckets.h9
-rw-r--r--fs/bcachefs/chardev.c133
-rw-r--r--fs/bcachefs/checksum.c25
-rw-r--r--fs/bcachefs/checksum.h5
-rw-r--r--fs/bcachefs/compress.c14
-rw-r--r--fs/bcachefs/compress.h8
-rw-r--r--fs/bcachefs/data_update.c29
-rw-r--r--fs/bcachefs/debug.c83
-rw-r--r--fs/bcachefs/dirent.c143
-rw-r--r--fs/bcachefs/dirent.h6
-rw-r--r--fs/bcachefs/ec.c64
-rw-r--r--fs/bcachefs/ec.h2
-rw-r--r--fs/bcachefs/errcode.c15
-rw-r--r--fs/bcachefs/errcode.h19
-rw-r--r--fs/bcachefs/error.c14
-rw-r--r--fs/bcachefs/error.h12
-rw-r--r--fs/bcachefs/extents.c70
-rw-r--r--fs/bcachefs/extents.h42
-rw-r--r--fs/bcachefs/eytzinger.c234
-rw-r--r--fs/bcachefs/eytzinger.h81
-rw-r--r--fs/bcachefs/fifo.h4
-rw-r--r--fs/bcachefs/fs-common.c74
-rw-r--r--fs/bcachefs/fs-io-buffered.c149
-rw-r--r--fs/bcachefs/fs-io-direct.c23
-rw-r--r--fs/bcachefs/fs-io-pagecache.h9
-rw-r--r--fs/bcachefs/fs-io.c16
-rw-r--r--fs/bcachefs/fs.c226
-rw-r--r--fs/bcachefs/fsck.c1124
-rw-r--r--fs/bcachefs/fsck.h1
-rw-r--r--fs/bcachefs/inode.c57
-rw-r--r--fs/bcachefs/inode.h19
-rw-r--r--fs/bcachefs/io_misc.c2
-rw-r--r--fs/bcachefs/io_read.c2
-rw-r--r--fs/bcachefs/io_write.c18
-rw-r--r--fs/bcachefs/journal.c278
-rw-r--r--fs/bcachefs/journal.h7
-rw-r--r--fs/bcachefs/journal_io.c435
-rw-r--r--fs/bcachefs/journal_io.h47
-rw-r--r--fs/bcachefs/journal_reclaim.c31
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c72
-rw-r--r--fs/bcachefs/journal_types.h31
-rw-r--r--fs/bcachefs/logged_ops.c11
-rw-r--r--fs/bcachefs/lru.c7
-rw-r--r--fs/bcachefs/mean_and_variance.c28
-rw-r--r--fs/bcachefs/mean_and_variance.h14
-rw-r--r--fs/bcachefs/mean_and_variance_test.c108
-rw-r--r--fs/bcachefs/migrate.c8
-rw-r--r--fs/bcachefs/movinggc.c3
-rw-r--r--fs/bcachefs/opts.c41
-rw-r--r--fs/bcachefs/opts.h31
-rw-r--r--fs/bcachefs/rebalance.c4
-rw-r--r--fs/bcachefs/recovery.c482
-rw-r--r--fs/bcachefs/recovery.h32
-rw-r--r--fs/bcachefs/recovery_passes.c249
-rw-r--r--fs/bcachefs/recovery_passes.h17
-rw-r--r--fs/bcachefs/recovery_passes_types.h (renamed from fs/bcachefs/recovery_types.h)15
-rw-r--r--fs/bcachefs/reflink.c3
-rw-r--r--fs/bcachefs/replicas.c19
-rw-r--r--fs/bcachefs/sb-clean.c16
-rw-r--r--fs/bcachefs/sb-downgrade.c15
-rw-r--r--fs/bcachefs/sb-errors_types.h27
-rw-r--r--fs/bcachefs/sb-members.c53
-rw-r--r--fs/bcachefs/sb-members.h21
-rw-r--r--fs/bcachefs/snapshot.c251
-rw-r--r--fs/bcachefs/snapshot.h89
-rw-r--r--fs/bcachefs/str_hash.h15
-rw-r--r--fs/bcachefs/subvolume.c259
-rw-r--r--fs/bcachefs/subvolume.h11
-rw-r--r--fs/bcachefs/subvolume_format.h4
-rw-r--r--fs/bcachefs/subvolume_types.h2
-rw-r--r--fs/bcachefs/super-io.c68
-rw-r--r--fs/bcachefs/super.c129
-rw-r--r--fs/bcachefs/super_types.h4
-rw-r--r--fs/bcachefs/sysfs.c21
-rw-r--r--fs/bcachefs/tests.c2
-rw-r--r--fs/bcachefs/thread_with_file.c391
-rw-r--r--fs/bcachefs/thread_with_file.h59
-rw-r--r--fs/bcachefs/thread_with_file_types.h15
-rw-r--r--fs/bcachefs/time_stats.c165
-rw-r--r--fs/bcachefs/time_stats.h159
-rw-r--r--fs/bcachefs/trace.h19
-rw-r--r--fs/bcachefs/util.c370
-rw-r--r--fs/bcachefs/util.h167
-rw-r--r--fs/bcachefs/xattr.c5
-rw-r--r--fs/befs/linuxvfs.c3
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/binfmt_elf_fdpic.c4
-rw-r--r--fs/btrfs/dev-replace.c14
-rw-r--r--fs/btrfs/ioctl.c16
-rw-r--r--fs/btrfs/volumes.c150
-rw-r--r--fs/btrfs/volumes.h4
-rw-r--r--fs/btrfs/zoned.c35
-rw-r--r--fs/buffer.c22
-rw-r--r--fs/ceph/addr.c4
-rw-r--r--fs/ceph/caps.c12
-rw-r--r--fs/ceph/file.c31
-rw-r--r--fs/ceph/locks.c74
-rw-r--r--fs/ceph/mds_client.c9
-rw-r--r--fs/ceph/mds_client.h3
-rw-r--r--fs/ceph/super.c18
-rw-r--r--fs/coda/inode.c147
-rw-r--r--fs/coredump.c45
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/crypto/fname.c8
-rw-r--r--fs/crypto/fscrypt_private.h14
-rw-r--r--fs/crypto/hooks.c15
-rw-r--r--fs/crypto/keyring.c8
-rw-r--r--fs/crypto/keysetup.c11
-rw-r--r--fs/dcache.c2
-rw-r--r--fs/debugfs/inode.c25
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/dlm/dlm_internal.h2
-rw-r--r--fs/dlm/lock.c108
-rw-r--r--fs/dlm/plock.c44
-rw-r--r--fs/dlm/user.c10
-rw-r--r--fs/ecryptfs/crypto.c10
-rw-r--r--fs/efs/super.c118
-rw-r--r--fs/erofs/compress.h7
-rw-r--r--fs/erofs/data.c7
-rw-r--r--fs/erofs/decompressor.c3
-rw-r--r--fs/erofs/decompressor_deflate.c3
-rw-r--r--fs/erofs/decompressor_lzma.c3
-rw-r--r--fs/erofs/fscache.c297
-rw-r--r--fs/erofs/inode.c14
-rw-r--r--fs/erofs/internal.h11
-rw-r--r--fs/erofs/super.c49
-rw-r--r--fs/erofs/utils.c2
-rw-r--r--fs/erofs/zdata.c287
-rw-r--r--fs/eventfd.c16
-rw-r--r--fs/eventpoll.c147
-rw-r--r--fs/exec.c13
-rw-r--r--fs/exfat/cache.c2
-rw-r--r--fs/exfat/dir.c290
-rw-r--r--fs/exfat/exfat_fs.h25
-rw-r--r--fs/exfat/inode.c2
-rw-r--r--fs/exfat/namei.c352
-rw-r--r--fs/exfat/super.c2
-rw-r--r--fs/exportfs/expfs.c2
-rw-r--r--fs/ext2/Kconfig15
-rw-r--r--fs/ext2/balloc.c2
-rw-r--r--fs/ext2/ext2.h2
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/super.c5
-rw-r--r--fs/ext2/xattr.c2
-rw-r--r--fs/ext4/ext4.h4
-rw-r--r--fs/ext4/extents.c6
-rw-r--r--fs/ext4/fsmap.c8
-rw-r--r--fs/ext4/inode.c10
-rw-r--r--fs/ext4/mballoc-test.c601
-rw-r--r--fs/ext4/mballoc.c62
-rw-r--r--fs/ext4/namei.c1
-rw-r--r--fs/ext4/resize.c5
-rw-r--r--fs/ext4/super.c102
-rw-r--r--fs/ext4/xattr.c61
-rw-r--r--fs/f2fs/checkpoint.c74
-rw-r--r--fs/f2fs/compress.c55
-rw-r--r--fs/f2fs/data.c191
-rw-r--r--fs/f2fs/debug.c11
-rw-r--r--fs/f2fs/dir.c10
-rw-r--r--fs/f2fs/extent_cache.c5
-rw-r--r--fs/f2fs/f2fs.h257
-rw-r--r--fs/f2fs/file.c171
-rw-r--r--fs/f2fs/gc.c129
-rw-r--r--fs/f2fs/gc.h4
-rw-r--r--fs/f2fs/namei.c37
-rw-r--r--fs/f2fs/node.c26
-rw-r--r--fs/f2fs/node.h4
-rw-r--r--fs/f2fs/recovery.c56
-rw-r--r--fs/f2fs/segment.c459
-rw-r--r--fs/f2fs/segment.h90
-rw-r--r--fs/f2fs/super.c225
-rw-r--r--fs/f2fs/sysfs.c52
-rw-r--r--fs/f2fs/verity.c16
-rw-r--r--fs/fat/cache.c2
-rw-r--r--fs/fat/inode.c5
-rw-r--r--fs/fat/nfs.c6
-rw-r--r--fs/fcntl.c72
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/file_table.c86
-rw-r--r--fs/freevxfs/vxfs_super.c2
-rw-r--r--fs/fs-writeback.c25
-rw-r--r--fs/fs_parser.c4
-rw-r--r--fs/fuse/Kconfig11
-rw-r--r--fs/fuse/Makefile2
-rw-r--r--fs/fuse/control.c6
-rw-r--r--fs/fuse/dev.c156
-rw-r--r--fs/fuse/dir.c55
-rw-r--r--fs/fuse/file.c469
-rw-r--r--fs/fuse/fuse_i.h153
-rw-r--r--fs/fuse/inode.c55
-rw-r--r--fs/fuse/iomode.c254
-rw-r--r--fs/fuse/passthrough.c355
-rw-r--r--fs/fuse/readdir.c4
-rw-r--r--fs/fuse/virtio_fs.c156
-rw-r--r--fs/gfs2/bmap.c7
-rw-r--r--fs/gfs2/file.c16
-rw-r--r--fs/gfs2/main.c1
-rw-r--r--fs/gfs2/ops_fstype.c2
-rw-r--r--fs/hfsplus/wrapper.c2
-rw-r--r--fs/hpfs/super.c2
-rw-r--r--fs/hugetlbfs/inode.c23
-rw-r--r--fs/inode.c15
-rw-r--r--fs/internal.h8
-rw-r--r--fs/ioctl.c33
-rw-r--r--fs/iomap/buffered-io.c579
-rw-r--r--fs/iomap/direct-io.c1
-rw-r--r--fs/iomap/trace.h48
-rw-r--r--fs/isofs/inode.c20
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/jfs/jfs_incore.h2
-rw-r--r--fs/jfs/jfs_logmgr.c26
-rw-r--r--fs/jfs/jfs_logmgr.h2
-rw-r--r--fs/jfs/jfs_mount.c2
-rw-r--r--fs/jfs/super.c4
-rw-r--r--fs/kernfs/dir.c31
-rw-r--r--fs/kernfs/file.c17
-rw-r--r--fs/kernfs/kernfs-internal.h2
-rw-r--r--fs/kernfs/mount.c4
-rw-r--r--fs/libfs.c347
-rw-r--r--fs/lockd/clnt4xdr.c14
-rw-r--r--fs/lockd/clntlock.c2
-rw-r--r--fs/lockd/clntproc.c65
-rw-r--r--fs/lockd/clntxdr.c14
-rw-r--r--fs/lockd/svc.c3
-rw-r--r--fs/lockd/svc4proc.c10
-rw-r--r--fs/lockd/svclock.c64
-rw-r--r--fs/lockd/svcproc.c10
-rw-r--r--fs/lockd/svcsubs.c24
-rw-r--r--fs/lockd/xdr.c14
-rw-r--r--fs/lockd/xdr4.c14
-rw-r--r--fs/locks.c894
-rw-r--r--fs/mbcache.c4
-rw-r--r--fs/minix/inode.c2
-rw-r--r--fs/mnt_idmapping.c2
-rw-r--r--fs/mpage.c1
-rw-r--r--fs/namei.c15
-rw-r--r--fs/netfs/fscache_io.c4
-rw-r--r--fs/nfs/blocklayout/blocklayout.h2
-rw-r--r--fs/nfs/blocklayout/dev.c68
-rw-r--r--fs/nfs/callback.c3
-rw-r--r--fs/nfs/client.c9
-rw-r--r--fs/nfs/delegation.c8
-rw-r--r--fs/nfs/direct.c21
-rw-r--r--fs/nfs/file.c22
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c2
-rw-r--r--fs/nfs/fs_context.c1
-rw-r--r--fs/nfs/fscache.c9
-rw-r--r--fs/nfs/inode.c10
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfs/netns.h2
-rw-r--r--fs/nfs/nfs3client.c1
-rw-r--r--fs/nfs/nfs3proc.c2
-rw-r--r--fs/nfs/nfs42.h7
-rw-r--r--fs/nfs/nfs42xattr.c2
-rw-r--r--fs/nfs/nfs4_fs.h3
-rw-r--r--fs/nfs/nfs4client.c2
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c68
-rw-r--r--fs/nfs/nfs4state.c34
-rw-r--r--fs/nfs/nfs4super.c24
-rw-r--r--fs/nfs/nfs4trace.c2
-rw-r--r--fs/nfs/nfs4trace.h62
-rw-r--r--fs/nfs/nfs4xdr.c8
-rw-r--r--fs/nfs/nfsroot.c4
-rw-r--r--fs/nfs/pnfs.c8
-rw-r--r--fs/nfs/pnfs_nfs.c44
-rw-r--r--fs/nfs/read.c2
-rw-r--r--fs/nfs/super.c10
-rw-r--r--fs/nfs/write.c14
-rw-r--r--fs/nfsd/blocklayout.c4
-rw-r--r--fs/nfsd/cache.h2
-rw-r--r--fs/nfsd/filecache.c80
-rw-r--r--fs/nfsd/filecache.h1
-rw-r--r--fs/nfsd/netns.h29
-rw-r--r--fs/nfsd/nfs3proc.c6
-rw-r--r--fs/nfsd/nfs3xdr.c5
-rw-r--r--fs/nfsd/nfs4callback.c193
-rw-r--r--fs/nfsd/nfs4layouts.c96
-rw-r--r--fs/nfsd/nfs4proc.c13
-rw-r--r--fs/nfsd/nfs4state.c977
-rw-r--r--fs/nfsd/nfs4xdr.c105
-rw-r--r--fs/nfsd/nfscache.c43
-rw-r--r--fs/nfsd/nfsctl.c17
-rw-r--r--fs/nfsd/nfsd.h3
-rw-r--r--fs/nfsd/nfsfh.c3
-rw-r--r--fs/nfsd/nfsproc.c6
-rw-r--r--fs/nfsd/nfssvc.c16
-rw-r--r--fs/nfsd/pnfs.h8
-rw-r--r--fs/nfsd/state.h83
-rw-r--r--fs/nfsd/stats.c52
-rw-r--r--fs/nfsd/stats.h70
-rw-r--r--fs/nfsd/trace.h222
-rw-r--r--fs/nfsd/vfs.c90
-rw-r--r--fs/nfsd/vfs.h4
-rw-r--r--fs/nfsd/xdr3.h2
-rw-r--r--fs/nfsd/xdr4cb.h18
-rw-r--r--fs/nilfs2/alloc.c91
-rw-r--r--fs/nilfs2/bmap.c3
-rw-r--r--fs/nilfs2/btree.c9
-rw-r--r--fs/nilfs2/cpfile.c321
-rw-r--r--fs/nilfs2/cpfile.h10
-rw-r--r--fs/nilfs2/dat.c40
-rw-r--r--fs/nilfs2/direct.c9
-rw-r--r--fs/nilfs2/ifile.c21
-rw-r--r--fs/nilfs2/ifile.h10
-rw-r--r--fs/nilfs2/inode.c46
-rw-r--r--fs/nilfs2/ioctl.c4
-rw-r--r--fs/nilfs2/mdt.c4
-rw-r--r--fs/nilfs2/nilfs.h3
-rw-r--r--fs/nilfs2/page.c8
-rw-r--r--fs/nilfs2/recovery.c4
-rw-r--r--fs/nilfs2/segbuf.c4
-rw-r--r--fs/nilfs2/segment.c121
-rw-r--r--fs/nilfs2/sufile.c88
-rw-r--r--fs/nilfs2/super.c33
-rw-r--r--fs/nilfs2/the_nilfs.c2
-rw-r--r--fs/notify/fanotify/fanotify.c6
-rw-r--r--fs/notify/fsnotify.c28
-rw-r--r--fs/nsfs.c124
-rw-r--r--fs/ntfs/Kconfig81
-rw-r--r--fs/ntfs/Makefile15
-rw-r--r--fs/ntfs/aops.c1744
-rw-r--r--fs/ntfs/aops.h88
-rw-r--r--fs/ntfs/attrib.c2624
-rw-r--r--fs/ntfs/attrib.h102
-rw-r--r--fs/ntfs/bitmap.c179
-rw-r--r--fs/ntfs/bitmap.h104
-rw-r--r--fs/ntfs/collate.c110
-rw-r--r--fs/ntfs/collate.h36
-rw-r--r--fs/ntfs/compress.c950
-rw-r--r--fs/ntfs/debug.c159
-rw-r--r--fs/ntfs/debug.h57
-rw-r--r--fs/ntfs/dir.c1540
-rw-r--r--fs/ntfs/dir.h34
-rw-r--r--fs/ntfs/endian.h79
-rw-r--r--fs/ntfs/file.c1997
-rw-r--r--fs/ntfs/index.c440
-rw-r--r--fs/ntfs/index.h134
-rw-r--r--fs/ntfs/inode.c3102
-rw-r--r--fs/ntfs/inode.h310
-rw-r--r--fs/ntfs/layout.h2421
-rw-r--r--fs/ntfs/lcnalloc.c1000
-rw-r--r--fs/ntfs/lcnalloc.h131
-rw-r--r--fs/ntfs/logfile.c849
-rw-r--r--fs/ntfs/logfile.h295
-rw-r--r--fs/ntfs/malloc.h77
-rw-r--r--fs/ntfs/mft.c2907
-rw-r--r--fs/ntfs/mft.h110
-rw-r--r--fs/ntfs/mst.c189
-rw-r--r--fs/ntfs/namei.c392
-rw-r--r--fs/ntfs/ntfs.h150
-rw-r--r--fs/ntfs/quota.c103
-rw-r--r--fs/ntfs/quota.h21
-rw-r--r--fs/ntfs/runlist.c1893
-rw-r--r--fs/ntfs/runlist.h88
-rw-r--r--fs/ntfs/super.c3202
-rw-r--r--fs/ntfs/sysctl.c58
-rw-r--r--fs/ntfs/sysctl.h27
-rw-r--r--fs/ntfs/time.h89
-rw-r--r--fs/ntfs/types.h55
-rw-r--r--fs/ntfs/unistr.c384
-rw-r--r--fs/ntfs/upcase.c73
-rw-r--r--fs/ntfs/usnjrnl.c70
-rw-r--r--fs/ntfs/usnjrnl.h191
-rw-r--r--fs/ntfs/volume.h164
-rw-r--r--fs/ntfs3/namei.c2
-rw-r--r--fs/ntfs3/super.c2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c32
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c2
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/file.c1
-rw-r--r--fs/ocfs2/inode.h2
-rw-r--r--fs/ocfs2/locks.c12
-rw-r--r--fs/ocfs2/quota_global.c12
-rw-r--r--fs/ocfs2/quota_local.c3
-rw-r--r--fs/ocfs2/stack_user.c2
-rw-r--r--fs/ocfs2/super.c13
-rw-r--r--fs/open.c56
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/orangefs/orangefs-cache.c2
-rw-r--r--fs/orangefs/orangefs-kernel.h10
-rw-r--r--fs/orangefs/super.c4
-rw-r--r--fs/overlayfs/copy_up.c6
-rw-r--r--fs/overlayfs/params.c14
-rw-r--r--fs/overlayfs/super.c54
-rw-r--r--fs/overlayfs/util.c18
-rw-r--r--fs/pidfs.c283
-rw-r--r--fs/pipe.c81
-rw-r--r--fs/posix_acl.c9
-rw-r--r--fs/proc/Kconfig2
-rw-r--r--fs/proc/Makefile2
-rw-r--r--fs/proc/bootconfig.c12
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/task_mmu.c17
-rw-r--r--fs/pstore/inode.c10
-rw-r--r--fs/pstore/ram.c1
-rw-r--r--fs/pstore/zone.c3
-rw-r--r--fs/qnx4/inode.c49
-rw-r--r--fs/qnx6/inode.c2
-rw-r--r--fs/quota/dquot.c174
-rw-r--r--fs/quota/quota_tree.c152
-rw-r--r--fs/quota/quota_v1.c6
-rw-r--r--fs/quota/quota_v2.c35
-rw-r--r--fs/ramfs/inode.c32
-rw-r--r--fs/reiserfs/journal.c38
-rw-r--r--fs/reiserfs/procfs.c2
-rw-r--r--fs/reiserfs/reiserfs.h10
-rw-r--r--fs/reiserfs/super.c3
-rw-r--r--fs/romfs/super.c6
-rw-r--r--fs/select.c15
-rw-r--r--fs/smb/client/Makefile2
-rw-r--r--fs/smb/client/cached_dir.c9
-rw-r--r--fs/smb/client/cifs_debug.c40
-rw-r--r--fs/smb/client/cifsfs.c24
-rw-r--r--fs/smb/client/cifsfs.h4
-rw-r--r--fs/smb/client/cifsglob.h86
-rw-r--r--fs/smb/client/cifsproto.h40
-rw-r--r--fs/smb/client/cifssmb.c18
-rw-r--r--fs/smb/client/connect.c174
-rw-r--r--fs/smb/client/dfs.c51
-rw-r--r--fs/smb/client/dfs.h33
-rw-r--r--fs/smb/client/dfs_cache.c53
-rw-r--r--fs/smb/client/dir.c22
-rw-r--r--fs/smb/client/file.c506
-rw-r--r--fs/smb/client/fs_context.c91
-rw-r--r--fs/smb/client/fs_context.h25
-rw-r--r--fs/smb/client/fscache.c16
-rw-r--r--fs/smb/client/fscache.h6
-rw-r--r--fs/smb/client/inode.c142
-rw-r--r--fs/smb/client/ioctl.c11
-rw-r--r--fs/smb/client/misc.c46
-rw-r--r--fs/smb/client/readdir.c22
-rw-r--r--fs/smb/client/reparse.c532
-rw-r--r--fs/smb/client/reparse.h113
-rw-r--r--fs/smb/client/sess.c4
-rw-r--r--fs/smb/client/smb1ops.c4
-rw-r--r--fs/smb/client/smb2file.c2
-rw-r--r--fs/smb/client/smb2glob.h3
-rw-r--r--fs/smb/client/smb2inode.c487
-rw-r--r--fs/smb/client/smb2misc.c4
-rw-r--r--fs/smb/client/smb2ops.c341
-rw-r--r--fs/smb/client/smb2pdu.c55
-rw-r--r--fs/smb/client/smb2pdu.h36
-rw-r--r--fs/smb/client/smb2proto.h15
-rw-r--r--fs/smb/client/smb2transport.c2
-rw-r--r--fs/smb/client/trace.h38
-rw-r--r--fs/smb/common/smb2pdu.h59
-rw-r--r--fs/smb/common/smbfsctl.h6
-rw-r--r--fs/smb/server/glob.h2
-rw-r--r--fs/smb/server/ksmbd_netlink.h4
-rw-r--r--fs/smb/server/mgmt/share_config.c7
-rw-r--r--fs/smb/server/mgmt/user_session.c28
-rw-r--r--fs/smb/server/mgmt/user_session.h3
-rw-r--r--fs/smb/server/oplock.c96
-rw-r--r--fs/smb/server/oplock.h7
-rw-r--r--fs/smb/server/server.c1
-rw-r--r--fs/smb/server/smb2misc.c26
-rw-r--r--fs/smb/server/smb2ops.c16
-rw-r--r--fs/smb/server/smb2pdu.c556
-rw-r--r--fs/smb/server/smb2pdu.h15
-rw-r--r--fs/smb/server/smb_common.c11
-rw-r--r--fs/smb/server/transport_ipc.c37
-rw-r--r--fs/smb/server/vfs.c26
-rw-r--r--fs/smb/server/vfs_cache.c137
-rw-r--r--fs/smb/server/vfs_cache.h9
-rw-r--r--fs/super.c18
-rw-r--r--fs/sysfs/group.c55
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/sysv/itree.c10
-rw-r--r--fs/tracefs/event_inode.c84
-rw-r--r--fs/tracefs/inode.c1
-rw-r--r--fs/tracefs/internal.h2
-rw-r--r--fs/ubifs/debug.c9
-rw-r--r--fs/ubifs/dir.c3
-rw-r--r--fs/ubifs/file.c443
-rw-r--r--fs/ubifs/find.c32
-rw-r--r--fs/ubifs/journal.c171
-rw-r--r--fs/ubifs/lprops.c6
-rw-r--r--fs/ubifs/lpt_commit.c1
-rw-r--r--fs/ubifs/super.c9
-rw-r--r--fs/ubifs/tnc.c9
-rw-r--r--fs/ubifs/tnc_misc.c22
-rw-r--r--fs/ubifs/ubifs.h5
-rw-r--r--fs/udf/dir.c2
-rw-r--r--fs/udf/inode.c2
-rw-r--r--fs/udf/namei.c21
-rw-r--r--fs/udf/super.c556
-rw-r--r--fs/udf/udf_sb.h1
-rw-r--r--fs/ufs/super.c3
-rw-r--r--fs/userfaultfd.c86
-rw-r--r--fs/vboxsf/file.c1
-rw-r--r--fs/vboxsf/super.c12
-rw-r--r--fs/vboxsf/utils.c3
-rw-r--r--fs/verity/fsverity_private.h1
-rw-r--r--fs/verity/measure.c4
-rw-r--r--fs/verity/open.c1
-rw-r--r--fs/verity/verify.c48
-rw-r--r--fs/xattr.c9
-rw-r--r--fs/xfs/Kconfig13
-rw-r--r--fs/xfs/Makefile15
-rw-r--r--fs/xfs/kmem.c30
-rw-r--r--fs/xfs/kmem.h83
-rw-r--r--fs/xfs/libxfs/xfs_ag.c79
-rw-r--r--fs/xfs/libxfs/xfs_ag.h18
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c258
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c191
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h10
-rw-r--r--fs/xfs/libxfs/xfs_attr.c5
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c22
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c37
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c365
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h19
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c152
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h5
-rw-r--r--fs/xfs/libxfs/xfs_btree.c1078
-rw-r--r--fs/xfs/libxfs/xfs_btree.h274
-rw-r--r--fs/xfs/libxfs/xfs_btree_mem.c347
-rw-r--r--fs/xfs/libxfs/xfs_btree_mem.h75
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.c133
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.h10
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c59
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h11
-rw-r--r--fs/xfs/libxfs/xfs_defer.c25
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c59
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h13
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c8
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c3
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c3
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c7
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c16
-rw-r--r--fs/xfs/libxfs/xfs_format.h21
-rw-r--r--fs/xfs/libxfs/xfs_fs.h8
-rw-r--r--fs/xfs/libxfs/xfs_health.h95
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c232
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c155
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h11
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c26
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c12
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c49
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h1
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h4
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c69
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c78
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c284
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h31
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c231
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h8
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c11
-rw-r--r--fs/xfs/libxfs/xfs_sb.c42
-rw-r--r--fs/xfs/libxfs/xfs_sb.h5
-rw-r--r--fs/xfs/libxfs/xfs_shared.h67
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c155
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.h26
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c6
-rw-r--r--fs/xfs/libxfs/xfs_types.h26
-rw-r--r--fs/xfs/mrlock.h78
-rw-r--r--fs/xfs/scrub/agb_bitmap.h5
-rw-r--r--fs/xfs/scrub/agheader.c12
-rw-r--r--fs/xfs/scrub/agheader_repair.c47
-rw-r--r--fs/xfs/scrub/alloc_repair.c27
-rw-r--r--fs/xfs/scrub/bitmap.c14
-rw-r--r--fs/xfs/scrub/bitmap.h2
-rw-r--r--fs/xfs/scrub/bmap.c2
-rw-r--r--fs/xfs/scrub/bmap_repair.c8
-rw-r--r--fs/xfs/scrub/btree.c58
-rw-r--r--fs/xfs/scrub/common.c133
-rw-r--r--fs/xfs/scrub/common.h13
-rw-r--r--fs/xfs/scrub/cow_repair.c2
-rw-r--r--fs/xfs/scrub/dir.c4
-rw-r--r--fs/xfs/scrub/fscounters.c29
-rw-r--r--fs/xfs/scrub/fscounters.h20
-rw-r--r--fs/xfs/scrub/fscounters_repair.c72
-rw-r--r--fs/xfs/scrub/health.c140
-rw-r--r--fs/xfs/scrub/health.h5
-rw-r--r--fs/xfs/scrub/ialloc.c20
-rw-r--r--fs/xfs/scrub/ialloc_repair.c10
-rw-r--r--fs/xfs/scrub/inode_repair.c237
-rw-r--r--fs/xfs/scrub/iscan.c767
-rw-r--r--fs/xfs/scrub/iscan.h84
-rw-r--r--fs/xfs/scrub/newbt.c14
-rw-r--r--fs/xfs/scrub/newbt.h7
-rw-r--r--fs/xfs/scrub/nlinks.c930
-rw-r--r--fs/xfs/scrub/nlinks.h102
-rw-r--r--fs/xfs/scrub/nlinks_repair.c223
-rw-r--r--fs/xfs/scrub/quotacheck.c867
-rw-r--r--fs/xfs/scrub/quotacheck.h76
-rw-r--r--fs/xfs/scrub/quotacheck_repair.c261
-rw-r--r--fs/xfs/scrub/rcbag.c307
-rw-r--r--fs/xfs/scrub/rcbag.h28
-rw-r--r--fs/xfs/scrub/rcbag_btree.c370
-rw-r--r--fs/xfs/scrub/rcbag_btree.h81
-rw-r--r--fs/xfs/scrub/readdir.c4
-rw-r--r--fs/xfs/scrub/reap.c2
-rw-r--r--fs/xfs/scrub/refcount.c12
-rw-r--r--fs/xfs/scrub/refcount_repair.c177
-rw-r--r--fs/xfs/scrub/repair.c120
-rw-r--r--fs/xfs/scrub/repair.h23
-rw-r--r--fs/xfs/scrub/rmap.c26
-rw-r--r--fs/xfs/scrub/rmap_repair.c1697
-rw-r--r--fs/xfs/scrub/rtsummary.c6
-rw-r--r--fs/xfs/scrub/scrub.c37
-rw-r--r--fs/xfs/scrub/scrub.h18
-rw-r--r--fs/xfs/scrub/stats.c6
-rw-r--r--fs/xfs/scrub/symlink.c3
-rw-r--r--fs/xfs/scrub/trace.c8
-rw-r--r--fs/xfs/scrub/trace.h637
-rw-r--r--fs/xfs/scrub/xfarray.c234
-rw-r--r--fs/xfs/scrub/xfarray.h30
-rw-r--r--fs/xfs/scrub/xfile.c345
-rw-r--r--fs/xfs/scrub/xfile.h62
-rw-r--r--fs/xfs/xfs_acl.c4
-rw-r--r--fs/xfs/xfs_aops.c16
-rw-r--r--fs/xfs/xfs_attr_inactive.c4
-rw-r--r--fs/xfs/xfs_attr_item.c25
-rw-r--r--fs/xfs/xfs_attr_list.c26
-rw-r--r--fs/xfs/xfs_bmap_item.c119
-rw-r--r--fs/xfs/xfs_bmap_item.h4
-rw-r--r--fs/xfs/xfs_bmap_util.c20
-rw-r--r--fs/xfs/xfs_buf.c330
-rw-r--r--fs/xfs/xfs_buf.h25
-rw-r--r--fs/xfs/xfs_buf_item.c8
-rw-r--r--fs/xfs/xfs_buf_item_recover.c8
-rw-r--r--fs/xfs/xfs_buf_mem.c270
-rw-r--r--fs/xfs/xfs_buf_mem.h34
-rw-r--r--fs/xfs/xfs_dir2_readdir.c8
-rw-r--r--fs/xfs/xfs_discard.c19
-rw-r--r--fs/xfs/xfs_dquot.c54
-rw-r--r--fs/xfs/xfs_error.c8
-rw-r--r--fs/xfs/xfs_extent_busy.c5
-rw-r--r--fs/xfs/xfs_extfree_item.c8
-rw-r--r--fs/xfs/xfs_file.c4
-rw-r--r--fs/xfs/xfs_filestream.c6
-rw-r--r--fs/xfs/xfs_fsmap.c4
-rw-r--r--fs/xfs/xfs_health.c202
-rw-r--r--fs/xfs/xfs_hooks.c52
-rw-r--r--fs/xfs/xfs_hooks.h65
-rw-r--r--fs/xfs/xfs_icache.c22
-rw-r--r--fs/xfs/xfs_icreate_item.c2
-rw-r--r--fs/xfs/xfs_inode.c289
-rw-r--r--fs/xfs/xfs_inode.h37
-rw-r--r--fs/xfs/xfs_inode_item.c6
-rw-r--r--fs/xfs/xfs_inode_item_recover.c5
-rw-r--r--fs/xfs/xfs_ioctl.c8
-rw-r--r--fs/xfs/xfs_iomap.c19
-rw-r--r--fs/xfs/xfs_iops.c9
-rw-r--r--fs/xfs/xfs_itable.c12
-rw-r--r--fs/xfs/xfs_iwalk.c41
-rw-r--r--fs/xfs/xfs_linux.h17
-rw-r--r--fs/xfs/xfs_log.c34
-rw-r--r--fs/xfs/xfs_log_cil.c31
-rw-r--r--fs/xfs/xfs_log_recover.c102
-rw-r--r--fs/xfs/xfs_mount.c6
-rw-r--r--fs/xfs/xfs_mount.h12
-rw-r--r--fs/xfs/xfs_mru_cache.c17
-rw-r--r--fs/xfs/xfs_qm.c59
-rw-r--r--fs/xfs/xfs_qm.h16
-rw-r--r--fs/xfs/xfs_qm_bhv.c1
-rw-r--r--fs/xfs/xfs_quota.h46
-rw-r--r--fs/xfs/xfs_refcount_item.c12
-rw-r--r--fs/xfs/xfs_reflink.c16
-rw-r--r--fs/xfs/xfs_rmap_item.c11
-rw-r--r--fs/xfs/xfs_rtalloc.c18
-rw-r--r--fs/xfs/xfs_stats.c4
-rw-r--r--fs/xfs/xfs_stats.h2
-rw-r--r--fs/xfs/xfs_super.c71
-rw-r--r--fs/xfs/xfs_symlink.c158
-rw-r--r--fs/xfs/xfs_symlink.h1
-rw-r--r--fs/xfs/xfs_sysfs.c4
-rw-r--r--fs/xfs/xfs_trace.c3
-rw-r--r--fs/xfs/xfs_trace.h612
-rw-r--r--fs/xfs/xfs_trans.c2
-rw-r--r--fs/xfs/xfs_trans.h10
-rw-r--r--fs/xfs/xfs_trans_ail.c7
-rw-r--r--fs/xfs/xfs_trans_buf.c42
-rw-r--r--fs/xfs/xfs_trans_dquot.c171
-rw-r--r--fs/zonefs/file.c3
-rw-r--r--fs/zonefs/super.c173
735 files changed, 33688 insertions, 43728 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 61dbe52bb3a3..281a1ed03a04 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -637,7 +637,7 @@ static int v9fs_init_inode_cache(void)
v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
sizeof(struct v9fs_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
v9fs_inode_init_once);
if (!v9fs_inode_cache)
return -ENOMEM;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 698c43dd5dc8..9defa12208f9 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -179,16 +179,13 @@ extern int v9fs_vfs_rename(struct mnt_idmap *idmap,
struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags);
-extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
- struct p9_fid *fid,
- struct super_block *sb, int new);
+extern struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid);
extern const struct inode_operations v9fs_dir_inode_operations_dotl;
extern const struct inode_operations v9fs_file_inode_operations_dotl;
extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
extern const struct netfs_request_ops v9fs_req_ops;
-extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
- struct p9_fid *fid,
- struct super_block *sb, int new);
+extern struct inode *v9fs_fid_iget_dotl(struct super_block *sb,
+ struct p9_fid *fid);
/* other default globals */
#define V9FS_PORT 564
@@ -230,27 +227,9 @@ v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
struct super_block *sb)
{
if (v9fs_proto_dotl(v9ses))
- return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0);
+ return v9fs_fid_iget_dotl(sb, fid);
else
- return v9fs_inode_from_fid(v9ses, fid, sb, 0);
-}
-
-/**
- * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by
- * issuing a attribute request
- * @v9ses: session information
- * @fid: fid to issue attribute request for
- * @sb: superblock on which to create inode
- *
- */
-static inline struct inode *
-v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
- struct super_block *sb)
-{
- if (v9fs_proto_dotl(v9ses))
- return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1);
- else
- return v9fs_inode_from_fid(v9ses, fid, sb, 1);
+ return v9fs_fid_iget(sb, fid);
}
#endif
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 0e8418066a48..7923c3c347cb 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,13 +40,16 @@ extern struct kmem_cache *v9fs_inode_cache;
struct inode *v9fs_alloc_inode(struct super_block *sb);
void v9fs_free_inode(struct inode *inode);
-struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode,
- dev_t rdev);
void v9fs_set_netfs_context(struct inode *inode);
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, umode_t mode, dev_t rdev);
+ struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev);
void v9fs_evict_inode(struct inode *inode);
-ino_t v9fs_qid2ino(struct p9_qid *qid);
+#if (BITS_PER_LONG == 32)
+#define QID2INO(q) ((ino_t) (((q)->path+2) ^ (((q)->path) >> 32)))
+#else
+#define QID2INO(q) ((ino_t) ((q)->path+2))
+#endif
+
void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
struct super_block *sb, unsigned int flags);
void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 4102759a5cb5..e0d34e4e9076 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -127,7 +127,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
}
over = !dir_emit(ctx, st.name, strlen(st.name),
- v9fs_qid2ino(&st.qid), dt_type(&st));
+ QID2INO(&st.qid), dt_type(&st));
p9stat_free(&st);
if (over)
return 0;
@@ -184,7 +184,7 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
if (!dir_emit(ctx, curdirent.d_name,
strlen(curdirent.d_name),
- v9fs_qid2ino(&curdirent.qid),
+ QID2INO(&curdirent.qid),
curdirent.d_type))
return 0;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index bae330c2f0cf..abdbbaee5184 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -107,7 +107,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
- if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+ if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) {
filemap_write_and_wait(inode->i_mapping);
invalidate_mapping_pages(&inode->i_data, 0, -1);
}
@@ -121,13 +121,12 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
struct p9_fid *fid;
uint8_t status = P9_LOCK_ERROR;
int res = 0;
- unsigned char fl_type;
struct v9fs_session_info *v9ses;
fid = filp->private_data;
BUG_ON(fid == NULL);
- BUG_ON((fl->fl_flags & FL_POSIX) != FL_POSIX);
+ BUG_ON((fl->c.flc_flags & FL_POSIX) != FL_POSIX);
res = locks_lock_file_wait(filp, fl);
if (res < 0)
@@ -136,7 +135,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
/* convert posix lock to p9 tlock args */
memset(&flock, 0, sizeof(flock));
/* map the lock type */
- switch (fl->fl_type) {
+ switch (fl->c.flc_type) {
case F_RDLCK:
flock.type = P9_LOCK_TYPE_RDLCK;
break;
@@ -152,7 +151,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
flock.length = 0;
else
flock.length = fl->fl_end - fl->fl_start + 1;
- flock.proc_id = fl->fl_pid;
+ flock.proc_id = fl->c.flc_pid;
flock.client_id = fid->clnt->name;
if (IS_SETLKW(cmd))
flock.flags = P9_LOCK_FLAGS_BLOCK;
@@ -207,12 +206,13 @@ out_unlock:
* incase server returned error for lock request, revert
* it locally
*/
- if (res < 0 && fl->fl_type != F_UNLCK) {
- fl_type = fl->fl_type;
- fl->fl_type = F_UNLCK;
+ if (res < 0 && fl->c.flc_type != F_UNLCK) {
+ unsigned char type = fl->c.flc_type;
+
+ fl->c.flc_type = F_UNLCK;
/* Even if this fails we want to return the remote error */
locks_lock_file_wait(filp, fl);
- fl->fl_type = fl_type;
+ fl->c.flc_type = type;
}
if (flock.client_id != fid->clnt->name)
kfree(flock.client_id);
@@ -234,7 +234,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
* if we have a conflicting lock locally, no need to validate
* with server
*/
- if (fl->fl_type != F_UNLCK)
+ if (fl->c.flc_type != F_UNLCK)
return res;
/* convert posix lock to p9 tgetlock args */
@@ -245,7 +245,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
glock.length = 0;
else
glock.length = fl->fl_end - fl->fl_start + 1;
- glock.proc_id = fl->fl_pid;
+ glock.proc_id = fl->c.flc_pid;
glock.client_id = fid->clnt->name;
res = p9_client_getlock_dotl(fid, &glock);
@@ -254,13 +254,13 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
/* map 9p lock type to os lock type */
switch (glock.type) {
case P9_LOCK_TYPE_RDLCK:
- fl->fl_type = F_RDLCK;
+ fl->c.flc_type = F_RDLCK;
break;
case P9_LOCK_TYPE_WRLCK:
- fl->fl_type = F_WRLCK;
+ fl->c.flc_type = F_WRLCK;
break;
case P9_LOCK_TYPE_UNLCK:
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
break;
}
if (glock.type != P9_LOCK_TYPE_UNLCK) {
@@ -269,7 +269,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
fl->fl_end = OFFSET_MAX;
else
fl->fl_end = glock.start + glock.length - 1;
- fl->fl_pid = -glock.proc_id;
+ fl->c.flc_pid = -glock.proc_id;
}
out:
if (glock.client_id != fid->clnt->name)
@@ -293,7 +293,7 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
filp, cmd, fl, filp);
- if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+ if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) {
filemap_write_and_wait(inode->i_mapping);
invalidate_mapping_pages(&inode->i_data, 0, -1);
}
@@ -324,16 +324,16 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
filp, cmd, fl, filp);
- if (!(fl->fl_flags & FL_FLOCK))
+ if (!(fl->c.flc_flags & FL_FLOCK))
goto out_err;
- if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+ if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) {
filemap_write_and_wait(inode->i_mapping);
invalidate_mapping_pages(&inode->i_data, 0, -1);
}
/* Convert flock to posix lock */
- fl->fl_flags |= FL_POSIX;
- fl->fl_flags ^= FL_FLOCK;
+ fl->c.flc_flags |= FL_POSIX;
+ fl->c.flc_flags ^= FL_FLOCK;
if (IS_SETLK(cmd) | IS_SETLKW(cmd))
ret = v9fs_file_do_lock(filp, cmd, fl);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 32572982f72e..b01b1bbf2493 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -253,9 +253,12 @@ void v9fs_set_netfs_context(struct inode *inode)
}
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, umode_t mode, dev_t rdev)
+ struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev)
{
int err = 0;
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+
+ memcpy(&v9inode->qid, qid, sizeof(struct p9_qid));
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
inode->i_blocks = 0;
@@ -332,36 +335,6 @@ error:
}
/**
- * v9fs_get_inode - helper function to setup an inode
- * @sb: superblock
- * @mode: mode to setup inode with
- * @rdev: The device numbers to set
- */
-
-struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
-{
- int err;
- struct inode *inode;
- struct v9fs_session_info *v9ses = sb->s_fs_info;
-
- p9_debug(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode);
-
- inode = new_inode(sb);
- if (!inode) {
- pr_warn("%s (%d): Problem allocating inode\n",
- __func__, task_pid_nr(current));
- return ERR_PTR(-ENOMEM);
- }
- err = v9fs_init_inode(v9ses, inode, mode, rdev);
- if (err) {
- iput(inode);
- return ERR_PTR(err);
- }
- v9fs_set_netfs_context(inode);
- return inode;
-}
-
-/**
* v9fs_evict_inode - Remove an inode from the inode cache
* @inode: inode to release
*
@@ -371,95 +344,57 @@ void v9fs_evict_inode(struct inode *inode)
struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode);
__le32 __maybe_unused version;
- truncate_inode_pages_final(&inode->i_data);
+ if (!is_bad_inode(inode)) {
+ truncate_inode_pages_final(&inode->i_data);
- version = cpu_to_le32(v9inode->qid.version);
- netfs_clear_inode_writeback(inode, &version);
+ version = cpu_to_le32(v9inode->qid.version);
+ netfs_clear_inode_writeback(inode, &version);
- clear_inode(inode);
- filemap_fdatawrite(&inode->i_data);
+ clear_inode(inode);
+ filemap_fdatawrite(&inode->i_data);
#ifdef CONFIG_9P_FSCACHE
- fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false);
+ if (v9fs_inode_cookie(v9inode))
+ fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false);
#endif
+ } else
+ clear_inode(inode);
}
-static int v9fs_test_inode(struct inode *inode, void *data)
-{
- int umode;
- dev_t rdev;
- struct v9fs_inode *v9inode = V9FS_I(inode);
- struct p9_wstat *st = (struct p9_wstat *)data;
- struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
-
- umode = p9mode2unixmode(v9ses, st, &rdev);
- /* don't match inode of different type */
- if (inode_wrong_type(inode, umode))
- return 0;
-
- /* compare qid details */
- if (memcmp(&v9inode->qid.version,
- &st->qid.version, sizeof(v9inode->qid.version)))
- return 0;
-
- if (v9inode->qid.type != st->qid.type)
- return 0;
-
- if (v9inode->qid.path != st->qid.path)
- return 0;
- return 1;
-}
-
-static int v9fs_test_new_inode(struct inode *inode, void *data)
-{
- return 0;
-}
-
-static int v9fs_set_inode(struct inode *inode, void *data)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
- struct p9_wstat *st = (struct p9_wstat *)data;
-
- memcpy(&v9inode->qid, &st->qid, sizeof(st->qid));
- return 0;
-}
-
-static struct inode *v9fs_qid_iget(struct super_block *sb,
- struct p9_qid *qid,
- struct p9_wstat *st,
- int new)
+struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid)
{
dev_t rdev;
int retval;
umode_t umode;
- unsigned long i_ino;
struct inode *inode;
+ struct p9_wstat *st;
struct v9fs_session_info *v9ses = sb->s_fs_info;
- int (*test)(struct inode *inode, void *data);
- if (new)
- test = v9fs_test_new_inode;
- else
- test = v9fs_test_inode;
-
- i_ino = v9fs_qid2ino(qid);
- inode = iget5_locked(sb, i_ino, test, v9fs_set_inode, st);
- if (!inode)
+ inode = iget_locked(sb, QID2INO(&fid->qid));
+ if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
return inode;
+
/*
* initialize the inode with the stat info
* FIXME!! we may need support for stale inodes
* later.
*/
- inode->i_ino = i_ino;
+ st = p9_client_stat(fid);
+ if (IS_ERR(st)) {
+ retval = PTR_ERR(st);
+ goto error;
+ }
+
umode = p9mode2unixmode(v9ses, st, &rdev);
- retval = v9fs_init_inode(v9ses, inode, umode, rdev);
+ retval = v9fs_init_inode(v9ses, inode, &fid->qid, umode, rdev);
+ v9fs_stat2inode(st, inode, sb, 0);
+ p9stat_free(st);
+ kfree(st);
if (retval)
goto error;
- v9fs_stat2inode(st, inode, sb, 0);
v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
@@ -470,23 +405,6 @@ error:
}
-struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
- struct super_block *sb, int new)
-{
- struct p9_wstat *st;
- struct inode *inode = NULL;
-
- st = p9_client_stat(fid);
- if (IS_ERR(st))
- return ERR_CAST(st);
-
- inode = v9fs_qid_iget(sb, &st->qid, st, new);
- p9stat_free(st);
- kfree(st);
- return inode;
-}
-
/**
* v9fs_at_to_dotl_flags- convert Linux specific AT flags to
* plan 9 AT flag.
@@ -633,7 +551,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
/*
* instantiate inode and assign the unopened fid to the dentry
*/
- inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+ inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS,
@@ -761,10 +679,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
inode = NULL;
else if (IS_ERR(fid))
inode = ERR_CAST(fid);
- else if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
- inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
else
- inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+ inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
/*
* If we had a rename on the server and a parallel lookup
* for the new name, then make sure we instantiate with
@@ -1187,26 +1103,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
}
/**
- * v9fs_qid2ino - convert qid into inode number
- * @qid: qid to hash
- *
- * BUG: potential for inode number collisions?
- */
-
-ino_t v9fs_qid2ino(struct p9_qid *qid)
-{
- u64 path = qid->path + 2;
- ino_t i = 0;
-
- if (sizeof(ino_t) == sizeof(path))
- memcpy(&i, &path, sizeof(ino_t));
- else
- i = (ino_t) (path ^ (path >> 32));
-
- return i;
-}
-
-/**
* v9fs_vfs_get_link - follow a symlink path
* @dentry: dentry for symlink
* @inode: inode for symlink
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 3505227e1704..55dde186041a 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -52,82 +52,37 @@ static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
return current_fsgid();
}
-static int v9fs_test_inode_dotl(struct inode *inode, void *data)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
- struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
-
- /* don't match inode of different type */
- if (inode_wrong_type(inode, st->st_mode))
- return 0;
-
- if (inode->i_generation != st->st_gen)
- return 0;
-
- /* compare qid details */
- if (memcmp(&v9inode->qid.version,
- &st->qid.version, sizeof(v9inode->qid.version)))
- return 0;
-
- if (v9inode->qid.type != st->qid.type)
- return 0;
-
- if (v9inode->qid.path != st->qid.path)
- return 0;
- return 1;
-}
-
-/* Always get a new inode */
-static int v9fs_test_new_inode_dotl(struct inode *inode, void *data)
-{
- return 0;
-}
-
-static int v9fs_set_inode_dotl(struct inode *inode, void *data)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
- struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
-
- memcpy(&v9inode->qid, &st->qid, sizeof(st->qid));
- inode->i_generation = st->st_gen;
- return 0;
-}
-
-static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
- struct p9_qid *qid,
- struct p9_fid *fid,
- struct p9_stat_dotl *st,
- int new)
+struct inode *v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid)
{
int retval;
- unsigned long i_ino;
struct inode *inode;
+ struct p9_stat_dotl *st;
struct v9fs_session_info *v9ses = sb->s_fs_info;
- int (*test)(struct inode *inode, void *data);
-
- if (new)
- test = v9fs_test_new_inode_dotl;
- else
- test = v9fs_test_inode_dotl;
- i_ino = v9fs_qid2ino(qid);
- inode = iget5_locked(sb, i_ino, test, v9fs_set_inode_dotl, st);
- if (!inode)
+ inode = iget_locked(sb, QID2INO(&fid->qid));
+ if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
return inode;
+
/*
* initialize the inode with the stat info
* FIXME!! we may need support for stale inodes
* later.
*/
- inode->i_ino = i_ino;
- retval = v9fs_init_inode(v9ses, inode,
+ st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
+ if (IS_ERR(st)) {
+ retval = PTR_ERR(st);
+ goto error;
+ }
+
+ retval = v9fs_init_inode(v9ses, inode, &fid->qid,
st->st_mode, new_decode_dev(st->st_rdev));
+ v9fs_stat2inode_dotl(st, inode, 0);
+ kfree(st);
if (retval)
goto error;
- v9fs_stat2inode_dotl(st, inode, 0);
v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
retval = v9fs_get_acl(inode, fid);
@@ -135,6 +90,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
goto error;
unlock_new_inode(inode);
+
return inode;
error:
iget_failed(inode);
@@ -142,22 +98,6 @@ error:
}
-struct inode *
-v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
- struct super_block *sb, int new)
-{
- struct p9_stat_dotl *st;
- struct inode *inode = NULL;
-
- st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
- if (IS_ERR(st))
- return ERR_CAST(st);
-
- inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new);
- kfree(st);
- return inode;
-}
-
struct dotl_openflag_map {
int open_flag;
int dotl_flag;
@@ -307,7 +247,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
goto out;
}
- inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+ inode = v9fs_fid_iget_dotl(dir->i_sb, fid);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -357,7 +297,6 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
umode_t omode)
{
int err;
- struct v9fs_session_info *v9ses;
struct p9_fid *fid = NULL, *dfid = NULL;
kgid_t gid;
const unsigned char *name;
@@ -367,7 +306,6 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
struct posix_acl *dacl = NULL, *pacl = NULL;
p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
- v9ses = v9fs_inode2v9ses(dir);
omode |= S_IFDIR;
if (dir->i_mode & S_ISGID)
@@ -402,32 +340,17 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
}
/* instantiate inode and assign the unopened fid to the dentry */
- if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
- inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
- goto error;
- }
- v9fs_fid_add(dentry, &fid);
- v9fs_set_create_acl(inode, fid, dacl, pacl);
- d_instantiate(dentry, inode);
- err = 0;
- } else {
- /*
- * Not in cached mode. No need to populate
- * inode with stat. We need to get an inode
- * so that we can set the acl with dentry
- */
- inode = v9fs_get_inode(dir->i_sb, mode, 0);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- goto error;
- }
- v9fs_set_create_acl(inode, fid, dacl, pacl);
- d_instantiate(dentry, inode);
+ inode = v9fs_fid_iget_dotl(dir->i_sb, fid);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
+ goto error;
}
+ v9fs_fid_add(dentry, &fid);
+ v9fs_set_create_acl(inode, fid, dacl, pacl);
+ d_instantiate(dentry, inode);
+ err = 0;
inc_nlink(dir);
v9fs_invalidate_inode_attr(dir);
error:
@@ -709,14 +632,11 @@ v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir,
kgid_t gid;
const unsigned char *name;
struct p9_qid qid;
- struct inode *inode;
struct p9_fid *dfid;
struct p9_fid *fid = NULL;
- struct v9fs_session_info *v9ses;
name = dentry->d_name.name;
p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname);
- v9ses = v9fs_inode2v9ses(dir);
dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
@@ -736,36 +656,6 @@ v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir,
}
v9fs_invalidate_inode_attr(dir);
- if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
- /* Now walk from the parent so we can get an unopened fid. */
- fid = p9_client_walk(dfid, 1, &name, 1);
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
- err);
- goto error;
- }
-
- /* instantiate inode and assign the unopened fid to dentry */
- inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
- goto error;
- }
- v9fs_fid_add(dentry, &fid);
- d_instantiate(dentry, inode);
- err = 0;
- } else {
- /* Not in cached mode. No need to populate inode with stat */
- inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- goto error;
- }
- d_instantiate(dentry, inode);
- }
error:
p9_fid_put(fid);
@@ -847,7 +737,6 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
kgid_t gid;
const unsigned char *name;
umode_t mode;
- struct v9fs_session_info *v9ses;
struct p9_fid *fid = NULL, *dfid = NULL;
struct inode *inode;
struct p9_qid qid;
@@ -857,7 +746,6 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
dir->i_ino, dentry, omode,
MAJOR(rdev), MINOR(rdev));
- v9ses = v9fs_inode2v9ses(dir);
dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
@@ -888,33 +776,17 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
err);
goto error;
}
-
- /* instantiate inode and assign the unopened fid to the dentry */
- if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
- inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
- goto error;
- }
- v9fs_set_create_acl(inode, fid, dacl, pacl);
- v9fs_fid_add(dentry, &fid);
- d_instantiate(dentry, inode);
- err = 0;
- } else {
- /*
- * Not in cached mode. No need to populate inode with stat.
- * socket syscall returns a fd, so we need instantiate
- */
- inode = v9fs_get_inode(dir->i_sb, mode, rdev);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- goto error;
- }
- v9fs_set_create_acl(inode, fid, dacl, pacl);
- d_instantiate(dentry, inode);
+ inode = v9fs_fid_iget_dotl(dir->i_sb, fid);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
+ goto error;
}
+ v9fs_set_create_acl(inode, fid, dacl, pacl);
+ v9fs_fid_add(dentry, &fid);
+ d_instantiate(dentry, inode);
+ err = 0;
error:
p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 941f7d0e0bfa..4236058c7bbd 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -110,7 +110,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
struct inode *inode = NULL;
struct dentry *root = NULL;
struct v9fs_session_info *v9ses = NULL;
- umode_t mode = 0777 | S_ISVTX;
struct p9_fid *fid;
int retval = 0;
@@ -140,7 +139,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
else
sb->s_d_op = &v9fs_dentry_operations;
- inode = v9fs_get_inode(sb, S_IFDIR | mode, 0);
+ inode = v9fs_get_inode_from_fid(v9ses, fid, sb);
if (IS_ERR(inode)) {
retval = PTR_ERR(inode);
goto release_sb;
@@ -152,32 +151,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
goto release_sb;
}
sb->s_root = root;
- if (v9fs_proto_dotl(v9ses)) {
- struct p9_stat_dotl *st = NULL;
-
- st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
- if (IS_ERR(st)) {
- retval = PTR_ERR(st);
- goto release_sb;
- }
- d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode_dotl(st, d_inode(root), 0);
- kfree(st);
- } else {
- struct p9_wstat *st = NULL;
-
- st = p9_client_stat(fid);
- if (IS_ERR(st)) {
- retval = PTR_ERR(st);
- goto release_sb;
- }
-
- d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode(st, d_inode(root), sb, 0);
-
- p9stat_free(st);
- kfree(st);
- }
retval = v9fs_get_acl(inode, fid);
if (retval)
goto release_sb;
@@ -271,21 +244,6 @@ done:
return res;
}
-static int v9fs_drop_inode(struct inode *inode)
-{
- struct v9fs_session_info *v9ses;
-
- v9ses = v9fs_inode2v9ses(inode);
- if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
- return generic_drop_inode(inode);
- /*
- * in case of non cached mode always drop the
- * inode because we want the inode attribute
- * to always match that on the server.
- */
- return 1;
-}
-
static int v9fs_write_inode(struct inode *inode,
struct writeback_control *wbc)
{
@@ -320,7 +278,6 @@ static const struct super_operations v9fs_super_ops_dotl = {
.alloc_inode = v9fs_alloc_inode,
.free_inode = v9fs_free_inode,
.statfs = v9fs_statfs,
- .drop_inode = v9fs_drop_inode,
.evict_inode = v9fs_evict_inode,
.show_options = v9fs_show_options,
.umount_begin = v9fs_umount_begin,
diff --git a/fs/Kconfig b/fs/Kconfig
index 89fdbefd1075..a46b0cbc4d8f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -60,7 +60,6 @@ endif # BLOCK
config FS_DAX
bool "File system based Direct Access (DAX) support"
depends on MMU
- depends on !(ARM || MIPS || SPARC)
depends on ZONE_DEVICE || FS_DAX_LIMITED
select FS_IOMAP
select DAX
@@ -162,7 +161,6 @@ menu "DOS/FAT/EXFAT/NT Filesystems"
source "fs/fat/Kconfig"
source "fs/exfat/Kconfig"
-source "fs/ntfs/Kconfig"
source "fs/ntfs3/Kconfig"
endmenu
@@ -262,6 +260,7 @@ menuconfig HUGETLBFS
depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
depends on (SYSFS || SYSCTL)
select MEMFD_CREATE
+ select PADATA if SMP
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
diff --git a/fs/Makefile b/fs/Makefile
index c09016257f05..6ecc9b0a53f2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -15,7 +15,7 @@ obj-y := open.o read_write.o file_table.o super.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
- kernel_read_file.o mnt_idmapping.o remap_range.o
+ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o
obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o
@@ -91,7 +91,6 @@ obj-y += unicode/
obj-$(CONFIG_SYSV_FS) += sysv/
obj-$(CONFIG_SMBFS) += smb/
obj-$(CONFIG_HPFS_FS) += hpfs/
-obj-$(CONFIG_NTFS_FS) += ntfs/
obj-$(CONFIG_NTFS3_FS) += ntfs3/
obj-$(CONFIG_UFS_FS) += ufs/
obj-$(CONFIG_EFS_FS) += efs/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index e8bfc38239cd..9354b14bbfe3 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -249,7 +249,7 @@ static int __init init_inodecache(void)
adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
sizeof(struct adfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
init_once);
if (adfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index b56a95cf414a..3c5821339609 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -130,8 +130,7 @@ static int __init init_inodecache(void)
{
affs_inode_cachep = kmem_cache_create("affs_inode_cache",
sizeof(struct affs_inode_info),
- 0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ 0, (SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT),
init_once);
if (affs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 8a67fc427e74..67afe68972d5 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -474,16 +474,6 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
continue;
}
- /* Don't expose silly rename entries to userspace. */
- if (nlen > 6 &&
- dire->u.name[0] == '.' &&
- ctx->actor != afs_lookup_filldir &&
- ctx->actor != afs_lookup_one_filldir &&
- memcmp(dire->u.name, ".__afs", 6) == 0) {
- ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
- continue;
- }
-
/* found the next entry */
if (!dir_emit(ctx, dire->u.name, nlen,
ntohl(dire->u.vnode),
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 9c6dea3139f5..f0e96a35093f 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -93,13 +93,13 @@ static void afs_grant_locks(struct afs_vnode *vnode)
bool exclusive = (vnode->lock_type == AFS_LOCK_WRITE);
list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) {
- if (!exclusive && p->fl_type == F_WRLCK)
+ if (!exclusive && lock_is_write(p))
continue;
list_move_tail(&p->fl_u.afs.link, &vnode->granted_locks);
p->fl_u.afs.state = AFS_LOCK_GRANTED;
trace_afs_flock_op(vnode, p, afs_flock_op_grant);
- wake_up(&p->fl_wait);
+ locks_wake_up(p);
}
}
@@ -112,25 +112,24 @@ static void afs_next_locker(struct afs_vnode *vnode, int error)
{
struct file_lock *p, *_p, *next = NULL;
struct key *key = vnode->lock_key;
- unsigned int fl_type = F_RDLCK;
+ unsigned int type = F_RDLCK;
_enter("");
if (vnode->lock_type == AFS_LOCK_WRITE)
- fl_type = F_WRLCK;
+ type = F_WRLCK;
list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) {
if (error &&
- p->fl_type == fl_type &&
- afs_file_key(p->fl_file) == key) {
+ p->c.flc_type == type &&
+ afs_file_key(p->c.flc_file) == key) {
list_del_init(&p->fl_u.afs.link);
p->fl_u.afs.state = error;
- wake_up(&p->fl_wait);
+ locks_wake_up(p);
}
/* Select the next locker to hand off to. */
- if (next &&
- (next->fl_type == F_WRLCK || p->fl_type == F_RDLCK))
+ if (next && (lock_is_write(next) || lock_is_read(p)))
continue;
next = p;
}
@@ -142,7 +141,7 @@ static void afs_next_locker(struct afs_vnode *vnode, int error)
afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING);
next->fl_u.afs.state = AFS_LOCK_YOUR_TRY;
trace_afs_flock_op(vnode, next, afs_flock_op_wake);
- wake_up(&next->fl_wait);
+ locks_wake_up(next);
} else {
afs_set_lock_state(vnode, AFS_VNODE_LOCK_NONE);
trace_afs_flock_ev(vnode, NULL, afs_flock_no_lockers, 0);
@@ -166,7 +165,7 @@ static void afs_kill_lockers_enoent(struct afs_vnode *vnode)
struct file_lock, fl_u.afs.link);
list_del_init(&p->fl_u.afs.link);
p->fl_u.afs.state = -ENOENT;
- wake_up(&p->fl_wait);
+ locks_wake_up(p);
}
key_put(vnode->lock_key);
@@ -464,14 +463,14 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
_enter("{%llx:%llu},%llu-%llu,%u,%u",
vnode->fid.vid, vnode->fid.vnode,
- fl->fl_start, fl->fl_end, fl->fl_type, mode);
+ fl->fl_start, fl->fl_end, fl->c.flc_type, mode);
fl->fl_ops = &afs_lock_ops;
INIT_LIST_HEAD(&fl->fl_u.afs.link);
fl->fl_u.afs.state = AFS_LOCK_PENDING;
partial = (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX);
- type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
+ type = lock_is_read(fl) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
if (mode == afs_flock_mode_write && partial)
type = AFS_LOCK_WRITE;
@@ -524,7 +523,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
}
if (vnode->lock_state == AFS_VNODE_LOCK_NONE &&
- !(fl->fl_flags & FL_SLEEP)) {
+ !(fl->c.flc_flags & FL_SLEEP)) {
ret = -EAGAIN;
if (type == AFS_LOCK_READ) {
if (vnode->status.lock_count == -1)
@@ -621,7 +620,7 @@ skip_server_lock:
return 0;
lock_is_contended:
- if (!(fl->fl_flags & FL_SLEEP)) {
+ if (!(fl->c.flc_flags & FL_SLEEP)) {
list_del_init(&fl->fl_u.afs.link);
afs_next_locker(vnode, 0);
ret = -EAGAIN;
@@ -641,7 +640,7 @@ need_to_wait:
spin_unlock(&vnode->lock);
trace_afs_flock_ev(vnode, fl, afs_flock_waiting, 0);
- ret = wait_event_interruptible(fl->fl_wait,
+ ret = wait_event_interruptible(fl->c.flc_wait,
fl->fl_u.afs.state != AFS_LOCK_PENDING);
trace_afs_flock_ev(vnode, fl, afs_flock_waited, ret);
@@ -704,7 +703,8 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl)
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
int ret;
- _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+ _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode,
+ fl->c.flc_type);
trace_afs_flock_op(vnode, fl, afs_flock_op_unlock);
@@ -730,11 +730,11 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
if (vnode->lock_state == AFS_VNODE_LOCK_DELETED)
return -ENOENT;
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
/* check local lock records first */
posix_test_lock(file, fl);
- if (fl->fl_type == F_UNLCK) {
+ if (lock_is_unlock(fl)) {
/* no local locks; consult the server */
ret = afs_fetch_status(vnode, key, false, NULL);
if (ret < 0)
@@ -743,18 +743,18 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
lock_count = READ_ONCE(vnode->status.lock_count);
if (lock_count != 0) {
if (lock_count > 0)
- fl->fl_type = F_RDLCK;
+ fl->c.flc_type = F_RDLCK;
else
- fl->fl_type = F_WRLCK;
+ fl->c.flc_type = F_WRLCK;
fl->fl_start = 0;
fl->fl_end = OFFSET_MAX;
- fl->fl_pid = 0;
+ fl->c.flc_pid = 0;
}
}
ret = 0;
error:
- _leave(" = %d [%hd]", ret, fl->fl_type);
+ _leave(" = %d [%hd]", ret, fl->c.flc_type);
return ret;
}
@@ -769,7 +769,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
_enter("{%llx:%llu},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
vnode->fid.vid, vnode->fid.vnode, cmd,
- fl->fl_type, fl->fl_flags,
+ fl->c.flc_type, fl->c.flc_flags,
(long long) fl->fl_start, (long long) fl->fl_end);
if (IS_GETLK(cmd))
@@ -778,7 +778,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id);
trace_afs_flock_op(vnode, fl, afs_flock_op_lock);
- if (fl->fl_type == F_UNLCK)
+ if (lock_is_unlock(fl))
ret = afs_do_unlk(file, fl);
else
ret = afs_do_setlk(file, fl);
@@ -804,7 +804,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
_enter("{%llx:%llu},%d,{t=%x,fl=%x}",
vnode->fid.vid, vnode->fid.vnode, cmd,
- fl->fl_type, fl->fl_flags);
+ fl->c.flc_type, fl->c.flc_flags);
/*
* No BSD flocks over NFS allowed.
@@ -813,14 +813,14 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
* Not sure whether that would be unique, though, or whether
* that would break in other places.
*/
- if (!(fl->fl_flags & FL_FLOCK))
+ if (!(fl->c.flc_flags & FL_FLOCK))
return -ENOLCK;
fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id);
trace_afs_flock_op(vnode, fl, afs_flock_op_flock);
/* we're simulating flock() locks using posix locks on the server */
- if (fl->fl_type == F_UNLCK)
+ if (lock_is_unlock(fl))
ret = afs_do_unlk(file, fl);
else
ret = afs_do_setlk(file, fl);
@@ -843,7 +843,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
*/
static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file));
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->c.flc_file));
_enter("");
@@ -861,7 +861,7 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
*/
static void afs_fl_release_private(struct file_lock *fl)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file));
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->c.flc_file));
_enter("");
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 700a27bc8c25..ed04bd1eeae8 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -602,6 +602,8 @@ iterate_address:
goto wait_for_more_probe_results;
alist = op->estate->addresses;
+ best_prio = -1;
+ addr_index = 0;
for (i = 0; i < alist->nr_addrs; i++) {
if (alist->addrs[i].prio > best_prio) {
addr_index = i;
@@ -609,9 +611,7 @@ iterate_address:
}
}
- addr_index = READ_ONCE(alist->preferred);
- if (!test_bit(addr_index, &set))
- addr_index = __ffs(set);
+ alist->preferred = addr_index;
op->addr_index = addr_index;
set_bit(addr_index, &op->addr_tried);
@@ -656,12 +656,6 @@ wait_for_more_probe_results:
next_server:
trace_afs_rotate(op, afs_rotate_trace_next_server, 0);
_debug("next");
- ASSERT(op->estate);
- alist = op->estate->addresses;
- if (op->call_responded &&
- op->addr_index != READ_ONCE(alist->preferred) &&
- test_bit(alist->preferred, &op->addr_tried))
- WRITE_ONCE(alist->preferred, op->addr_index);
op->estate = NULL;
goto pick_server;
@@ -690,14 +684,7 @@ no_more_servers:
failed:
trace_afs_rotate(op, afs_rotate_trace_failed, 0);
op->flags |= AFS_OPERATION_STOP;
- if (op->estate) {
- alist = op->estate->addresses;
- if (op->call_responded &&
- op->addr_index != READ_ONCE(alist->preferred) &&
- test_bit(alist->preferred, &op->addr_tried))
- WRITE_ONCE(alist->preferred, op->addr_index);
- op->estate = NULL;
- }
+ op->estate = NULL;
_leave(" = f [failed %d]", afs_op_error(op));
return false;
}
diff --git a/fs/afs/validation.c b/fs/afs/validation.c
index 46b37f2cce7d..32a53fc8dfb2 100644
--- a/fs/afs/validation.c
+++ b/fs/afs/validation.c
@@ -122,6 +122,9 @@ bool afs_check_validity(const struct afs_vnode *vnode)
const struct afs_volume *volume = vnode->volume;
time64_t deadline = ktime_get_real_seconds() + 10;
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+ return true;
+
if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
atomic64_read(&vnode->cb_expires_at) <= deadline ||
volume->cb_expires_at <= deadline ||
@@ -389,12 +392,17 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
key_serial(key));
if (afs_check_validity(vnode))
- return 0;
+ return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
ret = down_write_killable(&vnode->validate_lock);
if (ret < 0)
goto error;
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+ ret = -ESTALE;
+ goto error_unlock;
+ }
+
/* Validate a volume after the v_break has changed or the volume
* callback expired. We only want to do this once per volume per
* v_break change. The actual work will be done when parsing the
@@ -448,12 +456,6 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
vnode->cb_ro_snapshot = cb_ro_snapshot;
vnode->cb_scrub = cb_scrub;
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
- _debug("file already deleted");
- ret = -ESTALE;
- goto error_unlock;
- }
-
/* if the vnode's data version number changed then its contents are
* different */
zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
diff --git a/fs/aio.c b/fs/aio.c
index 28223f511931..0f4f531c9780 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -589,8 +589,8 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
- struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
- struct kioctx *ctx = req->ki_ctx;
+ struct aio_kiocb *req;
+ struct kioctx *ctx;
unsigned long flags;
/*
@@ -600,9 +600,13 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
if (!(iocb->ki_flags & IOCB_AIO_RW))
return;
+ req = container_of(iocb, struct aio_kiocb, rw);
+
if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
return;
+ ctx = req->ki_ctx;
+
spin_lock_irqsave(&ctx->ctx_lock, flags);
list_add_tail(&req->ki_list, &ctx->active_reqs);
req->ki_cancel = cancel;
@@ -1198,8 +1202,8 @@ static void aio_complete(struct aio_kiocb *iocb)
spin_lock_irqsave(&ctx->wait.lock, flags);
list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry)
if (avail >= curr->min_nr) {
- list_del_init_careful(&curr->w.entry);
wake_up_process(curr->w.private);
+ list_del_init_careful(&curr->w.entry);
}
spin_unlock_irqrestore(&ctx->wait.lock, flags);
}
@@ -2165,11 +2169,14 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
#endif
/* sys_io_cancel:
- * Attempts to cancel an iocb previously passed to io_submit(). If the
- * operation is successfully cancelled 0 is returned. May fail with
- * -EFAULT if any of the data structures pointed to are invalid. May
- * fail with -EINVAL if aio_context specified by ctx_id is invalid. Will
- * fail with -ENOSYS if not implemented.
+ * Attempts to cancel an iocb previously passed to io_submit. If
+ * the operation is successfully cancelled, the resulting event is
+ * copied into the memory pointed to by result without being placed
+ * into the completion queue and 0 is returned. May fail with
+ * -EFAULT if any of the data structures pointed to are invalid.
+ * May fail with -EINVAL if aio_context specified by ctx_id is
+ * invalid. May fail with -EAGAIN if the iocb specified was not
+ * cancelled. Will fail with -ENOSYS if not implemented.
*/
SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
struct io_event __user *, result)
@@ -2200,12 +2207,14 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
}
spin_unlock_irq(&ctx->ctx_lock);
- /*
- * The result argument is no longer used - the io_event is always
- * delivered via the ring buffer.
- */
- if (ret == 0 && kiocb->rw.ki_flags & IOCB_AIO_RW)
- aio_complete_rw(&kiocb->rw, -EINTR);
+ if (!ret) {
+ /*
+ * The result argument is no longer used - the io_event is
+ * always delivered via the ring buffer. -EINPROGRESS indicates
+ * cancellation is in progress:
+ */
+ ret = -EINPROGRESS;
+ }
percpu_ref_put(&ctx->users);
diff --git a/fs/attr.c b/fs/attr.c
index 5a13f0c8495f..960a310581eb 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -16,8 +16,6 @@
#include <linux/fcntl.h>
#include <linux/filelock.h>
#include <linux/security.h>
-#include <linux/evm.h>
-#include <linux/ima.h>
#include "internal.h"
@@ -352,7 +350,7 @@ int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
EXPORT_SYMBOL(may_setattr);
/**
- * notify_change - modify attributes of a filesytem object
+ * notify_change - modify attributes of a filesystem object
* @idmap: idmap of the mount the inode was found from
* @dentry: object affected
* @attr: new attributes
@@ -502,8 +500,7 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
if (!error) {
fsnotify_change(dentry, ia_valid);
- ima_inode_post_setattr(idmap, dentry);
- evm_inode_post_setattr(dentry, ia_valid);
+ security_inode_post_setattr(idmap, dentry, ia_valid);
}
return error;
diff --git a/fs/backing-file.c b/fs/backing-file.c
index a681f38d84d8..740185198db3 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -325,9 +325,7 @@ EXPORT_SYMBOL_GPL(backing_file_mmap);
static int __init backing_aio_init(void)
{
- backing_aio_cachep = kmem_cache_create("backing_aio",
- sizeof(struct backing_aio),
- 0, SLAB_HWCACHE_ALIGN, NULL);
+ backing_aio_cachep = KMEM_CACHE(backing_aio, SLAB_HWCACHE_ALIGN);
if (!backing_aio_cachep)
return -ENOMEM;
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 1a05cecda7cc..66ca0bbee639 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -17,6 +17,7 @@ bcachefs-y := \
btree_journal_iter.o \
btree_key_cache.o \
btree_locking.o \
+ btree_node_scan.o \
btree_trans_commit.o \
btree_update.o \
btree_update_interior.o \
@@ -37,6 +38,7 @@ bcachefs-y := \
error.o \
extents.o \
extent_update.o \
+ eytzinger.o \
fs.o \
fs-common.o \
fs-ioctl.o \
@@ -67,6 +69,7 @@ bcachefs-y := \
quota.o \
rebalance.o \
recovery.o \
+ recovery_passes.o \
reflink.o \
replicas.o \
sb-clean.o \
@@ -82,6 +85,7 @@ bcachefs-y := \
super-io.o \
sysfs.o \
tests.o \
+ time_stats.o \
thread_with_file.o \
trace.o \
two_state_shared_lock.o \
@@ -90,3 +94,6 @@ bcachefs-y := \
xattr.o
obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
+
+# Silence "note: xyz changed in GCC X.X" messages
+subdir-ccflags-y += $(call cc-disable-warning, psabi)
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 3640f417cce1..5c180fdc3efb 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -281,7 +281,6 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
- struct bkey_s_c_xattr xattr;
struct posix_acl *acl = NULL;
struct bkey_s_c k;
int ret;
@@ -290,28 +289,27 @@ retry:
ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
&hash, inode_inum(inode), &search, 0);
- if (ret) {
- if (!bch2_err_matches(ret, ENOENT))
- acl = ERR_PTR(ret);
- goto out;
- }
+ if (ret)
+ goto err;
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
- if (ret) {
- acl = ERR_PTR(ret);
- goto out;
- }
+ if (ret)
+ goto err;
- xattr = bkey_s_c_to_xattr(k);
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
+ le16_to_cpu(xattr.v->x_val_len));
+ ret = PTR_ERR_OR_ZERO(acl);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
- if (!IS_ERR(acl))
+ if (ret)
+ acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;
+
+ if (!IS_ERR_OR_NULL(acl))
set_cached_acl(&inode->v, type, acl);
-out:
- if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
- goto retry;
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index fd3e175d8342..4ff56fa4d539 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -29,6 +29,8 @@
#include <linux/sched/task.h>
#include <linux/sort.h>
+static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket);
+
/* Persistent alloc info: */
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
@@ -530,13 +532,13 @@ int bch2_bucket_gens_init(struct bch_fs *c)
u8 gen = bch2_alloc_to_v4(k, &a)->gen;
unsigned offset;
struct bpos pos = alloc_gens_pos(iter.pos, &offset);
+ int ret2 = 0;
if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
- if (ret)
- break;
+ ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret2)
+ goto iter_err;
have_bucket_gens_key = false;
}
@@ -547,7 +549,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
}
g.v.gens[offset] = gen;
- 0;
+iter_err:
+ ret2;
}));
if (have_bucket_gens_key && !ret)
@@ -850,7 +853,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
bucket_journal_seq);
if (ret) {
bch2_fs_fatal_error(c,
- "error setting bucket_needs_journal_commit: %i", ret);
+ "setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
return ret;
}
}
@@ -860,23 +863,28 @@ int bch2_trigger_alloc(struct btree_trans *trans,
*bucket_gen(ca, new.k->p.offset) = new_a->gen;
bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
+ percpu_up_read(&c->mark_lock);
+
+#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
+#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
+#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
- if (new_a->data_type == BCH_DATA_free &&
- (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
+ if (statechange(a->data_type == BCH_DATA_free) &&
+ bucket_flushed(new_a))
closure_wake_up(&c->freelist_wait);
- if (new_a->data_type == BCH_DATA_need_discard &&
- (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
- bch2_do_discards(c);
+ if (statechange(a->data_type == BCH_DATA_need_discard) &&
+ !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
+ bucket_flushed(new_a))
+ bch2_discard_one_bucket_fast(c, new.k->p);
- if (old_a->data_type != BCH_DATA_cached &&
- new_a->data_type == BCH_DATA_cached &&
+ if (statechange(a->data_type == BCH_DATA_cached) &&
+ !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
bch2_do_invalidates(c);
- if (new_a->data_type == BCH_DATA_need_gc_gens)
+ if (statechange(a->data_type == BCH_DATA_need_gc_gens))
bch2_do_gc_gens(c);
- percpu_up_read(&c->mark_lock);
}
if ((flags & BTREE_TRIGGER_GC) &&
@@ -1045,14 +1053,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
- if (k.k->type != discard_key_type &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, need_discard_key_wrong,
- "incorrect key in need_discard btree (got %s should be %s)\n"
- " %s",
- bch2_bkey_types[k.k->type],
- bch2_bkey_types[discard_key_type],
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ if (fsck_err_on(k.k->type != discard_key_type,
+ c, need_discard_key_wrong,
+ "incorrect key in need_discard btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[discard_key_type],
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@@ -1076,15 +1083,14 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
- if (k.k->type != freespace_key_type &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, freespace_key_wrong,
- "incorrect key in freespace btree (got %s should be %s)\n"
- " %s",
- bch2_bkey_types[k.k->type],
- bch2_bkey_types[freespace_key_type],
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ if (fsck_err_on(k.k->type != freespace_key_type,
+ c, freespace_key_wrong,
+ "incorrect key in freespace btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[freespace_key_type],
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@@ -1108,14 +1114,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
- if (a->gen != alloc_gen(k, gens_offset) &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, bucket_gens_key_wrong,
- "incorrect gen in bucket_gens btree (got %u should be %u)\n"
- " %s",
- alloc_gen(k, gens_offset), a->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
+ c, bucket_gens_key_wrong,
+ "incorrect gen in bucket_gens btree (got %u should be %u)\n"
+ " %s",
+ alloc_gen(k, gens_offset), a->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i_bucket_gens *g =
bch2_trans_kmalloc(trans, sizeof(*g));
@@ -1167,14 +1172,13 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
*end = bkey_min(k.k->p, *end);
- if (k.k->type != KEY_TYPE_set &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, freespace_hole_missing,
- "hole in alloc btree missing in freespace btree\n"
- " device %llu buckets %llu-%llu",
- freespace_iter->pos.inode,
- freespace_iter->pos.offset,
- end->offset))) {
+ if (fsck_err_on(k.k->type != KEY_TYPE_set,
+ c, freespace_hole_missing,
+ "hole in alloc btree missing in freespace btree\n"
+ " device %llu buckets %llu-%llu",
+ freespace_iter->pos.inode,
+ freespace_iter->pos.offset,
+ end->offset)) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@@ -1604,6 +1608,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
return ret;
}
+static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
+{
+ int ret;
+
+ mutex_lock(&c->discard_buckets_in_flight_lock);
+ darray_for_each(c->discard_buckets_in_flight, i)
+ if (bkey_eq(*i, bucket)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ ret = darray_push(&c->discard_buckets_in_flight, bucket);
+out:
+ mutex_unlock(&c->discard_buckets_in_flight_lock);
+ return ret;
+}
+
+static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket)
+{
+ mutex_lock(&c->discard_buckets_in_flight_lock);
+ darray_for_each(c->discard_buckets_in_flight, i)
+ if (bkey_eq(*i, bucket)) {
+ darray_remove_item(&c->discard_buckets_in_flight, i);
+ goto found;
+ }
+ BUG();
+found:
+ mutex_unlock(&c->discard_buckets_in_flight_lock);
+}
+
struct discard_buckets_state {
u64 seen;
u64 open;
@@ -1642,6 +1676,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
struct bch_dev *ca;
struct bkey_i_alloc_v4 *a;
struct printbuf buf = PRINTBUF;
+ bool discard_locked = false;
int ret = 0;
ca = bch_dev_bkey_exists(c, pos.inode);
@@ -1678,37 +1713,45 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
if (ret)
goto out;
- if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
- a->v.gen++;
- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
- goto write;
- }
-
- if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
- bch2_trans_inconsistent(trans,
- "clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
- "%s",
- a->v.journal_seq,
- c->journal.flushed_seq_ondisk,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (a->v.dirty_sectors) {
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "attempting to discard bucket with dirty data\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = -EIO;
- }
goto out;
}
if (a->v.data_type != BCH_DATA_need_discard) {
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
- bch2_trans_inconsistent(trans,
- "bucket incorrectly set in need_discard btree\n"
- "%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
+ if (data_type_is_empty(a->v.data_type) &&
+ BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
+ a->v.gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ goto write;
}
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "bucket incorrectly set in need_discard btree\n"
+ "%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = -EIO;
goto out;
}
+ if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
+ a->v.journal_seq,
+ c->journal.flushed_seq_ondisk,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = -EIO;
+ goto out;
+ }
+
+ if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true)))
+ goto out;
+
+ discard_locked = true;
+
if (!bkey_eq(*discard_pos_done, iter.pos) &&
ca->mi.discard && !c->opts.nochanges) {
/*
@@ -1740,6 +1783,8 @@ write:
count_event(c, bucket_discard);
s->discarded++;
out:
+ if (discard_locked)
+ discard_in_flight_remove(c, iter.pos);
s->seen++;
bch2_trans_iter_exit(trans, &iter);
percpu_ref_put(&ca->io_ref);
@@ -1779,6 +1824,94 @@ void bch2_do_discards(struct bch_fs *c)
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
+static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
+{
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto err;
+
+ BUG_ON(a->v.dirty_sectors);
+ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
+ a->v.data_type = alloc_data_type(a->v, a->v.data_type);
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static void bch2_do_discards_fast_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work);
+
+ while (1) {
+ bool got_bucket = false;
+ struct bpos bucket;
+ struct bch_dev *ca;
+
+ mutex_lock(&c->discard_buckets_in_flight_lock);
+ darray_for_each(c->discard_buckets_in_flight, i) {
+ if (i->snapshot)
+ continue;
+
+ ca = bch_dev_bkey_exists(c, i->inode);
+
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ darray_remove_item(&c->discard_buckets_in_flight, i);
+ continue;
+ }
+
+ got_bucket = true;
+ bucket = *i;
+ i->snapshot = true;
+ break;
+ }
+ mutex_unlock(&c->discard_buckets_in_flight_lock);
+
+ if (!got_bucket)
+ break;
+
+ if (ca->mi.discard && !c->opts.nochanges)
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket.offset * ca->mi.bucket_size,
+ ca->mi.bucket_size,
+ GFP_KERNEL);
+
+ int ret = bch2_trans_do(c, NULL, NULL,
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_bucket_needs_discard(trans, bucket));
+ bch_err_fn(c, ret);
+
+ percpu_ref_put(&ca->io_ref);
+ discard_in_flight_remove(c, bucket);
+
+ if (ret)
+ break;
+ }
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+}
+
+static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+
+ if (!percpu_ref_is_dying(&ca->io_ref) &&
+ !discard_in_flight_add(c, bucket) &&
+ bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
+ !queue_work(c->write_ref_wq, &c->discard_fast_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+}
+
static int invalidate_one_bucket(struct btree_trans *trans,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
@@ -1813,6 +1946,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
goto out;
BUG_ON(a->v.data_type != BCH_DATA_cached);
+ BUG_ON(a->v.dirty_sectors);
if (!a->v.cached_sectors)
bch_err(c, "invalidating empty bucket, confused");
@@ -2210,9 +2344,16 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
+void bch2_fs_allocator_background_exit(struct bch_fs *c)
+{
+ darray_exit(&c->discard_buckets_in_flight);
+}
+
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
+ mutex_init(&c->discard_buckets_in_flight_lock);
INIT_WORK(&c->discard_work, bch2_do_discards_work);
+ INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work);
INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index e7f7e842ee1b..052b2fac25d6 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -269,6 +269,7 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+void bch2_fs_allocator_background_exit(struct bch_fs *);
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 633d3223b353..a1fc30adf912 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -188,8 +188,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
{
switch (watermark) {
- case BCH_WATERMARK_reclaim:
+ case BCH_WATERMARK_interior_updates:
return 0;
+ case BCH_WATERMARK_reclaim:
+ return OPEN_BUCKETS_COUNT / 6;
case BCH_WATERMARK_btree:
case BCH_WATERMARK_btree_copygc:
return OPEN_BUCKETS_COUNT / 4;
@@ -236,8 +238,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
if (cl)
closure_wait(&c->open_buckets_wait, cl);
- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
- &c->blocked_allocate_open_bucket, true);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
spin_unlock(&c->freelist_lock);
return ERR_PTR(-BCH_ERR_open_buckets_empty);
}
@@ -263,11 +264,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
ca->nr_open_buckets++;
bch2_open_bucket_hash_add(c, ob);
- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
- &c->blocked_allocate_open_bucket, false);
-
- track_event_change(&c->times[BCH_TIME_blocked_allocate],
- &c->blocked_allocate, false);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
spin_unlock(&c->freelist_lock);
return ob;
@@ -555,8 +553,7 @@ again:
goto again;
}
- track_event_change(&c->times[BCH_TIME_blocked_allocate],
- &c->blocked_allocate, true);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
ob = ERR_PTR(-BCH_ERR_freelist_empty);
goto err;
@@ -1361,15 +1358,17 @@ retry:
/* Don't retry from all devices if we're out of open buckets: */
if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
- int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, watermark,
flags, cl);
- if (!ret ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ if (!ret2 ||
+ bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
+ ret = ret2;
goto alloc_done;
+ }
}
/*
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index b91b7a461056..c2226e947c41 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -22,7 +22,8 @@ struct bucket_alloc_state {
x(copygc) \
x(btree) \
x(btree_copygc) \
- x(reclaim)
+ x(reclaim) \
+ x(interior_updates)
enum bch_watermark {
#define x(name) BCH_WATERMARK_##name,
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 569b97904da4..fadb1078903d 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -8,6 +8,7 @@
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
+#include "checksum.h"
#include "error.h"
#include <linux/mm.h>
@@ -29,8 +30,7 @@ static bool extent_matches_bp(struct bch_fs *c,
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, btree_id, level, k, p,
- &bucket2, &bp2);
+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bucket2, &bp2);
if (bpos_eq(bucket, bucket2) &&
!memcmp(&bp, &bp2, sizeof(bp)))
return true;
@@ -44,13 +44,20 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
struct printbuf *err)
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+
+ /* these will be caught by fsck */
+ if (!bch2_dev_exists2(c, bp.k->p.inode))
+ return 0;
+
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bp.k->p.inode);
struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
int ret = 0;
- bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
+ bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size ||
+ !bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
c, err,
- backpointer_pos_wrong,
- "backpointer at wrong pos");
+ backpointer_bucket_offset_wrong,
+ "backpointer bucket_offset wrong");
fsck_err:
return ret;
}
@@ -131,8 +138,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
printbuf_exit(&buf);
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
- bch2_inconsistent_error(c);
- return -EIO;
+ return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0;
} else {
return 0;
}
@@ -379,7 +385,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
backpointer_to_missing_alloc,
"backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
alloc_iter.pos.inode, alloc_iter.pos.offset,
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, bp_iter, 0);
goto out;
}
@@ -415,6 +421,84 @@ struct extents_to_bp_state {
struct bkey_buf last_flushed;
};
+static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
+ struct bkey_s_c extent, unsigned dev)
+{
+ struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bch2_bkey_drop_device(bkey_i_to_s(n), dev);
+ return bch2_btree_insert_trans(trans, btree, n, 0);
+}
+
+static int check_extent_checksum(struct btree_trans *trans,
+ enum btree_id btree, struct bkey_s_c extent,
+ enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct printbuf buf = PRINTBUF;
+ void *data_buf = NULL;
+ struct bio *bio = NULL;
+ size_t bytes;
+ int ret = 0;
+
+ if (bkey_is_btree_ptr(extent.k))
+ return false;
+
+ bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
+ if (p.ptr.dev == dev)
+ goto found;
+ BUG();
+found:
+ if (!p.crc.csum_type)
+ return false;
+
+ bytes = p.crc.compressed_size << 9;
+
+ struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
+ if (!bch2_dev_get_ioref(ca, READ))
+ return false;
+
+ data_buf = kvmalloc(bytes, GFP_KERNEL);
+ if (!data_buf) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ bio = bio_alloc(ca->disk_sb.bdev, 1, REQ_OP_READ, GFP_KERNEL);
+ bio->bi_iter.bi_sector = p.ptr.offset;
+ bch2_bio_map(bio, data_buf, bytes);
+ ret = submit_bio_wait(bio);
+ if (ret)
+ goto err;
+
+ prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
+ prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree));
+ bch2_bkey_val_to_text(&buf, c, extent);
+ prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree));
+ bch2_bkey_val_to_text(&buf, c, extent2);
+
+ struct nonce nonce = extent_nonce(extent.k->version, p.crc);
+ struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
+ if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
+ c, dup_backpointer_to_bad_csum_extent,
+ "%s", buf.buf))
+ ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
+fsck_err:
+err:
+ if (bio)
+ bio_put(bio);
+ kvfree(data_buf);
+ percpu_ref_put(&ca->io_ref);
+ printbuf_exit(&buf);
+ return ret;
+}
+
static int check_bp_exists(struct btree_trans *trans,
struct extents_to_bp_state *s,
struct bpos bucket,
@@ -422,7 +506,8 @@ static int check_bp_exists(struct btree_trans *trans,
struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
- struct btree_iter bp_iter = { NULL };
+ struct btree_iter bp_iter = {};
+ struct btree_iter other_extent_iter = {};
struct printbuf buf = PRINTBUF;
struct bkey_s_c bp_k;
struct bkey_buf tmp;
@@ -430,13 +515,19 @@ static int check_bp_exists(struct btree_trans *trans,
bch2_bkey_buf_init(&tmp);
+ if (!bch2_dev_bucket_exists(c, bucket)) {
+ prt_str(&buf, "extent for nonexistent device:bucket ");
+ bch2_bpos_to_text(&buf, bucket);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ bch_err(c, "%s", buf.buf);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
if (bpos_lt(bucket, s->bucket_start) ||
bpos_gt(bucket, s->bucket_end))
return 0;
- if (!bch2_dev_bucket_exists(c, bucket))
- goto missing;
-
bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
bucket_pos_to_bp(c, bucket, bp.bucket_offset),
0);
@@ -462,24 +553,96 @@ static int check_bp_exists(struct btree_trans *trans,
ret = -BCH_ERR_transaction_restart_write_buffer_flush;
goto out;
}
- goto missing;
+
+ goto check_existing_bp;
}
out:
err:
fsck_err:
+ bch2_trans_iter_exit(trans, &other_extent_iter);
bch2_trans_iter_exit(trans, &bp_iter);
bch2_bkey_buf_exit(&tmp, c);
printbuf_exit(&buf);
return ret;
+check_existing_bp:
+ /* Do we have a backpointer for a different extent? */
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ goto missing;
+
+ struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v;
+
+ struct bkey_s_c other_extent =
+ bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0);
+ ret = bkey_err(other_extent);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ ret = 0;
+ if (ret)
+ goto err;
+
+ if (!other_extent.k)
+ goto missing;
+
+ if (bch2_extents_match(orig_k, other_extent)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, other_extent);
+ bch_err(c, "%s", buf.buf);
+
+ if (other_extent.k->size <= orig_k.k->size) {
+ ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode);
+ if (ret)
+ goto err;
+ goto out;
+ } else {
+ ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode);
+ if (ret)
+ goto err;
+ goto missing;
+ }
+ }
+
+ ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ ret = 0;
+ goto missing;
+ }
+
+ ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode);
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, other_extent);
+ bch_err(c, "%s", buf.buf);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
missing:
+ printbuf_reset(&buf);
prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
bch2_btree_id_str(bp.btree_id), bp.level);
bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_printf(&buf, "\nbp pos ");
- bch2_bpos_to_text(&buf, bp_iter.pos);
+ prt_printf(&buf, "\n got: ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
+
+ struct bkey_i_backpointer n_bp_k;
+ bkey_backpointer_init(&n_bp_k.k_i);
+ n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ n_bp_k.v = bp;
+ prt_printf(&buf, "\n want: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));
- if (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
+ if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
goto out;
@@ -504,8 +667,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, btree, level,
- k, p, &bucket_pos, &bp);
+ bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bucket_pos, &bp);
ret = check_bp_exists(trans, s, bucket_pos, bp, k);
if (ret)
@@ -555,60 +717,61 @@ static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
};
}
-static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+static u64 mem_may_pin_bytes(struct bch_fs *c)
{
struct sysinfo i;
- u64 mem_bytes;
-
si_meminfo(&i);
- mem_bytes = i.totalram * i.mem_unit;
- return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
+
+ u64 mem_bytes = i.totalram * i.mem_unit;
+ return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100);
+}
+
+static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+{
+ return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size);
}
static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
- unsigned btree_leaf_mask,
- unsigned btree_interior_mask,
+ u64 btree_leaf_mask,
+ u64 btree_interior_mask,
struct bbpos start, struct bbpos *end)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
- enum btree_id btree;
+ struct bch_fs *c = trans->c;
+ s64 mem_may_pin = mem_may_pin_bytes(c);
int ret = 0;
- for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) {
- unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2;
+ btree_interior_mask |= btree_leaf_mask;
+
+ c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask;
+ c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask;
+ c->btree_cache.pinned_nodes_start = start;
+ c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
+
+ for (enum btree_id btree = start.btree;
+ btree < BTREE_ID_NR && !ret;
+ btree++) {
+ unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1;
+ struct btree_iter iter;
+ struct btree *b;
if (!((1U << btree) & btree_leaf_mask) &&
!((1U << btree) & btree_interior_mask))
continue;
- bch2_trans_node_iter_init(trans, &iter, btree,
- btree == start.btree ? start.pos : POS_MIN,
- 0, depth, 0);
- /*
- * for_each_btree_key_contineu() doesn't check the return value
- * from bch2_btree_iter_advance(), which is needed when
- * iterating over interior nodes where we'll see keys at
- * SPOS_MAX:
- */
- do {
- k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
- ret = bkey_err(k);
- if (!k.k || ret)
- break;
-
- --btree_nodes;
- if (!btree_nodes) {
- *end = BBPOS(btree, k.k->p);
+ __for_each_btree_node(trans, iter, btree,
+ btree == start.btree ? start.pos : POS_MIN,
+ 0, depth, BTREE_ITER_PREFETCH, b, ret) {
+ mem_may_pin -= btree_buf_bytes(b);
+ if (mem_may_pin <= 0) {
+ c->btree_cache.pinned_nodes_end = *end =
+ BBPOS(btree, b->key.k.p);
bch2_trans_iter_exit(trans, &iter);
return 0;
}
- } while (bch2_btree_iter_advance(&iter));
+ }
bch2_trans_iter_exit(trans, &iter);
}
- *end = BBPOS_MAX;
return ret;
}
@@ -666,62 +829,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
return 0;
}
-static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
- struct bpos bucket)
-{
- return bch2_dev_exists2(c, bucket.inode)
- ? bucket_pos_to_bp(c, bucket, 0)
- : bucket;
-}
-
-static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
- struct bpos start, struct bpos *end)
-{
- struct btree_iter alloc_iter;
- struct btree_iter bp_iter;
- struct bkey_s_c alloc_k, bp_k;
- size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
- bool alloc_end = false, bp_end = false;
- int ret = 0;
-
- bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
- start, 0, 1, 0);
- bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0);
- while (1) {
- alloc_k = !alloc_end
- ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0)
- : bkey_s_c_null;
- bp_k = !bp_end
- ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0)
- : bkey_s_c_null;
-
- ret = bkey_err(alloc_k) ?: bkey_err(bp_k);
- if ((!alloc_k.k && !bp_k.k) || ret) {
- *end = SPOS_MAX;
- break;
- }
-
- --btree_nodes;
- if (!btree_nodes) {
- *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX;
- break;
- }
-
- if (bpos_lt(alloc_iter.pos, SPOS_MAX) &&
- bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) {
- if (!bch2_btree_iter_advance(&alloc_iter))
- alloc_end = true;
- } else {
- if (!bch2_btree_iter_advance(&bp_iter))
- bp_end = true;
- }
- }
- bch2_trans_iter_exit(trans, &bp_iter);
- bch2_trans_iter_exit(trans, &alloc_iter);
- return ret;
-}
-
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
@@ -732,10 +839,16 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bkey_init(&s.last_flushed.k->k);
while (1) {
- ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
+ struct bbpos end;
+ ret = bch2_get_btree_in_memory_pos(trans,
+ BIT_ULL(BTREE_ID_backpointers),
+ BIT_ULL(BTREE_ID_backpointers),
+ BBPOS(BTREE_ID_backpointers, s.bucket_start), &end);
if (ret)
break;
+ s.bucket_end = end.pos;
+
if ( bpos_eq(s.bucket_start, POS_MIN) &&
!bpos_eq(s.bucket_end, SPOS_MAX))
bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
@@ -763,6 +876,9 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
+ c->btree_cache.pinned_nodes_leaf_mask = 0;
+ c->btree_cache.pinned_nodes_interior_mask = 0;
+
bch_err_fn(c, ret);
return ret;
}
@@ -868,6 +984,9 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
+ c->btree_cache.pinned_nodes_leaf_mask = 0;
+ c->btree_cache.pinned_nodes_interior_mask = 0;
+
bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 327365a9feac..85949b9fd880 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -53,14 +53,11 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
u64 bucket_offset)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
- struct bpos ret;
-
- ret = POS(bucket.inode,
- (bucket_to_sector(ca, bucket.offset) <<
- MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+ struct bpos ret = POS(bucket.inode,
+ (bucket_to_sector(ca, bucket.offset) <<
+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret)));
-
return ret;
}
@@ -90,20 +87,40 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i);
}
-static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p)
+static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry)
{
- return level ? BCH_DATA_btree :
- p.has_ec ? BCH_DATA_stripe :
- BCH_DATA_user;
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ return BCH_DATA_btree;
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
+ case KEY_TYPE_stripe: {
+ const struct bch_extent_ptr *ptr = &entry->ptr;
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+ BUG_ON(ptr < s.v->ptrs ||
+ ptr >= s.v->ptrs + s.v->nr_blocks);
+
+ return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity
+ : BCH_DATA_user;
+ }
+ default:
+ BUG();
+ }
}
static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
struct bpos *bucket_pos, struct bch_backpointer *bp)
{
- enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
s64 sectors = level ? btree_sectors(c) : k.k->size;
u32 bucket_offset;
diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h
index 5198e94cf3b8..f63893344f80 100644
--- a/fs/bcachefs/bbpos_types.h
+++ b/fs/bcachefs/bbpos_types.h
@@ -13,6 +13,6 @@ static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
}
#define BBPOS_MIN BBPOS(0, POS_MIN)
-#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX)
#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 69d0d60d50e3..91c3c1fef233 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -209,9 +209,10 @@
#include "fifo.h"
#include "nocow_locking_types.h"
#include "opts.h"
-#include "recovery_types.h"
+#include "recovery_passes_types.h"
#include "sb-errors_types.h"
#include "seqmutex.h"
+#include "time_stats.h"
#include "util.h"
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -266,6 +267,9 @@ do { \
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
__printf(2, 3)
+void bch2_print_opts(struct bch_opts *, const char *, ...);
+
+__printf(2, 3)
void __bch2_print(struct bch_fs *c, const char *fmt, ...);
#define maybe_dev_to_fs(_c) _Generic((_c), \
@@ -452,6 +456,7 @@ enum bch_time_stats {
#include "alloc_types.h"
#include "btree_types.h"
+#include "btree_node_scan_types.h"
#include "btree_write_buffer_types.h"
#include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h"
@@ -504,6 +509,7 @@ enum gc_phase {
GC_PHASE_BTREE_deleted_inodes,
GC_PHASE_BTREE_logged_ops,
GC_PHASE_BTREE_rebalance_work,
+ GC_PHASE_BTREE_subvolume_children,
GC_PHASE_PENDING_DELETE,
};
@@ -593,7 +599,7 @@ struct bch_dev {
/* The rest of this all shows up in sysfs */
atomic64_t cur_latency[2];
- struct bch2_time_stats io_latency[2];
+ struct bch2_time_stats_quantiles io_latency[2];
#define CONGESTED_MAX 1024
atomic_t congested;
@@ -609,6 +615,7 @@ struct bch_dev {
*/
#define BCH_FS_FLAGS() \
+ x(new_fs) \
x(started) \
x(may_go_rw) \
x(rw) \
@@ -663,6 +670,8 @@ struct journal_seq_blacklist_table {
};
struct journal_keys {
+ /* must match layout in darray_types.h */
+ size_t nr, size;
struct journal_key {
u64 journal_seq;
u32 journal_offset;
@@ -671,15 +680,13 @@ struct journal_keys {
bool allocated;
bool overwritten;
struct bkey_i *k;
- } *d;
+ } *data;
/*
* Gap buffer: instead of all the empty space in the array being at the
* end of the buffer - from @nr to @size - the empty space is at @gap.
* This means that sequential insertions are O(n) instead of O(n^2).
*/
size_t gap;
- size_t nr;
- size_t size;
atomic_t ref;
bool initial_ref_held;
};
@@ -702,7 +709,10 @@ struct btree_trans_buf {
x(stripe_delete) \
x(reflink) \
x(fallocate) \
+ x(fsync) \
+ x(dio_write) \
x(discard) \
+ x(discard_fast) \
x(invalidate) \
x(delete_dead_snapshots) \
x(snapshot_delete_pagecache) \
@@ -790,6 +800,7 @@ struct bch_fs {
u64 features;
u64 compat;
unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
+ u64 btrees_lost_data;
} sb;
@@ -804,7 +815,6 @@ struct bch_fs {
/* snapshot.c: */
struct snapshot_table __rcu *snapshots;
- size_t snapshot_table_size;
struct mutex snapshot_table_lock;
struct rw_semaphore snapshot_create_lock;
@@ -843,6 +853,8 @@ struct bch_fs {
struct workqueue_struct *btree_interior_update_worker;
struct work_struct btree_interior_update_work;
+ struct workqueue_struct *btree_node_rewrite_worker;
+
struct list_head pending_node_rewrites;
struct mutex pending_node_rewrites_lock;
@@ -919,8 +931,6 @@ struct bch_fs {
/* ALLOCATOR */
spinlock_t freelist_lock;
struct closure_waitlist freelist_wait;
- u64 blocked_allocate;
- u64 blocked_allocate_open_bucket;
open_bucket_idx_t open_buckets_freelist;
open_bucket_idx_t open_buckets_nr_free;
@@ -940,8 +950,11 @@ struct bch_fs {
unsigned write_points_nr;
struct buckets_waiting_for_journal buckets_waiting_for_journal;
- struct work_struct discard_work;
struct work_struct invalidate_work;
+ struct work_struct discard_work;
+ struct mutex discard_buckets_in_flight_lock;
+ DARRAY(struct bpos) discard_buckets_in_flight;
+ struct work_struct discard_fast_work;
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
@@ -1095,6 +1108,8 @@ struct bch_fs {
struct journal_keys journal_keys;
struct list_head journal_iters;
+ struct find_btree_nodes found_btree_nodes;
+
u64 last_bucket_seq_cleanup;
u64 counters_on_mount[BCH_COUNTER_NR];
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 0668b682a21c..085987435a5e 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -189,7 +189,11 @@ struct bversion {
__u32 hi;
__u64 lo;
#endif
-} __packed __aligned(4);
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+__aligned(4)
+#endif
+;
struct bkey {
/* Size of combined key and value, in u64s */
@@ -222,7 +226,36 @@ struct bkey {
__u8 pad[1];
#endif
-} __packed __aligned(8);
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+/*
+ * The big-endian version of bkey can't be compiled by rustc with the "aligned"
+ * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
+ * So for Rust compatibility, don't include this. It can be included in the LE
+ * version because the "packed" attr is redundant in that case.
+ *
+ * History: (quoting Kent)
+ *
+ * Specifically, when I was designing bkey, I wanted the header to be no
+ * bigger than necessary so that bkey_packed could use the rest. That means that
+ * decently often extent keys will fit into only 8 bytes, instead of spilling over
+ * to 16.
+ *
+ * But packed_bkey treats the part after the header - the packed section -
+ * as a single multi word, variable length integer. And bkey, the unpacked
+ * version, is just a special case version of a bkey_packed; all the packed
+ * bkey code will work on keys in any packed format, the in-memory
+ * representation of an unpacked key also is just one type of packed key...
+ *
+ * So that constrains the key part of a big endian bkey to start right
+ * after the header.
+ *
+ * If we ever do a bkey_v2 and need to expand the header by another byte for
+ * some reason - that will clean up this wart.
+ */
+__aligned(8)
+#endif
+;
struct bkey_packed {
__u64 _data[0];
@@ -545,7 +578,8 @@ struct bch_member {
__le64 nbuckets; /* device size */
__le16 first_bucket; /* index of first bucket used */
__le16 bucket_size; /* sectors */
- __le32 pad;
+ __u8 btree_bitmap_shift;
+ __u8 pad[3];
__le64 last_mount; /* time_t */
__le64 flags;
@@ -554,6 +588,7 @@ struct bch_member {
__le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
__le64 errors_reset_time;
__le64 seq;
+ __le64 btree_allocated_bitmap;
};
#define BCH_MEMBER_V1_BYTES 56
@@ -785,6 +820,7 @@ struct bch_sb_field_ext {
struct bch_sb_field field;
__le64 recovery_passes_required[2];
__le64 errors_silent[8];
+ __le64 btrees_lost_data;
};
struct bch_sb_field_downgrade_entry {
@@ -840,7 +876,10 @@ struct bch_sb_field_downgrade {
x(snapshot_skiplists, BCH_VERSION(1, 1)) \
x(deleted_inodes, BCH_VERSION(1, 2)) \
x(rebalance_work, BCH_VERSION(1, 3)) \
- x(member_seq, BCH_VERSION(1, 4))
+ x(member_seq, BCH_VERSION(1, 4)) \
+ x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
+ x(btree_subvolume_children, BCH_VERSION(1, 6)) \
+ x(mi_btree_bitmap, BCH_VERSION(1, 7))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -1275,9 +1314,10 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(dev_usage, 8) \
x(log, 9) \
x(overwrite, 10) \
- x(write_buffer_keys, 11)
+ x(write_buffer_keys, 11) \
+ x(datetime, 12)
-enum {
+enum bch_jset_entry_type {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
BCH_JSET_ENTRY_TYPES()
#undef x
@@ -1323,7 +1363,7 @@ struct jset_entry_blacklist_v2 {
x(inodes, 1) \
x(key_version, 2)
-enum {
+enum bch_fs_usage_type {
#define x(f, nr) BCH_FS_USAGE_##f = nr,
BCH_FS_USAGE_TYPES()
#undef x
@@ -1376,6 +1416,11 @@ struct jset_entry_log {
u8 d[];
} __packed __aligned(8);
+struct jset_entry_datetime {
+ struct jset_entry entry;
+ __le64 seconds;
+} __packed __aligned(8);
+
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
@@ -1482,7 +1527,9 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
BIT_ULL(KEY_TYPE_logged_op_finsert)) \
x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
- BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
+ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
+ x(subvolume_children, 19, 0, \
+ BIT_ULL(KEY_TYPE_set))
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,
@@ -1491,6 +1538,20 @@ enum btree_id {
BTREE_ID_NR
};
+static inline bool btree_id_is_alloc(enum btree_id id)
+{
+ switch (id) {
+ case BTREE_ID_alloc:
+ case BTREE_ID_backpointers:
+ case BTREE_ID_need_discard:
+ case BTREE_ID_freespace:
+ case BTREE_ID_bucket_gens:
+ return true;
+ default:
+ return false;
+ }
+}
+
#define BTREE_MAX_DEPTH 4U
/* Btree nodes */
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 831be01809f2..3a45d128f608 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -4,7 +4,7 @@
#include <linux/bug.h>
#include "bcachefs_format.h"
-
+#include "bkey_types.h"
#include "btree_types.h"
#include "util.h"
#include "vstructs.h"
@@ -31,57 +31,6 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *,
const struct bkey_format *,
const struct bkey_packed *);
-/* bkey with split value, const */
-struct bkey_s_c {
- const struct bkey *k;
- const struct bch_val *v;
-};
-
-/* bkey with split value */
-struct bkey_s {
- union {
- struct {
- struct bkey *k;
- struct bch_val *v;
- };
- struct bkey_s_c s_c;
- };
-};
-
-#define bkey_p_next(_k) vstruct_next(_k)
-
-static inline struct bkey_i *bkey_next(struct bkey_i *k)
-{
- return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
-}
-
-#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
-
-static inline size_t bkey_val_bytes(const struct bkey *k)
-{
- return bkey_val_u64s(k) * sizeof(u64);
-}
-
-static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
-{
- unsigned u64s = BKEY_U64s + val_u64s;
-
- BUG_ON(u64s > U8_MAX);
- k->u64s = u64s;
-}
-
-static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
-{
- set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
-}
-
-#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
-
-#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
-
-#define bkey_whiteout(_k) \
- ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
-
enum bkey_lr_packed {
BKEY_PACKED_BOTH,
BKEY_PACKED_RIGHT,
@@ -362,10 +311,13 @@ static inline struct bpos bkey_start_pos(const struct bkey *k)
static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
const struct bkey_packed *k)
{
- unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+ return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+}
- EBUG_ON(k->u64s < ret);
- return ret;
+static inline bool bkeyp_u64s_valid(const struct bkey_format *f,
+ const struct bkey_packed *k)
+{
+ return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s);
}
static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
@@ -553,155 +505,6 @@ static inline void bkey_reassemble(struct bkey_i *dst,
memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
}
-#define bkey_s_null ((struct bkey_s) { .k = NULL })
-#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
-
-#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
-#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
-
-static inline struct bkey_s bkey_to_s(struct bkey *k)
-{
- return (struct bkey_s) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
-{
- return (struct bkey_s_c) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
-{
- return (struct bkey_s) { .k = &k->k, .v = &k->v };
-}
-
-static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
-{
- return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
-}
-
-/*
- * For a given type of value (e.g. struct bch_extent), generates the types for
- * bkey + bch_extent - inline, split, split const - and also all the conversion
- * functions, which also check that the value is of the correct type.
- *
- * We use anonymous unions for upcasting - e.g. converting from e.g. a
- * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
- * functions.
- */
-#define x(name, ...) \
-struct bkey_i_##name { \
- union { \
- struct bkey k; \
- struct bkey_i k_i; \
- }; \
- struct bch_##name v; \
-}; \
- \
-struct bkey_s_c_##name { \
- union { \
- struct { \
- const struct bkey *k; \
- const struct bch_##name *v; \
- }; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-struct bkey_s_##name { \
- union { \
- struct { \
- struct bkey *k; \
- struct bch_##name *v; \
- }; \
- struct bkey_s_c_##name c; \
- struct bkey_s s; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline const struct bkey_i_##name * \
-bkey_i_to_##name##_c(const struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
- return (struct bkey_s_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
- return (struct bkey_s_c_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
-{ \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-name##_i_to_s_c(const struct bkey_i_##name *k) \
-{ \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-bkey_i_to_s_c_##name(const struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
-{ \
- struct bkey_i_##name *k = \
- container_of(&_k->k, struct bkey_i_##name, k); \
- \
- bkey_init(&k->k); \
- memset(&k->v, 0, sizeof(k->v)); \
- k->k.type = KEY_TYPE_##name; \
- set_bkey_val_bytes(&k->k, sizeof(k->v)); \
- \
- return k; \
-}
-
-BCH_BKEY_TYPES();
-#undef x
-
/* byte order helpers */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 5e52684764eb..db336a43fc08 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -171,11 +171,15 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
if (type >= BKEY_TYPE_NR)
return 0;
- bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) &&
+ bkey_fsck_err_on((type == BKEY_TYPE_btree ||
+ (flags & BKEY_INVALID_COMMIT)) &&
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
bkey_invalid_type_for_btree,
"invalid key type for btree %s (%s)",
- bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]);
+ bch2_btree_node_type_str(type),
+ k.k->type < KEY_TYPE_MAX
+ ? bch2_bkey_types[k.k->type]
+ : "(unknown)");
if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
bkey_fsck_err_on(k.k->size == 0, c, err,
diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h
new file mode 100644
index 000000000000..c9ae9e42b385
--- /dev/null
+++ b/fs/bcachefs/bkey_types.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_TYPES_H
+#define _BCACHEFS_BKEY_TYPES_H
+
+#include "bcachefs_format.h"
+
+/*
+ * bkey_i - bkey with inline value
+ * bkey_s - bkey with split value
+ * bkey_s_c - bkey with split value, const
+ */
+
+#define bkey_p_next(_k) vstruct_next(_k)
+
+static inline struct bkey_i *bkey_next(struct bkey_i *k)
+{
+ return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
+}
+
+#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
+
+static inline size_t bkey_val_bytes(const struct bkey *k)
+{
+ return bkey_val_u64s(k) * sizeof(u64);
+}
+
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
+{
+ unsigned u64s = BKEY_U64s + val_u64s;
+
+ BUG_ON(u64s > U8_MAX);
+ k->u64s = u64s;
+}
+
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
+{
+ set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
+}
+
+#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
+
+#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
+
+#define bkey_whiteout(_k) \
+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
+
+/* bkey with split value, const */
+struct bkey_s_c {
+ const struct bkey *k;
+ const struct bch_val *v;
+};
+
+/* bkey with split value */
+struct bkey_s {
+ union {
+ struct {
+ struct bkey *k;
+ struct bch_val *v;
+ };
+ struct bkey_s_c s_c;
+ };
+};
+
+#define bkey_s_null ((struct bkey_s) { .k = NULL })
+#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
+
+#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
+#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
+
+static inline struct bkey_s bkey_to_s(struct bkey *k)
+{
+ return (struct bkey_s) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
+{
+ return (struct bkey_s_c) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
+{
+ return (struct bkey_s) { .k = &k->k, .v = &k->v };
+}
+
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
+{
+ return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
+}
+
+/*
+ * For a given type of value (e.g. struct bch_extent), generates the types for
+ * bkey + bch_extent - inline, split, split const - and also all the conversion
+ * functions, which also check that the value is of the correct type.
+ *
+ * We use anonymous unions for upcasting - e.g. converting from e.g. a
+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
+ * functions.
+ */
+#define x(name, ...) \
+struct bkey_i_##name { \
+ union { \
+ struct bkey k; \
+ struct bkey_i k_i; \
+ }; \
+ struct bch_##name v; \
+}; \
+ \
+struct bkey_s_c_##name { \
+ union { \
+ struct { \
+ const struct bkey *k; \
+ const struct bch_##name *v; \
+ }; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+struct bkey_s_##name { \
+ union { \
+ struct { \
+ struct bkey *k; \
+ struct bch_##name *v; \
+ }; \
+ struct bkey_s_c_##name c; \
+ struct bkey_s s; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline const struct bkey_i_##name * \
+bkey_i_to_##name##_c(const struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
+ return (struct bkey_s_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
+ return (struct bkey_s_c_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
+{ \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+name##_i_to_s_c(const struct bkey_i_##name *k) \
+{ \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+bkey_i_to_s_c_##name(const struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
+{ \
+ struct bkey_i_##name *k = \
+ container_of(&_k->k, struct bkey_i_##name, k); \
+ \
+ bkey_init(&k->k); \
+ memset(&k->v, 0, sizeof(k->v)); \
+ k->k.type = KEY_TYPE_##name; \
+ set_bkey_val_bytes(&k->k, sizeof(k->v)); \
+ \
+ return k; \
+}
+
+BCH_BKEY_TYPES();
+#undef x
+
+#endif /* _BCACHEFS_BKEY_TYPES_H */
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 3fd1085b6c61..3bb477840eab 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -134,18 +134,24 @@ void bch2_dump_btree_node_iter(struct btree *b,
printbuf_exit(&buf);
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void __bch2_verify_btree_nr_keys(struct btree *b)
+struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
{
struct bset_tree *t;
struct bkey_packed *k;
- struct btree_nr_keys nr = { 0 };
+ struct btree_nr_keys nr = {};
for_each_bset(b, t)
bset_tree_for_each_key(b, t, k)
if (!bkey_deleted(k))
btree_keys_account_key_add(&nr, t - b->set, k);
+ return nr;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_verify_btree_nr_keys(struct btree *b)
+{
+ struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 79c77baaa383..120a79fd456b 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -458,6 +458,8 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
/* Accounting: */
+struct btree_nr_keys bch2_btree_node_count_keys(struct btree *);
+
static inline void btree_keys_account_key(struct btree_nr_keys *n,
unsigned bset,
struct bkey_packed *k,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index d7c81beac14a..02c70e813fac 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
@@ -60,7 +61,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
clear_btree_node_just_written(b);
- kvpfree(b->data, btree_buf_bytes(b));
+ kvfree(b->data);
b->data = NULL;
#ifdef __KERNEL__
kvfree(b->aux_data);
@@ -94,7 +95,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
- b->data = kvpmalloc(btree_buf_bytes(b), gfp);
+ b->data = kvmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
@@ -107,7 +108,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->aux_data = NULL;
#endif
if (!b->aux_data) {
- kvpfree(b->data, btree_buf_bytes(b));
+ kvfree(b->data);
b->data = NULL;
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
}
@@ -208,6 +209,18 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
int ret = 0;
lockdep_assert_held(&bc->lock);
+
+ struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
+
+ u64 mask = b->c.level
+ ? bc->pinned_nodes_interior_mask
+ : bc->pinned_nodes_leaf_mask;
+
+ if ((mask & BIT_ULL(b->c.btree_id)) &&
+ bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
+ bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
+
wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
@@ -408,7 +421,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (c->verify_data)
list_move(&c->verify_data->list, &bc->live);
- kvpfree(c->verify_ondisk, c->opts.btree_node_size);
+ kvfree(c->verify_ondisk);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
@@ -696,9 +709,31 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- u32 seq;
- BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
+ if (unlikely(level >= BTREE_MAX_DEPTH)) {
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
+ level, BTREE_MAX_DEPTH);
+ return ERR_PTR(ret);
+ }
+
+ if (unlikely(!bkey_is_btree_ptr(&k->k))) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
+ printbuf_exit(&buf);
+ return ERR_PTR(ret);
+ }
+
+ if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
+ printbuf_exit(&buf);
+ return ERR_PTR(ret);
+ }
+
/*
* Parent node must be locked, else we could read in a btree node that's
* been freed:
@@ -711,6 +746,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
b = bch2_btree_node_mem_alloc(trans, level != 0);
if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
+ if (!path)
+ return b;
+
trans->memory_allocation_failure = true;
trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
@@ -736,33 +774,26 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
}
set_btree_node_read_in_flight(b);
-
six_unlock_write(&b->c.lock);
- seq = six_lock_seq(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- /* Unlock before doing IO: */
- if (path && sync)
- bch2_trans_unlock_noassert(trans);
+ if (path) {
+ u32 seq = six_lock_seq(&b->c.lock);
- bch2_btree_node_read(trans, b, sync);
+ /* Unlock before doing IO: */
+ six_unlock_intent(&b->c.lock);
+ bch2_trans_unlock_noassert(trans);
- if (!sync)
- return NULL;
+ bch2_btree_node_read(trans, b, sync);
- if (path) {
- int ret = bch2_trans_relock(trans) ?:
- bch2_btree_path_relock_intent(trans, path);
- if (ret) {
- BUG_ON(!trans->restarted);
- return ERR_PTR(ret);
- }
- }
+ if (!sync)
+ return NULL;
- if (!six_relock_type(&b->c.lock, lock_type, seq)) {
- if (path)
- trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
+ if (!six_relock_type(&b->c.lock, lock_type, seq))
+ b = NULL;
+ } else {
+ bch2_btree_node_read(trans, b, sync);
+ if (lock_type == SIX_LOCK_read)
+ six_lock_downgrade(&b->c.lock);
}
return b;
@@ -791,7 +822,8 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
prt_printf(&buf, "\nmax ");
bch2_bpos_to_text(&buf, b->data->max_key);
- bch2_fs_inconsistent(c, "%s", buf.buf);
+ bch2_fs_topology_error(c, "%s", buf.buf);
+
printbuf_exit(&buf);
}
@@ -901,7 +933,7 @@ retry:
if (unlikely(btree_node_read_error(b))) {
six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-BCH_ERR_btree_node_read_error);
}
EBUG_ON(b->c.btree_id != path->btree_id);
@@ -992,7 +1024,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
if (unlikely(btree_node_read_error(b))) {
six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-BCH_ERR_btree_node_read_error);
}
EBUG_ON(b->c.btree_id != path->btree_id);
@@ -1075,7 +1107,7 @@ lock_node:
if (unlikely(btree_node_read_error(b))) {
six_unlock_read(&b->c.lock);
- b = ERR_PTR(-EIO);
+ b = ERR_PTR(-BCH_ERR_btree_node_read_error);
goto out;
}
@@ -1094,18 +1126,19 @@ int bch2_btree_node_prefetch(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- BUG_ON(trans && !btree_node_locked(path, level + 1));
+ BUG_ON(path && !btree_node_locked(path, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
- b = btree_cache_find(bc, k);
+ struct btree *b = btree_cache_find(bc, k);
if (b)
return 0;
b = bch2_btree_node_fill(trans, path, k, btree_id,
level, SIX_LOCK_read, false);
- return PTR_ERR_OR_ZERO(b);
+ if (!IS_ERR_OR_NULL(b))
+ six_unlock_read(&b->c.lock);
+ return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b);
}
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
@@ -1117,6 +1150,8 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
b = btree_cache_find(bc, k);
if (!b)
return;
+
+ BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io:
/* not allowed to wait on io with btree locks held: */
@@ -1128,6 +1163,8 @@ wait_on_io:
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
+ goto out;
if (btree_node_dirty(b)) {
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
@@ -1142,7 +1179,7 @@ wait_on_io:
btree_node_data_free(c, b);
bch2_btree_node_hash_remove(bc, b);
mutex_unlock(&bc->lock);
-
+out:
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 1102995643b1..ecbd9598f69f 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -7,11 +7,13 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
+#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
@@ -24,7 +26,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "reflink.h"
#include "replicas.h"
#include "super-io.h"
@@ -40,6 +42,7 @@
#define DROP_THIS_NODE 10
#define DROP_PREV_NODE 11
+#define DID_FILL_FROM_SCAN 12
static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
{
@@ -70,90 +73,6 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
__gc_pos_set(c, new_pos);
}
-/*
- * Missing: if an interior btree node is empty, we need to do something -
- * perhaps just kill it
- */
-static int bch2_gc_check_topology(struct bch_fs *c,
- struct btree *b,
- struct bkey_buf *prev,
- struct bkey_buf cur,
- bool is_last)
-{
- struct bpos node_start = b->data->min_key;
- struct bpos node_end = b->data->max_key;
- struct bpos expected_start = bkey_deleted(&prev->k->k)
- ? node_start
- : bpos_successor(prev->k->k.p);
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
- int ret = 0;
-
- if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
- struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
-
- if (!bpos_eq(expected_start, bp->v.min_key)) {
- bch2_topology_error(c);
-
- if (bkey_deleted(&prev->k->k)) {
- prt_printf(&buf1, "start of node: ");
- bch2_bpos_to_text(&buf1, node_start);
- } else {
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
- }
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
-
- if (__fsck_err(c,
- FSCK_CAN_FIX|
- FSCK_CAN_IGNORE|
- FSCK_NO_RATELIMIT,
- btree_node_topology_bad_min_key,
- "btree node with incorrect min_key at btree %s level %u:\n"
- " prev %s\n"
- " cur %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
- bch_info(c, "Halting mark and sweep to start topology repair pass");
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
- goto err;
- } else {
- set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
- }
- }
- }
-
- if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
- bch2_topology_error(c);
-
- printbuf_reset(&buf1);
- printbuf_reset(&buf2);
-
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
- bch2_bpos_to_text(&buf2, node_end);
-
- if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " %s\n"
- " expected %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf) &&
- should_restart_for_topology_repair(c)) {
- bch_info(c, "Halting mark and sweep to start topology repair pass");
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
- goto err;
- } else {
- set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
- }
- }
-
- bch2_bkey_buf_copy(prev, c, cur.k);
-err:
-fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
{
switch (b->key.k.type) {
@@ -212,6 +131,17 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
struct bkey_i_btree_ptr_v2 *new;
int ret;
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, " -> ");
+ bch2_bpos_to_text(&buf, new_min);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
+
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new)
return -BCH_ERR_ENOMEM_gc_repair_key;
@@ -237,6 +167,17 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
struct bkey_i_btree_ptr_v2 *new;
int ret;
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, " -> ");
+ bch2_bpos_to_text(&buf, new_max);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
+
ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
if (ret)
return ret;
@@ -268,128 +209,140 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
return 0;
}
-static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
- struct btree *prev, struct btree *cur)
+static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
+ struct btree *prev, struct btree *cur,
+ struct bpos *pulled_from_scan)
{
struct bpos expected_start = !prev
? b->data->min_key
: bpos_successor(prev->key.k.p);
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- if (!prev) {
- prt_printf(&buf1, "start of node: ");
- bch2_bpos_to_text(&buf1, b->data->min_key);
- } else {
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
+ BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
+ b->data->min_key));
+
+ if (bpos_eq(expected_start, cur->data->min_key))
+ return 0;
+
+ prt_printf(&buf, " at btree %s level %u:\n parent: ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ if (prev) {
+ prt_printf(&buf, "\n prev: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
}
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
-
- if (prev &&
- bpos_gt(expected_start, cur->data->min_key) &&
- BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
- /* cur overwrites prev: */
-
- if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
- cur->data->min_key), c,
- btree_node_topology_overwritten_by_next_node,
- "btree node overwritten by next node at btree %s level %u:\n"
- " node %s\n"
- " next %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = DROP_PREV_NODE;
- goto out;
- }
+ prt_str(&buf, "\n next: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
- if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
- bpos_predecessor(cur->data->min_key)), c,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " node %s\n"
- " next %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf))
- ret = set_node_max(c, prev,
- bpos_predecessor(cur->data->min_key));
- } else {
- /* prev overwrites cur: */
-
- if (mustfix_fsck_err_on(bpos_ge(expected_start,
- cur->data->max_key), c,
- btree_node_topology_overwritten_by_prev_node,
- "btree node overwritten by prev node at btree %s level %u:\n"
- " prev %s\n"
- " node %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = DROP_THIS_NODE;
- goto out;
- }
+ if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
+ if (b->c.level == 1 &&
+ bpos_lt(*pulled_from_scan, cur->data->min_key)) {
+ ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
+ expected_start,
+ bpos_predecessor(cur->data->min_key));
+ if (ret)
+ goto err;
- if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
- btree_node_topology_bad_min_key,
- "btree node with incorrect min_key at btree %s level %u:\n"
- " prev %s\n"
- " node %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf))
- ret = set_node_min(c, cur, expected_start);
+ *pulled_from_scan = cur->data->min_key;
+ ret = DID_FILL_FROM_SCAN;
+ } else {
+ if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key%s", buf.buf))
+ ret = set_node_min(c, cur, expected_start);
+ }
+ } else { /* overlap */
+ if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
+ if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
+ if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node,
+ "btree node overwritten by next node%s", buf.buf))
+ ret = DROP_PREV_NODE;
+ } else {
+ if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key%s", buf.buf))
+ ret = set_node_max(c, prev,
+ bpos_predecessor(cur->data->min_key));
+ }
+ } else {
+ if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
+ if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node,
+ "btree node overwritten by prev node%s", buf.buf))
+ ret = DROP_THIS_NODE;
+ } else {
+ if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key%s", buf.buf))
+ ret = set_node_min(c, cur, expected_start);
+ }
+ }
}
-out:
+err:
fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
+ printbuf_exit(&buf);
return ret;
}
static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
- struct btree *child)
+ struct btree *child, struct bpos *pulled_from_scan)
{
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
- bch2_bpos_to_text(&buf2, b->key.k.p);
+ if (bpos_eq(child->key.k.p, b->key.k.p))
+ return 0;
- if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " %s\n"
- " expected %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = set_node_max(c, child, b->key.k.p);
- if (ret)
- goto err;
+ prt_printf(&buf, "at btree %s level %u:\n parent: ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ prt_str(&buf, "\n child: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
+
+ if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key%s", buf.buf)) {
+ if (b->c.level == 1 &&
+ bpos_lt(*pulled_from_scan, b->key.k.p)) {
+ ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
+ bpos_successor(child->key.k.p), b->key.k.p);
+ if (ret)
+ goto err;
+
+ *pulled_from_scan = b->key.k.p;
+ ret = DID_FILL_FROM_SCAN;
+ } else {
+ ret = set_node_max(c, child, b->key.k.p);
+ }
}
err:
fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
+ struct bpos *pulled_from_scan)
{
struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf prev_k, cur_k;
struct btree *prev = NULL, *cur = NULL;
- bool have_child, dropped_children = false;
+ bool have_child, new_pass = false;
struct printbuf buf = PRINTBUF;
int ret = 0;
if (!b->c.level)
return 0;
-again:
- prev = NULL;
- have_child = dropped_children = false;
+
bch2_bkey_buf_init(&prev_k);
bch2_bkey_buf_init(&cur_k);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+again:
+ cur = prev = NULL;
+ have_child = new_pass = false;
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+ iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
@@ -406,7 +359,7 @@ again:
printbuf_reset(&buf);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
- if (mustfix_fsck_err_on(ret == -EIO, c,
+ if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c,
btree_node_unreadable,
"Topology repair: unreadable btree node at btree %s level %u:\n"
" %s",
@@ -414,11 +367,17 @@ again:
b->c.level - 1,
buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k);
+ cur = NULL;
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
- cur = NULL;
if (ret)
break;
+
+ if (!btree_id_is_alloc(b->c.btree_id)) {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ break;
+ }
continue;
}
@@ -426,7 +385,23 @@ again:
if (ret)
break;
- ret = btree_repair_node_boundaries(c, b, prev, cur);
+ if (bch2_btree_node_is_stale(c, cur)) {
+ bch_info(c, "btree node %s older than nodes found by scanning", buf.buf);
+ six_unlock_read(&cur->c.lock);
+ bch2_btree_node_evict(trans, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ cur = NULL;
+ if (ret)
+ break;
+ continue;
+ }
+
+ ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan);
+ if (ret == DID_FILL_FROM_SCAN) {
+ new_pass = true;
+ ret = 0;
+ }
if (ret == DROP_THIS_NODE) {
six_unlock_read(&cur->c.lock);
@@ -444,6 +419,7 @@ again:
prev = NULL;
if (ret == DROP_PREV_NODE) {
+ bch_info(c, "dropped prev node");
bch2_btree_node_evict(trans, prev_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, prev_k.k->k.p);
@@ -451,8 +427,6 @@ again:
break;
bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&prev_k, c);
- bch2_bkey_buf_exit(&cur_k, c);
goto again;
} else if (ret)
break;
@@ -464,7 +438,11 @@ again:
if (!ret && !IS_ERR_OR_NULL(prev)) {
BUG_ON(cur);
- ret = btree_repair_node_end(c, b, prev);
+ ret = btree_repair_node_end(c, b, prev, pulled_from_scan);
+ if (ret == DID_FILL_FROM_SCAN) {
+ new_pass = true;
+ ret = 0;
+ }
}
if (!IS_ERR_OR_NULL(prev))
@@ -478,7 +456,12 @@ again:
goto err;
bch2_btree_and_journal_iter_exit(&iter);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+ if (new_pass)
+ goto again;
+
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+ iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
bch2_bkey_buf_reassemble(&cur_k, c, k);
@@ -493,7 +476,7 @@ again:
if (ret)
goto err;
- ret = bch2_btree_repair_topology_recurse(trans, cur);
+ ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
six_unlock_read(&cur->c.lock);
cur = NULL;
@@ -501,7 +484,7 @@ again:
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
- dropped_children = true;
+ new_pass = true;
}
if (ret)
@@ -528,12 +511,14 @@ fsck_err:
six_unlock_read(&cur->c.lock);
bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&prev_k, c);
- bch2_bkey_buf_exit(&cur_k, c);
- if (!ret && dropped_children)
+ if (!ret && new_pass)
goto again;
+ BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
+
+ bch2_bkey_buf_exit(&prev_k, c);
+ bch2_bkey_buf_exit(&cur_k, c);
printbuf_exit(&buf);
return ret;
}
@@ -541,32 +526,63 @@ fsck_err:
int bch2_check_topology(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree *b;
- unsigned i;
+ struct bpos pulled_from_scan = POS_MIN;
int ret = 0;
- for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
+ bool reconstructed_root = false;
- if (!r->alive)
- continue;
+ if (r->error) {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ break;
+reconstruct_root:
+ bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i));
- b = r->b;
- if (btree_node_fake(b))
- continue;
+ r->alive = false;
+ r->error = 0;
+
+ if (!bch2_btree_has_scanned_nodes(c, i)) {
+ mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing,
+ "no nodes found for btree %s, continue?", bch2_btree_id_str(i));
+ bch2_btree_root_alloc_fake(c, i, 0);
+ } else {
+ bch2_btree_root_alloc_fake(c, i, 1);
+ bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
+ if (ret)
+ break;
+ }
+
+ reconstructed_root = true;
+ }
+
+ struct btree *b = r->b;
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- ret = bch2_btree_repair_topology_recurse(trans, b);
+ ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
six_unlock_read(&b->c.lock);
if (ret == DROP_THIS_NODE) {
- bch_err(c, "empty btree root - repair unimplemented");
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&b->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
+
+ r->b = NULL;
+
+ if (!reconstructed_root)
+ goto reconstruct_root;
+
+ bch_err(c, "empty btree root %s", bch2_btree_id_str(i));
+ bch2_btree_root_alloc_fake(c, i, 0);
+ r->alive = false;
+ ret = 0;
}
}
-
+fsck_err:
bch2_trans_put(trans);
-
return ret;
}
@@ -589,18 +605,17 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
-
- if (!g->gen_valid &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_to_missing_alloc_key,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, p, entry_c);
+
+ if (fsck_err_on(!g->gen_valid,
+ c, ptr_to_missing_alloc_key,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (!p.ptr.cached) {
g->gen_valid = true;
g->gen = p.ptr.gen;
@@ -609,16 +624,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
}
}
- if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (!p.ptr.cached) {
g->gen_valid = true;
g->gen = p.ptr.gen;
@@ -631,35 +645,34 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
}
}
- if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
- if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, stale_dirty_ptr,
- "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+ if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
+ c, stale_dirty_ptr,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
continue;
if (fsck_err_on(bucket_data_type(g->data_type) &&
- bucket_data_type(g->data_type) != data_type, c,
+ bucket_data_type(g->data_type) !=
+ bucket_data_type(data_type), c,
ptr_bucket_data_type_mismatch,
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
@@ -700,18 +713,13 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
}
if (do_update) {
- struct bkey_ptrs ptrs;
- union bch_extent_entry *entry;
- struct bch_extent_ptr *ptr;
- struct bkey_i *new;
-
if (is_root) {
bch_err(c, "cannot update btree roots yet");
ret = -EINVAL;
goto err;
}
- new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
+ struct bkey_i *new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
if (!new) {
ret = -BCH_ERR_ENOMEM_gc_repair_key;
bch_err_msg(c, ret, "allocating new key");
@@ -726,7 +734,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
* btree node isn't there anymore, the read path will
* sort it out:
*/
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_GC_BUCKET(ca, ptr);
@@ -734,19 +742,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
ptr->gen = g->gen;
}
} else {
- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
-
- (ptr->cached &&
- (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
- (!ptr->cached &&
- gen_cmp(ptr->gen, g->gen) < 0) ||
- gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
- (g->data_type &&
- g->data_type != data_type);
- }));
+ struct bkey_ptrs ptrs;
+ union bch_extent_entry *entry;
+restart_drop_ptrs:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
+
+ if ((p.ptr.cached &&
+ (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
+ (!p.ptr.cached &&
+ gen_cmp(p.ptr.gen, g->gen) < 0) ||
+ gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
+ (g->data_type &&
+ g->data_type != data_type)) {
+ bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
+ goto restart_drop_ptrs;
+ }
+ }
again:
ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
bkey_extent_entry_for_each(ptrs, entry) {
@@ -776,12 +791,6 @@ found:
}
}
- ret = bch2_journal_key_insert_take(c, btree_id, level, new);
- if (ret) {
- kfree(new);
- goto err;
- }
-
if (level)
bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
@@ -795,6 +804,12 @@ found:
bch_info(c, "new key %s", buf.buf);
}
+ ret = bch2_journal_key_insert_take(c, btree_id, level, new);
+ if (ret) {
+ kfree(new);
+ goto err;
+ }
+
*k = bkey_i_to_s_c(new);
}
err:
@@ -813,6 +828,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
struct bch_fs *c = trans->c;
struct bkey deleted = KEY(0, 0, 0);
struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
+ struct printbuf buf = PRINTBUF;
int ret = 0;
deleted.p = k->k->p;
@@ -821,10 +837,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
BUG_ON(bch2_journal_seq_verify &&
k->k->version.lo > atomic64_read(&c->journal.seq));
- ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
- if (ret)
- goto err;
-
if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
bkey_version_in_future,
"key version number higher than recorded: %llu > %llu",
@@ -833,52 +845,57 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
atomic64_set(&c->key_version, k->k->version.lo);
}
+ ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
+ if (ret)
+ goto err;
+
+ if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, *k),
+ c, btree_bitmap_not_marked,
+ "btree ptr not marked in member info btree allocated bitmap\n %s",
+ (bch2_bkey_val_to_text(&buf, c, *k),
+ buf.buf))) {
+ mutex_lock(&c->sb_lock);
+ bch2_dev_btree_bitmap_mark(c, *k);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
ret = commit_do(trans, NULL, NULL, 0,
- bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
+ bch2_key_trigger(trans, btree_id, level, old,
+ unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
fsck_err:
err:
+ printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
}
static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
{
- struct bch_fs *c = trans->c;
struct btree_node_iter iter;
struct bkey unpacked;
struct bkey_s_c k;
- struct bkey_buf prev, cur;
int ret = 0;
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret)
+ return ret;
+
if (!btree_node_type_needs_gc(btree_node_type(b)))
return 0;
bch2_btree_node_iter_init_from_start(&iter, b);
- bch2_bkey_buf_init(&prev);
- bch2_bkey_buf_init(&cur);
- bkey_init(&prev.k->k);
while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
&k, initial);
if (ret)
- break;
+ return ret;
bch2_btree_node_iter_advance(&iter, b);
-
- if (b->c.level) {
- bch2_bkey_buf_reassemble(&cur, c, k);
-
- ret = bch2_gc_check_topology(c, b, &prev, cur,
- bch2_btree_node_iter_end(&iter));
- if (ret)
- break;
- }
}
- bch2_bkey_buf_exit(&cur, c);
- bch2_bkey_buf_exit(&prev, c);
- return ret;
+ return 0;
}
static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
@@ -927,14 +944,16 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
- struct bkey_buf cur, prev;
+ struct bkey_buf cur;
struct printbuf buf = PRINTBUF;
int ret = 0;
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
- bch2_bkey_buf_init(&prev);
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret)
+ return ret;
+
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
bch2_bkey_buf_init(&cur);
- bkey_init(&prev.k->k);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
@@ -945,25 +964,13 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
if (ret)
goto fsck_err;
- if (b->c.level) {
- bch2_bkey_buf_reassemble(&cur, c, k);
- k = bkey_i_to_s_c(cur.k);
-
- bch2_btree_and_journal_iter_advance(&iter);
-
- ret = bch2_gc_check_topology(c, b,
- &prev, cur,
- !bch2_btree_and_journal_iter_peek(&iter).k);
- if (ret)
- goto fsck_err;
- } else {
- bch2_btree_and_journal_iter_advance(&iter);
- }
+ bch2_btree_and_journal_iter_advance(&iter);
}
if (b->c.level > target_depth) {
bch2_btree_and_journal_iter_exit(&iter);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+ iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
struct btree *child;
@@ -976,7 +983,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
false);
ret = PTR_ERR_OR_ZERO(child);
- if (ret == -EIO) {
+ if (bch2_err_matches(ret, EIO)) {
bch2_topology_error(c);
if (__fsck_err(c,
@@ -1016,7 +1023,6 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
}
fsck_err:
bch2_bkey_buf_exit(&cur, c);
- bch2_bkey_buf_exit(&prev, c);
bch2_btree_and_journal_iter_exit(&iter);
printbuf_exit(&buf);
return ret;
@@ -1034,9 +1040,6 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
b = bch2_btree_id_root(c, btree_id)->b;
- if (btree_node_fake(b))
- return 0;
-
six_lock_read(&b->c.lock, NULL, NULL);
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->min_key);
@@ -1190,9 +1193,7 @@ static void bch2_gc_free(struct bch_fs *c)
genradix_free(&c->gc_stripes);
for_each_member_device(c, ca) {
- kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
- sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket));
+ kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
ca->buckets_gc = NULL;
free_percpu(ca->usage_gc);
@@ -1365,11 +1366,10 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
- struct bucket gc, *b;
+ struct bucket old_gc, gc, *b;
struct bkey_i_alloc_v4 *a;
struct bch_alloc_v4 old_convert, new;
const struct bch_alloc_v4 *old;
- enum bch_data_type type;
int ret;
old = bch2_alloc_to_v4(k, &old_convert);
@@ -1377,28 +1377,29 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
percpu_down_read(&c->mark_lock);
b = gc_bucket(ca, iter->pos.offset);
+ old_gc = *b;
+
+ if ((old->data_type == BCH_DATA_sb ||
+ old->data_type == BCH_DATA_journal) &&
+ !bch2_dev_is_online(ca)) {
+ b->data_type = old->data_type;
+ b->dirty_sectors = old->dirty_sectors;
+ }
/*
* b->data_type doesn't yet include need_discard & need_gc_gen states -
* fix that here:
*/
- type = __alloc_data_type(b->dirty_sectors,
- b->cached_sectors,
- b->stripe,
- *old,
- b->data_type);
- if (b->data_type != type) {
- struct bch_dev_usage *u;
-
- preempt_disable();
- u = this_cpu_ptr(ca->usage_gc);
- u->d[b->data_type].buckets--;
- b->data_type = type;
- u->d[b->data_type].buckets++;
- preempt_enable();
- }
-
+ b->data_type = __alloc_data_type(b->dirty_sectors,
+ b->cached_sectors,
+ b->stripe,
+ *old,
+ b->data_type);
gc = *b;
+
+ if (gc.data_type != old_gc.data_type ||
+ gc.dirty_sectors != old_gc.dirty_sectors)
+ bch2_dev_usage_update_m(c, ca, &old_gc, &gc);
percpu_up_read(&c->mark_lock);
if (metadata_only &&
@@ -1410,8 +1411,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
if (gen_after(old->gen, gc.gen))
return 0;
- if (c->opts.reconstruct_alloc ||
- fsck_err_on(new.data_type != gc.data_type, c,
+ if (fsck_err_on(new.data_type != gc.data_type, c,
alloc_key_data_type_wrong,
"bucket %llu:%llu gen %u has wrong data_type"
": got %s, should be %s",
@@ -1422,8 +1422,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
new.data_type = gc.data_type;
#define copy_bucket_field(_errtype, _f) \
- if (c->opts.reconstruct_alloc || \
- fsck_err_on(new._f != gc._f, c, _errtype, \
+ if (fsck_err_on(new._f != gc._f, c, _errtype, \
"bucket %llu:%llu gen %u data type %s has wrong " #_f \
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
@@ -1491,7 +1490,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
{
for_each_member_device(c, ca) {
- struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+ struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
@@ -1585,8 +1584,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
" should be %u",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
r->refcount)) {
- struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);
-
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
@@ -1595,6 +1593,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
new->k.type = KEY_TYPE_deleted;
else
*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
+ ret = bch2_trans_update(trans, iter, new, 0);
}
fsck_err:
printbuf_exit(&buf);
@@ -1817,10 +1816,10 @@ out:
if (!ret) {
bch2_journal_block(&c->journal);
- ret = bch2_gc_stripes_done(c, metadata_only) ?:
- bch2_gc_reflink_done(c, metadata_only) ?:
- bch2_gc_alloc_done(c, metadata_only) ?:
- bch2_gc_done(c, initial, metadata_only);
+ ret = bch2_gc_alloc_done(c, metadata_only) ?:
+ bch2_gc_done(c, initial, metadata_only) ?:
+ bch2_gc_stripes_done(c, metadata_only) ?:
+ bch2_gc_reflink_done(c, metadata_only);
bch2_journal_unblock(&c->journal);
}
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index aa9b6cbe3226..9678b2375bed 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -103,7 +103,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size,
if (used_mempool)
mempool_free(p, &c->btree_bounce_pool);
else
- vpfree(p, size);
+ kvfree(p);
}
static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
@@ -115,7 +115,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
- p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+ p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
if (!p) {
*used_mempool = true;
p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
@@ -581,8 +581,7 @@ static int __btree_err(int ret,
break;
case -BCH_ERR_btree_node_read_err_bad_node:
bch2_print_string_as_lines(KERN_ERR, out.buf);
- bch2_topology_error(c);
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO;
+ ret = bch2_topology_error(c);
break;
case -BCH_ERR_btree_node_read_err_incompatible:
bch2_print_string_as_lines(KERN_ERR, out.buf);
@@ -655,6 +654,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
*/
bch2_bset_set_no_aux_tree(b, b->set);
bch2_btree_build_aux_trees(b);
+ b->nr = bch2_btree_node_count_keys(b);
struct bkey_s_c k;
struct bkey unpacked;
@@ -831,7 +831,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b,
(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
}
-static bool __bkey_valid(struct bch_fs *c, struct btree *b,
+static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
struct bset *i, struct bkey_packed *k)
{
if (bkey_p_next(k) > vstruct_last(i))
@@ -840,6 +840,9 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b,
if (k->format > KEY_FORMAT_CURRENT)
return false;
+ if (!bkeyp_u64s_valid(&b->format, k))
+ return false;
+
struct printbuf buf = PRINTBUF;
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
@@ -881,7 +884,15 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
"invalid bkey format %u", k->format))
goto drop_this_key;
- /* XXX: validate k->u64s */
+ if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_bad_u64s,
+ "bad k->u64s %u (min %u max %lu)", k->u64s,
+ bkeyp_key_u64s(&b->format, k),
+ U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
+ goto drop_this_key;
+
if (!write)
bch2_bkey_compat(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
@@ -938,13 +949,12 @@ drop_this_key:
* do
*/
- if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
+ if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
for (next_good_key = 1;
next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
next_good_key++)
- if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
+ if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
goto got_good_key;
-
}
/*
@@ -1058,7 +1068,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
- "error decrypting btree node: %i", ret))
+ "decrypting btree node: %s", bch2_err_str(ret)))
goto fsck_err;
btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
@@ -1099,7 +1109,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
- "error decrypting btree node: %i\n", ret))
+ "decrypting btree node: %s", bch2_err_str(ret)))
goto fsck_err;
sectors = vstruct_sectors(bne, c->block_bits);
@@ -1255,10 +1265,12 @@ out:
return retry_read;
fsck_err:
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
- ret == -BCH_ERR_btree_node_read_err_must_retry)
+ ret == -BCH_ERR_btree_node_read_err_must_retry) {
retry_read = 1;
- else
+ } else {
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
+ }
goto out;
}
@@ -1319,6 +1331,7 @@ start:
if (!can_retry) {
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
break;
}
}
@@ -1327,10 +1340,12 @@ start:
rb->start_time);
bio_put(&rb->bio);
- if (saw_error && !btree_node_read_error(b)) {
+ if (saw_error &&
+ !btree_node_read_error(b) &&
+ c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->key.k.p);
- bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
+ bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
__func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
bch2_btree_node_rewrite_async(c, b);
@@ -1518,9 +1533,10 @@ fsck_err:
ret = -1;
}
- if (ret)
+ if (ret) {
set_btree_node_read_error(b);
- else if (*saw_error)
+ bch2_btree_lost_data(c, b->c.btree_id);
+ } else if (*saw_error)
bch2_btree_node_rewrite_async(c, b);
for (i = 0; i < ra->nr; i++) {
@@ -1649,13 +1665,14 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
prt_str(&buf, "btree node read error: no device to read from\n at ");
bch2_btree_pos_to_text(&buf, c, b);
- bch_err(c, "%s", buf.buf);
+ bch_err_ratelimited(c, "%s", buf.buf);
if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
bch2_fatal_error(c);
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
printbuf_exit(&buf);
@@ -1737,7 +1754,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
- ret = -EIO;
+ ret = -BCH_ERR_btree_node_read_error;
goto err;
}
@@ -1841,7 +1858,7 @@ static void btree_node_write_work(struct work_struct *work)
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
- ret = -BCH_ERR_btree_write_all_failed;
+ ret = -BCH_ERR_btree_node_write_all_failed;
goto err;
}
@@ -1852,7 +1869,7 @@ static void btree_node_write_work(struct work_struct *work)
} else {
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
- BCH_WATERMARK_reclaim|
+ BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw,
@@ -1866,8 +1883,8 @@ out:
return;
err:
set_btree_node_noevict(b);
- if (!bch2_err_matches(ret, EROFS))
- bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret));
+ bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
+ "writing btree node: %s", bch2_err_str(ret));
goto out;
}
@@ -2123,7 +2140,7 @@ do_write:
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
- "error encrypting btree node: %i\n", ret))
+ "encrypting btree node: %s", bch2_err_str(ret)))
goto err;
nonce = btree_nonce(i, b->written << 9);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 3ef338df82f5..2a211a4bebd1 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -891,7 +891,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
struct bkey_s_c k;
int ret = 0;
- __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+ __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
k = bch2_btree_and_journal_iter_peek(&jiter);
@@ -927,8 +927,22 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
if (ret)
goto err;
} else {
- bch2_bkey_buf_unpack(&tmp, c, l->b,
- bch2_btree_node_iter_peek(&l->iter, l->b));
+ struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
+ if (!k) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "node not found at pos ");
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_str(&buf, " within parent node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
+
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -BCH_ERR_btree_need_topology_repair;
+ goto err;
+ }
+
+ bch2_bkey_buf_unpack(&tmp, c, l->b, k);
if ((flags & BTREE_ITER_PREFETCH) &&
c->opts.btree_node_prefetch) {
@@ -962,7 +976,6 @@ err:
return ret;
}
-
static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
@@ -1146,7 +1159,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
path = &trans->paths[path_idx];
if (unlikely(path->level >= BTREE_MAX_DEPTH))
- goto out;
+ goto out_uptodate;
path->level = btree_path_up_until_good_node(trans, path, 0);
@@ -1179,7 +1192,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
goto out;
}
}
-
+out_uptodate:
path->uptodate = BTREE_ITER_UPTODATE;
out:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
@@ -1520,7 +1533,7 @@ static noinline void btree_paths_realloc(struct btree_trans *trans)
{
unsigned nr = trans->nr_paths * 2;
- void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
+ void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
sizeof(struct btree_trans_paths) +
nr * sizeof(struct btree_path) +
nr * sizeof(btree_path_idx_t) + 8 +
@@ -1729,7 +1742,9 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
if (ret)
return ret;
- btree_path_set_should_be_locked(trans->paths + iter->path);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ if (btree_path_node(path, path->level))
+ btree_path_set_should_be_locked(path);
return 0;
}
@@ -2305,7 +2320,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
btree_iter_path(trans, iter)->level);
if (iter->flags & BTREE_ITER_WITH_JOURNAL)
- return bkey_s_c_err(-EIO);
+ return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
@@ -2503,6 +2518,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
k = bch2_btree_iter_peek_upto(&iter2, end);
if (k.k && !bkey_err(k)) {
+ swap(iter->key_cache_path, iter2.key_cache_path);
iter->k = iter2.k;
k.k = &iter->k;
}
@@ -2762,6 +2778,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
struct btree_trans *trans = src->trans;
*dst = *src;
+#ifdef TRACK_PATH_ALLOCATED
+ dst->ip_allocated = _RET_IP_;
+#endif
if (src->path)
__btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT);
if (src->update_path)
@@ -2784,6 +2803,31 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
struct btree_transaction_stats *s = btree_trans_stats(trans);
s->max_mem = max(s->max_mem, new_bytes);
+ if (trans->used_mempool) {
+ if (trans->mem_bytes >= new_bytes)
+ goto out_change_top;
+
+ /* No more space from mempool item, need malloc new one */
+ new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ if (unlikely(!new_mem)) {
+ bch2_trans_unlock(trans);
+
+ new_mem = kmalloc(new_bytes, GFP_KERNEL);
+ if (!new_mem)
+ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+
+ ret = bch2_trans_relock(trans);
+ if (ret) {
+ kfree(new_mem);
+ return ERR_PTR(ret);
+ }
+ }
+ memcpy(new_mem, trans->mem, trans->mem_top);
+ trans->used_mempool = false;
+ mempool_free(trans->mem, &c->btree_trans_mem_pool);
+ goto out_new_mem;
+ }
+
new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_mem)) {
bch2_trans_unlock(trans);
@@ -2792,6 +2836,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
new_bytes = BTREE_TRANS_MEM_MAX;
+ memcpy(new_mem, trans->mem, trans->mem_top);
+ trans->used_mempool = true;
kfree(trans->mem);
}
@@ -2805,7 +2851,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
if (ret)
return ERR_PTR(ret);
}
-
+out_new_mem:
trans->mem = new_mem;
trans->mem_bytes = new_bytes;
@@ -2813,7 +2859,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
}
-
+out_change_top:
p = trans->mem + trans->mem_top;
trans->mem_top += size;
memset(p, 0, size);
@@ -3085,9 +3131,9 @@ void bch2_trans_put(struct btree_trans *trans)
trans->paths = NULL;
if (paths_allocated != trans->_paths_allocated)
- kfree_rcu_mightsleep(paths_allocated);
+ kvfree_rcu_mightsleep(paths_allocated);
- if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
+ if (trans->used_mempool)
mempool_free(trans->mem, &c->btree_trans_mem_pool);
else
kfree(trans->mem);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 24772538e4cc..1c70836dd7cc 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -498,8 +498,13 @@ static inline void set_btree_iter_dontneed(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
- if (!trans->restarted)
- btree_iter_path(trans, iter)->preserve = false;
+ if (!iter->path || trans->restarted)
+ return;
+
+ struct btree_path *path = btree_iter_path(trans, iter);
+ path->preserve = false;
+ if (path->ref == 1)
+ path->should_be_locked = false;
}
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
@@ -642,7 +647,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *);
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8)
+ if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
return __bch2_btree_trans_too_many_iters(trans);
return 0;
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 719a94a84950..1e8cf49a6935 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -1,7 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_buf.h"
#include "bset.h"
+#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "journal_io.h"
@@ -40,7 +42,7 @@ static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
- return keys->d + idx_to_pos(keys, idx);
+ return keys->data + idx_to_pos(keys, idx);
}
static size_t __bch2_journal_key_search(struct journal_keys *keys,
@@ -128,12 +130,30 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree
return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}
+static void journal_iter_verify(struct journal_iter *iter)
+{
+ struct journal_keys *keys = iter->keys;
+ size_t gap_size = keys->size - keys->nr;
+
+ BUG_ON(iter->idx >= keys->gap &&
+ iter->idx < keys->gap + gap_size);
+
+ if (iter->idx < keys->size) {
+ struct journal_key *k = keys->data + iter->idx;
+
+ int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
+ cmp_int(k->level, iter->level);
+ BUG_ON(cmp < 0);
+ }
+}
+
static void journal_iters_fix(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
/* The key we just inserted is immediately before the gap: */
size_t gap_end = keys->gap + (keys->size - keys->nr);
- struct btree_and_journal_iter *iter;
+ struct journal_key *new_key = &keys->data[keys->gap - 1];
+ struct journal_iter *iter;
/*
* If an iterator points one after the key we just inserted, decrement
@@ -141,9 +161,14 @@ static void journal_iters_fix(struct bch_fs *c)
* decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
* handle that:
*/
- list_for_each_entry(iter, &c->journal_iters, journal.list)
- if (iter->journal.idx == gap_end)
- iter->journal.idx = keys->gap - 1;
+ list_for_each_entry(iter, &c->journal_iters, list) {
+ journal_iter_verify(iter);
+ if (iter->idx == gap_end &&
+ new_key->btree_id == iter->btree_id &&
+ new_key->level == iter->level)
+ iter->idx = keys->gap - 1;
+ journal_iter_verify(iter);
+ }
}
static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
@@ -180,33 +205,38 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
BUG_ON(test_bit(BCH_FS_rw, &c->flags));
if (idx < keys->size &&
- journal_key_cmp(&n, &keys->d[idx]) == 0) {
- if (keys->d[idx].allocated)
- kfree(keys->d[idx].k);
- keys->d[idx] = n;
+ journal_key_cmp(&n, &keys->data[idx]) == 0) {
+ if (keys->data[idx].allocated)
+ kfree(keys->data[idx].k);
+ keys->data[idx] = n;
return 0;
}
if (idx > keys->gap)
idx -= keys->size - keys->nr;
+ size_t old_gap = keys->gap;
+
if (keys->nr == keys->size) {
+ journal_iters_move_gap(c, old_gap, keys->size);
+ old_gap = keys->size;
+
struct journal_keys new_keys = {
.nr = keys->nr,
.size = max_t(size_t, keys->size, 8) * 2,
};
- new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
- if (!new_keys.d) {
+ new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
+ if (!new_keys.data) {
bch_err(c, "%s: error allocating new key array (size %zu)",
__func__, new_keys.size);
return -BCH_ERR_ENOMEM_journal_key_insert;
}
/* Since @keys was full, there was no gap: */
- memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
- kvfree(keys->d);
- keys->d = new_keys.d;
+ memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
+ kvfree(keys->data);
+ keys->data = new_keys.data;
keys->nr = new_keys.nr;
keys->size = new_keys.size;
@@ -214,13 +244,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
keys->gap = keys->nr;
}
- journal_iters_move_gap(c, keys->gap, idx);
+ journal_iters_move_gap(c, old_gap, idx);
- move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
- keys->gap = idx;
+ move_gap(keys, idx);
keys->nr++;
- keys->d[keys->gap++] = n;
+ keys->data[keys->gap++] = n;
journal_iters_fix(c);
@@ -260,6 +289,22 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
return bch2_journal_key_insert(c, id, level, &whiteout);
}
+bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
+ unsigned level, struct bpos pos)
+{
+ struct journal_keys *keys = &trans->c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+ if (!trans->journal_replay_not_finished)
+ return false;
+
+ return (idx < keys->size &&
+ keys->data[idx].btree_id == btree &&
+ keys->data[idx].level == level &&
+ bpos_eq(keys->data[idx].k->k.p, pos) &&
+ bkey_deleted(&keys->data[idx].k->k));
+}
+
void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos pos)
{
@@ -267,10 +312,10 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
if (idx < keys->size &&
- keys->d[idx].btree_id == btree &&
- keys->d[idx].level == level &&
- bpos_eq(keys->d[idx].k->k.p, pos))
- keys->d[idx].overwritten = true;
+ keys->data[idx].btree_id == btree &&
+ keys->data[idx].level == level &&
+ bpos_eq(keys->data[idx].k->k.p, pos))
+ keys->data[idx].overwritten = true;
}
static void bch2_journal_iter_advance(struct journal_iter *iter)
@@ -284,16 +329,21 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
- struct journal_key *k = iter->keys->d + iter->idx;
+ journal_iter_verify(iter);
+
+ while (iter->idx < iter->keys->size) {
+ struct journal_key *k = iter->keys->data + iter->idx;
+
+ int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
+ cmp_int(k->level, iter->level);
+ if (cmp > 0)
+ break;
+ BUG_ON(cmp);
- while (k < iter->keys->d + iter->keys->size &&
- k->btree_id == iter->btree_id &&
- k->level == iter->level) {
if (!k->overwritten)
return bkey_i_to_s_c(k->k);
bch2_journal_iter_advance(iter);
- k = iter->keys->d + iter->idx;
}
return bkey_s_c_null;
@@ -313,6 +363,8 @@ static void bch2_journal_iter_init(struct bch_fs *c,
iter->level = level;
iter->keys = &c->journal_keys;
iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
+
+ journal_iter_verify(iter);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@@ -334,9 +386,38 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
iter->pos = bpos_successor(iter->pos);
}
+static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
+{
+ struct btree_and_journal_iter iter = *_iter;
+ struct bch_fs *c = iter.trans->c;
+ unsigned level = iter.journal.level;
+ struct bkey_buf tmp;
+ unsigned nr = test_bit(BCH_FS_started, &c->flags)
+ ? (level > 1 ? 0 : 2)
+ : (level > 1 ? 1 : 16);
+
+ iter.prefetch = false;
+ bch2_bkey_buf_init(&tmp);
+
+ while (nr--) {
+ bch2_btree_and_journal_iter_advance(&iter);
+ struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
+ if (!k.k)
+ break;
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
- struct bkey_s_c btree_k, journal_k, ret;
+ struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
+
+ if (iter->prefetch && iter->journal.level)
+ btree_and_journal_iter_prefetch(iter);
again:
if (iter->at_end)
return bkey_s_c_null;
@@ -345,9 +426,10 @@ again:
bpos_lt(btree_k.k->p, iter->pos))
bch2_journal_iter_advance_btree(iter);
- while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
- bpos_lt(journal_k.k->p, iter->pos))
- bch2_journal_iter_advance(&iter->journal);
+ if (iter->trans->journal_replay_not_finished)
+ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+ bpos_lt(journal_k.k->p, iter->pos))
+ bch2_journal_iter_advance(&iter->journal);
ret = journal_k.k &&
(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
@@ -376,35 +458,40 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
bch2_journal_iter_exit(&iter->journal);
}
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct bch_fs *c,
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+ struct btree_and_journal_iter *iter,
struct btree *b,
struct btree_node_iter node_iter,
struct bpos pos)
{
memset(iter, 0, sizeof(*iter));
+ iter->trans = trans;
iter->b = b;
iter->node_iter = node_iter;
- bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
- INIT_LIST_HEAD(&iter->journal.list);
iter->pos = b->data->min_key;
iter->at_end = false;
+ INIT_LIST_HEAD(&iter->journal.list);
+
+ if (trans->journal_replay_not_finished) {
+ bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
+ if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
+ list_add(&iter->journal.list, &trans->c->journal_iters);
+ }
}
/*
* this version is used by btree_gc before filesystem has gone RW and
* multithreaded, so uses the journal_iters list:
*/
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct bch_fs *c,
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+ struct btree_and_journal_iter *iter,
struct btree *b)
{
struct btree_node_iter node_iter;
bch2_btree_node_iter_init_from_start(&node_iter, b);
- __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
- list_add(&iter->journal.list, &c->journal_iters);
+ __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
}
/* sort and dedup all keys in the journal: */
@@ -415,9 +502,7 @@ void bch2_journal_entries_free(struct bch_fs *c)
struct genradix_iter iter;
genradix_for_each(&c->journal_entries, iter, i)
- if (*i)
- kvpfree(*i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&(*i)->j));
+ kvfree(*i);
genradix_free(&c->journal_entries);
}
@@ -437,22 +522,20 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
void bch2_journal_keys_put(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
- struct journal_key *i;
BUG_ON(atomic_read(&keys->ref) <= 0);
if (!atomic_dec_and_test(&keys->ref))
return;
- move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
- keys->gap = keys->nr;
+ move_gap(keys, keys->nr);
- for (i = keys->d; i < keys->d + keys->nr; i++)
+ darray_for_each(*keys, i)
if (i->allocated)
kfree(i->k);
- kvfree(keys->d);
- keys->d = NULL;
+ kvfree(keys->data);
+ keys->data = NULL;
keys->nr = keys->gap = keys->size = 0;
bch2_journal_entries_free(c);
@@ -460,83 +543,38 @@ void bch2_journal_keys_put(struct bch_fs *c)
static void __journal_keys_sort(struct journal_keys *keys)
{
- struct journal_key *src, *dst;
+ sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
- sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
+ struct journal_key *dst = keys->data;
- src = dst = keys->d;
- while (src < keys->d + keys->nr) {
- while (src + 1 < keys->d + keys->nr &&
- !journal_key_cmp(src, src + 1))
- src++;
+ darray_for_each(*keys, src) {
+ if (src + 1 < &darray_top(*keys) &&
+ !journal_key_cmp(src, src + 1))
+ continue;
- *dst++ = *src++;
+ *dst++ = *src;
}
- keys->nr = dst - keys->d;
+ keys->nr = dst - keys->data;
}
int bch2_journal_keys_sort(struct bch_fs *c)
{
struct genradix_iter iter;
struct journal_replay *i, **_i;
- struct jset_entry *entry;
- struct bkey_i *k;
struct journal_keys *keys = &c->journal_keys;
- size_t nr_keys = 0, nr_read = 0;
+ size_t nr_read = 0;
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
- continue;
-
- for_each_jset_key(k, entry, &i->j)
- nr_keys++;
- }
-
- if (!nr_keys)
- return 0;
-
- keys->size = roundup_pow_of_two(nr_keys);
-
- keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
- if (!keys->d) {
- bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
- nr_keys);
-
- do {
- keys->size >>= 1;
- keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
- } while (!keys->d && keys->size > nr_keys / 8);
-
- if (!keys->d) {
- bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
- keys->size);
- return -BCH_ERR_ENOMEM_journal_keys_sort;
- }
- }
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
cond_resched();
for_each_jset_key(k, entry, &i->j) {
- if (keys->nr == keys->size) {
- __journal_keys_sort(keys);
-
- if (keys->nr > keys->size * 7 / 8) {
- bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
- keys->nr, keys->size, nr_read, nr_keys);
- return -BCH_ERR_ENOMEM_journal_keys_sort;
- }
- }
-
- keys->d[keys->nr++] = (struct journal_key) {
+ struct journal_key n = (struct journal_key) {
.btree_id = entry->btree_id,
.level = entry->level,
.k = k,
@@ -544,6 +582,18 @@ int bch2_journal_keys_sort(struct bch_fs *c)
.journal_offset = k->_data - i->j._data,
};
+ if (darray_push(keys, n)) {
+ __journal_keys_sort(keys);
+
+ if (keys->nr * 8 > keys->size * 7) {
+ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
+ keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
+ return -BCH_ERR_ENOMEM_journal_keys_sort;
+ }
+
+ BUG_ON(darray_push(keys, n));
+ }
+
nr_read++;
}
}
@@ -551,6 +601,25 @@ int bch2_journal_keys_sort(struct bch_fs *c)
__journal_keys_sort(keys);
keys->gap = keys->nr;
- bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
+ bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
return 0;
}
+
+/*
+ * Remove from c->journal_keys every key that belongs to @btree, has a level
+ * in [level_min, level_max] and a position in [start, end] (both inclusive).
+ * Surviving keys are compacted towards the front of the array, keeping their
+ * relative order.
+ */
+void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
+				  unsigned level_min, unsigned level_max,
+				  struct bpos start, struct bpos end)
+{
+	struct journal_keys *keys = &c->journal_keys;
+	size_t nr_kept = 0;
+
+	/* Position the gap at keys->nr before walking the array */
+	move_gap(keys, keys->nr);
+
+	darray_for_each(*keys, i) {
+		bool shoot_down =
+			i->btree_id == btree &&
+			i->level >= level_min &&
+			i->level <= level_max &&
+			bpos_ge(i->k->k.p, start) &&
+			bpos_le(i->k->k.p, end);
+
+		if (shoot_down)
+			continue;
+
+		keys->data[nr_kept++] = *i;
+	}
+
+	keys->nr = keys->gap = nr_kept;
+}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
index 8ca4c100b2e3..af25046ebcaa 100644
--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -15,6 +15,7 @@ struct journal_iter {
*/
struct btree_and_journal_iter {
+ struct btree_trans *trans;
struct btree *b;
struct btree_node_iter node_iter;
struct bkey unpacked;
@@ -22,6 +23,7 @@ struct btree_and_journal_iter {
struct journal_iter journal;
struct bpos pos;
bool at_end;
+ bool prefetch;
};
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
@@ -29,25 +31,27 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
+int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
+ struct btree_and_journal_iter *);
+
int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
- unsigned, struct bpos);
+bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
- struct bch_fs *, struct btree *,
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+ struct btree_and_journal_iter *, struct btree *,
struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
- struct bch_fs *,
- struct btree *);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+ struct btree_and_journal_iter *, struct btree *);
void bch2_journal_keys_put(struct bch_fs *);
@@ -62,4 +66,8 @@ void bch2_journal_entries_free(struct bch_fs *);
int bch2_journal_keys_sort(struct bch_fs *);
+void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
+ unsigned, unsigned,
+ struct bpos, struct bpos);
+
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 74e52fd28abe..88a3582a3275 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -169,6 +169,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
} else {
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_pcpu);
+ bc->nr_freed_pcpu++;
mutex_unlock(&bc->lock);
}
}
@@ -245,6 +246,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
if (!list_empty(&bc->freed_pcpu)) {
ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
list_del_init(&ck->list);
+ bc->nr_freed_pcpu--;
}
mutex_unlock(&bc->lock);
}
@@ -380,9 +382,11 @@ static int btree_key_cache_fill(struct btree_trans *trans,
struct bkey_i *new_k = NULL;
int ret;
- k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
- BTREE_ITER_KEY_CACHE_FILL|
- BTREE_ITER_CACHED_NOFILL);
+ bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
+ BTREE_ITER_KEY_CACHE_FILL|
+ BTREE_ITER_CACHED_NOFILL);
+ iter.flags &= ~BTREE_ITER_WITH_JOURNAL;
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -657,7 +661,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
commit_flags |= BCH_WATERMARK_reclaim;
if (ck->journal.seq != journal_last_seq(j) ||
- j->watermark == BCH_WATERMARK_stripe)
+ !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags))
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
ret = bch2_btree_iter_traverse(&b_iter) ?:
@@ -674,7 +678,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
!bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
!bch2_journal_error(j), c,
- "error flushing key cache: %s", bch2_err_str(ret));
+ "flushing key cache: %s", bch2_err_str(ret));
if (ret)
goto out;
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 684397442338..f2caf491957e 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -440,33 +440,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
- struct btree_path *linked;
- unsigned i, iter;
- int ret;
-
- /*
- * XXX BIG FAT NOTICE
- *
- * Drop all read locks before taking a write lock:
- *
- * This is a hack, because bch2_btree_node_lock_write_nofail() is a
- * hack - but by dropping read locks first, this should never fail, and
- * we only use this in code paths where whatever read locks we've
- * already taken are no longer needed:
- */
-
- trans_for_each_path(trans, linked, iter) {
- if (!linked->nodes_locked)
- continue;
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- if (btree_node_read_locked(linked, i)) {
- btree_node_unlock(trans, linked, i);
- btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
- }
- }
-
- ret = __btree_node_lock_write(trans, path, b, true);
+ int ret = __btree_node_lock_write(trans, path, b, true);
BUG_ON(ret);
}
@@ -747,7 +721,8 @@ void bch2_trans_downgrade(struct btree_trans *trans)
return;
trans_for_each_path(trans, path, i)
- bch2_btree_path_downgrade(trans, path);
+ if (path->ref)
+ bch2_btree_path_downgrade(trans, path);
}
int bch2_trans_relock(struct btree_trans *trans)
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
new file mode 100644
index 000000000000..866bd278439f
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan.c
@@ -0,0 +1,519 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_journal_iter.h"
+#include "btree_node_scan.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal_io.h"
+#include "recovery_passes.h"
+
+#include <linux/kthread.h>
+#include <linux/sort.h>
+
+/* Argument block handed to one read_btree_nodes_worker() kthread */
+struct find_btree_nodes_worker {
+ struct closure *cl; /* closure the worker puts when it is done */
+ struct find_btree_nodes *f; /* shared scan state (results, lock, error) */
+ struct bch_dev *ca; /* device this worker scans */
+};
+
+/*
+ * Append a one-line description of @n to @out: btree, level, seq, cookie,
+ * key range, the range_updated/overwritten flags, then each pointer.
+ */
+static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
+{
+	const struct bch_extent_ptr *ptr = n->ptrs;
+	const struct bch_extent_ptr *ptrs_end = n->ptrs + n->nr_ptrs;
+
+	prt_printf(out, "%s l=%u seq=%u cookie=%llx ",
+		   bch2_btree_id_str(n->btree_id),
+		   n->level,
+		   n->seq,
+		   n->cookie);
+
+	bch2_bpos_to_text(out, n->min_key);
+	prt_str(out, "-");
+	bch2_bpos_to_text(out, n->max_key);
+
+	if (n->range_updated)
+		prt_str(out, " range updated");
+	if (n->overwritten)
+		prt_str(out, " overwritten");
+
+	for (; ptr < ptrs_end; ptr++) {
+		prt_char(out, ' ');
+		bch2_extent_ptr_to_text(out, c, ptr);
+	}
+}
+
+/* Print every node in @nodes, one per line, indented by two columns */
+static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
+{
+	printbuf_indent_add(out, 2);
+
+	for (size_t idx = 0; idx < nodes.nr; idx++) {
+		found_btree_node_to_text(out, c, &nodes.data[idx]);
+		prt_newline(out);
+	}
+
+	printbuf_indent_sub(out, 2);
+}
+
+/*
+ * Initialise @k as a btree_ptr_v2 key describing found node @f.
+ *
+ * The key's position is f->max_key, v.seq carries the node's cookie, and the
+ * value is followed by the node's f->nr_ptrs extent pointers.
+ * NOTE(review): @k must have room for those pointers; the callers in this
+ * file pass a __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) buffer.
+ */
+static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
+{
+ struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
+
+ /* value size: fixed btree_ptr_v2 header plus one u64 per pointer */
+ set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
+ bp->k.p = f->max_key;
+ bp->v.seq = cpu_to_le64(f->cookie);
+ bp->v.sectors_written = 0;
+ bp->v.flags = 0;
+ bp->v.min_key = f->min_key;
+ SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
+ memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
+}
+
+/*
+ * Check whether the btree node described by @f can be read.
+ *
+ * Builds a btree pointer key for @f and asks the btree cache for the node.
+ * Returns true if a node was returned, false if the lookup yielded NULL or
+ * an error pointer.
+ *
+ * On success the read lock is dropped again and, unless the node is the
+ * current root of its btree, the node is evicted from the cache.
+ */
+static bool found_btree_node_is_readable(struct btree_trans *trans,
+					 const struct found_btree_node *f)
+{
+	struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;
+
+	found_btree_node_to_key(&k.k, f);
+
+	struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
+
+	/*
+	 * Bail out before touching @b: on failure it is NULL or an error
+	 * pointer and must not be handed to btree_node_root() below.
+	 */
+	if (IS_ERR_OR_NULL(b))
+		return false;
+
+	six_unlock_read(&b->c.lock);
+
+	/*
+	 * We might update this node's range; if that happens, we need the node
+	 * to be re-read so the read path can trim keys that are no longer in
+	 * this node
+	 */
+	if (b != btree_node_root(trans->c, b))
+		bch2_btree_node_evict(trans, &k.k);
+	return true;
+}
+
+/*
+ * sort() comparator: order by btree id, then level, then cookie, so that
+ * entries sharing a cookie within one btree/level end up adjacent.
+ */
+static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
+{
+	const struct found_btree_node *l = _l;
+	const struct found_btree_node *r = _r;
+	int cmp;
+
+	cmp = cmp_int(l->btree_id, r->btree_id);
+	if (cmp)
+		return cmp;
+
+	cmp = cmp_int(l->level, r->level);
+	if (cmp)
+		return cmp;
+
+	return cmp_int(l->cookie, r->cookie);
+}
+
+/*
+ * Compare two found btree nodes by sequence number only: the result of
+ * cmp_int() on l->seq and r->seq.
+ *
+ * NOTE(review): this function does not look at readability; nodes with equal
+ * sequence numbers compare equal here and the caller decides what to do.
+ */
+static int found_btree_node_cmp_time(const struct found_btree_node *l,
+ const struct found_btree_node *r)
+{
+ return cmp_int(l->seq, r->seq);
+}
+
+/*
+ * Positional comparator: btree id ascending, level descending, min_key
+ * ascending, and for equal min_key the node with the higher seq first.
+ */
+static int found_btree_node_cmp_pos(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->min_key, r->min_key) ?:
+ -found_btree_node_cmp_time(l, r);
+}
+
+/*
+ * Read one page at sector @offset of device @ca into @bn and, if it looks
+ * like the start of a btree node that can also be read through the normal
+ * btree read path, append a record for it to f->nodes.
+ *
+ * @bio is reset and reused on every call. Nothing is returned; allocation
+ * and endianness failures are reported through f->ret.
+ */
+static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
+ struct bio *bio, struct btree_node *bn, u64 offset)
+{
+ struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+ bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(bio, bn, PAGE_SIZE);
+
+ submit_bio_wait(bio);
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ "IO error in try_read_btree_node() at %llu: %s",
+ offset, bch2_blk_status_to_str(bio->bi_status)))
+ return;
+
+ /* not a btree node of this filesystem */
+ if (le64_to_cpu(bn->magic) != bset_magic(c))
+ return;
+
+ /*
+ * With an encryption checksum type, the bytes from bn->flags up to
+ * bn->keys are run through bch2_encrypt() before being interpreted
+ * (presumably decrypting them in place - confirm).
+ */
+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
+ struct nonce nonce = btree_nonce(&bn->keys, 0);
+ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+ bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
+ }
+
+ /* nodes of alloc btrees are not recorded */
+ if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
+ return;
+
+ /* implausible level: ignore */
+ if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
+ return;
+
+ /* bucket_gen() is looked up under the RCU read lock */
+ rcu_read_lock();
+ struct found_btree_node n = {
+ .btree_id = BTREE_NODE_ID(bn),
+ .level = BTREE_NODE_LEVEL(bn),
+ .seq = BTREE_NODE_SEQ(bn),
+ .cookie = le64_to_cpu(bn->keys.seq),
+ .min_key = bn->min_key,
+ .max_key = bn->max_key,
+ .nr_ptrs = 1,
+ .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr,
+ .ptrs[0].offset = offset,
+ .ptrs[0].dev = ca->dev_idx,
+ .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)),
+ };
+ rcu_read_unlock();
+
+ if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+ /* f->nodes and f->ret are updated under f->lock */
+ mutex_lock(&f->lock);
+ if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
+ bch_err(c, "try_read_btree_node() can't handle endian conversion");
+ f->ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (darray_push(&f->nodes, n))
+ f->ret = -ENOMEM;
+unlock:
+ mutex_unlock(&f->lock);
+ }
+}
+
+/*
+ * Kthread body: scan one device for btree nodes.
+ *
+ * @p is the struct find_btree_nodes_worker allocated by read_btree_nodes();
+ * this function frees it before returning. On exit it also drops the io_ref
+ * that read_btree_nodes() took on the device for this worker and puts the
+ * closure the caller is waiting on.
+ *
+ * Walks every bucket from first_bucket to nbuckets in steps of
+ * btree_sectors(c) and tries to read a btree node at each candidate sector.
+ *
+ * Always returns 0; allocation failure is reported through w->f->ret.
+ */
+static int read_btree_nodes_worker(void *p)
+{
+	struct find_btree_nodes_worker *w = p;
+	struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
+	struct bch_dev *ca = w->ca;
+	void *buf = (void *) __get_free_page(GFP_KERNEL);
+	struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
+	unsigned long last_print = jiffies;
+
+	if (!buf || !bio) {
+		bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
+		w->f->ret = -ENOMEM;
+		goto err;
+	}
+
+	for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
+		for (unsigned bucket_offset = 0;
+		     bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
+		     bucket_offset += btree_sectors(c)) {
+			/* progress report at most once every 30 seconds */
+			if (time_after(jiffies, last_print + HZ * 30)) {
+				u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
+				u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
+
+				bch_info(ca, "%s: %2u%% done", __func__,
+					 (unsigned) div64_u64(cur_sector * 100, end_sector));
+				last_print = jiffies;
+			}
+
+			u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
+
+			/*
+			 * Once the superblock version covers the btree
+			 * bitmap, skip ranges it does not mark.
+			 */
+			if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
+			    !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
+				continue;
+
+			try_read_btree_node(w->f, ca, bio, buf, sector);
+		}
+err:
+	/* We get here with bio == NULL if its allocation failed */
+	if (bio)
+		bio_put(bio);
+	free_page((unsigned long) buf);
+	/* Release (not take) the device ref read_btree_nodes() took for us */
+	percpu_ref_put(&ca->io_ref);
+	closure_put(w->cl);
+	kfree(w);
+	return 0;
+}
+
+/*
+ * Start one read_btree_nodes_worker() kthread per online device that is
+ * allowed to hold btree data, then wait for all of them to finish.
+ *
+ * Each worker is given its own io_ref on the device and a ref on the stack
+ * closure; the worker releases both, and frees its argument block, when it
+ * exits.
+ *
+ * Returns f->ret if any worker (or this function) recorded an error there,
+ * otherwise the local error code, otherwise 0.
+ */
+static int read_btree_nodes(struct find_btree_nodes *f)
+{
+	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+	struct closure cl;
+	int ret = 0;
+
+	closure_init_stack(&cl);
+
+	for_each_online_member(c, ca) {
+		if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
+			continue;
+
+		struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
+		struct task_struct *t;
+
+		if (!w) {
+			/* leaving the loop early: drop the ref held for this iteration */
+			percpu_ref_put(&ca->io_ref);
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		/* ref and closure count owned by the worker from here on */
+		percpu_ref_get(&ca->io_ref);
+		closure_get(&cl);
+		w->cl = &cl;
+		w->f = f;
+		w->ca = ca;
+
+		t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+		/* kthread_run() reports failure as an ERR_PTR; keep the real errno */
+		ret = PTR_ERR_OR_ZERO(t);
+		if (ret) {
+			/* The worker never ran: undo everything done on its behalf */
+			percpu_ref_put(&ca->io_ref);
+			closure_put(&cl);
+			kfree(w);
+			f->ret = ret;
+			bch_err(c, "error starting kthread: %i", ret);
+			/*
+			 * As in the !w path above, also drop the ref held for
+			 * this loop iteration before leaving the loop early.
+			 */
+			percpu_ref_put(&ca->io_ref);
+			break;
+		}
+	}
+err:
+	/* wait for every worker that was started */
+	closure_sync(&cl);
+	return f->ret ?: ret;
+}
+
+/*
+ * Move @n towards @end by swapping it with its successor for as long as it
+ * sorts after that successor according to found_btree_node_cmp_pos().
+ */
+static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
+{
+	for (; n + 1 < end; n++) {
+		if (found_btree_node_cmp_pos(n, n + 1) <= 0)
+			break;
+
+		swap(n[0], n[1]);
+	}
+}
+
+/*
+ * Resolve overlaps between @start and the nodes that follow it in the
+ * position-sorted array ending at @end.
+ *
+ * Only nodes of the same btree and level whose min_key lies before
+ * start->max_key are considered. For each such node @n:
+ *  - if @start has the higher seq, @n is marked overwritten when @start
+ *    covers it completely; otherwise @n's min_key is moved past
+ *    start->max_key, @n is re-sorted and the scan restarts;
+ *  - if @n has the higher seq, @start's max_key is trimmed to just before
+ *    n->min_key;
+ *  - equal seq is reported and treated as unrepairable.
+ *
+ * Returns 0, or -BCH_ERR_fsck_repair_unimplemented for the equal-seq case.
+ */
+static int handle_overwrites(struct bch_fs *c,
+			     struct found_btree_node *start,
+			     struct found_btree_node *end)
+{
+	struct found_btree_node *n;
+again:
+	for (n = start + 1;
+	     n < end &&
+	     n->btree_id == start->btree_id &&
+	     n->level == start->level &&
+	     bpos_lt(n->min_key, start->max_key);
+	     n++) {
+		int cmp = found_btree_node_cmp_time(start, n);
+
+		if (cmp > 0) {
+			if (bpos_cmp(start->max_key, n->max_key) >= 0) {
+				n->overwritten = true;
+			} else {
+				n->range_updated = true;
+				n->min_key = bpos_successor(start->max_key);
+				/* min_key changed: restore sort order, then rescan */
+				bubble_up(n, end);
+				goto again;
+			}
+		} else if (cmp < 0) {
+			BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
+
+			start->max_key = bpos_predecessor(n->min_key);
+			start->range_updated = true;
+		} else {
+			struct printbuf buf = PRINTBUF;
+
+			prt_str(&buf, "overlapping btree nodes with same seq! halting\n ");
+			found_btree_node_to_text(&buf, c, start);
+			prt_str(&buf, "\n ");
+			found_btree_node_to_text(&buf, c, n);
+			bch_err(c, "%s", buf.buf);
+			printbuf_exit(&buf);
+			return -BCH_ERR_fsck_repair_unimplemented;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Scan the member devices for btree nodes and build c->found_btree_nodes.
+ *
+ * Phases: read candidate nodes from disk, merge entries that share a cookie
+ * into one entry with several pointers, resolve overlapping ranges with
+ * handle_overwrites(), and finally put the array in eytzinger order for the
+ * lookups done by for_each_found_btree_node_in_range().
+ *
+ * Returns 0 on success (including when results already exist), or an error
+ * from reading, -EINVAL when nothing was found or a node has too many
+ * replicas, or the error from handle_overwrites().
+ */
+int bch2_scan_for_btree_nodes(struct bch_fs *c)
+{
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+ struct printbuf buf = PRINTBUF;
+ size_t dst;
+ int ret = 0;
+
+ /* results from a previous run are reused */
+ if (f->nodes.nr)
+ return 0;
+
+ mutex_init(&f->lock);
+
+ ret = read_btree_nodes(f);
+ if (ret)
+ return ret;
+
+ if (!f->nodes.nr) {
+ bch_err(c, "%s: no btree nodes found", __func__);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /* debug dump, compiled in but disabled by the constant 0 */
+ if (0 && c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes found:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ /* bring entries with the same cookie next to each other */
+ sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
+
+ /* fold adjacent entries with equal cookies into one multi-pointer entry */
+ dst = 0;
+ darray_for_each(f->nodes, i) {
+ struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
+
+ if (prev &&
+ prev->cookie == i->cookie) {
+ if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
+ bch_err(c, "%s: found too many replicas for btree node", __func__);
+ ret = -EINVAL;
+ goto err;
+ }
+ prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
+ } else {
+ f->nodes.data[dst++] = *i;
+ }
+ }
+ f->nodes.nr = dst;
+
+ sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+
+ /* debug dump, compiled in but disabled by the constant 0 */
+ if (0 && c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ /* resolve overlaps and drop nodes that were marked overwritten */
+ dst = 0;
+ darray_for_each(f->nodes, i) {
+ if (i->overwritten)
+ continue;
+
+ ret = handle_overwrites(c, i, &darray_top(f->nodes));
+ if (ret)
+ goto err;
+
+ BUG_ON(i->overwritten);
+ f->nodes.data[dst++] = *i;
+ }
+ f->nodes.nr = dst;
+
+ if (c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ /* final layout used by the eytzinger0_find_gt() based range lookups */
+ eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/*
+ * Comparator handed to eytzinger0_find_gt() by
+ * for_each_found_btree_node_in_range(): btree id ascending, level
+ * descending, then the left operand's max_key against the right operand's
+ * min_key.
+ * NOTE(review): which operand is the array element and which the search key
+ * depends on eytzinger0_find_gt()'s calling convention - confirm.
+ */
+static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->max_key, r->min_key);
+}
+
+/*
+ * Iterate over the entries of (_f)->nodes, which must be in eytzinger
+ * order, starting at the position eytzinger0_find_gt() returns for @_search
+ * and continuing while entries have the same btree id and level as @_search
+ * and a min_key before (_search).max_key.
+ *
+ * @_f:      struct find_btree_nodes *
+ * @_search: struct found_btree_node lvalue describing the range
+ * @_idx:    name of the size_t index variable declared by the loop
+ */
+#define for_each_found_btree_node_in_range(_f, _search, _idx)				\
+	for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,	\
+					sizeof((_f)->nodes.data[0]),			\
+					found_btree_node_range_start_cmp, &(_search));	\
+	     _idx < (_f)->nodes.nr &&							\
+	     (_f)->nodes.data[_idx].btree_id == (_search).btree_id &&			\
+	     (_f)->nodes.data[_idx].level == (_search).level &&				\
+	     bpos_lt((_f)->nodes.data[_idx].min_key, (_search).max_key);		\
+	     _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
+
+/*
+ * Returns true if the scan found a node in the same btree and level,
+ * within @b's key range as matched by for_each_found_btree_node_in_range(),
+ * whose seq is greater than that of @b.
+ */
+bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
+{
+	struct find_btree_nodes *f = &c->found_btree_nodes;
+	struct found_btree_node search = {
+		.btree_id	= b->c.btree_id,
+		.level		= b->c.level,
+		.min_key	= b->data->min_key,
+		.max_key	= b->key.k.p,
+	};
+	bool stale = false;
+
+	for_each_found_btree_node_in_range(f, search, idx) {
+		const struct found_btree_node *found = &f->nodes.data[idx];
+
+		if (found->seq > BTREE_NODE_SEQ(b->data)) {
+			stale = true;
+			break;
+		}
+	}
+
+	return stale;
+}
+
+/*
+ * Returns true if the scan results contain at least one level 0 node of
+ * @btree matching the full key range POS_MIN..SPOS_MAX.
+ */
+bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
+{
+	struct found_btree_node search = {
+		.btree_id	= btree,
+		.level		= 0,
+		.min_key	= POS_MIN,
+		.max_key	= SPOS_MAX,
+	};
+	bool found = false;
+
+	/* The first entry the iteration yields is enough */
+	for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) {
+		found = true;
+		break;
+	}
+
+	return found;
+}
+
+/*
+ * For every scanned node of @btree at @level that falls in the range
+ * [node_min, node_max], insert a btree pointer key for it into the journal
+ * keys at @level + 1. Node ranges are clamped to [node_min, node_max], and
+ * range_updated is set when clamping changed them.
+ *
+ * Alloc btrees are skipped (returns 0). Runs the scan_for_btree_nodes
+ * recovery pass first. Returns 0 or the first error from that pass or from
+ * bch2_journal_key_insert().
+ */
+int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
+ unsigned level, struct bpos node_min, struct bpos node_max)
+{
+ if (btree_id_is_alloc(btree))
+ return 0;
+
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+
+ int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ return ret;
+
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
+ bch2_bpos_to_text(&buf, node_min);
+ prt_str(&buf, " - ");
+ bch2_bpos_to_text(&buf, node_max);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ struct found_btree_node search = {
+ .btree_id = btree,
+ .level = level,
+ .min_key = node_min,
+ .max_key = node_max,
+ };
+
+ for_each_found_btree_node_in_range(f, search, idx) {
+ /* work on a copy; the stored scan result is left untouched */
+ struct found_btree_node n = f->nodes.data[idx];
+
+ /* clamp the node to the requested range, noting any change */
+ n.range_updated |= bpos_lt(n.min_key, node_min);
+ n.min_key = bpos_max(n.min_key, node_min);
+
+ n.range_updated |= bpos_gt(n.max_key, node_max);
+ n.max_key = bpos_min(n.max_key, node_max);
+
+ struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
+
+ found_btree_node_to_key(&tmp.k, &n);
+
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
+ bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+
+ BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));
+
+ /* the pointer to a level N node lives in its parent, level N + 1 */
+ ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Release the array of scan results held in @f */
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
+{
+ darray_exit(&f->nodes);
+}
diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
new file mode 100644
index 000000000000..08687b209787
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
+#define _BCACHEFS_BTREE_NODE_SCAN_H
+
+/* Scan member devices for btree nodes and build c->found_btree_nodes */
+int bch2_scan_for_btree_nodes(struct bch_fs *);
+/* True if the scan found an overlapping node with a higher seq than this one */
+bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
+/* True if the scan found at least one level 0 node for the given btree */
+bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
+/* Insert pointers to scanned nodes of (btree, level) within [min, max] */
+int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
+/* Free the scan results */
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
new file mode 100644
index 000000000000..abb7b27d556a
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan_types.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+
+#include "darray.h"
+
+/* One btree node discovered by scanning the devices */
+struct found_btree_node {
+ bool range_updated:1; /* min_key/max_key were adjusted after the scan */
+ bool overwritten:1; /* fully covered by a node with a higher seq */
+ u8 btree_id;
+ u8 level;
+ u32 seq; /* BTREE_NODE_SEQ() of the node as read from disk */
+ u64 cookie; /* the node's keys.seq; equal across replicas */
+
+ struct bpos min_key;
+ struct bpos max_key;
+
+ unsigned nr_ptrs; /* number of valid entries in ptrs[] */
+ struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
+};
+
+/* Growable array of scan results */
+typedef DARRAY(struct found_btree_node) found_btree_nodes;
+
+/* State of a btree node scan, embedded in struct bch_fs */
+struct find_btree_nodes {
+ int ret; /* first error recorded by the scan workers, or 0 */
+ struct mutex lock; /* taken while workers append to nodes */
+ found_btree_nodes nodes;
+};
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 30d69a6d133e..bbec91e8e650 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -318,7 +318,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
i->k->k.p.snapshot &&
- bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
+ bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
}
static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
@@ -397,12 +397,13 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
struct bkey_cached *ck = (void *) path->l[0].b;
unsigned new_u64s;
struct bkey_i *new_k;
+ unsigned watermark = flags & BCH_WATERMARK_MASK;
EBUG_ON(path->level);
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bch2_btree_key_cache_must_wait(c) &&
- !(flags & BCH_TRANS_COMMIT_journal_reclaim))
+ if (watermark < BCH_WATERMARK_reclaim &&
+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bch2_btree_key_cache_must_wait(c))
return -BCH_ERR_btree_insert_need_journal_reclaim;
/*
@@ -499,9 +500,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
}
static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
- struct btree_insert_entry *btree_id_start)
+ unsigned btree_id_start)
{
- struct btree_insert_entry *i;
bool trans_trigger_run;
int ret, overwrite;
@@ -514,13 +514,13 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
do {
trans_trigger_run = false;
- for (i = btree_id_start;
- i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+ for (unsigned i = btree_id_start;
+ i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
i++) {
- if (i->btree_id != btree_id)
+ if (trans->updates[i].btree_id != btree_id)
continue;
- ret = run_one_trans_trigger(trans, i, overwrite);
+ ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
if (ret < 0)
return ret;
if (ret)
@@ -534,8 +534,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
- struct btree_insert_entry *btree_id_start = trans->updates;
- unsigned btree_id = 0;
+ unsigned btree_id = 0, btree_id_start = 0;
int ret = 0;
/*
@@ -549,8 +548,8 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
if (btree_id == BTREE_ID_alloc)
continue;
- while (btree_id_start < trans->updates + trans->nr_updates &&
- btree_id_start->btree_id < btree_id)
+ while (btree_id_start < trans->nr_updates &&
+ trans->updates[btree_id_start].btree_id < btree_id)
btree_id_start++;
ret = run_btree_triggers(trans, btree_id, btree_id_start);
@@ -558,11 +557,13 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
return ret;
}
- trans_for_each_update(trans, i) {
+ for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+ struct btree_insert_entry *i = trans->updates + idx;
+
if (i->btree_id > BTREE_ID_alloc)
break;
if (i->btree_id == BTREE_ID_alloc) {
- ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
+ ret = run_btree_triggers(trans, BTREE_ID_alloc, idx);
if (ret)
return ret;
break;
@@ -826,7 +827,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
struct bch_fs *c = trans->c;
int ret = 0, u64s_delta = 0;
- trans_for_each_update(trans, i) {
+ for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+ struct btree_insert_entry *i = trans->updates + idx;
if (i->cached)
continue;
@@ -887,6 +889,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
int ret, unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
switch (ret) {
case -BCH_ERR_btree_insert_btree_node_full:
@@ -905,7 +908,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
* flag
*/
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
+ watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 4a5a64499eb7..e0c982a4195c 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -5,6 +5,7 @@
#include <linux/list.h>
#include <linux/rhashtable.h>
+#include "bbpos_types.h"
#include "btree_key_cache_types.h"
#include "buckets_types.h"
#include "darray.h"
@@ -173,6 +174,11 @@ struct btree_cache {
*/
struct task_struct *alloc_lock;
struct closure_waitlist alloc_wait;
+
+ struct bbpos pinned_nodes_start;
+ struct bbpos pinned_nodes_end;
+ u64 pinned_nodes_leaf_mask;
+ u64 pinned_nodes_interior_mask;
};
struct btree_node_iter {
@@ -358,7 +364,21 @@ struct btree_insert_entry {
unsigned long ip_allocated;
};
+/* Number of btree paths we preallocate, usually enough */
#define BTREE_ITER_INITIAL 64
+/*
+ * Limit for btree_trans_too_many_iters(); this is enough that almost all code
+ * paths should run inside this limit, and if they don't it usually indicates a
+ * bug (leaking/duplicated btree paths).
+ *
+ * exception: some fsck paths
+ *
+ * bugs with excessive path usage seem to have possibly been eliminated now, so
+ * we might consider eliminating this (and btree_trans_too_many_iters()) at some
+ * point.
+ */
+#define BTREE_ITER_NORMAL_LIMIT 256
+/* never exceed limit */
#define BTREE_ITER_MAX (1U << 10)
struct btree_trans_commit_hook;
@@ -654,6 +674,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
BIT_ULL(BKEY_TYPE_inodes)| \
BIT_ULL(BKEY_TYPE_stripes)| \
BIT_ULL(BKEY_TYPE_reflink)| \
+ BIT_ULL(BKEY_TYPE_subvolumes)| \
BIT_ULL(BKEY_TYPE_btree))
#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
@@ -727,7 +748,7 @@ struct btree_root {
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
u8 level;
u8 alive;
- s8 error;
+ s16 error;
};
enum btree_gc_coalesce_fail_reason {
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index c3ff365acce9..8e47e260eba5 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -38,6 +38,9 @@ static noinline int extent_front_merge(struct btree_trans *trans,
struct bkey_i *update;
int ret;
+ if (unlikely(trans->journal_replay_not_finished))
+ return 0;
+
update = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(update);
if (ret)
@@ -69,6 +72,9 @@ static noinline int extent_back_merge(struct btree_trans *trans,
struct bch_fs *c = trans->c;
int ret;
+ if (unlikely(trans->journal_replay_not_finished))
+ return 0;
+
ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
if (ret < 0)
@@ -452,7 +458,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
* the key cache - but the key has to exist in the btree for that to
* work:
*/
- if (path->cached && bkey_deleted(&i->old_k))
+ if (path->cached && !i->old_btree_u64s)
return flush_new_cached_update(trans, i, flags, ip);
return 0;
@@ -788,6 +794,27 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
+/*
+ * Queue an update that sets or clears the key at @pos in @btree: a
+ * KEY_TYPE_set key when @set is true, a KEY_TYPE_deleted key otherwise.
+ *
+ * The key is allocated with bch2_trans_kmalloc() and the update goes through
+ * a regular iterator traversal followed by bch2_trans_update().
+ * Returns 0 or the first error from allocation, traversal or the update.
+ */
int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
struct bpos pos, bool set)
{
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+ int ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ return ret;
+
+ bkey_init(&k->k);
+ k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ k->k.p = pos;
+
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
+
+ /* the iterator is released whether or not the update succeeded */
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
+ struct bpos pos, bool set)
+{
struct bkey_i k;
bkey_init(&k.k);
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index b9382b7b288b..cc7c53e83f89 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -63,11 +63,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, unsigned, u64 *);
int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
+int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool);
static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
enum btree_id btree, struct bpos pos)
{
- return bch2_btree_bit_mod(trans, btree, pos, false);
+ return bch2_btree_bit_mod_buffered(trans, btree, pos, false);
}
int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 4530b14ff2c3..6030c396754f 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "bkey_buf.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
@@ -18,15 +19,23 @@
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
+#include "recovery_passes.h"
#include "replicas.h"
+#include "sb-members.h"
#include "super-io.h"
#include "trace.h"
#include <linux/random.h>
+static const char * const bch2_btree_update_modes[] = { /* mode names: each BTREE_UPDATE_MODES() entry is stringified by x(); indexed by as->mode in bch2_btree_update_to_text() */
+#define x(t) #t,
+ BTREE_UPDATE_MODES()
+#undef x
+ NULL /* terminator after the generated names */
+};
+
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
- btree_path_idx_t, struct btree *,
- struct keylist *, unsigned);
+ btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
@@ -45,56 +54,103 @@ static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
return path_idx;
}
-/* Debug code: */
-
/*
* Verify that child nodes correctly span parent node's range:
*/
-static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
+int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bpos next_node = b->data->min_key;
- struct btree_node_iter iter;
+ /*
+ * Returns 0 for leaf nodes and for interior nodes that pass the checks
+ * below (the walk also stops with 0 at the first child key that is not
+ * a btree_ptr_v2). On a mismatch the error is reported and the return
+ * value is whatever the topology_repair path at the bottom produces.
+ */
+ struct bch_fs *c = trans->c;
+ /* Start of @b's range: from the key when it is a btree_ptr_v2, else from the node data */
+ struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
+ ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
+ : b->data->min_key;
+ struct btree_and_journal_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_btree_ptr_v2 bp;
- struct bkey unpacked;
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
+ struct bkey_buf prev;
+ int ret = 0;
- BUG_ON(!b->c.level);
+ BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
+ b->data->min_key));
- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
- return;
+ /* Level 0 nodes have no child pointers to check: */
+ if (!b->c.level)
+ return 0;
- bch2_btree_node_iter_init_from_start(&iter, b);
+ /* prev starts out as a freshly initialised key; bkey_deleted(prev) below means "no child seen yet" */
+ bch2_bkey_buf_init(&prev);
+ bkey_init(&prev.k->k);
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- while (1) {
- k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
if (k.k->type != KEY_TYPE_btree_ptr_v2)
- break;
- bp = bkey_s_c_to_btree_ptr_v2(k);
+ goto out;
- if (!bpos_eq(next_node, bp.v->min_key)) {
- bch2_dump_btree_node(c, b);
- bch2_bpos_to_text(&buf1, next_node);
- bch2_bpos_to_text(&buf2, bp.v->min_key);
- panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
- }
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- bch2_btree_node_iter_advance(&iter, b);
+ /* First child must start at node_min, every later one right after the previous child's end: */
+ struct bpos expected_min = bkey_deleted(&prev.k->k)
+ ? node_min
+ : bpos_successor(prev.k->k.p);
- if (bch2_btree_node_iter_end(&iter)) {
- if (!bpos_eq(k.k->p, b->key.k.p)) {
- bch2_dump_btree_node(c, b);
- bch2_bpos_to_text(&buf1, b->key.k.p);
- bch2_bpos_to_text(&buf2, k.k->p);
- panic("expected end %s got %s\n", buf1.buf, buf2.buf);
- }
- break;
+ if (!bpos_eq(expected_min, bp.v->min_key)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "end of prev node doesn't match start of next node\n");
+ prt_printf(&buf, " in btree %s level %u node ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, "\n prev ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+ prt_str(&buf, "\n next ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf);
+ goto topology_repair;
}
- next_node = bpos_successor(k.k->p);
+ bch2_bkey_buf_reassemble(&prev, c, k);
+ bch2_btree_and_journal_iter_advance(&iter);
+ }
+
+ /* Loop finished without recording any child: */
+ if (bkey_deleted(&prev.k->k)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "empty interior node\n");
+ prt_printf(&buf, " in btree %s level %u node ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf);
+ goto topology_repair;
+ } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "last child node doesn't end at end of parent node\n");
+ prt_printf(&buf, " in btree %s level %u node ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, "\n last key ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+
+ need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf);
+ goto topology_repair;
}
-#endif
+out:
+fsck_err:
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&prev, c);
+ printbuf_exit(&buf);
+ return ret;
+topology_repair:
+ /*
+ * If check_topology was explicitly requested and we are already past
+ * it, flag the filesystem inconsistent and return
+ * -BCH_ERR_btree_need_topology_repair; otherwise return the result of
+ * asking for that pass to be run. Either way, clean up via "out".
+ */
+ if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
+ bch2_inconsistent_error(c);
+ ret = -BCH_ERR_btree_need_topology_repair;
+ } else {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+ }
+ goto out;
}
/* Calculate ideal packed bkey format for new btree nodes: */
@@ -255,7 +311,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct open_buckets obs = { .nr = 0 };
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
+ unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
? BTREE_NODE_RESERVE
: 0;
int ret;
@@ -550,6 +606,26 @@ static void btree_update_add_key(struct btree_update *as,
bch2_keylist_push(keys);
}
+static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
+{ /* Returns true only if bch2_dev_btree_bitmap_marked() holds for every key in as->new_keys */
+ for_each_keylist_key(&as->new_keys, k)
+ if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
+ return false; /* first unmarked key ends the scan */
+ return true;
+}
+
+static void btree_update_new_nodes_mark_sb(struct btree_update *as)
+{ /* Calls bch2_dev_btree_bitmap_mark() for every key in as->new_keys, then bch2_write_super() */
+ struct bch_fs *c = as->c;
+
+ mutex_lock(&c->sb_lock); /* held across both the marking loop and the superblock write */
+ for_each_keylist_key(&as->new_keys, k)
+ bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
+
/*
* The transactional part of an interior btree node update, where we journal the
* update we did to the interior node and update alloc info:
@@ -607,6 +683,9 @@ static void btree_update_nodes_written(struct btree_update *as)
if (ret)
goto err;
+ if (!btree_update_new_nodes_marked_sb(as))
+ btree_update_new_nodes_mark_sb(as);
+
/*
* Wait for any in flight writes to finish before we free the old nodes
* on disk:
@@ -639,7 +718,7 @@ static void btree_update_nodes_written(struct btree_update *as)
* which may require allocations as well.
*/
ret = commit_do(trans, &as->disk_res, &journal_seq,
- BCH_WATERMARK_reclaim|
+ BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_journal_reclaim,
@@ -647,11 +726,15 @@ static void btree_update_nodes_written(struct btree_update *as)
bch2_trans_unlock(trans);
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
- "%s(): error %s", __func__, bch2_err_str(ret));
+ "%s", bch2_err_str(ret));
err:
- if (as->b) {
-
- b = as->b;
+ /*
+ * We have to be careful because another thread might be getting ready
+ * to free as->b and calling btree_update_reparent() on us - we'll
+ * recheck under btree_update_lock below:
+ */
+ b = READ_ONCE(as->b);
+ if (b) {
btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
as->btree_id, b->c.level, b->key.k.p);
struct btree_path *path = trans->paths + path_idx;
@@ -795,15 +878,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode != BTREE_UPDATE_none);
+ BUG_ON(as->update_level_end < b->c.level);
BUG_ON(!btree_node_dirty(b));
BUG_ON(!b->c.level);
- as->mode = BTREE_INTERIOR_UPDATING_NODE;
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+
+ as->mode = BTREE_UPDATE_node;
as->b = b;
+ as->update_level_end = b->c.level;
set_btree_node_write_blocked(b);
list_add(&as->write_blocked_list, &b->write_blocked);
@@ -825,7 +910,7 @@ static void btree_update_reparent(struct btree_update *as,
lockdep_assert_held(&c->btree_interior_update_lock);
child->b = NULL;
- child->mode = BTREE_INTERIOR_UPDATING_AS;
+ child->mode = BTREE_UPDATE_update;
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
bch2_update_reparent_journal_pin_flush);
@@ -836,7 +921,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
struct bkey_i *insert = &b->key;
struct bch_fs *c = as->c;
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode != BTREE_UPDATE_none);
BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
ARRAY_SIZE(as->journal_entries));
@@ -850,7 +935,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
- as->mode = BTREE_INTERIOR_UPDATING_ROOT;
+ as->mode = BTREE_UPDATE_root;
mutex_unlock(&c->btree_interior_update_lock);
}
@@ -1028,7 +1113,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
struct bch_fs *c = as->c;
u64 start_time = as->start_time;
- BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode == BTREE_UPDATE_none);
if (as->took_gc_lock)
up_read(&as->c->gc_lock);
@@ -1045,7 +1130,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
- unsigned level, bool split, unsigned flags)
+ unsigned level_start, bool split, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_update *as;
@@ -1053,7 +1138,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes[2] = { 0, 0 };
- unsigned update_level = level;
+ unsigned level_end = level_start;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
int ret = 0;
u32 restart_count = trans->restart_count;
@@ -1068,29 +1153,30 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
- if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark < c->journal.watermark) {
- struct journal_res res = { 0 };
+ if (watermark < BCH_WATERMARK_reclaim &&
+ test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) {
+ if (flags & BCH_TRANS_COMMIT_journal_reclaim)
+ return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
- ret = drop_locks_do(trans,
- bch2_journal_res_get(&c->journal, &res, 1,
- watermark|JOURNAL_RES_GET_CHECK));
+ bch2_trans_unlock(trans);
+ wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags));
+ ret = bch2_trans_relock(trans);
if (ret)
return ERR_PTR(ret);
}
while (1) {
- nr_nodes[!!update_level] += 1 + split;
- update_level++;
+ nr_nodes[!!level_end] += 1 + split;
+ level_end++;
- ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
+ ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
if (ret)
return ERR_PTR(ret);
- if (!btree_path_node(path, update_level)) {
+ if (!btree_path_node(path, level_end)) {
/* Allocating new root? */
nr_nodes[1] += split;
- update_level = BTREE_MAX_DEPTH;
+ level_end = BTREE_MAX_DEPTH;
break;
}
@@ -1098,11 +1184,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* Always check for space for two keys, even if we won't have to
* split at prior level - it might have been a merge instead:
*/
- if (bch2_btree_node_insert_fits(path->l[update_level].b,
+ if (bch2_btree_node_insert_fits(path->l[level_end].b,
BKEY_BTREE_PTR_U64s_MAX * 2))
break;
- split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
+ split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
}
if (!down_read_trylock(&c->gc_lock)) {
@@ -1116,12 +1202,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
memset(as, 0, sizeof(*as));
closure_init(&as->cl, NULL);
- as->c = c;
- as->start_time = start_time;
- as->mode = BTREE_INTERIOR_NO_UPDATE;
- as->took_gc_lock = true;
- as->btree_id = path->btree_id;
- as->update_level = update_level;
+ as->c = c;
+ as->start_time = start_time;
+ as->ip_started = _RET_IP_;
+ as->mode = BTREE_UPDATE_none;
+ as->watermark = watermark;
+ as->took_gc_lock = true;
+ as->btree_id = path->btree_id;
+ as->update_level_start = level_start;
+ as->update_level_end = level_end;
INIT_LIST_HEAD(&as->list);
INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
@@ -1163,7 +1252,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
*/
if (bch2_err_matches(ret, ENOSPC) &&
(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark != BCH_WATERMARK_reclaim) {
+ watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
}
@@ -1193,7 +1282,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
err:
bch2_btree_update_free(as, trans);
if (!bch2_err_matches(ret, ENOSPC) &&
- !bch2_err_matches(ret, EROFS))
+ !bch2_err_matches(ret, EROFS) &&
+ ret != -BCH_ERR_journal_reclaim_would_deadlock)
bch_err_fn_ratelimited(c, ret);
return ERR_PTR(ret);
}
@@ -1208,33 +1298,35 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock);
- BUG_ON(btree_node_root(c, b) &&
- (b->c.level < btree_node_root(c, b)->c.level ||
- !btree_node_dying(btree_node_root(c, b))));
-
bch2_btree_id_root(c, b->c.btree_id)->b = b;
mutex_unlock(&c->btree_root_lock);
bch2_recalc_btree_reserve(c);
}
-static void bch2_btree_set_root(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
+static int bch2_btree_set_root(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ bool nofail)
{
struct bch_fs *c = as->c;
- struct btree *old;
trace_and_count(c, btree_node_set_root, trans, b);
- old = btree_node_root(c, b);
+ struct btree *old = btree_node_root(c, b);
/*
* Ensure no one is using the old root while we switch to the
* new root:
*/
- bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+ if (nofail) {
+ bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+ } else {
+ int ret = bch2_btree_node_lock_write(trans, path, &old->c);
+ if (ret)
+ return ret;
+ }
bch2_btree_set_root_inmem(c, b);
@@ -1248,6 +1340,7 @@ static void bch2_btree_set_root(struct btree_update *as,
* depend on the new root would have to update the new root.
*/
bch2_btree_node_unlock_write(trans, path, old);
+ return 0;
}
/* Interior node updates: */
@@ -1314,12 +1407,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
}
static void
-__bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter node_iter,
- struct keylist *keys)
+bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct keylist *keys)
{
struct bkey_i *insert = bch2_keylist_front(keys);
struct bkey_packed *k;
@@ -1378,9 +1471,16 @@ static void __btree_split_node(struct btree_update *as,
if (bkey_deleted(k))
continue;
+ uk = bkey_unpack_key(b, k);
+
+ if (b->c.level &&
+ u64s < n1_u64s &&
+ u64s + k->u64s >= n1_u64s &&
+ bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p))
+ n1_u64s += k->u64s;
+
i = u64s >= n1_u64s;
u64s += k->u64s;
- uk = bkey_unpack_key(b, k);
if (!i)
n1_pos = uk.p;
bch2_bkey_format_add_key(&format[i], &uk);
@@ -1439,8 +1539,7 @@ static void __btree_split_node(struct btree_update *as,
bch2_verify_btree_nr_keys(n[i]);
- if (b->c.level)
- btree_node_interior_verify(as->c, n[i]);
+ BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
}
}
@@ -1469,15 +1568,15 @@ static void btree_split_insert_keys(struct btree_update *as,
bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
- __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
+ bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
- btree_node_interior_verify(as->c, b);
+ BUG_ON(bch2_btree_node_check_topology(trans, b));
}
}
static int btree_split(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path, struct btree *b,
- struct keylist *keys, unsigned flags)
+ struct keylist *keys)
{
struct bch_fs *c = as->c;
struct btree *parent = btree_node_parent(trans->paths + path, b);
@@ -1486,9 +1585,14 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
u64 start_time = local_clock();
int ret = 0;
+ bch2_verify_btree_nr_keys(b);
BUG_ON(!parent && (b != btree_node_root(c, b)));
BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret)
+ return ret;
+
bch2_btree_interior_update_will_free_node(as, b);
if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
@@ -1578,16 +1682,17 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (parent) {
/* Split a non root node */
- ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
- if (ret)
- goto err;
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
} else if (n3) {
- bch2_btree_set_root(as, trans, trans->paths + path, n3);
+ ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
} else {
/* Root filled up but didn't need to be split */
- bch2_btree_set_root(as, trans, trans->paths + path, n1);
+ ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
}
+ if (ret)
+ goto err;
+
if (n3) {
bch2_btree_update_get_open_buckets(as, n3);
bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
@@ -1644,27 +1749,6 @@ err:
goto out;
}
-static void
-bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct keylist *keys)
-{
- struct btree_path *linked;
- unsigned i;
-
- __bch2_btree_insert_keys_interior(as, trans, path, b,
- path->l[b->c.level].iter, keys);
-
- btree_update_updated_node(as, b);
-
- trans_for_each_path_with_node(trans, b, linked, i)
- bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
-
- bch2_trans_verify_paths(trans);
-}
-
/**
* bch2_btree_insert_node - insert bkeys into a given btree node
*
@@ -1673,7 +1757,6 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
* @path_idx: path that points to current node
* @b: node to insert keys into
* @keys: list of keys to insert
- * @flags: transaction commit flags
*
* Returns: 0 on success, typically transaction restart error on failure
*
@@ -1683,10 +1766,11 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
*/
static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path_idx, struct btree *b,
- struct keylist *keys, unsigned flags)
+ struct keylist *keys)
{
struct bch_fs *c = as->c;
- struct btree_path *path = trans->paths + path_idx;
+ struct btree_path *path = trans->paths + path_idx, *linked;
+ unsigned i;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
@@ -1709,9 +1793,19 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
goto split;
}
- btree_node_interior_verify(c, b);
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret) {
+ bch2_btree_node_unlock_write(trans, path, b);
+ return ret;
+ }
+
+ bch2_btree_insert_keys_interior(as, trans, path, b,
+ path->l[b->c.level].iter, keys);
+
+ trans_for_each_path_with_node(trans, b, linked, i)
+ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
- bch2_btree_insert_keys_interior(as, trans, path, b, keys);
+ bch2_trans_verify_paths(trans);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
@@ -1725,21 +1819,22 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_maybe_compact_whiteouts(c, b))
bch2_trans_node_reinit_iter(trans, b);
+ btree_update_updated_node(as, b);
bch2_btree_node_unlock_write(trans, path, b);
- btree_node_interior_verify(c, b);
+ BUG_ON(bch2_btree_node_check_topology(trans, b));
return 0;
split:
/*
* We could attempt to avoid the transaction restart, by calling
* bch2_btree_path_upgrade() and allocating more nodes:
*/
- if (b->c.level >= as->update_level) {
+ if (b->c.level >= as->update_level_end) {
trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
}
- return btree_split(as, trans, path_idx, b, keys, flags);
+ return btree_split(as, trans, path_idx, b, keys);
}
int bch2_btree_split_leaf(struct btree_trans *trans,
@@ -1747,7 +1842,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
unsigned flags)
{
/* btree_split & merge may both cause paths array to be reallocated */
-
struct btree *b = path_l(trans->paths + path)->b;
struct btree_update *as;
unsigned l;
@@ -1759,7 +1853,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
if (IS_ERR(as))
return PTR_ERR(as);
- ret = btree_split(as, trans, path, b, NULL, flags);
+ ret = btree_split(as, trans, path, b, NULL);
if (ret) {
bch2_btree_update_free(as, trans);
return ret;
@@ -1775,6 +1869,65 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
return ret;
}
+static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
+ btree_path_idx_t path_idx)
+{ /* Allocates a node one level above the current root of the path's btree, inserts the old root's key into it and installs it as the new root */
+ struct bch_fs *c = as->c;
+ struct btree_path *path = trans->paths + path_idx;
+ struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b; /* b: current root; n: the new root allocated below */
+
+ BUG_ON(!btree_node_locked(path, b->c.level)); /* caller must already hold the current root in this path */
+
+ n = __btree_root_alloc(as, trans, b->c.level + 1);
+
+ bch2_btree_update_add_new_node(as, n);
+ six_unlock_write(&n->c.lock); /* NOTE(review): presumably __btree_root_alloc() hands n back write-locked — confirm */
+
+ path->locks_want++; /* the path takes a lock on one more level (n's) just below */
+ BUG_ON(btree_node_locked(path, n->c.level));
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED); /* record n in the path as intent-locked at its level */
+ bch2_btree_path_level_init(trans, path, n);
+
+ n->sib_u64s[0] = U16_MAX; /* NOTE(review): U16_MAX looks like a sentinel for both sibling sizes — confirm meaning */
+ n->sib_u64s[1] = U16_MAX;
+
+ bch2_keylist_add(&as->parent_keys, &b->key); /* queue the old root's key; the next line inserts as->parent_keys into n */
+ btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
+
+ int ret = bch2_btree_set_root(as, trans, path, n, true); /* nofail == true: set_root takes the _nofail lock branch and returns 0, hence the BUG_ON */
+ BUG_ON(ret);
+
+ bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+ bch2_trans_node_add(trans, path, n);
+ six_unlock_intent(&n->c.lock);
+
+ mutex_lock(&c->btree_cache.lock); /* the old root b is appended to btree_cache.live under the cache lock */
+ list_add_tail(&b->list, &c->btree_cache.live);
+ mutex_unlock(&c->btree_cache.lock);
+
+ bch2_trans_verify_locks(trans);
+}
+
+int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
+{ /* Adds a level above the current root of the path's btree; returns 0, the error from bch2_btree_update_start(), or bch2_btree_split_leaf()'s result for a fake root */
+ struct bch_fs *c = trans->c;
+ struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
+
+ if (btree_node_fake(b)) /* fake root: handed off to bch2_btree_split_leaf() instead */
+ return bch2_btree_split_leaf(trans, path, flags);
+
+ struct btree_update *as =
+ bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); /* level_start = root's level, split = true */
+ if (IS_ERR(as))
+ return PTR_ERR(as);
+
+ __btree_increase_depth(as, trans, path); /* returns void: no failure is reported past this point */
+ bch2_btree_update_done(as, trans);
+ return 0;
+}
+
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
btree_path_idx_t path,
unsigned level,
@@ -1797,6 +1950,18 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
BUG_ON(!trans->paths[path].should_be_locked);
BUG_ON(!btree_node_locked(&trans->paths[path], level));
+ /*
+ * Work around a deadlock caused by the btree write buffer not doing
+ * merges and leaving tons of merges for us to do - we really don't need
+ * to be doing merges at all from the interior update path, and if the
+ * interior update path is generating too many new interior updates we
+ * deadlock:
+ */
+ if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
+ return 0;
+
+ flags &= ~BCH_WATERMARK_MASK;
+
b = trans->paths[path].l[level].b;
if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
@@ -1845,8 +2010,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
__func__, buf1.buf, buf2.buf);
printbuf_exit(&buf1);
printbuf_exit(&buf2);
- bch2_topology_error(c);
- ret = -EIO;
+ ret = bch2_topology_error(c);
goto err;
}
@@ -1916,7 +2080,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_trans_verify_paths(trans);
- ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
if (ret)
goto err_free_update;
@@ -1943,6 +2107,10 @@ err:
bch2_path_put(trans, new_path, true);
bch2_path_put(trans, sib_path, true);
bch2_trans_verify_locks(trans);
+ if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
+ ret = 0;
+ if (!ret)
+ ret = bch2_trans_relock(trans);
return ret;
err_free_update:
bch2_btree_node_free_never_used(as, trans, n);
@@ -1987,14 +2155,14 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
- ret = bch2_btree_insert_node(as, trans, iter->path,
- parent, &as->parent_keys, flags);
- if (ret)
- goto err;
+ ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
} else {
- bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n);
+ ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
}
+ if (ret)
+ goto err;
+
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
@@ -2069,7 +2237,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
ret = bch2_trans_do(c, NULL, NULL, 0,
async_btree_node_rewrite_trans(trans, a));
- bch_err_fn(c, ret);
+ bch_err_fn_ratelimited(c, ret);
bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
kfree(a);
}
@@ -2116,7 +2284,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
}
- queue_work(c->btree_interior_update_worker, &a->work);
+ queue_work(c->btree_node_rewrite_worker, &a->work);
}
void bch2_do_pending_node_rewrites(struct bch_fs *c)
@@ -2128,7 +2296,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c)
list_del(&a->list);
bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
- queue_work(c->btree_interior_update_worker, &a->work);
+ queue_work(c->btree_node_rewrite_worker, &a->work);
}
mutex_unlock(&c->pending_node_rewrites_lock);
}
@@ -2339,7 +2507,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
bch2_btree_set_root_inmem(c, b);
}
-static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
+static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level)
{
struct bch_fs *c = trans->c;
struct closure cl;
@@ -2358,7 +2526,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
set_btree_node_fake(b);
set_btree_node_need_rewrite(b);
- b->c.level = 0;
+ b->c.level = level;
b->c.btree_id = id;
bkey_btree_ptr_init(&b->key);
@@ -2385,9 +2553,23 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
return 0;
}
-void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
+void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
{
- bch2_trans_run(c, __bch2_btree_root_alloc(trans, id));
+ bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level));
+}
+
+static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
+{ /* Appends a one-line, newline-terminated summary of @as to @out */
+ prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+ (void *) as->ip_started, /* printed with %ps; set from _RET_IP_ in bch2_btree_update_start() */
+ bch2_btree_id_str(as->btree_id),
+ as->update_level_start,
+ as->update_level_end,
+ bch2_watermarks[as->watermark],
+ bch2_btree_update_modes[as->mode], /* name table generated from BTREE_UPDATE_MODES() */
+ as->nodes_written,
+ closure_nr_remaining(&as->cl),
+ as->journal.seq);
}
void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
@@ -2396,12 +2578,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->btree_interior_update_lock);
list_for_each_entry(as, &c->btree_interior_update_list, list)
- prt_printf(out, "%p m %u w %u r %u j %llu\n",
- as,
- as->mode,
- as->nodes_written,
- closure_nr_remaining(&as->cl),
- as->journal.seq);
+ bch2_btree_update_to_text(out, as);
mutex_unlock(&c->btree_interior_update_lock);
}
@@ -2465,6 +2642,8 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c,
void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
+ if (c->btree_node_rewrite_worker)
+ destroy_workqueue(c->btree_node_rewrite_worker);
if (c->btree_interior_update_worker)
destroy_workqueue(c->btree_interior_update_worker);
mempool_exit(&c->btree_interior_update_pool);
@@ -2485,10 +2664,15 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
int bch2_fs_btree_interior_update_init(struct bch_fs *c)
{
c->btree_interior_update_worker =
- alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
+ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
if (!c->btree_interior_update_worker)
return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+ c->btree_node_rewrite_worker =
+ alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
+ if (!c->btree_node_rewrite_worker)
+ return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+
if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
sizeof(struct btree_update)))
return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index c593c925d1e3..c1a479ebaad1 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -10,6 +10,20 @@
#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
+int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
+
+#define BTREE_UPDATE_MODES() /* x-macro list of update modes; each user defines x() before expanding it */ \
+ x(none) \
+ x(node) \
+ x(root) \
+ x(update)
+
+enum btree_update_mode { /* What kind of update are we doing? One BTREE_UPDATE_<name> per x() entry, in list order */
+#define x(n) BTREE_UPDATE_##n,
+ BTREE_UPDATE_MODES()
+#undef x
+};
+
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:
@@ -32,28 +46,24 @@ struct btree_update {
struct closure cl;
struct bch_fs *c;
u64 start_time;
+ unsigned long ip_started;
struct list_head list;
struct list_head unwritten_list;
- /* What kind of update are we doing? */
- enum {
- BTREE_INTERIOR_NO_UPDATE,
- BTREE_INTERIOR_UPDATING_NODE,
- BTREE_INTERIOR_UPDATING_ROOT,
- BTREE_INTERIOR_UPDATING_AS,
- } mode;
-
+ enum btree_update_mode mode;
+ enum bch_watermark watermark;
unsigned nodes_written:1;
unsigned took_gc_lock:1;
enum btree_id btree_id;
- unsigned update_level;
+ unsigned update_level_start;
+ unsigned update_level_end;
struct disk_reservation disk_res;
/*
- * BTREE_INTERIOR_UPDATING_NODE:
+ * BTREE_UPDATE_node:
* The update that made the new nodes visible was a regular update to an
* existing interior node - @b. We can't write out the update to @b
* until the new nodes we created are finished writing, so we block @b
@@ -119,6 +129,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
+int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
+
int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
unsigned, unsigned, enum btree_node_sibling);
@@ -160,7 +172,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
struct bkey_i *, unsigned, bool);
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
-void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
+void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
struct btree *b)
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index ac7844861966..36a6f42aba5e 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -11,6 +11,7 @@
#include "journal_reclaim.h"
#include <linux/prefetch.h>
+#include <linux/sort.h>
static int bch2_btree_write_buffer_journal_flush(struct journal *,
struct journal_entry_pin *, u64);
@@ -46,6 +47,14 @@ static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_ke
#endif
}
+static int wb_key_seq_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->journal_seq, r->journal_seq);
+}
+
/* Compare excluding idx, the low 24 bits: */
static inline bool wb_key_eq(const void *_l, const void *_r)
{
@@ -307,6 +316,16 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
bch2_btree_node_unlock_write(trans, path, path->l[0].b);
write_locked = false;
+
+ ret = lockrestart_do(trans,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_foreground_maybe_merge(trans, iter.path, 0,
+ BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc));
+ if (ret)
+ goto err;
}
}
@@ -357,6 +376,11 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
*/
trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
+ sort(wb->flushing.keys.data,
+ wb->flushing.keys.nr,
+ sizeof(wb->flushing.keys.data[0]),
+ wb_key_seq_cmp, NULL);
+
darray_for_each(wb->flushing.keys, i) {
if (!i->journal_seq)
continue;
@@ -368,17 +392,17 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
ret = commit_do(trans, NULL, NULL,
BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_journal_res|
- BCH_TRANS_COMMIT_journal_reclaim,
+ BCH_TRANS_COMMIT_no_journal_res ,
btree_write_buffered_insert(trans, i));
if (ret)
goto err;
}
}
err:
- bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
+ bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
bch2_journal_pin_drop(j, &wb->flushing.pin);
wb->flushing.keys.nr = 0;
@@ -574,8 +598,6 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys
static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
{
struct journal_keys_to_wb dst;
- struct jset_entry *entry;
- struct bkey_i *k;
int ret = 0;
bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
@@ -590,7 +612,9 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu
entry->type = BCH_JSET_ENTRY_btree_keys;
}
+ spin_lock(&c->journal.lock);
buf->need_flush_to_write_buffer = false;
+ spin_unlock(&c->journal.lock);
out:
bch2_journal_keys_to_write_buffer_end(c, &dst);
return ret;
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 54f7826ac498..941401a210f5 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -525,6 +525,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
"different types of data in same bucket: %s, %s",
bch2_data_type_str(g->data_type),
bch2_data_type_str(data_type))) {
+ BUG();
ret = -EIO;
goto err;
}
@@ -628,6 +629,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
bch2_data_type_str(ptr_data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ BUG();
ret = -EIO;
goto err;
}
@@ -815,14 +817,14 @@ static int __mark_pointer(struct btree_trans *trans,
static int bch2_trigger_pointer(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
- s64 *sectors,
- unsigned flags)
+ const union bch_extent_entry *entry,
+ s64 *sectors, unsigned flags)
{
bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
struct bpos bucket;
struct bch_backpointer bp;
- bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
+ bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, entry, &bucket, &bp);
*sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
@@ -851,7 +853,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_GC) {
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
percpu_down_read(&c->mark_lock);
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
@@ -979,7 +981,7 @@ static int __trigger_extent(struct btree_trans *trans,
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
s64 disk_sectors;
- ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags);
+ ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
if (ret < 0)
return ret;
@@ -990,8 +992,8 @@ static int __trigger_extent(struct btree_trans *trans,
ret = !gc
? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors)
: update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true);
- bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors",
- __func__);
+ bch2_fs_fatal_err_on(ret && gc, c, "%s: no replicas entry while updating cached sectors",
+ bch2_err_str(ret));
if (ret)
return ret;
}
@@ -1020,7 +1022,7 @@ static int __trigger_extent(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
- bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
+ bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
printbuf_exit(&buf);
}
if (ret)
@@ -1053,7 +1055,8 @@ int bch2_trigger_extent(struct btree_trans *trans,
(int) bch2_bkey_needs_rebalance(c, old);
if (mod) {
- int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0);
+ int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
+ new.k->p, mod > 0);
if (ret)
return ret;
}
@@ -1335,7 +1338,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu)
struct bucket_gens *buckets =
container_of(rcu, struct bucket_gens, rcu);
- kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
+ kvfree(buckets);
}
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
@@ -1345,16 +1348,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bool resize = ca->bucket_gens != NULL;
int ret;
- if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
- GFP_KERNEL|__GFP_ZERO))) {
+ if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets,
+ GFP_KERNEL|__GFP_ZERO))) {
ret = -BCH_ERR_ENOMEM_bucket_gens;
goto err;
}
if ((c->opts.buckets_nouse &&
- !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
- sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO)))) {
+ !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) *
+ sizeof(unsigned long),
+ GFP_KERNEL|__GFP_ZERO)))) {
ret = -BCH_ERR_ENOMEM_buckets_nouse;
goto err;
}
@@ -1397,8 +1400,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
ret = 0;
err:
- kvpfree(buckets_nouse,
- BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+ kvfree(buckets_nouse);
if (bucket_gens)
call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
@@ -1407,27 +1409,21 @@ err:
void bch2_dev_buckets_free(struct bch_dev *ca)
{
- unsigned i;
-
- kvpfree(ca->buckets_nouse,
- BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
- kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
- sizeof(struct bucket_gens) + ca->mi.nbuckets);
+ kvfree(ca->buckets_nouse);
+ kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
- for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++)
free_percpu(ca->usage[i]);
kfree(ca->usage_base);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned i;
-
ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
if (!ca->usage_base)
return -BCH_ERR_ENOMEM_usage_init;
- for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+ for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) {
ca->usage[i] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[i])
return -BCH_ERR_ENOMEM_usage_init;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 6387e039f789..f9af5adabe83 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -226,6 +226,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma
fallthrough;
case BCH_WATERMARK_btree_copygc:
case BCH_WATERMARK_reclaim:
+ case BCH_WATERMARK_interior_updates:
break;
}
@@ -394,14 +395,6 @@ static inline const char *bch2_data_type_str(enum bch_data_type type)
: "(invalid data type)";
}
-static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type)
-{
- if (type < BCH_DATA_NR)
- prt_str(out, __bch2_data_types[type]);
- else
- prt_printf(out, "(invalid data type %u)", type);
-}
-
/* disk reservations: */
static inline void bch2_disk_reservation_put(struct bch_fs *c,
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 226b39c17667..72781aad6ba7 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -7,7 +7,7 @@
#include "chardev.h"
#include "journal.h"
#include "move.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
@@ -22,12 +22,6 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
-__must_check
-static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
-{
- return copy_to_user(to, from, n) ? -EFAULT : 0;
-}
-
/* returns with ref on ca->ref */
static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
unsigned flags)
@@ -140,39 +134,51 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
struct fsck_thread {
struct thread_with_stdio thr;
struct bch_fs *c;
- char **devs;
- size_t nr_devs;
struct bch_opts opts;
};
static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
{
struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
- if (thr->devs)
- for (size_t i = 0; i < thr->nr_devs; i++)
- kfree(thr->devs[i]);
- kfree(thr->devs);
kfree(thr);
}
-static int bch2_fsck_offline_thread_fn(void *arg)
+static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
- struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
- struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
+ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
+ struct bch_fs *c = thr->c;
- thr->thr.thr.ret = PTR_ERR_OR_ZERO(c);
- if (!thr->thr.thr.ret)
- bch2_fs_stop(c);
+ int ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
+ return ret;
- thread_with_stdio_done(&thr->thr);
- return 0;
+ ret = bch2_fs_start(thr->c);
+ if (ret)
+ goto err;
+
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
+ ret |= 1;
+ }
+ if (test_bit(BCH_FS_error, &c->flags)) {
+ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
+ ret |= 4;
+ }
+err:
+ bch2_fs_stop(c);
+ return ret;
}
+static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
+ .exit = bch2_fsck_thread_exit,
+ .fn = bch2_fsck_offline_thread_fn,
+};
+
static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
struct bch_ioctl_fsck_offline arg;
struct fsck_thread *thr = NULL;
- u64 *devs = NULL;
+ darray_str(devs) = {};
long ret = 0;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
@@ -184,29 +190,32 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) ||
- !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) ||
- !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) {
- ret = -ENOMEM;
- goto err;
- }
+ for (size_t i = 0; i < arg.nr_devs; i++) {
+ u64 dev_u64;
+ ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
+ if (ret)
+ goto err;
- thr->opts = bch2_opts_empty();
- thr->nr_devs = arg.nr_devs;
+ char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(dev_str);
+ if (ret)
+ goto err;
- if (copy_from_user(devs, &user_arg->devs[0],
- array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) {
- ret = -EINVAL;
- goto err;
+ ret = darray_push(&devs, dev_str);
+ if (ret) {
+ kfree(dev_str);
+ goto err;
+ }
}
- for (size_t i = 0; i < arg.nr_devs; i++) {
- thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX);
- ret = PTR_ERR_OR_ZERO(thr->devs[i]);
- if (ret)
- goto err;
+ thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+ if (!thr) {
+ ret = -ENOMEM;
+ goto err;
}
+ thr->opts = bch2_opts_empty();
+
if (arg.opts) {
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
@@ -220,17 +229,26 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
- ret = bch2_run_thread_with_stdio(&thr->thr,
- bch2_fsck_thread_exit,
- bch2_fsck_offline_thread_fn);
-err:
- if (ret < 0) {
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- pr_err("ret %s", bch2_err_str(ret));
- }
- kfree(devs);
+ /* We need request_key() to be called before we punt to kthread: */
+ opt_set(thr->opts, nostart, true);
+
+ thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
+
+ if (!IS_ERR(thr->c) &&
+ thr->c->opts.errors == BCH_ON_ERROR_panic)
+ thr->c->opts.errors = BCH_ON_ERROR_ro;
+
+ ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops);
+out:
+ darray_for_each(devs, i)
+ kfree(*i);
+ darray_exit(&devs);
return ret;
+err:
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ pr_err("ret %s", bch2_err_str(ret));
+ goto out;
}
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
@@ -763,9 +781,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
return ret;
}
-static int bch2_fsck_online_thread_fn(void *arg)
+static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
{
- struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
struct bch_fs *c = thr->c;
c->stdio_filter = current;
@@ -793,13 +811,16 @@ static int bch2_fsck_online_thread_fn(void *arg)
c->stdio_filter = NULL;
c->opts.fix_errors = old_fix_errors;
- thread_with_stdio_done(&thr->thr);
-
up(&c->online_fsck_mutex);
bch2_ro_ref_put(c);
- return 0;
+ return ret;
}
+static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
+ .exit = bch2_fsck_thread_exit,
+ .fn = bch2_fsck_online_thread_fn,
+};
+
static long bch2_ioctl_fsck_online(struct bch_fs *c,
struct bch_ioctl_fsck_online arg)
{
@@ -840,9 +861,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
goto err;
}
- ret = bch2_run_thread_with_stdio(&thr->thr,
- bch2_fsck_thread_exit,
- bch2_fsck_online_thread_fn);
+ ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
err:
if (ret < 0) {
bch_err_fn(c, ret);
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 3c761ad6b1c8..7ed779b411f6 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -429,15 +429,20 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
extent_nonce(version, crc_old), bio);
if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
- bch_err(c, "checksum error in %s() (memory corruption or bug?)\n"
- "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
- __func__,
- crc_old.csum.hi,
- crc_old.csum.lo,
- merged.hi,
- merged.lo,
- bch2_csum_types[crc_old.csum_type],
- bch2_csum_types[new_csum_type]);
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
+ "expected %0llx:%0llx got %0llx:%0llx (old type ",
+ __func__,
+ crc_old.csum.hi,
+ crc_old.csum.lo,
+ merged.hi,
+ merged.lo);
+ bch2_prt_csum_type(&buf, crc_old.csum_type);
+ prt_str(&buf, " new type ");
+ bch2_prt_csum_type(&buf, new_csum_type);
+ prt_str(&buf, ")");
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
return -EIO;
}
@@ -558,7 +563,7 @@ got_key:
return 0;
}
-#include "../crypto.h"
+#include "crypto.h"
#endif
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 1b8c2c1016dc..e40499fde9a4 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -61,11 +61,12 @@ static inline void bch2_csum_err_msg(struct printbuf *out,
struct bch_csum expected,
struct bch_csum got)
{
- prt_printf(out, "checksum error: got ");
+ prt_str(out, "checksum error, type ");
+ bch2_prt_csum_type(out, type);
+ prt_str(out, ": got ");
bch2_csum_to_text(out, type, got);
prt_str(out, " should be ");
bch2_csum_to_text(out, type, expected);
- prt_printf(out, " type %s", bch2_csum_types[type]);
}
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 33df8cf86bd8..1410365a8891 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -601,13 +601,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
return 0;
if (!mempool_initialized(&c->compression_bounce[READ]) &&
- mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
- 1, c->opts.encoded_extent_max))
+ mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
+ 1, c->opts.encoded_extent_max))
return -BCH_ERR_ENOMEM_compression_bounce_read_init;
if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
- mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
- 1, c->opts.encoded_extent_max))
+ mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
+ 1, c->opts.encoded_extent_max))
return -BCH_ERR_ENOMEM_compression_bounce_write_init;
for (i = compression_types;
@@ -622,15 +622,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
if (mempool_initialized(&c->compress_workspace[i->type]))
continue;
- if (mempool_init_kvpmalloc_pool(
+ if (mempool_init_kvmalloc_pool(
&c->compress_workspace[i->type],
1, i->compress_workspace))
return -BCH_ERR_ENOMEM_compression_workspace_init;
}
if (!mempool_initialized(&c->decompress_workspace) &&
- mempool_init_kvpmalloc_pool(&c->decompress_workspace,
- 1, decompress_workspace_size))
+ mempool_init_kvmalloc_pool(&c->decompress_workspace,
+ 1, decompress_workspace_size))
return -BCH_ERR_ENOMEM_decompression_workspace_init;
return 0;
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index 58c2eb45570f..607fd5e232c9 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -47,14 +47,6 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
}
-static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type)
-{
- if (type < BCH_COMPRESSION_TYPE_NR)
- prt_str(out, __bch2_compression_types[type]);
- else
- prt_printf(out, "(invalid compression type %u)", type);
-}
-
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 4150feca42a2..0022b51ce3c0 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -14,6 +14,7 @@
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
+#include "snapshot.h"
#include "subvolume.h"
#include "trace.h"
@@ -509,6 +510,14 @@ int bch2_data_update_init(struct btree_trans *trans,
unsigned ptrs_locked = 0;
int ret = 0;
+ /*
+ * If the fs is corrupt we may have a key for a snapshot node that doesn't
+ * exist, and we have to check for this because we go rw before repairing
+ * the snapshots table - just skip it, we can move it later.
+ */
+ if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot)))
+ return -BCH_ERR_data_update_done;
+
bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id;
@@ -571,8 +580,7 @@ int bch2_data_update_init(struct btree_trans *trans,
move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0)) ||
- (!atomic_read(&ctxt->read_sectors) &&
- !atomic_read(&ctxt->write_sectors)));
+ list_empty(&ctxt->ios));
if (!locked)
bch2_bucket_nocow_lock(&c->nocow_locks,
@@ -590,6 +598,8 @@ int bch2_data_update_init(struct btree_trans *trans,
i++;
}
+ unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+
/*
* If current extent durability is less than io_opts.data_replicas,
* we're not trying to rereplicate the extent up to data_replicas here -
@@ -599,7 +609,7 @@ int bch2_data_update_init(struct btree_trans *trans,
* rereplicate, currently, so that users don't get an unexpected -ENOSPC
*/
if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
- durability_have >= io_opts.data_replicas) {
+ !durability_required) {
m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
@@ -608,11 +618,18 @@ int bch2_data_update_init(struct btree_trans *trans,
goto done;
}
- m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
+ m->op.nr_replicas = min(durability_removing, durability_required) +
m->data_opts.extra_replicas;
- m->op.nr_replicas_required = m->op.nr_replicas;
- BUG_ON(!m->op.nr_replicas);
+ /*
+ * If device(s) were set to durability=0 after data was written to them
+ * we can end up with a durability=0 extent, and the normal algorithm
+ * that tries not to increase durability doesn't work:
+ */
+ if (!(durability_have + durability_removing))
+ m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
+
+ m->op.nr_replicas_required = m->op.nr_replicas;
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 7bdba8507fc9..cd99b7399414 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -13,6 +13,7 @@
#include "btree_iter.h"
#include "btree_locking.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "buckets.h"
#include "debug.h"
#include "error.h"
@@ -137,7 +138,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) {
- c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
+ c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!c->verify_ondisk)
goto out;
}
@@ -170,7 +171,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
+ bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf);
printbuf_exit(&buf);
}
out:
@@ -199,7 +200,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
- n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
+ n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!n_ondisk) {
prt_printf(out, "memory allocation failure\n");
goto out;
@@ -293,7 +294,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
out:
if (bio)
bio_put(bio);
- kvpfree(n_ondisk, btree_buf_bytes(b));
+ kvfree(n_ondisk);
percpu_ref_put(&ca->io_ref);
}
@@ -668,7 +669,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
i->size = size;
i->ret = 0;
- do {
+ while (1) {
err = flush_buf(i);
if (err)
return err;
@@ -676,9 +677,12 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
if (!i->size)
break;
+ if (done)
+ break;
+
done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
i->iter++;
- } while (!done);
+ }
if (i->buf.allocation_failure)
return -ENOMEM;
@@ -693,13 +697,45 @@ static const struct file_operations journal_pins_ops = {
.read = bch2_journal_pins_read,
};
+static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (!i->iter) {
+ bch2_btree_updates_to_text(&i->buf, c);
+ i->iter++;
+ }
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
+}
+
+static const struct file_operations btree_updates_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_btree_updates_read,
+};
+
static int btree_transaction_stats_open(struct inode *inode, struct file *file)
{
struct bch_fs *c = inode->i_private;
struct dump_iter *i;
i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
-
if (!i)
return -ENOMEM;
@@ -866,6 +902,20 @@ void bch2_fs_debug_exit(struct bch_fs *c)
debugfs_remove_recursive(c->fs_debug_dir);
}
+static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd)
+{
+ struct dentry *d;
+
+ d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir);
+
+ debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops);
+
+ debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops);
+
+ debugfs_create_file("bfloat-failed", 0400, d, bd,
+ &bfloat_failed_debug_ops);
+}
+
void bch2_fs_debug_init(struct bch_fs *c)
{
struct btree_debug *bd;
@@ -888,6 +938,9 @@ void bch2_fs_debug_init(struct bch_fs *c)
debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
c->btree_debug, &journal_pins_ops);
+ debugfs_create_file("btree_updates", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_updates_ops);
+
debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
c, &btree_transaction_stats_op);
@@ -902,21 +955,7 @@ void bch2_fs_debug_init(struct bch_fs *c)
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
- debugfs_create_file(bch2_btree_id_str(bd->id),
- 0400, c->btree_debug_dir, bd,
- &btree_debug_ops);
-
- snprintf(name, sizeof(name), "%s-formats",
- bch2_btree_id_str(bd->id));
-
- debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
- &btree_format_debug_ops);
-
- snprintf(name, sizeof(name), "%s-bfloat-failed",
- bch2_btree_id_str(bd->id));
-
- debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
- &bfloat_failed_debug_ops);
+ bch2_fs_debug_btree_init(c, bd);
}
}
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 4ae1e9f002a0..d37bd07afbfe 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -144,19 +144,21 @@ fsck_err:
return ret;
}
-void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr d_name = bch2_dirent_get_name(d);
- prt_printf(out, "%.*s -> %llu type %s",
- d_name.len,
- d_name.name,
- d.v->d_type != DT_SUBVOL
- ? le64_to_cpu(d.v->d_inum)
- : le32_to_cpu(d.v->d_child_subvol),
- bch2_d_type_str(d.v->d_type));
+ prt_printf(out, "%.*s -> ", d_name.len, d_name.name);
+
+ if (d.v->d_type != DT_SUBVOL)
+ prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum));
+ else
+ prt_printf(out, "%u -> %u",
+ le32_to_cpu(d.v->d_parent_subvol),
+ le32_to_cpu(d.v->d_child_subvol));
+
+ prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
}
static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
@@ -199,17 +201,17 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
}
int bch2_dirent_create_snapshot(struct btree_trans *trans,
- u64 dir, u32 snapshot,
+ u32 dir_subvol, u64 dir, u32 snapshot,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset,
bch_str_hash_flags_t str_hash_flags)
{
- subvol_inum zero_inum = { 0 };
+ subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir };
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum);
+ dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@@ -217,10 +219,10 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
dirent->k.p.inode = dir;
dirent->k.p.snapshot = snapshot;
- ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- zero_inum, snapshot,
- &dirent->k_i, str_hash_flags,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+ dir_inum, snapshot,
+ &dirent->k_i, str_hash_flags,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -291,12 +293,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
- unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
+ unsigned src_update_flags = 0;
+ bool delete_src, delete_dst;
int ret = 0;
- if (src_dir.subvol != dst_dir.subvol)
- return -EXDEV;
-
memset(src_inum, 0, sizeof(*src_inum));
memset(dst_inum, 0, sizeof(*dst_inum));
@@ -317,12 +317,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
if (ret)
goto out;
- src_type = bkey_s_c_to_dirent(old_src).v->d_type;
-
- if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE)
- return -EOPNOTSUPP;
-
-
/* Lookup dst: */
if (mode == BCH_RENAME) {
/*
@@ -350,11 +344,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
bkey_s_c_to_dirent(old_dst), dst_inum);
if (ret)
goto out;
-
- dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
-
- if (dst_type == DT_SUBVOL)
- return -EOPNOTSUPP;
}
if (mode != BCH_RENAME_EXCHANGE)
@@ -424,28 +413,55 @@ int bch2_dirent_rename(struct btree_trans *trans,
}
}
+ if (new_dst->v.d_type == DT_SUBVOL)
+ new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
+
+ if ((mode == BCH_RENAME_EXCHANGE) &&
+ new_src->v.d_type == DT_SUBVOL)
+ new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
+
ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
if (ret)
goto out;
out_set_src:
-
/*
- * If we're deleting a subvolume, we need to really delete the dirent,
- * not just emit a whiteout in the current snapshot:
+ * If we're deleting a subvolume we need to really delete the dirent,
+ * not just emit a whiteout in the current snapshot - there can only be a
+ * single dirent that points to a given subvolume.
+ *
+ * IOW, we don't maintain multiple versions in different snapshots of
+ * dirents that point to subvolumes - dirents that point to subvolumes
+ * are only visible in one particular subvolume so it's not necessary,
+ * and it would be particularly confusing for fsck to have to deal with.
*/
- if (src_type == DT_SUBVOL) {
- bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&src_iter);
+ delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
+ new_src->k.p.snapshot != old_src.k->p.snapshot;
+
+ delete_dst = old_dst.k &&
+ bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
+ new_dst->k.p.snapshot != old_dst.k->p.snapshot;
+
+ if (!delete_src || !bkey_deleted(&new_src->k)) {
+ ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
if (ret)
goto out;
+ }
- new_src->k.p = src_iter.pos;
- src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
+ if (delete_src) {
+ bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&src_iter) ?:
+ bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ if (ret)
+ goto out;
}
- ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
- if (ret)
- goto out;
+ if (delete_dst) {
+ bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&dst_iter) ?:
+ bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ if (ret)
+ goto out;
+ }
if (mode == BCH_RENAME_EXCHANGE)
*src_offset = new_src->k.p.offset;
@@ -456,41 +472,29 @@ out:
return ret;
}
-int __bch2_dirent_lookup_trans(struct btree_trans *trans,
- struct btree_iter *iter,
- subvol_inum dir,
- const struct bch_hash_info *hash_info,
- const struct qstr *name, subvol_inum *inum,
- unsigned flags)
+int bch2_dirent_lookup_trans(struct btree_trans *trans,
+ struct btree_iter *iter,
+ subvol_inum dir,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name, subvol_inum *inum,
+ unsigned flags)
{
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
- u32 snapshot;
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
- if (ret)
- return ret;
-
- ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
- hash_info, dir, name, flags);
+ int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+ hash_info, dir, name, flags);
if (ret)
return ret;
- k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
goto err;
- d = bkey_s_c_to_dirent(k);
-
- ret = bch2_dirent_read_target(trans, dir, d, inum);
+ ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
if (ret > 0)
ret = -ENOENT;
err:
if (ret)
bch2_trans_iter_exit(trans, iter);
-
return ret;
}
@@ -502,13 +506,13 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
struct btree_iter iter = { NULL };
int ret = lockrestart_do(trans,
- __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
+ bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
return ret;
}
-int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot)
+int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -518,7 +522,10 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot)
SPOS(dir, 0, snapshot),
POS(dir, U64_MAX), 0, k, ret)
if (k.k->type == KEY_TYPE_dirent) {
- ret = -ENOTEMPTY;
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
+ continue;
+ ret = -BCH_ERR_ENOTEMPTY_dir_not_empty;
break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -531,7 +538,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
u32 snapshot;
return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
- bch2_empty_dir_snapshot(trans, dir.inum, snapshot);
+ bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot);
}
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 21ffeb78f02e..bee55cca2aa0 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -35,7 +35,7 @@ static inline unsigned dirent_val_u64s(unsigned len)
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
struct bkey_s_c_dirent, subvol_inum *);
-int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32,
+int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
bch_str_hash_flags_t);
@@ -62,14 +62,14 @@ int bch2_dirent_rename(struct btree_trans *,
const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);
-int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
subvol_inum, const struct bch_hash_info *,
const struct qstr *, subvol_inum *, unsigned);
u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
const struct bch_hash_info *,
const struct qstr *, subvol_inum *);
-int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32);
+int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index d503af270024..556a217108d3 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -131,29 +131,33 @@ fsck_err:
void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
+ const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
+ struct bch_stripe s = {};
+
+ memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
+
+ unsigned nr_data = s.nr_blocks - s.nr_redundant;
+
+ prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
+ s.algorithm,
+ le16_to_cpu(s.sectors),
+ nr_data,
+ s.nr_redundant);
+ bch2_prt_csum_type(out, s.csum_type);
+ prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
+
+ for (unsigned i = 0; i < s.nr_blocks; i++) {
+ const struct bch_extent_ptr *ptr = sp->ptrs + i;
+
+ if ((void *) ptr >= bkey_val_end(k))
+ break;
+
+ bch2_extent_ptr_to_text(out, c, ptr);
- prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
- s->algorithm,
- le16_to_cpu(s->sectors),
- nr_data,
- s->nr_redundant,
- s->csum_type,
- 1U << s->csum_granularity_bits);
-
- for (i = 0; i < s->nr_blocks; i++) {
- const struct bch_extent_ptr *ptr = s->ptrs + i;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- u32 offset;
- u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
- prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
- if (i < nr_data)
- prt_printf(out, "#%u", stripe_blockcount_get(s, i));
- prt_printf(out, " gen %u", ptr->gen);
- if (ptr_stale(ca, ptr))
- prt_printf(out, " stale");
+ if (s.csum_type < BCH_CSUM_NR &&
+ i < nr_data &&
+ stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
+ prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
}
}
@@ -448,7 +452,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, new);
- bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+ bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
printbuf_exit(&buf);
return ret;
}
@@ -504,7 +508,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
unsigned i;
for (i = 0; i < s->v.nr_blocks; i++) {
- kvpfree(buf->data[i], buf->size << 9);
+ kvfree(buf->data[i]);
buf->data[i] = NULL;
}
}
@@ -531,7 +535,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
memset(buf->valid, 0xFF, sizeof(buf->valid));
for (i = 0; i < v->nr_blocks; i++) {
- buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+ buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
if (!buf->data[i])
goto err;
}
@@ -607,10 +611,8 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct printbuf err = PRINTBUF;
struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
- prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
- want.hi, want.lo,
- got.hi, got.lo,
- bch2_csum_types[v->csum_type]);
+ prt_str(&err, "stripe ");
+ bch2_csum_err_msg(&err, v->csum_type, want, got);
prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
bch_err_ratelimited(ca, "%s", err.buf);
@@ -1868,10 +1870,10 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
return -BCH_ERR_stripe_alloc_blocked;
ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
+ bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
+ "reading stripe key: %s", bch2_err_str(ret));
if (ret) {
bch2_stripe_close(c, h->s);
- if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret));
return ret;
}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index f4369b02e805..f042616888b0 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -32,6 +32,8 @@ static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
unsigned dev, unsigned csum_idx)
{
+ EBUG_ON(s->csum_type >= BCH_CSUM_NR);
+
unsigned csum_bytes = bch_crc_bytes[s->csum_type];
return sizeof(struct bch_stripe) +
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
index d260ff9bbfeb..43557bebd0f8 100644
--- a/fs/bcachefs/errcode.c
+++ b/fs/bcachefs/errcode.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "errcode.h"
+#include "trace.h"
#include <linux/errname.h>
@@ -49,15 +50,17 @@ bool __bch2_err_matches(int err, int class)
return err == class;
}
-int __bch2_err_class(int err)
+int __bch2_err_class(int bch_err)
{
- err = -err;
- BUG_ON((unsigned) err >= BCH_ERR_MAX);
+ int std_err = -bch_err;
+ BUG_ON((unsigned) std_err >= BCH_ERR_MAX);
- while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START])
- err = bch2_errcode_parents[err - BCH_ERR_START];
+ while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START])
+ std_err = bch2_errcode_parents[std_err - BCH_ERR_START];
+
+ trace_error_downcast(bch_err, std_err, _RET_IP_);
- return -err;
+ return -std_err;
}
const char *bch2_blk_status_to_str(blk_status_t status)
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 8c40c2067a04..01a79fa3eacb 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -5,6 +5,10 @@
#define BCH_ERRCODES() \
x(ERANGE, ERANGE_option_too_small) \
x(ERANGE, ERANGE_option_too_big) \
+ x(EINVAL, mount_option) \
+ x(BCH_ERR_mount_option, option_name) \
+ x(BCH_ERR_mount_option, option_value) \
+ x(BCH_ERR_mount_option, option_not_bool) \
x(ENOMEM, ENOMEM_stripe_buf) \
x(ENOMEM, ENOMEM_replicas_table) \
x(ENOMEM, ENOMEM_cpu_replicas) \
@@ -78,6 +82,7 @@
x(ENOMEM, ENOMEM_fs_name_alloc) \
x(ENOMEM, ENOMEM_fs_other_alloc) \
x(ENOMEM, ENOMEM_dev_alloc) \
+ x(ENOMEM, ENOMEM_disk_accounting) \
x(ENOSPC, ENOSPC_disk_reservation) \
x(ENOSPC, ENOSPC_bucket_alloc) \
x(ENOSPC, ENOSPC_disk_label_add) \
@@ -109,6 +114,8 @@
x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
x(ENOENT, ENOENT_dev_not_found) \
x(ENOENT, ENOENT_dev_idx_not_found) \
+ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
+ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
x(0, open_buckets_empty) \
x(0, freelist_empty) \
x(BCH_ERR_freelist_empty, no_buckets_found) \
@@ -176,6 +183,9 @@
x(EINVAL, invalid) \
x(EINVAL, internal_fsck_err) \
x(EINVAL, opt_parse_error) \
+ x(EINVAL, remove_with_metadata_missing_unimplemented)\
+ x(EINVAL, remove_would_lose_data) \
+ x(EINVAL, btree_iter_with_journal_not_supported) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
@@ -225,7 +235,10 @@
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
x(EIO, btree_node_read_err) \
x(EIO, sb_not_downgraded) \
- x(EIO, btree_write_all_failed) \
+ x(EIO, btree_node_write_all_failed) \
+ x(EIO, btree_node_read_error) \
+ x(EIO, btree_node_read_validate_error) \
+ x(EIO, btree_need_topology_repair) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
@@ -238,7 +251,9 @@
x(BCH_ERR_nopromote, nopromote_congested) \
x(BCH_ERR_nopromote, nopromote_in_flight) \
x(BCH_ERR_nopromote, nopromote_no_writes) \
- x(BCH_ERR_nopromote, nopromote_enomem)
+ x(BCH_ERR_nopromote, nopromote_enomem) \
+ x(0, need_inode_lock) \
+ x(0, invalid_snapshot_node)
enum bch_errcode {
BCH_ERR_START = 2048,
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index d32c8bebe46c..82a6656c941c 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "error.h"
+#include "journal.h"
+#include "recovery_passes.h"
#include "super.h"
#include "thread_with_file.h"
@@ -15,7 +17,8 @@ bool bch2_inconsistent_error(struct bch_fs *c)
return false;
case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "inconsistency detected - emergency read only");
+ bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
+ journal_cur_seq(&c->journal));
return true;
case BCH_ON_ERROR_panic:
panic(bch2_fmt(c, "panic after error"));
@@ -25,11 +28,16 @@ bool bch2_inconsistent_error(struct bch_fs *c)
}
}
-void bch2_topology_error(struct bch_fs *c)
+int bch2_topology_error(struct bch_fs *c)
{
set_bit(BCH_FS_topology_error, &c->flags);
- if (!test_bit(BCH_FS_fsck_running, &c->flags))
+ if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
bch2_inconsistent_error(c);
+ return -BCH_ERR_btree_need_topology_repair;
+ } else {
+ return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?:
+ -BCH_ERR_btree_node_read_validate_error;
+ }
}
void bch2_fatal_error(struct bch_fs *c)
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index fec17d1353d1..36caedf72d89 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -30,7 +30,13 @@ struct work_struct;
bool bch2_inconsistent_error(struct bch_fs *);
-void bch2_topology_error(struct bch_fs *);
+int bch2_topology_error(struct bch_fs *);
+
+#define bch2_fs_topology_error(c, ...) \
+({ \
+ bch_err(c, "btree topology error: " __VA_ARGS__); \
+ bch2_topology_error(c); \
+})
#define bch2_fs_inconsistent(c, ...) \
({ \
@@ -191,9 +197,9 @@ do { \
void bch2_fatal_error(struct bch_fs *);
-#define bch2_fs_fatal_error(c, ...) \
+#define bch2_fs_fatal_error(c, _msg, ...) \
do { \
- bch_err(c, __VA_ARGS__); \
+ bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__); \
bch2_fatal_error(c); \
} while (0)
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 61395b113df9..1a331e539204 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -189,13 +189,18 @@ int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
int ret = 0;
- bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err,
- btree_ptr_v2_val_too_big,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
+ c, err, btree_ptr_v2_val_too_big,
"value too big (%zu > %zu)",
bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
+ bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
+ c, err, btree_ptr_v2_min_key_bad,
+ "min_key > key");
+
ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
return ret;
@@ -973,6 +978,33 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
return bkey_deleted(k.k);
}
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
+{
+ struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ ? bch_dev_bkey_exists(c, ptr->dev)
+ : NULL;
+
+ if (!ca) {
+ prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ ptr->cached ? " cached" : "");
+ } else {
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ prt_printf(out, "ptr: %u:%llu:%u gen %u",
+ ptr->dev, b, offset, ptr->gen);
+ if (ptr->cached)
+ prt_str(out, " cached");
+ if (ptr->unwritten)
+ prt_str(out, " unwritten");
+ if (b >= ca->mi.first_bucket &&
+ b < ca->mi.nbuckets &&
+ ptr_stale(ca, ptr))
+ prt_printf(out, " stale");
+ }
+}
+
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
@@ -988,42 +1020,22 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr: {
- const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
- struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
- ? bch_dev_bkey_exists(c, ptr->dev)
- : NULL;
-
- if (!ca) {
- prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
- (u64) ptr->offset, ptr->gen,
- ptr->cached ? " cached" : "");
- } else {
- u32 offset;
- u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
- prt_printf(out, "ptr: %u:%llu:%u gen %u",
- ptr->dev, b, offset, ptr->gen);
- if (ptr->cached)
- prt_str(out, " cached");
- if (ptr->unwritten)
- prt_str(out, " unwritten");
- if (ca && ptr_stale(ca, ptr))
- prt_printf(out, " stale");
- }
+ case BCH_EXTENT_ENTRY_ptr:
+ bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
break;
- }
+
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128: {
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ",
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
crc.compressed_size,
crc.uncompressed_size,
- crc.offset, crc.nonce,
- bch2_csum_types[crc.csum_type]);
+ crc.offset, crc.nonce);
+ bch2_prt_csum_type(out, crc.csum_type);
+ prt_str(out, " compress ");
bch2_prt_compression_type(out, crc.compression_type);
break;
}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 6bf839d69e84..528e817eacbd 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -43,6 +43,11 @@ enum bkey_invalid_flags;
#define extent_entry_next(_entry) \
((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+#define extent_entry_next_safe(_entry, _end) \
+ (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \
+ ? extent_entry_next(_entry) \
+ : _end)
+
static inline unsigned
__extent_entry_type(const union bch_extent_entry *e)
{
@@ -103,17 +108,17 @@ static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *en
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
- return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+ return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
}
static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
{
- return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
+ return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
}
static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
{
- switch (extent_entry_type(e)) {
+ switch (__extent_entry_type(e)) {
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
@@ -280,7 +285,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \
for ((_entry) = (_start); \
(_entry) < (_end); \
- (_entry) = extent_entry_next(_entry))
+ (_entry) = extent_entry_next_safe(_entry, _end))
#define __bkey_ptr_next(_ptr, _end) \
({ \
@@ -318,7 +323,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
(_ptr).has_ec = false; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
- switch (extent_entry_type(_entry)) { \
+ switch (__extent_entry_type(_entry)) { \
case BCH_EXTENT_ENTRY_ptr: \
(_ptr).ptr = _entry->ptr; \
goto out; \
@@ -344,7 +349,7 @@ out: \
for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \
(_entry) = _start; \
__bkey_ptr_next_decode(_k, _end, _ptr, _entry); \
- (_entry) = extent_entry_next(_entry))
+ (_entry) = extent_entry_next_safe(_entry, _end))
#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \
__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
@@ -591,30 +596,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
return ret;
}
-static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- return BCH_DATA_btree;
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- return BCH_DATA_user;
- case KEY_TYPE_stripe: {
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- BUG_ON(ptr < s.v->ptrs ||
- ptr >= s.v->ptrs + s.v->nr_blocks);
-
- return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
- ? BCH_DATA_parity
- : BCH_DATA_user;
- }
- default:
- BUG();
- }
-}
-
unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
@@ -695,6 +676,7 @@ bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
new file mode 100644
index 000000000000..0f955c3c76a7
--- /dev/null
+++ b/fs/bcachefs/eytzinger.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "eytzinger.h"
+
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be aligned as well if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+ unsigned char lsbits = (unsigned char)size;
+
+ (void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+ return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory. This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, size_t n)
+{
+ do {
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+ } while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory. This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible. If they're not available, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI). Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, size_t n)
+{
+ do {
+#ifdef CONFIG_64BIT
+ u64 t = *(u64 *)(a + (n -= 8));
+ *(u64 *)(a + n) = *(u64 *)(b + n);
+ *(u64 *)(b + n) = t;
+#else
+ /* Use two 32-bit transfers to avoid base+index+4 addressing */
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+
+ t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+#endif
+ } while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, size_t n)
+{
+ do {
+ char t = ((char *)a)[--n];
+ ((char *)a)[n] = ((char *)b)[n];
+ ((char *)b)[n] = t;
+ } while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_r_func_t)0
+#define SWAP_WORDS_32 (swap_r_func_t)1
+#define SWAP_BYTES (swap_r_func_t)2
+#define SWAP_WRAPPER (swap_r_func_t)3
+
+struct wrapper {
+ cmp_func_t cmp;
+ swap_func_t swap_func;
+};
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
+{
+ if (swap_func == SWAP_WRAPPER) {
+ ((const struct wrapper *)priv)->swap_func(a, b, (int)size);
+ return;
+ }
+
+ if (swap_func == SWAP_WORDS_64)
+ swap_words_64(a, b, size);
+ else if (swap_func == SWAP_WORDS_32)
+ swap_words_32(a, b, size);
+ else if (swap_func == SWAP_BYTES)
+ swap_bytes(a, b, size);
+ else
+ swap_func(a, b, (int)size, priv);
+}
+
+#define _CMP_WRAPPER ((cmp_r_func_t)0L)
+
+static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
+{
+ if (cmp == _CMP_WRAPPER)
+ return ((const struct wrapper *)priv)->cmp(a, b);
+ return cmp(a, b, priv);
+}
+
+static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func, const void *priv,
+ size_t l, size_t r)
+{
+ return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ cmp_func, priv);
+}
+
+static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
+ swap_r_func_t swap_func, const void *priv,
+ size_t l, size_t r)
+{
+ do_swap(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ size, swap_func, priv);
+}
+
+void eytzinger0_sort_r(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv)
+{
+ int i, c, r;
+
+ /* called from 'sort' without swap function, let's pick the default */
+ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
+ swap_func = NULL;
+
+ if (!swap_func) {
+ if (is_aligned(base, size, 8))
+ swap_func = SWAP_WORDS_64;
+ else if (is_aligned(base, size, 4))
+ swap_func = SWAP_WORDS_32;
+ else
+ swap_func = SWAP_BYTES;
+ }
+
+ /* heapify */
+ for (i = n / 2 - 1; i >= 0; --i) {
+ for (r = i; r * 2 + 1 < n; r = c) {
+ c = r * 2 + 1;
+
+ if (c + 1 < n &&
+ eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
+ c++;
+
+ if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
+ break;
+
+ eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+ }
+ }
+
+ /* sort */
+ for (i = n - 1; i > 0; --i) {
+ eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
+
+ for (r = 0; r * 2 + 1 < i; r = c) {
+ c = r * 2 + 1;
+
+ if (c + 1 < i &&
+ eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
+ c++;
+
+ if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
+ break;
+
+ eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+ }
+ }
+}
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+ cmp_func_t cmp_func,
+ swap_func_t swap_func)
+{
+ struct wrapper w = {
+ .cmp = cmp_func,
+ .swap_func = swap_func,
+ };
+
+ return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
+}
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index b04750dbf870..24840aee335c 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -5,23 +5,33 @@
#include <linux/bitops.h>
#include <linux/log2.h>
-#include "util.h"
+#ifdef EYTZINGER_DEBUG
+#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
+#else
+#define EYTZINGER_BUG_ON(cond)
+#endif
/*
* Traversal for trees in eytzinger layout - a full binary tree layed out in an
- * array
- */
-
-/*
- * One based indexing version:
+ * array.
+ *
+ * Consider using an eytzinger tree any time you would otherwise be doing binary
+ * search over an array. Binary search is a worst case scenario for branch
+ * prediction and prefetching, but in an eytzinger tree every node's children
+ * are adjacent in memory, thus we can prefetch children before knowing the
+ * result of the comparison, assuming multiple nodes fit on a cacheline.
*
- * With one based indexing each level of the tree starts at a power of two -
- * good for cacheline alignment:
+ * Two variants are provided, for one based indexing and zero based indexing.
+ *
+ * Zero based indexing is more convenient, but one based indexing has better
+ * alignment and thus better performance because each new level of the tree
+ * starts at a power of two, and thus if element 0 was cacheline aligned, each
+ * new level will be as well.
*/
static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{
- EBUG_ON(child > 1);
+ EYTZINGER_BUG_ON(child > 1);
return (i << 1) + child;
}
@@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size)
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
- EBUG_ON(i > size);
+ EYTZINGER_BUG_ON(i > size);
if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i);
@@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
- EBUG_ON(i > size);
+ EYTZINGER_BUG_ON(i > size);
if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1;
@@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
unsigned shift = __fls(size) - b;
int s;
- EBUG_ON(!i || i > size);
+ EYTZINGER_BUG_ON(!i || i > size);
i ^= 1U << b;
i <<= 1;
@@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
unsigned shift;
int s;
- EBUG_ON(!i || i > size);
+ EYTZINGER_BUG_ON(!i || i > size);
/*
* sign bit trick:
@@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
static inline unsigned eytzinger0_child(unsigned i, unsigned child)
{
- EBUG_ON(child > 1);
+ EYTZINGER_BUG_ON(child > 1);
return (i << 1) + 1 + child;
}
@@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
-typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
-
/* return greatest node <= @search, or -1 if not found */
-static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
- eytzinger_cmp_fn cmp, const void *search)
+static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
{
unsigned i, n = 0;
@@ -244,21 +252,38 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
do {
i = n;
- n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+ n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
} while (n < nr);
if (n & 1) {
- /* @i was greater than @search, return previous node: */
-
- if (i == eytzinger0_first(nr))
- return -1;
-
+ /*
+ * @i was greater than @search, return previous node:
+ *
+ * if @i was leftmost/smallest element,
+ * eytzinger0_prev(eytzinger0_first()) returns -1, as expected
+ */
return eytzinger0_prev(i, nr);
} else {
return i;
}
}
+static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
+{
+ ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
+
+ /*
+ * if eytzinger0_find_le() returned -1 - no element was <= search - we
+ * want to return the first element; next/prev identities mean this works
+ * as expected
+ *
+ * similarly if find_le() returns last element, we should return -1;
+ * identities mean this all works out:
+ */
+ return eytzinger0_next(idx, nr);
+}
+
#define eytzinger0_find(base, nr, size, _cmp, search) \
({ \
void *_base = (base); \
@@ -269,13 +294,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
int _res; \
\
while (_i < _nr && \
- (_res = _cmp(_search, _base + _i * _size, _size))) \
+ (_res = _cmp(_search, _base + _i * _size))) \
_i = eytzinger0_child(_i, _res > 0); \
_i; \
})
-void eytzinger0_sort(void *, size_t, size_t,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t));
+void eytzinger0_sort_r(void *, size_t, size_t,
+ cmp_r_func_t, swap_r_func_t, const void *);
+void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
#endif /* _EYTZINGER_H */
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
index 66b945be10c2..d8153fe27037 100644
--- a/fs/bcachefs/fifo.h
+++ b/fs/bcachefs/fifo.h
@@ -24,12 +24,12 @@ struct { \
(fifo)->mask = (fifo)->size \
? roundup_pow_of_two((fifo)->size) - 1 \
: 0; \
- (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \
+ (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \
})
#define free_fifo(fifo) \
do { \
- kvpfree((fifo)->data, fifo_buf_size(fifo)); \
+ kvfree((fifo)->data); \
(fifo)->data = NULL; \
} while (0)
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 1c1ea0f0c692..624e6f963240 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -107,6 +107,7 @@ int bch2_create_trans(struct btree_trans *trans,
u32 new_subvol, dir_snapshot;
ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+ dir.subvol,
snapshot_src.subvol,
&new_subvol, &snapshot,
(flags & BCH_CREATE_SNAPSHOT_RO) != 0);
@@ -242,7 +243,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u,
const struct qstr *name,
- bool deleting_snapshot)
+ bool deleting_subvol)
{
struct bch_fs *c = trans->c;
struct btree_iter dir_iter = { NULL };
@@ -260,8 +261,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
dir_hash = bch2_hash_info_init(c, dir_u);
- ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
- name, &inum, BTREE_ITER_INTENT);
+ ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+ name, &inum, BTREE_ITER_INTENT);
if (ret)
goto err;
@@ -270,18 +271,25 @@ int bch2_unlink_trans(struct btree_trans *trans,
if (ret)
goto err;
- if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) {
+ if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) {
ret = bch2_empty_dir_trans(trans, inum);
if (ret)
goto err;
}
- if (deleting_snapshot && !inode_u->bi_subvol) {
+ if (deleting_subvol && !inode_u->bi_subvol) {
ret = -BCH_ERR_ENOENT_not_subvol;
goto err;
}
- if (deleting_snapshot || inode_u->bi_subvol) {
+ if (inode_u->bi_subvol) {
+ /* Recursive subvolume destroy not allowed (yet?) */
+ ret = bch2_subvol_has_children(trans, inode_u->bi_subvol);
+ if (ret)
+ goto err;
+ }
+
+ if (deleting_subvol || inode_u->bi_subvol) {
ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
if (ret)
goto err;
@@ -349,6 +357,22 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
return ret;
}
+static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent)
+{
+ struct btree_iter iter;
+ struct bkey_i_subvolume *s =
+ bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvol),
+ BTREE_ITER_CACHED, subvolume);
+ int ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ s->v.fs_path_parent = cpu_to_le32(new_parent);
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
int bch2_rename_trans(struct btree_trans *trans,
subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
@@ -410,6 +434,36 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
+ if (src_inode_u->bi_subvol &&
+ dst_dir.subvol != src_inode_u->bi_parent_subvol) {
+ ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol);
+ if (ret)
+ goto err;
+ }
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ dst_inode_u->bi_subvol &&
+ src_dir.subvol != dst_inode_u->bi_parent_subvol) {
+ ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol);
+ if (ret)
+ goto err;
+ }
+
+ /* Can't move across subvolumes, unless it's a subvolume root: */
+ if (src_dir.subvol != dst_dir.subvol &&
+ (!src_inode_u->bi_subvol ||
+ (dst_inum.inum && !dst_inode_u->bi_subvol))) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ if (src_inode_u->bi_parent_subvol)
+ src_inode_u->bi_parent_subvol = dst_dir.subvol;
+
+ if ((mode == BCH_RENAME_EXCHANGE) &&
+ dst_inode_u->bi_parent_subvol)
+ dst_inode_u->bi_parent_subvol = src_dir.subvol;
+
src_inode_u->bi_dir = dst_dir_u->bi_inum;
src_inode_u->bi_dir_offset = dst_offset;
@@ -432,10 +486,10 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
- if (S_ISDIR(dst_inode_u->bi_mode) &&
- bch2_empty_dir_trans(trans, dst_inum)) {
- ret = -ENOTEMPTY;
- goto err;
+ if (S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = bch2_empty_dir_trans(trans, dst_inum);
+ if (ret)
+ goto err;
}
}
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 27710cdd5710..39292e7ef342 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
static int __bch2_buffered_write(struct bch_inode_info *inode,
struct address_space *mapping,
struct iov_iter *iter,
- loff_t pos, unsigned len)
+ loff_t pos, unsigned len,
+ bool inode_locked)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation res;
@@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
BUG_ON(!fs.nr);
+ /*
+ * If we're not using the inode lock, we need to lock all the folios for
+ * atomicity of writes vs. other writes:
+ */
+ if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
+ ret = -BCH_ERR_need_inode_lock;
+ goto out;
+ }
+
f = darray_first(fs);
if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
ret = bch2_read_single_folio(f, mapping);
@@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
end = pos + copied;
spin_lock(&inode->v.i_lock);
- if (end > inode->v.i_size)
+ if (end > inode->v.i_size) {
+ BUG_ON(!inode_locked);
i_size_write(&inode->v, end);
+ }
spin_unlock(&inode->v.i_lock);
f_pos = pos;
@@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
- loff_t pos = iocb->ki_pos;
- ssize_t written = 0;
- int ret = 0;
+ loff_t pos;
+ bool inode_locked = false;
+ ssize_t written = 0, written2 = 0, ret = 0;
+
+ /*
+ * We don't take the inode lock unless i_size will be changing. Folio
+ * locks provide exclusion with other writes, and the pagecache add lock
+ * provides exclusion with truncate and hole punching.
+ *
+ * There is one nasty corner case where atomicity would be broken
+ * without great care: when copying data from userspace to the page
+ * cache, we do that with faults disabled - a page fault would recurse
+ * back into the filesystem, taking filesystem locks again, and
+ * deadlock; so it's done with faults disabled, and we fault in the user
+ * buffer when we aren't holding locks.
+ *
+ * If we do part of the write, but we then race and pages in the userspace
+ * buffer have been evicted and are no longer resident, then we have to
+ * drop our folio locks to re-fault them in, breaking write atomicity.
+ *
+ * To fix this, we restart the write from the start, if we weren't
+ * holding the inode lock.
+ *
+ * There is another wrinkle after that; if we restart the write from the
+ * start, and then get an unrecoverable error, we _cannot_ claim to
+ * userspace that we did not write data we actually did - so we must
+ * track (written2) the most we ever wrote.
+ */
+
+ if ((iocb->ki_flags & IOCB_APPEND) ||
+ (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
+ inode_lock(&inode->v);
+ inode_locked = true;
+ }
+
+ ret = generic_write_checks(iocb, iter);
+ if (ret <= 0)
+ goto unlock;
+
+ ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
+ if (ret) {
+ if (!inode_locked) {
+ inode_lock(&inode->v);
+ inode_locked = true;
+ ret = file_remove_privs_flags(file, 0);
+ }
+ if (ret)
+ goto unlock;
+ }
+
+ ret = file_update_time(file);
+ if (ret)
+ goto unlock;
+
+ pos = iocb->ki_pos;
bch2_pagecache_add_get(inode);
+ if (!inode_locked &&
+ (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
+ goto get_inode_lock;
+
do {
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned bytes = iov_iter_count(iter);
@@ -1004,12 +1072,17 @@ again:
}
}
+ if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
+ goto get_inode_lock;
+
if (unlikely(fatal_signal_pending(current))) {
ret = -EINTR;
break;
}
- ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
+ if (ret == -BCH_ERR_need_inode_lock)
+ goto get_inode_lock;
if (unlikely(ret < 0))
break;
@@ -1030,50 +1103,46 @@ again:
}
pos += ret;
written += ret;
+ written2 = max(written, written2);
+
+ if (ret != bytes && !inode_locked)
+ goto get_inode_lock;
ret = 0;
balance_dirty_pages_ratelimited(mapping);
- } while (iov_iter_count(iter));
+ if (0) {
+get_inode_lock:
+ bch2_pagecache_add_put(inode);
+ inode_lock(&inode->v);
+ inode_locked = true;
+ bch2_pagecache_add_get(inode);
+
+ iov_iter_revert(iter, written);
+ pos -= written;
+ written = 0;
+ ret = 0;
+ }
+ } while (iov_iter_count(iter));
bch2_pagecache_add_put(inode);
+unlock:
+ if (inode_locked)
+ inode_unlock(&inode->v);
+
+ iocb->ki_pos += written;
- return written ? written : ret;
+ ret = max(written, written2) ?: ret;
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
}
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct file *file = iocb->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- ssize_t ret;
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- ret = bch2_direct_write(iocb, from);
- goto out;
- }
-
- inode_lock(&inode->v);
-
- ret = generic_write_checks(iocb, from);
- if (ret <= 0)
- goto unlock;
-
- ret = file_remove_privs(file);
- if (ret)
- goto unlock;
-
- ret = file_update_time(file);
- if (ret)
- goto unlock;
-
- ret = bch2_buffered_write(iocb, from);
- if (likely(ret > 0))
- iocb->ki_pos += ret;
-unlock:
- inode_unlock(&inode->v);
+ ssize_t ret = iocb->ki_flags & IOCB_DIRECT
+ ? bch2_direct_write(iocb, iter)
+ : bch2_buffered_write(iocb, iter);
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
-out:
return bch2_err_class(ret);
}
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 33cb6da3a5ad..b889370a5088 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -387,6 +387,8 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
ret = dio->op.error ?: ((long) dio->written << 9);
bio_put(&dio->op.wbio.bio);
+ bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write);
+
/* inode->i_dio_count is our ref on inode and thus bch_fs */
inode_dio_end(&inode->v);
@@ -536,7 +538,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
if (likely(!dio->iter.count) || dio->op.error)
break;
- bio_reset(bio, NULL, REQ_OP_WRITE);
+ bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
}
out:
return bch2_dio_write_done(dio);
@@ -590,22 +592,25 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
prefetch(&inode->ei_inode);
prefetch((void *) &inode->ei_inode + 64);
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
+ return -EROFS;
+
inode_lock(&inode->v);
ret = generic_write_checks(req, iter);
if (unlikely(ret <= 0))
- goto err;
+ goto err_put_write_ref;
ret = file_remove_privs(file);
if (unlikely(ret))
- goto err;
+ goto err_put_write_ref;
ret = file_update_time(file);
if (unlikely(ret))
- goto err;
+ goto err_put_write_ref;
if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
- goto err;
+ goto err_put_write_ref;
inode_dio_begin(&inode->v);
bch2_pagecache_block_get(inode);
@@ -618,7 +623,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
bio = bio_alloc_bioset(NULL,
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_WRITE,
+ REQ_OP_WRITE | REQ_SYNC | REQ_IDLE,
GFP_KERNEL,
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, op.wbio.bio);
@@ -645,7 +650,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
}
ret = bch2_dio_write_loop(dio);
-err:
+out:
if (locked)
inode_unlock(&inode->v);
return ret;
@@ -653,7 +658,9 @@ err_put_bio:
bch2_pagecache_block_put(inode);
bio_put(bio);
inode_dio_end(&inode->v);
- goto err;
+err_put_write_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+ goto out;
}
void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index 8cbaba6565b4..828c3d7c8f19 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -51,13 +51,10 @@ enum bch_folio_sector_state {
struct bch_folio_sector {
/* Uncompressed, fully allocated replicas (or on disk reservation): */
- unsigned nr_replicas:4;
-
+ u8 nr_replicas:4,
/* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
- unsigned replicas_reserved:4;
-
- /* i_sectors: */
- enum bch_folio_sector_state state:8;
+ replicas_reserved:4;
+ u8 state;
};
struct bch_folio {
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 8c70123b6a0c..20b40477425f 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -174,18 +174,18 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
static int bch2_flush_inode(struct bch_fs *c,
struct bch_inode_info *inode)
{
- struct bch_inode_unpacked u;
- int ret;
-
if (c->opts.journal_flush_disabled)
return 0;
- ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u);
- if (ret)
- return ret;
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
+ return -EROFS;
- return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
- bch2_inode_flush_nocow_writes(c, inode);
+ struct bch_inode_unpacked u;
+ int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?:
+ bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
+ bch2_inode_flush_nocow_writes(c, inode);
+ bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
+ return ret;
}
int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 77ae65542db9..b5ea9fa1259d 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -108,7 +108,8 @@ retry:
goto retry;
bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
- "inode %u:%llu not found when updating",
+ "%s: inode %u:%llu not found when updating",
+ bch2_err_str(ret),
inode_inum(inode).subvol,
inode_inum(inode).inum);
@@ -176,45 +177,88 @@ static unsigned bch2_inode_hash(subvol_inum inum)
return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
}
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
{
- struct bch_inode_unpacked inode_u;
- struct bch_inode_info *inode;
- struct btree_trans *trans;
- struct bch_subvolume subvol;
- int ret;
+ subvol_inum inum = inode_inum(inode);
+ struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ bch2_iget5_set,
+ &inum));
+ BUG_ON(!old);
- inode = to_bch_ei(iget5_locked(c->vfs_sb,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- bch2_iget5_set,
- &inum));
- if (unlikely(!inode))
- return ERR_PTR(-ENOMEM);
- if (!(inode->v.i_state & I_NEW))
- return &inode->v;
+ if (unlikely(old != inode)) {
+ discard_new_inode(&inode->v);
+ inode = old;
+ } else {
+ mutex_lock(&c->vfs_inodes_lock);
+ list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+ mutex_unlock(&c->vfs_inodes_lock);
+ /*
+ * we really don't want insert_inode_locked2() to be setting
+ * I_NEW...
+ */
+ unlock_new_inode(&inode->v);
+ }
- trans = bch2_trans_get(c);
- ret = lockrestart_do(trans,
- bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
- bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
+ return inode;
+}
- if (!ret)
- bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
- bch2_trans_put(trans);
+#define memalloc_flags_do(_flags, _do) \
+({ \
+ unsigned _saved_flags = memalloc_flags_save(_flags); \
+ typeof(_do) _ret = _do; \
+ memalloc_noreclaim_restore(_saved_flags); \
+ _ret; \
+})
- if (ret) {
- iget_failed(&inode->v);
- return ERR_PTR(bch2_err_class(ret));
+/*
+ * Allocate a new inode, dropping/retaking btree locks if necessary:
+ */
+static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+
+ struct bch_inode_info *inode =
+ memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
+ to_bch_ei(new_inode(c->vfs_sb)));
+
+ if (unlikely(!inode)) {
+ int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
+ if (ret && inode)
+ discard_new_inode(&inode->v);
+ if (ret)
+ return ERR_PTR(ret);
}
- mutex_lock(&c->vfs_inodes_lock);
- list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
- mutex_unlock(&c->vfs_inodes_lock);
+ return inode;
+}
- unlock_new_inode(&inode->v);
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+{
+ struct bch_inode_info *inode =
+ to_bch_ei(ilookup5_nowait(c->vfs_sb,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ &inum));
+ if (inode)
+ return &inode->v;
- return &inode->v;
+ struct btree_trans *trans = bch2_trans_get(c);
+
+ struct bch_inode_unpacked inode_u;
+ struct bch_subvolume subvol;
+ int ret = lockrestart_do(trans,
+ bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+ bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
+ PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+ if (!ret) {
+ bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+ inode = bch2_inode_insert(c, inode);
+ }
+ bch2_trans_put(trans);
+
+ return ret ? ERR_PTR(ret) : &inode->v;
}
struct bch_inode_info *
@@ -226,7 +270,7 @@ __bch2_create(struct mnt_idmap *idmap,
struct bch_fs *c = dir->v.i_sb->s_fs_info;
struct btree_trans *trans;
struct bch_inode_unpacked dir_u;
- struct bch_inode_info *inode, *old;
+ struct bch_inode_info *inode;
struct bch_inode_unpacked inode_u;
struct posix_acl *default_acl = NULL, *acl = NULL;
subvol_inum inum;
@@ -293,7 +337,6 @@ err_before_quota:
mutex_unlock(&dir->ei_update_lock);
}
- bch2_iget5_set(&inode->v, &inum);
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@@ -304,36 +347,7 @@ err_before_quota:
* bch2_trans_exit() and dropping locks, else we could race with another
* thread pulling the inode in and modifying it:
*/
-
- inode->v.i_state |= I_CREATING;
-
- old = to_bch_ei(inode_insert5(&inode->v,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- bch2_iget5_set,
- &inum));
- BUG_ON(!old);
-
- if (unlikely(old != inode)) {
- /*
- * We raced, another process pulled the new inode into cache
- * before us:
- */
- make_bad_inode(&inode->v);
- iput(&inode->v);
-
- inode = old;
- } else {
- mutex_lock(&c->vfs_inodes_lock);
- list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
- mutex_unlock(&c->vfs_inodes_lock);
- /*
- * we really don't want insert_inode_locked2() to be setting
- * I_NEW...
- */
- unlock_new_inode(&inode->v);
- }
-
+ inode = bch2_inode_insert(c, inode);
bch2_trans_put(trans);
err:
posix_acl_release(default_acl);
@@ -352,23 +366,78 @@ err_trans:
/* methods */
+static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
+ subvol_inum dir, struct bch_hash_info *dir_hash_info,
+ const struct qstr *name)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dirent_iter = {};
+ subvol_inum inum = {};
+
+ int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
+ dir_hash_info, dir, name, 0);
+ if (ret)
+ return ERR_PTR(ret);
+
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret)
+ goto err;
+
+ struct bch_inode_info *inode =
+ to_bch_ei(ilookup5_nowait(c->vfs_sb,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ &inum));
+ if (inode)
+ goto out;
+
+ struct bch_subvolume subvol;
+ struct bch_inode_unpacked inode_u;
+ ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+ bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
+ PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+ if (bch2_err_matches(ret, ENOENT)) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err(c, "%s points to missing inode", buf.buf);
+ printbuf_exit(&buf);
+ }
+ if (ret)
+ goto err;
+
+ bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+ inode = bch2_inode_insert(c, inode);
+out:
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ return inode;
+err:
+ inode = ERR_PTR(ret);
+ goto out;
+}
+
static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
unsigned int flags)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
- struct inode *vinode = NULL;
- subvol_inum inum = { .subvol = 1 };
- int ret;
-
- ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
- &dentry->d_name, &inum);
- if (!ret)
- vinode = bch2_vfs_inode_get(c, inum);
+ struct bch_inode_info *inode;
+ bch2_trans_do(c, NULL, NULL, 0,
+ PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
+ &hash, &dentry->d_name)));
+ if (IS_ERR(inode))
+ inode = NULL;
- return d_splice_alias(vinode, dentry);
+ return d_splice_alias(&inode->v, dentry);
}
static int bch2_mknod(struct mnt_idmap *idmap,
@@ -1372,6 +1441,7 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *bi,
struct bch_subvolume *subvol)
{
+ bch2_iget5_set(&inode->v, &inum);
bch2_inode_update_after_write(trans, inode, bi, ~0);
if (BCH_SUBVOLUME_SNAP(subvol))
@@ -1572,7 +1642,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
* number:
*/
u64 avail_inodes = ((usage.capacity - usage.used) << 3);
- u64 fsid;
buf->f_type = BCACHEFS_STATFS_MAGIC;
buf->f_bsize = sb->s_blocksize;
@@ -1583,10 +1652,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = usage.nr_inodes + avail_inodes;
buf->f_ffree = avail_inodes;
- fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
- le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
- buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
- buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b);
buf->f_namelen = BCH_NAME_MAX;
return 0;
@@ -1805,8 +1871,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
ret = bch2_parse_mount_opts(NULL, &opts, data);
- if (ret)
+ if (ret) {
+ ret = bch2_err_class(ret);
return ERR_PTR(ret);
+ }
if (!dev_name || strlen(dev_name) == 0)
return ERR_PTR(-EINVAL);
@@ -1882,6 +1950,7 @@ got_sb:
sb->s_time_gran = c->sb.nsec_per_time_unit;
sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
+ sb->s_uuid = c->sb.user_uuid;
c->vfs_sb = sb;
strscpy(sb->s_id, c->name, sizeof(sb->s_id));
@@ -1928,6 +1997,7 @@ out:
return dget(sb->s_root);
err_put_super:
+ __bch2_fs_stop(c);
deactivate_locked_super(sb);
return ERR_PTR(bch2_err_class(ret));
}
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 6a760777bafb..8e2010212cc3 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -12,7 +12,7 @@
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "snapshot.h"
#include "super.h"
#include "xattr.h"
@@ -63,9 +63,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
u32 *snapshot, u64 *inum)
{
struct bch_subvolume s;
- int ret;
-
- ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+ int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
*snapshot = le32_to_cpu(s.snapshot);
*inum = le64_to_cpu(s.inode);
@@ -100,8 +98,8 @@ err:
}
static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode,
- u32 *snapshot)
+ struct bch_inode_unpacked *inode,
+ u32 *snapshot)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -142,34 +140,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans,
return 0;
}
-static int __write_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
-{
- struct bkey_inode_buf *inode_p =
- bch2_trans_kmalloc(trans, sizeof(*inode_p));
-
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
-
- bch2_inode_pack(inode_p, inode);
- inode_p->inode.k.p.snapshot = snapshot;
-
- return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
- &inode_p->inode.k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-}
-
-static int fsck_write_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
-{
- int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __write_inode(trans, inode, snapshot));
- bch_err_fn(trans->c, ret);
- return ret;
-}
-
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
{
struct bch_fs *c = trans->c;
@@ -186,9 +156,10 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
- ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash_info, &iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch2_trans_iter_exit(trans, &iter);
err:
bch_err_fn(c, ret);
@@ -197,7 +168,8 @@ err:
/* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
- struct bch_inode_unpacked *lostfound)
+ struct bch_inode_unpacked *lostfound,
+ u64 reattaching_inum)
{
struct bch_fs *c = trans->c;
struct qstr lostfound_str = QSTR("lost+found");
@@ -212,19 +184,36 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
return ret;
subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
- u32 subvol_snapshot;
- ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol),
- &subvol_snapshot, &root_inum.inum);
- bch_err_msg(c, ret, "looking up root subvol");
+ struct bch_subvolume subvol;
+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
+ false, 0, &subvol);
+ bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
+ le32_to_cpu(st.master_subvol), snapshot);
if (ret)
return ret;
+ if (!subvol.inode) {
+ struct btree_iter iter;
+ struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
+ 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(subvol);
+ if (ret)
+ return ret;
+
+ subvol->v.inode = cpu_to_le64(reattaching_inum);
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ root_inum.inum = le64_to_cpu(subvol.inode);
+
struct bch_inode_unpacked root_inode;
struct bch_hash_info root_hash_info;
u32 root_inode_snapshot = snapshot;
ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
- bch_err_msg(c, ret, "looking up root inode");
+ bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
+ root_inum.inum, le32_to_cpu(st.master_subvol));
if (ret)
return ret;
@@ -280,7 +269,7 @@ create_lostfound:
goto err;
ret = bch2_dirent_create_snapshot(trans,
- root_inode.bi_inum, snapshot, &root_hash_info,
+ 0, root_inode.bi_inum, snapshot, &root_hash_info,
mode_to_type(lostfound->bi_mode),
&lostfound_str,
lostfound->bi_inum,
@@ -303,30 +292,47 @@ static int reattach_inode(struct btree_trans *trans,
char name_buf[20];
struct qstr name;
u64 dir_offset = 0;
+ u32 dirent_snapshot = inode_snapshot;
int ret;
- ret = lookup_lostfound(trans, inode_snapshot, &lostfound);
+ if (inode->bi_subvol) {
+ inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ u64 root_inum;
+ ret = subvol_lookup(trans, inode->bi_parent_subvol,
+ &dirent_snapshot, &root_inum);
+ if (ret)
+ return ret;
+
+ snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
+ } else {
+ snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
+ }
+
+ ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
if (ret)
return ret;
if (S_ISDIR(inode->bi_mode)) {
lostfound.bi_nlink++;
- ret = __write_inode(trans, &lostfound, U32_MAX);
+ ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
if (ret)
return ret;
}
dir_hash = bch2_hash_info_init(trans->c, &lostfound);
- snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
name = (struct qstr) QSTR(name_buf);
ret = bch2_dirent_create_snapshot(trans,
- lostfound.bi_inum, inode_snapshot,
+ inode->bi_parent_subvol, lostfound.bi_inum,
+ dirent_snapshot,
&dir_hash,
inode_d_type(inode),
- &name, inode->bi_inum, &dir_offset,
+ &name,
+ inode->bi_subvol ?: inode->bi_inum,
+ &dir_offset,
BCH_HASH_SET_MUST_CREATE);
if (ret)
return ret;
@@ -334,7 +340,7 @@ static int reattach_inode(struct btree_trans *trans,
inode->bi_dir = lostfound.bi_inum;
inode->bi_dir_offset = dir_offset;
- return __write_inode(trans, inode, inode_snapshot);
+ return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
}
static int remove_backpointer(struct btree_trans *trans,
@@ -353,6 +359,133 @@ static int remove_backpointer(struct btree_trans *trans,
return ret;
}
+static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
+{
+ struct bch_fs *c = trans->c;
+
+ struct bch_inode_unpacked inode;
+ int ret = bch2_inode_find_by_inum_trans(trans,
+ (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+ &inode);
+ if (ret)
+ return ret;
+
+ ret = remove_backpointer(trans, &inode);
+ bch_err_msg(c, ret, "removing dirent");
+ if (ret)
+ return ret;
+
+ ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
+ bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
+ return ret;
+}
+
+static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
+{
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_snapshot_is_leaf(c, snapshotid)) {
+ bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
+ /*
+ * If inum isn't set, that means we're being called from check_dirents,
+ * not check_inodes - the root of this subvolume doesn't exist or we
+ * would have found it there:
+ */
+ if (!inum) {
+ struct btree_iter inode_iter = {};
+ struct bch_inode_unpacked new_inode;
+ u64 cpu = raw_smp_processor_id();
+
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
+
+ new_inode.bi_subvol = subvolid;
+
+ int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
+ bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_inode_write(trans, &inode_iter, &new_inode);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ if (ret)
+ return ret;
+
+ inum = new_inode.bi_inum;
+ }
+
+ bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
+
+ struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
+ int ret = PTR_ERR_OR_ZERO(new_subvol);
+ if (ret)
+ return ret;
+
+ bkey_subvolume_init(&new_subvol->k_i);
+ new_subvol->k.p.offset = subvolid;
+ new_subvol->v.snapshot = cpu_to_le32(snapshotid);
+ new_subvol->v.inode = cpu_to_le64(inum);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
+ if (ret)
+ return ret;
+
+ struct btree_iter iter;
+ struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshots, POS(0, snapshotid),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
+ if (ret)
+ return ret;
+
+ u32 snapshot_tree = le32_to_cpu(s->v.tree);
+
+ s->v.subvol = cpu_to_le32(subvolid);
+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
+ bch2_trans_iter_exit(trans, &iter);
+
+ struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
+ 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(st);
+ bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
+ if (ret)
+ return ret;
+
+ if (!st->v.master_subvol)
+ st->v.master_subvol = cpu_to_le32(subvolid);
+
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
+static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked new_inode;
+
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL);
+ new_inode.bi_size = size;
+ new_inode.bi_inum = inum;
+
+ return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
+}
+
+static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum)
+{
+ struct btree_iter iter = {};
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG);
+}
+
struct snapshots_seen_entry {
u32 id;
u32 equiv;
@@ -592,13 +725,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
}
static struct inode_walker_entry *
-lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
- u32 snapshot, bool is_whiteout)
+lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
{
- struct inode_walker_entry *i;
-
- snapshot = bch2_snapshot_equiv(c, snapshot);
+ bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
+ u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
+ struct inode_walker_entry *i;
__darray_for_each(w->inodes, i)
if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
goto found;
@@ -609,20 +741,24 @@ found:
if (snapshot != i->snapshot && !is_whiteout) {
struct inode_walker_entry new = *i;
- size_t pos;
- int ret;
new.snapshot = snapshot;
new.count = 0;
- bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
- w->last_pos.inode, snapshot, i->snapshot);
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
+ "unexpected because we should always update the inode when we update a key in that inode\n"
+ "%s",
+ w->last_pos.inode, snapshot, i->snapshot, buf.buf);
+ printbuf_exit(&buf);
while (i > w->inodes.data && i[-1].snapshot > snapshot)
--i;
- pos = i - w->inodes.data;
- ret = darray_insert_item(&w->inodes, pos, new);
+ size_t pos = i - w->inodes.data;
+ int ret = darray_insert_item(&w->inodes, pos, new);
if (ret)
return ERR_PTR(ret);
@@ -633,21 +769,21 @@ found:
}
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
- struct inode_walker *w, struct bpos pos,
- bool is_whiteout)
+ struct inode_walker *w,
+ struct bkey_s_c k)
{
- if (w->last_pos.inode != pos.inode) {
- int ret = get_inodes_all_snapshots(trans, w, pos.inode);
+ if (w->last_pos.inode != k.k->p.inode) {
+ int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
if (ret)
return ERR_PTR(ret);
- } else if (bkey_cmp(w->last_pos, pos)) {
+ } else if (bkey_cmp(w->last_pos, k.k->p)) {
darray_for_each(w->inodes, i)
i->seen_this_pos = false;
}
- w->last_pos = pos;
+ w->last_pos = k.k->p;
- return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout);
+ return lookup_inode_for_snapshot(trans->c, w, k);
}
static int __get_visible_inodes(struct btree_trans *trans,
@@ -722,7 +858,7 @@ static int hash_redo_key(struct btree_trans *trans,
delete->k.p = k_iter->pos;
return bch2_btree_iter_traverse(k_iter) ?:
bch2_trans_update(trans, k_iter, delete, 0) ?:
- bch2_hash_set_snapshot(trans, desc, hash_info,
+ bch2_hash_set_in_snapshot(trans, desc, hash_info,
(subvol_inum) { 0, k.k->p.inode },
k.k->p.snapshot, tmp,
BCH_HASH_SET_MUST_CREATE,
@@ -795,16 +931,93 @@ fsck_err:
goto out;
}
+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos)
+{
+ return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
+}
+
+static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ u32 *snapshot)
+{
+ if (inode->bi_subvol) {
+ u64 inum;
+ int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
+ if (ret)
+ return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
+ }
+
+ return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
+}
+
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
+static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *inode)
+{
+ return d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
+ : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
+}
+
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
struct btree_iter iter;
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
- int ret = bkey_err(k);
- if (ret)
+ int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
+ struct bch_inode_unpacked *inode,
+ u32 inode_snapshot, bool *write_inode)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ struct btree_iter dirent_iter = {};
+ struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
+ int ret = bkey_err(d);
+ if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- bch2_trans_iter_exit(trans, &iter);
- return k.k->type == KEY_TYPE_set;
+ if (fsck_err_on(ret,
+ c, inode_points_to_missing_dirent,
+ "inode points to missing dirent\n%s",
+ (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
+ fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
+ c, inode_points_to_wrong_dirent,
+ "inode points to dirent that does not point back:\n%s",
+ (bch2_bkey_val_to_text(&buf, c, inode_k),
+ prt_newline(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ /*
+ * We just clear the backpointer fields for now. If we find a
+ * dirent that points to this inode in check_dirents(), we'll
+ * update it then; then when we get to check_path() if the
+ * backpointer is still 0 we'll reattach it.
+ */
+ inode->bi_dir = 0;
+ inode->bi_dir_offset = 0;
+ inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
+ *write_inode = true;
+ }
+
+ ret = 0;
+fsck_err:
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
}
static int check_inode(struct btree_trans *trans,
@@ -861,7 +1074,8 @@ static int check_inode(struct btree_trans *trans,
u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
- ret = __write_inode(trans, &u, iter->pos.snapshot);
+ ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
+
bch_err_msg(c, ret, "in fsck updating inode");
if (ret)
return ret;
@@ -876,7 +1090,7 @@ static int check_inode(struct btree_trans *trans,
if (ret < 0)
return ret;
- fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list,
+ fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
"inode %llu:%u unlinked, but not on deleted list",
u.bi_inum, k.k->p.snapshot);
ret = 0;
@@ -950,8 +1164,54 @@ static int check_inode(struct btree_trans *trans,
do_update = true;
}
+ if (u.bi_dir || u.bi_dir_offset) {
+ ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
+ if (ret)
+ goto err;
+ }
+
+ if (fsck_err_on(u.bi_parent_subvol &&
+ (u.bi_subvol == 0 ||
+ u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
+ c, inode_bi_parent_nonzero,
+ "inode %llu:%u has subvol %u but nonzero parent subvol %u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
+ u.bi_parent_subvol = 0;
+ do_update = true;
+ }
+
+ if (u.bi_subvol) {
+ struct bch_subvolume s;
+
+ ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+ ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
+ goto do_update;
+ }
+
+ if (fsck_err_on(ret,
+ c, inode_bi_subvol_missing,
+ "inode %llu:%u bi_subvol points to missing subvolume %u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
+ fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
+ !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
+ k.k->p.snapshot),
+ c, inode_bi_subvol_wrong,
+ "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol,
+ le64_to_cpu(s.inode),
+ le32_to_cpu(s.snapshot))) {
+ u.bi_subvol = 0;
+ u.bi_parent_subvol = 0;
+ do_update = true;
+ }
+ }
+do_update:
if (do_update) {
- ret = __write_inode(trans, &u, iter->pos.snapshot);
+ ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck updating inode");
if (ret)
return ret;
@@ -982,32 +1242,9 @@ int bch2_check_inodes(struct bch_fs *c)
return ret;
}
-static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos pos)
-{
- return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
-}
-
-static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
- struct bkey_s_c_dirent d)
-{
- return inode->bi_dir == d.k->p.inode &&
- inode->bi_dir_offset == d.k->p.offset;
-}
-
-static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *inode)
-{
- return d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
- : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
-}
-
-static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
int ret = 0;
s64 count2;
@@ -1021,8 +1258,8 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
i->count = count2;
if (i->count != count2) {
- bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->snapshot, i->count, count2);
+ bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
return -BCH_ERR_internal_fsck_err;
}
@@ -1032,14 +1269,21 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
w->last_pos.inode, i->snapshot,
i->inode.bi_sectors, i->count)) {
i->inode.bi_sectors = i->count;
- ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+ ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
}
}
fsck_err:
bch_err_fn(c, ret);
- return ret ?: trans_was_restarted(trans, restart_count);
+ return ret;
+}
+
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+ u32 restart_count = trans->restart_count;
+ return check_i_sectors_notnested(trans, w) ?:
+ trans_was_restarted(trans, restart_count);
}
struct extent_end {
@@ -1255,10 +1499,6 @@ static int check_overlapping_extents(struct btree_trans *trans,
goto err;
}
- ret = extent_ends_at(c, extent_ends, seen, k);
- if (ret)
- goto err;
-
extent_ends->last_pos = k.k->p;
err:
return ret;
@@ -1312,7 +1552,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}
- i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout);
+ i = walk_inode(trans, inode, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret)
goto err;
@@ -1322,6 +1562,17 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
if (k.k->type != KEY_TYPE_whiteout) {
+ if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+ ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ inode->last_pos.inode--;
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+
if (fsck_err_on(!i, c, extent_in_missing_inode,
"extent in missing inode:\n %s",
(printbuf_reset(&buf),
@@ -1388,6 +1639,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
i->seen_this_pos = true;
}
+
+ if (k.k->type != KEY_TYPE_whiteout) {
+ ret = extent_ends_at(c, extent_ends, s, k);
+ if (ret)
+ goto err;
+ }
out:
err:
fsck_err:
@@ -1423,7 +1680,7 @@ int bch2_check_extents(struct bch_fs *c)
check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
check_extent_overbig(trans, &iter, k);
})) ?:
- check_i_sectors(trans, &w));
+ check_i_sectors_notnested(trans, &w));
bch2_disk_reservation_put(c, &res);
extent_ends_exit(&extent_ends);
@@ -1453,10 +1710,9 @@ int bch2_check_indirect_extents(struct bch_fs *c)
return ret;
}
-static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
int ret = 0;
s64 count2;
@@ -1469,8 +1725,8 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
return count2;
if (i->count != count2) {
- bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
- i->count, count2);
+ bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
i->count = count2;
if (i->inode.bi_nlink == i->count)
continue;
@@ -1481,96 +1737,123 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
i->inode.bi_nlink = i->count;
- ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+ ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
}
}
fsck_err:
bch_err_fn(c, ret);
- return ret ?: trans_was_restarted(trans, restart_count);
+ return ret;
}
-static int check_dirent_target(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- u32 target_snapshot)
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+ u32 restart_count = trans->restart_count;
+ return check_subdir_count_notnested(trans, w) ?:
+ trans_was_restarted(trans, restart_count);
+}
+
+static int check_dirent_inode_dirent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ u32 target_snapshot)
{
struct bch_fs *c = trans->c;
- struct bkey_i_dirent *n;
struct printbuf buf = PRINTBUF;
- struct btree_iter bp_iter = { NULL };
int ret = 0;
+ if (inode_points_to_dirent(target, d))
+ return 0;
+
if (!target->bi_dir &&
!target->bi_dir_offset) {
target->bi_dir = d.k->p.inode;
target->bi_dir_offset = d.k->p.offset;
-
- ret = __write_inode(trans, target, target_snapshot);
- if (ret)
- goto err;
+ return __bch2_fsck_write_inode(trans, target, target_snapshot);
}
- if (!inode_points_to_dirent(target, d)) {
- struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
- SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
- ret = bkey_err(bp_dirent);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
+ struct btree_iter bp_iter = { NULL };
+ struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
+ SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
+ ret = bkey_err(bp_dirent);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
- bool backpointer_exists = !ret;
- ret = 0;
+ bool backpointer_exists = !ret;
+ ret = 0;
+
+ if (fsck_err_on(!backpointer_exists,
+ c, inode_wrong_backpointer,
+ "inode %llu:%u has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ target->bi_inum, target_snapshot,
+ target->bi_dir,
+ target->bi_dir_offset,
+ d.k->p.inode,
+ d.k->p.offset)) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+ ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
+ goto out;
+ }
- bch2_bkey_val_to_text(&buf, c, d.s_c);
- prt_newline(&buf);
- if (backpointer_exists)
- bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+ bch2_bkey_val_to_text(&buf, c, d.s_c);
+ prt_newline(&buf);
+ if (backpointer_exists)
+ bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
+ if (fsck_err_on(backpointer_exists &&
+ (S_ISDIR(target->bi_mode) ||
+ target->bi_subvol),
+ c, inode_dir_multiple_links,
+ "%s %llu:%u with multiple links\n%s",
+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
+ target->bi_inum, target_snapshot, buf.buf)) {
+ ret = __remove_dirent(trans, d.k->p);
+ goto out;
+ }
- if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
- c, inode_dir_multiple_links,
- "directory %llu:%u with multiple links\n%s",
- target->bi_inum, target_snapshot, buf.buf)) {
- ret = __remove_dirent(trans, d.k->p);
- goto out;
- }
+ /*
+ * hardlinked file with nlink 0:
+ * We're just adjusting nlink here so check_nlinks() will pick
+ * it up, it ignores inodes with nlink 0
+ */
+ if (fsck_err_on(backpointer_exists && !target->bi_nlink,
+ c, inode_multiple_links_but_nlink_0,
+ "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+ target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
+ target->bi_nlink++;
+ target->bi_flags &= ~BCH_INODE_unlinked;
+ ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
+ }
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
- /*
- * hardlinked file with nlink 0:
- * We're just adjusting nlink here so check_nlinks() will pick
- * it up, it ignores inodes with nlink 0
- */
- if (fsck_err_on(backpointer_exists && !target->bi_nlink,
- c, inode_multiple_links_but_nlink_0,
- "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
- target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
- target->bi_nlink++;
- target->bi_flags &= ~BCH_INODE_unlinked;
-
- ret = __write_inode(trans, target, target_snapshot);
- if (ret)
- goto err;
- }
+static int check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ u32 target_snapshot)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_dirent *n;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
- if (fsck_err_on(!backpointer_exists,
- c, inode_wrong_backpointer,
- "inode %llu:%u has wrong backpointer:\n"
- "got %llu:%llu\n"
- "should be %llu:%llu",
- target->bi_inum, target_snapshot,
- target->bi_dir,
- target->bi_dir_offset,
- d.k->p.inode,
- d.k->p.offset)) {
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
-
- ret = __write_inode(trans, target, target_snapshot);
- if (ret)
- goto err;
- }
- }
+ ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
+ if (ret)
+ goto err;
if (fsck_err_on(d.v->d_type != inode_d_type(target),
c, dirent_d_type_wrong,
@@ -1586,6 +1869,12 @@ static int check_dirent_target(struct btree_trans *trans,
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_type = inode_d_type(target);
+ if (n->v.d_type == DT_SUBVOL) {
+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+ n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
+ } else {
+ n->v.d_inum = cpu_to_le64(target->bi_inum);
+ }
ret = bch2_trans_update(trans, iter, &n->k_i, 0);
if (ret)
@@ -1593,33 +1882,163 @@ static int check_dirent_target(struct btree_trans *trans,
d = dirent_i_to_s_c(n);
}
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
- if (fsck_err_on(d.v->d_type == DT_SUBVOL &&
- target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol),
- c, dirent_d_parent_subvol_wrong,
- "dirent has wrong d_parent_subvol field: got %u, should be %u",
- le32_to_cpu(d.v->d_parent_subvol),
- target->bi_parent_subvol)) {
- n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
- ret = PTR_ERR_OR_ZERO(n);
+/* find a subvolume that's a descendent of @snapshot: */
+static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_subvolume)
+ continue;
+
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+ if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
+ bch2_trans_iter_exit(trans, &iter);
+ *subvolid = k.k->p.offset;
+ goto found;
+ }
+ }
+ if (!ret)
+ ret = -ENOENT;
+found:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c_dirent d)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter subvol_iter = {};
+ struct bch_inode_unpacked subvol_root;
+ u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
+ u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
+ u32 parent_snapshot;
+ u32 new_parent_subvol = 0;
+ u64 parent_inum;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (ret ||
+ (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
+ int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
+ if (ret2 && !bch2_err_matches(ret, ENOENT))
+ return ret2;
+ }
+
+ if (ret &&
+ !new_parent_subvol &&
+ (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+ /*
+ * Couldn't find a subvol for dirent's snapshot - but we lost
+ * subvols, so we need to reconstruct:
+ */
+ ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
+ if (ret)
+ return ret;
+
+ parent_snapshot = d.k->p.snapshot;
+ }
+
+ if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
+ "dirent parent_subvol points to missing subvolume\n%s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
+ fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
+ c, dirent_not_visible_in_parent_subvol,
+ "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
+ parent_snapshot,
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ if (!new_parent_subvol) {
+ bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
+ struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
+ ret = PTR_ERR_OR_ZERO(new_dirent);
if (ret)
goto err;
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+ new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
+ }
- ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+ struct bkey_s_c_subvolume s =
+ bch2_bkey_get_iter_typed(trans, &subvol_iter,
+ BTREE_ID_subvolumes, POS(0, target_subvol),
+ 0, subvolume);
+ ret = bkey_err(s.s_c);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (ret) {
+ if (fsck_err(c, dirent_to_missing_subvol,
+ "dirent points to missing subvolume\n%s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
+ return __remove_dirent(trans, d.k->p);
+ ret = 0;
+ goto out;
+ }
+
+ if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
+ c, subvol_fs_path_parent_wrong,
+ "subvol with wrong fs_path_parent, should be be %u\n%s",
+ parent_subvol,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ struct bkey_i_subvolume *n =
+ bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
if (ret)
goto err;
- d = dirent_i_to_s_c(n);
+ n->v.fs_path_parent = cpu_to_le32(parent_subvol);
+ }
+
+ u64 target_inum = le64_to_cpu(s.v->inode);
+ u32 target_snapshot = le32_to_cpu(s.v->snapshot);
+
+ ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (ret) {
+ bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = 0;
+ goto err;
+ }
+
+ if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
+ c, inode_bi_parent_wrong,
+ "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
+ target_inum,
+ subvol_root.bi_parent_subvol, parent_subvol)) {
+ subvol_root.bi_parent_subvol = parent_subvol;
+ ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
+ if (ret)
+ goto err;
}
+
+ ret = check_dirent_target(trans, iter, d, &subvol_root,
+ target_snapshot);
+ if (ret)
+ goto err;
out:
err:
fsck_err:
- bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &subvol_iter);
printbuf_exit(&buf);
- bch_err_fn(c, ret);
return ret;
}
@@ -1631,7 +2050,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c_dirent d;
struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF;
struct bpos equiv;
@@ -1661,7 +2079,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
- i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
+ i = walk_inode(trans, dir, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret < 0)
goto err;
@@ -1670,6 +2088,17 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
dir->first_this_inode = false;
+ if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+ ret = reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ dir->last_pos.inode--;
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+
if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
"dirent in nonexisting directory:\n%s",
(printbuf_reset(&buf),
@@ -1704,53 +2133,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (k.k->type != KEY_TYPE_dirent)
goto out;
- d = bkey_s_c_to_dirent(k);
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
if (d.v->d_type == DT_SUBVOL) {
- struct bch_inode_unpacked subvol_root;
- u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
- u32 target_snapshot;
- u64 target_inum;
-
- ret = subvol_lookup(trans, target_subvol,
- &target_snapshot, &target_inum);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (fsck_err_on(ret, c, dirent_to_missing_subvol,
- "dirent points to missing subvolume %u",
- le32_to_cpu(d.v->d_child_subvol))) {
- ret = __remove_dirent(trans, d.k->p);
- goto err;
- }
-
- ret = lookup_inode(trans, target_inum,
- &subvol_root, &target_snapshot);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (fsck_err_on(ret, c, subvol_to_missing_root,
- "subvolume %u points to missing subvolume root %llu",
- target_subvol,
- target_inum)) {
- bch_err(c, "repair not implemented yet");
- ret = -EINVAL;
- goto err;
- }
-
- if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
- c, subvol_root_wrong_bi_subvol,
- "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
- target_inum,
- subvol_root.bi_subvol, target_subvol)) {
- subvol_root.bi_subvol = target_subvol;
- ret = __write_inode(trans, &subvol_root, target_snapshot);
- if (ret)
- goto err;
- }
-
- ret = check_dirent_target(trans, iter, d, &subvol_root,
- target_snapshot);
+ ret = check_dirent_to_subvol(trans, iter, d);
if (ret)
goto err;
} else {
@@ -1776,12 +2162,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
}
- }
-
- if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, equiv.snapshot, i)
- i->count++;
+ if (d.v->d_type == DT_DIR)
+ for_each_visible_inode(c, s, dir, equiv.snapshot, i)
+ i->count++;
+ }
out:
err:
fsck_err:
@@ -1810,7 +2195,8 @@ int bch2_check_dirents(struct bch_fs *c)
k,
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
- check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)));
+ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
+ check_subdir_count_notnested(trans, &dir));
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
@@ -1829,10 +2215,12 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
int ret;
ret = check_key_has_snapshot(trans, iter, k);
- if (ret)
+ if (ret < 0)
return ret;
+ if (ret)
+ return 0;
- i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout);
+ i = walk_inode(trans, inode, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret)
return ret;
@@ -1890,17 +2278,21 @@ static int check_root_trans(struct btree_trans *trans)
if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
"root subvol missing")) {
- struct bkey_i_subvolume root_subvol;
+ struct bkey_i_subvolume *root_subvol =
+ bch2_trans_kmalloc(trans, sizeof(*root_subvol));
+ ret = PTR_ERR_OR_ZERO(root_subvol);
+ if (ret)
+ goto err;
snapshot = U32_MAX;
inum = BCACHEFS_ROOT_INO;
- bkey_subvolume_init(&root_subvol.k_i);
- root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_subvol.v.flags = 0;
- root_subvol.v.snapshot = cpu_to_le32(snapshot);
- root_subvol.v.inode = cpu_to_le64(inum);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0);
+ bkey_subvolume_init(&root_subvol->k_i);
+ root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_subvol->v.flags = 0;
+ root_subvol->v.snapshot = cpu_to_le32(snapshot);
+ root_subvol->v.inode = cpu_to_le64(inum);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
bch_err_msg(c, ret, "writing root subvol");
if (ret)
goto err;
@@ -1919,7 +2311,7 @@ static int check_root_trans(struct btree_trans *trans)
0, NULL);
root_inode.bi_inum = inum;
- ret = __write_inode(trans, &root_inode, snapshot);
+ ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
bch_err_msg(c, ret, "writing root inode");
}
err:
@@ -1936,6 +2328,107 @@ int bch2_check_root(struct bch_fs *c)
return ret;
}
+typedef DARRAY(u32) darray_u32;
+
+static bool darray_u32_has(darray_u32 *d, u32 v)
+{
+ darray_for_each(*d, i)
+ if (*i == v)
+ return true;
+ return false;
+}
+
+/*
+ * We've checked that inode backpointers point to valid dirents; here, it's
+ * sufficient to check that the subvolume root has a dirent:
+ */
+static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
+{
+ struct bch_inode_unpacked inode;
+ int ret = bch2_inode_find_by_inum_trans(trans,
+ (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+ &inode);
+ if (ret)
+ return ret;
+
+ return inode.bi_dir != 0;
+}
+
+static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter parent_iter = {};
+ darray_u32 subvol_path = {};
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
+ while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
+ ret = darray_push(&subvol_path, k.k->p.offset);
+ if (ret)
+ goto err;
+
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+ ret = subvol_has_dirent(trans, s);
+ if (ret < 0)
+ break;
+
+ if (fsck_err_on(!ret,
+ c, subvol_unreachable,
+ "unreachable subvolume %s",
+ (bch2_bkey_val_to_text(&buf, c, s.s_c),
+ buf.buf))) {
+ ret = reattach_subvol(trans, s);
+ break;
+ }
+
+ u32 parent = le32_to_cpu(s.v->fs_path_parent);
+
+ if (darray_u32_has(&subvol_path, parent)) {
+ if (fsck_err(c, subvol_loop, "subvolume loop"))
+ ret = reattach_subvol(trans, s);
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &parent_iter);
+ bch2_trans_iter_init(trans, &parent_iter,
+ BTREE_ID_subvolumes, POS(0, parent), 0);
+ k = bch2_btree_iter_peek_slot(&parent_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
+ c, subvol_unreachable,
+ "unreachable subvolume %s",
+ (bch2_bkey_val_to_text(&buf, c, s.s_c),
+ buf.buf))) {
+ ret = reattach_subvol(trans, s);
+ break;
+ }
+ }
+fsck_err:
+err:
+ printbuf_exit(&buf);
+ darray_exit(&subvol_path);
+ bch2_trans_iter_exit(trans, &parent_iter);
+ return ret;
+}
+
+int bch2_check_subvolume_structure(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_subvol_path(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
struct pathbuf_entry {
u64 inum;
u32 snapshot;
@@ -1952,89 +2445,71 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
return false;
}
-static int path_down(struct bch_fs *c, pathbuf *p,
- u64 inum, u32 snapshot)
-{
- int ret = darray_push(p, ((struct pathbuf_entry) {
- .inum = inum,
- .snapshot = snapshot,
- }));
-
- if (ret)
- bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
- p->size);
- return ret;
-}
-
/*
- * Check that a given inode is reachable from the root:
+ * Check that a given inode is reachable from its subvolume root - we already
+ * verified subvolume connectivity:
*
* XXX: we should also be verifying that inodes are in the right subvolumes
*/
-static int check_path(struct btree_trans *trans,
- pathbuf *p,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
+static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
{
struct bch_fs *c = trans->c;
+ struct btree_iter inode_iter = {};
+ struct bch_inode_unpacked inode;
+ struct printbuf buf = PRINTBUF;
+ u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot);
int ret = 0;
- snapshot = bch2_snapshot_equiv(c, snapshot);
p->nr = 0;
- while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
- inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
+ BUG_ON(bch2_inode_unpack(inode_k, &inode));
+
+ while (!inode.bi_subvol) {
struct btree_iter dirent_iter;
struct bkey_s_c_dirent d;
u32 parent_snapshot = snapshot;
- if (inode->bi_subvol) {
- u64 inum;
-
- ret = subvol_lookup(trans, inode->bi_parent_subvol,
- &parent_snapshot, &inum);
- if (ret)
- break;
- }
-
- d = dirent_get_by_pos(trans, &dirent_iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset,
- parent_snapshot));
+ d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
ret = bkey_err(d.s_c);
if (ret && !bch2_err_matches(ret, ENOENT))
break;
- if (!ret && !dirent_points_to_inode(d, inode)) {
+ if (!ret && !dirent_points_to_inode(d, &inode)) {
bch2_trans_iter_exit(trans, &dirent_iter);
ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
}
if (bch2_err_matches(ret, ENOENT)) {
- if (fsck_err(c, inode_unreachable,
- "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
- inode->bi_inum, snapshot,
- bch2_d_type_str(inode_d_type(inode)),
- inode->bi_nlink,
- inode->bi_dir,
- inode->bi_dir_offset))
- ret = reattach_inode(trans, inode, snapshot);
- break;
+ ret = 0;
+ if (fsck_err(c, inode_unreachable,
+ "unreachable inode\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, inode_k),
+ buf.buf)))
+ ret = reattach_inode(trans, &inode, snapshot);
+ goto out;
}
bch2_trans_iter_exit(trans, &dirent_iter);
- if (!S_ISDIR(inode->bi_mode))
+ if (!S_ISDIR(inode.bi_mode))
break;
- ret = path_down(c, p, inode->bi_inum, snapshot);
- if (ret) {
- bch_err(c, "memory allocation failure");
+ ret = darray_push(p, ((struct pathbuf_entry) {
+ .inum = inode.bi_inum,
+ .snapshot = snapshot,
+ }));
+ if (ret)
return ret;
- }
snapshot = parent_snapshot;
- ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
+ SPOS(0, inode.bi_dir, snapshot), 0);
+ ret = bkey_err(inode_k) ?:
+ !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
+ : bch2_inode_unpack(inode_k, &inode);
if (ret) {
/* Should have been caught in dirents pass */
if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -2042,30 +2517,32 @@ static int check_path(struct btree_trans *trans,
break;
}
- if (path_is_dup(p, inode->bi_inum, snapshot)) {
+ snapshot = inode_k.k->p.snapshot;
+
+ if (path_is_dup(p, inode.bi_inum, snapshot)) {
/* XXX print path */
bch_err(c, "directory structure loop");
darray_for_each(*p, i)
pr_err("%llu:%u", i->inum, i->snapshot);
- pr_err("%llu:%u", inode->bi_inum, snapshot);
-
- if (!fsck_err(c, dir_loop, "directory structure loop"))
- return 0;
+ pr_err("%llu:%u", inode.bi_inum, snapshot);
- ret = remove_backpointer(trans, inode);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ if (fsck_err(c, dir_loop, "directory structure loop")) {
+ ret = remove_backpointer(trans, &inode);
bch_err_msg(c, ret, "removing dirent");
- if (ret)
- break;
+ if (ret)
+ break;
- ret = reattach_inode(trans, inode, snapshot);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum);
+ ret = reattach_inode(trans, &inode, snapshot);
+ bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
+ }
break;
}
}
+out:
fsck_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
}
@@ -2077,7 +2554,6 @@ fsck_err:
*/
int bch2_check_directory_structure(struct bch_fs *c)
{
- struct bch_inode_unpacked u;
pathbuf path = { 0, };
int ret;
@@ -2090,12 +2566,10 @@ int bch2_check_directory_structure(struct bch_fs *c)
if (!bkey_is_inode(k.k))
continue;
- BUG_ON(bch2_inode_unpack(k, &u));
-
- if (u.bi_flags & BCH_INODE_unlinked)
+ if (bch2_inode_flags(k) & BCH_INODE_unlinked)
continue;
- check_path(trans, &path, &u, iter.pos.snapshot);
+ check_path(trans, &path, k);
})));
darray_exit(&path);
@@ -2291,7 +2765,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
bch2_inode_nlink_get(&u), link->count)) {
bch2_inode_nlink_set(&u, link->count);
- ret = __write_inode(trans, &u, k.k->p.snapshot);
+ ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
}
fsck_err:
return ret;
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
index da991e8cf27e..a4ef94271784 100644
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
@@ -8,6 +8,7 @@ int bch2_check_indirect_extents(struct bch_fs *);
int bch2_check_dirents(struct bch_fs *);
int bch2_check_xattrs(struct bch_fs *);
int bch2_check_root(struct bch_fs *);
+int bch2_check_subvolume_structure(struct bch_fs *);
int bch2_check_directory_structure(struct bch_fs *);
int bch2_check_nlinks(struct bch_fs *);
int bch2_fix_reflink_p(struct bch_fs *);
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 086f0090b03a..ca4a066e9a54 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -324,7 +324,7 @@ int bch2_inode_unpack(struct bkey_s_c k,
return bch2_inode_unpack_slowpath(k, unpacked);
}
-static int bch2_inode_peek_nowarn(struct btree_trans *trans,
+int bch2_inode_peek_nowarn(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
subvol_inum inum, unsigned flags)
@@ -384,6 +384,34 @@ int bch2_inode_write_flags(struct btree_trans *trans,
return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
}
+int __bch2_fsck_write_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ struct bkey_inode_buf *inode_p =
+ bch2_trans_kmalloc(trans, sizeof(*inode_p));
+
+ if (IS_ERR(inode_p))
+ return PTR_ERR(inode_p);
+
+ bch2_inode_pack(inode_p, inode);
+ inode_p->inode.k.p.snapshot = snapshot;
+
+ return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
+ &inode_p->inode.k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+int bch2_fsck_write_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_fsck_write_inode(trans, inode, snapshot));
+ bch_err_fn(trans->c, ret);
+ return ret;
+}
+
struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
{
struct bch_inode_unpacked u;
@@ -524,8 +552,8 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
prt_newline(out);
- prt_newline(out);
prt_printf(out, "bi_version=%llu", inode->bi_version);
+ prt_newline(out);
#define x(_name, _bits) \
prt_printf(out, #_name "=%llu", (u64) inode->_name); \
@@ -592,7 +620,8 @@ int bch2_trigger_inode(struct btree_trans *trans,
bool old_deleted = bkey_is_deleted_inode(old);
bool new_deleted = bkey_is_deleted_inode(new.s_c);
if (old_deleted != new_deleted) {
- int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted);
+ int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
+ new.k->p, new_deleted);
if (ret)
return ret;
}
@@ -1088,8 +1117,9 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
goto out;
if (S_ISDIR(inode.bi_mode)) {
- ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot);
- if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir,
+ ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
+ if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
+ c, deleted_inode_is_dir,
"non empty directory %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
@@ -1141,7 +1171,7 @@ fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
return ret;
delete:
- ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
goto out;
}
@@ -1151,6 +1181,15 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
bool need_another_pass;
int ret;
again:
+ /*
+ * if we ran check_inodes() unlinked inodes will have already been
+ * cleaned up but the write buffer will be out of sync; therefore we
+ * always need a write buffer flush
+ */
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
+
need_another_pass = false;
/*
@@ -1183,12 +1222,8 @@ again:
ret;
}));
- if (!ret && need_another_pass) {
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
+ if (!ret && need_another_pass)
goto again;
- }
err:
bch2_trans_put(trans);
return ret;
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index b63f312581cf..056298050550 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -95,6 +95,8 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
+int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, subvol_inum, unsigned);
int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, subvol_inum, unsigned);
@@ -108,6 +110,9 @@ static inline int bch2_inode_write(struct btree_trans *trans,
return bch2_inode_write_flags(trans, iter, inode, 0);
}
+int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
+int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
+
void bch2_inode_init_early(struct bch_fs *,
struct bch_inode_unpacked *);
void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
@@ -172,6 +177,20 @@ static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
}
+static inline u32 bch2_inode_flags(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
+ case KEY_TYPE_inode_v2:
+ return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
+ case KEY_TYPE_inode_v3:
+ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
+ default:
+ return 0;
+ }
+}
+
/* i_nlink: */
static inline unsigned nlink_bias(umode_t mode)
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 1baf78594cca..82f9170dab3f 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -264,6 +264,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
ret = 0;
err:
bch2_logged_op_finish(trans, op_k);
+ bch_err_fn(c, ret);
return ret;
}
@@ -476,6 +477,7 @@ case LOGGED_OP_FINSERT_finish:
break;
}
err:
+ bch_err_fn(c, ret);
bch2_logged_op_finish(trans, op_k);
bch2_trans_iter_exit(trans, &iter);
return ret;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 3c574d8873a1..8a556e6d1ab6 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -174,7 +174,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
return ERR_PTR(-BCH_ERR_nopromote_no_writes);
- op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL);
+ op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
if (!op) {
ret = -BCH_ERR_nopromote_enomem;
goto err;
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 2c098ac017b3..f137252bccc5 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
bch2_congested_acct(ca, io_latency, now, rw);
- __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+ __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
}
#endif
@@ -530,7 +530,8 @@ static void __bch2_write_index(struct bch_write_op *op)
bch_err_inum_offset_ratelimited(c,
insert->k.p.inode, insert->k.p.offset << 9,
- "write error while doing btree update: %s",
+ "%s write error while doing btree update: %s",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user",
bch2_err_str(ret));
}
@@ -1067,7 +1068,8 @@ do_write:
*_dst = dst;
return more;
csum_err:
- bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
+ bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user");
ret = -EIO;
err:
if (to_wbio(dst)->bounce)
@@ -1169,7 +1171,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
bch_err_inum_offset_ratelimited(c,
insert->k.p.inode, insert->k.p.offset << 9,
- "write error while doing btree update: %s",
+ "%s write error while doing btree update: %s",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user",
bch2_err_str(ret));
}
@@ -1449,7 +1452,9 @@ err:
bch_err_inum_offset_ratelimited(c,
op->pos.inode,
op->pos.offset << 9,
- "%s(): error: %s", __func__, bch2_err_str(ret));
+ "%s(): %s error: %s", __func__,
+ op->flags & BCH_WRITE_MOVE ? "move" : "user",
+ bch2_err_str(ret));
op->error = ret;
break;
}
@@ -1573,7 +1578,8 @@ CLOSURE_CALLBACK(bch2_write)
bch_err_inum_offset_ratelimited(c,
op->pos.inode,
op->pos.offset << 9,
- "misaligned write");
+ "%s write error: misaligned write",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user");
op->error = -EIO;
goto err;
}
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index bc890776eb57..9c9a25dbd613 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -27,33 +27,71 @@ static const char * const bch2_journal_errors[] = {
NULL
};
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+ return seq > j->seq_ondisk;
+}
+
+static bool __journal_entry_is_open(union journal_res_state state)
+{
+ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+}
+
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+ return atomic64_read(&j->seq) - j->seq_ondisk;
+}
+
+static bool journal_entry_is_open(struct journal *j)
+{
+ return __journal_entry_is_open(j->reservations);
+}
+
static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
{
union journal_res_state s = READ_ONCE(j->reservations);
unsigned i = seq & JOURNAL_BUF_MASK;
struct journal_buf *buf = j->buf + i;
- prt_printf(out, "seq:");
+ prt_str(out, "seq:");
prt_tab(out);
prt_printf(out, "%llu", seq);
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_printf(out, "refcount:");
+ prt_str(out, "refcount:");
prt_tab(out);
prt_printf(out, "%u", journal_state_count(s, i));
prt_newline(out);
- prt_printf(out, "size:");
+ prt_str(out, "size:");
prt_tab(out);
prt_human_readable_u64(out, vstruct_bytes(buf->data));
prt_newline(out);
- prt_printf(out, "expires");
+ prt_str(out, "expires:");
prt_tab(out);
prt_printf(out, "%li jiffies", buf->expires - jiffies);
prt_newline(out);
+ prt_str(out, "flags:");
+ prt_tab(out);
+ if (buf->noflush)
+ prt_str(out, "noflush ");
+ if (buf->must_flush)
+ prt_str(out, "must_flush ");
+ if (buf->separate_flush)
+ prt_str(out, "separate_flush ");
+ if (buf->need_flush_to_write_buffer)
+ prt_str(out, "need_flush_to_write_buffer ");
+ if (buf->write_started)
+ prt_str(out, "write_started ");
+ if (buf->write_allocated)
+ prt_str(out, "write allocated ");
+ if (buf->write_done)
+ prt_str(out, "write done");
+ prt_newline(out);
+
printbuf_indent_sub(out, 2);
}
@@ -66,26 +104,7 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
seq <= journal_cur_seq(j);
seq++)
bch2_journal_buf_to_text(out, j, seq);
-}
-
-static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
-{
- return seq > j->seq_ondisk;
-}
-
-static bool __journal_entry_is_open(union journal_res_state state)
-{
- return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
-}
-
-static inline unsigned nr_unwritten_journal_entries(struct journal *j)
-{
- return atomic64_read(&j->seq) - j->seq_ondisk;
-}
-
-static bool journal_entry_is_open(struct journal *j)
-{
- return __journal_entry_is_open(j->reservations);
+ prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
}
static inline struct journal_buf *
@@ -174,21 +193,40 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
return stuck;
}
+void bch2_journal_do_writes(struct journal *j)
+{
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ unsigned idx = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *w = j->buf + idx;
+
+ if (w->write_started && !w->write_allocated)
+ break;
+ if (w->write_started)
+ continue;
+
+ if (!journal_state_count(j->reservations, idx)) {
+ w->write_started = true;
+ closure_call(&w->io, bch2_journal_write, j->wq, NULL);
+ }
+
+ break;
+ }
+}
+
/*
* Final processing when the last reference of a journal buffer has been
* dropped. Drop the pin list reference acquired at journal entry open and write
* the buffer, if requested.
*/
-void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
+void bch2_journal_buf_put_final(struct journal *j, u64 seq)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
lockdep_assert_held(&j->lock);
if (__bch2_journal_pin_put(j, seq))
bch2_journal_reclaim_fast(j);
- if (write)
- closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+ bch2_journal_do_writes(j);
}
/*
@@ -380,11 +418,14 @@ static int journal_entry_open(struct journal *j)
BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
bkey_extent_init(&buf->key);
- buf->noflush = false;
- buf->must_flush = false;
- buf->separate_flush = false;
- buf->flush_time = 0;
+ buf->noflush = false;
+ buf->must_flush = false;
+ buf->separate_flush = false;
+ buf->flush_time = 0;
buf->need_flush_to_write_buffer = true;
+ buf->write_started = false;
+ buf->write_allocated = false;
+ buf->write_done = false;
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));
@@ -418,9 +459,10 @@ static int journal_entry_open(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
- mod_delayed_work(c->io_complete_wq,
- &j->write_work,
- msecs_to_jiffies(c->opts.journal_flush_delay));
+ if (nr_unwritten_journal_entries(j) == 1)
+ mod_delayed_work(j->wq,
+ &j->write_work,
+ msecs_to_jiffies(c->opts.journal_flush_delay));
journal_wake(j);
if (j->early_journal_entries.nr)
@@ -445,20 +487,16 @@ static void journal_quiesce(struct journal *j)
static void journal_write_work(struct work_struct *work)
{
struct journal *j = container_of(work, struct journal, write_work.work);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- long delta;
spin_lock(&j->lock);
- if (!__journal_entry_is_open(j->reservations))
- goto unlock;
+ if (__journal_entry_is_open(j->reservations)) {
+ long delta = journal_cur_buf(j)->expires - jiffies;
- delta = journal_cur_buf(j)->expires - jiffies;
-
- if (delta > 0)
- mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
- else
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-unlock:
+ if (delta > 0)
+ mod_delayed_work(j->wq, &j->write_work, delta);
+ else
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
+ }
spin_unlock(&j->lock);
}
@@ -476,30 +514,29 @@ retry:
if (bch2_journal_error(j))
return -BCH_ERR_erofs_journal_err;
- spin_lock(&j->lock);
+ if (j->blocked)
+ return -BCH_ERR_journal_res_get_blocked;
- /* check once more in case somebody else shut things down... */
- if (bch2_journal_error(j)) {
- spin_unlock(&j->lock);
- return -BCH_ERR_erofs_journal_err;
+ if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
+ ret = JOURNAL_ERR_journal_full;
+ can_discard = j->can_discard;
+ goto out;
+ }
+
+ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
+ ret = JOURNAL_ERR_max_in_flight;
+ goto out;
}
+ spin_lock(&j->lock);
+
/*
* Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call bch2_journal_entry_close()
* unnecessarily
*/
if (journal_res_get_fast(j, res, flags)) {
- spin_unlock(&j->lock);
- return 0;
- }
-
- if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
- /*
- * Don't want to close current journal entry, just need to
- * invoke reclaim:
- */
- ret = JOURNAL_ERR_journal_full;
+ ret = 0;
goto unlock;
}
@@ -515,30 +552,30 @@ retry:
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
- ret = journal_entry_open(j);
-
- if (ret == JOURNAL_ERR_max_in_flight) {
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
- &j->max_in_flight_start, true);
- if (trace_journal_entry_full_enabled()) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- bch2_journal_bufs_to_text(&buf, j);
- trace_journal_entry_full(c, buf.buf);
- printbuf_exit(&buf);
- }
- count_event(c, journal_entry_full);
- }
+ ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
-
- if (!ret)
+out:
+ if (ret == JOURNAL_ERR_retry)
goto retry;
+ if (!ret)
+ return 0;
+
if (journal_error_check_stuck(j, ret, flags))
ret = -BCH_ERR_journal_res_get_blocked;
+ if (ret == JOURNAL_ERR_max_in_flight &&
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
+
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
+ bch2_journal_bufs_to_text(&buf, j);
+ trace_journal_entry_full(c, buf.buf);
+ printbuf_exit(&buf);
+ count_event(c, journal_entry_full);
+ }
+
/*
* Journal is full - can't rely on reclaim from work item due to
* freezing:
@@ -674,7 +711,7 @@ recheck_need_open:
return ret;
seq = res.seq;
- buf = j->buf + (seq & JOURNAL_BUF_MASK);
+ buf = journal_seq_to_buf(j, seq);
buf->must_flush = true;
if (!buf->flush_time) {
@@ -692,8 +729,8 @@ recheck_need_open:
}
/*
- * if write was kicked off without a flush, flush the next sequence
- * number instead
+ * if write was kicked off without a flush, or if we promised it
+ * wouldn't be a flush, flush the next sequence number instead
*/
buf = journal_seq_to_buf(j, seq);
if (buf->noflush) {
@@ -771,8 +808,8 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
unwritten_seq++) {
struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
- /* journal write is already in flight, and was a flush write: */
- if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush)
+ /* journal flush already in flight, or flush requested */
+ if (buf->must_flush)
goto out;
buf->noflush = true;
@@ -1157,13 +1194,12 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
struct journal_replay *i, **_i;
struct genradix_iter iter;
bool had_entries = false;
- unsigned ptr;
u64 last_seq = cur_seq, nr, seq;
genradix_for_each_reverse(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
last_seq = le64_to_cpu(i->j.last_seq);
@@ -1196,7 +1232,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
seq = le64_to_cpu(i->j.seq);
@@ -1211,8 +1247,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
p = journal_seq_pin(j, seq);
p->devs.nr = 0;
- for (ptr = 0; ptr < i->nr_ptrs; ptr++)
- bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+ darray_for_each(i->ptrs, ptr)
+ bch2_dev_list_add_dev(&p->devs, ptr->dev);
had_entries = true;
}
@@ -1240,13 +1276,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
void bch2_dev_journal_exit(struct bch_dev *ca)
{
- kfree(ca->journal.bio);
- kfree(ca->journal.buckets);
- kfree(ca->journal.bucket_seq);
+ struct journal_device *ja = &ca->journal;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+ kfree(ja->bio[i]);
+ ja->bio[i] = NULL;
+ }
- ca->journal.bio = NULL;
- ca->journal.buckets = NULL;
- ca->journal.bucket_seq = NULL;
+ kfree(ja->buckets);
+ kfree(ja->bucket_seq);
+ ja->buckets = NULL;
+ ja->bucket_seq = NULL;
}
int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
@@ -1256,14 +1296,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
bch2_sb_field_get(sb, journal);
struct bch_sb_field_journal_v2 *journal_buckets_v2 =
bch2_sb_field_get(sb, journal_v2);
- unsigned i, nr_bvecs;
ja->nr = 0;
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
- for (i = 0; i < nr; i++)
+ for (unsigned i = 0; i < nr; i++)
ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
} else if (journal_buckets) {
ja->nr = bch2_nr_journal_buckets(journal_buckets);
@@ -1273,13 +1312,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
if (!ja->bucket_seq)
return -BCH_ERR_ENOMEM_dev_journal_init;
- nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+ unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
- ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
- if (!ca->journal.bio)
- return -BCH_ERR_ENOMEM_dev_journal_init;
+ for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+ ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
+ nr_bvecs), GFP_KERNEL);
+ if (!ja->bio[i])
+ return -BCH_ERR_ENOMEM_dev_journal_init;
- bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
+ ja->bio[i]->ca = ca;
+ ja->bio[i]->buf_idx = i;
+ bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
+ }
ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->buckets)
@@ -1287,14 +1331,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
- unsigned j, dst = 0;
+ unsigned dst = 0;
- for (i = 0; i < nr; i++)
- for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+ for (unsigned i = 0; i < nr; i++)
+ for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
ja->buckets[dst++] =
le64_to_cpu(journal_buckets_v2->d[i].start) + j;
} else if (journal_buckets) {
- for (i = 0; i < ja->nr; i++)
+ for (unsigned i = 0; i < ja->nr; i++)
ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
}
@@ -1303,19 +1347,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
void bch2_fs_journal_exit(struct journal *j)
{
- unsigned i;
+ if (j->wq)
+ destroy_workqueue(j->wq);
darray_exit(&j->early_journal_entries);
- for (i = 0; i < ARRAY_SIZE(j->buf); i++)
- kvpfree(j->buf[i].data, j->buf[i].buf_size);
+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
+ kvfree(j->buf[i].data);
free_fifo(&j->pin);
}
int bch2_fs_journal_init(struct journal *j)
{
static struct lock_class_key res_key;
- unsigned i;
mutex_init(&j->buf_lock);
spin_lock_init(&j->lock);
@@ -1336,14 +1380,20 @@ int bch2_fs_journal_init(struct journal *j)
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
return -BCH_ERR_ENOMEM_journal_pin_fifo;
- for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
- j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+ j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
if (!j->buf[i].data)
return -BCH_ERR_ENOMEM_journal_buf;
+ j->buf[i].idx = i;
}
j->pin.front = j->pin.back = 1;
+
+ j->wq = alloc_workqueue("bcachefs_journal",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
+ if (!j->wq)
+ return -BCH_ERR_ENOMEM_fs_other_alloc;
return 0;
}
@@ -1381,6 +1431,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
+ prt_printf(out, "blocked:\t\t%u\n", j->blocked);
prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
prt_printf(out, "current entry:\t\t");
@@ -1455,7 +1506,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
- unsigned i;
spin_lock(&j->lock);
*seq = max(*seq, j->pin.front);
@@ -1473,7 +1523,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
prt_newline(out);
printbuf_indent_add(out, 2);
- for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
list_for_each_entry(pin, &pin_list->list[i], list) {
prt_printf(out, "\t%px %ps", pin, pin->flush);
prt_newline(out);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 4544ce24bb8a..7c7528f839c5 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
}
bool bch2_journal_entry_close(struct journal *);
-void bch2_journal_buf_put_final(struct journal *, u64, bool);
+void bch2_journal_do_writes(struct journal *);
+void bch2_journal_buf_put_final(struct journal *, u64);
static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
{
@@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s
s = journal_state_buf_put(j, idx);
if (!journal_state_count(s, idx))
- bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+ bch2_journal_buf_put_final(j, seq);
}
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
@@ -282,7 +283,7 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq
s = journal_state_buf_put(j, idx);
if (!journal_state_count(s, idx)) {
spin_lock(&j->lock);
- bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+ bch2_journal_buf_put_final(j, seq);
spin_unlock(&j->lock);
}
}
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 47805193f18c..9aa28b52ab92 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -17,6 +17,37 @@
#include "sb-clean.h"
#include "trace.h"
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ darray_for_each(j->ptrs, i) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
+ u64 offset;
+
+ div64_u64_rem(i->sector, ca->mi.bucket_size, &offset);
+
+ if (i != j->ptrs.data)
+ prt_printf(out, " ");
+ prt_printf(out, "%u:%u:%u (sector %llu)",
+ i->dev, i->bucket, i->bucket_offset, i->sector);
+ }
+}
+
+static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
+
+ bch2_journal_ptrs_to_text(out, c, j);
+
+ for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
+ struct jset_entry_datetime *datetime =
+ container_of(entry, struct jset_entry_datetime, entry);
+ bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+ break;
+ }
+}
+
static struct nonce journal_nonce(const struct jset *jset)
{
return (struct nonce) {{
@@ -52,13 +83,15 @@ static void __journal_replay_free(struct bch_fs *c,
BUG_ON(*p != i);
*p = NULL;
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
+ kvfree(i);
}
-static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
{
- i->ignore = true;
+ if (blacklisted)
+ i->ignore_blacklisted = true;
+ else
+ i->ignore_not_dirty = true;
if (!c->opts.read_entire_journal)
__journal_replay_free(c, i);
@@ -84,9 +117,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
{
struct genradix_iter iter;
struct journal_replay **_i, *i, *dup;
- struct journal_ptr *ptr;
size_t bytes = vstruct_bytes(j);
u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
+ struct printbuf buf = PRINTBUF;
int ret = JOURNAL_ENTRY_ADD_OK;
/* Is this entry older than the range we need? */
@@ -108,12 +141,13 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
journal_entry_radix_idx(c, jlist->last_seq)) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
if (le64_to_cpu(i->j.seq) >= last_seq)
break;
- journal_replay_free(c, i);
+
+ journal_replay_free(c, i, false);
}
}
@@ -131,72 +165,62 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
*/
dup = *_i;
if (dup) {
- if (bytes == vstruct_bytes(&dup->j) &&
- !memcmp(j, &dup->j, bytes)) {
- i = dup;
- goto found;
- }
+ bool identical = bytes == vstruct_bytes(&dup->j) &&
+ !memcmp(j, &dup->j, bytes);
+ bool not_identical = !identical &&
+ entry_ptr.csum_good &&
+ dup->csum_good;
+
+ bool same_device = false;
+ darray_for_each(dup->ptrs, ptr)
+ if (ptr->dev == ca->dev_idx)
+ same_device = true;
+
+ ret = darray_push(&dup->ptrs, entry_ptr);
+ if (ret)
+ goto out;
- if (!entry_ptr.csum_good) {
- i = dup;
- goto found;
- }
+ bch2_journal_replay_to_text(&buf, c, dup);
- if (!dup->csum_good)
+ fsck_err_on(same_device,
+ c, journal_entry_dup_same_device,
+ "duplicate journal entry on same device\n %s",
+ buf.buf);
+
+ fsck_err_on(not_identical,
+ c, journal_entry_replicas_data_mismatch,
+ "found duplicate but non identical journal entries\n %s",
+ buf.buf);
+
+ if (entry_ptr.csum_good && !identical)
goto replace;
- fsck_err(c, journal_entry_replicas_data_mismatch,
- "found duplicate but non identical journal entries (seq %llu)",
- le64_to_cpu(j->seq));
- i = dup;
- goto found;
+ goto out;
}
replace:
- i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+ i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
if (!i)
return -BCH_ERR_ENOMEM_journal_entry_add;
- i->nr_ptrs = 0;
- i->csum_good = entry_ptr.csum_good;
- i->ignore = false;
+ darray_init(&i->ptrs);
+ i->csum_good = entry_ptr.csum_good;
+ i->ignore_blacklisted = false;
+ i->ignore_not_dirty = false;
unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
- i->ptrs[i->nr_ptrs++] = entry_ptr;
if (dup) {
- if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
- bch_err(c, "found too many copies of journal entry %llu",
- le64_to_cpu(i->j.seq));
- dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
- }
-
/* The first ptr should represent the jset we kept: */
- memcpy(i->ptrs + i->nr_ptrs,
- dup->ptrs,
- sizeof(dup->ptrs[0]) * dup->nr_ptrs);
- i->nr_ptrs += dup->nr_ptrs;
+ darray_for_each(dup->ptrs, ptr)
+ darray_push(&i->ptrs, *ptr);
__journal_replay_free(c, dup);
+ } else {
+ darray_push(&i->ptrs, entry_ptr);
}
*_i = i;
- return 0;
-found:
- for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
- if (ptr->dev == ca->dev_idx) {
- bch_err(c, "duplicate journal entry %llu on same device",
- le64_to_cpu(i->j.seq));
- goto out;
- }
- }
-
- if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
- bch_err(c, "found too many copies of journal entry %llu",
- le64_to_cpu(i->j.seq));
- goto out;
- }
-
- i->ptrs[i->nr_ptrs++] = entry_ptr;
out:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
@@ -223,7 +247,7 @@ static void journal_entry_err_msg(struct printbuf *out,
if (entry) {
prt_str(out, " type=");
- prt_str(out, bch2_jset_entry_types[entry->type]);
+ bch2_prt_jset_entry_type(out, entry->type);
}
if (!jset) {
@@ -374,13 +398,13 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
- struct bkey_i *k;
bool first = true;
jset_entry_for_each_key(entry, k) {
if (!first) {
prt_newline(out);
- prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ bch2_prt_jset_entry_type(out, entry->type);
+ prt_str(out, ": ");
}
prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
@@ -540,9 +564,9 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
- prt_printf(out, "type=%s v=%llu",
- bch2_fs_usage_types[u->entry.btree_id],
- le64_to_cpu(u->v));
+ prt_str(out, "type=");
+ bch2_prt_fs_usage_type(out, u->entry.btree_id);
+ prt_printf(out, " v=%llu", le64_to_cpu(u->v));
}
static int journal_entry_data_usage_validate(struct bch_fs *c,
@@ -741,6 +765,37 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct
journal_entry_btree_keys_to_text(out, c, entry);
}
+/*
+ * Validate a datetime journal entry: it must be at least 16 bytes long. An
+ * entry that fails the check is handed to journal_entry_null_range().
+ *
+ * NOTE(review): the size check reports under journal_entry_dev_usage_bad_size,
+ * which looks borrowed from the dev_usage validator - confirm whether a
+ * datetime-specific error ID is wanted.
+ */
+static int journal_entry_datetime_validate(struct bch_fs *c,
+				struct jset *jset,
+				struct jset_entry *entry,
+				unsigned version, int big_endian,
+				enum bkey_invalid_flags flags)
+{
+	unsigned min_bytes = 16;
+	unsigned bytes = vstruct_bytes(entry);
+	int ret = 0;
+
+	if (journal_entry_err_on(bytes < min_bytes,
+				 c, version, jset, entry,
+				 journal_entry_dev_usage_bad_size,
+				 "bad size (%u < %u)",
+				 bytes, min_bytes))
+		journal_entry_null_range(entry, vstruct_next(entry));
+fsck_err:
+	return ret;
+}
+
+/* Print the seconds timestamp stored in a datetime journal entry. */
+static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
+					   struct jset_entry *entry)
+{
+	struct jset_entry_datetime *dt;
+
+	dt = container_of(entry, struct jset_entry_datetime, entry);
+	bch2_prt_datetime(out, le64_to_cpu(dt->seconds));
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
@@ -773,11 +828,11 @@ int bch2_journal_entry_validate(struct bch_fs *c,
void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
+ bch2_prt_jset_entry_type(out, entry->type);
+
if (entry->type < BCH_JSET_ENTRY_NR) {
- prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ prt_str(out, ": ");
bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
- } else {
- prt_printf(out, "(unknown type %u)", entry->type);
}
}
@@ -913,11 +968,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
new_size = roundup_pow_of_two(new_size);
- n = kvpmalloc(new_size, GFP_KERNEL);
+ n = kvmalloc(new_size, GFP_KERNEL);
if (!n)
return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
- kvpfree(b->data, b->size);
+ kvfree(b->data);
b->data = n;
b->size = new_size;
return 0;
@@ -1028,9 +1083,7 @@ reread:
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
vstruct_end(j) - (void *) j->encrypted_start);
- bch2_fs_fatal_err_on(ret, c,
- "error decrypting journal entry: %s",
- bch2_err_str(ret));
+ bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));
mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, (struct journal_ptr) {
@@ -1102,16 +1155,15 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
if (!r)
continue;
- for (i = 0; i < r->nr_ptrs; i++) {
- if (r->ptrs[i].dev == ca->dev_idx) {
- unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
+ darray_for_each(r->ptrs, i)
+ if (i->dev == ca->dev_idx) {
+ unsigned wrote = bucket_remainder(ca, i->sector) +
vstruct_sectors(&r->j, c->block_bits);
- ja->cur_idx = r->ptrs[i].bucket;
+ ja->cur_idx = i->bucket;
ja->sectors_free = ca->mi.bucket_size - wrote;
goto found;
}
- }
}
found:
mutex_unlock(&jlist->lock);
@@ -1144,7 +1196,7 @@ found:
ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
- kvpfree(buf.data, buf.size);
+ kvfree(buf.data);
percpu_ref_put(&ca->io_ref);
closure_return(cl);
return;
@@ -1155,27 +1207,6 @@ err:
goto out;
}
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
-{
- unsigned i;
-
- for (i = 0; i < j->nr_ptrs; i++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
- u64 offset;
-
- div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
-
- if (i)
- prt_printf(out, " ");
- prt_printf(out, "%u:%u:%u (sector %llu)",
- j->ptrs[i].dev,
- j->ptrs[i].bucket,
- j->ptrs[i].bucket_offset,
- j->ptrs[i].sector);
- }
-}
-
int bch2_journal_read(struct bch_fs *c,
u64 *last_seq,
u64 *blacklist_seq,
@@ -1228,20 +1259,20 @@ int bch2_journal_read(struct bch_fs *c,
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
if (!*start_seq)
*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
if (JSET_NO_FLUSH(&i->j)) {
- i->ignore = true;
+ i->ignore_blacklisted = true;
continue;
}
if (!last_write_torn && !i->csum_good) {
last_write_torn = true;
- i->ignore = true;
+ i->ignore_blacklisted = true;
continue;
}
@@ -1280,12 +1311,12 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
seq = le64_to_cpu(i->j.seq);
if (seq < *last_seq) {
- journal_replay_free(c, i);
+ journal_replay_free(c, i, false);
continue;
}
@@ -1293,7 +1324,7 @@ int bch2_journal_read(struct bch_fs *c,
fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
jset_seq_blacklisted,
"found blacklisted journal entry %llu", seq);
- i->ignore = true;
+ i->ignore_blacklisted = true;
}
}
@@ -1302,7 +1333,7 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
BUG_ON(seq > le64_to_cpu(i->j.seq));
@@ -1353,32 +1384,31 @@ int bch2_journal_read(struct bch_fs *c,
.e.data_type = BCH_DATA_journal,
.e.nr_required = 1,
};
- unsigned ptr;
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
- for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+ darray_for_each(i->ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- if (!i->ptrs[ptr].csum_good)
- bch_err_dev_offset(ca, i->ptrs[ptr].sector,
+ if (!ptr->csum_good)
+ bch_err_dev_offset(ca, ptr->sector,
"invalid journal checksum, seq %llu%s",
le64_to_cpu(i->j.seq),
i->csum_good ? " (had good copy on another device)" : "");
}
ret = jset_validate(c,
- bch_dev_bkey_exists(c, i->ptrs[0].dev),
+ bch_dev_bkey_exists(c, i->ptrs.data[0].dev),
&i->j,
- i->ptrs[0].sector,
+ i->ptrs.data[0].sector,
READ);
if (ret)
goto err;
- for (ptr = 0; ptr < i->nr_ptrs; ptr++)
- replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+ darray_for_each(i->ptrs, ptr)
+ replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
bch2_replicas_entry_sort(&replicas.e);
@@ -1547,7 +1577,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
return;
- new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
+ new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
@@ -1558,7 +1588,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
swap(buf->buf_size, new_size);
spin_unlock(&j->lock);
- kvpfree(new_buf, new_size);
+ kvfree(new_buf);
}
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
@@ -1568,12 +1598,12 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
static CLOSURE_CALLBACK(journal_write_done)
{
- closure_type(j, struct journal, io);
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
union journal_res_state old, new;
- u64 v, seq;
+ u64 v, seq = le64_to_cpu(w->data->seq);
int err = 0;
bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
@@ -1593,63 +1623,68 @@ static CLOSURE_CALLBACK(journal_write_done)
if (err)
bch2_fatal_error(c);
- spin_lock(&j->lock);
- seq = le64_to_cpu(w->data->seq);
+ closure_debug_destroy(cl);
+ spin_lock(&j->lock);
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = w->devs_written;
+ if (err && (!j->err_seq || seq < j->err_seq))
+ j->err_seq = seq;
+ w->write_done = true;
- if (!err) {
- if (!JSET_NO_FLUSH(w->data)) {
+ bool completed = false;
+
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ w = j->buf + (seq & JOURNAL_BUF_MASK);
+ if (!w->write_done)
+ break;
+
+ if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
bch2_do_discards(c);
closure_wake_up(&c->freelist_wait);
-
bch2_reset_alloc_cursors(c);
}
- } else if (!j->err_seq || seq < j->err_seq)
- j->err_seq = seq;
- j->seq_ondisk = seq;
+ j->seq_ondisk = seq;
- /*
- * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
- * more buckets:
- *
- * Must come before signaling write completion, for
- * bch2_fs_journal_stop():
- */
- if (j->watermark != BCH_WATERMARK_stripe)
- journal_reclaim_kick(&c->journal);
+ /*
+ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+ * more buckets:
+ *
+ * Must come before signaling write completion, for
+ * bch2_fs_journal_stop():
+ */
+ if (j->watermark != BCH_WATERMARK_stripe)
+ journal_reclaim_kick(&c->journal);
- /* also must come before signalling write completion: */
- closure_debug_destroy(cl);
+ v = atomic64_read(&j->reservations.counter);
+ do {
+ old.v = new.v = v;
+ BUG_ON(journal_state_count(new, new.unwritten_idx));
+ BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
- v = atomic64_read(&j->reservations.counter);
- do {
- old.v = new.v = v;
- BUG_ON(journal_state_count(new, new.unwritten_idx));
+ new.unwritten_idx++;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v);
- new.unwritten_idx++;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ closure_wake_up(&w->wait);
+ completed = true;
+ }
- bch2_journal_reclaim_fast(j);
- bch2_journal_space_available(j);
+ if (completed) {
+ bch2_journal_reclaim_fast(j);
+ bch2_journal_space_available(j);
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
- &j->max_in_flight_start, false);
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
- closure_wake_up(&w->wait);
- journal_wake(j);
+ journal_wake(j);
+ }
- if (!journal_state_count(new, new.unwritten_idx) &&
- journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
- spin_unlock(&j->lock);
- closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
- } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+ if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
struct journal_buf *buf = journal_cur_buf(j);
long delta = buf->expires - jiffies;
@@ -1659,46 +1694,46 @@ static CLOSURE_CALLBACK(journal_write_done)
* previous entries still in flight - the current journal entry
* might want to be written now:
*/
-
- spin_unlock(&j->lock);
- mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
- } else {
- spin_unlock(&j->lock);
+ mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
}
+
+ spin_unlock(&j->lock);
}
static void journal_write_endio(struct bio *bio)
{
- struct bch_dev *ca = bio->bi_private;
+ struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
+ struct bch_dev *ca = jbio->ca;
struct journal *j = &ca->fs->journal;
- struct journal_buf *w = journal_last_unwritten_buf(j);
- unsigned long flags;
+ struct journal_buf *w = j->buf + jbio->buf_idx;
if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
"error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
+ unsigned long flags;
+
spin_lock_irqsave(&j->err_lock, flags);
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
spin_unlock_irqrestore(&j->err_lock, flags);
}
- closure_put(&j->io);
+ closure_put(&w->io);
percpu_ref_put(&ca->io_ref);
}
static CLOSURE_CALLBACK(do_journal_write)
{
- closure_type(j, struct journal, io);
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- struct journal_buf *w = journal_last_unwritten_buf(j);
- struct bio *bio;
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct journal_device *ja = &ca->journal;
+
if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
@@ -1708,7 +1743,7 @@ static CLOSURE_CALLBACK(do_journal_write)
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
- bio = ca->journal.bio;
+ struct bio *bio = &ja->bio[w->idx]->bio;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
@@ -1727,11 +1762,10 @@ static CLOSURE_CALLBACK(do_journal_write)
trace_and_count(c, journal_write, bio);
closure_bio_submit(bio, cl);
- ca->journal.bucket_seq[ca->journal.cur_idx] =
- le64_to_cpu(w->data->seq);
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
}
- continue_at(cl, journal_write_done, c->io_complete_wq);
+ continue_at(cl, journal_write_done, j->wq);
}
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
@@ -1782,11 +1816,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
if (!wb.wb)
bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
- struct bkey_i *k;
jset_entry_for_each_key(i, k) {
ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
if (ret) {
- bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
+ bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
+ bch2_err_str(ret));
bch2_journal_keys_to_write_buffer_end(c, &wb);
return ret;
}
@@ -1798,15 +1832,24 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
if (wb.wb)
bch2_journal_keys_to_write_buffer_end(c, &wb);
+
+ spin_lock(&c->journal.lock);
w->need_flush_to_write_buffer = false;
+ spin_unlock(&c->journal.lock);
start = end = vstruct_last(jset);
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
+ struct jset_entry_datetime *d =
+ container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
+ d->entry.type = BCH_JSET_ENTRY_datetime;
+ d->seconds = cpu_to_le64(ktime_get_real_seconds());
+
bch2_journal_super_entries_add_common(c, &end, seq);
u64s = (u64 *) end - (u64 *) start;
- BUG_ON(u64s > j->entry_u64s_reserved);
+
+ WARN_ON(u64s > j->entry_u64s_reserved);
le32_add_cpu(&jset->u64s, u64s);
@@ -1814,7 +1857,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
bytes = vstruct_bytes(jset);
if (sectors > w->sectors) {
- bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
+ bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
vstruct_bytes(jset), w->sectors << 9,
u64s, w->u64s_reserved, j->entry_u64s_reserved);
return -EINVAL;
@@ -1842,8 +1885,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
- if (bch2_fs_fatal_err_on(ret, c,
- "error decrypting journal entry: %i", ret))
+ if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)))
return ret;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
@@ -1893,6 +1935,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
j->nr_noflush_writes++;
} else {
+ w->must_flush = true;
j->last_flush_write = jiffies;
j->nr_flush_writes++;
clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
@@ -1903,20 +1946,28 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
CLOSURE_CALLBACK(bch2_journal_write)
{
- closure_type(j, struct journal, io);
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
- struct bio *bio;
struct printbuf journal_debug_buf = PRINTBUF;
unsigned nr_rw_members = 0;
int ret;
+ for_each_rw_member(c, ca)
+ nr_rw_members++;
+
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+ BUG_ON(!w->write_started);
+ BUG_ON(w->write_allocated);
+ BUG_ON(w->write_done);
j->write_start_time = local_clock();
spin_lock(&j->lock);
+ if (nr_rw_members > 1)
+ w->separate_flush = true;
+
ret = bch2_journal_write_pick_flush(j, w);
spin_unlock(&j->lock);
if (ret)
@@ -1956,12 +2007,14 @@ CLOSURE_CALLBACK(bch2_journal_write)
* bch2_journal_space_available():
*/
w->sectors = 0;
+ w->write_allocated = true;
/*
* journal entry has been compacted and allocated, recalculate space
* available:
*/
bch2_journal_space_available(j);
+ bch2_journal_do_writes(j);
spin_unlock(&j->lock);
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
@@ -1969,12 +2022,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (c->opts.nochanges)
goto no_io;
- for_each_rw_member(c, ca)
- nr_rw_members++;
-
- if (nr_rw_members > 1)
- w->separate_flush = true;
-
/*
* Mark journal replicas before we submit the write to guarantee
* recovery will find the journal entries after a crash.
@@ -1985,25 +2032,29 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (ret)
goto err;
+ if (!JSET_NO_FLUSH(w->data))
+ closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq));
+
if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
for_each_rw_member(c, ca) {
percpu_ref_get(&ca->io_ref);
- bio = ca->journal.bio;
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = &ja->bio[w->idx]->bio;
bio_reset(bio, ca->disk_sb.bdev,
- REQ_OP_WRITE|REQ_PREFLUSH);
+ REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
}
}
- continue_at(cl, do_journal_write, c->io_complete_wq);
+ continue_at(cl, do_journal_write, j->wq);
return;
no_io:
- continue_at(cl, journal_write_done, c->io_complete_wq);
+ continue_at(cl, journal_write_done, j->wq);
return;
err:
bch2_fatal_error(c);
- continue_at(cl, journal_write_done, c->io_complete_wq);
+ continue_at(cl, journal_write_done, j->wq);
}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index c035e7c108e1..4f1e763ab506 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -2,26 +2,35 @@
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H
+#include "darray.h"
+
+struct journal_ptr { /* one location a copy of a journal entry was read from */
+ bool csum_good; /* false: copy is reported as "invalid journal checksum" */
+ u8 dev; /* device index; compared against ca->dev_idx */
+ u32 bucket; /* journal bucket index on that device (copied into ja->cur_idx) */
+ u32 bucket_offset; /* presumably the offset within the bucket - TODO confirm */
+ u64 sector; /* sector on the device; bucket_remainder() is taken on it */
+};
+
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
*/
struct journal_replay {
- struct journal_ptr {
- bool csum_good;
- u8 dev;
- u32 bucket;
- u32 bucket_offset;
- u64 sector;
- } ptrs[BCH_REPLICAS_MAX];
- unsigned nr_ptrs;
+ DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
bool csum_good;
- bool ignore;
+ bool ignore_blacklisted;
+ bool ignore_not_dirty;
/* must be last: */
struct jset j;
};
+/*
+ * An entry is skipped when its slot is empty (NULL), or when it has been
+ * flagged as blacklisted or as not dirty.
+ */
+static inline bool journal_replay_ignore(struct journal_replay *i)
+{
+	if (!i)
+		return true;
+
+	return i->ignore_blacklisted || i->ignore_not_dirty;
+}
+
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
@@ -36,12 +45,12 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
}
#define for_each_jset_entry_type(entry, jset, type) \
- for (entry = (jset)->start; \
+ for (struct jset_entry *entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))
#define jset_entry_for_each_key(_e, _k) \
- for (_k = (_e)->start; \
+ for (struct bkey_i *_k = (_e)->start; \
_k < vstruct_last(_e); \
_k = bkey_next(_k))
@@ -62,4 +71,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
CLOSURE_CALLBACK(bch2_journal_write);
+/*
+ * Start a new entry at *@end: zero enough whole u64s to hold @size bytes,
+ * record the entry's length, advance *@end past it and return the entry.
+ */
+static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
+{
+	struct jset_entry *new_entry = *end;
+	unsigned nr_u64s = DIV_ROUND_UP(size, sizeof(u64));
+
+	memset(new_entry, 0, nr_u64s * sizeof(u64));
+
+	/*
+	 * The u64s field counts from the start of data, ignoring the shared
+	 * fields - hence the - 1.
+	 */
+	new_entry->u64s = cpu_to_le16(nr_u64s - 1);
+
+	*end = vstruct_next(new_entry);
+	return new_entry;
+}
+
#endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index c33dca641575..04a577848b01 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -62,14 +62,13 @@ void bch2_journal_set_watermark(struct journal *j)
? BCH_WATERMARK_reclaim
: BCH_WATERMARK_stripe;
- if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
- &j->low_on_space_start, low_on_space) ||
- track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
- &j->low_on_pin_start, low_on_pin) ||
- track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
- &j->write_buffer_full_start, low_on_wb))
+ if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
+ track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
+ track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
trace_and_count(c, journal_full, c);
+ mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin);
+
swap(watermark, j->watermark);
if (watermark > j->watermark)
journal_wake(j);
@@ -394,8 +393,6 @@ void bch2_journal_pin_copy(struct journal *j,
struct journal_entry_pin *src,
journal_pin_flush_fn flush_fn)
{
- bool reclaim;
-
spin_lock(&j->lock);
u64 seq = READ_ONCE(src->seq);
@@ -411,44 +408,44 @@ void bch2_journal_pin_copy(struct journal *j,
return;
}
- reclaim = __journal_pin_drop(j, dst);
+ bool reclaim = __journal_pin_drop(j, dst);
bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
- spin_unlock(&j->lock);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
- journal_wake(j);
+ if (seq == journal_last_seq(j))
+ journal_wake(j);
+ spin_unlock(&j->lock);
}
void bch2_journal_pin_set(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
- bool reclaim;
-
spin_lock(&j->lock);
BUG_ON(seq < journal_last_seq(j));
- reclaim = __journal_pin_drop(j, pin);
+ bool reclaim = __journal_pin_drop(j, pin);
bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
- spin_unlock(&j->lock);
-
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
- journal_wake(j);
+ if (seq == journal_last_seq(j))
+ journal_wake(j);
+
+ spin_unlock(&j->lock);
}
/**
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index 0200e299cfbb..37a024e034d4 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -43,61 +43,36 @@ static unsigned sb_blacklist_u64s(unsigned nr)
return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
}
-static struct bch_sb_field_journal_seq_blacklist *
-blacklist_entry_try_merge(struct bch_fs *c,
- struct bch_sb_field_journal_seq_blacklist *bl,
- unsigned i)
-{
- unsigned nr = blacklist_nr_entries(bl);
-
- if (le64_to_cpu(bl->start[i].end) >=
- le64_to_cpu(bl->start[i + 1].start)) {
- bl->start[i].end = bl->start[i + 1].end;
- --nr;
- memmove(&bl->start[i],
- &bl->start[i + 1],
- sizeof(bl->start[0]) * (nr - i));
-
- bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
- sb_blacklist_u64s(nr));
- BUG_ON(!bl);
- }
-
- return bl;
-}
-
-static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e,
- u64 start, u64 end)
-{
- return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start);
-}
-
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
{
struct bch_sb_field_journal_seq_blacklist *bl;
- unsigned i, nr;
+ unsigned i = 0, nr;
int ret = 0;
mutex_lock(&c->sb_lock);
bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
nr = blacklist_nr_entries(bl);
- for (i = 0; i < nr; i++) {
+ while (i < nr) {
struct journal_seq_blacklist_entry *e =
bl->start + i;
- if (bl_entry_contig_or_overlaps(e, start, end)) {
- e->start = cpu_to_le64(min(start, le64_to_cpu(e->start)));
- e->end = cpu_to_le64(max(end, le64_to_cpu(e->end)));
-
- if (i + 1 < nr)
- bl = blacklist_entry_try_merge(c,
- bl, i);
- if (i)
- bl = blacklist_entry_try_merge(c,
- bl, i - 1);
- goto out_write_sb;
+ if (end < le64_to_cpu(e->start))
+ break;
+
+ if (start > le64_to_cpu(e->end)) {
+ i++;
+ continue;
}
+
+ /*
+ * Entry is contiguous or overlapping with new entry: merge it
+ * with new entry, and delete:
+ */
+
+ start = min(start, le64_to_cpu(e->start));
+ end = max(end, le64_to_cpu(e->end));
+ array_remove_item(bl->start, nr, i);
}
bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
@@ -107,9 +82,10 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
goto out;
}
- bl->start[nr].start = cpu_to_le64(start);
- bl->start[nr].end = cpu_to_le64(end);
-out_write_sb:
+ array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) {
+ .start = cpu_to_le64(start),
+ .end = cpu_to_le64(end),
+ }));
c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
ret = bch2_write_super(c);
@@ -119,8 +95,7 @@ out:
return ret ?: bch2_blacklist_table_initialize(c);
}
-static int journal_seq_blacklist_table_cmp(const void *_l,
- const void *_r, size_t size)
+static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
{
const struct journal_seq_blacklist_table_entry *l = _l;
const struct journal_seq_blacklist_table_entry *r = _r;
@@ -165,8 +140,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
if (!bl)
return 0;
- t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
- GFP_KERNEL);
+ t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
if (!t)
return -BCH_ERR_ENOMEM_blacklist_table_init;
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 38817c7a0851..b5161b5d76a0 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -18,6 +18,7 @@
* the journal that are being staged or in flight.
*/
struct journal_buf {
+ struct closure io;
struct jset *data;
__BKEY_PADDED(key, BCH_REPLICAS_MAX);
@@ -33,10 +34,14 @@ struct journal_buf {
unsigned disk_sectors; /* maximum size entry could have been, if
buf_size was bigger */
unsigned u64s_reserved;
- bool noflush; /* write has already been kicked off, and was noflush */
- bool must_flush; /* something wants a flush */
- bool separate_flush;
- bool need_flush_to_write_buffer;
+ bool noflush:1; /* write has already been kicked off, and was noflush */
+ bool must_flush:1; /* something wants a flush */
+ bool separate_flush:1;
+ bool need_flush_to_write_buffer:1;
+ bool write_started:1;
+ bool write_allocated:1;
+ bool write_done:1;
+ u8 idx;
};
/*
@@ -129,11 +134,13 @@ enum journal_flags {
JOURNAL_STARTED,
JOURNAL_MAY_SKIP_FLUSH,
JOURNAL_NEED_FLUSH_WRITE,
+ JOURNAL_SPACE_LOW,
};
/* Reasons we may fail to get a journal reservation: */
#define JOURNAL_ERRORS() \
x(ok) \
+ x(retry) \
x(blocked) \
x(max_in_flight) \
x(journal_full) \
@@ -149,6 +156,13 @@ enum journal_errors {
typedef DARRAY(u64) darray_u64;
+struct journal_bio {
+ struct bch_dev *ca;
+ unsigned buf_idx;
+
+ struct bio bio;
+};
+
/* Embedded in struct bch_fs */
struct journal {
/* Fastpath stuff up front: */
@@ -203,8 +217,8 @@ struct journal {
wait_queue_head_t wait;
struct closure_waitlist async_wait;
- struct closure io;
struct delayed_work write_work;
+ struct workqueue_struct *wq;
/* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq;
@@ -274,11 +288,6 @@ struct journal {
u64 nr_noflush_writes;
u64 entry_bytes_written;
- u64 low_on_space_start;
- u64 low_on_pin_start;
- u64 max_in_flight_start;
- u64 write_buffer_full_start;
-
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;
struct bch2_time_stats *flush_seq_time;
@@ -313,7 +322,7 @@ struct journal_device {
u64 *buckets;
/* Bio for journal reads/writes to this device */
- struct bio *bio;
+ struct journal_bio *bio[JOURNAL_BUF_NR];
/* for bch_journal_read_device */
struct closure read;
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index ad598105c587..b82f8209041f 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -37,7 +37,6 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
struct bkey_buf sk;
u32 restart_count = trans->restart_count;
- int ret;
if (!fn)
return 0;
@@ -45,11 +44,11 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
bch2_bkey_buf_init(&sk);
bch2_bkey_buf_reassemble(&sk, c, k);
- ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?:
- fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count);
+ fn->resume(trans, sk.k);
bch2_bkey_buf_exit(&sk, c);
- return ret;
+
+ return trans_was_restarted(trans, restart_count);
}
int bch2_resume_logged_ops(struct bch_fs *c)
@@ -101,8 +100,8 @@ void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
- bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s",
- __func__, buf.buf, bch2_err_str(ret));
+ bch2_fs_fatal_error(c, "deleting logged operation %s: %s",
+ buf.buf, bch2_err_str(ret));
printbuf_exit(&buf);
}
}
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index 7a4ca5a28b3e..26569043e368 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -44,8 +44,8 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
u64 dev_bucket, u64 time, bool set)
{
return time
- ? bch2_btree_bit_mod(trans, BTREE_ID_lru,
- lru_pos(lru_id, dev_bucket, time), set)
+ ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru,
+ lru_pos(lru_id, dev_bucket, time), set)
: 0;
}
@@ -125,8 +125,7 @@ static int bch2_check_lru_key(struct btree_trans *trans,
goto out;
}
- if (c->opts.reconstruct_alloc ||
- fsck_err(c, lru_entry_bad,
+ if (fsck_err(c, lru_entry_bad,
"incorrect lru entry: lru %s time %llu\n"
" %s\n"
" for %s",
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
index bf0ef668fd38..0ea9f30803a2 100644
--- a/fs/bcachefs/mean_and_variance.c
+++ b/fs/bcachefs/mean_and_variance.c
@@ -103,14 +103,17 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
* mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
* @s: mean and variance number of samples and their sums
* @x: new value to include in the &mean_and_variance_weighted
+ * @initted: caller must track whether this is the first use or not
+ * @weight: ewma weight
*
* see linked pdf: function derived from equations 140-143 where alpha = 2^w.
* values are stored bitshifted for performance and added precision.
*/
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x)
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+ s64 x, bool initted, u8 weight)
{
// previous weighted variance.
- u8 w = s->weight;
+ u8 w = weight;
u64 var_w0 = s->variance;
// new value weighted.
s64 x_w = x << w;
@@ -119,45 +122,50 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64
// new mean weighted.
s64 u_w1 = s->mean + diff;
- if (!s->init) {
+ if (!initted) {
s->mean = x_w;
s->variance = 0;
} else {
s->mean = u_w1;
s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
}
- s->init = true;
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
/**
* mean_and_variance_weighted_get_mean() - get mean from @s
* @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
*/
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+ u8 weight)
{
- return fast_divpow2(s.mean, s.weight);
+ return fast_divpow2(s.mean, weight);
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
/**
* mean_and_variance_weighted_get_variance() -- get variance from @s
* @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
*/
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+ u8 weight)
{
// always positive don't need fast divpow2
- return s.variance >> s.weight;
+ return s.variance >> weight;
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
/**
* mean_and_variance_weighted_get_stddev() - get standard deviation from @s
* @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
*/
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+ u8 weight)
{
- return int_sqrt64(mean_and_variance_weighted_get_variance(s));
+ return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight));
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
index 64df11ab422b..4fcf062dd22c 100644
--- a/fs/bcachefs/mean_and_variance.h
+++ b/fs/bcachefs/mean_and_variance.h
@@ -154,8 +154,6 @@ struct mean_and_variance {
/* expontentially weighted variant */
struct mean_and_variance_weighted {
- bool init;
- u8 weight; /* base 2 logarithim */
s64 mean;
u64 variance;
};
@@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s);
u64 mean_and_variance_get_variance(struct mean_and_variance s1);
u32 mean_and_variance_get_stddev(struct mean_and_variance s);
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+ s64 v, bool initted, u8 weight);
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+ u8 weight);
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+ u8 weight);
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+ u8 weight);
#endif // MEAN_AND_VAIRANCE_H_
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
index 019583c3ca0e..4c298e74723d 100644
--- a/fs/bcachefs/mean_and_variance_test.c
+++ b/fs/bcachefs/mean_and_variance_test.c
@@ -31,53 +31,59 @@ static void mean_and_variance_basic_test(struct kunit *test)
static void mean_and_variance_weighted_test(struct kunit *test)
{
- struct mean_and_variance_weighted s = { .weight = 2 };
+ struct mean_and_variance_weighted s = { };
- mean_and_variance_weighted_update(&s, 10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+ mean_and_variance_weighted_update(&s, 10, false, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
- mean_and_variance_weighted_update(&s, 20);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+ mean_and_variance_weighted_update(&s, 20, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
- mean_and_variance_weighted_update(&s, 30);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+ mean_and_variance_weighted_update(&s, 30, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
- s = (struct mean_and_variance_weighted) { .weight = 2 };
+ s = (struct mean_and_variance_weighted) { };
- mean_and_variance_weighted_update(&s, -10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+ mean_and_variance_weighted_update(&s, -10, false, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
- mean_and_variance_weighted_update(&s, -20);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+ mean_and_variance_weighted_update(&s, -20, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
- mean_and_variance_weighted_update(&s, -30);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+ mean_and_variance_weighted_update(&s, -30, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
}
static void mean_and_variance_weighted_advanced_test(struct kunit *test)
{
- struct mean_and_variance_weighted s = { .weight = 8 };
+ struct mean_and_variance_weighted s = { };
+ bool initted = false;
s64 i;
- for (i = 10; i <= 100; i += 10)
- mean_and_variance_weighted_update(&s, i);
+ for (i = 10; i <= 100; i += 10) {
+ mean_and_variance_weighted_update(&s, i, initted, 8);
+ initted = true;
+ }
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
- s = (struct mean_and_variance_weighted) { .weight = 8 };
+ s = (struct mean_and_variance_weighted) { };
+ initted = false;
- for (i = -10; i >= -100; i -= 10)
- mean_and_variance_weighted_update(&s, i);
+ for (i = -10; i >= -100; i -= 10) {
+ mean_and_variance_weighted_update(&s, i, initted, 8);
+ initted = true;
+ }
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
}
static void do_mean_and_variance_test(struct kunit *test,
@@ -92,26 +98,26 @@ static void do_mean_and_variance_test(struct kunit *test,
s64 *weighted_stddev)
{
struct mean_and_variance mv = {};
- struct mean_and_variance_weighted vw = { .weight = weight };
+ struct mean_and_variance_weighted vw = { };
for (unsigned i = 0; i < initial_n; i++) {
mean_and_variance_update(&mv, initial_value);
- mean_and_variance_weighted_update(&vw, initial_value);
+ mean_and_variance_weighted_update(&vw, initial_value, false, weight);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0);
}
for (unsigned i = 0; i < n; i++) {
mean_and_variance_update(&mv, data[i]);
- mean_and_variance_weighted_update(&vw, data[i]);
+ mean_and_variance_weighted_update(&vw, data[i], true, weight);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]);
}
KUNIT_EXPECT_EQ(test, mv.n, initial_n + n);
@@ -130,20 +136,8 @@ static void mean_and_variance_test_1(struct kunit *test)
d, mean, stddev, weighted_mean, weighted_stddev);
}
-static void mean_and_variance_test_2(struct kunit *test)
-{
- s64 d[] = { 100, 10, 10, 10, 10, 10, 10 };
- s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 };
- s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 };
- s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 };
- s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
/* Test behaviour where we switch from one steady state to another: */
-static void mean_and_variance_test_3(struct kunit *test)
+static void mean_and_variance_test_2(struct kunit *test)
{
s64 d[] = { 100, 100, 100, 100, 100 };
s64 mean[] = { 22, 32, 40, 46, 50 };
@@ -155,18 +149,6 @@ static void mean_and_variance_test_3(struct kunit *test)
d, mean, stddev, weighted_mean, weighted_stddev);
}
-static void mean_and_variance_test_4(struct kunit *test)
-{
- s64 d[] = { 100, 100, 100, 100, 100 };
- s64 mean[] = { 10, 11, 12, 13, 14 };
- s64 stddev[] = { 9, 13, 15, 17, 19 };
- s64 weighted_mean[] = { 32, 49, 61, 71, 78 };
- s64 weighted_stddev[] = { 38, 44, 44, 41, 38 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
static void mean_and_variance_fast_divpow2(struct kunit *test)
{
s64 i;
@@ -224,8 +206,6 @@ static struct kunit_case mean_and_variance_test_cases[] = {
KUNIT_CASE(mean_and_variance_weighted_advanced_test),
KUNIT_CASE(mean_and_variance_test_1),
KUNIT_CASE(mean_and_variance_test_2),
- KUNIT_CASE(mean_and_variance_test_3),
- KUNIT_CASE(mean_and_variance_test_4),
{}
};
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 5623cee3ef86..69098eeb5d48 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
nr_good = bch2_bkey_durability(c, k.s_c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
- return -EINVAL;
+ return -BCH_ERR_remove_would_lose_data;
return 0;
}
@@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
/* don't handle this yet: */
if (flags & BCH_FORCE_IF_METADATA_LOST)
- return -EINVAL;
+ return -BCH_ERR_remove_with_metadata_missing_unimplemented;
trans = bch2_trans_get(c);
bch2_bkey_buf_init(&k);
@@ -132,10 +132,8 @@ retry:
ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
dev_idx, flags, true);
- if (ret) {
- bch_err(c, "Cannot drop device without losing data");
+ if (ret)
break;
- }
ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 69e06a84dad4..0d2b82d8d11f 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -155,8 +155,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
if (bch2_err_matches(ret, EROFS))
return ret;
- if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()",
- __func__, bch2_err_str(ret)))
+ if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
return ret;
ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index b1ed0b9a20d3..bb068fd72465 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -7,6 +7,7 @@
#include "disk_groups.h"
#include "error.h"
#include "opts.h"
+#include "recovery_passes.h"
#include "super-io.h"
#include "util.h"
@@ -42,7 +43,7 @@ const char * const __bch2_btree_ids[] = {
NULL
};
-const char * const bch2_csum_types[] = {
+static const char * const __bch2_csum_types[] = {
BCH_CSUM_TYPES()
NULL
};
@@ -52,7 +53,7 @@ const char * const bch2_csum_opts[] = {
NULL
};
-const char * const __bch2_compression_types[] = {
+static const char * const __bch2_compression_types[] = {
BCH_COMPRESSION_TYPES()
NULL
};
@@ -82,18 +83,39 @@ const char * const bch2_member_states[] = {
NULL
};
-const char * const bch2_jset_entry_types[] = {
+static const char * const __bch2_jset_entry_types[] = {
BCH_JSET_ENTRY_TYPES()
NULL
};
-const char * const bch2_fs_usage_types[] = {
+static const char * const __bch2_fs_usage_types[] = {
BCH_FS_USAGE_TYPES()
NULL
};
#undef x
+static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
+ unsigned nr, const char *type, unsigned idx)
+{
+ if (idx < nr)
+ prt_str(out, opts[idx]);
+ else
+ prt_printf(out, "(unknown %s %u)", type, idx);
+}
+
+#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \
+void bch2_prt_##name(struct printbuf *out, type t) \
+{ \
+ prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\
+}
+
+PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type);
+PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type);
+PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type);
+PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
+PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
+
static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
struct printbuf *err)
{
@@ -205,6 +227,9 @@ const struct bch_option bch2_opt_table[] = {
#define OPT_STR(_choices) .type = BCH_OPT_STR, \
.min = 0, .max = ARRAY_SIZE(_choices), \
.choices = _choices
+#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \
+ .min = 0, .max = U64_MAX, \
+ .choices = _choices
#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
@@ -314,7 +339,7 @@ int bch2_opt_parse(struct bch_fs *c,
if (ret < 0 || (*res != 0 && *res != 1)) {
if (err)
prt_printf(err, "%s: must be bool", opt->attr.name);
- return ret;
+ return ret < 0 ? ret : -BCH_ERR_option_not_bool;
}
break;
case BCH_OPT_UINT:
@@ -456,7 +481,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
copied_opts = kstrdup(options, GFP_KERNEL);
if (!copied_opts)
- return -1;
+ return -ENOMEM;
copied_opts_start = copied_opts;
while ((opt = strsep(&copied_opts, ",")) != NULL) {
@@ -501,11 +526,11 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
bad_opt:
pr_err("Bad mount option %s", name);
- ret = -1;
+ ret = -BCH_ERR_option_name;
goto out;
bad_val:
pr_err("Invalid mount option %s", err.buf);
- ret = -1;
+ ret = -BCH_ERR_option_value;
goto out;
out:
kfree(copied_opts_start);
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 9a4b7faa3765..84e452835a17 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -16,18 +16,20 @@ extern const char * const bch2_version_upgrade_opts[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
-extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
-extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
-extern const char * const bch2_jset_entry_types[];
-extern const char * const bch2_fs_usage_types[];
extern const char * const bch2_d_types[];
+void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type);
+void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type);
+void bch2_prt_data_type(struct printbuf *, enum bch_data_type);
+void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
+void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
+
static inline const char *bch2_d_type_str(unsigned d_type)
{
return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
@@ -290,6 +292,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Allow mounting in when data will be missing") \
+ x(no_splitbrain_check, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't kick drives out when splitbrain detected")\
x(discard, u8, \
OPT_FS|OPT_MOUNT|OPT_DEVICE, \
OPT_BOOL(), \
@@ -332,6 +339,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Run fsck on mount") \
+ x(fsck_memory_usage_percent, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_UINT(20, 70), \
+ BCH2_NO_SB_OPT, 50, \
+ NULL, "Maximum percentage of system ram fsck is allowed to pin")\
x(fix_errors, u8, \
OPT_FS|OPT_MOUNT, \
OPT_FN(bch2_opt_fix_errors), \
@@ -352,12 +364,17 @@ enum fsck_err_opts {
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
- NULL, "Don't replay the journal") \
- x(keep_journal, u8, \
+ NULL, "Exit recovery immediately prior to journal replay")\
+ x(recovery_pass_last, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_STR_NOLIMIT(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Exit recovery after specified pass") \
+ x(retain_recovery_info, u8, \
0, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
- NULL, "Don't free journal entries/keys after startup")\
+ NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\
x(read_entire_journal, u8, \
0, \
OPT_BOOL(), \
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 22d1017aa49b..56336f3dd1d0 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -412,11 +412,11 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
u64 now = atomic64_read(&c->io_clock[WRITE].now);
prt_str(out, "io wait duration: ");
- bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
+ bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
prt_newline(out);
prt_str(out, "io wait remaining: ");
- bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
+ bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
prt_newline(out);
prt_str(out, "duration waited: ");
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 21e13bb4335b..0f328aba9760 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1,35 +1,31 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
#include "alloc_background.h"
-#include "btree_gc.h"
+#include "bkey_buf.h"
#include "btree_journal_iter.h"
+#include "btree_node_scan.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "buckets.h"
#include "dirent.h"
-#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "fs-common.h"
-#include "fsck.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
-#include "lru.h"
#include "logged_ops.h"
#include "move.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-downgrade.h"
#include "snapshot.h"
-#include "subvolume.h"
#include "super-io.h"
#include <linux/sort.h>
@@ -37,30 +33,67 @@
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-static bool btree_id_is_alloc(enum btree_id id)
+void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
{
- switch (id) {
- case BTREE_ID_alloc:
- case BTREE_ID_backpointers:
- case BTREE_ID_need_discard:
- case BTREE_ID_freespace:
- case BTREE_ID_bucket_gens:
- return true;
- default:
- return false;
+ u64 b = BIT_ULL(btree);
+
+ if (!(c->sb.btrees_lost_data & b)) {
+ bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree));
+
+ mutex_lock(&c->sb_lock);
+ bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
}
}
/* for -o reconstruct_alloc: */
-static void drop_alloc_keys(struct journal_keys *keys)
+static void bch2_reconstruct_alloc(struct bch_fs *c)
{
- size_t src, dst;
+ bch2_journal_log_msg(c, "dropping alloc info");
+ bch_info(c, "dropping and reconstructing all alloc info");
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);
+
+ __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- for (src = 0, dst = 0; src < keys->nr; src++)
- if (!btree_id_is_alloc(keys->d[src].btree_id))
- keys->d[dst++] = keys->d[src];
+ /* @ext was looked up in c->disk_sb.sb under sb_lock: read it before unlocking */
+ c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
- keys->nr = dst;
+
+ bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_freespace,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
}
/*
@@ -70,9 +103,7 @@ static void drop_alloc_keys(struct journal_keys *keys)
*/
static void zero_out_btree_mem_ptr(struct journal_keys *keys)
{
- struct journal_key *i;
-
- for (i = keys->d; i < keys->d + keys->nr; i++)
+ darray_for_each(*keys, i)
if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
}
@@ -124,6 +155,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (ret)
goto out;
+ struct btree_path *path = btree_iter_path(trans, &iter);
+ if (unlikely(!btree_path_node(path, k->level))) {
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ BTREE_MAX_DEPTH, 0, iter_flags);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_increase_depth(trans, iter.path, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
+ }
+
/* Must be checked with btree locked: */
if (k->overwritten)
goto out;
@@ -142,7 +184,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
return cmp_int(l->journal_seq, r->journal_seq);
}
-static int bch2_journal_replay(struct bch_fs *c)
+int bch2_journal_replay(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
DARRAY(struct journal_key *) keys_sorted = { 0 };
@@ -150,6 +192,7 @@ static int bch2_journal_replay(struct bch_fs *c)
u64 start_seq = c->journal_replay_seq_start;
u64 end_seq = c->journal_replay_seq_start;
struct btree_trans *trans = bch2_trans_get(c);
+ bool immediate_flush = false;
int ret = 0;
if (keys->nr) {
@@ -161,15 +204,22 @@ static int bch2_journal_replay(struct bch_fs *c)
BUG_ON(!atomic_read(&keys->ref));
+ move_gap(keys, keys->nr);
+
/*
* First, attempt to replay keys in sorted order. This is more
* efficient - better locality of btree access - but some might fail if
* that would cause a journal deadlock.
*/
- for (size_t i = 0; i < keys->nr; i++) {
+ darray_for_each(*keys, k) {
cond_resched();
- struct journal_key *k = keys->d + i;
+ /*
+ * k->allocated means the key wasn't read in from the journal,
+ * rather it was from early repair code
+ */
+ if (k->allocated)
+ immediate_flush = true;
/* Skip fastpath if we're low on space in the journal */
ret = c->journal.watermark ? -1 :
@@ -222,7 +272,8 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_trans_put(trans);
trans = NULL;
- if (!c->opts.keep_journal)
+ if (!c->opts.retain_recovery_info &&
+ c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
bch2_journal_keys_put_initial(c);
replay_now_at(j, j->replay_journal_seq_end);
@@ -230,6 +281,12 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_journal_set_replay_done(j);
+ /* if we did any repair, flush it immediately */
+ if (immediate_flush) {
+ bch2_journal_flush_all_pins(&c->journal);
+ ret = bch2_journal_meta(&c->journal);
+ }
+
if (keys->nr)
bch2_journal_log_msg(c, "journal replay finished");
err:
@@ -264,7 +321,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
bkey_copy(&r->key, (struct bkey_i *) entry->start);
r->error = 0;
} else {
- r->error = -EIO;
+ r->error = -BCH_ERR_btree_node_read_error;
}
r->alive = true;
break;
@@ -359,7 +416,7 @@ static int journal_replay_early(struct bch_fs *c,
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
vstruct_for_each(&i->j, entry) {
@@ -379,202 +436,57 @@ static int journal_replay_early(struct bch_fs *c,
static int read_btree_roots(struct bch_fs *c)
{
- unsigned i;
int ret = 0;
- for (i = 0; i < btree_id_nr_alive(c); i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (!r->alive)
continue;
- if (btree_id_is_alloc(i) &&
- c->opts.reconstruct_alloc) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
continue;
- }
- if (r->error) {
- __fsck_err(c,
- btree_id_is_alloc(i)
- ? FSCK_CAN_IGNORE : 0,
- btree_root_bkey_invalid,
- "invalid btree root %s",
- bch2_btree_id_str(i));
- if (i == BTREE_ID_alloc)
+ if (mustfix_fsck_err_on((ret = r->error),
+ c, btree_root_bkey_invalid,
+ "invalid btree root %s",
+ bch2_btree_id_str(i)) ||
+ mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
+ c, btree_root_read_error,
+ "error reading btree root %s l=%u: %s",
+ bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
+ if (btree_id_is_alloc(i)) {
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- }
+ r->error = 0;
+ } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
+ bch_info(c, "will run btree node scan");
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+ }
- ret = bch2_btree_root_read(c, i, &r->key, r->level);
- if (ret) {
- fsck_err(c,
- btree_root_read_error,
- "error reading btree root %s",
- bch2_btree_id_str(i));
- if (btree_id_is_alloc(i))
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
ret = 0;
+ bch2_btree_lost_data(c, i);
}
}
- for (i = 0; i < BTREE_ID_NR; i++) {
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
- if (!r->b) {
+ if (!r->b && !r->error) {
r->alive = false;
r->level = 0;
- bch2_btree_root_alloc(c, i);
+ bch2_btree_root_alloc_fake(c, i, 0);
}
}
fsck_err:
return ret;
}
-static int bch2_initialize_subvolumes(struct bch_fs *c)
-{
- struct bkey_i_snapshot_tree root_tree;
- struct bkey_i_snapshot root_snapshot;
- struct bkey_i_subvolume root_volume;
- int ret;
-
- bkey_snapshot_tree_init(&root_tree.k_i);
- root_tree.k.p.offset = 1;
- root_tree.v.master_subvol = cpu_to_le32(1);
- root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
-
- bkey_snapshot_init(&root_snapshot.k_i);
- root_snapshot.k.p.offset = U32_MAX;
- root_snapshot.v.flags = 0;
- root_snapshot.v.parent = 0;
- root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
- root_snapshot.v.tree = cpu_to_le32(1);
- SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
-
- bkey_subvolume_init(&root_volume.k_i);
- root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_volume.v.flags = 0;
- root_volume.v.snapshot = cpu_to_le32(U32_MAX);
- root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
-
- ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
- bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
- bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_inode_unpacked inode;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!bkey_is_inode(k.k)) {
- bch_err(trans->c, "root inode not found");
- ret = -BCH_ERR_ENOENT_inode;
- goto err;
- }
-
- ret = bch2_inode_unpack(k, &inode);
- BUG_ON(ret);
-
- inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
-
- ret = bch2_inode_write(trans, &iter, &inode);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* set bi_subvol on root inode */
-noinline_for_stack
-static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
-{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
- __bch2_fs_upgrade_for_subvolumes(trans));
- bch_err_fn(c, ret);
- return ret;
-}
-
-const char * const bch2_recovery_passes[] = {
-#define x(_fn, ...) #_fn,
- BCH_RECOVERY_PASSES()
-#undef x
- NULL
-};
-
-static int bch2_check_allocations(struct bch_fs *c)
-{
- return bch2_gc(c, true, c->opts.norecovery);
-}
-
-static int bch2_set_may_go_rw(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
-
- /*
- * After we go RW, the journal keys buffer can't be modified (except for
- * setting journal_key->overwritten: it will be accessed by multiple
- * threads
- */
- move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
- keys->gap = keys->nr;
-
- set_bit(BCH_FS_may_go_rw, &c->flags);
-
- if (keys->nr || c->opts.fsck || !c->sb.clean)
- return bch2_fs_read_write_early(c);
- return 0;
-}
-
-struct recovery_pass_fn {
- int (*fn)(struct bch_fs *);
- unsigned when;
-};
-
-static struct recovery_pass_fn recovery_pass_fns[] = {
-#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-u64 bch2_recovery_passes_to_stable(u64 v)
-{
- static const u8 map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- };
-
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(map[i]);
- return ret;
-}
-
-u64 bch2_recovery_passes_from_stable(u64 v)
-{
- static const u8 map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- };
-
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(map[i]);
- return ret;
-}
-
static bool check_version_upgrade(struct bch_fs *c)
{
unsigned latest_version = bcachefs_metadata_version_current;
@@ -647,96 +559,6 @@ static bool check_version_upgrade(struct bch_fs *c)
return false;
}
-u64 bch2_fsck_recovery_passes(void)
-{
- u64 ret = 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
- if (recovery_pass_fns[i].when & PASS_FSCK)
- ret |= BIT_ULL(i);
- return ret;
-}
-
-static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
-
- if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
- return false;
- if (c->recovery_passes_explicit & BIT_ULL(pass))
- return true;
- if ((p->when & PASS_FSCK) && c->opts.fsck)
- return true;
- if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
- return true;
- if (p->when & PASS_ALWAYS)
- return true;
- return false;
-}
-
-static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
- int ret;
-
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
- bch2_recovery_passes[pass]);
- ret = p->fn(c);
- if (ret)
- return ret;
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_CONT " done\n");
-
- return 0;
-}
-
-static int bch2_run_recovery_passes(struct bch_fs *c)
-{
- int ret = 0;
-
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
- if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
- unsigned pass = c->curr_recovery_pass;
-
- ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
- (ret && c->curr_recovery_pass < pass))
- continue;
- if (ret)
- break;
-
- c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
- }
- c->curr_recovery_pass++;
- c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
- }
-
- return ret;
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *c)
-{
- int ret = 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
- struct recovery_pass_fn *p = recovery_pass_fns + i;
-
- if (!(p->when & PASS_ONLINE))
- continue;
-
- ret = bch2_run_recovery_pass(c, i);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
- i = c->curr_recovery_pass;
- continue;
- }
- if (ret)
- break;
- }
-
- return ret;
-}
-
int bch2_fs_recovery(struct bch_fs *c)
{
struct bch_sb_field_clean *clean = NULL;
@@ -769,24 +591,14 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (c->opts.fsck && c->opts.norecovery) {
- bch_err(c, "cannot select both norecovery and fsck");
- ret = -EINVAL;
- goto err;
- }
+ if (c->opts.norecovery)
+ c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1;
if (!c->opts.nochanges) {
mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
bool write_sb = false;
- struct bch_sb_field_ext *ext =
- bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
- if (!ext) {
- ret = -BCH_ERR_ENOSPC_sb;
- mutex_unlock(&c->sb_lock);
- goto err;
- }
-
if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
ext->recovery_passes_required[0] |=
cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
@@ -845,7 +657,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
+ if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) {
struct genradix_iter iter;
struct journal_replay **i;
@@ -862,7 +674,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto out;
genradix_for_each_reverse(&c->journal_entries, iter, i)
- if (*i && !(*i)->ignore) {
+ if (!journal_replay_ignore(*i)) {
last_journal_entry = &(*i)->j;
break;
}
@@ -887,7 +699,8 @@ int bch2_fs_recovery(struct bch_fs *c)
genradix_for_each_reverse(&c->journal_entries, iter, i)
if (*i) {
last_journal_entry = &(*i)->j;
- (*i)->ignore = false;
+ (*i)->ignore_blacklisted = false;
+ (*i)->ignore_not_dirty= false;
/*
* This was probably a NO_FLUSH entry,
* so last_seq was garbage - but we know
@@ -923,10 +736,8 @@ use_clean:
c->journal_replay_seq_start = last_seq;
c->journal_replay_seq_end = blacklist_seq - 1;
- if (c->opts.reconstruct_alloc) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- drop_alloc_keys(&c->journal_keys);
- }
+ if (c->opts.reconstruct_alloc)
+ bch2_reconstruct_alloc(c);
zero_out_btree_mem_ptr(&c->journal_keys);
@@ -950,7 +761,7 @@ use_clean:
bch2_journal_seq_blacklist_add(c,
blacklist_seq, journal_seq);
if (ret) {
- bch_err(c, "error creating new journal seq blacklist entry");
+ bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
goto err;
}
}
@@ -961,9 +772,6 @@ use_clean:
if (ret)
goto err;
- if (c->opts.reconstruct_alloc)
- bch2_journal_log_msg(c, "dropping alloc info");
-
/*
* Skip past versions that might have possibly been used (as nonces),
* but hadn't had their pointers written:
@@ -981,6 +789,12 @@ use_clean:
clear_bit(BCH_FS_fsck_running, &c->flags);
+ /* fsync if we fixed errors */
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_meta(&c->journal);
+ }
+
/* If we fixed errors, verify that fs is actually clean now: */
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
test_bit(BCH_FS_errors_fixed, &c->flags) &&
@@ -1015,6 +829,7 @@ use_clean:
}
mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
bool write_sb = false;
if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
@@ -1028,15 +843,18 @@ use_clean:
write_sb = true;
}
- if (!test_bit(BCH_FS_error, &c->flags)) {
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- if (ext &&
- (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) ||
- !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) {
- memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required));
- memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
- write_sb = true;
- }
+ if (!test_bit(BCH_FS_error, &c->flags) &&
+ !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
+ memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
+ write_sb = true;
+ }
+
+ if (c->opts.fsck &&
+ !test_bit(BCH_FS_error, &c->flags) &&
+ c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
+ ext->btrees_lost_data) {
+ ext->btrees_lost_data = 0;
+ write_sb = true;
}
if (c->opts.fsck &&
@@ -1077,9 +895,10 @@ use_clean:
out:
bch2_flush_fsck_errs(c);
- if (!c->opts.keep_journal &&
- test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ if (!c->opts.retain_recovery_info) {
bch2_journal_keys_put_initial(c);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
+ }
kfree(clean);
if (!ret &&
@@ -1105,6 +924,7 @@ int bch2_fs_initialize(struct bch_fs *c)
int ret;
bch_notice(c, "initializing new filesystem");
+ set_bit(BCH_FS_new_fs, &c->flags);
mutex_lock(&c->sb_lock);
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
@@ -1119,11 +939,11 @@ int bch2_fs_initialize(struct bch_fs *c)
}
mutex_unlock(&c->sb_lock);
- c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
set_bit(BCH_FS_may_go_rw, &c->flags);
for (unsigned i = 0; i < BTREE_ID_NR; i++)
- bch2_btree_root_alloc(c, i);
+ bch2_btree_root_alloc_fake(c, i, 0);
for_each_member_device(c, ca)
bch2_dev_usage_init(ca);
@@ -1194,7 +1014,7 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1;
+ c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index 4e9d24719b2e..4bf818de1f2f 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -2,37 +2,9 @@
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
-extern const char * const bch2_recovery_passes[];
+void bch2_btree_lost_data(struct bch_fs *, enum btree_id);
-u64 bch2_recovery_passes_to_stable(u64 v);
-u64 bch2_recovery_passes_from_stable(u64 v);
-
-/*
- * For when we need to rewind recovery passes and run a pass we skipped:
- */
-static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- if (c->recovery_passes_explicit & BIT_ULL(pass))
- return 0;
-
- bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
- bch2_recovery_passes[pass], pass,
- bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
-
- c->recovery_passes_explicit |= BIT_ULL(pass);
-
- if (c->curr_recovery_pass >= pass) {
- c->curr_recovery_pass = pass;
- c->recovery_passes_complete &= (1ULL << pass) >> 1;
- return -BCH_ERR_restart_recovery;
- } else {
- return 0;
- }
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *);
-u64 bch2_fsck_recovery_passes(void);
+int bch2_journal_replay(struct bch_fs *);
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
new file mode 100644
index 000000000000..0cec0f7d9703
--- /dev/null
+++ b/fs/bcachefs/recovery_passes.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_gc.h"
+#include "btree_node_scan.h"
+#include "ec.h"
+#include "fsck.h"
+#include "inode.h"
+#include "journal.h"
+#include "lru.h"
+#include "logged_ops.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "recovery_passes.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+
+/*
+ * Printable names of the recovery passes: one stringified entry per
+ * BCH_RECOVERY_PASSES() item, in list order, followed by a NULL terminator.
+ */
+const char * const bch2_recovery_passes[] = {
+#define x(_fn, ...) #_fn,
+ BCH_RECOVERY_PASSES()
+#undef x
+ NULL
+};
+
+/*
+ * check_allocations recovery pass: thin wrapper giving bch2_gc() the
+ * int (*)(struct bch_fs *) signature used by recovery_pass_fns[].
+ * NOTE(review): the meaning of the fixed (true, false) arguments is defined
+ * by bch2_gc(), which is not visible here.
+ */
+static int bch2_check_allocations(struct bch_fs *c)
+{
+ return bch2_gc(c, true, false);
+}
+
+/*
+ * set_may_go_rw recovery pass: sets BCH_FS_may_go_rw and, when there is
+ * anything for recovery to do, switches the filesystem read-write early.
+ *
+ * Returns 0, or the result of bch2_fs_read_write_early().
+ */
+static int bch2_set_may_go_rw(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+
+ /*
+ * After we go RW, the journal keys buffer can't be modified (except for
+ * setting journal_key->overwritten): it will be accessed by multiple
+ * threads
+ */
+ move_gap(keys, keys->nr);
+
+ set_bit(BCH_FS_may_go_rw, &c->flags);
+
+ /*
+ * Go RW now if there are journal keys, fsck was requested, the superblock
+ * is not clean, or any recovery pass was explicitly requested.
+ */
+ if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit)
+ return bch2_fs_read_write_early(c);
+ return 0;
+}
+
+/*
+ * One entry per recovery pass: the function implementing it and the PASS_*
+ * flag mask saying when it runs.
+ */
+struct recovery_pass_fn {
+ int (*fn)(struct bch_fs *);
+ unsigned when;
+};
+
+/*
+ * Table generated from BCH_RECOVERY_PASSES(): entry for pass "foo" calls
+ * bch2_foo(); entries appear in list order.
+ */
+static struct recovery_pass_fn recovery_pass_fns[] = {
+#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+/* Maps BCH_RECOVERY_PASS_* (in-memory order) to BCH_RECOVERY_PASS_STABLE_* */
+static const u8 passes_to_stable_map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+/*
+ * Translate a single pass number to its stable identifier.
+ * No bounds check is done on @pass; presumably callers only pass valid
+ * enum bch_recovery_pass values - TODO confirm.
+ */
+static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
+{
+ return passes_to_stable_map[pass];
+}
+
+/*
+ * Convert a bitmask of in-memory pass numbers into a bitmask of stable
+ * identifiers: for every set bit i in @v, bit passes_to_stable_map[i] is set
+ * in the result. Bits of @v beyond the map's size are dropped.
+ */
+u64 bch2_recovery_passes_to_stable(u64 v)
+{
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(passes_to_stable_map[i]);
+ return ret;
+}
+
+/*
+ * Inverse of bch2_recovery_passes_to_stable(): convert a bitmask of stable
+ * identifiers into a bitmask of in-memory pass numbers. Bits of @v beyond the
+ * local map's size are dropped.
+ */
+u64 bch2_recovery_passes_from_stable(u64 v)
+{
+ /* Indexed by stable identifier, yields the in-memory pass number */
+ static const u8 map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+ };
+
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(map[i]);
+ return ret;
+}
+
+/*
+ * For when we need to rewind recovery passes and run a pass we skipped:
+ *
+ * Marks @pass in c->recovery_passes_explicit. Returns 0 if it was already
+ * marked, or if recovery has not reached @pass yet; otherwise rewinds
+ * c->curr_recovery_pass to @pass and returns -BCH_ERR_restart_recovery.
+ */
+int bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ /* Already requested: nothing to do */
+ if (c->recovery_passes_explicit & BIT_ULL(pass))
+ return 0;
+
+ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
+ bch2_recovery_passes[pass], pass,
+ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+
+ c->recovery_passes_explicit |= BIT_ULL(pass);
+
+ if (c->curr_recovery_pass >= pass) {
+ c->curr_recovery_pass = pass;
+ /*
+ * NOTE(review): (1ULL << pass) >> 1 is the single bit (pass - 1), so
+ * this keeps at most that one completed-pass bit. If the intent is to
+ * keep every pass before @pass, the mask would be ~(~0ULL << pass) -
+ * confirm.
+ */
+ c->recovery_passes_complete &= (1ULL << pass) >> 1;
+ return -BCH_ERR_restart_recovery;
+ } else {
+ return 0;
+ }
+}
+
+/*
+ * Like bch2_run_explicit_recovery_pass(), but first records @pass (by its
+ * stable identifier) in the superblock ext field's recovery_passes_required
+ * bitmap, writing the superblock only if the bit was not already set.
+ *
+ * Returns whatever bch2_run_explicit_recovery_pass() returns.
+ */
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+ mutex_lock(&c->sb_lock);
+ /* NOTE(review): ext is dereferenced unchecked; assumes the field exists */
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (!test_bit_le64(s, ext->recovery_passes_required)) {
+ __set_bit_le64(s, ext->recovery_passes_required);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+
+ return bch2_run_explicit_recovery_pass(c, pass);
+}
+
+/*
+ * Clear @pass (by its stable identifier) from the superblock ext field's
+ * recovery_passes_required bitmap. The superblock is written only if the bit
+ * was actually set. Takes and releases c->sb_lock.
+ */
+static void bch2_clear_recovery_pass_required(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+ mutex_lock(&c->sb_lock);
+ /* NOTE(review): ext is dereferenced unchecked; assumes the field exists */
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (test_bit_le64(s, ext->recovery_passes_required)) {
+ __clear_bit_le64(s, ext->recovery_passes_required);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+}
+
+/*
+ * Returns a bitmask (indexed by in-memory pass number) of every recovery pass
+ * whose 'when' flags include PASS_FSCK.
+ */
+u64 bch2_fsck_recovery_passes(void)
+{
+ u64 ret = 0;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+ if (recovery_pass_fns[i].when & PASS_FSCK)
+ ret |= BIT_ULL(i);
+ return ret;
+}
+
+/*
+ * Decide whether @pass should run during this recovery. A pass runs if any of
+ * the following holds:
+ *  - it was explicitly requested (bit set in c->recovery_passes_explicit)
+ *  - it is a PASS_FSCK pass and the fsck option is set
+ *  - it is a PASS_UNCLEAN pass and the superblock is not clean
+ *  - it is a PASS_ALWAYS pass
+ */
+static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+
+ if (c->recovery_passes_explicit & BIT_ULL(pass))
+ return true;
+ if ((p->when & PASS_FSCK) && c->opts.fsck)
+ return true;
+ if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
+ return true;
+ if (p->when & PASS_ALWAYS)
+ return true;
+ return false;
+}
+
+/*
+ * Run a single recovery pass, printing "<name>..." before and " done" after
+ * unless the pass is flagged PASS_SILENT.
+ *
+ * Returns 0 on success, or the pass function's nonzero return value; in the
+ * latter case the " done" continuation is not printed.
+ */
+static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+ int ret;
+
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
+ bch2_recovery_passes[pass]);
+ ret = p->fn(c);
+ if (ret)
+ return ret;
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_CONT " done\n");
+
+ return 0;
+}
+
+/*
+ * Run every recovery pass flagged PASS_ONLINE, in table order.
+ *
+ * Returns 0, or the first error from a pass that is not a
+ * BCH_ERR_restart_recovery error.
+ */
+int bch2_run_online_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
+ struct recovery_pass_fn *p = recovery_pass_fns + i;
+
+ if (!(p->when & PASS_ONLINE))
+ continue;
+
+ ret = bch2_run_recovery_pass(c, i);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
+ /*
+ * NOTE(review): 'continue' still executes the loop's i++, so the
+ * next index examined is c->curr_recovery_pass + 1, not
+ * c->curr_recovery_pass itself - confirm this is intended.
+ */
+ i = c->curr_recovery_pass;
+ continue;
+ }
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Main recovery loop: starting from c->curr_recovery_pass, run each pass that
+ * should_run_recovery_pass() selects, until the table is exhausted, the
+ * recovery_pass_last option is exceeded, or a pass fails.
+ *
+ * Returns 0 or the error that stopped the loop.
+ */
+int bch2_run_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
+
+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
+ /* A nonzero recovery_pass_last caps how far recovery proceeds */
+ if (c->opts.recovery_pass_last &&
+ c->curr_recovery_pass > c->opts.recovery_pass_last)
+ break;
+
+ if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
+ unsigned pass = c->curr_recovery_pass;
+
+ ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
+ /*
+ * Restart requested, or the pass failed after
+ * curr_recovery_pass was moved backwards: loop again from the
+ * (possibly rewound) current pass without advancing.
+ */
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
+ (ret && c->curr_recovery_pass < pass))
+ continue;
+ if (ret)
+ break;
+
+ c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
+ }
+
+ c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
+
+ /*
+ * Reached for skipped passes too: with no fs error flagged, drop this
+ * pass from the superblock's required set.
+ */
+ if (!test_bit(BCH_FS_error, &c->flags))
+ bch2_clear_recovery_pass_required(c, c->curr_recovery_pass);
+
+ c->curr_recovery_pass++;
+ }
+
+ return ret;
+}
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
new file mode 100644
index 000000000000..99b464e127b8
--- /dev/null
+++ b/fs/bcachefs/recovery_passes.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RECOVERY_PASSES_H
+#define _BCACHEFS_RECOVERY_PASSES_H
+
+/* Recovery pass names, one per BCH_RECOVERY_PASSES() entry, NULL terminated */
+extern const char * const bch2_recovery_passes[];
+
+/* Convert pass bitmasks between in-memory and stable (superblock) numbering */
+u64 bch2_recovery_passes_to_stable(u64 v);
+u64 bch2_recovery_passes_from_stable(u64 v);
+
+/* Bitmask of all passes flagged PASS_FSCK */
+u64 bch2_fsck_recovery_passes(void);
+
+/*
+ * Request a pass that may have been skipped; the _persistent variant also
+ * records the request in the superblock first.
+ */
+int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
+
+int bch2_run_online_recovery_passes(struct bch_fs *);
+int bch2_run_recovery_passes(struct bch_fs *);
+
+#endif /* _BCACHEFS_RECOVERY_PASSES_H */
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_passes_types.h
index fa0c8efd2a1b..773aea9a0080 100644
--- a/fs/bcachefs/recovery_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_TYPES_H
-#define _BCACHEFS_RECOVERY_TYPES_H
+#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
+#define _BCACHEFS_RECOVERY_PASSES_TYPES_H
#define PASS_SILENT BIT(0)
#define PASS_FSCK BIT(1)
@@ -13,11 +13,12 @@
* must never change:
*/
#define BCH_RECOVERY_PASSES() \
+ x(scan_for_btree_nodes, 37, 0) \
+ x(check_topology, 4, 0) \
x(alloc_read, 0, PASS_ALWAYS) \
x(stripes_read, 1, PASS_ALWAYS) \
x(initialize_subvolumes, 2, 0) \
x(snapshots_read, 3, PASS_ALWAYS) \
- x(check_topology, 4, 0) \
x(check_allocations, 5, PASS_FSCK) \
x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \
x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
@@ -31,20 +32,23 @@
x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
x(bucket_gens_init, 17, 0) \
+ x(reconstruct_snapshots, 38, 0) \
x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
x(fs_upgrade_for_subvolumes, 22, 0) \
- x(resume_logged_ops, 23, PASS_ALWAYS) \
x(check_inodes, 24, PASS_FSCK) \
x(check_extents, 25, PASS_FSCK) \
x(check_indirect_extents, 26, PASS_FSCK) \
x(check_dirents, 27, PASS_FSCK) \
x(check_xattrs, 28, PASS_FSCK) \
x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
+ x(resume_logged_ops, 23, PASS_ALWAYS) \
x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
x(fix_reflink_p, 33, 0) \
x(set_fs_needs_rebalance, 34, 0) \
@@ -54,6 +58,7 @@ enum bch_recovery_pass {
#define x(n, id, when) BCH_RECOVERY_PASS_##n,
BCH_RECOVERY_PASSES()
#undef x
+ BCH_RECOVERY_PASS_NR
};
/* But we also need stable identifiers that can be used in the superblock */
@@ -63,4 +68,4 @@ enum bch_recovery_pass_stable {
#undef x
};
-#endif /* _BCACHEFS_RECOVERY_TYPES_H */
+#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index c47c66c2b394..ff7864731a07 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -185,8 +185,7 @@ not_found:
} else {
bkey_error_init(update);
update->k.p = p.k->p;
- update->k.p.offset = next_idx;
- update->k.size = next_idx - *idx;
+ update->k.size = p.k->size;
set_bkey_val_u64s(&update->k, 0);
}
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index cc2672c12031..678b9c20e251 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -6,12 +6,15 @@
#include "replicas.h"
#include "super-io.h"
+#include <linux/sort.h>
+
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
-static int bch2_memcmp(const void *l, const void *r, size_t size)
+static int bch2_memcmp(const void *l, const void *r, const void *priv)
{
+ size_t size = (size_t) priv;
return memcmp(l, r, size);
}
@@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
- eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
+ eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
+ bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
}
static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@@ -228,7 +232,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
verify_replicas_entry(search);
-#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
+#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size)
idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
entry_cmp, search);
#undef entry_cmp
@@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
{
unsigned i;
- sort_cmp_size(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- bch2_memcmp, NULL);
+ sort_r(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ bch2_memcmp, NULL,
+ (void *)(size_t)cpu_r->entry_size);
for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry_v1 *e =
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index b6bf0ebe7e84..5980ba2563fe 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -171,22 +171,6 @@ fsck_err:
return ERR_PTR(ret);
}
-static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
-{
- struct jset_entry *entry = *end;
- unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-
- memset(entry, 0, u64s * sizeof(u64));
- /*
- * The u64s field counts from the start of data, ignoring the shared
- * fields.
- */
- entry->u64s = cpu_to_le16(u64s - 1);
-
- *end = vstruct_next(*end);
- return entry;
-}
-
void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 441dcb1bf160..a98ef940b7a3 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -7,7 +7,7 @@
#include "bcachefs.h"
#include "darray.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "sb-downgrade.h"
#include "sb-errors.h"
#include "super-io.h"
@@ -45,7 +45,16 @@
BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \
x(rebalance_work, \
- BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \
+ x(subvolume_fs_parent, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
+ BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \
+ x(btree_subvolume_children, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \
+ BCH_FSCK_ERR_subvol_children_not_set) \
+ x(mi_btree_bitmap, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_btree_bitmap_not_marked)
#define DOWNGRADE_TABLE()
@@ -253,7 +262,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi
if (e < BCH_SB_ERR_MAX)
__set_bit(e, c->sb.errors_silent);
if (e < sizeof(ext->errors_silent) * 8)
- ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64));
+ __set_bit_le64(e, ext->errors_silent);
}
}
}
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index c08aacdfd073..4ca6e7b0d8aa 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -130,7 +130,7 @@
x(bucket_gens_nonzero_for_invalid_buckets, 122) \
x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \
x(need_discard_freespace_key_bad, 124) \
- x(backpointer_pos_wrong, 125) \
+ x(backpointer_bucket_offset_wrong, 125) \
x(backpointer_to_missing_device, 126) \
x(backpointer_to_missing_alloc, 127) \
x(backpointer_to_missing_ptr, 128) \
@@ -231,7 +231,7 @@
x(dirent_name_dot_or_dotdot, 223) \
x(dirent_name_has_slash, 224) \
x(dirent_d_type_wrong, 225) \
- x(dirent_d_parent_subvol_wrong, 226) \
+ x(inode_bi_parent_wrong, 226) \
x(dirent_in_missing_dir_inode, 227) \
x(dirent_in_non_dir_inode, 228) \
x(dirent_to_missing_inode, 229) \
@@ -250,7 +250,28 @@
x(hash_table_key_duplicate, 242) \
x(hash_table_key_wrong_offset, 243) \
x(unlinked_inode_not_on_deleted_list, 244) \
- x(reflink_p_front_pad_bad, 245)
+ x(reflink_p_front_pad_bad, 245) \
+ x(journal_entry_dup_same_device, 246) \
+ x(inode_bi_subvol_missing, 247) \
+ x(inode_bi_subvol_wrong, 248) \
+ x(inode_points_to_missing_dirent, 249) \
+ x(inode_points_to_wrong_dirent, 250) \
+ x(inode_bi_parent_nonzero, 251) \
+ x(dirent_to_missing_parent_subvol, 252) \
+ x(dirent_not_visible_in_parent_subvol, 253) \
+ x(subvol_fs_path_parent_wrong, 254) \
+ x(subvol_root_fs_path_parent_nonzero, 255) \
+ x(subvol_children_not_set, 256) \
+ x(subvol_children_bad, 257) \
+ x(subvol_loop, 258) \
+ x(subvol_unreachable, 259) \
+ x(btree_node_bkey_bad_u64s, 260) \
+ x(btree_node_topology_empty_interior_node, 261) \
+ x(btree_ptr_v2_min_key_bad, 262) \
+ x(btree_root_unreadable_and_scan_found_nothing, 263) \
+ x(snapshot_node_missing, 264) \
+ x(dup_backpointer_to_bad_csum_extent, 265) \
+ x(btree_bitmap_not_marked, 266)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index eff5ce18c69c..522a969345e5 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_cache.h"
#include "disk_groups.h"
#include "opts.h"
#include "replicas.h"
@@ -426,3 +427,55 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
+
+/*
+ * Per member "range has btree nodes" bitmap:
+ *
+ * This is so that if we ever have to run the btree node scan to repair we don't
+ * have to scan full devices:
+ */
+
+/*
+ * Returns true if, for every pointer in key @k, the btree_sectors(c)-long
+ * range starting at the pointer's offset is marked in the btree bitmap of
+ * the pointer's device; false as soon as one pointer's range is not.
+ */
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
+{
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr)
+ if (!bch2_dev_btree_bitmap_marked_sectors(bch_dev_bkey_exists(c, ptr->dev),
+ ptr->offset, btree_sectors(c)))
+ return false;
+ return true;
+}
+
+/*
+ * Mark sectors [start, start + sectors) of member @dev as containing btree
+ * nodes in that member's 64 bit btree_allocated_bitmap.
+ *
+ * Each bitmap bit covers 1 << btree_bitmap_shift sectors. If the range ends
+ * past what the bitmap can currently describe, the shift is increased and the
+ * existing bits are folded down to the coarser granularity first.
+ */
+static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
+				u64 start, unsigned sectors)
+{
+	struct bch_member *m = __bch2_members_v2_get_mut(mi, dev);
+	u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap);
+
+	u64 end = start + sectors;
+
+	/* Number of extra shift bits needed so that @end fits in 64 bitmap bits */
+	int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6);
+	if (resize > 0) {
+		u64 new_bitmap = 0;
+
+		/* Old bit i now falls inside new, coarser, bit i >> resize */
+		for (unsigned i = 0; i < 64; i++)
+			if (bitmap & BIT_ULL(i))
+				new_bitmap |= BIT_ULL(i >> resize);
+		bitmap = new_bitmap;
+		m->btree_bitmap_shift += resize;
+	}
+
+	/*
+	 * Start at the bit containing @start (not one derived from the range
+	 * length), and compare in 64 bits so bit << shift cannot wrap.
+	 */
+	for (unsigned bit = start >> m->btree_bitmap_shift;
+	     (u64) bit << m->btree_bitmap_shift < end;
+	     bit++)
+		bitmap |= BIT_ULL(bit);
+
+	m->btree_allocated_bitmap = cpu_to_le64(bitmap);
+}
+
+/*
+ * For every pointer in key @k, mark the btree_sectors(c)-long range starting
+ * at the pointer's offset in the pointer's device's btree bitmap, in the
+ * superblock members_v2 field. Caller must hold c->sb_lock; this function
+ * does not itself write the superblock.
+ */
+void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr)
+ __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
+}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index be0a94183271..b27c3e4467cf 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_SB_MEMBERS_H
#include "darray.h"
+#include "bkey_types.h"
extern char * const bch2_member_error_strs[];
@@ -220,6 +221,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
: 1,
.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
.valid = bch2_member_exists(mi),
+ .btree_bitmap_shift = mi->btree_bitmap_shift,
+ .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
};
}
@@ -228,4 +231,22 @@ void bch2_sb_members_from_cpu(struct bch_fs *);
void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
void bch2_dev_errors_reset(struct bch_dev *);
+/*
+ * Returns true if every bitmap bit covering [start, start + sectors) is set in
+ * @ca's cached btree_allocated_bitmap; false if any is clear or if the range
+ * ends beyond what the 64-bit bitmap covers at the current shift.
+ */
+static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors)
+{
+	u64 end = start + sectors;
+
+	/* 64ULL: the shifted limit must be computed in 64 bits, not int */
+	if (end > 64ULL << ca->mi.btree_bitmap_shift)
+		return false;
+
+	/*
+	 * Start from the bit containing @start (not @sectors, the length), and
+	 * widen before shifting so the comparison against @end cannot wrap.
+	 */
+	for (unsigned bit = start >> ca->mi.btree_bitmap_shift;
+	     (u64) bit << ca->mi.btree_bitmap_shift < end;
+	     bit++)
+		if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit)))
+			return false;
+	return true;
+}
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
+void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
+
#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index ac6ba04d5521..544322d5c251 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -8,6 +8,7 @@
#include "errcode.h"
#include "error.h"
#include "fs.h"
+#include "recovery_passes.h"
#include "snapshot.h"
#include <linux/random.h>
@@ -91,23 +92,29 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans,
/* Snapshot nodes: */
-static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
- struct snapshot_table *t;
+ while (id && id < ancestor) {
+ const struct snapshot_t *s = __snapshot_t(t, id);
+ id = s ? s->parent : 0;
+ }
+ return id == ancestor;
+}
+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+{
rcu_read_lock();
- t = rcu_dereference(c->snapshots);
-
- while (id && id < ancestor)
- id = __snapshot_t(t, id)->parent;
+ bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
rcu_read_unlock();
- return id == ancestor;
+ return ret;
}
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
const struct snapshot_t *s = __snapshot_t(t, id);
+ if (!s)
+ return 0;
if (s->skip[2] <= ancestor)
return s->skip[2];
@@ -118,27 +125,36 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances
return s->parent;
}
+/*
+ * Test bit (ancestor - id - 1) of @id's is_ancestor bitmap; false when @id has
+ * no entry in @t.
+ */
+static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
+{
+	const struct snapshot_t *node = __snapshot_t(t, id);
+
+	return node
+		? test_bit(ancestor - id - 1, node->is_ancestor)
+		: false;
+}
+
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
- struct snapshot_table *t;
bool ret;
- EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots);
-
rcu_read_lock();
- t = rcu_dereference(c->snapshots);
+ struct snapshot_table *t = rcu_dereference(c->snapshots);
+
+ if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
+ ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
+ goto out;
+ }
while (id && id < ancestor - IS_ANCESTOR_BITMAP)
id = get_ancestor_below(t, id, ancestor);
- if (id && id < ancestor) {
- ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);
-
- EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor));
- } else {
- ret = id == ancestor;
- }
+ ret = id && id < ancestor
+ ? test_ancestor_bitmap(t, id, ancestor)
+ : id == ancestor;
+ EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
+out:
rcu_read_unlock();
return ret;
@@ -147,36 +163,39 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
size_t idx = U32_MAX - id;
- size_t new_size;
struct snapshot_table *new, *old;
- new_size = max(16UL, roundup_pow_of_two(idx + 1));
+ size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
+ size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
- new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
+ new = kvzalloc(new_bytes, GFP_KERNEL);
if (!new)
return NULL;
+ new->nr = new_size;
+
old = rcu_dereference_protected(c->snapshots, true);
if (old)
- memcpy(new->s,
- rcu_dereference_protected(c->snapshots, true)->s,
- sizeof(new->s[0]) * c->snapshot_table_size);
+ memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
rcu_assign_pointer(c->snapshots, new);
- c->snapshot_table_size = new_size;
- kvfree_rcu_mightsleep(old);
+ kvfree_rcu(old, rcu);
- return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+ return &rcu_dereference_protected(c->snapshots,
+ lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}
static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
size_t idx = U32_MAX - id;
+ struct snapshot_table *table =
+ rcu_dereference_protected(c->snapshots,
+ lockdep_is_held(&c->snapshot_table_lock));
lockdep_assert_held(&c->snapshot_table_lock);
- if (likely(idx < c->snapshot_table_size))
- return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+ if (likely(table && idx < table->nr))
+ return &table->s[idx];
return __snapshot_t_mut(c, id);
}
@@ -547,7 +566,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
"snapshot tree points to missing subvolume:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
- fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
+ fsck_err_on(!bch2_snapshot_is_ancestor(c,
le32_to_cpu(subvol.snapshot),
root_id),
c, snapshot_tree_to_wrong_subvol,
@@ -563,6 +582,13 @@ static int check_snapshot_tree(struct btree_trans *trans,
u32 subvol_id;
ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
+ bch_err_fn(c, ret);
+
+ if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
+ ret = 0;
+ goto err;
+ }
+
if (ret)
goto err;
@@ -720,7 +746,6 @@ static int check_snapshot(struct btree_trans *trans,
u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
u32 real_depth;
struct printbuf buf = PRINTBUF;
- bool should_have_subvol;
u32 i, id;
int ret = 0;
@@ -766,7 +791,7 @@ static int check_snapshot(struct btree_trans *trans,
}
}
- should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
+ bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
!BCH_SNAPSHOT_DELETED(&s);
if (should_have_subvol) {
@@ -868,6 +893,154 @@ int bch2_check_snapshots(struct bch_fs *c)
return ret;
}
+/*
+ * Make sure snapshot node @id exists.
+ *
+ * Does nothing when bch2_snapshot_equiv() already returns nonzero for @id.
+ * Otherwise creates a new snapshot tree for it and inserts a snapshot key at
+ * POS(0, id) pointing at that tree, then runs bch2_mark_snapshot() and
+ * bch2_snapshot_set_equiv() on the new key.
+ *
+ * Returns 0 on success or the first error from any of those steps.
+ */
+static int check_snapshot_exists(struct btree_trans *trans, u32 id)
+{
+ struct bch_fs *c = trans->c;
+
+ if (bch2_snapshot_equiv(c, id))
+ return 0;
+
+ u32 tree_id;
+ int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
+ if (ret)
+ return ret;
+
+ struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
+ ret = PTR_ERR_OR_ZERO(snapshot);
+ if (ret)
+ return ret;
+
+ bkey_snapshot_init(&snapshot->k_i);
+ snapshot->k.p = POS(0, id);
+ snapshot->v.tree = cpu_to_le32(tree_id);
+ snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+
+ /* Stop at the first step that fails (?: chains on nonzero return) */
+ return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
+ bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
+ bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i));
+}
+
+/* Figure out which snapshot nodes belong in the same tree: */
+struct snapshot_tree_reconstruct {
+ /* btree currently being walked; selects how keys are grouped */
+ enum btree_id btree;
+ /* position of the most recently visited key */
+ struct bpos cur_pos;
+ /* snapshot IDs collected for the current group of keys */
+ snapshot_id_list cur_ids;
+ /* ID lists found so far; groups sharing an ID are merged into one list */
+ DARRAY(snapshot_id_list) trees;
+};
+
+/* Release every per-tree ID list, then the outer array and the working list */
+static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
+{
+ darray_for_each(r->trees, i)
+ darray_exit(i);
+ darray_exit(&r->trees);
+ darray_exit(&r->cur_ids);
+}
+
+/*
+ * Does @pos belong to the same group as the previously visited key? In the
+ * inodes btree keys are grouped by offset, in every other btree by inode.
+ */
+static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+	if (r->btree == BTREE_ID_inodes)
+		return r->cur_pos.offset == pos.offset;
+
+	return r->cur_pos.inode == pos.inode;
+}
+
+/* True if at least one ID in @l is also present in @r */
+static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
+{
+	darray_for_each(*l, id) {
+		if (snapshot_list_has_id(r, *id))
+			return true;
+	}
+
+	return false;
+}
+
+/* Print the IDs in @s to @out as decimals separated by single spaces */
+static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
+{
+	bool need_sep = false;
+
+	darray_for_each(*s, id) {
+		if (need_sep)
+			prt_char(out, ' ');
+		prt_printf(out, "%u", *id);
+		need_sep = true;
+	}
+}
+
+/*
+ * Finish the current group of snapshot IDs (r->cur_ids).
+ *
+ * If the group shares an ID with an already recorded tree it is merged into
+ * that tree; otherwise the list itself is moved into r->trees as a new tree.
+ * Either way r->cur_ids is left empty for the next group.
+ *
+ * Returns 0 on success or an error from growing one of the lists; on error
+ * r->cur_ids is left untouched so that snapshot_tree_reconstruct_exit() can
+ * still free it.
+ */
+static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
+{
+	if (r->cur_ids.nr) {
+		darray_for_each(r->trees, i)
+			if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
+				int ret = snapshot_list_merge(c, i, &r->cur_ids);
+				if (ret)
+					return ret;
+				goto out;
+			}
+
+		/*
+		 * Ownership of cur_ids' buffer moves into r->trees only if the push
+		 * succeeded; reinitialising cur_ids after a failed push would leak it.
+		 */
+		int ret = darray_push(&r->trees, r->cur_ids);
+		if (ret) {
+			bch_err(c, "error reallocating snapshot tree list (size %zu)", r->trees.size);
+			return ret;
+		}
+		darray_init(&r->cur_ids);
+	}
+out:
+	r->cur_ids.nr = 0;
+	return 0;
+}
+
+/*
+ * Account for one key at @pos: when it starts a new group (see
+ * same_snapshot()), flush the previous group first, then record the key's
+ * snapshot ID in the current group.
+ *
+ * Returns 0 on success or an error from flushing/growing the lists.
+ */
+static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+	if (!same_snapshot(r, pos)) {
+		/* Don't start a new group on top of one we failed to record */
+		int ret = snapshot_tree_reconstruct_next(c, r);
+		if (ret)
+			return ret;
+	}
+
+	r->cur_pos = pos;
+	return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
+}
+
+/*
+ * Recovery pass: rebuild missing snapshot nodes from the keys that reference
+ * them.
+ *
+ * Walks every btree that has snapshots, grouping the snapshot IDs seen into
+ * candidate trees. For each ID with no equivalence entry an fsck error is
+ * raised; if repair is chosen and the tree has a single node, the node is
+ * recreated via check_snapshot_exists(). Trees with more than one node cannot
+ * be reconstructed and fail with -BCH_ERR_fsck_repair_unimplemented.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int bch2_reconstruct_snapshots(struct bch_fs *c)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct printbuf buf = PRINTBUF;
+	struct snapshot_tree_reconstruct r = {};
+	int ret = 0;
+
+	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
+		if (btree_type_has_snapshots(btree)) {
+			r.btree = btree;
+
+			ret = for_each_btree_key(trans, iter, btree, POS_MIN,
+					BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({
+				get_snapshot_trees(c, &r, k.k->p);
+			}));
+			if (ret)
+				goto err;
+
+			/* Flush the last group of this btree; this can fail on allocation */
+			ret = snapshot_tree_reconstruct_next(c, &r);
+			if (ret)
+				goto err;
+		}
+	}
+
+	darray_for_each(r.trees, t) {
+		printbuf_reset(&buf);
+		snapshot_id_list_to_text(&buf, t);
+
+		darray_for_each(*t, id) {
+			if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
+					c, snapshot_node_missing,
+					"snapshot node %u from tree %s missing", *id, buf.buf)) {
+				if (t->nr > 1) {
+					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
+					ret = -BCH_ERR_fsck_repair_unimplemented;
+					goto err;
+				}
+
+				ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+						check_snapshot_exists(trans, *id));
+				if (ret)
+					goto err;
+			}
+		}
+	}
+fsck_err:
+err:
+	bch2_trans_put(trans);
+	snapshot_tree_reconstruct_exit(&r);
+	printbuf_exit(&buf);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
/*
* Mark a snapshot as deleted, for future cleanup:
*/
@@ -1678,6 +1851,20 @@ int bch2_snapshots_read(struct bch_fs *c)
POS_MIN, 0, k,
(set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
bch_err_fn(c, ret);
+
+ /*
+ * It's important that we check if we need to reconstruct snapshots
+ * before going RW, so we mark that pass as required in the superblock -
+ * otherwise, we could end up deleting keys with missing snapshot nodes
+ * instead
+ */
+ BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
+ test_bit(BCH_FS_may_go_rw, &c->flags));
+
+ if (bch2_err_matches(ret, EIO) ||
+ (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
+ ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
+
return ret;
}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 7c66ffc06385..b7d2fed37c4f 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -33,7 +33,11 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
{
- return &t->s[U32_MAX - id];
+ u32 idx = U32_MAX - id;
+
+ return likely(t && idx < t->nr)
+ ? &t->s[idx]
+ : NULL;
}
static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
@@ -44,7 +48,8 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
{
rcu_read_lock();
- id = snapshot_t(c, id)->tree;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ id = s ? s->tree : 0;
rcu_read_unlock();
return id;
@@ -52,7 +57,8 @@ static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
{
- return snapshot_t(c, id)->parent;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s ? s->parent : 0;
}
static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
@@ -66,19 +72,19 @@ static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
+/* Parent ID of snapshot @id, or 0 when @id has no snapshot table entry. */
static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
- u32 parent = snapshot_t(c, id)->parent;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ if (!s)
+ return 0;
- if (parent &&
- snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1)
+ u32 parent = s->parent;
+ /* Debug builds only: a node must sit exactly one level below its parent */
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ parent &&
+ s->depth != snapshot_t(c, parent)->depth + 1)
panic("id %u depth=%u parent %u depth=%u\n",
id, snapshot_t(c, id)->depth,
parent, snapshot_t(c, parent)->depth);
return parent;
-#else
- return snapshot_t(c, id)->parent;
-#endif
}
static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
@@ -116,7 +122,8 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
{
- return snapshot_t(c, id)->equiv;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s ? s->equiv : 0;
}
static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
@@ -133,38 +140,22 @@ static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
return id == bch2_snapshot_equiv(c, id);
}
-static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
+/*
+ * Returns 1 if snapshot @id has at least one child, 0 if it has none, or
+ * -BCH_ERR_invalid_snapshot_node when @id has no snapshot table entry.
+ */
+static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
{
- const struct snapshot_t *s;
- bool ret;
-
rcu_read_lock();
- s = snapshot_t(c, id);
- ret = s->children[0];
+ const struct snapshot_t *s = snapshot_t(c, id);
+ /*
+ * children[0] is a u32 snapshot ID, not a boolean: collapse it to 0/1 so
+ * that a large ID is not mistaken for a negative error code once it is
+ * stored in an int.
+ */
+ int ret = s ? s->children[0] != 0 : -BCH_ERR_invalid_snapshot_node;
rcu_read_unlock();
return ret;
}
-static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
-{
- return !bch2_snapshot_is_internal_node(c, id);
-}
-
-static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
+static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
{
- const struct snapshot_t *s;
- u32 parent = __bch2_snapshot_parent(c, id);
-
- if (!parent)
- return 0;
-
- s = snapshot_t(c, __bch2_snapshot_parent(c, id));
- if (id == s->children[0])
- return s->children[1];
- if (id == s->children[1])
- return s->children[0];
- return 0;
+ int ret = bch2_snapshot_is_internal_node(c, id);
+ if (ret < 0)
+ return ret;
+ return !ret;
}
static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
@@ -218,15 +209,34 @@ static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list
static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
{
- int ret;
-
BUG_ON(snapshot_list_has_id(s, id));
- ret = darray_push(s, id);
+ int ret = darray_push(s, id);
if (ret)
bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
return ret;
}
+/*
+ * Append @id to @s unless it is already present. Returns 0 on success (or if
+ * nothing needed adding), otherwise the darray_push() error, which is logged.
+ */
+static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+	if (snapshot_list_has_id(s, id))
+		return 0;
+
+	int ret = darray_push(s, id);
+	if (ret)
+		bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+	return ret;
+}
+
+/*
+ * Add every ID of @src that @dst does not already contain to @dst.
+ * Returns 0 on success, or the first error from snapshot_list_add_nodup();
+ * IDs added before the failure stay in @dst.
+ */
+static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src)
+{
+ darray_for_each(*src, i) {
+ int ret = snapshot_list_add_nodup(c, dst, *i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot *s);
int bch2_snapshot_get_subvol(struct btree_trans *, u32,
@@ -238,6 +248,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32,
int bch2_check_snapshot_trees(struct bch_fs *);
int bch2_check_snapshots(struct bch_fs *);
+int bch2_reconstruct_snapshots(struct bch_fs *);
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
void bch2_delete_dead_snapshots_work(struct work_struct *);
@@ -249,7 +260,7 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
struct bpos pos)
{
if (!btree_type_has_snapshots(id) ||
- bch2_snapshot_is_leaf(trans->c, pos.snapshot))
+ bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0)
return 0;
return __bch2_key_has_snapshot_overwrites(trans, id, pos);
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index fcaa5a888744..3976f80721bf 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -259,7 +259,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
}
static __always_inline
-int bch2_hash_set_snapshot(struct btree_trans *trans,
+int bch2_hash_set_in_snapshot(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
@@ -328,17 +328,12 @@ int bch2_hash_set(struct btree_trans *trans,
struct bkey_i *insert,
bch_str_hash_flags_t str_hash_flags)
{
- u32 snapshot;
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return ret;
-
insert->k.p.inode = inum.inum;
- return bch2_hash_set_snapshot(trans, desc, info, inum,
- snapshot, insert, str_hash_flags, 0);
+ u32 snapshot;
+ return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+ bch2_hash_set_in_snapshot(trans, desc, info, inum,
+ snapshot, insert, str_hash_flags, 0);
}
static __always_inline
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 7c67c28d3ef8..88a79c823276 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -13,13 +13,26 @@
static int bch2_subvolume_delete(struct btree_trans *, u32);
+/*
+ * Position of @k's entry in the subvolume_children btree:
+ * (fs_path_parent, subvolume offset). POS_MIN stands for "no entry", i.e. @k
+ * is not a subvolume key or has no fs_path_parent.
+ */
+static struct bpos subvolume_children_pos(struct bkey_s_c k)
+{
+	if (k.k->type == KEY_TYPE_subvolume) {
+		struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k);
+		u32 parent = le32_to_cpu(subvol.v->fs_path_parent);
+
+		if (parent)
+			return POS(parent, subvol.k->p.offset);
+	}
+
+	return POS_MIN;
+}
+
static int check_subvol(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bkey_s_c_subvolume subvol;
+ struct btree_iter subvol_children_iter = {};
struct bch_snapshot snapshot;
+ struct printbuf buf = PRINTBUF;
unsigned snapid;
int ret = 0;
@@ -42,6 +55,72 @@ static int check_subvol(struct btree_trans *trans,
return ret ?: -BCH_ERR_transaction_restart_nested;
}
+ if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
+ subvol.v->fs_path_parent,
+ c, subvol_root_fs_path_parent_nonzero,
+ "root subvolume has nonzero fs_path_parent\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ struct bkey_i_subvolume *n =
+ bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ n->v.fs_path_parent = 0;
+ }
+
+ if (subvol.v->fs_path_parent) {
+ struct bpos pos = subvolume_children_pos(k);
+
+ struct bkey_s_c subvol_children_k =
+ bch2_bkey_get_iter(trans, &subvol_children_iter,
+ BTREE_ID_subvolume_children, pos, 0);
+ ret = bkey_err(subvol_children_k);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set,
+ c, subvol_children_not_set,
+ "subvolume not set in subvolume_children btree at %llu:%llu\n%s",
+ pos.inode, pos.offset,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true);
+ if (ret)
+ goto err;
+ }
+ }
+
+ struct bch_inode_unpacked inode;
+ struct btree_iter inode_iter = {};
+ ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode,
+ (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
+ 0);
+ bch2_trans_iter_exit(trans, &inode_iter);
+
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (fsck_err_on(ret, c, subvol_to_missing_root,
+ "subvolume %llu points to missing subvolume root %llu:%u",
+ k.k->p.offset, le64_to_cpu(subvol.v->inode),
+ le32_to_cpu(subvol.v->snapshot))) {
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+ return ret ?: -BCH_ERR_transaction_restart_nested;
+ }
+
+ if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
+ c, subvol_root_wrong_bi_subvol,
+ "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
+ inode.bi_inum, inode_iter.k.p.snapshot,
+ inode.bi_subvol, subvol.k->p.offset)) {
+ inode.bi_subvol = subvol.k->p.offset;
+ ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
+ if (ret)
+ goto err;
+ }
+
if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
u32 snapshot_tree;
@@ -72,8 +151,10 @@ static int check_subvol(struct btree_trans *trans,
SET_BCH_SUBVOLUME_SNAP(&s->v, true);
}
}
-
+err:
fsck_err:
+ bch2_trans_iter_exit(trans, &subvol_children_iter);
+ printbuf_exit(&buf);
return ret;
}
@@ -88,6 +169,42 @@ int bch2_check_subvols(struct bch_fs *c)
return ret;
}
+/*
+ * Check one entry of the subvolume_children btree: the subvolume named by the
+ * key's offset must exist and its fs_path_parent must equal the key's inode
+ * field. A bad entry raises an fsck error and is deleted when repair is chosen.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int check_subvol_child(struct btree_trans *trans,
+			      struct btree_iter *child_iter,
+			      struct bkey_s_c child_k)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_subvolume s;
+	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset),
+					  0, subvolume, &s);
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		return ret;
+
+	/*
+	 * A missing subvolume is an inconsistency to report, not a failure of
+	 * this check: remember it and clear ret, so the ENOENT cannot escape as
+	 * our return value if the repair below ends up being skipped.
+	 */
+	bool missing = ret != 0;
+	ret = 0;
+
+	/* @s is only valid when the lookup succeeded; || short-circuits otherwise */
+	if (fsck_err_on(missing ||
+			le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
+			c, subvol_children_bad,
+			"incorrect entry in subvolume_children btree %llu:%llu",
+			child_k.k->p.inode, child_k.k->p.offset)) {
+		ret = bch2_btree_delete_at(trans, child_iter, 0);
+		if (ret)
+			goto err;
+	}
+err:
+fsck_err:
+	return ret;
+}
+
+/*
+ * Recovery pass: run check_subvol_child() over every key in the
+ * subvolume_children btree, committing repairs as it goes.
+ *
+ * Returns 0 on success or the error that stopped the walk.
+ */
+int bch2_check_subvol_children(struct bch_fs *c)
+{
+	int ret = bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter,
+				BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k,
+				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+			check_subvol_child(trans, &iter, k)));
+	bch_err_fn(c, ret);
+	/* Propagate the failure instead of reporting the pass as successful */
+	return ret;
+}
+
/* Subvolumes: */
int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
@@ -112,8 +229,50 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
le64_to_cpu(s.v->inode),
le32_to_cpu(s.v->snapshot));
- if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent))
- prt_printf(out, " parent %u", le32_to_cpu(s.v->parent));
+ if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) {
+ prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
+ prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
+ }
+}
+
+/*
+ * Set or clear the subvolume_children bit at @pos. POS_MIN means "no entry"
+ * and is a no-op.
+ */
+static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
+{
+	if (bpos_eq(pos, POS_MIN))
+		return 0;
+
+	return bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set);
+}
+
+/*
+ * Subvolume key trigger: in transactional context, keep the
+ * subvolume_children btree in step with the key's fs_path_parent by clearing
+ * the entry derived from @old and setting the one derived from @new whenever
+ * the two positions differ. Other trigger contexts are a no-op.
+ */
+int bch2_subvolume_trigger(struct btree_trans *trans,
+			   enum btree_id btree_id, unsigned level,
+			   struct bkey_s_c old, struct bkey_s new,
+			   unsigned flags)
+{
+	if (!(flags & BTREE_TRIGGER_TRANSACTIONAL))
+		return 0;
+
+	struct bpos old_pos = subvolume_children_pos(old);
+	struct bpos new_pos = subvolume_children_pos(new.s_c);
+
+	if (bpos_eq(old_pos, new_pos))
+		return 0;
+
+	return subvolume_children_mod(trans, old_pos, false) ?:
+		subvolume_children_mod(trans, new_pos, true);
+}
+
+/*
+ * Returns -BCH_ERR_ENOTEMPTY_subvol_not_empty if the subvolume_children btree
+ * has a key whose inode field is @subvol at or after POS(subvol, 0), 0 if it
+ * does not, or the lookup error.
+ */
+int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
+{
+	struct btree_iter iter;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
+	struct bkey_s_c k = bch2_btree_iter_peek(&iter);
+
+	/*
+	 * Evaluate the key before tearing the iterator down.
+	 * NOTE(review): assumes k may point into state owned by the iterator -
+	 * confirm.
+	 */
+	int ret = bkey_err(k) ?:
+		(k.k && k.k->p.inode == subvol
+		 ? -BCH_ERR_ENOTEMPTY_subvol_not_empty
+		 : 0);
+
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
static __always_inline int
@@ -197,8 +356,8 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_subvolume)
return 0;
- if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) &&
- le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent)
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) &&
+ le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent)
return 0;
s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
@@ -206,7 +365,7 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
if (ret)
return ret;
- s->v.parent = cpu_to_le32(new_parent);
+ s->v.creation_parent = cpu_to_le32(new_parent);
return 0;
}
@@ -229,7 +388,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_subvolume_reparent(trans, &iter, k,
- subvolid_to_delete, le32_to_cpu(s.parent)));
+ subvolid_to_delete, le32_to_cpu(s.creation_parent)));
}
/*
@@ -360,6 +519,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
}
int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
+ u32 parent_subvolid,
u32 src_subvolid,
u32 *new_subvolid,
u32 *new_snapshotid,
@@ -416,12 +576,13 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
if (ret)
goto err;
- new_subvol->v.flags = 0;
- new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
- new_subvol->v.inode = cpu_to_le64(inode);
- new_subvol->v.parent = cpu_to_le32(src_subvolid);
- new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
- new_subvol->v.otime.hi = 0;
+ new_subvol->v.flags = 0;
+ new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
+ new_subvol->v.inode = cpu_to_le64(inode);
+ new_subvol->v.creation_parent = cpu_to_le32(src_subvolid);
+ new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid);
+ new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
+ new_subvol->v.otime.hi = 0;
SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
@@ -434,6 +595,78 @@ err:
return ret;
}
+/*
+ * Insert the three keys that describe the root subvolume: snapshot tree 1,
+ * the root snapshot node (ID U32_MAX) and the root subvolume itself
+ * (BCACHEFS_ROOT_SUBVOL, rooted at inode BCACHEFS_ROOT_INO).
+ *
+ * Returns 0 on success or the first insert error, which is also logged.
+ */
+int bch2_initialize_subvolumes(struct bch_fs *c)
+{
+ struct bkey_i_snapshot_tree root_tree;
+ struct bkey_i_snapshot root_snapshot;
+ struct bkey_i_subvolume root_volume;
+ int ret;
+
+ /* Snapshot tree 1: master subvolume 1, root snapshot U32_MAX */
+ bkey_snapshot_tree_init(&root_tree.k_i);
+ root_tree.k.p.offset = 1;
+ root_tree.v.master_subvol = cpu_to_le32(1);
+ root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
+
+ /* Root snapshot node: no parent, owned by the root subvolume, in tree 1 */
+ bkey_snapshot_init(&root_snapshot.k_i);
+ root_snapshot.k.p.offset = U32_MAX;
+ root_snapshot.v.flags = 0;
+ root_snapshot.v.parent = 0;
+ root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
+ root_snapshot.v.tree = cpu_to_le32(1);
+ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+ /* Root subvolume: points at the root snapshot and the root inode */
+ bkey_subvolume_init(&root_volume.k_i);
+ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_volume.v.flags = 0;
+ root_volume.v.snapshot = cpu_to_le32(U32_MAX);
+ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
+
+ /* Stop at the first insert that fails */
+ ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/*
+ * Transaction body for bch2_fs_upgrade_for_subvolumes(): look up the root
+ * inode at SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), set its bi_subvol to
+ * BCACHEFS_ROOT_SUBVOL and write it back.
+ *
+ * Returns 0 on success, -BCH_ERR_ENOENT_inode if the key found is not an
+ * inode, or a lookup/write error.
+ */
+static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!bkey_is_inode(k.k)) {
+ bch_err(trans->c, "root inode not found");
+ ret = -BCH_ERR_ENOENT_inode;
+ goto err;
+ }
+
+ /* An unpack failure on the root inode is treated as fatal (BUG) */
+ ret = bch2_inode_unpack(k, &inode);
+ BUG_ON(ret);
+
+ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ ret = bch2_inode_write(trans, &iter, &inode);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/*
+ * Set bi_subvol on the root inode.
+ *
+ * Runs __bch2_fs_upgrade_for_subvolumes() in a transaction committed with
+ * BCH_TRANS_COMMIT_lazy_rw; any error is logged and returned.
+ */
+int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
+{
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+ __bch2_fs_upgrade_for_subvolumes(trans));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
int bch2_fs_subvolumes_init(struct bch_fs *c)
{
INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index a6f56f66e27c..d2015d549bd2 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -8,17 +8,22 @@
enum bkey_invalid_flags;
int bch2_check_subvols(struct bch_fs *);
+int bch2_check_subvol_children(struct bch_fs *);
int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
.key_invalid = bch2_subvolume_invalid, \
.val_to_text = bch2_subvolume_to_text, \
+ .trigger = bch2_subvolume_trigger, \
.min_val_size = 16, \
})
+int bch2_subvol_has_children(struct btree_trans *, u32);
int bch2_subvolume_get(struct btree_trans *, unsigned,
bool, int, struct bch_subvolume *);
int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
@@ -30,8 +35,10 @@ int bch2_delete_dead_snapshots(struct bch_fs *);
void bch2_delete_dead_snapshots_async(struct bch_fs *);
int bch2_subvolume_unlink(struct btree_trans *, u32);
-int bch2_subvolume_create(struct btree_trans *, u64, u32,
- u32 *, u32 *, bool);
+int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
+
+int bch2_initialize_subvolumes(struct bch_fs *);
+int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);
int bch2_fs_subvolumes_init(struct bch_fs *);
diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h
index af79134b07d6..e029df7ba89f 100644
--- a/fs/bcachefs/subvolume_format.h
+++ b/fs/bcachefs/subvolume_format.h
@@ -19,8 +19,8 @@ struct bch_subvolume {
* This is _not_ necessarily the subvolume of the directory containing
* this subvolume:
*/
- __le32 parent;
- __le32 pad;
+ __le32 creation_parent;
+ __le32 fs_path_parent;
bch_le128 otime;
};
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index ae644adfc391..9b10c8947828 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -20,6 +20,8 @@ struct snapshot_t {
};
struct snapshot_table {
+ struct rcu_head rcu;
+ size_t nr;
#ifndef RUST_BINDGEN
DECLARE_FLEX_ARRAY(struct snapshot_t, s);
#else
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 36988add581f..08ea3dbbbe97 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -8,7 +8,7 @@
#include "journal.h"
#include "journal_sb.h"
#include "journal_seq_blacklist.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "quota.h"
#include "sb-clean.h"
@@ -142,8 +142,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb,
void bch2_free_super(struct bch_sb_handle *sb)
{
kfree(sb->bio);
- if (!IS_ERR_OR_NULL(sb->bdev_handle))
- bdev_release(sb->bdev_handle);
+ if (!IS_ERR_OR_NULL(sb->s_bdev_file))
+ bdev_fput(sb->s_bdev_file);
kfree(sb->holder);
kfree(sb->sb_name);
@@ -470,6 +470,14 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
return ret;
}
+ if (rw == WRITE &&
+ bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
+ prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
+ le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
+ le64_to_cpu(sb->seq));
+ return -BCH_ERR_invalid_sb_members_missing;
+ }
+
return 0;
}
@@ -519,9 +527,11 @@ static void bch2_sb_update(struct bch_fs *c)
memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
- if (ext)
+ if (ext) {
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
sizeof(c->sb.errors_silent) * 8);
+ c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
+ }
for_each_member_device(c, ca) {
struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
@@ -690,8 +700,11 @@ retry:
return -ENOMEM;
sb->sb_name = kstrdup(path, GFP_KERNEL);
- if (!sb->sb_name)
- return -ENOMEM;
+ if (!sb->sb_name) {
+ ret = -ENOMEM;
+ prt_printf(&err, "error allocating memory for sb_name");
+ goto err;
+ }
#ifndef __KERNEL__
if (opt_get(*opts, direct_io) == false)
@@ -704,22 +717,23 @@ retry:
if (!opt_get(*opts, nochanges))
sb->mode |= BLK_OPEN_WRITE;
- sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (IS_ERR(sb->bdev_handle) &&
- PTR_ERR(sb->bdev_handle) == -EACCES &&
+ sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (IS_ERR(sb->s_bdev_file) &&
+ PTR_ERR(sb->s_bdev_file) == -EACCES &&
opt_get(*opts, read_only)) {
sb->mode &= ~BLK_OPEN_WRITE;
- sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (!IS_ERR(sb->bdev_handle))
+ sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (!IS_ERR(sb->s_bdev_file))
opt_set(*opts, nochanges, true);
}
- if (IS_ERR(sb->bdev_handle)) {
- ret = PTR_ERR(sb->bdev_handle);
+ if (IS_ERR(sb->s_bdev_file)) {
+ ret = PTR_ERR(sb->s_bdev_file);
+ prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret));
goto err;
}
- sb->bdev = sb->bdev_handle->bdev;
+ sb->bdev = file_bdev(sb->s_bdev_file);
ret = bch2_sb_realloc(sb, 0);
if (ret) {
@@ -743,9 +757,9 @@ retry:
prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
- printk(KERN_INFO "%s", err2.buf);
+ bch2_print_opts(opts, KERN_INFO "%s", err2.buf);
else
- printk(KERN_ERR "%s", err2.buf);
+ bch2_print_opts(opts, KERN_ERR "%s", err2.buf);
printbuf_exit(&err2);
printbuf_reset(&err);
@@ -803,21 +817,20 @@ got_super:
goto err;
}
- ret = 0;
sb->have_layout = true;
ret = bch2_sb_validate(sb, &err, READ);
if (ret) {
- printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
- path, err.buf);
+ bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
+ path, err.buf);
goto err_no_print;
}
out:
printbuf_exit(&err);
return ret;
err:
- printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
- path, err.buf);
+ bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
+ path, err.buf);
err_no_print:
bch2_free_super(sb);
goto out;
@@ -977,7 +990,7 @@ int bch2_write_super(struct bch_fs *c)
prt_str(&buf, " > ");
bch2_version_to_text(&buf, bcachefs_metadata_version_current);
prt_str(&buf, ")");
- bch2_fs_fatal_error(c, "%s", buf.buf);
+ bch2_fs_fatal_error(c, ": %s", buf.buf);
printbuf_exit(&buf);
return -BCH_ERR_sb_not_downgraded;
}
@@ -997,7 +1010,7 @@ int bch2_write_super(struct bch_fs *c)
if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
bch2_fs_fatal_error(c,
- "Superblock write was silently dropped! (seq %llu expected %llu)",
+ ": Superblock write was silently dropped! (seq %llu expected %llu)",
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
percpu_ref_put(&ca->io_ref);
@@ -1007,7 +1020,7 @@ int bch2_write_super(struct bch_fs *c)
if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
bch2_fs_fatal_error(c,
- "Superblock modified by another process (seq %llu expected %llu)",
+ ": Superblock modified by another process (seq %llu expected %llu)",
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
percpu_ref_put(&ca->io_ref);
@@ -1058,7 +1071,7 @@ int bch2_write_super(struct bch_fs *c)
!can_mount_with_written ||
(can_mount_without_written &&
!can_mount_with_written), c,
- "Unable to write superblock to sufficient devices (from %ps)",
+ ": Unable to write superblock to sufficient devices (from %ps)",
(void *) _RET_IP_))
ret = -1;
out:
@@ -1154,6 +1167,11 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
kfree(errors_silent);
}
+
+ prt_printf(out, "Btrees with missing data:");
+ prt_tab(out);
+ prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data));
+ prt_newline(out);
}
static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 6b23e11825e6..8daf80a38d60 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -15,6 +15,7 @@
#include "btree_gc.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
+#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_write_buffer.h"
@@ -56,6 +57,7 @@
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
+#include "thread_with_file.h"
#include "trace.h"
#include <linux/backing-dev.h>
@@ -86,26 +88,38 @@ const char * const bch2_fs_flag_strs[] = {
NULL
};
-void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+__printf(2, 0)
+static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
{
- struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
+#ifdef __KERNEL__
+ if (unlikely(stdio)) {
+ if (fmt[0] == KERN_SOH[0])
+ fmt += 2;
+
+ bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
+ return;
+ }
+#endif
+ vprintk(fmt, args);
+}
+
+void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
+{
+ struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
va_list args;
va_start(args, fmt);
- if (likely(!stdio)) {
- vprintk(fmt, args);
- } else {
- unsigned long flags;
-
- if (fmt[0] == KERN_SOH[0])
- fmt += 2;
+ bch2_print_maybe_redirect(stdio, fmt, args);
+ va_end(args);
+}
- spin_lock_irqsave(&stdio->output_lock, flags);
- prt_vprintf(&stdio->output_buf, fmt, args);
- spin_unlock_irqrestore(&stdio->output_lock, flags);
+void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+{
+ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
- wake_up(&stdio->output_wait);
- }
+ va_list args;
+ va_start(args, fmt);
+ bch2_print_maybe_redirect(stdio, fmt, args);
va_end(args);
}
@@ -274,8 +288,13 @@ static void __bch2_fs_read_only(struct bch_fs *c)
if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
!test_bit(BCH_FS_emergency_ro, &c->flags))
set_bit(BCH_FS_clean_shutdown, &c->flags);
+
bch2_fs_journal_stop(&c->journal);
+ bch_info(c, "%sshutdown complete, journal seq %llu",
+ test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
+ c->journal.seq_ondisk);
+
/*
* After stopping journal:
*/
@@ -352,7 +371,7 @@ void bch2_fs_read_only(struct bch_fs *c)
!test_bit(BCH_FS_emergency_ro, &c->flags) &&
test_bit(BCH_FS_started, &c->flags) &&
test_bit(BCH_FS_clean_shutdown, &c->flags) &&
- !c->opts.norecovery) {
+ c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_read(&c->btree_cache.dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
@@ -497,7 +516,8 @@ err:
int bch2_fs_read_write(struct bch_fs *c)
{
- if (c->opts.norecovery)
+ if (c->opts.recovery_pass_last &&
+ c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
return -BCH_ERR_erofs_norecovery;
if (c->opts.nochanges)
@@ -522,6 +542,7 @@ static void __bch2_fs_free(struct bch_fs *c)
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
@@ -546,6 +567,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
bch2_journal_keys_put_initial(c);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
BUG_ON(atomic_read(&c->journal_keys.ref));
bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
@@ -576,7 +598,7 @@ static void __bch2_fs_free(struct bch_fs *c)
destroy_workqueue(c->btree_update_wq);
bch2_free_super(&c->disk_sb);
- kvpfree(c, sizeof(*c));
+ kvfree(c);
module_put(THIS_MODULE);
}
@@ -715,7 +737,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
unsigned i, iter_size;
int ret = 0;
- c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
+ c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
if (!c) {
c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
goto out;
@@ -818,13 +840,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto err;
pr_uuid(&name, c->sb.user_uuid.b);
- strscpy(c->name, name.buf, sizeof(c->name));
- printbuf_exit(&name);
-
ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
if (ret)
goto err;
+ strscpy(c->name, name.buf, sizeof(c->name));
+ printbuf_exit(&name);
+
/* Compat: */
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
@@ -862,13 +884,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
- WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
!(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
- WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->io_complete_wq = alloc_workqueue("bcachefs_io",
- WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
!(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
WQ_FREEZABLE, 0)) ||
#ifndef BCH_WRITE_REF_DEBUG
@@ -882,8 +904,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
- mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
- c->opts.btree_node_size) ||
+ mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
+ c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
sizeof(u64), GFP_KERNEL))) {
@@ -1002,8 +1024,16 @@ int bch2_fs_start(struct bch_fs *c)
for_each_online_member(c, ca)
bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
+ struct bch_sb_field_ext *ext =
+ bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
mutex_unlock(&c->sb_lock);
+ if (!ext) {
+ bch_err(c, "insufficient space in superblock for sb_field_ext");
+ ret = -BCH_ERR_ENOSPC_sb;
+ goto err;
+ }
+
for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
@@ -1061,7 +1091,8 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
}
static int bch2_dev_in_fs(struct bch_sb_handle *fs,
- struct bch_sb_handle *sb)
+ struct bch_sb_handle *sb,
+ struct bch_opts *opts)
{
if (fs == sb)
return 0;
@@ -1102,11 +1133,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));;
prt_newline(&buf);
- prt_printf(&buf, "Not using older sb");
+ if (!opts->no_splitbrain_check)
+ prt_printf(&buf, "Not using older sb");
pr_err("%s", buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_device_splitbrain;
+
+ if (!opts->no_splitbrain_check)
+ return -BCH_ERR_device_splitbrain;
}
struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
@@ -1124,17 +1158,22 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
prt_newline(&buf);
prt_bdevname(&buf, fs->bdev);
- prt_str(&buf, "believes seq of ");
+ prt_str(&buf, " believes seq of ");
prt_bdevname(&buf, sb->bdev);
prt_printf(&buf, " to be %llu, but ", seq_from_fs);
prt_bdevname(&buf, sb->bdev);
prt_printf(&buf, " has %llu\n", seq_from_member);
- prt_str(&buf, "Not using ");
- prt_bdevname(&buf, sb->bdev);
+
+ if (!opts->no_splitbrain_check) {
+ prt_str(&buf, "Not using ");
+ prt_bdevname(&buf, sb->bdev);
+ }
pr_err("%s", buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_device_splitbrain;
+
+ if (!opts->no_splitbrain_check)
+ return -BCH_ERR_device_splitbrain;
}
return 0;
@@ -1168,8 +1207,8 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_dev_buckets_free(ca);
free_page((unsigned long) ca->sb_read_scratch);
- bch2_time_stats_exit(&ca->io_latency[WRITE]);
- bch2_time_stats_exit(&ca->io_latency[READ]);
+ bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
+ bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
percpu_ref_exit(&ca->io_ref);
percpu_ref_exit(&ca->ref);
@@ -1260,8 +1299,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
- bch2_time_stats_init(&ca->io_latency[READ]);
- bch2_time_stats_init(&ca->io_latency[WRITE]);
+ bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
+ bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);
ca->mi = bch2_mi_to_cpu(member);
@@ -1597,27 +1636,27 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
__bch2_dev_read_only(c, ca);
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
- bch_err_msg(ca, ret, "dropping data");
+ bch_err_msg(ca, ret, "bch2_dev_data_drop()");
if (ret)
goto err;
ret = bch2_dev_remove_alloc(c, ca);
- bch_err_msg(ca, ret, "deleting alloc info");
+ bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
if (ret)
goto err;
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
- bch_err_msg(ca, ret, "flushing journal");
+ bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
if (ret)
goto err;
ret = bch2_journal_flush(&c->journal);
- bch_err(ca, "journal error");
+ bch_err_msg(ca, ret, "bch2_journal_flush()");
if (ret)
goto err;
ret = bch2_replicas_gc2(c);
- bch_err_msg(ca, ret, "in replicas_gc2()");
+ bch_err_msg(ca, ret, "bch2_replicas_gc2()");
if (ret)
goto err;
@@ -1835,7 +1874,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
dev_idx = sb.sb->dev_idx;
- ret = bch2_dev_in_fs(&c->disk_sb, &sb);
+ ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
bch_err_msg(c, ret, "bringing %s online", path);
if (ret)
goto err;
@@ -2023,7 +2062,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
best = sb;
darray_for_each_reverse(sbs, sb) {
- ret = bch2_dev_in_fs(best, sb);
+ ret = bch2_dev_in_fs(best, sb, &opts);
if (ret == -BCH_ERR_device_has_been_removed ||
ret == -BCH_ERR_device_splitbrain) {
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 0e5a14fc8e7f..11bcef170c2c 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -4,7 +4,7 @@
struct bch_sb_handle {
struct bch_sb *sb;
- struct bdev_handle *bdev_handle;
+ struct file *s_bdev_file;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
@@ -37,6 +37,8 @@ struct bch_member_cpu {
u8 durability;
u8 freespace_initialized;
u8 valid;
+ u8 btree_bitmap_shift;
+ u64 btree_allocated_bitmap;
};
#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index cee80c47feea..5be92fe3f4ea 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -17,7 +17,6 @@
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_update.h"
-#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
@@ -26,6 +25,7 @@
#include "ec.h"
#include "inode.h"
#include "journal.h"
+#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "movinggc.h"
@@ -139,6 +139,7 @@ do { \
write_attribute(trigger_gc);
write_attribute(trigger_discards);
write_attribute(trigger_invalidates);
+write_attribute(trigger_journal_flush);
write_attribute(prune_cache);
write_attribute(btree_wakeup);
rw_attribute(btree_gc_periodic);
@@ -166,7 +167,6 @@ read_attribute(btree_write_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(journal_debug);
-read_attribute(btree_updates);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(stripes_heap);
@@ -415,9 +415,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_journal_debug)
bch2_journal_debug_to_text(out, &c->journal);
- if (attr == &sysfs_btree_updates)
- bch2_btree_updates_to_text(out, c);
-
if (attr == &sysfs_btree_cache)
bch2_btree_cache_to_text(out, c);
@@ -505,7 +502,7 @@ STORE(bch2_fs)
/* Debugging: */
- if (!test_bit(BCH_FS_rw, &c->flags))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
return -EROFS;
if (attr == &sysfs_prune_cache) {
@@ -538,6 +535,11 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_invalidates)
bch2_do_invalidates(c);
+ if (attr == &sysfs_trigger_journal_flush) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_meta(&c->journal);
+ }
+
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -558,6 +560,7 @@ STORE(bch2_fs)
size = ret;
}
#endif
+ bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
return size;
}
SYSFS_OPS(bch2_fs);
@@ -639,7 +642,6 @@ SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
&sysfs_flags,
&sysfs_journal_debug,
- &sysfs_btree_updates,
&sysfs_btree_cache,
&sysfs_btree_key_cache,
&sysfs_new_stripes,
@@ -657,6 +659,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_gc,
&sysfs_trigger_discards,
&sysfs_trigger_invalidates,
+ &sysfs_trigger_journal_flush,
&sysfs_prune_cache,
&sysfs_btree_wakeup,
@@ -930,10 +933,10 @@ SHOW(bch2_dev)
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
if (attr == &sysfs_io_latency_stats_read)
- bch2_time_stats_to_text(out, &ca->io_latency[READ]);
+ bch2_time_stats_to_text(out, &ca->io_latency[READ].stats);
if (attr == &sysfs_io_latency_stats_write)
- bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
+ bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats);
sysfs_printf(congested, "%u%%",
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index b3fe9fc57747..bfec656f94c0 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
index 9220d7de10db..940db15d6a93 100644
--- a/fs/bcachefs/thread_with_file.c
+++ b/fs/bcachefs/thread_with_file.c
@@ -2,7 +2,6 @@
#ifndef NO_BCACHEFS_FS
#include "bcachefs.h"
-#include "printbuf.h"
#include "thread_with_file.h"
#include <linux/anon_inodes.h>
@@ -10,6 +9,7 @@
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/poll.h>
+#include <linux/sched/sysctl.h>
void bch2_thread_with_file_exit(struct thread_with_file *thr)
{
@@ -65,68 +65,82 @@ err:
return ret;
}
-static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr)
+/* stdio_redirect */
+
+static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
{
- return thr->stdio.output_buf.pos ||
- thr->output2.nr ||
- thr->thr.done;
+ return stdio->input.buf.nr || stdio->done;
}
-static ssize_t thread_with_stdio_read(struct file *file, char __user *buf,
- size_t len, loff_t *ppos)
+static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
- size_t copied = 0, b;
- int ret = 0;
+ return stdio->output.buf.nr || stdio->done;
+}
- if ((file->f_flags & O_NONBLOCK) &&
- !thread_with_stdio_has_output(thr))
- return -EAGAIN;
+#define STDIO_REDIRECT_BUFSIZE 4096
- ret = wait_event_interruptible(thr->stdio.output_wait,
- thread_with_stdio_has_output(thr));
- if (ret)
- return ret;
+static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio)
+{
+ return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
+}
- if (thr->thr.done)
- return 0;
+static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio)
+{
+ return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
+}
- while (len) {
- ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos);
- if (ret)
- break;
+static void stdio_buf_init(struct stdio_buf *buf)
+{
+ spin_lock_init(&buf->lock);
+ init_waitqueue_head(&buf->wait);
+ darray_init(&buf->buf);
+}
- spin_lock_irq(&thr->stdio.output_lock);
- b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos);
+/* thread_with_stdio */
- memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b);
- memmove(thr->stdio.output_buf.buf,
- thr->stdio.output_buf.buf + b,
- thr->stdio.output_buf.pos - b);
+static void thread_with_stdio_done(struct thread_with_stdio *thr)
+{
+ thr->thr.done = true;
+ thr->stdio.done = true;
+ wake_up(&thr->stdio.input.wait);
+ wake_up(&thr->stdio.output.wait);
+}
- thr->output2.nr += b;
- thr->stdio.output_buf.pos -= b;
- spin_unlock_irq(&thr->stdio.output_lock);
+static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+ struct stdio_buf *buf = &thr->stdio.output;
+ size_t copied = 0, b;
+ int ret = 0;
- b = min(len, thr->output2.nr);
- if (!b)
- break;
+ if (!(file->f_flags & O_NONBLOCK)) {
+ ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio));
+ if (ret)
+ return ret;
+ } else if (!stdio_redirect_has_output(&thr->stdio))
+ return -EAGAIN;
- b -= copy_to_user(buf, thr->output2.data, b);
- if (!b) {
+ while (len && buf->buf.nr) {
+ if (fault_in_writeable(ubuf, len) == len) {
ret = -EFAULT;
break;
}
- copied += b;
- buf += b;
- len -= b;
-
- memmove(thr->output2.data,
- thr->output2.data + b,
- thr->output2.nr - b);
- thr->output2.nr -= b;
+ spin_lock_irq(&buf->lock);
+ b = min_t(size_t, len, buf->buf.nr);
+
+ if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) {
+ ubuf += b;
+ len -= b;
+ copied += b;
+ buf->buf.nr -= b;
+ memmove(buf->buf.data,
+ buf->buf.data + b,
+ buf->buf.nr);
+ }
+ spin_unlock_irq(&buf->lock);
}
return copied ?: ret;
@@ -137,27 +151,20 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file)
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
+ thread_with_stdio_done(thr);
bch2_thread_with_file_exit(&thr->thr);
- printbuf_exit(&thr->stdio.input_buf);
- printbuf_exit(&thr->stdio.output_buf);
- darray_exit(&thr->output2);
- thr->exit(thr);
+ darray_exit(&thr->stdio.input.buf);
+ darray_exit(&thr->stdio.output.buf);
+ thr->ops->exit(thr);
return 0;
}
-#define WRITE_BUFFER 4096
-
-static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr)
-{
- return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done;
-}
-
static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
size_t len, loff_t *ppos)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
- struct printbuf *buf = &thr->stdio.input_buf;
+ struct stdio_buf *buf = &thr->stdio.input;
size_t copied = 0;
ssize_t ret = 0;
@@ -173,29 +180,30 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu
break;
}
- spin_lock(&thr->stdio.input_lock);
- if (buf->pos < WRITE_BUFFER)
- bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos));
- b = min(len, printbuf_remaining_size(buf));
-
- if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) {
- ubuf += b;
- len -= b;
- copied += b;
- buf->pos += b;
+ spin_lock(&buf->lock);
+ if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE)
+ darray_make_room_gfp(&buf->buf,
+ min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT);
+ b = min(len, darray_room(buf->buf));
+
+ if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {
+ buf->buf.nr += b;
+ ubuf += b;
+ len -= b;
+ copied += b;
}
- spin_unlock(&thr->stdio.input_lock);
+ spin_unlock(&buf->lock);
if (b) {
- wake_up(&thr->stdio.input_wait);
+ wake_up(&buf->wait);
} else {
if ((file->f_flags & O_NONBLOCK)) {
ret = -EAGAIN;
break;
}
- ret = wait_event_interruptible(thr->stdio.input_wait,
- thread_with_stdio_has_input_space(thr));
+ ret = wait_event_interruptible(buf->wait,
+ stdio_redirect_has_input_space(&thr->stdio));
if (ret)
break;
}
@@ -209,90 +217,233 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
- poll_wait(file, &thr->stdio.output_wait, wait);
- poll_wait(file, &thr->stdio.input_wait, wait);
+ poll_wait(file, &thr->stdio.output.wait, wait);
+ poll_wait(file, &thr->stdio.input.wait, wait);
__poll_t mask = 0;
- if (thread_with_stdio_has_output(thr))
+ if (stdio_redirect_has_output(&thr->stdio))
mask |= EPOLLIN;
- if (thread_with_stdio_has_input_space(thr))
+ if (stdio_redirect_has_input_space(&thr->stdio))
mask |= EPOLLOUT;
if (thr->thr.done)
mask |= EPOLLHUP|EPOLLERR;
return mask;
}
+static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ poll_wait(file, &thr->stdio.output.wait, wait);
+
+ __poll_t mask = 0;
+
+ if (stdio_redirect_has_output(&thr->stdio))
+ mask |= EPOLLIN;
+ if (thr->thr.done)
+ mask |= EPOLLHUP|EPOLLERR;
+ return mask;
+}
+
+static int thread_with_stdio_flush(struct file *file, fl_owner_t id)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ return thr->thr.ret;
+}
+
+static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ if (thr->ops->unlocked_ioctl)
+ return thr->ops->unlocked_ioctl(thr, cmd, p);
+ return -ENOTTY;
+}
+
static const struct file_operations thread_with_stdio_fops = {
- .release = thread_with_stdio_release,
+ .llseek = no_llseek,
.read = thread_with_stdio_read,
.write = thread_with_stdio_write,
.poll = thread_with_stdio_poll,
+ .flush = thread_with_stdio_flush,
+ .release = thread_with_stdio_release,
+ .unlocked_ioctl = thread_with_stdio_ioctl,
+};
+
+static const struct file_operations thread_with_stdout_fops = {
.llseek = no_llseek,
+ .read = thread_with_stdio_read,
+ .poll = thread_with_stdout_poll,
+ .flush = thread_with_stdio_flush,
+ .release = thread_with_stdio_release,
+ .unlocked_ioctl = thread_with_stdio_ioctl,
};
+static int thread_with_stdio_fn(void *arg)
+{
+ struct thread_with_stdio *thr = arg;
+
+ thr->thr.ret = thr->ops->fn(thr);
+
+ thread_with_stdio_done(thr);
+ return 0;
+}
+
int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
- void (*exit)(struct thread_with_stdio *),
- int (*fn)(void *))
+ const struct thread_with_stdio_ops *ops)
{
- thr->stdio.input_buf = PRINTBUF;
- thr->stdio.input_buf.atomic++;
- spin_lock_init(&thr->stdio.input_lock);
- init_waitqueue_head(&thr->stdio.input_wait);
+ stdio_buf_init(&thr->stdio.input);
+ stdio_buf_init(&thr->stdio.output);
+ thr->ops = ops;
- thr->stdio.output_buf = PRINTBUF;
- thr->stdio.output_buf.atomic++;
- spin_lock_init(&thr->stdio.output_lock);
- init_waitqueue_head(&thr->stdio.output_wait);
+ return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
+}
- darray_init(&thr->output2);
- thr->exit = exit;
+int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
+ const struct thread_with_stdio_ops *ops)
+{
+ stdio_buf_init(&thr->stdio.input);
+ stdio_buf_init(&thr->stdio.output);
+ thr->ops = ops;
- return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn);
+ return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn);
}
+EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout);
-int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len)
+int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len)
{
- wait_event(stdio->input_wait,
- stdio->input_buf.pos || stdio->done);
+ struct stdio_buf *buf = &stdio->input;
+
+ /*
+ * we're waiting on user input (or for the file descriptor to be
+ * closed), don't want a hung task warning:
+ */
+ do {
+ wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
+ sysctl_hung_task_timeout_secs * HZ / 2);
+ } while (!stdio_redirect_has_input(stdio));
if (stdio->done)
return -1;
- spin_lock(&stdio->input_lock);
- int ret = min(len, stdio->input_buf.pos);
- stdio->input_buf.pos -= ret;
- memcpy(buf, stdio->input_buf.buf, ret);
- memmove(stdio->input_buf.buf,
- stdio->input_buf.buf + ret,
- stdio->input_buf.pos);
- spin_unlock(&stdio->input_lock);
+ spin_lock(&buf->lock);
+ int ret = min(len, buf->buf.nr);
+ buf->buf.nr -= ret;
+ memcpy(ubuf, buf->buf.data, ret);
+ memmove(buf->buf.data,
+ buf->buf.data + ret,
+ buf->buf.nr);
+ spin_unlock(&buf->lock);
- wake_up(&stdio->input_wait);
+ wake_up(&buf->wait);
return ret;
}
-int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len)
+int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len)
{
- wait_event(stdio->input_wait,
- stdio->input_buf.pos || stdio->done);
-
- if (stdio->done)
- return -1;
+ struct stdio_buf *buf = &stdio->input;
+ size_t copied = 0;
+ ssize_t ret = 0;
+again:
+ do {
+ wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
+ sysctl_hung_task_timeout_secs * HZ / 2);
+ } while (!stdio_redirect_has_input(stdio));
+
+ if (stdio->done) {
+ ret = -1;
+ goto out;
+ }
- spin_lock(&stdio->input_lock);
- int ret = min(len, stdio->input_buf.pos);
- char *n = memchr(stdio->input_buf.buf, '\n', ret);
+ spin_lock(&buf->lock);
+ size_t b = min(len, buf->buf.nr);
+ char *n = memchr(buf->buf.data, '\n', b);
if (n)
- ret = min(ret, n + 1 - stdio->input_buf.buf);
- stdio->input_buf.pos -= ret;
- memcpy(buf, stdio->input_buf.buf, ret);
- memmove(stdio->input_buf.buf,
- stdio->input_buf.buf + ret,
- stdio->input_buf.pos);
- spin_unlock(&stdio->input_lock);
-
- wake_up(&stdio->input_wait);
+ b = min_t(size_t, b, n + 1 - buf->buf.data);
+ buf->buf.nr -= b;
+ memcpy(ubuf, buf->buf.data, b);
+ memmove(buf->buf.data,
+ buf->buf.data + b,
+ buf->buf.nr);
+ ubuf += b;
+ len -= b;
+ copied += b;
+ spin_unlock(&buf->lock);
+
+ wake_up(&buf->wait);
+
+ if (!n && len)
+ goto again;
+out:
+ return copied ?: ret;
+}
+
+__printf(3, 0)
+static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args)
+{
+ ssize_t ret;
+
+ do {
+ va_list args2;
+ size_t len;
+
+ va_copy(args2, args);
+ len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2);
+ va_end(args2);
+
+ if (len + 1 <= darray_room(*out)) {
+ out->nr += len;
+ return len;
+ }
+
+ ret = darray_make_room_gfp(out, len + 1, gfp);
+ } while (ret == 0);
+
+ return ret;
+}
+
+ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking,
+ const char *fmt, va_list args)
+{
+ struct stdio_buf *buf = &stdio->output;
+ unsigned long flags;
+ ssize_t ret;
+
+again:
+ spin_lock_irqsave(&buf->lock, flags);
+ ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
+ spin_unlock_irqrestore(&buf->lock, flags);
+
+ if (ret < 0) {
+ if (nonblocking)
+ return -EAGAIN;
+
+ ret = wait_event_interruptible(buf->wait,
+ stdio_redirect_has_output_space(stdio));
+ if (ret)
+ return ret;
+ goto again;
+ }
+
+ wake_up(&buf->wait);
+ return ret;
+}
+
+ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking,
+ const char *fmt, ...)
+{
+ va_list args;
+ ssize_t ret;
+
+ va_start(args, fmt);
+ ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args);
+ va_end(args);
+
return ret;
}
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
index 05879c5048c8..af54ea8f5b0f 100644
--- a/fs/bcachefs/thread_with_file.h
+++ b/fs/bcachefs/thread_with_file.h
@@ -4,6 +4,38 @@
#include "thread_with_file_types.h"
+/*
+ * Thread with file: Run a kthread and connect it to a file descriptor, so that
+ * it can be interacted with via fd read/write methods and closing the file
+ * descriptor stops the kthread.
+ *
+ * We have two different APIs:
+ *
+ * thread_with_file, the low level version.
+ * You get to define the full file_operations, including your release function,
+ * which means that you must call bch2_thread_with_file_exit() from your
+ * .release method
+ *
+ * thread_with_stdio, the higher level version
+ * This implements full piping of input and output, including .poll.
+ *
+ * Notes on behaviour:
+ * - kthread shutdown behaves like writing or reading from a pipe that has been
+ * closed
+ * - Input and output buffers are 4096 bytes, although buffers may in some
+ * situations slightly exceed that limit so as to avoid chopping off a
+ * message in the middle in nonblocking mode.
+ * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations -
+ * should be fine but might change in future revisions.
+ * - Output buffer may grow past 4096 bytes to deal with messages that are
+ * bigger than 4096 bytes
+ * - Writing may be done blocking or nonblocking; in nonblocking mode, we only
+ * drop entire messages.
+ *
+ * To write, use bch2_stdio_redirect_printf()
+ * To read, use bch2_stdio_redirect_read() or bch2_stdio_redirect_readline()
+ */
+
struct task_struct;
struct thread_with_file {
@@ -17,25 +49,28 @@ int bch2_run_thread_with_file(struct thread_with_file *,
const struct file_operations *,
int (*fn)(void *));
+struct thread_with_stdio;
+
+/*
+ * Callback table handed to bch2_run_thread_with_stdio() and
+ * bch2_run_thread_with_stdout().
+ *
+ * NOTE(review): presumably @fn is the thread body, @exit the teardown hook and
+ * @unlocked_ioctl the handler for ioctls on the file - confirm against
+ * thread_with_file.c.
+ */
+struct thread_with_stdio_ops {
+ void (*exit)(struct thread_with_stdio *);
+ int (*fn)(struct thread_with_stdio *);
+ long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long);
+};
+
struct thread_with_stdio {
struct thread_with_file thr;
struct stdio_redirect stdio;
- DARRAY(char) output2;
- void (*exit)(struct thread_with_stdio *);
+ const struct thread_with_stdio_ops *ops;
};
-static inline void thread_with_stdio_done(struct thread_with_stdio *thr)
-{
- thr->thr.done = true;
- thr->stdio.done = true;
- wake_up(&thr->stdio.input_wait);
- wake_up(&thr->stdio.output_wait);
-}
-
int bch2_run_thread_with_stdio(struct thread_with_stdio *,
- void (*exit)(struct thread_with_stdio *),
- int (*fn)(void *));
+ const struct thread_with_stdio_ops *);
+int bch2_run_thread_with_stdout(struct thread_with_stdio *,
+ const struct thread_with_stdio_ops *);
int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
+__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
+__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
+
#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
index 90b5e645e98c..e0daf4eec341 100644
--- a/fs/bcachefs/thread_with_file_types.h
+++ b/fs/bcachefs/thread_with_file_types.h
@@ -2,14 +2,21 @@
#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
+#include "darray.h"
+
+/*
+ * One buffer of a stdio redirect: the bytes themselves, the spinlock taken
+ * around accesses to them, and the waitqueue used to sleep on / signal changes
+ * to the buffer.
+ */
+struct stdio_buf {
+ spinlock_t lock;
+ wait_queue_head_t wait;
+ darray_char buf;
+};
+
struct stdio_redirect {
- spinlock_t output_lock;
- wait_queue_head_t output_wait;
- struct printbuf output_buf;
+ struct stdio_buf input;
+ struct stdio_buf output;
spinlock_t input_lock;
wait_queue_head_t input_wait;
- struct printbuf input_buf;
+ darray_char input_buf;
bool done;
};
diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
new file mode 100644
index 000000000000..4508e9dcbee2
--- /dev/null
+++ b/fs/bcachefs/time_stats.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+
+#include "eytzinger.h"
+#include "time_stats.h"
+
+/*
+ * Units used when printing durations, in ascending order; nsecs is the length
+ * of one unit in nanoseconds.  bch2_pick_time_units() walks this table from
+ * the start, so the ordering matters.  The last entry ("eon") has
+ * nsecs == U64_MAX.
+ */
+static const struct time_unit time_units[] = {
+	{ "ns",		1		 },
+	{ "us",		NSEC_PER_USEC	 },
+	{ "ms",		NSEC_PER_MSEC	 },
+	{ "s",		NSEC_PER_SEC	 },
+	{ "m",		(u64) NSEC_PER_SEC * 60},
+	{ "h",		(u64) NSEC_PER_SEC * 3600},
+	{ "d",		(u64) NSEC_PER_SEC * 3600 * 24},
+	{ "w",		(u64) NSEC_PER_SEC * 3600 * 24 * 7},
+	/* one year is 365 days plus a quarter day; no per-week factor here */
+	{ "y",		(u64) NSEC_PER_SEC * ((3600 * 24 * 365) + (3600 * (24 / 4)))}, /* 365.25d */
+	{ "eon",	U64_MAX		 },
+};
+
+/*
+ * Pick the unit to print @ns in: advance through time_units while @ns is at
+ * least twice the size of the next unit, never stepping past the final entry.
+ */
+const struct time_unit *bch2_pick_time_units(u64 ns)
+{
+	const struct time_unit *last = time_units + ARRAY_SIZE(time_units) - 1;
+	const struct time_unit *unit = time_units;
+
+	while (unit < last && ns >= unit[1].nsecs << 1)
+		unit++;
+
+	return unit;
+}
+
+/*
+ * Feed one sample @v into the quantile estimator @q.
+ *
+ * Each visited entry moves its estimate m towards @v by step (saturating at 0
+ * and U64_MAX) and halves step once the estimate is within one step of @v.
+ * Entries are laid out in eytzinger order and walked from the root.
+ *
+ * m and step are u64, so the clamping below is done in u64 as well; doing it
+ * in 'unsigned' would truncate samples of 2^32 ns and above.
+ */
+static void quantiles_update(struct quantiles *q, u64 v)
+{
+	unsigned i = 0;
+
+	while (i < ARRAY_SIZE(q->entries)) {
+		struct quantile_entry *e = q->entries + i;
+
+		if (unlikely(!e->step)) {
+			/* first sample seen by this entry: seed it */
+			e->m = v;
+			e->step = max_t(u64, v / 2, 1024);
+		} else if (e->m > v) {
+			e->m = e->m >= e->step
+				? e->m - e->step
+				: 0;
+		} else if (e->m < v) {
+			/* saturate instead of wrapping on u64 overflow */
+			e->m = e->m + e->step > e->m
+				? e->m + e->step
+				: U64_MAX;
+		}
+
+		if ((e->m > v ? e->m - v : v - e->m) < e->step)
+			e->step = max_t(u64, e->step / 2, 1);
+
+		if (v >= e->m)
+			break;
+
+		i = eytzinger0_child(i, v > e->m);
+	}
+}
+
+/*
+ * Account a single event that ran from @start to @end into @stats.
+ *
+ * Updates the duration statistics (only when @end is after @start) and the
+ * frequency statistics (interval since the previous event's end, only when a
+ * previous event exists and @end is after it), then records @end as the most
+ * recent event.
+ */
+static inline void time_stats_update_one(struct bch2_time_stats *stats,
+ u64 start, u64 end)
+{
+ u64 duration, freq;
+ /* false only until the first event has been recorded */
+ bool initted = stats->last_event != 0;
+
+ if (time_after64(end, start)) {
+ struct quantiles *quantiles = time_stats_to_quantiles(stats);
+
+ duration = end - start;
+ mean_and_variance_update(&stats->duration_stats, duration);
+ mean_and_variance_weighted_update(&stats->duration_stats_weighted,
+ duration, initted, TIME_STATS_MV_WEIGHT);
+ stats->max_duration = max(stats->max_duration, duration);
+ stats->min_duration = min(stats->min_duration, duration);
+ stats->total_duration += duration;
+
+ /* quantiles exist only for bch2_time_stats_quantiles users */
+ if (quantiles)
+ quantiles_update(quantiles, duration);
+ }
+
+ if (stats->last_event && time_after64(end, stats->last_event)) {
+ freq = end - stats->last_event;
+ mean_and_variance_update(&stats->freq_stats, freq);
+ mean_and_variance_weighted_update(&stats->freq_stats_weighted,
+ freq, initted, TIME_STATS_MV_WEIGHT);
+ stats->max_freq = max(stats->max_freq, freq);
+ stats->min_freq = min(stats->min_freq, freq);
+ }
+
+ stats->last_event = end;
+}
+
+/*
+ * Fold the events queued in the percpu buffer @b into @stats and empty @b.
+ *
+ * Only the first b->nr entries are valid: slots past that are either zeroed
+ * or left over from an earlier flush, and replaying them would double count
+ * events or reset stats->last_event.  This function takes no lock itself.
+ */
+void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+				    struct time_stat_buffer *b)
+{
+	for (struct time_stat_buffer_entry *i = b->entries;
+	     i < b->entries + b->nr;
+	     i++)
+		time_stats_update_one(stats, i->start, i->end);
+	b->nr = 0;
+}
+
+/*
+ * Locked wrapper around __bch2_time_stats_clear_buffer(): flushes @b into
+ * @stats while holding stats->lock with interrupts disabled.
+ */
+static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct time_stat_buffer *b)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&stats->lock, flags);
+ __bch2_time_stats_clear_buffer(stats, b);
+ spin_unlock_irqrestore(&stats->lock, flags);
+}
+
+/*
+ * Record an event that ran from @start to @end.
+ *
+ * Without a percpu buffer the event is accounted immediately under
+ * stats->lock.  Once a buffer exists, events are appended to this CPU's buffer
+ * and only folded into @stats when that buffer becomes full.
+ */
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
+{
+ unsigned long flags;
+
+ if (!stats->buffer) {
+ spin_lock_irqsave(&stats->lock, flags);
+ time_stats_update_one(stats, start, end);
+
+ /*
+ * Switch to percpu buffering when the weighted mean interval between
+ * events is below 32 and duration_stats.n exceeds 1024.  If the
+ * allocation fails buffer stays NULL and we keep using this path.
+ */
+ if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
+ stats->duration_stats.n > 1024)
+ stats->buffer =
+ alloc_percpu_gfp(struct time_stat_buffer,
+ GFP_ATOMIC);
+ spin_unlock_irqrestore(&stats->lock, flags);
+ } else {
+ struct time_stat_buffer *b;
+
+ /* stay on this CPU while we touch its buffer */
+ preempt_disable();
+ b = this_cpu_ptr(stats->buffer);
+
+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+ b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+ .start = start,
+ .end = end
+ };
+
+ /* buffer is full: flush it into the shared stats */
+ if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
+ time_stats_clear_buffer(stats, b);
+ preempt_enable();
+ }
+}
+
+/* Release the percpu event buffer of @stats (which may never have been allocated). */
+void bch2_time_stats_exit(struct bch2_time_stats *stats)
+{
+ free_percpu(stats->buffer);
+}
+
+/*
+ * Reset @stats to an empty state: everything zeroed, the minimum trackers set
+ * to U64_MAX so the first sample replaces them, and the lock initialized.
+ */
+void bch2_time_stats_init(struct bch2_time_stats *stats)
+{
+ memset(stats, 0, sizeof(*stats));
+ stats->min_duration = U64_MAX;
+ stats->min_freq = U64_MAX;
+ spin_lock_init(&stats->lock);
+}
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
new file mode 100644
index 000000000000..5df61403744b
--- /dev/null
+++ b/fs/bcachefs/time_stats.h
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * bch2_time_stats - collect statistics on events that have a duration, with nicely
+ * formatted textual output on demand
+ *
+ * - percpu buffering of event collection: cheap enough to shotgun
+ * everywhere without worrying about overhead
+ *
+ * tracks:
+ * - number of events
+ * - maximum event duration ever seen
+ * - sum of all event durations
+ * - average event duration, standard and weighted
+ * - standard deviation of event durations, standard and weighted
+ * and analogous statistics for the frequency of events
+ *
+ * We provide both mean and weighted mean (exponentially weighted), and standard
+ * deviation and weighted standard deviation, to give an efficient-to-compute
+ * view of current behaviour versus average behaviour - "did this event source
+ * just become wonky, or is this typical?".
+ *
+ * Particularly useful for tracking down latency issues.
+ */
+#ifndef _BCACHEFS_TIME_STATS_H
+#define _BCACHEFS_TIME_STATS_H
+
+#include <linux/sched/clock.h>
+#include <linux/spinlock_types.h>
+#include <linux/string.h>
+
+#include "mean_and_variance.h"
+
+struct time_unit {
+ const char *name;
+ u64 nsecs;
+};
+
+/*
+ * given a nanosecond value, pick the preferred time units for printing:
+ */
+const struct time_unit *bch2_pick_time_units(u64 ns);
+
+/*
+ * quantiles - do not use:
+ *
+ * Only enabled if bch2_time_stats->have_quantiles is set (see
+ * bch2_time_stats_quantiles_init()) - don't use in new code.
+ */
+
+#define NR_QUANTILES 15
+#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
+
+/*
+ * Quantile estimator state: NR_QUANTILES entries, each holding a current
+ * estimate (m) and the amount it is adjusted by per sample (step).
+ */
+struct quantiles {
+ struct quantile_entry {
+ u64 m;
+ u64 step;
+ } entries[NR_QUANTILES];
+};
+
+/*
+ * Per-CPU staging buffer of not yet accounted events; nr is the number of
+ * valid entries at the front of entries[].
+ */
+struct time_stat_buffer {
+ unsigned nr;
+ struct time_stat_buffer_entry {
+ u64 start;
+ u64 end;
+ } entries[31];
+};
+
+/* Accumulated duration and frequency statistics for one event source. */
+struct bch2_time_stats {
+ spinlock_t lock;
+ /* set when this struct is embedded in a bch2_time_stats_quantiles */
+ bool have_quantiles;
+ /* all fields are in nanoseconds */
+ u64 min_duration;
+ u64 max_duration;
+ u64 total_duration;
+ u64 max_freq;
+ u64 min_freq;
+ /* end time of the most recently accounted event, 0 if none yet */
+ u64 last_event;
+ /* used by track_event_change(): nonzero while the tracked state is set */
+ u64 last_event_start;
+
+ struct mean_and_variance duration_stats;
+ struct mean_and_variance freq_stats;
+
+/* default weight for weighted mean and variance calculations */
+#define TIME_STATS_MV_WEIGHT 8
+
+ struct mean_and_variance_weighted duration_stats_weighted;
+ struct mean_and_variance_weighted freq_stats_weighted;
+ /* percpu event buffers; NULL until allocated on demand */
+ struct time_stat_buffer __percpu *buffer;
+};
+
+/*
+ * bch2_time_stats plus quantile tracking; time_stats_to_quantiles() recovers
+ * this container from the embedded stats member.
+ */
+struct bch2_time_stats_quantiles {
+ struct bch2_time_stats stats;
+ struct quantiles quantiles;
+};
+
+/*
+ * Return the quantiles that accompany @stats, or NULL when @stats is not
+ * flagged (via have_quantiles) as living inside a bch2_time_stats_quantiles.
+ */
+static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats)
+{
+	struct bch2_time_stats_quantiles *statq;
+
+	if (!stats->have_quantiles)
+		return NULL;
+
+	statq = container_of(stats, struct bch2_time_stats_quantiles, stats);
+	return &statq->quantiles;
+}
+
+void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *);
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
+
+/**
+ * bch2_time_stats_update - collect a new event being tracked
+ *
+ * @stats: bch2_time_stats to update
+ * @start: start time of event, recorded with local_clock()
+ *
+ * The end time of the event will be the current time, read with local_clock()
+ */
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
+{
+ __bch2_time_stats_update(stats, start, local_clock());
+}
+
+/**
+ * track_event_change - track state change events
+ *
+ * @stats: bch2_time_stats to update
+ * @v: new state, true or false
+ *
+ * For timing how long a state lasts, e.g. resource X being blocked.  A change
+ * to true stamps the start time in @stats->last_event_start; a change to false
+ * records the elapsed interval as an event and clears the stamp.  Calls that
+ * do not change the state do nothing.
+ *
+ * Returns true only when the state changed from false to true.
+ */
+static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
+{
+	bool active = stats->last_event_start != 0;
+
+	if (v == active)
+		return false;
+
+	if (v) {
+		/* 0 means "not active", so never store a zero timestamp */
+		stats->last_event_start = local_clock() ?: 1;
+		return true;
+	}
+
+	bch2_time_stats_update(stats, stats->last_event_start);
+	stats->last_event_start = 0;
+	return false;
+}
+
+void bch2_time_stats_exit(struct bch2_time_stats *);
+void bch2_time_stats_init(struct bch2_time_stats *);
+
+/* Tear down @statq; only the embedded bch2_time_stats needs cleanup. */
+static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
+{
+ bch2_time_stats_exit(&statq->stats);
+}
+/*
+ * Initialize @statq: set up the embedded stats, mark them as having quantiles
+ * (so time_stats_to_quantiles() returns them) and zero the quantile state.
+ */
+static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq)
+{
+ bch2_time_stats_init(&statq->stats);
+ /* must come after init, which zeroes the whole stats struct */
+ statq->stats.have_quantiles = true;
+ memset(&statq->quantiles, 0, sizeof(statq->quantiles));
+}
+
+#endif /* _BCACHEFS_TIME_STATS_H */
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 293b90d704fb..6aa81d1e6d36 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1431,6 +1431,25 @@ DEFINE_EVENT(fs_str, data_update,
TP_ARGS(c, str)
);
+/*
+ * error_downcast: records a pair of error codes and a code address.  Both
+ * codes are stored as the strings returned by bch2_err_str() and @ip is stored
+ * as a symbol name ("%ps"); each string is limited to 32 bytes.
+ */
+TRACE_EVENT(error_downcast,
+ TP_PROTO(int bch_err, int std_err, unsigned long ip),
+ TP_ARGS(bch_err, std_err, ip),
+
+ TP_STRUCT__entry(
+ __array(char, bch_err, 32 )
+ __array(char, std_err, 32 )
+ __array(char, ip, 32 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
+ strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
+ snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
+ ),
+
+ TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
+);
+
#endif /* _TRACE_BCACHEFS_H */
/* This part must be outside protection */
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 3a32faa86b5c..92c6ad75e702 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -337,157 +337,16 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec)
}
#endif
-static const struct time_unit {
- const char *name;
- u64 nsecs;
-} time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "s", NSEC_PER_SEC },
- { "m", (u64) NSEC_PER_SEC * 60},
- { "h", (u64) NSEC_PER_SEC * 3600},
- { "eon", U64_MAX },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
- const struct time_unit *u;
-
- for (u = time_units;
- u + 1 < time_units + ARRAY_SIZE(time_units) &&
- ns >= u[1].nsecs << 1;
- u++)
- ;
-
- return u;
-}
-
void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
- const struct time_unit *u = pick_time_units(ns);
+ const struct time_unit *u = bch2_pick_time_units(ns);
prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
}
-/* time stats: */
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
-{
- unsigned i = 0;
-
- while (i < ARRAY_SIZE(q->entries)) {
- struct bch2_quantile_entry *e = q->entries + i;
-
- if (unlikely(!e->step)) {
- e->m = v;
- e->step = max_t(unsigned, v / 2, 1024);
- } else if (e->m > v) {
- e->m = e->m >= e->step
- ? e->m - e->step
- : 0;
- } else if (e->m < v) {
- e->m = e->m + e->step > e->m
- ? e->m + e->step
- : U32_MAX;
- }
-
- if ((e->m > v ? e->m - v : v - e->m) < e->step)
- e->step = max_t(unsigned, e->step / 2, 1);
-
- if (v >= e->m)
- break;
-
- i = eytzinger0_child(i, v > e->m);
- }
-}
-
-static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
- u64 start, u64 end)
-{
- u64 duration, freq;
-
- if (time_after64(end, start)) {
- duration = end - start;
- mean_and_variance_update(&stats->duration_stats, duration);
- mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
- stats->max_duration = max(stats->max_duration, duration);
- stats->min_duration = min(stats->min_duration, duration);
- stats->total_duration += duration;
- bch2_quantiles_update(&stats->quantiles, duration);
- }
-
- if (stats->last_event && time_after64(end, stats->last_event)) {
- freq = end - stats->last_event;
- mean_and_variance_update(&stats->freq_stats, freq);
- mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
- stats->max_freq = max(stats->max_freq, freq);
- stats->min_freq = min(stats->min_freq, freq);
- }
-
- stats->last_event = end;
-}
-
-static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct bch2_time_stat_buffer *b)
-{
- for (struct bch2_time_stat_buffer_entry *i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- bch2_time_stats_update_one(stats, i->start, i->end);
- b->nr = 0;
-}
-
-static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct bch2_time_stat_buffer *b)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&stats->lock, flags);
- __bch2_time_stats_clear_buffer(stats, b);
- spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
-{
- unsigned long flags;
-
- WARN_ONCE(!stats->duration_stats_weighted.weight ||
- !stats->freq_stats_weighted.weight,
- "uninitialized time_stats");
-
- if (!stats->buffer) {
- spin_lock_irqsave(&stats->lock, flags);
- bch2_time_stats_update_one(stats, start, end);
-
- if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
- stats->duration_stats.n > 1024)
- stats->buffer =
- alloc_percpu_gfp(struct bch2_time_stat_buffer,
- GFP_ATOMIC);
- spin_unlock_irqrestore(&stats->lock, flags);
- } else {
- struct bch2_time_stat_buffer *b;
-
- preempt_disable();
- b = this_cpu_ptr(stats->buffer);
-
- BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
- b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
- .start = start,
- .end = end
- };
-
- if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
- bch2_time_stats_clear_buffer(stats, b);
- preempt_enable();
- }
-}
-
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
- const struct time_unit *u = pick_time_units(ns);
+ const struct time_unit *u = bch2_pick_time_units(ns);
prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
prt_tab_rjust(out);
@@ -506,10 +365,9 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
{
- const struct time_unit *u;
+ struct quantiles *quantiles = time_stats_to_quantiles(stats);
s64 f_mean = 0, d_mean = 0;
- u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
- int i;
+ u64 f_stddev = 0, d_stddev = 0;
if (stats->buffer) {
int cpu;
@@ -571,14 +429,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
prt_tab(out);
bch2_pr_time_units_aligned(out, d_mean);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
prt_printf(out, "stddev:");
prt_tab(out);
bch2_pr_time_units_aligned(out, d_stddev);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
printbuf_indent_sub(out, 2);
prt_newline(out);
@@ -594,53 +452,38 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
prt_tab(out);
bch2_pr_time_units_aligned(out, f_mean);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
prt_printf(out, "stddev:");
prt_tab(out);
bch2_pr_time_units_aligned(out, f_stddev);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
printbuf_indent_sub(out, 2);
prt_newline(out);
printbuf_tabstops_reset(out);
- i = eytzinger0_first(NR_QUANTILES);
- u = pick_time_units(stats->quantiles.entries[i].m);
-
- prt_printf(out, "quantiles (%s):\t", u->name);
- eytzinger0_for_each(i, NR_QUANTILES) {
- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
-
- q = max(stats->quantiles.entries[i].m, last_q);
- prt_printf(out, "%llu ",
- div_u64(q, u->nsecs));
- if (is_last)
- prt_newline(out);
- last_q = q;
+ if (quantiles) {
+ int i = eytzinger0_first(NR_QUANTILES);
+ const struct time_unit *u =
+ bch2_pick_time_units(quantiles->entries[i].m);
+ u64 last_q = 0;
+
+ prt_printf(out, "quantiles (%s):\t", u->name);
+ eytzinger0_for_each(i, NR_QUANTILES) {
+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+ u64 q = max(quantiles->entries[i].m, last_q);
+ prt_printf(out, "%llu ", div_u64(q, u->nsecs));
+ if (is_last)
+ prt_newline(out);
+ last_q = q;
+ }
}
}
-#else
-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
-#endif
-
-void bch2_time_stats_exit(struct bch2_time_stats *stats)
-{
- free_percpu(stats->buffer);
-}
-
-void bch2_time_stats_init(struct bch2_time_stats *stats)
-{
- memset(stats, 0, sizeof(*stats));
- stats->duration_stats_weighted.weight = 8;
- stats->freq_stats_weighted.weight = 8;
- stats->min_duration = U64_MAX;
- stats->min_freq = U64_MAX;
- spin_lock_init(&stats->lock);
-}
/* ratelimit: */
@@ -864,171 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}
-static int alignment_ok(const void *base, size_t align)
-{
- return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
- ((unsigned long)base & (align - 1)) == 0;
-}
-
-static void u32_swap(void *a, void *b, size_t size)
-{
- u32 t = *(u32 *)a;
- *(u32 *)a = *(u32 *)b;
- *(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, size_t size)
-{
- u64 t = *(u64 *)a;
- *(u64 *)a = *(u64 *)b;
- *(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, size_t size)
-{
- char t;
-
- do {
- t = *(char *)a;
- *(char *)a++ = *(char *)b;
- *(char *)b++ = t;
- } while (--size > 0);
-}
-
-static inline int do_cmp(void *base, size_t n, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- size_t l, size_t r)
-{
- return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
- size);
-}
-
-static inline void do_swap(void *base, size_t n, size_t size,
- void (*swap_func)(void *, void *, size_t),
- size_t l, size_t r)
-{
- swap_func(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
- size);
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t))
-{
- int i, c, r;
-
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
-
- /* heapify */
- for (i = n / 2 - 1; i >= 0; --i) {
- for (r = i; r * 2 + 1 < n; r = c) {
- c = r * 2 + 1;
-
- if (c + 1 < n &&
- do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
- c++;
-
- if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
- break;
-
- do_swap(base, n, size, swap_func, r, c);
- }
- }
-
- /* sort */
- for (i = n - 1; i > 0; --i) {
- do_swap(base, n, size, swap_func, 0, i);
-
- for (r = 0; r * 2 + 1 < i; r = c) {
- c = r * 2 + 1;
-
- if (c + 1 < i &&
- do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
- c++;
-
- if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
- break;
-
- do_swap(base, n, size, swap_func, r, c);
- }
- }
-}
-
-void sort_cmp_size(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t size))
-{
- /* pre-scale counters for performance */
- int i = (num/2 - 1) * size, n = num * size, c, r;
-
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
-
- /* heapify */
- for ( ; i >= 0; i -= size) {
- for (r = i; r * 2 + size < n; r = c) {
- c = r * 2 + size;
- if (c < n - size &&
- cmp_func(base + c, base + c + size, size) < 0)
- c += size;
- if (cmp_func(base + r, base + c, size) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-
- /* sort */
- for (i = n - size; i > 0; i -= size) {
- swap_func(base, base + i, size);
- for (r = 0; r * 2 + size < i; r = c) {
- c = r * 2 + size;
- if (c < i - size &&
- cmp_func(base + c, base + c + size, size) < 0)
- c += size;
- if (cmp_func(base + r, base + c, size) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-}
-
-static void mempool_free_vp(void *element, void *pool_data)
-{
- size_t size = (size_t) pool_data;
-
- vpfree(element, size);
-}
-
-static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
-{
- size_t size = (size_t) pool_data;
-
- return vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
-{
- return size < PAGE_SIZE
- ? mempool_init_kmalloc_pool(pool, min_nr, size)
- : mempool_init(pool, min_nr, mempool_alloc_vp,
- mempool_free_vp, (void *) size);
-}
-
#if 0
void eytzinger1_test(void)
{
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index b414736d59a5..5cf885b09986 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -21,6 +21,7 @@
#include "mean_and_variance.h"
#include "darray.h"
+#include "time_stats.h"
struct closure;
@@ -53,38 +54,6 @@ static inline size_t buf_pages(void *p, size_t len)
PAGE_SIZE);
}
-static inline void vpfree(void *p, size_t size)
-{
- if (is_vmalloc_addr(p))
- vfree(p);
- else
- free_pages((unsigned long) p, get_order(size));
-}
-
-static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
-{
- return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
- get_order(size)) ?:
- __vmalloc(size, gfp_mask);
-}
-
-static inline void kvpfree(void *p, size_t size)
-{
- if (size < PAGE_SIZE)
- kfree(p);
- else
- vpfree(p, size);
-}
-
-static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
-{
- return size < PAGE_SIZE
- ? kmalloc(size, gfp_mask)
- : vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
-
#define HEAP(type) \
struct { \
size_t size, used; \
@@ -97,13 +66,13 @@ struct { \
({ \
(heap)->used = 0; \
(heap)->size = (_size); \
- (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+ (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
(gfp)); \
})
#define free_heap(heap) \
do { \
- kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \
+ kvfree((heap)->data); \
(heap)->data = NULL; \
} while (0)
@@ -361,84 +330,8 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
#endif
}
-#define NR_QUANTILES 15
-#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
-#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
-#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
-
-struct bch2_quantiles {
- struct bch2_quantile_entry {
- u64 m;
- u64 step;
- } entries[NR_QUANTILES];
-};
-
-struct bch2_time_stat_buffer {
- unsigned nr;
- struct bch2_time_stat_buffer_entry {
- u64 start;
- u64 end;
- } entries[32];
-};
-
-struct bch2_time_stats {
- spinlock_t lock;
- /* all fields are in nanoseconds */
- u64 min_duration;
- u64 max_duration;
- u64 total_duration;
- u64 max_freq;
- u64 min_freq;
- u64 last_event;
- struct bch2_quantiles quantiles;
-
- struct mean_and_variance duration_stats;
- struct mean_and_variance_weighted duration_stats_weighted;
- struct mean_and_variance freq_stats;
- struct mean_and_variance_weighted freq_stats_weighted;
- struct bch2_time_stat_buffer __percpu *buffer;
-};
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
-{
- __bch2_time_stats_update(stats, start, local_clock());
-}
-
-static inline bool track_event_change(struct bch2_time_stats *stats,
- u64 *start, bool v)
-{
- if (v != !!*start) {
- if (!v) {
- bch2_time_stats_update(stats, *start);
- *start = 0;
- } else {
- *start = local_clock() ?: 1;
- return true;
- }
- }
-
- return false;
-}
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
-static inline bool track_event_change(struct bch2_time_stats *stats,
- u64 *start, bool v)
-{
- bool ret = v && !*start;
- *start = v;
- return ret;
-}
-#endif
-
void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
-void bch2_time_stats_exit(struct bch2_time_stats *);
-void bch2_time_stats_init(struct bch2_time_stats *);
-
#define ewma_add(ewma, val, weight) \
({ \
typeof(ewma) _ewma = (ewma); \
@@ -738,10 +631,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-void sort_cmp_size(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t));
-
/* just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos) \
memmove(&(_array)[(_pos) + 1], \
@@ -788,8 +677,15 @@ static inline void __move_gap(void *array, size_t element_size,
}
/* Move the gap in a gap buffer: */
-#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \
- __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap)
+/*
+ * @_d points to a gap buffer with data/nr/size/gap members.  @_new_gap is
+ * evaluated exactly once, so expressions with side effects are safe to pass.
+ */
+#define move_gap(_d, _new_gap)						\
+do {									\
+	typeof((_d)->gap) _mg_new_gap = (_new_gap);			\
+									\
+	BUG_ON(_mg_new_gap > (_d)->nr);					\
+	BUG_ON((_d)->gap > (_d)->nr);					\
+									\
+	__move_gap((_d)->data, sizeof((_d)->data[0]),			\
+		   (_d)->nr, (_d)->size, (_d)->gap, _mg_new_gap);	\
+	(_d)->gap = _mg_new_gap;					\
+} while (0)
#define bubble_sort(_base, _nr, _cmp) \
do { \
@@ -876,4 +772,43 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r)
void bch2_darray_str_exit(darray_str *);
int bch2_split_devs(const char *, darray_str *);
+#ifdef __KERNEL__
+
+/*
+ * copy_to_user() with an errno-style result: 0 when everything was copied,
+ * -EFAULT when copy_to_user() reported a nonzero result.
+ */
+__must_check
+static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
+{
+	if (copy_to_user(to, from, n))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * copy_from_user() with an errno-style result: 0 when everything was copied,
+ * -EFAULT when copy_from_user() reported a nonzero result.
+ */
+__must_check
+static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n)
+{
+	if (copy_from_user(to, from, n))
+		return -EFAULT;
+
+	return 0;
+}
+
+#endif
+
+/* Set bit @nr at @addr when @v is true, clear it otherwise (set_bit()/clear_bit()). */
+static inline void mod_bit(long nr, volatile unsigned long *addr, bool v)
+{
+ if (v)
+ set_bit(nr, addr);
+ else
+ clear_bit(nr, addr);
+}
+
+/* Set bit @bit in a bitmap stored as an array of little-endian 64-bit words. */
+static inline void __set_bit_le64(size_t bit, __le64 *addr)
+{
+	__le64 mask = cpu_to_le64(BIT_ULL(bit % 64));
+
+	addr[bit / 64] |= mask;
+}
+
+/* Clear bit @bit in a bitmap stored as an array of little-endian 64-bit words. */
+static inline void __clear_bit_le64(size_t bit, __le64 *addr)
+{
+	__le64 mask = cpu_to_le64(BIT_ULL(bit % 64));
+
+	addr[bit / 64] &= ~mask;
+}
+
+/* Test bit @bit in a bitmap stored as an array of little-endian 64-bit words. */
+static inline bool test_bit_le64(size_t bit, __le64 *addr)
+{
+	__le64 mask = cpu_to_le64(BIT_ULL(bit % 64));
+
+	return (addr[bit / 64] & mask) != 0;
+}
+
#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 9c0d2316031b..754f17bba68e 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -544,11 +544,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
kfree(buf);
if (ret < 0)
- return ret;
+ goto err_class_exit;
ret = bch2_opt_check_may_set(c, opt_id, v);
if (ret < 0)
- return ret;
+ goto err_class_exit;
s.v = v + 1;
s.defined = true;
@@ -595,6 +595,7 @@ err:
(opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
+err_class_exit:
return bch2_err_class(ret);
}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 2b4dda047450..d76f406d3b2e 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -435,8 +435,7 @@ befs_init_inodecache(void)
{
befs_inode_cachep = kmem_cache_create_usercopy("befs_inode_cache",
sizeof(struct befs_inode_info), 0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
- SLAB_ACCOUNT),
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
offsetof(struct befs_inode_info,
i_data.symlink),
sizeof_field(struct befs_inode_info,
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 355957dbce39..db81570c9637 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -259,7 +259,7 @@ static int __init init_inodecache(void)
bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
sizeof(struct bfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
init_once);
if (bfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index fefc642541cb..3314249e8674 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -320,7 +320,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
else
executable_stack = EXSTACK_DEFAULT;
- if (stack_size == 0) {
+ if (stack_size == 0 && interp_params.flags & ELF_FDPIC_FLAG_PRESENT) {
stack_size = interp_params.stack_size;
if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
executable_stack = EXSTACK_ENABLE_X;
@@ -1359,7 +1359,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
rcu_read_unlock();
- strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
+ get_task_comm(psinfo->pr_fname, p);
return 0;
}
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e24d784898fc..7696beec4c21 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -244,7 +244,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct block_device *bdev;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
@@ -255,13 +255,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return -EINVAL;
}
- bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev_handle)) {
+ if (IS_ERR(bdev_file)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
- return PTR_ERR(bdev_handle);
+ return PTR_ERR(bdev_file);
}
- bdev = bdev_handle->bdev;
+ bdev = file_bdev(bdev_file);
if (!btrfs_check_device_zone_type(fs_info, bdev)) {
btrfs_err(fs_info,
@@ -312,7 +312,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
- device->bdev_handle = bdev_handle;
+ device->bdev_file = bdev_file;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->dev_stats_valid = 1;
@@ -333,7 +333,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return 0;
error:
- bdev_release(bdev_handle);
+ fput(bdev_file);
return ret;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5a507237c4fa..55f3ba6a831c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2736,7 +2736,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_ioctl_vol_args_v2 *vol_args;
- struct bdev_handle *bdev_handle = NULL;
+ struct file *bdev_file = NULL;
int ret;
bool cancel = false;
@@ -2776,7 +2776,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto err_drop;
/* Exclusive operation is now claimed */
- ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_file);
btrfs_exclop_finish(fs_info);
@@ -2790,8 +2790,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
}
err_drop:
mnt_drop_write_file(file);
- if (bdev_handle)
- bdev_release(bdev_handle);
+ if (bdev_file)
+ fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2804,7 +2804,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_ioctl_vol_args *vol_args;
- struct bdev_handle *bdev_handle = NULL;
+ struct file *bdev_file = NULL;
int ret;
bool cancel = false;
@@ -2834,15 +2834,15 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_file);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
mnt_drop_write_file(file);
- if (bdev_handle)
- bdev_release(bdev_handle);
+ if (bdev_file)
+ fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
out_free:
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dedec3d9b111..f15591f3e54f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -466,39 +466,39 @@ static noinline struct btrfs_fs_devices *find_fsid(
static int
btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
- int flush, struct bdev_handle **bdev_handle,
+ int flush, struct file **bdev_file,
struct btrfs_super_block **disk_super)
{
struct block_device *bdev;
int ret;
- *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL);
+ *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);
- if (IS_ERR(*bdev_handle)) {
- ret = PTR_ERR(*bdev_handle);
+ if (IS_ERR(*bdev_file)) {
+ ret = PTR_ERR(*bdev_file);
goto error;
}
- bdev = (*bdev_handle)->bdev;
+ bdev = file_bdev(*bdev_file);
if (flush)
sync_blockdev(bdev);
ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
- bdev_release(*bdev_handle);
+ fput(*bdev_file);
goto error;
}
invalidate_bdev(bdev);
*disk_super = btrfs_read_dev_super(bdev);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
- bdev_release(*bdev_handle);
+ fput(*bdev_file);
goto error;
}
return 0;
error:
- *bdev_handle = NULL;
+ *bdev_file = NULL;
return ret;
}
@@ -641,7 +641,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, blk_mode_t flags,
void *holder)
{
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
@@ -652,7 +652,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev_handle, &disk_super);
+ &bdev_file, &disk_super);
if (ret)
return ret;
@@ -676,20 +676,20 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->seeding = true;
} else {
- if (bdev_read_only(bdev_handle->bdev))
+ if (bdev_read_only(file_bdev(bdev_file)))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
else
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- if (!bdev_nonrot(bdev_handle->bdev))
+ if (!bdev_nonrot(file_bdev(bdev_file)))
fs_devices->rotating = true;
- if (bdev_max_discard_sectors(bdev_handle->bdev))
+ if (bdev_max_discard_sectors(file_bdev(bdev_file)))
fs_devices->discardable = true;
- device->bdev_handle = bdev_handle;
- device->bdev = bdev_handle->bdev;
+ device->bdev_file = bdev_file;
+ device->bdev = file_bdev(bdev_file);
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
if (device->devt != device->bdev->bd_dev) {
@@ -714,7 +714,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
error_free_page:
btrfs_release_disk_super(disk_super);
- bdev_release(bdev_handle);
+ fput(bdev_file);
return -EINVAL;
}
@@ -1027,10 +1027,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
continue;
- if (device->bdev_handle) {
- bdev_release(device->bdev_handle);
+ if (device->bdev_file) {
+ fput(device->bdev_file);
device->bdev = NULL;
- device->bdev_handle = NULL;
+ device->bdev_file = NULL;
fs_devices->open_devices--;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1075,7 +1075,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
invalidate_bdev(device->bdev);
}
- bdev_release(device->bdev_handle);
+ fput(device->bdev_file);
}
static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1320,6 +1320,47 @@ int btrfs_forget_devices(dev_t devt)
return ret;
}
+/*
+ * Decide whether the device being scanned should be left unregistered.
+ *
+ * @disk_super:    super block read from the scanned device
+ * @path:          path the device was scanned under
+ * @devt:          maj:min of the scanned device
+ * @mount_arg_dev: true when the device was named as a mount argument
+ *
+ * Returns true when registration should be skipped, false otherwise.
+ */
+static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
+				    const char *path, dev_t devt,
+				    bool mount_arg_dev)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	/*
+	 * A device that belongs to an opened filesystem and has the same
+	 * maj:min but is known under a different path must still be
+	 * registered. Booting without initrd relies on /dev/root initially,
+	 * later replaced with the actual root device. A successful scan
+	 * ensures grub2-probe selects the correct device.
+	 */
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		struct btrfs_device *device;
+		bool path_changed = false;
+
+		mutex_lock(&fs_devices->device_list_mutex);
+
+		/* Only opened filesystems are of interest here. */
+		if (fs_devices->opened) {
+			list_for_each_entry(device, &fs_devices->devices, dev_list) {
+				if (!device->bdev || device->bdev->bd_dev != devt)
+					continue;
+				if (strcmp(device->name->str, path) == 0)
+					continue;
+
+				path_changed = true;
+				break;
+			}
+		}
+
+		mutex_unlock(&fs_devices->device_list_mutex);
+
+		/* Do not skip registration. */
+		if (path_changed)
+			return false;
+	}
+
+	/* Skip only single-device, non-seed filesystems not named at mount. */
+	return !mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
+	       !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING);
+}
+
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
@@ -1335,8 +1376,9 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
u64 bytenr, bytenr_orig;
+ dev_t devt;
int ret;
lockdep_assert_held(&uuid_mutex);
@@ -1358,37 +1400,31 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
* values temporarily, as the device paths of the fsid are the only
* required information for assembling the volume.
*/
- bdev_handle = bdev_open_by_path(path, flags, NULL, NULL);
- if (IS_ERR(bdev_handle))
- return ERR_CAST(bdev_handle);
+ bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
+ if (IS_ERR(bdev_file))
+ return ERR_CAST(bdev_file);
bytenr_orig = btrfs_sb_offset(0);
- ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr);
+ ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr);
if (ret) {
device = ERR_PTR(ret);
goto error_bdev_put;
}
- disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr,
+ disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr,
bytenr_orig);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
}
- if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
- !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
- dev_t devt;
+ devt = file_bdev(bdev_file)->bd_dev;
+ if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
+ pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
+ path, MAJOR(devt), MINOR(devt));
- ret = lookup_bdev(path, &devt);
- if (ret)
- btrfs_warn(NULL, "lookup bdev failed for path %s: %d",
- path, ret);
- else
- btrfs_free_stale_devices(devt, NULL);
+ btrfs_free_stale_devices(devt, NULL);
- pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
- path, MAJOR(devt), MINOR(devt));
device = NULL;
goto free_disk_super;
}
@@ -1401,7 +1437,7 @@ free_disk_super:
btrfs_release_disk_super(disk_super);
error_bdev_put:
- bdev_release(bdev_handle);
+ fput(bdev_file);
return device;
}
@@ -2076,7 +2112,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_devic
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct bdev_handle **bdev_handle)
+ struct file **bdev_file)
{
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
@@ -2185,7 +2221,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
btrfs_assign_next_active_device(device, NULL);
- if (device->bdev_handle) {
+ if (device->bdev_file) {
cur_devices->open_devices--;
/* remove sysfs entry */
btrfs_sysfs_remove_device(device);
@@ -2201,9 +2237,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
- * write lock, and bdev_release() will pull in the ->open_mutex on
- * the block device and it's dependencies. Instead just flush the
- * device and let the caller do the final bdev_release.
+ * write lock, and fput() on the block device will pull in the
+ * ->open_mutex on the block device and its dependencies. Instead
+ * just flush the device and let the caller do the final fput().
*/
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
btrfs_scratch_superblocks(fs_info, device);
@@ -2213,7 +2249,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
}
- *bdev_handle = device->bdev_handle;
+ *bdev_file = device->bdev_file;
synchronize_rcu();
btrfs_free_device(device);
@@ -2349,7 +2385,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
const char *path)
{
struct btrfs_super_block *disk_super;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
int ret;
if (!path || !path[0])
@@ -2367,7 +2403,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
}
ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
- &bdev_handle, &disk_super);
+ &bdev_file, &disk_super);
if (ret) {
btrfs_put_dev_args_from_path(args);
return ret;
@@ -2380,7 +2416,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
else
memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- bdev_release(bdev_handle);
+ fput(bdev_file);
return 0;
}
@@ -2600,7 +2636,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct super_block *sb = fs_info->sb;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *seed_devices = NULL;
@@ -2613,12 +2649,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
- bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev_handle))
- return PTR_ERR(bdev_handle);
+ if (IS_ERR(bdev_file))
+ return PTR_ERR(bdev_file);
- if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) {
+ if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) {
ret = -EINVAL;
goto error;
}
@@ -2630,11 +2666,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
locked = true;
}
- sync_blockdev(bdev_handle->bdev);
+ sync_blockdev(file_bdev(bdev_file));
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
- if (device->bdev == bdev_handle->bdev) {
+ if (device->bdev == file_bdev(bdev_file)) {
ret = -EEXIST;
rcu_read_unlock();
goto error;
@@ -2650,8 +2686,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
device->fs_info = fs_info;
- device->bdev_handle = bdev_handle;
- device->bdev = bdev_handle->bdev;
+ device->bdev_file = bdev_file;
+ device->bdev = file_bdev(bdev_file);
ret = lookup_bdev(device_path, &device->devt);
if (ret)
goto error_free_device;
@@ -2834,7 +2870,7 @@ error_free_zone:
error_free_device:
btrfs_free_device(device);
error:
- bdev_release(bdev_handle);
+ fput(bdev_file);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index feba8d53526c..93854609a4d5 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -105,7 +105,7 @@ struct btrfs_device {
u64 generation;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct block_device *bdev;
struct btrfs_zoned_device_info *zone_info;
@@ -698,7 +698,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct bdev_handle **bdev_handle);
+ struct file **bdev_file);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 459d1af02c3c..4cba80b34387 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -822,11 +822,14 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset = &zones[1];
if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
+ unsigned int nofs_flags;
+
ASSERT(sb_zone_is_full(reset));
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- reset->start, reset->len,
- GFP_NOFS);
+ reset->start, reset->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -972,11 +975,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
* explicit ZONE_FINISH is not necessary.
*/
if (zone->wp != zone->start + zone->capacity) {
+ unsigned int nofs_flags;
int ret;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev,
REQ_OP_ZONE_FINISH, zone->start,
- zone->len, GFP_NOFS);
+ zone->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
}
@@ -994,11 +1000,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
+ unsigned int nofs_flags;
sector_t zone_sectors;
sector_t nr_sectors;
u8 zone_sectors_shift;
u32 sb_zone;
u32 nr_zones;
+ int ret;
zone_sectors = bdev_zone_sectors(bdev);
zone_sectors_shift = ilog2(zone_sectors);
@@ -1009,9 +1017,12 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
if (sb_zone + 1 >= nr_zones)
return -ENOENT;
- return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- zone_start_sector(sb_zone, bdev),
- zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ zone_start_sector(sb_zone, bdev),
+ zone_sectors * BTRFS_NR_SB_LOG_ZONES);
+ memalloc_nofs_restore(nofs_flags);
+ return ret;
}
/*
@@ -1122,12 +1133,14 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
u64 length, u64 *bytes)
{
+ unsigned int nofs_flags;
int ret;
*bytes = 0;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
- physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
- GFP_NOFS);
+ physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -2239,14 +2252,16 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
struct btrfs_device *device = map->stripes[i].dev;
const u64 physical = map->stripes[i].physical;
struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ unsigned int nofs_flags;
if (zinfo->max_active_zones == 0)
continue;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT,
- GFP_NOFS);
+ zinfo->zone_size >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
if (ret) {
up_read(&dev_replace->rwsem);
diff --git a/fs/buffer.c b/fs/buffer.c
index d3bcf601d3e5..4f73d23c2c46 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -55,7 +55,7 @@
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
- struct writeback_control *wbc);
+ enum rw_hint hint, struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -464,7 +464,7 @@ EXPORT_SYMBOL(mark_buffer_async_write);
* a successful fsync(). For example, ext2 indirect blocks need to be
* written back and waited upon before fsync() returns.
*
- * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
+ * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
* inode_has_buffers() and invalidate_inode_buffers() are provided for the
* management of a list of dependent buffers at ->i_mapping->i_private_list.
*
@@ -1889,7 +1889,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
+ submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
+ inode->i_write_hint, wbc);
nr_underway++;
}
bh = next;
@@ -1944,7 +1945,8 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
+ submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
+ inode->i_write_hint, wbc);
nr_underway++;
}
bh = next;
@@ -2756,6 +2758,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
}
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+ enum rw_hint write_hint,
struct writeback_control *wbc)
{
const enum req_op op = opf & REQ_OP_MASK;
@@ -2783,6 +2786,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_write_hint = write_hint;
__bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
@@ -2802,7 +2806,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
void submit_bh(blk_opf_t opf, struct buffer_head *bh)
{
- submit_bh_wbc(opf, bh, NULL);
+ submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
}
EXPORT_SYMBOL(submit_bh);
@@ -3121,12 +3125,8 @@ void __init buffer_init(void)
unsigned long nrpages;
int ret;
- bh_cachep = kmem_cache_create("buffer_head",
- sizeof(struct buffer_head), 0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
- SLAB_MEM_SPREAD),
- NULL);
-
+ bh_cachep = KMEM_CACHE(buffer_head,
+ SLAB_RECLAIM_ACCOUNT|SLAB_PANIC);
/*
* Limit the bh occupancy to 10% of ZONE_NORMAL
*/
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1340d77124ae..ee9caf7916fb 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -795,8 +795,10 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
ihold(inode);
if (wbc->sync_mode == WB_SYNC_NONE &&
- ceph_inode_to_fs_client(inode)->write_congested)
+ ceph_inode_to_fs_client(inode)->write_congested) {
+ redirty_page_for_writepage(wbc, page);
return AOP_WRITEPAGE_ACTIVATE;
+ }
wait_on_page_fscache(page);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7fb4aae97412..c4941ba245ac 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4634,6 +4634,14 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
iput(inode);
spin_lock(&mdsc->cap_delay_lock);
}
+
+ /*
+ * Make sure too many dirty caps or general
+ * slowness doesn't block mdsc delayed work,
+ * preventing send_renew_caps() from running.
+ */
+ if (jiffies - loop_start >= 5 * HZ)
+ break;
}
spin_unlock(&mdsc->cap_delay_lock);
doutc(cl, "done\n");
@@ -4775,13 +4783,13 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
ceph_vinop(inode));
- spin_lock(&mdsc->cap_unlink_delay_lock);
+ spin_lock(&mdsc->cap_delay_lock);
ci->i_ceph_flags |= CEPH_I_FLUSH;
if (!list_empty(&ci->i_cap_delay_list))
list_del_init(&ci->i_cap_delay_list);
list_add_tail(&ci->i_cap_delay_list,
&mdsc->cap_unlink_delay_list);
- spin_unlock(&mdsc->cap_unlink_delay_lock);
+ spin_unlock(&mdsc->cap_delay_lock);
/*
* Fire the work immediately, because the MDS maybe
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index abe8028d95bf..16873d07692f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1138,7 +1138,12 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
}
idx = 0;
- left = ret > 0 ? ret : 0;
+ if (ret <= 0)
+ left = 0;
+ else if (off + ret > i_size)
+ left = i_size - off;
+ else
+ left = ret;
while (left > 0) {
size_t plen, copied;
@@ -1167,15 +1172,13 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
}
if (ret > 0) {
- if (off > *ki_pos) {
- if (off >= i_size) {
- *retry_op = CHECK_EOF;
- ret = i_size - *ki_pos;
- *ki_pos = i_size;
- } else {
- ret = off - *ki_pos;
- *ki_pos = off;
- }
+ if (off >= i_size) {
+ *retry_op = CHECK_EOF;
+ ret = i_size - *ki_pos;
+ *ki_pos = i_size;
+ } else {
+ ret = off - *ki_pos;
+ *ki_pos = off;
}
if (last_objver)
@@ -2126,14 +2129,16 @@ again:
int statret;
struct page *page = NULL;
loff_t i_size;
+ int mask = CEPH_STAT_CAP_SIZE;
if (retry_op == READ_INLINE) {
page = __page_cache_alloc(GFP_KERNEL);
if (!page)
return -ENOMEM;
+
+ mask = CEPH_STAT_CAP_INLINE_DATA;
}
- statret = __ceph_do_getattr(inode, page,
- CEPH_STAT_CAP_INLINE_DATA, !!page);
+ statret = __ceph_do_getattr(inode, page, mask, !!page);
if (statret < 0) {
if (page)
__free_page(page);
@@ -2174,7 +2179,7 @@ again:
/* hit EOF or hole? */
if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
ret < len) {
- doutc(cl, "hit hole, ppos %lld < size %lld, reading more\n",
+ doutc(cl, "may hit hole, ppos %lld < size %lld, reading more\n",
iocb->ki_pos, i_size);
read += ret;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index e07ad29ff8b9..ebf4ac0055dd 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -33,7 +33,7 @@ void __init ceph_flock_init(void)
static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
{
- struct inode *inode = file_inode(dst->fl_file);
+ struct inode *inode = file_inode(dst->c.flc_file);
atomic_inc(&ceph_inode(inode)->i_filelock_ref);
dst->fl_u.ceph.inode = igrab(inode);
}
@@ -110,17 +110,18 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
else
length = fl->fl_end - fl->fl_start + 1;
- owner = secure_addr(fl->fl_owner);
+ owner = secure_addr(fl->c.flc_owner);
doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, "
"start: %llu, length: %llu, wait: %d, type: %d\n",
- (int)lock_type, (int)operation, owner, (u64)fl->fl_pid,
- fl->fl_start, length, wait, fl->fl_type);
+ (int)lock_type, (int)operation, owner,
+ (u64) fl->c.flc_pid,
+ fl->fl_start, length, wait, fl->c.flc_type);
req->r_args.filelock_change.rule = lock_type;
req->r_args.filelock_change.type = cmd;
req->r_args.filelock_change.owner = cpu_to_le64(owner);
- req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
+ req->r_args.filelock_change.pid = cpu_to_le64((u64) fl->c.flc_pid);
req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
@@ -130,13 +131,13 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
err = ceph_mdsc_wait_request(mdsc, req, wait ?
ceph_lock_wait_for_completion : NULL);
if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
- fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
+ fl->c.flc_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
- fl->fl_type = F_RDLCK;
+ fl->c.flc_type = F_RDLCK;
else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
- fl->fl_type = F_WRLCK;
+ fl->c.flc_type = F_WRLCK;
else
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
@@ -150,8 +151,8 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
ceph_mdsc_put_request(req);
doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, "
"length: %llu, wait: %d, type: %d, err code %d\n",
- (int)lock_type, (int)operation, (u64)fl->fl_pid,
- fl->fl_start, length, wait, fl->fl_type, err);
+ (int)lock_type, (int)operation, (u64) fl->c.flc_pid,
+ fl->fl_start, length, wait, fl->c.flc_type, err);
return err;
}
@@ -227,10 +228,10 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
static int try_unlock_file(struct file *file, struct file_lock *fl)
{
int err;
- unsigned int orig_flags = fl->fl_flags;
- fl->fl_flags |= FL_EXISTS;
+ unsigned int orig_flags = fl->c.flc_flags;
+ fl->c.flc_flags |= FL_EXISTS;
err = locks_lock_file_wait(file, fl);
- fl->fl_flags = orig_flags;
+ fl->c.flc_flags = orig_flags;
if (err == -ENOENT) {
if (!(orig_flags & FL_EXISTS))
err = 0;
@@ -253,13 +254,13 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
u8 wait = 0;
u8 lock_cmd;
- if (!(fl->fl_flags & FL_POSIX))
+ if (!(fl->c.flc_flags & FL_POSIX))
return -ENOLCK;
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
- doutc(cl, "fl_owner: %p\n", fl->fl_owner);
+ doutc(cl, "fl_owner: %p\n", fl->c.flc_owner);
/* set wait bit as appropriate, then make command as Ceph expects it*/
if (IS_GETLK(cmd))
@@ -273,19 +274,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
}
spin_unlock(&ci->i_ceph_lock);
if (err < 0) {
- if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type)
+ if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl))
posix_lock_file(file, fl, NULL);
return err;
}
- if (F_RDLCK == fl->fl_type)
+ if (lock_is_read(fl))
lock_cmd = CEPH_LOCK_SHARED;
- else if (F_WRLCK == fl->fl_type)
+ else if (lock_is_write(fl))
lock_cmd = CEPH_LOCK_EXCL;
else
lock_cmd = CEPH_LOCK_UNLOCK;
- if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) {
+ if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) {
err = try_unlock_file(file, fl);
if (err <= 0)
return err;
@@ -293,7 +294,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
if (!err) {
- if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
+ if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->c.flc_type) {
doutc(cl, "locking locally\n");
err = posix_lock_file(file, fl, NULL);
if (err) {
@@ -319,13 +320,13 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
u8 wait = 0;
u8 lock_cmd;
- if (!(fl->fl_flags & FL_FLOCK))
+ if (!(fl->c.flc_flags & FL_FLOCK))
return -ENOLCK;
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
- doutc(cl, "fl_file: %p\n", fl->fl_file);
+ doutc(cl, "fl_file: %p\n", fl->c.flc_file);
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
@@ -333,7 +334,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
}
spin_unlock(&ci->i_ceph_lock);
if (err < 0) {
- if (F_UNLCK == fl->fl_type)
+ if (lock_is_unlock(fl))
locks_lock_file_wait(file, fl);
return err;
}
@@ -341,14 +342,14 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
if (IS_SETLKW(cmd))
wait = 1;
- if (F_RDLCK == fl->fl_type)
+ if (lock_is_read(fl))
lock_cmd = CEPH_LOCK_SHARED;
- else if (F_WRLCK == fl->fl_type)
+ else if (lock_is_write(fl))
lock_cmd = CEPH_LOCK_EXCL;
else
lock_cmd = CEPH_LOCK_UNLOCK;
- if (F_UNLCK == fl->fl_type) {
+ if (lock_is_unlock(fl)) {
err = try_unlock_file(file, fl);
if (err <= 0)
return err;
@@ -356,7 +357,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
inode, lock_cmd, wait, fl);
- if (!err && F_UNLCK != fl->fl_type) {
+ if (!err && F_UNLCK != fl->c.flc_type) {
err = locks_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
@@ -385,9 +386,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
ctx = locks_inode_context(inode);
if (ctx) {
spin_lock(&ctx->flc_lock);
- list_for_each_entry(lock, &ctx->flc_posix, fl_list)
+ for_each_file_lock(lock, &ctx->flc_posix)
++(*fcntl_count);
- list_for_each_entry(lock, &ctx->flc_flock, fl_list)
+ for_each_file_lock(lock, &ctx->flc_flock)
++(*flock_count);
spin_unlock(&ctx->flc_lock);
}
@@ -408,10 +409,10 @@ static int lock_to_ceph_filelock(struct inode *inode,
cephlock->start = cpu_to_le64(lock->fl_start);
cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
cephlock->client = cpu_to_le64(0);
- cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
- cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
+ cephlock->pid = cpu_to_le64((u64) lock->c.flc_pid);
+ cephlock->owner = cpu_to_le64(secure_addr(lock->c.flc_owner));
- switch (lock->fl_type) {
+ switch (lock->c.flc_type) {
case F_RDLCK:
cephlock->type = CEPH_LOCK_SHARED;
break;
@@ -422,7 +423,8 @@ static int lock_to_ceph_filelock(struct inode *inode,
cephlock->type = CEPH_LOCK_UNLOCK;
break;
default:
- doutc(cl, "Have unknown lock type %d\n", lock->fl_type);
+ doutc(cl, "Have unknown lock type %d\n",
+ lock->c.flc_type);
err = -EINVAL;
}
@@ -453,7 +455,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
return 0;
spin_lock(&ctx->flc_lock);
- list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
+ for_each_file_lock(lock, &ctx->flc_posix) {
++seen_fcntl;
if (seen_fcntl > num_fcntl_locks) {
err = -ENOSPC;
@@ -464,7 +466,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
goto fail;
++l;
}
- list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
+ for_each_file_lock(lock, &ctx->flc_flock) {
++seen_flock;
if (seen_flock > num_flock_locks) {
err = -ENOSPC;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3ab9c268a8bb..360b686c3c67 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2504,7 +2504,7 @@ static void ceph_cap_unlink_work(struct work_struct *work)
struct ceph_client *cl = mdsc->fsc->client;
doutc(cl, "begin\n");
- spin_lock(&mdsc->cap_unlink_delay_lock);
+ spin_lock(&mdsc->cap_delay_lock);
while (!list_empty(&mdsc->cap_unlink_delay_list)) {
struct ceph_inode_info *ci;
struct inode *inode;
@@ -2516,15 +2516,15 @@ static void ceph_cap_unlink_work(struct work_struct *work)
inode = igrab(&ci->netfs.inode);
if (inode) {
- spin_unlock(&mdsc->cap_unlink_delay_lock);
+ spin_unlock(&mdsc->cap_delay_lock);
doutc(cl, "on %p %llx.%llx\n", inode,
ceph_vinop(inode));
ceph_check_caps(ci, CHECK_CAPS_FLUSH);
iput(inode);
- spin_lock(&mdsc->cap_unlink_delay_lock);
+ spin_lock(&mdsc->cap_delay_lock);
}
}
- spin_unlock(&mdsc->cap_unlink_delay_lock);
+ spin_unlock(&mdsc->cap_delay_lock);
doutc(cl, "done\n");
}
@@ -5404,7 +5404,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
INIT_LIST_HEAD(&mdsc->cap_wait_list);
spin_lock_init(&mdsc->cap_delay_lock);
INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list);
- spin_lock_init(&mdsc->cap_unlink_delay_lock);
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
mdsc->last_cap_flush_tid = 1;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 03f8ff00874f..b88e80415224 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -461,9 +461,8 @@ struct ceph_mds_client {
struct delayed_work delayed_work; /* delayed work */
unsigned long last_renew_caps; /* last time we renewed our caps */
struct list_head cap_delay_list; /* caps with delayed release */
- spinlock_t cap_delay_lock; /* protects cap_delay_list */
struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */
- spinlock_t cap_unlink_delay_lock; /* protects cap_unlink_delay_list */
+ spinlock_t cap_delay_lock; /* protects cap_delay_list and cap_unlink_delay_list */
struct list_head snap_flush_list; /* cap_snaps ready to flush */
spinlock_t snap_flush_lock;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 5ec102f6b1ac..885cb5d4e771 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -928,36 +928,36 @@ static int __init init_caches(void)
ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
sizeof(struct ceph_inode_info),
__alignof__(struct ceph_inode_info),
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
- SLAB_ACCOUNT, ceph_inode_init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
+ ceph_inode_init_once);
if (!ceph_inode_cachep)
return -ENOMEM;
- ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD);
+ ceph_cap_cachep = KMEM_CACHE(ceph_cap, 0);
if (!ceph_cap_cachep)
goto bad_cap;
- ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD);
+ ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, 0);
if (!ceph_cap_snap_cachep)
goto bad_cap_snap;
ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ SLAB_RECLAIM_ACCOUNT);
if (!ceph_cap_flush_cachep)
goto bad_cap_flush;
ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ SLAB_RECLAIM_ACCOUNT);
if (!ceph_dentry_cachep)
goto bad_dentry;
- ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
+ ceph_file_cachep = KMEM_CACHE(ceph_file_info, 0);
if (!ceph_file_cachep)
goto bad_file;
- ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, SLAB_MEM_SPREAD);
+ ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, 0);
if (!ceph_dir_file_cachep)
goto bad_dir_file;
- ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, SLAB_MEM_SPREAD);
+ ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, 0);
if (!ceph_mds_request_cachep)
goto bad_mds_req;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 0c7c2528791e..6898dc621011 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -24,6 +24,8 @@
#include <linux/pid_namespace.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/vmalloc.h>
#include <linux/coda.h>
@@ -70,8 +72,8 @@ int __init coda_init_inodecache(void)
{
coda_inode_cachep = kmem_cache_create("coda_inode_cache",
sizeof(struct coda_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
- SLAB_ACCOUNT, init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
+ init_once);
if (coda_inode_cachep == NULL)
return -ENOMEM;
return 0;
@@ -87,10 +89,10 @@ void coda_destroy_inodecache(void)
kmem_cache_destroy(coda_inode_cachep);
}
-static int coda_remount(struct super_block *sb, int *flags, char *data)
+static int coda_reconfigure(struct fs_context *fc)
{
- sync_filesystem(sb);
- *flags |= SB_NOATIME;
+ sync_filesystem(fc->root->d_sb);
+ fc->sb_flags |= SB_NOATIME;
return 0;
}
@@ -102,78 +104,102 @@ static const struct super_operations coda_super_operations =
.evict_inode = coda_evict_inode,
.put_super = coda_put_super,
.statfs = coda_statfs,
- .remount_fs = coda_remount,
};
-static int get_device_index(struct coda_mount_data *data)
+struct coda_fs_context {
+ int idx;
+};
+
+enum {
+ Opt_fd,
+};
+
+static const struct fs_parameter_spec coda_param_specs[] = {
+ fsparam_fd ("fd", Opt_fd),
+ {}
+};
+
+static int coda_parse_fd(struct fs_context *fc, int fd)
{
+ struct coda_fs_context *ctx = fc->fs_private;
struct fd f;
struct inode *inode;
int idx;
- if (data == NULL) {
- pr_warn("%s: Bad mount data\n", __func__);
- return -1;
- }
-
- if (data->version != CODA_MOUNT_VERSION) {
- pr_warn("%s: Bad mount version\n", __func__);
- return -1;
- }
-
- f = fdget(data->fd);
+ f = fdget(fd);
if (!f.file)
- goto Ebadf;
+ return -EBADF;
inode = file_inode(f.file);
if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {
fdput(f);
- goto Ebadf;
+ return invalf(fc, "code: Not coda psdev");
}
idx = iminor(inode);
fdput(f);
- if (idx < 0 || idx >= MAX_CODADEVS) {
- pr_warn("%s: Bad minor number\n", __func__);
- return -1;
+ if (idx < 0 || idx >= MAX_CODADEVS)
+ return invalf(fc, "coda: Bad minor number");
+ ctx->idx = idx;
+ return 0;
+}
+
+static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, coda_param_specs, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_fd:
+ return coda_parse_fd(fc, result.uint_32);
}
- return idx;
-Ebadf:
- pr_warn("%s: Bad file\n", __func__);
- return -1;
+ return 0;
+}
+
+/*
+ * Parse coda's binary mount data form. A bad fd is ignored and index 0 is
+ * used instead, for backward compatibility; bad data or version is an error.
+ */
+static int coda_parse_monolithic(struct fs_context *fc, void *_data)
+{
+ struct coda_mount_data *data = _data;
+
+ if (!data)
+ return invalf(fc, "coda: Bad mount data");
+
+ if (data->version != CODA_MOUNT_VERSION)
+ return invalf(fc, "coda: Bad mount version");
+
+ coda_parse_fd(fc, data->fd);
+ return 0;
}
-static int coda_fill_super(struct super_block *sb, void *data, int silent)
+static int coda_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct coda_fs_context *ctx = fc->fs_private;
struct inode *root = NULL;
struct venus_comm *vc;
struct CodaFid fid;
int error;
- int idx;
-
- if (task_active_pid_ns(current) != &init_pid_ns)
- return -EINVAL;
-
- idx = get_device_index((struct coda_mount_data *) data);
- /* Ignore errors in data, for backward compatibility */
- if(idx == -1)
- idx = 0;
-
- pr_info("%s: device index: %i\n", __func__, idx);
+ infof(fc, "coda: device index: %i\n", ctx->idx);
- vc = &coda_comms[idx];
+ vc = &coda_comms[ctx->idx];
mutex_lock(&vc->vc_mutex);
if (!vc->vc_inuse) {
- pr_warn("%s: No pseudo device\n", __func__);
+ errorf(fc, "coda: No pseudo device");
error = -EINVAL;
goto unlock_out;
}
if (vc->vc_sb) {
- pr_warn("%s: Device already mounted\n", __func__);
+ errorf(fc, "coda: Device already mounted");
error = -EBUSY;
goto unlock_out;
}
@@ -313,18 +339,45 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-/* init_coda: used by filesystems.c to register coda */
+static int coda_get_tree(struct fs_context *fc)
+{
+ if (task_active_pid_ns(current) != &init_pid_ns)
+ return -EINVAL;
-static struct dentry *coda_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+ return get_tree_nodev(fc, coda_fill_super);
+}
+
+static void coda_free_fc(struct fs_context *fc)
{
- return mount_nodev(fs_type, flags, data, coda_fill_super);
+ kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations coda_context_ops = {
+ .free = coda_free_fc,
+ .parse_param = coda_parse_param,
+ .parse_monolithic = coda_parse_monolithic,
+ .get_tree = coda_get_tree,
+ .reconfigure = coda_reconfigure,
+};
+
+static int coda_init_fs_context(struct fs_context *fc)
+{
+ struct coda_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct coda_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ fc->fs_private = ctx;
+ fc->ops = &coda_context_ops;
+ return 0;
}
struct file_system_type coda_fs_type = {
.owner = THIS_MODULE,
.name = "coda",
- .mount = coda_mount,
+ .init_fs_context = coda_init_fs_context,
+ .parameters = coda_param_specs,
.kill_sb = kill_anon_super,
.fs_flags = FS_BINARY_MOUNTDATA,
};
diff --git a/fs/coredump.c b/fs/coredump.c
index f258c17c1841..be6403b4b14b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -872,6 +872,9 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
loff_t pos;
ssize_t n;
+ if (!page)
+ return 0;
+
if (cprm->to_skip) {
if (!__dump_skip(cprm, cprm->to_skip))
return 0;
@@ -884,7 +887,6 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
pos = file->f_pos;
bvec_set_page(&bvec, page, PAGE_SIZE, 0);
iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
- iov_iter_set_copy_mc(&iter);
n = __kernel_write_iter(cprm->file, &iter, &pos);
if (n != PAGE_SIZE)
return 0;
@@ -895,10 +897,44 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
return 1;
}
+/*
+ * If we might get machine checks from kernel accesses during the
+ * core dump, let's get those errors early rather than during the
+ * IO. This is not performance-critical enough to warrant having
+ * all the machine check logic in the iovec paths.
+ */
+#ifdef copy_mc_to_kernel
+
+#define dump_page_alloc() alloc_page(GFP_KERNEL)
+#define dump_page_free(x) __free_page(x)
+static struct page *dump_page_copy(struct page *src, struct page *dst)
+{
+ void *buf = kmap_local_page(src);
+ size_t left = copy_mc_to_kernel(page_address(dst), buf, PAGE_SIZE);
+ kunmap_local(buf);
+ return left ? NULL : dst;
+}
+
+#else
+
+/* We just want to return non-NULL; it's never used. */
+#define dump_page_alloc() ERR_PTR(-EINVAL)
+#define dump_page_free(x) ((void)(x))
+static inline struct page *dump_page_copy(struct page *src, struct page *dst)
+{
+ return src;
+}
+#endif
+
int dump_user_range(struct coredump_params *cprm, unsigned long start,
unsigned long len)
{
unsigned long addr;
+ struct page *dump_page;
+
+ dump_page = dump_page_alloc();
+ if (!dump_page)
+ return 0;
for (addr = start; addr < start + len; addr += PAGE_SIZE) {
struct page *page;
@@ -912,14 +948,17 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
*/
page = get_dump_page(addr);
if (page) {
- int stop = !dump_emit_page(cprm, page);
+ int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page));
put_page(page);
- if (stop)
+ if (stop) {
+ dump_page_free(dump_page);
return 0;
+ }
} else {
dump_skip(cprm, PAGE_SIZE);
}
}
+ dump_page_free(dump_page);
return 1;
}
#endif
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 60dbfa0f8805..9901057a15ba 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb)
sb->s_mtd = NULL;
} else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) {
sync_blockdev(sb->s_bdev);
- bdev_release(sb->s_bdev_handle);
+ bdev_fput(sb->s_bdev_file);
}
kfree(sbi);
}
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 7b3fc189593a..0ad52fbe51c9 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -74,13 +74,7 @@ struct fscrypt_nokey_name {
static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
{
- if (str->len == 1 && str->name[0] == '.')
- return true;
-
- if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
- return true;
-
- return false;
+ return is_dot_dotdot(str->name, str->len);
}
/**
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 1892356cf924..8371e4e1f596 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -222,16 +222,19 @@ struct fscrypt_inode_info {
struct fscrypt_prepared_key ci_enc_key;
/* True if ci_enc_key should be freed when this struct is freed */
- bool ci_owns_key;
+ u8 ci_owns_key : 1;
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
/*
* True if this inode will use inline encryption (blk-crypto) instead of
* the traditional filesystem-layer encryption.
*/
- bool ci_inlinecrypt;
+ u8 ci_inlinecrypt : 1;
#endif
+ /* True if ci_dirhash_key is initialized */
+ u8 ci_dirhash_key_initialized : 1;
+
/*
* log2 of the data unit size (granularity of contents encryption) of
* this file. This is computable from ci_policy and ci_inode but is
@@ -242,6 +245,9 @@ struct fscrypt_inode_info {
/* Cached value: log2 of number of data units per FS block */
u8 ci_data_units_per_block_bits;
+ /* Hashed inode number. Only set for IV_INO_LBLK_32 */
+ u32 ci_hashed_ino;
+
/*
* Encryption mode used for this inode. It corresponds to either the
* contents or filenames encryption mode, depending on the inode type.
@@ -276,16 +282,12 @@ struct fscrypt_inode_info {
* the plaintext filenames -- currently just casefolded directories.
*/
siphash_key_t ci_dirhash_key;
- bool ci_dirhash_key_initialized;
/* The encryption policy used by this inode */
union fscrypt_policy ci_policy;
/* This inode's nonce, copied from the fscrypt_context */
u8 ci_nonce[FSCRYPT_FILE_NONCE_SIZE];
-
- /* Hashed inode number. Only set for IV_INO_LBLK_32 */
- u32 ci_hashed_ino;
};
typedef enum {
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 52504dd478d3..104771c3d3f6 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -102,11 +102,8 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
if (err && err != -ENOENT)
return err;
- if (fname->is_nokey_name) {
- spin_lock(&dentry->d_lock);
- dentry->d_flags |= DCACHE_NOKEY_NAME;
- spin_unlock(&dentry->d_lock);
- }
+ fscrypt_prepare_dentry(dentry, fname->is_nokey_name);
+
return err;
}
EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
@@ -131,12 +128,10 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry)
{
int err = fscrypt_get_encryption_info(dir, true);
+ bool is_nokey_name = (!err && !fscrypt_has_encryption_key(dir));
+
+ fscrypt_prepare_dentry(dentry, is_nokey_name);
- if (!err && !fscrypt_has_encryption_key(dir)) {
- spin_lock(&dentry->d_lock);
- dentry->d_flags |= DCACHE_NOKEY_NAME;
- spin_unlock(&dentry->d_lock);
- }
return err;
}
EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial);
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 0edf0b58daa7..6681a71625f0 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -74,8 +74,12 @@ void fscrypt_put_master_key(struct fscrypt_master_key *mk)
* that concurrent keyring lookups can no longer find it.
*/
WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 0);
- key_put(mk->mk_users);
- mk->mk_users = NULL;
+ if (mk->mk_users) {
+ /* Clear the keyring so the quota gets released right away. */
+ keyring_clear(mk->mk_users);
+ key_put(mk->mk_users);
+ mk->mk_users = NULL;
+ }
call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key);
}
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index d71f7c799e79..b4fe01ea4bd4 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -23,7 +23,7 @@ struct fscrypt_mode fscrypt_modes[] = {
.blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS,
},
[FSCRYPT_MODE_AES_256_CTS] = {
- .friendly_name = "AES-256-CTS-CBC",
+ .friendly_name = "AES-256-CBC-CTS",
.cipher_str = "cts(cbc(aes))",
.keysize = 32,
.security_strength = 32,
@@ -38,7 +38,7 @@ struct fscrypt_mode fscrypt_modes[] = {
.blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
},
[FSCRYPT_MODE_AES_128_CTS] = {
- .friendly_name = "AES-128-CTS-CBC",
+ .friendly_name = "AES-128-CBC-CTS",
.cipher_str = "cts(cbc(aes))",
.keysize = 16,
.security_strength = 16,
@@ -53,7 +53,7 @@ struct fscrypt_mode fscrypt_modes[] = {
.blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS,
},
[FSCRYPT_MODE_SM4_CTS] = {
- .friendly_name = "SM4-CTS-CBC",
+ .friendly_name = "SM4-CBC-CTS",
.cipher_str = "cts(cbc(sm4))",
.keysize = 16,
.security_strength = 16,
@@ -687,7 +687,7 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
/**
* fscrypt_prepare_new_inode() - prepare to create a new inode in a directory
* @dir: a possibly-encrypted directory
- * @inode: the new inode. ->i_mode must be set already.
+ * @inode: the new inode. ->i_mode and ->i_blkbits must be set already.
* ->i_ino doesn't need to be set yet.
* @encrypt_ret: (output) set to %true if the new inode will be encrypted
*
@@ -717,6 +717,9 @@ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
if (IS_ERR(policy))
return PTR_ERR(policy);
+ if (WARN_ON_ONCE(inode->i_blkbits == 0))
+ return -EINVAL;
+
if (WARN_ON_ONCE(inode->i_mode == 0))
return -EINVAL;
diff --git a/fs/dcache.c b/fs/dcache.c
index 6ebccba33336..71a8e943a0fa 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3139,7 +3139,7 @@ static void __init dcache_init(void)
* of the dcache.
*/
dentry_cache = KMEM_CACHE_USERCOPY(dentry,
- SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
+ SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
d_iname);
/* Hash may have been set up in dcache_init_early */
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 034a617cb1a5..a40da0065433 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -751,13 +751,28 @@ static void __debugfs_file_removed(struct dentry *dentry)
if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)
return;
- /* if we hit zero, just wait for all to finish */
- if (!refcount_dec_and_test(&fsd->active_users)) {
- wait_for_completion(&fsd->active_users_drained);
+ /* if this was the last reference, we're done */
+ if (refcount_dec_and_test(&fsd->active_users))
return;
- }
- /* if we didn't hit zero, try to cancel any we can */
+ /*
+ * If there's still a reference, the code that obtained it can
+ * be in different states:
+ * - The common case of not using cancellations, or already
+ * after debugfs_leave_cancellation(), where we just need
+ * to wait for debugfs_file_put() which signals the completion;
+ * - inside a cancellation section, i.e. between
+ * debugfs_enter_cancellation() and debugfs_leave_cancellation(),
+ * in which case we need to trigger the ->cancel() function,
+ * and then wait for debugfs_file_put() just like in the
+ * previous case;
+ * - before debugfs_enter_cancellation() (but obviously after
+ * debugfs_file_get()), in which case we may not see the
+ * cancellation in the list on the first round of the loop,
+ * but debugfs_enter_cancellation() signals the completion
+ * after adding it, so this code gets woken up to call the
+ * ->cancel() function.
+ */
while (refcount_read(&fsd->active_users)) {
struct debugfs_cancellation *c;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 60456263a338..62c97ff9e852 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -410,6 +410,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
bio->bi_end_io = dio_bio_end_io;
if (dio->is_pinned)
bio_set_flag(bio, BIO_PAGE_PINNED);
+ bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint;
+
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index dfc444dad329..3b4dbce849f0 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -246,7 +246,7 @@ struct dlm_lkb {
int8_t lkb_highbast; /* highest mode bast sent for */
int8_t lkb_wait_type; /* type of reply waiting for */
- atomic_t lkb_wait_count;
+ int8_t lkb_wait_count;
int lkb_wait_nodeid; /* for debugging */
struct list_head lkb_statequeue; /* rsb g/c/w list */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 652c51fbbf76..fd752dd03896 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1407,7 +1407,6 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
int error = 0;
- int wc;
mutex_lock(&ls->ls_waiters_mutex);
@@ -1429,17 +1428,20 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
error = -EBUSY;
goto out;
}
- wc = atomic_inc_return(&lkb->lkb_wait_count);
+ lkb->lkb_wait_count++;
hold_lkb(lkb);
log_debug(ls, "addwait %x cur %d overlap %d count %d f %x",
- lkb->lkb_id, lkb->lkb_wait_type, mstype, wc,
- dlm_iflags_val(lkb));
+ lkb->lkb_id, lkb->lkb_wait_type, mstype,
+ lkb->lkb_wait_count, dlm_iflags_val(lkb));
goto out;
}
- wc = atomic_fetch_inc(&lkb->lkb_wait_count);
- DLM_ASSERT(!wc, dlm_print_lkb(lkb); printk("wait_count %d\n", wc););
+ DLM_ASSERT(!lkb->lkb_wait_count,
+ dlm_print_lkb(lkb);
+ printk("wait_count %d\n", lkb->lkb_wait_count););
+
+ lkb->lkb_wait_count++;
lkb->lkb_wait_type = mstype;
lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
hold_lkb(lkb);
@@ -1502,7 +1504,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
log_debug(ls, "remwait %x convert_reply zap overlap_cancel",
lkb->lkb_id);
lkb->lkb_wait_type = 0;
- atomic_dec(&lkb->lkb_wait_count);
+ lkb->lkb_wait_count--;
unhold_lkb(lkb);
goto out_del;
}
@@ -1529,15 +1531,16 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
if (overlap_done && lkb->lkb_wait_type) {
log_error(ls, "remwait error %x reply %d wait_type %d overlap",
lkb->lkb_id, mstype, lkb->lkb_wait_type);
- atomic_dec(&lkb->lkb_wait_count);
+ lkb->lkb_wait_count--;
unhold_lkb(lkb);
lkb->lkb_wait_type = 0;
}
- DLM_ASSERT(atomic_read(&lkb->lkb_wait_count), dlm_print_lkb(lkb););
+ DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb););
clear_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags);
- if (atomic_dec_and_test(&lkb->lkb_wait_count))
+ lkb->lkb_wait_count--;
+ if (!lkb->lkb_wait_count)
list_del_init(&lkb->lkb_wait_reply);
unhold_lkb(lkb);
return 0;
@@ -2666,7 +2669,7 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
goto out;
/* lock not allowed if there's any op in progress */
- if (lkb->lkb_wait_type || atomic_read(&lkb->lkb_wait_count))
+ if (lkb->lkb_wait_type || lkb->lkb_wait_count)
goto out;
if (is_overlap(lkb))
@@ -2728,7 +2731,7 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
/* normal unlock not allowed if there's any op in progress */
if (!(args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) &&
- (lkb->lkb_wait_type || atomic_read(&lkb->lkb_wait_count)))
+ (lkb->lkb_wait_type || lkb->lkb_wait_count))
goto out;
/* an lkb may be waiting for an rsb lookup to complete where the
@@ -5011,21 +5014,32 @@ static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
return lkb;
}
-/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
- master or dir-node for r. Processing the lkb may result in it being placed
- back on waiters. */
-
-/* We do this after normal locking has been enabled and any saved messages
- (in requestqueue) have been processed. We should be confident that at
- this point we won't get or process a reply to any of these waiting
- operations. But, new ops may be coming in on the rsbs/locks here from
- userspace or remotely. */
-
-/* there may have been an overlap unlock/cancel prior to recovery or after
- recovery. if before, the lkb may still have a pos wait_count; if after, the
- overlap flag would just have been set and nothing new sent. we can be
- confident here than any replies to either the initial op or overlap ops
- prior to recovery have been received. */
+/*
+ * Forced state reset for locks that were in the middle of remote operations
+ * when recovery happened (i.e. lkbs that were on the waiters list, waiting
+ * for a reply from a remote operation.) The lkbs remaining on the waiters
+ * list need to be reevaluated; some may need resending to a different node
+ * than previously, and some may now need local handling rather than remote.
+ *
+ * First, the lkb state for the voided remote operation is forcibly reset,
+ * equivalent to what remove_from_waiters() would normally do:
+ * . lkb removed from ls_waiters list
+ * . lkb wait_type cleared
+ * . lkb wait_count cleared
+ * . lkb ref count decremented once per wait_count (almost always 1,
+ * but possibly 2 in case of cancel/unlock overlapping, which means
+ * two remote replies were being expected for the lkb.)
+ *
+ * Second, the lkb is reprocessed like an original operation would be,
+ * by passing it to _request_lock or _convert_lock, which will either
+ * process the lkb operation locally, or send it to a remote node again
+ * and put the lkb back onto the waiters list.
+ *
+ * When reprocessing the lkb, we may find that it's flagged for an overlapping
+ * force-unlock or cancel, either from before recovery began, or after recovery
+ * finished. If this is the case, the unlock/cancel is done directly, and the
+ * original operation is not initiated again (no _request_lock/_convert_lock.)
+ */
int dlm_recover_waiters_post(struct dlm_ls *ls)
{
@@ -5040,6 +5054,11 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
break;
}
+ /*
+ * Find an lkb from the waiters list that's been affected by
+ * recovery node changes, and needs to be reprocessed. Does
+ * hold_lkb(), adding a refcount.
+ */
lkb = find_resend_waiter(ls);
if (!lkb)
break;
@@ -5048,6 +5067,11 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
hold_rsb(r);
lock_rsb(r);
+ /*
+ * If the lkb has been flagged for a force unlock or cancel,
+ * then the reprocessing below will be replaced by just doing
+ * the unlock/cancel directly.
+ */
mstype = lkb->lkb_wait_type;
oc = test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT,
&lkb->lkb_iflags);
@@ -5061,22 +5085,40 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid,
dlm_dir_nodeid(r), oc, ou);
- /* At this point we assume that we won't get a reply to any
- previous op or overlap op on this lock. First, do a big
- remove_from_waiters() for all previous ops. */
+ /*
+ * No reply to the pre-recovery operation will now be received,
+ * so a forced equivalent of remove_from_waiters() is needed to
+ * reset the waiters state that was in place before recovery.
+ */
clear_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags);
+
+ /* Forcibly clear wait_type */
lkb->lkb_wait_type = 0;
- /* drop all wait_count references we still
- * hold a reference for this iteration.
+
+ /*
+ * Forcibly reset wait_count and associated refcount. The
+ * wait_count will almost always be 1, but in case of an
+ * overlapping unlock/cancel it could be 2: see where
+ * add_to_waiters() finds the lkb is already on the waiters
+ * list and does lkb_wait_count++; hold_lkb().
*/
- while (!atomic_dec_and_test(&lkb->lkb_wait_count))
+ while (lkb->lkb_wait_count) {
+ lkb->lkb_wait_count--;
unhold_lkb(lkb);
+ }
+ /* Forcibly remove from waiters list */
mutex_lock(&ls->ls_waiters_mutex);
list_del_init(&lkb->lkb_wait_reply);
mutex_unlock(&ls->ls_waiters_mutex);
+ /*
+ * The lkb is now clear of all prior waiters state and can be
+ * processed locally, or sent to remote node again, or directly
+ * cancelled/unlocked.
+ */
+
if (oc || ou) {
/* do an unlock or cancel instead of resending */
switch (mstype) {
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index d814c5121367..9ca83ef70ed1 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -138,14 +138,14 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
}
op->info.optype = DLM_PLOCK_OP_LOCK;
- op->info.pid = fl->fl_pid;
- op->info.ex = (fl->fl_type == F_WRLCK);
- op->info.wait = !!(fl->fl_flags & FL_SLEEP);
+ op->info.pid = fl->c.flc_pid;
+ op->info.ex = lock_is_write(fl);
+ op->info.wait = !!(fl->c.flc_flags & FL_SLEEP);
op->info.fsid = ls->ls_global_id;
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
- op->info.owner = (__u64)(long)fl->fl_owner;
+ op->info.owner = (__u64)(long) fl->c.flc_owner;
/* async handling */
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
@@ -258,7 +258,7 @@ static int dlm_plock_callback(struct plock_op *op)
}
/* got fs lock; bookkeep locally as well: */
- flc->fl_flags &= ~FL_SLEEP;
+ flc->c.flc_flags &= ~FL_SLEEP;
if (posix_lock_file(file, flc, NULL)) {
/*
* This can only happen in the case of kmalloc() failure.
@@ -291,7 +291,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
struct dlm_ls *ls;
struct plock_op *op;
int rv;
- unsigned char fl_flags = fl->fl_flags;
+ unsigned char saved_flags = fl->c.flc_flags;
ls = dlm_find_lockspace_local(lockspace);
if (!ls)
@@ -304,7 +304,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
}
/* cause the vfs unlock to return ENOENT if lock is not found */
- fl->fl_flags |= FL_EXISTS;
+ fl->c.flc_flags |= FL_EXISTS;
rv = locks_lock_file_wait(file, fl);
if (rv == -ENOENT) {
@@ -317,14 +317,14 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
}
op->info.optype = DLM_PLOCK_OP_UNLOCK;
- op->info.pid = fl->fl_pid;
+ op->info.pid = fl->c.flc_pid;
op->info.fsid = ls->ls_global_id;
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
- op->info.owner = (__u64)(long)fl->fl_owner;
+ op->info.owner = (__u64)(long) fl->c.flc_owner;
- if (fl->fl_flags & FL_CLOSE) {
+ if (fl->c.flc_flags & FL_CLOSE) {
op->info.flags |= DLM_PLOCK_FL_CLOSE;
send_op(op);
rv = 0;
@@ -345,7 +345,7 @@ out_free:
dlm_release_plock_op(op);
out:
dlm_put_lockspace(ls);
- fl->fl_flags = fl_flags;
+ fl->c.flc_flags = saved_flags;
return rv;
}
EXPORT_SYMBOL_GPL(dlm_posix_unlock);
@@ -375,14 +375,14 @@ int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file,
return -EINVAL;
memset(&info, 0, sizeof(info));
- info.pid = fl->fl_pid;
- info.ex = (fl->fl_type == F_WRLCK);
+ info.pid = fl->c.flc_pid;
+ info.ex = lock_is_write(fl);
info.fsid = ls->ls_global_id;
dlm_put_lockspace(ls);
info.number = number;
info.start = fl->fl_start;
info.end = fl->fl_end;
- info.owner = (__u64)(long)fl->fl_owner;
+ info.owner = (__u64)(long) fl->c.flc_owner;
rv = do_lock_cancel(&info);
switch (rv) {
@@ -437,13 +437,13 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
}
op->info.optype = DLM_PLOCK_OP_GET;
- op->info.pid = fl->fl_pid;
- op->info.ex = (fl->fl_type == F_WRLCK);
+ op->info.pid = fl->c.flc_pid;
+ op->info.ex = lock_is_write(fl);
op->info.fsid = ls->ls_global_id;
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
- op->info.owner = (__u64)(long)fl->fl_owner;
+ op->info.owner = (__u64)(long) fl->c.flc_owner;
send_op(op);
wait_event(recv_wq, (op->done != 0));
@@ -455,16 +455,16 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
rv = op->info.rv;
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
if (rv == -ENOENT)
rv = 0;
else if (rv > 0) {
locks_init_lock(fl);
- fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
- fl->fl_flags = FL_POSIX;
- fl->fl_pid = op->info.pid;
+ fl->c.flc_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
+ fl->c.flc_flags = FL_POSIX;
+ fl->c.flc_pid = op->info.pid;
if (op->info.nodeid != dlm_our_nodeid())
- fl->fl_pid = -fl->fl_pid;
+ fl->c.flc_pid = -fl->c.flc_pid;
fl->fl_start = op->info.start;
fl->fl_end = op->info.end;
rv = 0;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 695e691b38b3..9f9b68448830 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -806,7 +806,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
struct dlm_lkb *lkb;
DECLARE_WAITQUEUE(wait, current);
struct dlm_callback *cb;
- int rv, copy_lvb = 0;
+ int rv, ret, copy_lvb = 0;
int old_mode, new_mode;
if (count == sizeof(struct dlm_device_version)) {
@@ -906,9 +906,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
trace_dlm_ast(lkb->lkb_resource->res_ls, lkb);
}
- rv = copy_result_to_user(lkb->lkb_ua,
- test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
- cb->flags, cb->mode, copy_lvb, buf, count);
+ ret = copy_result_to_user(lkb->lkb_ua,
+ test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+ cb->flags, cb->mode, copy_lvb, buf, count);
kref_put(&cb->ref, dlm_release_callback);
@@ -916,7 +916,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
if (rv == DLM_DEQUEUE_CALLBACK_LAST)
dlm_put_lkb(lkb);
- return rv;
+ return ret;
}
static __poll_t device_poll(struct file *file, poll_table *wait)
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 03bd55069d86..2fe0f3af1a08 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1949,16 +1949,6 @@ out:
return rc;
}
-static bool is_dot_dotdot(const char *name, size_t name_size)
-{
- if (name_size == 1 && name[0] == '.')
- return true;
- else if (name_size == 2 && name[0] == '.' && name[1] == '.')
- return true;
-
- return false;
-}
-
/**
* ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
* @plaintext_name: The plaintext name
diff --git a/fs/efs/super.c b/fs/efs/super.c
index f17fdac76b2e..e4421c10caeb 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -14,19 +14,14 @@
#include <linux/buffer_head.h>
#include <linux/vfs.h>
#include <linux/blkdev.h>
-
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include "efs.h"
#include <linux/efs_vh.h>
#include <linux/efs_fs_sb.h>
static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
-static int efs_fill_super(struct super_block *s, void *d, int silent);
-
-static struct dentry *efs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
-}
+static int efs_init_fs_context(struct fs_context *fc);
static void efs_kill_sb(struct super_block *s)
{
@@ -35,15 +30,6 @@ static void efs_kill_sb(struct super_block *s)
kfree(sbi);
}
-static struct file_system_type efs_fs_type = {
- .owner = THIS_MODULE,
- .name = "efs",
- .mount = efs_mount,
- .kill_sb = efs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("efs");
-
static struct pt_types sgi_pt_types[] = {
{0x00, "SGI vh"},
{0x01, "SGI trkrepl"},
@@ -63,6 +49,27 @@ static struct pt_types sgi_pt_types[] = {
{0, NULL}
};
+enum {
+ Opt_explicit_open,
+};
+
+static const struct fs_parameter_spec efs_param_spec[] = {
+ fsparam_flag ("explicit-open", Opt_explicit_open),
+ {}
+};
+
+/*
+ * File system definition and registration.
+ */
+static struct file_system_type efs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "efs",
+ .kill_sb = efs_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = efs_init_fs_context,
+ .parameters = efs_param_spec,
+};
+MODULE_ALIAS_FS("efs");
static struct kmem_cache * efs_inode_cachep;
@@ -91,8 +98,8 @@ static int __init init_inodecache(void)
{
efs_inode_cachep = kmem_cache_create("efs_inode_cache",
sizeof(struct efs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
- SLAB_ACCOUNT, init_once);
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+ init_once);
if (efs_inode_cachep == NULL)
return -ENOMEM;
return 0;
@@ -108,18 +115,10 @@ static void destroy_inodecache(void)
kmem_cache_destroy(efs_inode_cachep);
}
-static int efs_remount(struct super_block *sb, int *flags, char *data)
-{
- sync_filesystem(sb);
- *flags |= SB_RDONLY;
- return 0;
-}
-
static const struct super_operations efs_superblock_operations = {
.alloc_inode = efs_alloc_inode,
.free_inode = efs_free_inode,
.statfs = efs_statfs,
- .remount_fs = efs_remount,
};
static const struct export_operations efs_export_ops = {
@@ -249,26 +248,26 @@ static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) {
return 0;
}
-static int efs_fill_super(struct super_block *s, void *d, int silent)
+static int efs_fill_super(struct super_block *s, struct fs_context *fc)
{
struct efs_sb_info *sb;
struct buffer_head *bh;
struct inode *root;
- sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
+ sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
if (!sb)
return -ENOMEM;
s->s_fs_info = sb;
s->s_time_min = 0;
s->s_time_max = U32_MAX;
-
+
s->s_magic = EFS_SUPER_MAGIC;
if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
pr_err("device does not support %d byte blocks\n",
EFS_BLOCKSIZE);
return -EINVAL;
}
-
+
/* read the vh (volume header) block */
bh = sb_bread(s, 0);
@@ -294,7 +293,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
pr_err("cannot read superblock\n");
return -EIO;
}
-
+
if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) {
#ifdef DEBUG
pr_warn("invalid superblock at block %u\n",
@@ -328,6 +327,61 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
return 0;
}
+static void efs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->fs_private);
+}
+
+static int efs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, efs_fill_super);
+}
+
+static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ int token;
+ struct fs_parse_result result;
+
+ token = fs_parse(fc, efs_param_spec, param, &result);
+ if (token < 0)
+ return token;
+ return 0;
+}
+
+static int efs_reconfigure(struct fs_context *fc)
+{
+ sync_filesystem(fc->root->d_sb);
+
+ return 0;
+}
+
+struct efs_context {
+ unsigned long s_mount_opts;
+};
+
+static const struct fs_context_operations efs_context_opts = {
+ .parse_param = efs_parse_param,
+ .get_tree = efs_get_tree,
+ .reconfigure = efs_reconfigure,
+ .free = efs_free_fc,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int efs_init_fs_context(struct fs_context *fc)
+{
+ struct efs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct efs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ fc->fs_private = ctx;
+ fc->ops = &efs_context_opts;
+
+ return 0;
+}
+
static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
struct super_block *sb = dentry->d_sb;
struct efs_sb_info *sbi = SUPER_INFO(sb);
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 7cc5841577b2..333587ba6183 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -81,13 +81,6 @@ static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
return true;
}
-#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
-static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
- struct page *page)
-{
- return page->mapping == MNGD_MAPPING(sbi);
-}
-
int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
unsigned int padbufsize);
extern const struct z_erofs_decompressor erofs_decompressors[];
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index c98aeda8abb2..52524bd9698b 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -220,7 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
up_read(&devs->rwsem);
return 0;
}
- map->m_bdev = dif->bdev_handle ? dif->bdev_handle->bdev : NULL;
+ map->m_bdev = dif->bdev_file ? file_bdev(dif->bdev_file) : NULL;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
map->m_fscache = dif->fscache;
@@ -238,8 +238,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
if (map->m_pa >= startoff &&
map->m_pa < startoff + length) {
map->m_pa -= startoff;
- map->m_bdev = dif->bdev_handle ?
- dif->bdev_handle->bdev : NULL;
+ map->m_bdev = dif->bdev_file ?
+ file_bdev(dif->bdev_file) : NULL;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
map->m_fscache = dif->fscache;
@@ -447,5 +447,6 @@ const struct file_operations erofs_file_fops = {
.llseek = generic_file_llseek,
.read_iter = erofs_file_read_iter,
.mmap = erofs_file_mmap,
+ .get_unmapped_area = thp_get_unmapped_area,
.splice_read = filemap_splice_read,
};
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index d4cee95af14c..2ec9b2bb628d 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -323,7 +323,8 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt;
u8 *kin;
- DBG_BUGON(rq->outputsize > rq->inputsize);
+ if (rq->outputsize > rq->inputsize)
+ return -EOPNOTSUPP;
if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) {
cur = bs - (rq->pageofs_out & (bs - 1));
pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK;
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index b98872058abe..81e65c453ef0 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -212,9 +212,6 @@ again:
if (rq->out[no] != rq->in[j])
continue;
-
- DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb),
- rq->in[j]));
tmppage = erofs_allocpage(pgpl, rq->gfp);
if (!tmppage) {
err = -ENOMEM;
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 6ca357d83cfa..4b28dc130c9f 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -258,9 +258,6 @@ again:
if (rq->out[no] != rq->in[j])
continue;
-
- DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
- rq->in[j]));
tmppage = erofs_allocpage(pgpl, rq->gfp);
if (!tmppage) {
err = -ENOMEM;
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 89a7c2453aae..8aff1a724805 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -3,6 +3,7 @@
* Copyright (C) 2022, Alibaba Cloud
* Copyright (C) 2022, Bytedance Inc. All rights reserved.
*/
+#include <linux/pseudo_fs.h>
#include <linux/fscache.h>
#include "internal.h"
@@ -12,9 +13,27 @@ static LIST_HEAD(erofs_domain_list);
static LIST_HEAD(erofs_domain_cookies_list);
static struct vfsmount *erofs_pseudo_mnt;
-struct erofs_fscache_request {
- struct erofs_fscache_request *primary;
- struct netfs_cache_resources cache_resources;
+static int erofs_anon_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type erofs_anon_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "pseudo_erofs",
+ .init_fs_context = erofs_anon_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+struct erofs_fscache_io {
+ struct netfs_cache_resources cres;
+ struct iov_iter iter;
+ netfs_io_terminated_t end_io;
+ void *private;
+ refcount_t ref;
+};
+
+struct erofs_fscache_rq {
struct address_space *mapping; /* The mapping being accessed */
loff_t start; /* Start position */
size_t len; /* Length of the request */
@@ -23,44 +42,17 @@ struct erofs_fscache_request {
refcount_t ref;
};
-static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping,
- loff_t start, size_t len)
+static bool erofs_fscache_io_put(struct erofs_fscache_io *io)
{
- struct erofs_fscache_request *req;
-
- req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL);
- if (!req)
- return ERR_PTR(-ENOMEM);
-
- req->mapping = mapping;
- req->start = start;
- req->len = len;
- refcount_set(&req->ref, 1);
-
- return req;
+ if (!refcount_dec_and_test(&io->ref))
+ return false;
+ if (io->cres.ops)
+ io->cres.ops->end_operation(&io->cres);
+ kfree(io);
+ return true;
}
-static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary,
- size_t len)
-{
- struct erofs_fscache_request *req;
-
- /* use primary request for the first submission */
- if (!primary->submitted) {
- refcount_inc(&primary->ref);
- return primary;
- }
-
- req = erofs_fscache_req_alloc(primary->mapping,
- primary->start + primary->submitted, len);
- if (!IS_ERR(req)) {
- req->primary = primary;
- refcount_inc(&primary->ref);
- }
- return req;
-}
-
-static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
+static void erofs_fscache_req_complete(struct erofs_fscache_rq *req)
{
struct folio *folio;
bool failed = req->error;
@@ -80,120 +72,196 @@ static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
rcu_read_unlock();
}
-static void erofs_fscache_req_put(struct erofs_fscache_request *req)
+static void erofs_fscache_req_put(struct erofs_fscache_rq *req)
{
- if (refcount_dec_and_test(&req->ref)) {
- if (req->cache_resources.ops)
- req->cache_resources.ops->end_operation(&req->cache_resources);
- if (!req->primary)
- erofs_fscache_req_complete(req);
- else
- erofs_fscache_req_put(req->primary);
- kfree(req);
- }
+ if (!refcount_dec_and_test(&req->ref))
+ return;
+ erofs_fscache_req_complete(req);
+ kfree(req);
}
-static void erofs_fscache_subreq_complete(void *priv,
+static struct erofs_fscache_rq *erofs_fscache_req_alloc(struct address_space *mapping,
+ loff_t start, size_t len)
+{
+ struct erofs_fscache_rq *req = kzalloc(sizeof(*req), GFP_KERNEL);
+
+ if (!req)
+ return NULL;
+ req->mapping = mapping;
+ req->start = start;
+ req->len = len;
+ refcount_set(&req->ref, 1);
+ return req;
+}
+
+static void erofs_fscache_req_io_put(struct erofs_fscache_io *io)
+{
+ struct erofs_fscache_rq *req = io->private;
+
+ if (erofs_fscache_io_put(io))
+ erofs_fscache_req_put(req);
+}
+
+static void erofs_fscache_req_end_io(void *priv,
ssize_t transferred_or_error, bool was_async)
{
- struct erofs_fscache_request *req = priv;
+ struct erofs_fscache_io *io = priv;
+ struct erofs_fscache_rq *req = io->private;
- if (IS_ERR_VALUE(transferred_or_error)) {
- if (req->primary)
- req->primary->error = transferred_or_error;
- else
- req->error = transferred_or_error;
- }
- erofs_fscache_req_put(req);
+ if (IS_ERR_VALUE(transferred_or_error))
+ req->error = transferred_or_error;
+ erofs_fscache_req_io_put(io);
+}
+
+static struct erofs_fscache_io *erofs_fscache_req_io_alloc(struct erofs_fscache_rq *req)
+{
+ struct erofs_fscache_io *io = kzalloc(sizeof(*io), GFP_KERNEL);
+
+ if (!io)
+ return NULL;
+ io->end_io = erofs_fscache_req_end_io;
+ io->private = req;
+ refcount_inc(&req->ref);
+ refcount_set(&io->ref, 1);
+ return io;
}
/*
- * Read data from fscache (cookie, pstart, len), and fill the read data into
- * page cache described by (req->mapping, lstart, len). @pstart describeis the
- * start physical address in the cache file.
+ * Read data from fscache described by cookie at pstart physical address
+ * offset, and fill the read data into buffer described by io->iter.
*/
-static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
- struct erofs_fscache_request *req, loff_t pstart, size_t len)
+static int erofs_fscache_read_io_async(struct fscache_cookie *cookie,
+ loff_t pstart, struct erofs_fscache_io *io)
{
enum netfs_io_source source;
- struct super_block *sb = req->mapping->host->i_sb;
- struct netfs_cache_resources *cres = &req->cache_resources;
- struct iov_iter iter;
- loff_t lstart = req->start + req->submitted;
- size_t done = 0;
+ struct netfs_cache_resources *cres = &io->cres;
+ struct iov_iter *iter = &io->iter;
int ret;
- DBG_BUGON(len > req->len - req->submitted);
-
ret = fscache_begin_read_operation(cres, cookie);
if (ret)
return ret;
- while (done < len) {
- loff_t sstart = pstart + done;
- size_t slen = len - done;
+ while (iov_iter_count(iter)) {
+ size_t orig_count = iov_iter_count(iter), len = orig_count;
unsigned long flags = 1 << NETFS_SREQ_ONDEMAND;
source = cres->ops->prepare_ondemand_read(cres,
- sstart, &slen, LLONG_MAX, &flags, 0);
- if (WARN_ON(slen == 0))
+ pstart, &len, LLONG_MAX, &flags, 0);
+ if (WARN_ON(len == 0))
source = NETFS_INVALID_READ;
if (source != NETFS_READ_FROM_CACHE) {
- erofs_err(sb, "failed to fscache prepare_read (source %d)", source);
+ erofs_err(NULL, "prepare_read failed (source %d)", source);
return -EIO;
}
- refcount_inc(&req->ref);
- iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages,
- lstart + done, slen);
-
- ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL,
- erofs_fscache_subreq_complete, req);
+ iov_iter_truncate(iter, len);
+ refcount_inc(&io->ref);
+ ret = fscache_read(cres, pstart, iter, NETFS_READ_HOLE_FAIL,
+ io->end_io, io);
if (ret == -EIOCBQUEUED)
ret = 0;
if (ret) {
- erofs_err(sb, "failed to fscache_read (ret %d)", ret);
+ erofs_err(NULL, "fscache_read failed (ret %d)", ret);
return ret;
}
+ if (WARN_ON(iov_iter_count(iter)))
+ return -EIO;
- done += slen;
+ iov_iter_reexpand(iter, orig_count - len);
+ pstart += len;
}
- DBG_BUGON(done != len);
return 0;
}
-static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
+struct erofs_fscache_bio {
+ struct erofs_fscache_io io;
+ struct bio bio; /* w/o bdev to share bio_add_page/endio() */
+ struct bio_vec bvecs[BIO_MAX_VECS];
+};
+
+static void erofs_fscache_bio_endio(void *priv,
+ ssize_t transferred_or_error, bool was_async)
+{
+ struct erofs_fscache_bio *io = priv;
+
+ if (IS_ERR_VALUE(transferred_or_error))
+ io->bio.bi_status = errno_to_blk_status(transferred_or_error);
+ io->bio.bi_end_io(&io->bio);
+ BUILD_BUG_ON(offsetof(struct erofs_fscache_bio, io) != 0);
+ erofs_fscache_io_put(&io->io);
+}
+
+struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev)
{
+ struct erofs_fscache_bio *io;
+
+ io = kmalloc(sizeof(*io), GFP_KERNEL | __GFP_NOFAIL);
+ bio_init(&io->bio, NULL, io->bvecs, BIO_MAX_VECS, REQ_OP_READ);
+ io->io.private = mdev->m_fscache->cookie;
+ io->io.end_io = erofs_fscache_bio_endio;
+ refcount_set(&io->io.ref, 1);
+ return &io->bio;
+}
+
+void erofs_fscache_submit_bio(struct bio *bio)
+{
+ struct erofs_fscache_bio *io = container_of(bio,
+ struct erofs_fscache_bio, bio);
int ret;
+
+ iov_iter_bvec(&io->io.iter, ITER_DEST, io->bvecs, bio->bi_vcnt,
+ bio->bi_iter.bi_size);
+ ret = erofs_fscache_read_io_async(io->io.private,
+ bio->bi_iter.bi_sector << 9, &io->io);
+ erofs_fscache_io_put(&io->io);
+ if (!ret)
+ return;
+ bio->bi_status = errno_to_blk_status(ret);
+ bio->bi_end_io(bio);
+}
+
+static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
+{
struct erofs_fscache *ctx = folio->mapping->host->i_private;
- struct erofs_fscache_request *req;
+ int ret = -ENOMEM;
+ struct erofs_fscache_rq *req;
+ struct erofs_fscache_io *io;
req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
- if (IS_ERR(req)) {
+ if (!req) {
folio_unlock(folio);
- return PTR_ERR(req);
+ return ret;
}
- ret = erofs_fscache_read_folios_async(ctx->cookie, req,
- folio_pos(folio), folio_size(folio));
+ io = erofs_fscache_req_io_alloc(req);
+ if (!io) {
+ req->error = ret;
+ goto out;
+ }
+ iov_iter_xarray(&io->iter, ITER_DEST, &folio->mapping->i_pages,
+ folio_pos(folio), folio_size(folio));
+
+ ret = erofs_fscache_read_io_async(ctx->cookie, folio_pos(folio), io);
if (ret)
req->error = ret;
+ erofs_fscache_req_io_put(io);
+out:
erofs_fscache_req_put(req);
return ret;
}
-static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
+static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
{
- struct address_space *mapping = primary->mapping;
+ struct address_space *mapping = req->mapping;
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
- struct erofs_fscache_request *req;
+ struct erofs_fscache_io *io;
struct erofs_map_blocks map;
struct erofs_map_dev mdev;
- struct iov_iter iter;
- loff_t pos = primary->start + primary->submitted;
+ loff_t pos = req->start + req->submitted;
size_t count;
int ret;
@@ -204,6 +272,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
if (map.m_flags & EROFS_MAP_META) {
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ struct iov_iter iter;
erofs_blk_t blknr;
size_t offset, size;
void *src;
@@ -224,15 +293,17 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
}
iov_iter_zero(PAGE_SIZE - size, &iter);
erofs_put_metabuf(&buf);
- primary->submitted += PAGE_SIZE;
+ req->submitted += PAGE_SIZE;
return 0;
}
- count = primary->len - primary->submitted;
+ count = req->len - req->submitted;
if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ struct iov_iter iter;
+
iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count);
iov_iter_zero(count, &iter);
- primary->submitted += count;
+ req->submitted += count;
return 0;
}
@@ -247,18 +318,19 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
if (ret)
return ret;
- req = erofs_fscache_req_chain(primary, count);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ io = erofs_fscache_req_io_alloc(req);
+ if (!io)
+ return -ENOMEM;
+ iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count);
+ ret = erofs_fscache_read_io_async(mdev.m_fscache->cookie,
+ mdev.m_pa + (pos - map.m_la), io);
+ erofs_fscache_req_io_put(io);
- ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
- req, mdev.m_pa + (pos - map.m_la), count);
- erofs_fscache_req_put(req);
- primary->submitted += count;
+ req->submitted += count;
return ret;
}
-static int erofs_fscache_data_read(struct erofs_fscache_request *req)
+static int erofs_fscache_data_read(struct erofs_fscache_rq *req)
{
int ret;
@@ -267,20 +339,19 @@ static int erofs_fscache_data_read(struct erofs_fscache_request *req)
if (ret)
req->error = ret;
} while (!ret && req->submitted < req->len);
-
return ret;
}
static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
{
- struct erofs_fscache_request *req;
+ struct erofs_fscache_rq *req;
int ret;
req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
- if (IS_ERR(req)) {
+ if (!req) {
folio_unlock(folio);
- return PTR_ERR(req);
+ return -ENOMEM;
}
ret = erofs_fscache_data_read(req);
@@ -290,14 +361,14 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
static void erofs_fscache_readahead(struct readahead_control *rac)
{
- struct erofs_fscache_request *req;
+ struct erofs_fscache_rq *req;
if (!readahead_count(rac))
return;
req = erofs_fscache_req_alloc(rac->mapping,
readahead_pos(rac), readahead_length(rac));
- if (IS_ERR(req))
+ if (!req)
return;
/* The request completion will drop refs on the folios. */
@@ -381,7 +452,7 @@ static int erofs_fscache_init_domain(struct super_block *sb)
goto out;
if (!erofs_pseudo_mnt) {
- struct vfsmount *mnt = kern_mount(&erofs_fs_type);
+ struct vfsmount *mnt = kern_mount(&erofs_anon_fs_type);
if (IS_ERR(mnt)) {
err = PTR_ERR(mnt);
goto out;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 36e638e8b53a..0eb0e6f933c3 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -259,14 +259,12 @@ static int erofs_fill_inode(struct inode *inode)
if (erofs_inode_is_data_compressed(vi->datalayout)) {
#ifdef CONFIG_EROFS_FS_ZIP
- if (!erofs_is_fscache_mode(inode->i_sb)) {
- DO_ONCE_LITE_IF(inode->i_sb->s_blocksize != PAGE_SIZE,
- erofs_info, inode->i_sb,
- "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!");
- inode->i_mapping->a_ops = &z_erofs_aops;
- err = 0;
- goto out_unlock;
- }
+ DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT,
+ erofs_info, inode->i_sb,
+ "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!");
+ inode->i_mapping->a_ops = &z_erofs_aops;
+ err = 0;
+ goto out_unlock;
#endif
err = -EOPNOTSUPP;
goto out_unlock;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index b0409badb017..39c67119f43b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -49,7 +49,7 @@ typedef u32 erofs_blk_t;
struct erofs_device_info {
char *path;
struct erofs_fscache *fscache;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct dax_device *dax_dev;
u64 dax_part_off;
@@ -385,7 +385,6 @@ struct erofs_map_dev {
unsigned int m_deviceid;
};
-extern struct file_system_type erofs_fs_type;
extern const struct super_operations erofs_sops;
extern const struct address_space_operations erofs_raw_access_aops;
@@ -467,8 +466,8 @@ int __init erofs_init_shrinker(void);
void erofs_exit_shrinker(void);
int __init z_erofs_init_zip_subsystem(void);
void z_erofs_exit_zip_subsystem(void);
-int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
- struct erofs_workgroup *egrp);
+int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
+ struct erofs_workgroup *egrp);
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags);
void *erofs_get_pcpubuf(unsigned int requiredpages);
@@ -513,6 +512,8 @@ void erofs_fscache_unregister_fs(struct super_block *sb);
struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
char *name, unsigned int flags);
void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache);
+struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev);
+void erofs_fscache_submit_bio(struct bio *bio);
#else
static inline int erofs_fscache_register_fs(struct super_block *sb)
{
@@ -530,6 +531,8 @@ struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache)
{
}
+static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { return NULL; }
+static inline void erofs_fscache_submit_bio(struct bio *bio) {}
#endif
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 5f60f163bd56..c0eb139adb07 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -177,7 +177,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_fscache *fscache;
struct erofs_deviceslot *dis;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
void *ptr;
ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP);
@@ -201,12 +201,12 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
return PTR_ERR(fscache);
dif->fscache = fscache;
} else if (!sbi->devs->flatdev) {
- bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ,
+ bdev_file = bdev_file_open_by_path(dif->path, BLK_OPEN_READ,
sb->s_type, NULL);
- if (IS_ERR(bdev_handle))
- return PTR_ERR(bdev_handle);
- dif->bdev_handle = bdev_handle;
- dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev,
+ if (IS_ERR(bdev_file))
+ return PTR_ERR(bdev_file);
+ dif->bdev_file = bdev_file;
+ dif->dax_dev = fs_dax_get_by_bdev(file_bdev(bdev_file),
&dif->dax_part_off, NULL, NULL);
}
@@ -430,7 +430,6 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
switch (mode) {
case EROFS_MOUNT_DAX_ALWAYS:
- warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
set_opt(&ctx->opt, DAX_ALWAYS);
clear_opt(&ctx->opt, DAX_NEVER);
return true;
@@ -579,13 +578,6 @@ static const struct export_operations erofs_export_ops = {
.get_parent = erofs_get_parent,
};
-static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc)
-{
- static const struct tree_descr empty_descr = {""};
-
- return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr);
-}
-
static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode;
@@ -712,11 +704,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return 0;
}
-static int erofs_fc_anon_get_tree(struct fs_context *fc)
-{
- return get_tree_nodev(fc, erofs_fc_fill_pseudo_super);
-}
-
static int erofs_fc_get_tree(struct fs_context *fc)
{
struct erofs_fs_context *ctx = fc->fs_private;
@@ -754,8 +741,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
struct erofs_device_info *dif = ptr;
fs_put_dax(dif->dax_dev, NULL);
- if (dif->bdev_handle)
- bdev_release(dif->bdev_handle);
+ if (dif->bdev_file)
+ fput(dif->bdev_file);
erofs_fscache_unregister_cookie(dif->fscache);
dif->fscache = NULL;
kfree(dif->path);
@@ -789,20 +776,10 @@ static const struct fs_context_operations erofs_context_ops = {
.free = erofs_fc_free,
};
-static const struct fs_context_operations erofs_anon_context_ops = {
- .get_tree = erofs_fc_anon_get_tree,
-};
-
static int erofs_init_fs_context(struct fs_context *fc)
{
struct erofs_fs_context *ctx;
- /* pseudo mount for anon inodes */
- if (fc->sb_flags & SB_KERNMOUNT) {
- fc->ops = &erofs_anon_context_ops;
- return 0;
- }
-
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
@@ -824,12 +801,6 @@ static void erofs_kill_sb(struct super_block *sb)
{
struct erofs_sb_info *sbi;
- /* pseudo mount for anon inodes */
- if (sb->s_flags & SB_KERNMOUNT) {
- kill_anon_super(sb);
- return;
- }
-
if (erofs_is_fscache_mode(sb))
kill_anon_super(sb);
else
@@ -868,7 +839,7 @@ static void erofs_put_super(struct super_block *sb)
erofs_fscache_unregister_fs(sb);
}
-struct file_system_type erofs_fs_type = {
+static struct file_system_type erofs_fs_type = {
.owner = THIS_MODULE,
.name = "erofs",
.init_fs_context = erofs_init_fs_context,
@@ -885,7 +856,7 @@ static int __init erofs_module_init(void)
erofs_inode_cachep = kmem_cache_create("erofs_inode",
sizeof(struct erofs_inode), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
erofs_inode_init_once);
if (!erofs_inode_cachep)
return -ENOMEM;
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index e146d09151af..518bdd69c823 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -129,7 +129,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
* the XArray. Otherwise some cached pages could be still attached to
* the orphan old workgroup when the new one is available in the tree.
*/
- if (erofs_try_to_free_all_cached_pages(sbi, grp))
+ if (erofs_try_to_free_all_cached_folios(sbi, grp))
goto out;
/*
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index ff0aa72b0db3..3216b920d369 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -19,7 +19,10 @@
typedef void *z_erofs_next_pcluster_t;
struct z_erofs_bvec {
- struct page *page;
+ union {
+ struct page *page;
+ struct folio *folio;
+ };
int offset;
unsigned int end;
};
@@ -116,47 +119,46 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
}
+#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
+{
+ return fo->mapping == MNGD_MAPPING(sbi);
+}
+
/*
- * bit 30: I/O error occurred on this page
- * bit 0 - 29: remaining parts to complete this page
+ * bit 30: I/O error occurred on this folio
+ * bit 0 - 29: remaining parts to complete this folio
*/
-#define Z_EROFS_PAGE_EIO (1 << 30)
+#define Z_EROFS_FOLIO_EIO (1 << 30)
-static inline void z_erofs_onlinepage_init(struct page *page)
+static void z_erofs_onlinefolio_init(struct folio *folio)
{
union {
atomic_t o;
- unsigned long v;
+ void *v;
} u = { .o = ATOMIC_INIT(1) };
- set_page_private(page, u.v);
- smp_wmb();
- SetPagePrivate(page);
+ folio->private = u.v; /* valid only if file-backed folio is locked */
}
-static inline void z_erofs_onlinepage_split(struct page *page)
+static void z_erofs_onlinefolio_split(struct folio *folio)
{
- atomic_inc((atomic_t *)&page->private);
+ atomic_inc((atomic_t *)&folio->private);
}
-static void z_erofs_onlinepage_endio(struct page *page, int err)
+static void z_erofs_onlinefolio_end(struct folio *folio, int err)
{
int orig, v;
- DBG_BUGON(!PagePrivate(page));
-
do {
- orig = atomic_read((atomic_t *)&page->private);
- v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0);
- } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig);
-
- if (!(v & ~Z_EROFS_PAGE_EIO)) {
- set_page_private(page, 0);
- ClearPagePrivate(page);
- if (!(v & Z_EROFS_PAGE_EIO))
- SetPageUptodate(page);
- unlock_page(page);
- }
+ orig = atomic_read((atomic_t *)&folio->private);
+ v = (orig - 1) | (err ? Z_EROFS_FOLIO_EIO : 0);
+ } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig);
+
+ if (v & ~Z_EROFS_FOLIO_EIO)
+ return;
+ folio->private = 0;
+ folio_end_read(folio, !(v & Z_EROFS_FOLIO_EIO));
}
#define Z_EROFS_ONSTACK_PAGES 32
@@ -572,17 +574,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
for (i = 0; i < pclusterpages; ++i) {
struct page *page, *newpage;
- void *t; /* mark pages just found for debugging */
/* Inaccurate check w/o locking to avoid unneeded lookups */
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
page = find_get_page(mc, pcl->obj.index + i);
- if (page) {
- t = (void *)((unsigned long)page | 1);
- newpage = NULL;
- } else {
+ if (!page) {
/* I/O is needed, no possible to decompress directly */
standalone = false;
if (!shouldalloc)
@@ -596,11 +594,10 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
if (!newpage)
continue;
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
- t = (void *)((unsigned long)newpage | 1);
}
spin_lock(&pcl->obj.lockref.lock);
if (!pcl->compressed_bvecs[i].page) {
- pcl->compressed_bvecs[i].page = t;
+ pcl->compressed_bvecs[i].page = page ? page : newpage;
spin_unlock(&pcl->obj.lockref.lock);
continue;
}
@@ -620,9 +617,9 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
-/* called by erofs_shrinker to get rid of all compressed_pages */
-int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp)
+/* called by erofs_shrinker to get rid of all cached compressed bvecs */
+int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
+ struct erofs_workgroup *grp)
{
struct z_erofs_pcluster *const pcl =
container_of(grp, struct z_erofs_pcluster, obj);
@@ -630,27 +627,22 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
int i;
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
- /*
- * refcount of workgroup is now freezed as 0,
- * therefore no need to worry about available decompression users.
- */
+ /* There is no active user since the pcluster is now frozen */
for (i = 0; i < pclusterpages; ++i) {
- struct page *page = pcl->compressed_bvecs[i].page;
+ struct folio *folio = pcl->compressed_bvecs[i].folio;
- if (!page)
+ if (!folio)
continue;
- /* block other users from reclaiming or migrating the page */
- if (!trylock_page(page))
+ /* Avoid reclaiming or migrating this folio */
+ if (!folio_trylock(folio))
return -EBUSY;
- if (!erofs_page_is_managed(sbi, page))
+ if (!erofs_folio_is_managed(sbi, folio))
continue;
-
- /* barrier is implied in the following 'unlock_page' */
- WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
- detach_page_private(page);
- unlock_page(page);
+ pcl->compressed_bvecs[i].folio = NULL;
+ folio_detach_private(folio);
+ folio_unlock(folio);
}
return 0;
}
@@ -667,20 +659,17 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
ret = false;
spin_lock(&pcl->obj.lockref.lock);
- if (pcl->obj.lockref.count > 0)
- goto out;
-
- DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
- for (i = 0; i < pclusterpages; ++i) {
- if (pcl->compressed_bvecs[i].page == &folio->page) {
- WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
- ret = true;
- break;
+ if (pcl->obj.lockref.count <= 0) {
+ DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+ for (i = 0; i < pclusterpages; ++i) {
+ if (pcl->compressed_bvecs[i].folio == folio) {
+ pcl->compressed_bvecs[i].folio = NULL;
+ folio_detach_private(folio);
+ ret = true;
+ break;
+ }
}
}
- if (ret)
- folio_detach_private(folio);
-out:
spin_unlock(&pcl->obj.lockref.lock);
return ret;
}
@@ -962,20 +951,20 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
return 0;
}
-static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page, bool ra)
+static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *fe,
+ struct folio *folio, bool ra)
{
struct inode *const inode = fe->inode;
struct erofs_map_blocks *const map = &fe->map;
- const loff_t offset = page_offset(page);
- const unsigned int bs = i_blocksize(inode);
+ const loff_t offset = folio_pos(folio);
+ const unsigned int bs = i_blocksize(inode), fs = folio_size(folio);
bool tight = true, exclusive;
unsigned int cur, end, len, split;
int err = 0;
- z_erofs_onlinepage_init(page);
+ z_erofs_onlinefolio_init(folio);
split = 0;
- end = PAGE_SIZE;
+ end = fs;
repeat:
if (offset + end - 1 < map->m_la ||
offset + end - 1 >= map->m_la + map->m_llen) {
@@ -992,7 +981,7 @@ repeat:
++split;
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
- zero_user_segment(page, cur, end);
+ folio_zero_segment(folio, cur, end);
tight = false;
goto next_part;
}
@@ -1001,8 +990,8 @@ repeat:
erofs_off_t fpos = offset + cur - map->m_la;
len = min_t(unsigned int, map->m_llen - fpos, end - cur);
- err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len,
- EROFS_I(inode)->z_fragmentoff + fpos);
+ err = z_erofs_read_fragment(inode->i_sb, &folio->page, cur,
+ cur + len, EROFS_I(inode)->z_fragmentoff + fpos);
if (err)
goto out;
tight = false;
@@ -1017,25 +1006,25 @@ repeat:
}
/*
- * Ensure the current partial page belongs to this submit chain rather
+ * Ensure the current partial folio belongs to this submit chain rather
* than other concurrent submit chains or the noio(bypass) chain since
- * those chains are handled asynchronously thus the page cannot be used
+ * those chains are handled asynchronously thus the folio cannot be used
* for inplace I/O or bvpage (should be processed in a strict order.)
*/
tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
- exclusive = (!cur && ((split <= 1) || (tight && bs == PAGE_SIZE)));
+ exclusive = (!cur && ((split <= 1) || (tight && bs == fs)));
if (cur)
tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
- .page = page,
+ .page = &folio->page,
.offset = offset - map->m_la,
.end = end,
}), exclusive);
if (err)
goto out;
- z_erofs_onlinepage_split(page);
+ z_erofs_onlinefolio_split(folio);
if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
fe->pcl->multibases = true;
if (fe->pcl->length < offset + end - map->m_la) {
@@ -1056,7 +1045,7 @@ next_part:
goto repeat;
out:
- z_erofs_onlinepage_endio(page, err);
+ z_erofs_onlinefolio_end(folio, err);
return err;
}
@@ -1159,7 +1148,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
cur += len;
}
kunmap_local(dst);
- z_erofs_onlinepage_endio(bvi->bvec.page, err);
+ z_erofs_onlinefolio_end(page_folio(bvi->bvec.page), err);
list_del(p);
kfree(bvi);
}
@@ -1210,7 +1199,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
be->compressed_pages[i] = page;
if (z_erofs_is_inline_pcluster(pcl) ||
- erofs_page_is_managed(EROFS_SB(be->sb), page)) {
+ erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) {
if (!PageUptodate(page))
err = -EIO;
continue;
@@ -1295,7 +1284,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
/* consider shortlived pages added when decompressing */
page = be->compressed_pages[i];
- if (!page || erofs_page_is_managed(sbi, page))
+ if (!page ||
+ erofs_folio_is_managed(sbi, page_folio(page)))
continue;
(void)z_erofs_put_shortlivedpage(be->pagepool, page);
WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
@@ -1316,7 +1306,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
/* recycle all individual short-lived pages */
if (z_erofs_put_shortlivedpage(be->pagepool, page))
continue;
- z_erofs_onlinepage_endio(page, err);
+ z_erofs_onlinefolio_end(page_folio(page), err);
}
if (be->decompressed_pages != be->onstack_pages)
@@ -1430,38 +1420,34 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
struct z_erofs_bvec zbv;
struct address_space *mapping;
struct page *page;
- int justfound, bs = i_blocksize(f->inode);
+ int bs = i_blocksize(f->inode);
- /* Except for inplace pages, the entire page can be used for I/Os */
+ /* Except for inplace folios, the entire folio can be used for I/Os */
bvec->bv_offset = 0;
bvec->bv_len = PAGE_SIZE;
repeat:
spin_lock(&pcl->obj.lockref.lock);
zbv = pcl->compressed_bvecs[nr];
- page = zbv.page;
- justfound = (unsigned long)page & 1UL;
- page = (struct page *)((unsigned long)page & ~1UL);
- pcl->compressed_bvecs[nr].page = page;
spin_unlock(&pcl->obj.lockref.lock);
- if (!page)
- goto out_allocpage;
+ if (!zbv.folio)
+ goto out_allocfolio;
- bvec->bv_page = page;
- DBG_BUGON(z_erofs_is_shortlived_page(page));
+ bvec->bv_page = &zbv.folio->page;
+ DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page));
/*
- * Handle preallocated cached pages. We tried to allocate such pages
+ * Handle preallocated cached folios. We tried to allocate such folios
* without triggering direct reclaim. If allocation failed, inplace
- * file-backed pages will be used instead.
+ * file-backed folios will be used instead.
*/
- if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
- set_page_private(page, 0);
+ if (zbv.folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) {
+ zbv.folio->private = 0;
tocache = true;
goto out_tocache;
}
- mapping = READ_ONCE(page->mapping);
+ mapping = READ_ONCE(zbv.folio->mapping);
/*
- * File-backed pages for inplace I/Os are all locked steady,
+ * File-backed folios for inplace I/Os are all locked steady,
* therefore it is impossible for `mapping` to be NULL.
*/
if (mapping && mapping != mc) {
@@ -1471,26 +1457,21 @@ repeat:
return;
}
- lock_page(page);
- /* only true if page reclaim goes wrong, should never happen */
- DBG_BUGON(justfound && PagePrivate(page));
-
- /* the cached page is still in managed cache */
- if (page->mapping == mc) {
+ folio_lock(zbv.folio);
+ if (zbv.folio->mapping == mc) {
/*
- * The cached page is still available but without a valid
- * `->private` pcluster hint. Let's reconnect them.
+ * The cached folio is still in managed cache but without
+ * a valid `->private` pcluster hint. Let's reconnect them.
*/
- if (!PagePrivate(page)) {
- DBG_BUGON(!justfound);
- /* compressed_bvecs[] already takes a ref */
- attach_page_private(page, pcl);
- put_page(page);
+ if (!folio_test_private(zbv.folio)) {
+ folio_attach_private(zbv.folio, pcl);
+ /* compressed_bvecs[] already takes a ref before */
+ folio_put(zbv.folio);
}
/* no need to submit if it is already up-to-date */
- if (PageUptodate(page)) {
- unlock_page(page);
+ if (folio_test_uptodate(zbv.folio)) {
+ folio_unlock(zbv.folio);
bvec->bv_page = NULL;
}
return;
@@ -1500,34 +1481,32 @@ repeat:
* It has been truncated, so it's unsafe to reuse this one. Let's
* allocate a new page for compressed data.
*/
- DBG_BUGON(page->mapping);
- DBG_BUGON(!justfound);
-
+ DBG_BUGON(zbv.folio->mapping);
tocache = true;
- unlock_page(page);
- put_page(page);
-out_allocpage:
+ folio_unlock(zbv.folio);
+ folio_put(zbv.folio);
+out_allocfolio:
page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
spin_lock(&pcl->obj.lockref.lock);
- if (pcl->compressed_bvecs[nr].page) {
+ if (pcl->compressed_bvecs[nr].folio) {
erofs_pagepool_add(&f->pagepool, page);
spin_unlock(&pcl->obj.lockref.lock);
cond_resched();
goto repeat;
}
- pcl->compressed_bvecs[nr].page = page;
+ pcl->compressed_bvecs[nr].folio = zbv.folio = page_folio(page);
spin_unlock(&pcl->obj.lockref.lock);
bvec->bv_page = page;
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
- add_to_page_cache_lru(page, mc, pcl->obj.index + nr, gfp)) {
- /* turn into a temporary shortlived page (1 ref) */
- set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
+ filemap_add_folio(mc, zbv.folio, pcl->obj.index + nr, gfp)) {
+ /* turn into a temporary shortlived folio (1 ref) */
+ zbv.folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
return;
}
- attach_page_private(page, pcl);
+ folio_attach_private(zbv.folio, pcl);
/* drop a refcount added by allocpage (then 2 refs in total here) */
- put_page(page);
+ folio_put(zbv.folio);
}
static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
@@ -1582,28 +1561,29 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
qtail[JQ_BYPASS] = &pcl->next;
}
-static void z_erofs_submissionqueue_endio(struct bio *bio)
+static void z_erofs_endio(struct bio *bio)
{
struct z_erofs_decompressqueue *q = bio->bi_private;
blk_status_t err = bio->bi_status;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
- DBG_BUGON(PageUptodate(page));
- DBG_BUGON(z_erofs_page_is_invalidated(page));
- if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
- if (!err)
- SetPageUptodate(page);
- unlock_page(page);
- }
+ DBG_BUGON(folio_test_uptodate(folio));
+ DBG_BUGON(z_erofs_page_is_invalidated(&folio->page));
+ if (!erofs_folio_is_managed(EROFS_SB(q->sb), folio))
+ continue;
+
+ if (!err)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
}
if (err)
q->eio = true;
z_erofs_decompress_kickoff(q, -1);
- bio_put(bio);
+ if (bio->bi_bdev)
+ bio_put(bio);
}
static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
@@ -1617,7 +1597,6 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
z_erofs_next_pcluster_t owned_head = f->owned_head;
/* bio is NULL initially, so no need to initialize last_{index,bdev} */
erofs_off_t last_pa;
- struct block_device *last_bdev;
unsigned int nr_bios = 0;
struct bio *bio = NULL;
unsigned long pflags;
@@ -1664,9 +1643,13 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
continue;
if (bio && (cur != last_pa ||
- last_bdev != mdev.m_bdev)) {
-submit_bio_retry:
- submit_bio(bio);
+ bio->bi_bdev != mdev.m_bdev)) {
+io_retry:
+ if (!erofs_is_fscache_mode(sb))
+ submit_bio(bio);
+ else
+ erofs_fscache_submit_bio(bio);
+
if (memstall) {
psi_memstall_leave(&pflags);
memstall = 0;
@@ -1681,15 +1664,16 @@ submit_bio_retry:
}
if (!bio) {
- bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
- REQ_OP_READ, GFP_NOIO);
- bio->bi_end_io = z_erofs_submissionqueue_endio;
+ bio = erofs_is_fscache_mode(sb) ?
+ erofs_fscache_bio_alloc(&mdev) :
+ bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
+ REQ_OP_READ, GFP_NOIO);
+ bio->bi_end_io = z_erofs_endio;
bio->bi_iter.bi_sector = cur >> 9;
bio->bi_private = q[JQ_SUBMIT];
if (readahead)
bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
- last_bdev = mdev.m_bdev;
}
if (cur + bvec.bv_len > end)
@@ -1697,7 +1681,7 @@ submit_bio_retry:
DBG_BUGON(bvec.bv_len < sb->s_blocksize);
if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
bvec.bv_offset))
- goto submit_bio_retry;
+ goto io_retry;
last_pa = cur + bvec.bv_len;
bypass = false;
@@ -1710,7 +1694,10 @@ submit_bio_retry:
} while (owned_head != Z_EROFS_PCLUSTER_TAIL);
if (bio) {
- submit_bio(bio);
+ if (!erofs_is_fscache_mode(sb))
+ submit_bio(bio);
+ else
+ erofs_fscache_submit_bio(bio);
if (memstall)
psi_memstall_leave(&pflags);
}
@@ -1795,7 +1782,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
if (PageUptodate(page))
unlock_page(page);
else
- (void)z_erofs_do_read_page(f, page, !!rac);
+ z_erofs_scan_folio(f, page_folio(page), !!rac);
put_page(page);
}
@@ -1816,7 +1803,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
z_erofs_pcluster_readmore(&f, NULL, true);
- err = z_erofs_do_read_page(&f, &folio->page, false);
+ err = z_erofs_scan_folio(&f, folio, false);
z_erofs_pcluster_readmore(&f, NULL, false);
z_erofs_pcluster_end(&f);
@@ -1857,7 +1844,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
folio = head;
head = folio_get_private(folio);
- err = z_erofs_do_read_page(&f, &folio->page, true);
+ err = z_erofs_scan_folio(&f, folio, true);
if (err && err != -EINTR)
erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
folio->index, EROFS_I(inode)->nid);
diff --git a/fs/eventfd.c b/fs/eventfd.c
index ad8186d47ba7..9afdb722fa92 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -251,7 +251,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
ssize_t res;
__u64 ucnt;
- if (count < sizeof(ucnt))
+ if (count != sizeof(ucnt))
return -EINVAL;
if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
return -EFAULT;
@@ -283,13 +283,18 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
struct eventfd_ctx *ctx = f->private_data;
+ __u64 cnt;
spin_lock_irq(&ctx->wqh.lock);
- seq_printf(m, "eventfd-count: %16llx\n",
- (unsigned long long)ctx->count);
+ cnt = ctx->count;
spin_unlock_irq(&ctx->wqh.lock);
- seq_printf(m, "eventfd-id: %d\n", ctx->id);
- seq_printf(m, "eventfd-semaphore: %d\n",
+
+ seq_printf(m,
+ "eventfd-count: %16llx\n"
+ "eventfd-id: %d\n"
+ "eventfd-semaphore: %d\n",
+ cnt,
+ ctx->id,
!!(ctx->flags & EFD_SEMAPHORE));
}
#endif
@@ -383,6 +388,7 @@ static int do_eventfd(unsigned int count, int flags)
/* Check the EFD_* constants for consistency. */
BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
+ BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0));
if (flags & ~EFD_FLAGS_SET)
return -EINVAL;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3534d36a1474..882b89edc52a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -37,6 +37,7 @@
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
+#include <linux/capability.h>
#include <net/busy_poll.h>
/*
@@ -206,7 +207,7 @@ struct eventpoll {
*/
struct epitem *ovflist;
- /* wakeup_source used when ep_scan_ready_list is running */
+ /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */
struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */
@@ -227,6 +228,11 @@ struct eventpoll {
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
+ /* busy poll timeout */
+ u32 busy_poll_usecs;
+ /* busy poll packet budget */
+ u16 busy_poll_budget;
+ bool prefer_busy_poll;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -387,11 +393,41 @@ static inline int ep_events_available(struct eventpoll *ep)
}
#ifdef CONFIG_NET_RX_BUSY_POLL
+/**
+ * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value
+ * from the epoll instance ep is preferred, but if it is not set fall back to
+ * the system-wide global via busy_loop_timeout.
+ *
+ * @start_time: The start time used to compute the remaining time until timeout.
+ * @ep: Pointer to the eventpoll context.
+ *
+ * Return: true if the timeout has expired, false otherwise.
+ */
+static bool busy_loop_ep_timeout(unsigned long start_time,
+ struct eventpoll *ep)
+{
+ unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs);
+
+ if (bp_usec) {
+ unsigned long end_time = start_time + bp_usec;
+ unsigned long now = busy_loop_current_time();
+
+ return time_after(now, end_time);
+ } else {
+ return busy_loop_timeout(start_time);
+ }
+}
+
+static bool ep_busy_loop_on(struct eventpoll *ep)
+{
+ return !!ep->busy_poll_usecs || net_busy_loop_on();
+}
+
static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
struct eventpoll *ep = p;
- return ep_events_available(ep) || busy_loop_timeout(start_time);
+ return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep);
}
/*
@@ -403,10 +439,15 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
+ u16 budget = READ_ONCE(ep->busy_poll_budget);
+ bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
- if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
- napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
- BUSY_POLL_BUDGET);
+ if (!budget)
+ budget = BUSY_POLL_BUDGET;
+
+ if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) {
+ napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end,
+ ep, prefer_busy_poll, budget);
if (ep_events_available(ep))
return true;
/*
@@ -425,12 +466,12 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
*/
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
- struct eventpoll *ep;
+ struct eventpoll *ep = epi->ep;
unsigned int napi_id;
struct socket *sock;
struct sock *sk;
- if (!net_busy_loop_on())
+ if (!ep_busy_loop_on(ep))
return;
sock = sock_from_file(epi->ffd.file);
@@ -442,7 +483,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
return;
napi_id = READ_ONCE(sk->sk_napi_id);
- ep = epi->ep;
/* Non-NAPI IDs can be rejected
* or
@@ -455,6 +495,49 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
ep->napi_id = napi_id;
}
+static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct eventpoll *ep = file->private_data;
+ void __user *uarg = (void __user *)arg;
+ struct epoll_params epoll_params;
+
+ switch (cmd) {
+ case EPIOCSPARAMS:
+ if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params)))
+ return -EFAULT;
+
+ /* pad byte must be zero */
+ if (epoll_params.__pad)
+ return -EINVAL;
+
+ if (epoll_params.busy_poll_usecs > S32_MAX)
+ return -EINVAL;
+
+ if (epoll_params.prefer_busy_poll > 1)
+ return -EINVAL;
+
+ if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT &&
+ !capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs);
+ WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget);
+ WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll);
+ return 0;
+ case EPIOCGPARAMS:
+ memset(&epoll_params, 0, sizeof(epoll_params));
+ epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs);
+ epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget);
+ epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
+ if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params)))
+ return -EFAULT;
+ return 0;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
#else
static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
@@ -466,6 +549,12 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
}
+static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return -EOPNOTSUPP;
+}
+
#endif /* CONFIG_NET_RX_BUSY_POLL */
/*
@@ -678,12 +767,6 @@ static void ep_done_scan(struct eventpoll *ep,
write_unlock_irq(&ep->lock);
}
-static void epi_rcu_free(struct rcu_head *head)
-{
- struct epitem *epi = container_of(head, struct epitem, rcu);
- kmem_cache_free(epi_cache, epi);
-}
-
static void ep_get(struct eventpoll *ep)
{
refcount_inc(&ep->refcount);
@@ -767,7 +850,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
* ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
* use of the rbn field.
*/
- call_rcu(&epi->rcu, epi_rcu_free);
+ kfree_rcu(epi, rcu);
percpu_counter_dec(&ep->user->epoll_watches);
return ep_refcount_dec_and_test(ep);
@@ -825,6 +908,27 @@ static void ep_clear_and_put(struct eventpoll *ep)
ep_free(ep);
}
+static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret;
+
+ if (!is_file_epoll(file))
+ return -EINVAL;
+
+ switch (cmd) {
+ case EPIOCSPARAMS:
+ case EPIOCGPARAMS:
+ ret = ep_eventpoll_bp_ioctl(file, cmd, arg);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
struct eventpoll *ep = file->private_data;
@@ -931,6 +1035,8 @@ static const struct file_operations eventpoll_fops = {
.release = ep_eventpoll_release,
.poll = ep_eventpoll_poll,
.llseek = noop_llseek,
+ .unlocked_ioctl = ep_eventpoll_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
/*
@@ -1153,7 +1259,7 @@ static inline bool chain_epi_lockless(struct epitem *epi)
* This callback takes a read lock in order not to contend with concurrent
* events from another file descriptor, thus all modifications to ->rdllist
* or ->ovflist are lockless. Read lock is paired with the write lock from
- * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * ep_start/done_scan(), which stops all list modifications and guarantees
* that lists state is seen correctly.
*
* Another thing worth to mention is that ep_poll_callback() can be called
@@ -1751,7 +1857,7 @@ static int ep_send_events(struct eventpoll *ep,
* availability. At this point, no one can insert
* into ep->rdllist besides us. The epoll_ctl()
* callers are locked out by
- * ep_scan_ready_list() holding "mtx" and the
+ * ep_send_events() holding "mtx" and the
* poll callback will queue them in ep->ovflist.
*/
list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1904,7 +2010,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
__set_current_state(TASK_INTERRUPTIBLE);
/*
- * Do the final check under the lock. ep_scan_ready_list()
+ * Do the final check under the lock. ep_start/done_scan()
* plays with two lists (->rdllist and ->ovflist) and there
* is always a race when both lists are empty for short
* period of time although events are pending, so lock is
@@ -2058,6 +2164,11 @@ static int do_epoll_create(int flags)
error = PTR_ERR(file);
goto out_free_fd;
}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ ep->busy_poll_usecs = 0;
+ ep->busy_poll_budget = 0;
+ ep->prefer_busy_poll = false;
+#endif
ep->file = file;
fd_install(fd, file);
return fd;
diff --git a/fs/exec.c b/fs/exec.c
index af4fbb61cd53..cf1df7f16e55 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -895,6 +895,7 @@ int transfer_args_to_stack(struct linux_binprm *bprm,
goto out;
}
+ bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE;
*sp_location = sp;
out:
@@ -1158,7 +1159,6 @@ static int de_thread(struct task_struct *tsk)
BUG_ON(leader->exit_state != EXIT_ZOMBIE);
leader->exit_state = EXIT_DEAD;
-
/*
* We are going to release_task()->ptrace_unlink() silently,
* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
@@ -1720,7 +1720,6 @@ static int prepare_binprm(struct linux_binprm *bprm)
*/
int remove_arg_zero(struct linux_binprm *bprm)
{
- int ret = 0;
unsigned long offset;
char *kaddr;
struct page *page;
@@ -1731,10 +1730,8 @@ int remove_arg_zero(struct linux_binprm *bprm)
do {
offset = bprm->p & ~PAGE_MASK;
page = get_arg_page(bprm, bprm->p, 0);
- if (!page) {
- ret = -EFAULT;
- goto out;
- }
+ if (!page)
+ return -EFAULT;
kaddr = kmap_local_page(page);
for (; offset < PAGE_SIZE && kaddr[offset];
@@ -1747,10 +1744,8 @@ int remove_arg_zero(struct linux_binprm *bprm)
bprm->p++;
bprm->argc--;
- ret = 0;
-out:
- return ret;
+ return 0;
}
EXPORT_SYMBOL(remove_arg_zero);
diff --git a/fs/exfat/cache.c b/fs/exfat/cache.c
index 5a2f119b7e8c..7cc200d89821 100644
--- a/fs/exfat/cache.c
+++ b/fs/exfat/cache.c
@@ -46,7 +46,7 @@ int exfat_cache_init(void)
{
exfat_cachep = kmem_cache_create("exfat_cache",
sizeof(struct exfat_cache),
- 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+ 0, SLAB_RECLAIM_ACCOUNT,
exfat_cache_init_once);
if (!exfat_cachep)
return -ENOMEM;
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 9f9295847a4e..077944d3c2c0 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -448,88 +448,34 @@ static void exfat_init_name_entry(struct exfat_dentry *ep,
}
}
-int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
- int entry, unsigned int type, unsigned int start_clu,
- unsigned long long size)
+void exfat_init_dir_entry(struct exfat_entry_set_cache *es,
+ unsigned int type, unsigned int start_clu,
+ unsigned long long size, struct timespec64 *ts)
{
- struct super_block *sb = inode->i_sb;
+ struct super_block *sb = es->sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct timespec64 ts = current_time(inode);
struct exfat_dentry *ep;
- struct buffer_head *bh;
-
- /*
- * We cannot use exfat_get_dentry_set here because file ep is not
- * initialized yet.
- */
- ep = exfat_get_dentry(sb, p_dir, entry, &bh);
- if (!ep)
- return -EIO;
+ ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
exfat_set_entry_type(ep, type);
- exfat_set_entry_time(sbi, &ts,
+ exfat_set_entry_time(sbi, ts,
&ep->dentry.file.create_tz,
&ep->dentry.file.create_time,
&ep->dentry.file.create_date,
&ep->dentry.file.create_time_cs);
- exfat_set_entry_time(sbi, &ts,
+ exfat_set_entry_time(sbi, ts,
&ep->dentry.file.modify_tz,
&ep->dentry.file.modify_time,
&ep->dentry.file.modify_date,
&ep->dentry.file.modify_time_cs);
- exfat_set_entry_time(sbi, &ts,
+ exfat_set_entry_time(sbi, ts,
&ep->dentry.file.access_tz,
&ep->dentry.file.access_time,
&ep->dentry.file.access_date,
NULL);
- exfat_update_bh(bh, IS_DIRSYNC(inode));
- brelse(bh);
-
- ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh);
- if (!ep)
- return -EIO;
-
+ ep = exfat_get_dentry_cached(es, ES_IDX_STREAM);
exfat_init_stream_entry(ep, start_clu, size);
- exfat_update_bh(bh, IS_DIRSYNC(inode));
- brelse(bh);
-
- return 0;
-}
-
-int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
- int entry)
-{
- struct super_block *sb = inode->i_sb;
- int ret = 0;
- int i, num_entries;
- u16 chksum;
- struct exfat_dentry *ep, *fep;
- struct buffer_head *fbh, *bh;
-
- fep = exfat_get_dentry(sb, p_dir, entry, &fbh);
- if (!fep)
- return -EIO;
-
- num_entries = fep->dentry.file.num_ext + 1;
- chksum = exfat_calc_chksum16(fep, DENTRY_SIZE, 0, CS_DIR_ENTRY);
-
- for (i = 1; i < num_entries; i++) {
- ep = exfat_get_dentry(sb, p_dir, entry + i, &bh);
- if (!ep) {
- ret = -EIO;
- goto release_fbh;
- }
- chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum,
- CS_DEFAULT);
- brelse(bh);
- }
-
- fep->dentry.file.checksum = cpu_to_le16(chksum);
- exfat_update_bh(fbh, IS_DIRSYNC(inode));
-release_fbh:
- brelse(fbh);
- return ret;
}
static void exfat_free_benign_secondary_clusters(struct inode *inode,
@@ -551,76 +497,49 @@ static void exfat_free_benign_secondary_clusters(struct inode *inode,
exfat_free_cluster(inode, &dir);
}
-int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
- int entry, int num_entries, struct exfat_uni_name *p_uniname)
+void exfat_init_ext_entry(struct exfat_entry_set_cache *es, int num_entries,
+ struct exfat_uni_name *p_uniname)
{
- struct super_block *sb = inode->i_sb;
int i;
unsigned short *uniname = p_uniname->name;
struct exfat_dentry *ep;
- struct buffer_head *bh;
- int sync = IS_DIRSYNC(inode);
-
- ep = exfat_get_dentry(sb, p_dir, entry, &bh);
- if (!ep)
- return -EIO;
+ ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
ep->dentry.file.num_ext = (unsigned char)(num_entries - 1);
- exfat_update_bh(bh, sync);
- brelse(bh);
-
- ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh);
- if (!ep)
- return -EIO;
+ ep = exfat_get_dentry_cached(es, ES_IDX_STREAM);
ep->dentry.stream.name_len = p_uniname->name_len;
ep->dentry.stream.name_hash = cpu_to_le16(p_uniname->name_hash);
- exfat_update_bh(bh, sync);
- brelse(bh);
-
- for (i = EXFAT_FIRST_CLUSTER; i < num_entries; i++) {
- ep = exfat_get_dentry(sb, p_dir, entry + i, &bh);
- if (!ep)
- return -EIO;
-
- if (exfat_get_entry_type(ep) & TYPE_BENIGN_SEC)
- exfat_free_benign_secondary_clusters(inode, ep);
+ for (i = ES_IDX_FIRST_FILENAME; i < num_entries; i++) {
+ ep = exfat_get_dentry_cached(es, i);
exfat_init_name_entry(ep, uniname);
- exfat_update_bh(bh, sync);
- brelse(bh);
uniname += EXFAT_FILE_NAME_LEN;
}
- exfat_update_dir_chksum(inode, p_dir, entry);
- return 0;
+ exfat_update_dir_chksum(es);
}
-int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir,
- int entry, int order, int num_entries)
+void exfat_remove_entries(struct inode *inode, struct exfat_entry_set_cache *es,
+ int order)
{
- struct super_block *sb = inode->i_sb;
int i;
struct exfat_dentry *ep;
- struct buffer_head *bh;
- for (i = order; i < num_entries; i++) {
- ep = exfat_get_dentry(sb, p_dir, entry + i, &bh);
- if (!ep)
- return -EIO;
+ for (i = order; i < es->num_entries; i++) {
+ ep = exfat_get_dentry_cached(es, i);
if (exfat_get_entry_type(ep) & TYPE_BENIGN_SEC)
exfat_free_benign_secondary_clusters(inode, ep);
exfat_set_entry_type(ep, TYPE_DELETED);
- exfat_update_bh(bh, IS_DIRSYNC(inode));
- brelse(bh);
}
- return 0;
+ if (order < es->num_entries)
+ es->modified = true;
}
-void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es)
+void exfat_update_dir_chksum(struct exfat_entry_set_cache *es)
{
int chksum_type = CS_DIR_ENTRY, i;
unsigned short chksum = 0;
@@ -775,7 +694,6 @@ struct exfat_dentry *exfat_get_dentry(struct super_block *sb,
}
enum exfat_validate_dentry_mode {
- ES_MODE_STARTED,
ES_MODE_GET_FILE_ENTRY,
ES_MODE_GET_STRM_ENTRY,
ES_MODE_GET_NAME_ENTRY,
@@ -790,11 +708,6 @@ static bool exfat_validate_entry(unsigned int type,
return false;
switch (*mode) {
- case ES_MODE_STARTED:
- if (type != TYPE_FILE && type != TYPE_DIR)
- return false;
- *mode = ES_MODE_GET_FILE_ENTRY;
- break;
case ES_MODE_GET_FILE_ENTRY:
if (type != TYPE_STREAM)
return false;
@@ -834,7 +747,7 @@ struct exfat_dentry *exfat_get_dentry_cached(
}
/*
- * Returns a set of dentries for a file or dir.
+ * Returns a set of dentries.
*
* Note It provides a direct pointer to bh->data via exfat_get_dentry_cached().
* User should call exfat_put_dentry_set() after setting 'modified' to apply
@@ -842,22 +755,24 @@ struct exfat_dentry *exfat_get_dentry_cached(
*
* in:
* sb+p_dir+entry: indicates a file/dir
- * type: specifies how many dentries should be included.
+ * num_entries: specifies how many dentries should be included.
+ * If it is not 0, es->num_entries will be set to it.
+ * If num_entries is 0, es->num_entries will be obtained
+ * from the first dentry.
+ * out:
+ * es: pointer of entry set on success.
* return:
- * pointer of entry set on success,
- * NULL on failure.
+ * 0 on success
+ * -error code on failure
*/
-int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
+static int __exfat_get_dentry_set(struct exfat_entry_set_cache *es,
struct super_block *sb, struct exfat_chain *p_dir, int entry,
- unsigned int type)
+ unsigned int num_entries)
{
int ret, i, num_bh;
unsigned int off;
sector_t sec;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct exfat_dentry *ep;
- int num_entries;
- enum exfat_validate_dentry_mode mode = ES_MODE_STARTED;
struct buffer_head *bh;
if (p_dir->dir == DIR_DELETED) {
@@ -880,12 +795,18 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
return -EIO;
es->bh[es->num_bh++] = bh;
- ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
- if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
- goto put_es;
+ if (num_entries == ES_ALL_ENTRIES) {
+ struct exfat_dentry *ep;
+
+ ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
+ if (ep->type != EXFAT_FILE) {
+ brelse(bh);
+ return -EIO;
+ }
+
+ num_entries = ep->dentry.file.num_ext + 1;
+ }
- num_entries = type == ES_ALL_ENTRIES ?
- ep->dentry.file.num_ext + 1 : type;
es->num_entries = num_entries;
num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb);
@@ -918,8 +839,27 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
es->bh[es->num_bh++] = bh;
}
+ return 0;
+
+put_es:
+ exfat_put_dentry_set(es, false);
+ return -EIO;
+}
+
+int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
+ struct super_block *sb, struct exfat_chain *p_dir,
+ int entry, unsigned int num_entries)
+{
+ int ret, i;
+ struct exfat_dentry *ep;
+ enum exfat_validate_dentry_mode mode = ES_MODE_GET_FILE_ENTRY;
+
+ ret = __exfat_get_dentry_set(es, sb, p_dir, entry, num_entries);
+ if (ret < 0)
+ return ret;
+
/* validate cached dentries */
- for (i = ES_IDX_STREAM; i < num_entries; i++) {
+ for (i = ES_IDX_STREAM; i < es->num_entries; i++) {
ep = exfat_get_dentry_cached(es, i);
if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
goto put_es;
@@ -931,6 +871,85 @@ put_es:
return -EIO;
}
+static int exfat_validate_empty_dentry_set(struct exfat_entry_set_cache *es)
+{
+ struct exfat_dentry *ep;
+ struct buffer_head *bh;
+ int i, off;
+ bool unused_hit = false;
+
+ /*
+ * ONLY UNUSED OR DELETED DENTRIES ARE ALLOWED:
+ * Although it violates the specification for a deleted entry to
+ * follow an unused entry, some exFAT implementations could work
+ * like this. Therefore, to improve compatibility, let's allow it.
+ */
+ for (i = 0; i < es->num_entries; i++) {
+ ep = exfat_get_dentry_cached(es, i);
+ if (ep->type == EXFAT_UNUSED) {
+ unused_hit = true;
+ } else if (!IS_EXFAT_DELETED(ep->type)) {
+ if (unused_hit)
+ goto err_used_follow_unused;
+ i++;
+ goto count_skip_entries;
+ }
+ }
+
+ return 0;
+
+err_used_follow_unused:
+ off = es->start_off + (i << DENTRY_SIZE_BITS);
+ bh = es->bh[EXFAT_B_TO_BLK(off, es->sb)];
+
+ exfat_fs_error(es->sb,
+ "in sector %lld, dentry %d should be unused, but 0x%x",
+ bh->b_blocknr, off >> DENTRY_SIZE_BITS, ep->type);
+
+ return -EIO;
+
+count_skip_entries:
+ es->num_entries = EXFAT_B_TO_DEN(EXFAT_BLK_TO_B(es->num_bh, es->sb) - es->start_off);
+ for (; i < es->num_entries; i++) {
+ ep = exfat_get_dentry_cached(es, i);
+ if (IS_EXFAT_DELETED(ep->type))
+ break;
+ }
+
+ return i;
+}
+
+/*
+ * Get an empty dentry set.
+ *
+ * in:
+ * sb+p_dir+entry: indicates the empty dentry location
+ * num_entries: specifies how many empty dentries should be included.
+ * out:
+ * es: pointer of empty dentry set on success.
+ * return:
+ * 0 : on success
+ * >0 : the dentries are not empty, the return value is the number of
+ * dentries to be skipped for the next lookup.
+ * <0 : on failure
+ */
+int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es,
+ struct super_block *sb, struct exfat_chain *p_dir,
+ int entry, unsigned int num_entries)
+{
+ int ret;
+
+ ret = __exfat_get_dentry_set(es, sb, p_dir, entry, num_entries);
+ if (ret < 0)
+ return ret;
+
+ ret = exfat_validate_empty_dentry_set(es);
+ if (ret)
+ exfat_put_dentry_set(es, false);
+
+ return ret;
+}
+
static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp)
{
hint_femp->eidx = EXFAT_HINT_NONE;
@@ -1187,27 +1206,6 @@ found:
return dentry - num_ext;
}
-int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir,
- int entry, struct exfat_dentry *ep)
-{
- int i, count = 0;
- unsigned int type;
- struct exfat_dentry *ext_ep;
- struct buffer_head *bh;
-
- for (i = 0, entry++; i < ep->dentry.file.num_ext; i++, entry++) {
- ext_ep = exfat_get_dentry(sb, p_dir, entry, &bh);
- if (!ext_ep)
- return -EIO;
-
- type = exfat_get_entry_type(ext_ep);
- brelse(bh);
- if (type & TYPE_CRITICAL_SEC || type & TYPE_BENIGN_SEC)
- count++;
- }
- return count;
-}
-
int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
{
int i, count = 0;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 361595433480..ecc5db952deb 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -431,8 +431,6 @@ int exfat_ent_get(struct super_block *sb, unsigned int loc,
unsigned int *content);
int exfat_ent_set(struct super_block *sb, unsigned int loc,
unsigned int content);
-int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir,
- int entry, struct exfat_dentry *p_entry);
int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain,
unsigned int len);
int exfat_zeroed_cluster(struct inode *dir, unsigned int clu);
@@ -480,16 +478,14 @@ int exfat_get_cluster(struct inode *inode, unsigned int cluster,
extern const struct inode_operations exfat_dir_inode_operations;
extern const struct file_operations exfat_dir_operations;
unsigned int exfat_get_entry_type(struct exfat_dentry *p_entry);
-int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
- int entry, unsigned int type, unsigned int start_clu,
- unsigned long long size);
-int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
- int entry, int num_entries, struct exfat_uni_name *p_uniname);
-int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir,
- int entry, int order, int num_entries);
-int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
- int entry);
-void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es);
+void exfat_init_dir_entry(struct exfat_entry_set_cache *es,
+ unsigned int type, unsigned int start_clu,
+ unsigned long long size, struct timespec64 *ts);
+void exfat_init_ext_entry(struct exfat_entry_set_cache *es, int num_entries,
+ struct exfat_uni_name *p_uniname);
+void exfat_remove_entries(struct inode *inode, struct exfat_entry_set_cache *es,
+ int order);
+void exfat_update_dir_chksum(struct exfat_entry_set_cache *es);
int exfat_calc_num_entries(struct exfat_uni_name *p_uniname);
int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname,
@@ -501,7 +497,10 @@ struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es,
int num);
int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
struct super_block *sb, struct exfat_chain *p_dir, int entry,
- unsigned int type);
+ unsigned int num_entries);
+int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es,
+ struct super_block *sb, struct exfat_chain *p_dir, int entry,
+ unsigned int num_entries);
int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync);
int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir);
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 0687f952956c..dd894e558c91 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -94,7 +94,7 @@ int __exfat_write_inode(struct inode *inode, int sync)
ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER;
}
- exfat_update_dir_chksum_with_entry_set(&es);
+ exfat_update_dir_chksum(&es);
return exfat_put_dentry_set(&es, sync);
}
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 9c549fd11fc8..631ad9e8e32a 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -204,21 +204,16 @@ const struct dentry_operations exfat_utf8_dentry_ops = {
.d_compare = exfat_utf8_d_cmp,
};
-/* used only in search empty_slot() */
-#define CNT_UNUSED_NOHIT (-1)
-#define CNT_UNUSED_HIT (-2)
/* search EMPTY CONTINUOUS "num_entries" entries */
static int exfat_search_empty_slot(struct super_block *sb,
struct exfat_hint_femp *hint_femp, struct exfat_chain *p_dir,
- int num_entries)
+ int num_entries, struct exfat_entry_set_cache *es)
{
- int i, dentry, num_empty = 0;
+ int i, dentry, ret;
int dentries_per_clu;
- unsigned int type;
struct exfat_chain clu;
- struct exfat_dentry *ep;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct buffer_head *bh;
+ int total_entries = EXFAT_CLU_TO_DEN(p_dir->size, sbi);
dentries_per_clu = sbi->dentries_per_clu;
@@ -231,7 +226,7 @@ static int exfat_search_empty_slot(struct super_block *sb,
* Otherwise, and if "dentry + hint_femp->count" is also equal
* to "p_dir->size * dentries_per_clu", it means ENOSPC.
*/
- if (dentry + hint_femp->count == p_dir->size * dentries_per_clu &&
+ if (dentry + hint_femp->count == total_entries &&
num_entries > hint_femp->count)
return -ENOSPC;
@@ -242,69 +237,41 @@ static int exfat_search_empty_slot(struct super_block *sb,
dentry = 0;
}
- while (clu.dir != EXFAT_EOF_CLUSTER) {
+ while (dentry + num_entries < total_entries &&
+ clu.dir != EXFAT_EOF_CLUSTER) {
i = dentry & (dentries_per_clu - 1);
- for (; i < dentries_per_clu; i++, dentry++) {
- ep = exfat_get_dentry(sb, &clu, i, &bh);
- if (!ep)
- return -EIO;
- type = exfat_get_entry_type(ep);
- brelse(bh);
-
- if (type == TYPE_UNUSED || type == TYPE_DELETED) {
- num_empty++;
- if (hint_femp->eidx == EXFAT_HINT_NONE) {
- hint_femp->eidx = dentry;
- hint_femp->count = CNT_UNUSED_NOHIT;
- exfat_chain_set(&hint_femp->cur,
- clu.dir, clu.size, clu.flags);
- }
-
- if (type == TYPE_UNUSED &&
- hint_femp->count != CNT_UNUSED_HIT)
- hint_femp->count = CNT_UNUSED_HIT;
+ ret = exfat_get_empty_dentry_set(es, sb, &clu, i, num_entries);
+ if (ret < 0)
+ return ret;
+ else if (ret == 0)
+ return dentry;
+
+ dentry += ret;
+ i += ret;
+
+ while (i >= dentries_per_clu) {
+ if (clu.flags == ALLOC_NO_FAT_CHAIN) {
+ if (--clu.size > 0)
+ clu.dir++;
+ else
+ clu.dir = EXFAT_EOF_CLUSTER;
} else {
- if (hint_femp->eidx != EXFAT_HINT_NONE &&
- hint_femp->count == CNT_UNUSED_HIT) {
- /* unused empty group means
- * an empty group which includes
- * unused dentry
- */
- exfat_fs_error(sb,
- "found bogus dentry(%d) beyond unused empty group(%d) (start_clu : %u, cur_clu : %u)",
- dentry, hint_femp->eidx,
- p_dir->dir, clu.dir);
+ if (exfat_get_next_cluster(sb, &clu.dir))
return -EIO;
- }
-
- num_empty = 0;
- hint_femp->eidx = EXFAT_HINT_NONE;
}
- if (num_empty >= num_entries) {
- /* found and invalidate hint_femp */
- hint_femp->eidx = EXFAT_HINT_NONE;
- return (dentry - (num_entries - 1));
- }
- }
-
- if (clu.flags == ALLOC_NO_FAT_CHAIN) {
- if (--clu.size > 0)
- clu.dir++;
- else
- clu.dir = EXFAT_EOF_CLUSTER;
- } else {
- if (exfat_get_next_cluster(sb, &clu.dir))
- return -EIO;
+ i -= dentries_per_clu;
}
}
- hint_femp->eidx = p_dir->size * dentries_per_clu - num_empty;
- hint_femp->count = num_empty;
- if (num_empty == 0)
+ hint_femp->eidx = dentry;
+ hint_femp->count = 0;
+ if (dentry == total_entries || clu.dir == EXFAT_EOF_CLUSTER)
exfat_chain_set(&hint_femp->cur, EXFAT_EOF_CLUSTER, 0,
clu.flags);
+ else
+ hint_femp->cur = clu;
return -ENOSPC;
}
@@ -325,7 +292,8 @@ static int exfat_check_max_dentries(struct inode *inode)
* if there isn't any empty slot, expand cluster chain.
*/
static int exfat_find_empty_entry(struct inode *inode,
- struct exfat_chain *p_dir, int num_entries)
+ struct exfat_chain *p_dir, int num_entries,
+ struct exfat_entry_set_cache *es)
{
int dentry;
unsigned int ret, last_clu;
@@ -344,7 +312,7 @@ static int exfat_find_empty_entry(struct inode *inode,
}
while ((dentry = exfat_search_empty_slot(sb, &hint_femp, p_dir,
- num_entries)) < 0) {
+ num_entries, es)) < 0) {
if (dentry == -EIO)
break;
@@ -499,6 +467,8 @@ static int exfat_add_entry(struct inode *inode, const char *path,
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_uni_name uniname;
struct exfat_chain clu;
+ struct timespec64 ts = current_time(inode);
+ struct exfat_entry_set_cache es;
int clu_size = 0;
unsigned int start_clu = EXFAT_FREE_CLUSTER;
@@ -513,7 +483,7 @@ static int exfat_add_entry(struct inode *inode, const char *path,
}
/* exfat_find_empty_entry must be called before alloc_cluster() */
- dentry = exfat_find_empty_entry(inode, p_dir, num_entries);
+ dentry = exfat_find_empty_entry(inode, p_dir, num_entries, &es);
if (dentry < 0) {
ret = dentry; /* -EIO or -ENOSPC */
goto out;
@@ -521,8 +491,10 @@ static int exfat_add_entry(struct inode *inode, const char *path,
if (type == TYPE_DIR && !sbi->options.zero_size_dir) {
ret = exfat_alloc_new_dir(inode, &clu);
- if (ret)
+ if (ret) {
+ exfat_put_dentry_set(&es, false);
goto out;
+ }
start_clu = clu.dir;
clu_size = sbi->cluster_size;
}
@@ -531,12 +503,10 @@ static int exfat_add_entry(struct inode *inode, const char *path,
/* fill the dos name directory entry information of the created file.
* the first cluster is not determined yet. (0)
*/
- ret = exfat_init_dir_entry(inode, p_dir, dentry, type,
- start_clu, clu_size);
- if (ret)
- goto out;
+ exfat_init_dir_entry(&es, type, start_clu, clu_size, &ts);
+ exfat_init_ext_entry(&es, num_entries, &uniname);
- ret = exfat_init_ext_entry(inode, p_dir, dentry, num_entries, &uniname);
+ ret = exfat_put_dentry_set(&es, IS_DIRSYNC(inode));
if (ret)
goto out;
@@ -577,6 +547,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
struct exfat_dir_entry info;
loff_t i_pos;
int err;
+ loff_t size = i_size_read(dir);
mutex_lock(&EXFAT_SB(sb)->s_lock);
exfat_set_volume_dirty(sb);
@@ -587,7 +558,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
inode_inc_iversion(dir);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- if (IS_DIRSYNC(dir))
+ if (IS_DIRSYNC(dir) && size != i_size_read(dir))
exfat_sync_inode(dir);
else
mark_inode_dirty(dir);
@@ -795,12 +766,11 @@ unlock:
static int exfat_unlink(struct inode *dir, struct dentry *dentry)
{
struct exfat_chain cdir;
- struct exfat_dentry *ep;
struct super_block *sb = dir->i_sb;
struct inode *inode = dentry->d_inode;
struct exfat_inode_info *ei = EXFAT_I(inode);
- struct buffer_head *bh;
- int num_entries, entry, err = 0;
+ struct exfat_entry_set_cache es;
+ int entry, err = 0;
mutex_lock(&EXFAT_SB(sb)->s_lock);
exfat_chain_dup(&cdir, &ei->dir);
@@ -811,26 +781,20 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
goto unlock;
}
- ep = exfat_get_dentry(sb, &cdir, entry, &bh);
- if (!ep) {
- err = -EIO;
- goto unlock;
- }
- num_entries = exfat_count_ext_entries(sb, &cdir, entry, ep);
- if (num_entries < 0) {
+ err = exfat_get_dentry_set(&es, sb, &cdir, entry, ES_ALL_ENTRIES);
+ if (err) {
err = -EIO;
- brelse(bh);
goto unlock;
}
- num_entries++;
- brelse(bh);
exfat_set_volume_dirty(sb);
+
/* update the directory entry */
- if (exfat_remove_entries(dir, &cdir, entry, 0, num_entries)) {
- err = -EIO;
+ exfat_remove_entries(inode, &es, ES_IDX_FILE);
+
+ err = exfat_put_dentry_set(&es, IS_DIRSYNC(inode));
+ if (err)
goto unlock;
- }
/* This doesn't modify ei */
ei->dir.dir = DIR_DELETED;
@@ -838,10 +802,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
inode_inc_iversion(dir);
simple_inode_init_ts(dir);
exfat_truncate_inode_atime(dir);
- if (IS_DIRSYNC(dir))
- exfat_sync_inode(dir);
- else
- mark_inode_dirty(dir);
+ mark_inode_dirty(dir);
clear_nlink(inode);
simple_inode_init_ts(inode);
@@ -862,6 +823,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct exfat_chain cdir;
loff_t i_pos;
int err;
+ loff_t size = i_size_read(dir);
mutex_lock(&EXFAT_SB(sb)->s_lock);
exfat_set_volume_dirty(sb);
@@ -872,7 +834,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inode_inc_iversion(dir);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- if (IS_DIRSYNC(dir))
+ if (IS_DIRSYNC(dir) && size != i_size_read(dir))
exfat_sync_inode(dir);
else
mark_inode_dirty(dir);
@@ -946,13 +908,12 @@ static int exfat_check_dir_empty(struct super_block *sb,
static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
- struct exfat_dentry *ep;
struct exfat_chain cdir, clu_to_free;
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- struct buffer_head *bh;
- int num_entries, entry, err;
+ struct exfat_entry_set_cache es;
+ int entry, err;
mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock);
@@ -976,27 +937,20 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
goto unlock;
}
- ep = exfat_get_dentry(sb, &cdir, entry, &bh);
- if (!ep) {
- err = -EIO;
- goto unlock;
- }
-
- num_entries = exfat_count_ext_entries(sb, &cdir, entry, ep);
- if (num_entries < 0) {
+ err = exfat_get_dentry_set(&es, sb, &cdir, entry, ES_ALL_ENTRIES);
+ if (err) {
err = -EIO;
- brelse(bh);
goto unlock;
}
- num_entries++;
- brelse(bh);
exfat_set_volume_dirty(sb);
- err = exfat_remove_entries(dir, &cdir, entry, 0, num_entries);
- if (err) {
- exfat_err(sb, "failed to exfat_remove_entries : err(%d)", err);
+
+ exfat_remove_entries(inode, &es, ES_IDX_FILE);
+
+ err = exfat_put_dentry_set(&es, IS_DIRSYNC(dir));
+ if (err)
goto unlock;
- }
+
ei->dir.dir = DIR_DELETED;
inode_inc_iversion(dir);
@@ -1022,67 +976,52 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
int oldentry, struct exfat_uni_name *p_uniname,
struct exfat_inode_info *ei)
{
- int ret, num_old_entries, num_new_entries;
+ int ret, num_new_entries;
struct exfat_dentry *epold, *epnew;
struct super_block *sb = inode->i_sb;
- struct buffer_head *new_bh, *old_bh;
+ struct exfat_entry_set_cache old_es, new_es;
int sync = IS_DIRSYNC(inode);
- epold = exfat_get_dentry(sb, p_dir, oldentry, &old_bh);
- if (!epold)
- return -EIO;
-
- num_old_entries = exfat_count_ext_entries(sb, p_dir, oldentry, epold);
- if (num_old_entries < 0)
- return -EIO;
- num_old_entries++;
-
num_new_entries = exfat_calc_num_entries(p_uniname);
if (num_new_entries < 0)
return num_new_entries;
- if (num_old_entries < num_new_entries) {
- int newentry;
+ ret = exfat_get_dentry_set(&old_es, sb, p_dir, oldentry, ES_ALL_ENTRIES);
+ if (ret) {
+ ret = -EIO;
+ return ret;
+ }
- newentry =
- exfat_find_empty_entry(inode, p_dir, num_new_entries);
- if (newentry < 0)
- return newentry; /* -EIO or -ENOSPC */
+ epold = exfat_get_dentry_cached(&old_es, ES_IDX_FILE);
- epnew = exfat_get_dentry(sb, p_dir, newentry, &new_bh);
- if (!epnew)
- return -EIO;
+ if (old_es.num_entries < num_new_entries) {
+ int newentry;
+ newentry = exfat_find_empty_entry(inode, p_dir, num_new_entries,
+ &new_es);
+ if (newentry < 0) {
+ ret = newentry; /* -EIO or -ENOSPC */
+ goto put_old_es;
+ }
+
+ epnew = exfat_get_dentry_cached(&new_es, ES_IDX_FILE);
*epnew = *epold;
if (exfat_get_entry_type(epnew) == TYPE_FILE) {
epnew->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
ei->attr |= EXFAT_ATTR_ARCHIVE;
}
- exfat_update_bh(new_bh, sync);
- brelse(old_bh);
- brelse(new_bh);
-
- epold = exfat_get_dentry(sb, p_dir, oldentry + 1, &old_bh);
- if (!epold)
- return -EIO;
- epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh);
- if (!epnew) {
- brelse(old_bh);
- return -EIO;
- }
+ epold = exfat_get_dentry_cached(&old_es, ES_IDX_STREAM);
+ epnew = exfat_get_dentry_cached(&new_es, ES_IDX_STREAM);
*epnew = *epold;
- exfat_update_bh(new_bh, sync);
- brelse(old_bh);
- brelse(new_bh);
- ret = exfat_init_ext_entry(inode, p_dir, newentry,
- num_new_entries, p_uniname);
+ exfat_init_ext_entry(&new_es, num_new_entries, p_uniname);
+
+ ret = exfat_put_dentry_set(&new_es, sync);
if (ret)
- return ret;
+ goto put_old_es;
- exfat_remove_entries(inode, p_dir, oldentry, 0,
- num_old_entries);
+ exfat_remove_entries(inode, &old_es, ES_IDX_FILE);
ei->dir = *p_dir;
ei->entry = newentry;
} else {
@@ -1090,85 +1029,72 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
epold->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
ei->attr |= EXFAT_ATTR_ARCHIVE;
}
- exfat_update_bh(old_bh, sync);
- brelse(old_bh);
- ret = exfat_init_ext_entry(inode, p_dir, oldentry,
- num_new_entries, p_uniname);
- if (ret)
- return ret;
- exfat_remove_entries(inode, p_dir, oldentry, num_new_entries,
- num_old_entries);
+ exfat_remove_entries(inode, &old_es, ES_IDX_FIRST_FILENAME + 1);
+ exfat_init_ext_entry(&old_es, num_new_entries, p_uniname);
}
- return 0;
+ return exfat_put_dentry_set(&old_es, sync);
+
+put_old_es:
+ exfat_put_dentry_set(&old_es, false);
+ return ret;
}
static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
int oldentry, struct exfat_chain *p_newdir,
struct exfat_uni_name *p_uniname, struct exfat_inode_info *ei)
{
- int ret, newentry, num_new_entries, num_old_entries;
+ int ret, newentry, num_new_entries;
struct exfat_dentry *epmov, *epnew;
struct super_block *sb = inode->i_sb;
- struct buffer_head *mov_bh, *new_bh;
-
- epmov = exfat_get_dentry(sb, p_olddir, oldentry, &mov_bh);
- if (!epmov)
- return -EIO;
-
- num_old_entries = exfat_count_ext_entries(sb, p_olddir, oldentry,
- epmov);
- if (num_old_entries < 0)
- return -EIO;
- num_old_entries++;
+ struct exfat_entry_set_cache mov_es, new_es;
num_new_entries = exfat_calc_num_entries(p_uniname);
if (num_new_entries < 0)
return num_new_entries;
- newentry = exfat_find_empty_entry(inode, p_newdir, num_new_entries);
- if (newentry < 0)
- return newentry; /* -EIO or -ENOSPC */
-
- epnew = exfat_get_dentry(sb, p_newdir, newentry, &new_bh);
- if (!epnew)
+ ret = exfat_get_dentry_set(&mov_es, sb, p_olddir, oldentry,
+ ES_ALL_ENTRIES);
+ if (ret)
return -EIO;
+ newentry = exfat_find_empty_entry(inode, p_newdir, num_new_entries,
+ &new_es);
+ if (newentry < 0) {
+ ret = newentry; /* -EIO or -ENOSPC */
+ goto put_mov_es;
+ }
+
+ epmov = exfat_get_dentry_cached(&mov_es, ES_IDX_FILE);
+ epnew = exfat_get_dentry_cached(&new_es, ES_IDX_FILE);
*epnew = *epmov;
if (exfat_get_entry_type(epnew) == TYPE_FILE) {
epnew->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
ei->attr |= EXFAT_ATTR_ARCHIVE;
}
- exfat_update_bh(new_bh, IS_DIRSYNC(inode));
- brelse(mov_bh);
- brelse(new_bh);
-
- epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh);
- if (!epmov)
- return -EIO;
- epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh);
- if (!epnew) {
- brelse(mov_bh);
- return -EIO;
- }
+ epmov = exfat_get_dentry_cached(&mov_es, ES_IDX_STREAM);
+ epnew = exfat_get_dentry_cached(&new_es, ES_IDX_STREAM);
*epnew = *epmov;
- exfat_update_bh(new_bh, IS_DIRSYNC(inode));
- brelse(mov_bh);
- brelse(new_bh);
-
- ret = exfat_init_ext_entry(inode, p_newdir, newentry, num_new_entries,
- p_uniname);
- if (ret)
- return ret;
- exfat_remove_entries(inode, p_olddir, oldentry, 0, num_old_entries);
+ exfat_init_ext_entry(&new_es, num_new_entries, p_uniname);
+ exfat_remove_entries(inode, &mov_es, ES_IDX_FILE);
exfat_chain_set(&ei->dir, p_newdir->dir, p_newdir->size,
p_newdir->flags);
ei->entry = newentry;
- return 0;
+
+ ret = exfat_put_dentry_set(&new_es, IS_DIRSYNC(inode));
+ if (ret)
+ goto put_mov_es;
+
+ return exfat_put_dentry_set(&mov_es, IS_DIRSYNC(inode));
+
+put_mov_es:
+ exfat_put_dentry_set(&mov_es, false);
+
+ return ret;
}
/* rename or move a old file into a new file */
@@ -1186,7 +1112,6 @@ static int __exfat_rename(struct inode *old_parent_inode,
struct exfat_sb_info *sbi = EXFAT_SB(sb);
const unsigned char *new_path = new_dentry->d_name.name;
struct inode *new_inode = new_dentry->d_inode;
- int num_entries;
struct exfat_inode_info *new_ei = NULL;
unsigned int new_entry_type = TYPE_UNUSED;
int new_entry = 0;
@@ -1257,25 +1182,21 @@ static int __exfat_rename(struct inode *old_parent_inode,
&newdir, &uni_name, ei);
if (!ret && new_inode) {
+ struct exfat_entry_set_cache es;
+
/* delete entries of new_dir */
- ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh);
- if (!ep) {
+ ret = exfat_get_dentry_set(&es, sb, p_dir, new_entry,
+ ES_ALL_ENTRIES);
+ if (ret) {
ret = -EIO;
goto del_out;
}
- num_entries = exfat_count_ext_entries(sb, p_dir, new_entry, ep);
- if (num_entries < 0) {
- ret = -EIO;
- goto del_out;
- }
- brelse(new_bh);
+ exfat_remove_entries(new_inode, &es, ES_IDX_FILE);
- if (exfat_remove_entries(new_inode, p_dir, new_entry, 0,
- num_entries + 1)) {
- ret = -EIO;
+ ret = exfat_put_dentry_set(&es, IS_DIRSYNC(new_inode));
+ if (ret)
goto del_out;
- }
/* Free the clusters if new_inode is a dir(as if exfat_rmdir) */
if (new_entry_type == TYPE_DIR &&
@@ -1317,6 +1238,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
struct super_block *sb = old_dir->i_sb;
loff_t i_pos;
int err;
+ loff_t size = i_size_read(new_dir);
/*
* The VFS already checks for existence, so for local filesystems
@@ -1338,7 +1260,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
EXFAT_I(new_dir)->i_crtime = current_time(new_dir);
exfat_truncate_inode_atime(new_dir);
- if (IS_DIRSYNC(new_dir))
+ if (IS_DIRSYNC(new_dir) && size != i_size_read(new_dir))
exfat_sync_inode(new_dir);
else
mark_inode_dirty(new_dir);
@@ -1359,9 +1281,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
}
inode_inc_iversion(old_dir);
- if (IS_DIRSYNC(old_dir))
- exfat_sync_inode(old_dir);
- else
+ if (new_dir != old_dir)
mark_inode_dirty(old_dir);
if (new_inode) {
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index fcb658267765..3d5ea2cfad66 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -813,7 +813,7 @@ static int __init init_exfat_fs(void)
exfat_inode_cachep = kmem_cache_create("exfat_inode_cache",
sizeof(struct exfat_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ 0, SLAB_RECLAIM_ACCOUNT,
exfat_inode_init_once);
if (!exfat_inode_cachep) {
err = -ENOMEM;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 3ae0154c5680..07ea3d62b298 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -255,7 +255,7 @@ static bool filldir_one(struct dir_context *ctx, const char *name, int len,
container_of(ctx, struct getdents_callback, ctx);
buf->sequence++;
- if (buf->ino == ino && len <= NAME_MAX) {
+ if (buf->ino == ino && len <= NAME_MAX && !is_dot_dotdot(name, len)) {
memcpy(buf->name, name, len);
buf->name[len] = '\0';
buf->found = 1;
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 74d98965902e..d6cfb1849580 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,16 +1,23 @@
# SPDX-License-Identifier: GPL-2.0-only
config EXT2_FS
- tristate "Second extended fs support"
+ tristate "Second extended fs support (DEPRECATED)"
select BUFFER_HEAD
select FS_IOMAP
select LEGACY_DIRECT_IO
help
Ext2 is a standard Linux file system for hard disks.
- To compile this file system support as a module, choose M here: the
- module will be called ext2.
+ This filesystem driver is deprecated because it does not properly
+ support inode time stamps beyond 03:14:07 UTC on 19 January 2038.
- If unsure, say Y.
+ Ext2 users are advised to use the ext4 driver to access their filesystem.
+ The driver is fully compatible, supports filesystems without journal
+ or extents, and also supports larger time stamps if the filesystem
+ is created with at least 256 byte inodes.
+
+ This code is kept as a simple reference for filesystem developers.
+
+ If unsure, say N.
config EXT2_FS_XATTR
bool "Ext2 extended attributes"
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index e124f3d709b2..1bfd6ab11038 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -412,7 +412,7 @@ void ext2_init_block_alloc_info(struct inode *inode)
struct ext2_block_alloc_info *block_i;
struct super_block *sb = inode->i_sb;
- block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
+ block_i = kmalloc(sizeof(*block_i), GFP_KERNEL);
if (block_i) {
struct ext2_reserve_window_node *rsv = &block_i->rsv_window_node;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 677a9ad45dcb..f38bdd46e4f7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -674,7 +674,7 @@ struct ext2_inode_info {
struct inode vfs_inode;
struct list_head i_orphan; /* unlinked but open inodes */
#ifdef CONFIG_QUOTA
- struct dquot *i_dquot[MAXQUOTAS];
+ struct dquot __rcu *i_dquot[MAXQUOTAS];
#endif
};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 5a4272b2c6b0..f3d570a9302b 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -754,7 +754,7 @@ static int ext2_get_blocks(struct inode *inode,
*/
err = sb_issue_zeroout(inode->i_sb,
le32_to_cpu(chain[depth-1].key), count,
- GFP_NOFS);
+ GFP_KERNEL);
if (err) {
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 01f9addc8b1f..37f7ce56adce 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -213,8 +213,7 @@ static int __init init_inodecache(void)
{
ext2_inode_cachep = kmem_cache_create_usercopy("ext2_inode_cache",
sizeof(struct ext2_inode_info), 0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
- SLAB_ACCOUNT),
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
offsetof(struct ext2_inode_info, i_data),
sizeof_field(struct ext2_inode_info, i_data),
init_once);
@@ -320,7 +319,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, siz
static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off);
static int ext2_quota_on(struct super_block *sb, int type, int format_id,
const struct path *path);
-static struct dquot **ext2_get_dquots(struct inode *inode)
+static struct dquot __rcu **ext2_get_dquots(struct inode *inode)
{
return EXT2_I(inode)->i_dquot;
}
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index e849241ebb8f..c885dcc3bd0d 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -874,7 +874,7 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
int error;
- error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr,
+ error = mb_cache_entry_create(cache, GFP_KERNEL, hash, bh->b_blocknr,
true);
if (error) {
if (error == -EBUSY) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 023571f8dd1b..8d126654019e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1158,7 +1158,7 @@ struct ext4_inode_info {
tid_t i_datasync_tid;
#ifdef CONFIG_QUOTA
- struct dquot *i_dquot[MAXQUOTAS];
+ struct dquot __rcu *i_dquot[MAXQUOTAS];
#endif
/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
@@ -1550,7 +1550,7 @@ struct ext4_sb_info {
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
- struct bdev_handle *s_journal_bdev_handle;
+ struct file *s_journal_bdev_file;
#ifdef CONFIG_QUOTA
/* Names of quota files with journalled quota */
char __rcu *s_qf_names[EXT4_MAXQUOTAS];
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7669d154c05e..e57054bdc5fd 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4111,10 +4111,10 @@ insert_hole:
*
* Need to be called with
* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
- * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
+ * (ie, flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
*
* return > 0, number of blocks already mapped/allocated
- * if create == 0 and these are pre-allocated blocks
+ * if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks
* buffer head is unmapped
* otherwise blocks are mapped
*
@@ -4218,7 +4218,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
/*
* requested block isn't allocated yet;
- * we couldn't try to create block if create flag is zero
+ * we couldn't try to create block if flags doesn't contain EXT4_GET_BLOCKS_CREATE
*/
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
ext4_lblk_t len;
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 11e6f33677a2..df853c4d3a8c 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -576,9 +576,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
return true;
- if (EXT4_SB(sb)->s_journal_bdev_handle &&
+ if (EXT4_SB(sb)->s_journal_bdev_file &&
fm->fmr_device ==
- new_encode_dev(EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev))
+ new_encode_dev(file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev))
return true;
return false;
}
@@ -648,9 +648,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
memset(handlers, 0, sizeof(handlers));
handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
handlers[0].gfd_fn = ext4_getfsmap_datadev;
- if (EXT4_SB(sb)->s_journal_bdev_handle) {
+ if (EXT4_SB(sb)->s_journal_bdev_file) {
handlers[1].gfd_dev = new_encode_dev(
- EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev);
+ file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev);
handlers[1].gfd_fn = ext4_getfsmap_logdev;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2ccf3b5e3a7c..537803250ca9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -465,9 +465,10 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
* Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
* based files
*
- * On success, it returns the number of blocks being mapped or allocated. if
- * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
- * is marked as unwritten. If the create == 1, it will mark @map as mapped.
+ * On success, it returns the number of blocks being mapped or allocated.
+ * If flags doesn't contain EXT4_GET_BLOCKS_CREATE and the blocks are
+ * pre-allocated and unwritten, the resulting @map is marked as unwritten.
+ * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped.
*
* It returns 0 if plain look up failed (blocks have not been allocated), in
* that case, @map is returned as unmapped but we still do fill map->m_len to
@@ -589,8 +590,7 @@ found:
* Returns if the blocks have already allocated
*
* Note that if blocks have been preallocated
- * ext4_ext_get_block() returns the create = 0
- * with buffer head unmapped.
+ * ext4_ext_map_blocks() returns with buffer head unmapped
*/
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
/*
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index f94901fd3835..044ca5238f41 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -5,6 +5,7 @@
#include <kunit/test.h>
#include <kunit/static_stub.h>
+#include <linux/random.h>
#include "ext4.h"
@@ -20,41 +21,135 @@ struct mbt_ctx {
};
struct mbt_ext4_super_block {
- struct super_block sb;
+ struct ext4_super_block es;
+ struct ext4_sb_info sbi;
struct mbt_ctx mbt_ctx;
};
-#define MBT_CTX(_sb) (&(container_of((_sb), struct mbt_ext4_super_block, sb)->mbt_ctx))
+#define MBT_SB(_sb) (container_of((_sb)->s_fs_info, struct mbt_ext4_super_block, sbi))
+#define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx)
#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group])
+static const struct super_operations mbt_sops = {
+};
+
+static void mbt_kill_sb(struct super_block *sb)
+{
+ generic_shutdown_super(sb);
+}
+
+static struct file_system_type mbt_fs_type = {
+ .name = "mballoc test",
+ .kill_sb = mbt_kill_sb,
+};
+
+static int mbt_mb_init(struct super_block *sb)
+{
+ ext4_fsblk_t block;
+ int ret;
+
+ /* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */
+ sb->s_bdev = kzalloc(sizeof(*sb->s_bdev), GFP_KERNEL);
+ if (sb->s_bdev == NULL)
+ return -ENOMEM;
+
+ sb->s_bdev->bd_queue = kzalloc(sizeof(struct request_queue), GFP_KERNEL);
+ if (sb->s_bdev->bd_queue == NULL) {
+ kfree(sb->s_bdev);
+ return -ENOMEM;
+ }
+
+ /*
+ * needed by ext4_mb_init->ext4_mb_init_backend-> sbi->s_buddy_cache =
+ * new_inode(sb);
+ */
+ INIT_LIST_HEAD(&sb->s_inodes);
+ sb->s_op = &mbt_sops;
+
+ ret = ext4_mb_init(sb);
+ if (ret != 0)
+ goto err_out;
+
+ block = ext4_count_free_clusters(sb);
+ ret = percpu_counter_init(&EXT4_SB(sb)->s_freeclusters_counter, block,
+ GFP_KERNEL);
+ if (ret != 0)
+ goto err_mb_release;
+
+ ret = percpu_counter_init(&EXT4_SB(sb)->s_dirtyclusters_counter, 0,
+ GFP_KERNEL);
+ if (ret != 0)
+ goto err_freeclusters;
+
+ return 0;
+
+err_freeclusters:
+ percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter);
+err_mb_release:
+ ext4_mb_release(sb);
+err_out:
+ kfree(sb->s_bdev->bd_queue);
+ kfree(sb->s_bdev);
+ return ret;
+}
+
+static void mbt_mb_release(struct super_block *sb)
+{
+ percpu_counter_destroy(&EXT4_SB(sb)->s_dirtyclusters_counter);
+ percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter);
+ ext4_mb_release(sb);
+ kfree(sb->s_bdev->bd_queue);
+ kfree(sb->s_bdev);
+}
+
+static int mbt_set(struct super_block *sb, void *data)
+{
+ return 0;
+}
+
static struct super_block *mbt_ext4_alloc_super_block(void)
{
- struct ext4_super_block *es = kzalloc(sizeof(*es), GFP_KERNEL);
- struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- struct mbt_ext4_super_block *fsb = kzalloc(sizeof(*fsb), GFP_KERNEL);
+ struct mbt_ext4_super_block *fsb;
+ struct super_block *sb;
+ struct ext4_sb_info *sbi;
+
+ fsb = kzalloc(sizeof(*fsb), GFP_KERNEL);
+ if (fsb == NULL)
+ return NULL;
- if (fsb == NULL || sbi == NULL || es == NULL)
+ sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL);
+ if (IS_ERR(sb))
goto out;
- sbi->s_es = es;
- fsb->sb.s_fs_info = sbi;
- return &fsb->sb;
+ sbi = &fsb->sbi;
+
+ sbi->s_blockgroup_lock =
+ kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+ if (!sbi->s_blockgroup_lock)
+ goto out_deactivate;
+
+ bgl_lock_init(sbi->s_blockgroup_lock);
+
+ sbi->s_es = &fsb->es;
+ sb->s_fs_info = sbi;
+
+ up_write(&sb->s_umount);
+ return sb;
+out_deactivate:
+ deactivate_locked_super(sb);
out:
kfree(fsb);
- kfree(sbi);
- kfree(es);
return NULL;
}
static void mbt_ext4_free_super_block(struct super_block *sb)
{
- struct mbt_ext4_super_block *fsb =
- container_of(sb, struct mbt_ext4_super_block, sb);
+ struct mbt_ext4_super_block *fsb = MBT_SB(sb);
struct ext4_sb_info *sbi = EXT4_SB(sb);
- kfree(sbi->s_es);
- kfree(sbi);
+ kfree(sbi->s_blockgroup_lock);
+ deactivate_super(sb);
kfree(fsb);
}
@@ -82,6 +177,9 @@ static void mbt_init_sb_layout(struct super_block *sb,
sbi->s_clusters_per_group = layout->blocks_per_group >>
layout->cluster_bits;
sbi->s_desc_size = layout->desc_size;
+ sbi->s_desc_per_block_bits =
+ sb->s_blocksize_bits - (fls(layout->desc_size) - 1);
+ sbi->s_desc_per_block = 1 << sbi->s_desc_per_block_bits;
es->s_first_data_block = cpu_to_le32(0);
es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group *
@@ -91,9 +189,13 @@ static void mbt_init_sb_layout(struct super_block *sb,
static int mbt_grp_ctx_init(struct super_block *sb,
struct mbt_grp_ctx *grp_ctx)
{
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+
grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL);
if (grp_ctx->bitmap_bh.b_data == NULL)
return -ENOMEM;
+ mb_set_bits(grp_ctx->bitmap_bh.b_data, max, sb->s_blocksize * 8 - max);
+ ext4_free_group_clusters_set(sb, &grp_ctx->desc, max);
return 0;
}
@@ -112,6 +214,13 @@ static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group,
mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len);
}
+static void *mbt_ctx_bitmap(struct super_block *sb, ext4_group_t group)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+
+ return grp_ctx->bitmap_bh.b_data;
+}
+
/* called after mbt_init_sb_layout */
static int mbt_ctx_init(struct super_block *sb)
{
@@ -133,6 +242,8 @@ static int mbt_ctx_init(struct super_block *sb)
* block which will fail ext4_sb_block_valid check.
*/
mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1);
+ ext4_free_group_clusters_set(sb, &ctx->grp_ctx[0].desc,
+ EXT4_CLUSTERS_PER_GROUP(sb) - 1);
return 0;
out:
@@ -167,6 +278,13 @@ static int ext4_wait_block_bitmap_stub(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head *bh)
{
+ /*
+ * real ext4_wait_block_bitmap will set these flags and
+ * functions like ext4_mb_init_cache will verify the flags.
+ */
+ set_buffer_uptodate(bh);
+ set_bitmap_uptodate(bh);
+ set_buffer_verified(bh);
return 0;
}
@@ -232,6 +350,14 @@ static int mbt_kunit_init(struct kunit *test)
kunit_activate_static_stub(test,
ext4_mb_mark_context,
ext4_mb_mark_context_stub);
+
+ /* stub function will be called in mbt_mb_init->ext4_mb_init */
+ if (mbt_mb_init(sb) != 0) {
+ mbt_ctx_release(sb);
+ mbt_ext4_free_super_block(sb);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -239,6 +365,7 @@ static void mbt_kunit_exit(struct kunit *test)
{
struct super_block *sb = (struct super_block *)test->priv;
+ mbt_mb_release(sb);
mbt_ctx_release(sb);
mbt_ext4_free_super_block(sb);
}
@@ -246,14 +373,19 @@ static void mbt_kunit_exit(struct kunit *test)
static void test_new_blocks_simple(struct kunit *test)
{
struct super_block *sb = (struct super_block *)test->priv;
- struct inode inode = { .i_sb = sb, };
+ struct inode *inode;
struct ext4_allocation_request ar;
ext4_group_t i, goal_group = TEST_GOAL_GROUP;
int err = 0;
ext4_fsblk_t found;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- ar.inode = &inode;
+ inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+ if (!inode)
+ return;
+
+ inode->i_sb = sb;
+ ar.inode = inode;
/* get block at goal */
ar.goal = ext4_group_first_block_no(sb, goal_group);
@@ -297,6 +429,436 @@ static void test_new_blocks_simple(struct kunit *test)
"unexpectedly get block when no block is available");
}
+#define TEST_RANGE_COUNT 8
+
+struct test_range {
+ ext4_grpblk_t start;
+ ext4_grpblk_t len;
+};
+
+static void
+mbt_generate_test_ranges(struct super_block *sb, struct test_range *ranges,
+ int count)
+{
+ ext4_grpblk_t start, len, max;
+ int i;
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb) / count;
+ for (i = 0; i < count; i++) {
+ start = get_random_u32() % max;
+ len = get_random_u32() % max;
+ len = min(len, max - start);
+
+ ranges[i].start = start + i * max;
+ ranges[i].len = len;
+ }
+}
+
+/*
+ * Check the test block bitmaps after a single free in @goal_group: every
+ * other group must contain no free cluster, and in @goal_group the first
+ * free cluster must be @start with the next used cluster at @start + @len.
+ */
+static void
+validate_free_blocks_simple(struct kunit *test, struct super_block *sb,
+			    ext4_group_t goal_group, ext4_grpblk_t start,
+			    ext4_grpblk_t len)
+{
+	void *bitmap;
+	ext4_grpblk_t bit, max = EXT4_CLUSTERS_PER_GROUP(sb);
+	ext4_group_t i;
+
+	for (i = 0; i < ext4_get_groups_count(sb); i++) {
+		if (i == goal_group)
+			continue;
+
+		bitmap = mbt_ctx_bitmap(sb, i);
+		bit = mb_find_next_zero_bit(bitmap, max, 0);
+		/* ext4 prints ext4_group_t values with %u, do the same here */
+		KUNIT_ASSERT_EQ_MSG(test, bit, max,
+				    "free block on unexpected group %u", i);
+	}
+
+	bitmap = mbt_ctx_bitmap(sb, goal_group);
+	/* first zero bit is where the freed range begins ... */
+	bit = mb_find_next_zero_bit(bitmap, max, 0);
+	KUNIT_ASSERT_EQ(test, bit, start);
+
+	/* ... and the next set bit is where it ends */
+	bit = mb_find_next_bit(bitmap, max, bit + 1);
+	KUNIT_ASSERT_EQ(test, bit, start + len);
+}
+
+/*
+ * Free @len clusters starting at cluster @start of @goal_group with
+ * ext4_free_blocks_simple(), validate the resulting bitmaps, then mark the
+ * whole goal group used again so the next range starts from a full group.
+ * A zero @len is a no-op.
+ */
+static void
+test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group,
+			      ext4_grpblk_t start, ext4_grpblk_t len)
+{
+	struct super_block *sb = (struct super_block *)test->priv;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct inode *inode;
+	ext4_fsblk_t block;
+
+	/* nothing to free: bail out before allocating anything */
+	if (len == 0)
+		return;
+
+	inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+	if (!inode)
+		return;
+	inode->i_sb = sb;
+
+	block = ext4_group_first_block_no(sb, goal_group) +
+		EXT4_C2B(sbi, start);
+	ext4_free_blocks_simple(inode, block, len);
+	validate_free_blocks_simple(test, sb, goal_group, start, len);
+	mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+}
+
+static void test_free_blocks_simple(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_group_t i;
+ struct test_range ranges[TEST_RANGE_COUNT];
+
+ for (i = 0; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, max);
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++)
+ test_free_blocks_simple_range(test, TEST_GOAL_GROUP,
+ ranges[i].start, ranges[i].len);
+}
+
+static void
+test_mark_diskspace_used_range(struct kunit *test,
+ struct ext4_allocation_context *ac,
+ ext4_grpblk_t start,
+ ext4_grpblk_t len)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int ret;
+ void *bitmap;
+ ext4_grpblk_t i, max;
+
+ /* ext4_mb_mark_diskspace_used will BUG if len is 0 */
+ if (len == 0)
+ return;
+
+ ac->ac_b_ex.fe_group = TEST_GOAL_GROUP;
+ ac->ac_b_ex.fe_start = start;
+ ac->ac_b_ex.fe_len = len;
+
+ bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP);
+ memset(bitmap, 0, sb->s_blocksize);
+ ret = ext4_mb_mark_diskspace_used(ac, NULL, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ i = mb_find_next_bit(bitmap, max, 0);
+ KUNIT_ASSERT_EQ(test, i, start);
+ i = mb_find_next_zero_bit(bitmap, max, i + 1);
+ KUNIT_ASSERT_EQ(test, i, start + len);
+ i = mb_find_next_bit(bitmap, max, i + 1);
+ KUNIT_ASSERT_EQ(test, max, i);
+}
+
+/*
+ * Run ext4_mb_mark_diskspace_used() over TEST_RANGE_COUNT random cluster
+ * ranges of TEST_GOAL_GROUP; each range is checked by
+ * test_mark_diskspace_used_range().
+ */
+static void test_mark_diskspace_used(struct kunit *test)
+{
+	struct super_block *sb = (struct super_block *)test->priv;
+	struct inode *inode;
+	/*
+	 * Zero the context: only a few fields are filled in explicitly, the
+	 * rest must not be stack garbage when the context is handed to
+	 * ext4_mb_mark_diskspace_used().
+	 */
+	struct ext4_allocation_context ac = { };
+	struct test_range ranges[TEST_RANGE_COUNT];
+	int i;
+
+	mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+
+	inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+	if (!inode)
+		return;
+	inode->i_sb = sb;
+
+	ac.ac_status = AC_STATUS_FOUND;
+	ac.ac_sb = sb;
+	ac.ac_inode = inode;
+	for (i = 0; i < TEST_RANGE_COUNT; i++)
+		test_mark_diskspace_used_range(test, &ac, ranges[i].start,
+					       ranges[i].len);
+}
+
+static void mbt_generate_buddy(struct super_block *sb, void *buddy,
+ void *bitmap, struct ext4_group_info *grp)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ uint32_t order, off;
+ void *bb, *bb_h;
+ int max;
+
+ memset(buddy, 0xff, sb->s_blocksize);
+ memset(grp, 0, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]));
+
+ bb = bitmap;
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ bb_h = buddy + sbi->s_mb_offsets[1];
+
+ off = mb_find_next_zero_bit(bb, max, 0);
+ grp->bb_first_free = off;
+ while (off < max) {
+ grp->bb_counters[0]++;
+ grp->bb_free++;
+
+ if (!(off & 1) && !mb_test_bit(off + 1, bb)) {
+ grp->bb_free++;
+ grp->bb_counters[0]--;
+ mb_clear_bit(off >> 1, bb_h);
+ grp->bb_counters[1]++;
+ grp->bb_largest_free_order = 1;
+ off++;
+ }
+
+ off = mb_find_next_zero_bit(bb, max, off + 1);
+ }
+
+ for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) {
+ bb = buddy + sbi->s_mb_offsets[order];
+ bb_h = buddy + sbi->s_mb_offsets[order + 1];
+ max = max >> 1;
+ off = mb_find_next_zero_bit(bb, max, 0);
+
+ while (off < max) {
+ if (!(off & 1) && !mb_test_bit(off + 1, bb)) {
+ mb_set_bits(bb, off, 2);
+ grp->bb_counters[order] -= 2;
+ mb_clear_bit(off >> 1, bb_h);
+ grp->bb_counters[order + 1]++;
+ grp->bb_largest_free_order = order + 1;
+ off++;
+ }
+
+ off = mb_find_next_zero_bit(bb, max, off + 1);
+ }
+ }
+
+ max = EXT4_CLUSTERS_PER_GROUP(sb);
+ off = mb_find_next_zero_bit(bitmap, max, 0);
+ while (off < max) {
+ grp->bb_fragments++;
+
+ off = mb_find_next_bit(bitmap, max, off + 1);
+ if (off + 1 >= max)
+ break;
+
+ off = mb_find_next_zero_bit(bitmap, max, off + 1);
+ }
+}
+
+static void
+mbt_validate_group_info(struct kunit *test, struct ext4_group_info *grp1,
+ struct ext4_group_info *grp2)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int i;
+
+ KUNIT_ASSERT_EQ(test, grp1->bb_first_free,
+ grp2->bb_first_free);
+ KUNIT_ASSERT_EQ(test, grp1->bb_fragments,
+ grp2->bb_fragments);
+ KUNIT_ASSERT_EQ(test, grp1->bb_free, grp2->bb_free);
+ KUNIT_ASSERT_EQ(test, grp1->bb_largest_free_order,
+ grp2->bb_largest_free_order);
+
+ for (i = 1; i < MB_NUM_ORDERS(sb); i++) {
+ KUNIT_ASSERT_EQ_MSG(test, grp1->bb_counters[i],
+ grp2->bb_counters[i],
+ "bb_counters[%d] diffs, expected %d, generated %d",
+ i, grp1->bb_counters[i],
+ grp2->bb_counters[i]);
+ }
+}
+
+static void
+do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap,
+ void *mbt_buddy, struct ext4_group_info *mbt_grp,
+ void *ext4_buddy, struct ext4_group_info *ext4_grp)
+{
+ int i;
+
+ mbt_generate_buddy(sb, mbt_buddy, bitmap, mbt_grp);
+
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ ext4_grp->bb_counters[i] = 0;
+ /* needed by validation in ext4_mb_generate_buddy */
+ ext4_grp->bb_free = mbt_grp->bb_free;
+ memset(ext4_buddy, 0xff, sb->s_blocksize);
+ ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP,
+ ext4_grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, mbt_grp, ext4_grp);
+}
+
+static void test_mb_generate_buddy(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ void *bitmap, *expected_bb, *generate_bb;
+ struct ext4_group_info *expected_grp, *generate_grp;
+ struct test_range ranges[TEST_RANGE_COUNT];
+ int i;
+
+ bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+ expected_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_bb);
+ generate_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, generate_bb);
+ expected_grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+ bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_grp);
+ generate_grp = ext4_get_group_info(sb, TEST_GOAL_GROUP);
+ KUNIT_ASSERT_NOT_NULL(test, generate_grp);
+
+ mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+ for (i = 0; i < TEST_RANGE_COUNT; i++) {
+ mb_set_bits(bitmap, ranges[i].start, ranges[i].len);
+ do_test_generate_buddy(test, sb, bitmap, expected_bb,
+ expected_grp, generate_bb, generate_grp);
+ }
+}
+
+static void
+test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b,
+ ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap,
+ void *buddy, struct ext4_group_info *grp)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct ext4_free_extent ex;
+ int i;
+
+ /* mb_mark_used only accepts non-zero len */
+ if (len == 0)
+ return;
+
+ ex.fe_start = start;
+ ex.fe_len = len;
+ ex.fe_group = TEST_GOAL_GROUP;
+
+ ext4_lock_group(sb, TEST_GOAL_GROUP);
+ mb_mark_used(e4b, &ex);
+ ext4_unlock_group(sb, TEST_GOAL_GROUP);
+
+ mb_set_bits(bitmap, start, len);
+ /* bypass bb_free validation in ext4_mb_generate_buddy */
+ grp->bb_free -= len;
+ memset(buddy, 0xff, sb->s_blocksize);
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ grp->bb_counters[i] = 0;
+ ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, grp, e4b->bd_info);
+}
+
+/*
+ * Mark TEST_RANGE_COUNT random cluster ranges of TEST_GOAL_GROUP as used
+ * through mb_mark_used().  After each range the loaded buddy and its group
+ * info are compared against a buddy regenerated from a reference bitmap
+ * (see test_mb_mark_used_range()).
+ */
+static void test_mb_mark_used(struct kunit *test)
+{
+	struct ext4_buddy e4b;
+	struct super_block *sb = (struct super_block *)test->priv;
+	void *bitmap, *buddy;
+	struct ext4_group_info *grp;
+	int ret;
+	struct test_range ranges[TEST_RANGE_COUNT];
+	int i;
+
+	/* buddy cache assumes that each page contains at least one block */
+	if (sb->s_blocksize > PAGE_SIZE)
+		kunit_skip(test, "blocksize exceeds pagesize");
+
+	bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+	buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
+	grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+				bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+	/* grp is dereferenced below, so abort if the allocation failed */
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
+
+	ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+
+	/* the reference group starts out with every cluster free */
+	grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb);
+	mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+	for (i = 0; i < TEST_RANGE_COUNT; i++)
+		test_mb_mark_used_range(test, &e4b, ranges[i].start,
+					ranges[i].len, bitmap, buddy, grp);
+
+	ext4_mb_unload_buddy(&e4b);
+}
+
+static void
+test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b,
+ ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap,
+ void *buddy, struct ext4_group_info *grp)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ int i;
+
+ /* mb_free_blocks will WARN if len is 0 */
+ if (len == 0)
+ return;
+
+ ext4_lock_group(sb, e4b->bd_group);
+ mb_free_blocks(NULL, e4b, start, len);
+ ext4_unlock_group(sb, e4b->bd_group);
+
+ mb_clear_bits(bitmap, start, len);
+ /* bypass bb_free validation in ext4_mb_generate_buddy */
+ grp->bb_free += len;
+ memset(buddy, 0xff, sb->s_blocksize);
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ grp->bb_counters[i] = 0;
+ ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp);
+
+ KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
+ 0);
+ mbt_validate_group_info(test, grp, e4b->bd_info);
+
+}
+
+/*
+ * Mark all of TEST_GOAL_GROUP as used, then free TEST_RANGE_COUNT random
+ * cluster ranges through mb_free_blocks().  After each range the loaded
+ * buddy and its group info are compared against a buddy regenerated from a
+ * reference bitmap (see test_mb_free_blocks_range()).
+ */
+static void test_mb_free_blocks(struct kunit *test)
+{
+	struct ext4_buddy e4b;
+	struct super_block *sb = (struct super_block *)test->priv;
+	void *bitmap, *buddy;
+	struct ext4_group_info *grp;
+	struct ext4_free_extent ex;
+	int ret;
+	int i;
+	struct test_range ranges[TEST_RANGE_COUNT];
+
+	/* buddy cache assumes that each page contains at least one block */
+	if (sb->s_blocksize > PAGE_SIZE)
+		kunit_skip(test, "blocksize exceeds pagesize");
+
+	bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+	buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
+	grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+				bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+	/* grp is dereferenced below, so abort if the allocation failed */
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
+
+	ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+
+	/* start from a completely used group */
+	ex.fe_start = 0;
+	ex.fe_len = EXT4_CLUSTERS_PER_GROUP(sb);
+	ex.fe_group = TEST_GOAL_GROUP;
+
+	ext4_lock_group(sb, TEST_GOAL_GROUP);
+	mb_mark_used(&e4b, &ex);
+	ext4_unlock_group(sb, TEST_GOAL_GROUP);
+
+	/* mirror that state in the reference group info and bitmap */
+	grp->bb_free = 0;
+	memset(bitmap, 0xff, sb->s_blocksize);
+
+	mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+	for (i = 0; i < TEST_RANGE_COUNT; i++)
+		test_mb_free_blocks_range(test, &e4b, ranges[i].start,
+					  ranges[i].len, bitmap, buddy, grp);
+
+	ext4_mb_unload_buddy(&e4b);
+}
+
static const struct mbt_ext4_block_layout mbt_test_layouts[] = {
{
.blocksize_bits = 10,
@@ -334,6 +896,11 @@ KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout);
static struct kunit_case mbt_test_cases[] = {
KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_free_blocks_simple, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_generate_buddy, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params),
+ KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params),
{}
};
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e4f7cf9d89c4..12b3f196010b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3015,8 +3015,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
{
struct super_block *sb = pde_data(file_inode(seq->file));
ext4_group_t group = (ext4_group_t) ((unsigned long) v);
- int i;
- int err, buddy_loaded = 0;
+ int i, err;
+ char nbuf[16];
struct ext4_buddy e4b;
struct ext4_group_info *grinfo;
unsigned char blocksize_bits = min_t(unsigned char,
@@ -3043,23 +3043,26 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
- seq_printf(seq, "#%-5u: I/O error\n", group);
+ seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf));
return 0;
}
- buddy_loaded = 1;
+ ext4_mb_unload_buddy(&e4b);
}
+ /*
+ * We care only about free space counters in the group info and
+ * these are safe to access even after the buddy has been unloaded
+ */
memcpy(&sg, grinfo, i);
-
- if (buddy_loaded)
- ext4_mb_unload_buddy(&e4b);
-
seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
sg.info.bb_fragments, sg.info.bb_first_free);
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
sg.info.bb_counters[i] : 0);
- seq_puts(seq, " ]\n");
+ seq_puts(seq, " ]");
+ if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info))
+ seq_puts(seq, " Block bitmap corrupted!");
+ seq_puts(seq, "\n");
return 0;
}
@@ -3829,8 +3832,7 @@ void ext4_mb_release(struct super_block *sb)
}
static inline int ext4_issue_discard(struct super_block *sb,
- ext4_group_t block_group, ext4_grpblk_t cluster, int count,
- struct bio **biop)
+ ext4_group_t block_group, ext4_grpblk_t cluster, int count)
{
ext4_fsblk_t discard_block;
@@ -3839,13 +3841,8 @@ static inline int ext4_issue_discard(struct super_block *sb,
count = EXT4_C2B(EXT4_SB(sb), count);
trace_ext4_discard_blocks(sb,
(unsigned long long) discard_block, count);
- if (biop) {
- return __blkdev_issue_discard(sb->s_bdev,
- (sector_t)discard_block << (sb->s_blocksize_bits - 9),
- (sector_t)count << (sb->s_blocksize_bits - 9),
- GFP_NOFS, biop);
- } else
- return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
+
+ return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
}
static void ext4_free_data_in_buddy(struct super_block *sb,
@@ -5169,10 +5166,16 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
.fe_len = ac->ac_orig_goal_len,
};
loff_t orig_goal_end = extent_logical_end(sbi, &ex);
+ loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex);
- /* we can't allocate as much as normalizer wants.
- * so, found space must get proper lstart
- * to cover original request */
+ /*
+ * We can't allocate as much as normalizer wants, so we try
+ * to get proper lstart to cover the original request, except
+ * when the goal doesn't cover the original request as below:
+ *
+ * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048
+ * best_ex:0/200(200) -> adjusted: 1848/2048(200)
+ */
BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
@@ -5184,7 +5187,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
* 1. Check if best ex can be kept at end of goal (before
* cr_best_avail trimmed it) and still cover original start
* 2. Else, check if best ex can be kept at start of goal and
- * still cover original start
+ * still cover original end
* 3. Else, keep the best ex at start of original request.
*/
ex.fe_len = ac->ac_b_ex.fe_len;
@@ -5194,7 +5197,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
goto adjust_bex;
ex.fe_logical = ac->ac_g_ex.fe_logical;
- if (ac->ac_o_ex.fe_logical < extent_logical_end(sbi, &ex))
+ if (o_ex_end <= extent_logical_end(sbi, &ex))
goto adjust_bex;
ex.fe_logical = ac->ac_o_ex.fe_logical;
@@ -5202,7 +5205,6 @@ adjust_bex:
ac->ac_b_ex.fe_logical = ex.fe_logical;
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
- BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end);
}
@@ -6487,8 +6489,14 @@ do_more:
} else {
if (test_opt(sb, DISCARD)) {
err = ext4_issue_discard(sb, block_group, bit,
- count_clusters, NULL);
- if (err && err != -EOPNOTSUPP)
+ count_clusters);
+ /*
+ * Ignore EOPNOTSUPP error. This is consistent with
+ * what happens when using journal.
+ */
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ if (err)
ext4_msg(sb, KERN_WARNING, "discard request in"
" group:%u block:%d count:%lu failed"
" with %d", block_group, bit, count,
@@ -6738,7 +6746,7 @@ __acquires(bitlock)
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
- ret = ext4_issue_discard(sb, group, start, count, NULL);
+ ret = ext4_issue_discard(sb, group, start, count);
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
return ret;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 05b647e6bc19..5e4f65c14dfb 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1762,7 +1762,6 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
struct buffer_head *bh;
err = ext4_fname_prepare_lookup(dir, dentry, &fname);
- generic_set_encrypted_ci_d_ops(dentry);
if (err == -ENOENT)
return NULL;
if (err)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 4d4a5a32e310..0ba9837d65ca 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1602,7 +1602,8 @@ exit_journal:
int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
int gdb_num_end = ((group + flex_gd->count - 1) /
EXT4_DESC_PER_BLOCK(sb));
- int meta_bg = ext4_has_feature_meta_bg(sb);
+ int meta_bg = ext4_has_feature_meta_bg(sb) &&
+ gdb_num >= le32_to_cpu(es->s_first_meta_bg);
sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr -
ext4_group_first_block_no(sb, 0);
@@ -2084,7 +2085,7 @@ retry:
}
}
- if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) {
+ if ((!resize_inode && !meta_bg && n_desc_blocks > o_desc_blocks) || n_blocks_count == o_blocks_count) {
err = ext4_convert_meta_bg(sb, resize_inode);
if (err)
goto out;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0f931d0c227d..044135796f2b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1359,14 +1359,14 @@ static void ext4_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
- if (sbi->s_journal_bdev_handle) {
+ if (sbi->s_journal_bdev_file) {
/*
* Invalidate the journal device's buffers. We don't want them
* floating about in memory - the physical journal device may
* hotswapped, and it breaks the `ro-after' testing code.
*/
- sync_blockdev(sbi->s_journal_bdev_handle->bdev);
- invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
+ sync_blockdev(file_bdev(sbi->s_journal_bdev_file));
+ invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
}
ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
@@ -1500,8 +1500,7 @@ static int __init init_inodecache(void)
{
ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
sizeof(struct ext4_inode_info), 0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
- SLAB_ACCOUNT),
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
offsetof(struct ext4_inode_info, i_data),
sizeof_field(struct ext4_inode_info, i_data),
init_once);
@@ -1600,7 +1599,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags);
-static struct dquot **ext4_get_dquots(struct inode *inode)
+static struct dquot __rcu **ext4_get_dquots(struct inode *inode)
{
return EXT4_I(inode)->i_dquot;
}
@@ -4233,7 +4232,7 @@ int ext4_calculate_overhead(struct super_block *sb)
* Add the internal journal blocks whether the journal has been
* loaded or not
*/
- if (sbi->s_journal && !sbi->s_journal_bdev_handle)
+ if (sbi->s_journal && !sbi->s_journal_bdev_file)
overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
/* j_inum for internal journal is non-zero */
@@ -4422,22 +4421,6 @@ static int ext4_handle_clustersize(struct super_block *sb)
}
sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
le32_to_cpu(es->s_log_block_size);
- sbi->s_clusters_per_group =
- le32_to_cpu(es->s_clusters_per_group);
- if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#clusters per group too big: %lu",
- sbi->s_clusters_per_group);
- return -EINVAL;
- }
- if (sbi->s_blocks_per_group !=
- (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
- ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
- "clusters per group (%lu) inconsistent",
- sbi->s_blocks_per_group,
- sbi->s_clusters_per_group);
- return -EINVAL;
- }
} else {
if (clustersize != sb->s_blocksize) {
ext4_msg(sb, KERN_ERR,
@@ -4451,9 +4434,21 @@ static int ext4_handle_clustersize(struct super_block *sb)
sbi->s_blocks_per_group);
return -EINVAL;
}
- sbi->s_clusters_per_group = sbi->s_blocks_per_group;
sbi->s_cluster_bits = 0;
}
+ sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group);
+ if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
+ ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu",
+ sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
+ if (sbi->s_blocks_per_group !=
+ (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
+ ext4_msg(sb, KERN_ERR,
+ "blocks per group (%lu) and clusters per group (%lu) inconsistent",
+ sbi->s_blocks_per_group, sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
sbi->s_cluster_ratio = clustersize / sb->s_blocksize;
/* Do we have standard group size of clustersize * 8 blocks ? */
@@ -5346,7 +5341,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sb->s_qcop = &ext4_qctl_operations;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
- memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+ super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid));
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
@@ -5484,6 +5479,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount4;
}
+ generic_set_sb_d_ops(sb);
sb->s_root = d_make_root(root);
if (!sb->s_root) {
ext4_msg(sb, KERN_ERR, "get root dentry failed");
@@ -5670,9 +5666,9 @@ failed_mount:
#endif
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
brelse(sbi->s_sbh);
- if (sbi->s_journal_bdev_handle) {
- invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
- bdev_release(sbi->s_journal_bdev_handle);
+ if (sbi->s_journal_bdev_file) {
+ invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
+ bdev_fput(sbi->s_journal_bdev_file);
}
out_fail:
invalidate_bdev(sb->s_bdev);
@@ -5842,30 +5838,30 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb,
return journal;
}
-static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb,
+static struct file *ext4_get_journal_blkdev(struct super_block *sb,
dev_t j_dev, ext4_fsblk_t *j_start,
ext4_fsblk_t *j_len)
{
struct buffer_head *bh;
struct block_device *bdev;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
int hblock, blocksize;
ext4_fsblk_t sb_block;
unsigned long offset;
struct ext4_super_block *es;
int errno;
- bdev_handle = bdev_open_by_dev(j_dev,
+ bdev_file = bdev_file_open_by_dev(j_dev,
BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
sb, &fs_holder_ops);
- if (IS_ERR(bdev_handle)) {
+ if (IS_ERR(bdev_file)) {
ext4_msg(sb, KERN_ERR,
"failed to open journal device unknown-block(%u,%u) %ld",
- MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_handle));
- return bdev_handle;
+ MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file));
+ return bdev_file;
}
- bdev = bdev_handle->bdev;
+ bdev = file_bdev(bdev_file);
blocksize = sb->s_blocksize;
hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
@@ -5912,12 +5908,12 @@ static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb,
*j_start = sb_block + 1;
*j_len = ext4_blocks_count(es);
brelse(bh);
- return bdev_handle;
+ return bdev_file;
out_bh:
brelse(bh);
out_bdev:
- bdev_release(bdev_handle);
+ bdev_fput(bdev_file);
return ERR_PTR(errno);
}
@@ -5927,14 +5923,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
journal_t *journal;
ext4_fsblk_t j_start;
ext4_fsblk_t j_len;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
int errno = 0;
- bdev_handle = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
- if (IS_ERR(bdev_handle))
- return ERR_CAST(bdev_handle);
+ bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
+ if (IS_ERR(bdev_file))
+ return ERR_CAST(bdev_file);
- journal = jbd2_journal_init_dev(bdev_handle->bdev, sb->s_bdev, j_start,
+ journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start,
j_len, sb->s_blocksize);
if (IS_ERR(journal)) {
ext4_msg(sb, KERN_ERR, "failed to create device journal");
@@ -5949,14 +5945,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
goto out_journal;
}
journal->j_private = sb;
- EXT4_SB(sb)->s_journal_bdev_handle = bdev_handle;
+ EXT4_SB(sb)->s_journal_bdev_file = bdev_file;
ext4_init_journal_params(sb, journal);
return journal;
out_journal:
jbd2_journal_destroy(journal);
out_bdev:
- bdev_release(bdev_handle);
+ bdev_fput(bdev_file);
return ERR_PTR(errno);
}
@@ -6864,6 +6860,10 @@ static int ext4_write_dquot(struct dquot *dquot)
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit(dquot);
+ if (ret < 0)
+ ext4_error_err(dquot->dq_sb, -ret,
+ "Failed to commit dquot type %d",
+ dquot->dq_id.type);
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
@@ -6880,6 +6880,10 @@ static int ext4_acquire_dquot(struct dquot *dquot)
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_acquire(dquot);
+ if (ret < 0)
+ ext4_error_err(dquot->dq_sb, -ret,
+ "Failed to acquire dquot type %d",
+ dquot->dq_id.type);
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
@@ -6899,6 +6903,10 @@ static int ext4_release_dquot(struct dquot *dquot)
return PTR_ERR(handle);
}
ret = dquot_release(dquot);
+ if (ret < 0)
+ ext4_error_err(dquot->dq_sb, -ret,
+ "Failed to release dquot type %d",
+ dquot->dq_id.type);
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
@@ -7314,12 +7322,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
static void ext4_kill_sb(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct bdev_handle *handle = sbi ? sbi->s_journal_bdev_handle : NULL;
+ struct file *bdev_file = sbi ? sbi->s_journal_bdev_file : NULL;
kill_block_super(sb);
- if (handle)
- bdev_release(handle);
+ if (bdev_file)
+ bdev_fput(bdev_file);
}
static struct file_system_type ext4_fs_type = {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 82dc5e673d5c..b67a176bfcf9 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1565,46 +1565,49 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
/*
* Add value of the EA in an inode.
*/
-static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
- const void *value, size_t value_len,
- struct inode **ret_inode)
+static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle,
+ struct inode *inode, const void *value, size_t value_len)
{
struct inode *ea_inode;
u32 hash;
int err;
+ /* Account inode & space to quota even if sharing... */
+ err = ext4_xattr_inode_alloc_quota(inode, value_len);
+ if (err)
+ return ERR_PTR(err);
+
hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
if (ea_inode) {
err = ext4_xattr_inode_inc_ref(handle, ea_inode);
- if (err) {
- iput(ea_inode);
- return err;
- }
-
- *ret_inode = ea_inode;
- return 0;
+ if (err)
+ goto out_err;
+ return ea_inode;
}
/* Create an inode for the EA value */
ea_inode = ext4_xattr_inode_create(handle, inode, hash);
- if (IS_ERR(ea_inode))
- return PTR_ERR(ea_inode);
+ if (IS_ERR(ea_inode)) {
+ ext4_xattr_inode_free_quota(inode, NULL, value_len);
+ return ea_inode;
+ }
err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
if (err) {
if (ext4_xattr_inode_dec_ref(handle, ea_inode))
ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err);
- iput(ea_inode);
- return err;
+ goto out_err;
}
if (EA_INODE_CACHE(inode))
mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
ea_inode->i_ino, true /* reusable */);
-
- *ret_inode = ea_inode;
- return 0;
+ return ea_inode;
+out_err:
+ iput(ea_inode);
+ ext4_xattr_inode_free_quota(inode, NULL, value_len);
+ return ERR_PTR(err);
}
/*
@@ -1712,16 +1715,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
if (i->value && in_inode) {
WARN_ON_ONCE(!i->value_len);
- ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
- if (ret)
- goto out;
-
- ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
- i->value_len,
- &new_ea_inode);
- if (ret) {
+ new_ea_inode = ext4_xattr_inode_lookup_create(handle, inode,
+ i->value, i->value_len);
+ if (IS_ERR(new_ea_inode)) {
+ ret = PTR_ERR(new_ea_inode);
new_ea_inode = NULL;
- ext4_xattr_inode_free_quota(inode, NULL, i->value_len);
goto out;
}
}
@@ -2160,17 +2158,6 @@ getblk_failed:
ENTRY(header(s->base)+1));
if (error)
goto getblk_failed;
- if (ea_inode) {
- /* Drop the extra ref on ea_inode. */
- error = ext4_xattr_inode_dec_ref(handle,
- ea_inode);
- if (error)
- ext4_warning_inode(ea_inode,
- "dec ref error=%d",
- error);
- iput(ea_inode);
- ea_inode = NULL;
- }
lock_buffer(new_bh);
error = ext4_journal_get_create_access(handle, sb,
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b0597a539fc5..eac698b8dd38 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -154,49 +154,47 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
if (unlikely(f2fs_cp_error(sbi)))
return exist;
- if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) {
- f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
- blkaddr, exist);
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- return exist;
- }
+ if ((exist && type == DATA_GENERIC_ENHANCE_UPDATE) ||
+ (!exist && type == DATA_GENERIC_ENHANCE))
+ goto out_err;
+ if (!exist && type != DATA_GENERIC_ENHANCE_UPDATE)
+ goto out_handle;
+ return exist;
- if (!exist && type == DATA_GENERIC_ENHANCE) {
- f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
- blkaddr, exist);
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- dump_stack();
- }
+out_err:
+ f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
+ blkaddr, exist);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ dump_stack();
+out_handle:
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
return exist;
}
-bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
+static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type)
{
- if (time_to_inject(sbi, FAULT_BLKADDR))
- return false;
-
switch (type) {
case META_NAT:
break;
case META_SIT:
if (unlikely(blkaddr >= SIT_BLK_CNT(sbi)))
- return false;
+ goto err;
break;
case META_SSA:
if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) ||
blkaddr < SM_I(sbi)->ssa_blkaddr))
- return false;
+ goto err;
break;
case META_CP:
if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr ||
blkaddr < __start_cp_addr(sbi)))
- return false;
+ goto err;
break;
case META_POR:
if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
blkaddr < MAIN_BLKADDR(sbi)))
- return false;
+ goto err;
break;
case DATA_GENERIC:
case DATA_GENERIC_ENHANCE:
@@ -213,7 +211,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
blkaddr);
set_sbi_flag(sbi, SBI_NEED_FSCK);
dump_stack();
- return false;
+ goto err;
} else {
return __is_bitmap_valid(sbi, blkaddr, type);
}
@@ -221,13 +219,30 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
case META_GENERIC:
if (unlikely(blkaddr < SEG0_BLKADDR(sbi) ||
blkaddr >= MAIN_BLKADDR(sbi)))
- return false;
+ goto err;
break;
default:
BUG();
}
return true;
+err:
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
+ return false;
+}
+
+bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
+ block_t blkaddr, int type)
+{
+ if (time_to_inject(sbi, FAULT_BLKADDR_VALIDITY))
+ return false;
+ return __f2fs_is_valid_blkaddr(sbi, blkaddr, type);
+}
+
+bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi,
+ block_t blkaddr, int type)
+{
+ return __f2fs_is_valid_blkaddr(sbi, blkaddr, type);
}
/*
@@ -889,7 +904,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);
- if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) {
+ if (cp_blocks > BLKS_PER_SEG(sbi) || cp_blocks <= F2FS_CP_PACKS) {
f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
le32_to_cpu(cp_block->cp_pack_total_block_count));
goto invalid_cp;
@@ -1324,7 +1339,7 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (cpc->reason & CP_UMOUNT) {
if (le32_to_cpu(ckpt->cp_pack_total_block_count) +
- NM_I(sbi)->nat_bits_blocks > sbi->blocks_per_seg) {
+ NM_I(sbi)->nat_bits_blocks > BLKS_PER_SEG(sbi)) {
clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
f2fs_notice(sbi, "Disable nat_bits due to no space");
} else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) &&
@@ -1527,7 +1542,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
cp_ver |= ((__u64)crc32 << 32);
*(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver);
- blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks;
+ blk = start_blk + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks;
for (i = 0; i < nm_i->nat_bits_blocks; i++)
f2fs_update_meta_page(sbi, nm_i->nat_bits +
(i << F2FS_BLKSIZE_BITS), blk + i);
@@ -1587,8 +1602,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
*/
if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) ||
f2fs_sb_has_compression(sbi))
- invalidate_mapping_pages(META_MAPPING(sbi),
- MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1);
+ f2fs_bug_on(sbi,
+ invalidate_inode_pages2_range(META_MAPPING(sbi),
+ MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1));
f2fs_release_ino_entry(sbi, false);
@@ -1730,9 +1746,9 @@ void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi)
im->ino_num = 0;
}
- sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
+ sbi->max_orphans = (BLKS_PER_SEG(sbi) - F2FS_CP_PACKS -
NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) *
- F2FS_ORPHANS_PER_BLOCK;
+ F2FS_ORPHANS_PER_BLOCK;
}
int __init f2fs_create_checkpoint_caches(void)
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 531517dac079..8892c8262141 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -512,8 +512,8 @@ static int lzorle_compress_pages(struct compress_ctx *cc)
ret = lzorle1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata,
&cc->clen, cc->private);
if (ret != LZO_E_OK) {
- printk_ratelimited("%sF2FS-fs (%s): lzo-rle compress failed, ret:%d\n",
- KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret);
+ f2fs_err_ratelimited(F2FS_I_SB(cc->inode),
+ "lzo-rle compress failed, ret:%d", ret);
return -EIO;
}
return 0;
@@ -780,9 +780,9 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
if (provided != calculated) {
if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) {
set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT);
- printk_ratelimited(
- "%sF2FS-fs (%s): checksum invalid, nid = %lu, %x vs %x",
- KERN_INFO, sbi->sb->s_id, dic->inode->i_ino,
+ f2fs_info_ratelimited(sbi,
+ "checksum invalid, nid = %lu, %x vs %x",
+ dic->inode->i_ino,
provided, calculated);
}
set_sbi_flag(sbi, SBI_NEED_FSCK);
@@ -1418,6 +1418,8 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
struct f2fs_sb_info *sbi = bio->bi_private;
struct compress_io_ctx *cic =
(struct compress_io_ctx *)page_private(page);
+ enum count_type type = WB_DATA_TYPE(page,
+ f2fs_is_compressed_page(page));
int i;
if (unlikely(bio->bi_status))
@@ -1425,7 +1427,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
f2fs_compress_free_page(page);
- dec_page_count(sbi, F2FS_WB_DATA);
+ dec_page_count(sbi, type);
if (atomic_dec_return(&cic->pending_pages))
return;
@@ -1441,12 +1443,14 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
}
static int f2fs_write_raw_pages(struct compress_ctx *cc,
- int *submitted,
+ int *submitted_p,
struct writeback_control *wbc,
enum iostat_type io_type)
{
struct address_space *mapping = cc->inode->i_mapping;
- int _submitted, compr_blocks, ret, i;
+ struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
+ int submitted, compr_blocks, i;
+ int ret = 0;
compr_blocks = f2fs_compressed_blocks(cc);
@@ -1461,6 +1465,10 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
if (compr_blocks < 0)
return compr_blocks;
+ /* overwrite compressed cluster w/ normal cluster */
+ if (compr_blocks > 0)
+ f2fs_lock_op(sbi);
+
for (i = 0; i < cc->cluster_size; i++) {
if (!cc->rpages[i])
continue;
@@ -1485,7 +1493,7 @@ continue_unlock:
if (!clear_page_dirty_for_io(cc->rpages[i]))
goto continue_unlock;
- ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted,
+ ret = f2fs_write_single_data_page(cc->rpages[i], &submitted,
NULL, NULL, wbc, io_type,
compr_blocks, false);
if (ret) {
@@ -1493,26 +1501,29 @@ continue_unlock:
unlock_page(cc->rpages[i]);
ret = 0;
} else if (ret == -EAGAIN) {
+ ret = 0;
/*
* for quota file, just redirty left pages to
* avoid deadlock caused by cluster update race
* from foreground operation.
*/
if (IS_NOQUOTA(cc->inode))
- return 0;
- ret = 0;
+ goto out;
f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto retry_write;
}
- return ret;
+ goto out;
}
- *submitted += _submitted;
+ *submitted_p += submitted;
}
- f2fs_balance_fs(F2FS_M_SB(mapping), true);
+out:
+ if (compr_blocks > 0)
+ f2fs_unlock_op(sbi);
- return 0;
+ f2fs_balance_fs(sbi, true);
+ return ret;
}
int f2fs_write_multi_pages(struct compress_ctx *cc,
@@ -1806,16 +1817,18 @@ void f2fs_put_page_dic(struct page *page, bool in_task)
* check whether cluster blocks are contiguous, and add extent cache entry
* only if cluster blocks are logically and physically contiguous.
*/
-unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn)
+unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn,
+ unsigned int ofs_in_node)
{
- bool compressed = f2fs_data_blkaddr(dn) == COMPRESS_ADDR;
+ bool compressed = data_blkaddr(dn->inode, dn->node_page,
+ ofs_in_node) == COMPRESS_ADDR;
int i = compressed ? 1 : 0;
block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page,
- dn->ofs_in_node + i);
+ ofs_in_node + i);
for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) {
block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
- dn->ofs_in_node + i);
+ ofs_in_node + i);
if (!__is_valid_data_blkaddr(blkaddr))
break;
@@ -1878,12 +1891,8 @@ void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
set_page_private_data(cpage, ino);
- if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ))
- goto out;
-
memcpy(page_address(cpage), page_address(page), PAGE_SIZE);
SetPageUptodate(cpage);
-out:
f2fs_put_page(cpage, 1);
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 26e317696b33..d9494b5fc7c1 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -48,7 +48,7 @@ void f2fs_destroy_bioset(void)
bioset_exit(&f2fs_bioset);
}
-static bool __is_cp_guaranteed(struct page *page)
+bool f2fs_is_cp_guaranteed(struct page *page)
{
struct address_space *mapping = page->mapping;
struct inode *inode;
@@ -65,8 +65,6 @@ static bool __is_cp_guaranteed(struct page *page)
S_ISDIR(inode->i_mode))
return true;
- if (f2fs_is_compressed_page(page))
- return false;
if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) ||
page_private_gcing(page))
return true;
@@ -338,18 +336,7 @@ static void f2fs_write_end_io(struct bio *bio)
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
- enum count_type type = WB_DATA_TYPE(page);
-
- if (page_private_dummy(page)) {
- clear_page_private_dummy(page);
- unlock_page(page);
- mempool_free(page, sbi->write_io_dummy);
-
- if (unlikely(bio->bi_status))
- f2fs_stop_checkpoint(sbi, true,
- STOP_CP_REASON_WRITE_FAIL);
- continue;
- }
+ enum count_type type = WB_DATA_TYPE(page, false);
fscrypt_finalize_bounce_page(&page);
@@ -524,50 +511,13 @@ void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
submit_bio(bio);
}
-static void f2fs_align_write_bio(struct f2fs_sb_info *sbi, struct bio *bio)
-{
- unsigned int start =
- (bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS) % F2FS_IO_SIZE(sbi);
-
- if (start == 0)
- return;
-
- /* fill dummy pages */
- for (; start < F2FS_IO_SIZE(sbi); start++) {
- struct page *page =
- mempool_alloc(sbi->write_io_dummy,
- GFP_NOIO | __GFP_NOFAIL);
- f2fs_bug_on(sbi, !page);
-
- lock_page(page);
-
- zero_user_segment(page, 0, PAGE_SIZE);
- set_page_private_dummy(page);
-
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
- f2fs_bug_on(sbi, 1);
- }
-}
-
static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio,
enum page_type type)
{
WARN_ON_ONCE(is_read_io(bio_op(bio)));
- if (type == DATA || type == NODE) {
- if (f2fs_lfs_mode(sbi) && current->plug)
- blk_finish_plug(current->plug);
-
- if (F2FS_IO_ALIGNED(sbi)) {
- f2fs_align_write_bio(sbi, bio);
- /*
- * In the NODE case, we lose next block address chain.
- * So, we need to do checkpoint in f2fs_sync_file.
- */
- if (type == NODE)
- set_sbi_flag(sbi, SBI_NEED_CP);
- }
- }
+ if (f2fs_lfs_mode(sbi) && current->plug && PAGE_TYPE_ON_MAIN(type))
+ blk_finish_plug(current->plug);
trace_f2fs_submit_write_bio(sbi->sb, type, bio);
iostat_update_submit_ctx(bio, type);
@@ -740,10 +690,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
fio->is_por ? META_POR : (__is_meta_io(fio) ?
- META_GENERIC : DATA_GENERIC_ENHANCE))) {
- f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
+ META_GENERIC : DATA_GENERIC_ENHANCE)))
return -EFSCORRUPTED;
- }
trace_f2fs_submit_page_bio(page, fio);
@@ -762,7 +710,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
inc_page_count(fio->sbi, is_read_io(fio->op) ?
- __read_io_type(page) : WB_DATA_TYPE(fio->page));
+ __read_io_type(page) : WB_DATA_TYPE(fio->page, false));
if (is_read_io(bio_op(bio)))
f2fs_submit_read_bio(fio->sbi, bio, fio->type);
@@ -796,16 +744,6 @@ static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
block_t last_blkaddr,
block_t cur_blkaddr)
{
- if (F2FS_IO_ALIGNED(sbi) && (fio->type == DATA || fio->type == NODE)) {
- unsigned int filled_blocks =
- F2FS_BYTES_TO_BLK(bio->bi_iter.bi_size);
- unsigned int io_size = F2FS_IO_SIZE(sbi);
- unsigned int left_vecs = bio->bi_max_vecs - bio->bi_vcnt;
-
- /* IOs in bio is aligned and left space of vectors is not enough */
- if (!(filled_blocks % io_size) && left_vecs < io_size)
- return false;
- }
if (!page_is_mergeable(sbi, bio, last_blkaddr, cur_blkaddr))
return false;
return io_type_is_mergeable(io, fio);
@@ -948,10 +886,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
fio->encrypted_page : fio->page;
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
- __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) {
- f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
+ __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC))
return -EFSCORRUPTED;
- }
trace_f2fs_submit_page_bio(page, fio);
@@ -973,7 +909,7 @@ alloc_new:
if (fio->io_wbc)
wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
- inc_page_count(fio->sbi, WB_DATA_TYPE(page));
+ inc_page_count(fio->sbi, WB_DATA_TYPE(page, false));
*fio->last_block = fio->new_blkaddr;
*fio->bio = bio;
@@ -1007,11 +943,12 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp;
struct page *bio_page;
+ enum count_type type;
f2fs_bug_on(sbi, is_read_io(fio->op));
f2fs_down_write(&io->io_rwsem);
-
+next:
#ifdef CONFIG_BLK_DEV_ZONED
if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) {
wait_for_completion_io(&io->zone_wait);
@@ -1021,7 +958,6 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
}
#endif
-next:
if (fio->in_list) {
spin_lock(&io->io_lock);
if (list_empty(&io->io_list)) {
@@ -1046,7 +982,8 @@ next:
/* set submitted = true as a return value */
fio->submitted = 1;
- inc_page_count(sbi, WB_DATA_TYPE(bio_page));
+ type = WB_DATA_TYPE(bio_page, fio->compressed_page);
+ inc_page_count(sbi, type);
if (io->bio &&
(!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio,
@@ -1056,13 +993,6 @@ next:
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
- if (F2FS_IO_ALIGNED(sbi) &&
- (fio->type == DATA || fio->type == NODE) &&
- fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) {
- dec_page_count(sbi, WB_DATA_TYPE(bio_page));
- fio->retry = 1;
- goto skip;
- }
io->bio = __bio_alloc(fio, BIO_MAX_VECS);
f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host,
bio_page->index, fio, GFP_NOIO);
@@ -1080,10 +1010,6 @@ alloc_new:
io->last_block_in_bio = fio->new_blkaddr;
trace_f2fs_submit_page_write(fio->page, fio);
-skip:
- if (fio->in_list)
- goto next;
-out:
#ifdef CONFIG_BLK_DEV_ZONED
if (f2fs_sb_has_blkzoned(sbi) && btype < META &&
is_end_zone_blkaddr(sbi, fio->new_blkaddr)) {
@@ -1096,6 +1022,9 @@ out:
__submit_merged_bio(io);
}
#endif
+ if (fio->in_list)
+ goto next;
+out:
if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
!f2fs_is_checkpoint_ready(sbi))
__submit_merged_bio(io);
@@ -1218,7 +1147,8 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return -EPERM;
- if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count))))
+ err = inc_valid_block_count(sbi, dn->inode, &count, true);
+ if (unlikely(err))
return err;
trace_f2fs_reserve_new_blocks(dn->inode, dn->nid,
@@ -1285,8 +1215,6 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ)) {
err = -EFSCORRUPTED;
- f2fs_handle_error(F2FS_I_SB(inode),
- ERROR_INVALID_BLKADDR);
goto put_err;
}
goto got_it;
@@ -1312,8 +1240,6 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
dn.data_blkaddr,
DATA_GENERIC_ENHANCE)) {
err = -EFSCORRUPTED;
- f2fs_handle_error(F2FS_I_SB(inode),
- ERROR_INVALID_BLKADDR);
goto put_err;
}
got_it:
@@ -1475,15 +1401,18 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
dn->data_blkaddr = f2fs_data_blkaddr(dn);
if (dn->data_blkaddr == NULL_ADDR) {
- err = inc_valid_block_count(sbi, dn->inode, &count);
+ err = inc_valid_block_count(sbi, dn->inode, &count, true);
if (unlikely(err))
return err;
}
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
old_blkaddr = dn->data_blkaddr;
- f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr,
- &sum, seg_type, NULL);
+ err = f2fs_allocate_data_block(sbi, NULL, old_blkaddr,
+ &dn->data_blkaddr, &sum, seg_type, NULL);
+ if (err)
+ return err;
+
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
f2fs_invalidate_internal_cache(sbi, old_blkaddr);
@@ -1641,7 +1570,6 @@ next_block:
if (!is_hole &&
!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
err = -EFSCORRUPTED;
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto sync_out;
}
@@ -2165,8 +2093,6 @@ got_it:
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
DATA_GENERIC_ENHANCE_READ)) {
ret = -EFSCORRUPTED;
- f2fs_handle_error(F2FS_I_SB(inode),
- ERROR_INVALID_BLKADDR);
goto out;
}
} else {
@@ -2668,8 +2594,6 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
if (fio) {
if (page_private_gcing(fio->page))
return true;
- if (page_private_dummy(fio->page))
- return true;
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
f2fs_is_checkpointed_data(sbi, fio->old_blkaddr)))
return true;
@@ -2706,11 +2630,8 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
f2fs_lookup_read_extent_cache_block(inode, page->index,
&fio->old_blkaddr)) {
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
- DATA_GENERIC_ENHANCE)) {
- f2fs_handle_error(fio->sbi,
- ERROR_INVALID_BLKADDR);
+ DATA_GENERIC_ENHANCE))
return -EFSCORRUPTED;
- }
ipu_force = true;
fio->need_lock = LOCK_DONE;
@@ -2738,7 +2659,6 @@ got_it:
!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
DATA_GENERIC_ENHANCE)) {
err = -EFSCORRUPTED;
- f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
goto out_writepage;
}
@@ -2838,7 +2758,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
.encrypted_page = NULL,
.submitted = 0,
.compr_blocks = compr_blocks,
- .need_lock = LOCK_RETRY,
+ .need_lock = compr_blocks ? LOCK_DONE : LOCK_RETRY,
.post_read = f2fs_post_read_required(inode) ? 1 : 0,
.io_type = io_type,
.io_wbc = wbc,
@@ -2919,6 +2839,7 @@ write:
if (err == -EAGAIN) {
err = f2fs_do_write_data_page(&fio);
if (err == -EAGAIN) {
+ f2fs_bug_on(sbi, compr_blocks);
fio.need_lock = LOCK_REQ;
err = f2fs_do_write_data_page(&fio);
}
@@ -3704,7 +3625,6 @@ repeat:
if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
DATA_GENERIC_ENHANCE_READ)) {
err = -EFSCORRUPTED;
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto fail;
}
err = f2fs_submit_page_read(use_cow ?
@@ -3905,26 +3825,36 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int blkofs;
unsigned int blk_per_sec = BLKS_PER_SEC(sbi);
+ unsigned int end_blk = start_blk + blkcnt - 1;
unsigned int secidx = start_blk / blk_per_sec;
- unsigned int end_sec = secidx + blkcnt / blk_per_sec;
+ unsigned int end_sec;
int ret = 0;
+ if (!blkcnt)
+ return 0;
+ end_sec = end_blk / blk_per_sec;
+
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
set_inode_flag(inode, FI_ALIGNED_WRITE);
set_inode_flag(inode, FI_OPU_WRITE);
- for (; secidx < end_sec; secidx++) {
+ for (; secidx <= end_sec; secidx++) {
+ unsigned int blkofs_end = secidx == end_sec ?
+ end_blk % blk_per_sec : blk_per_sec - 1;
+
f2fs_down_write(&sbi->pin_sem);
- f2fs_lock_op(sbi);
- f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
- f2fs_unlock_op(sbi);
+ ret = f2fs_allocate_pinning_section(sbi);
+ if (ret) {
+ f2fs_up_write(&sbi->pin_sem);
+ break;
+ }
set_inode_flag(inode, FI_SKIP_WRITES);
- for (blkofs = 0; blkofs < blk_per_sec; blkofs++) {
+ for (blkofs = 0; blkofs <= blkofs_end; blkofs++) {
struct page *page;
unsigned int blkidx = secidx * blk_per_sec + blkofs;
@@ -4013,27 +3943,34 @@ retry:
nr_pblocks = map.m_len;
if ((pblock - SM_I(sbi)->main_blkaddr) & sec_blks_mask ||
- nr_pblocks & sec_blks_mask) {
+ nr_pblocks & sec_blks_mask ||
+ !f2fs_valid_pinned_area(sbi, pblock)) {
+ bool last_extent = false;
+
not_aligned++;
nr_pblocks = roundup(nr_pblocks, blks_per_sec);
if (cur_lblock + nr_pblocks > sis->max)
nr_pblocks -= blks_per_sec;
+ /* this extent is last one */
if (!nr_pblocks) {
- /* this extent is last one */
- nr_pblocks = map.m_len;
- f2fs_warn(sbi, "Swapfile: last extent is not aligned to section");
- goto next;
+ nr_pblocks = last_lblock - cur_lblock;
+ last_extent = true;
}
ret = f2fs_migrate_blocks(inode, cur_lblock,
nr_pblocks);
- if (ret)
+ if (ret) {
+ if (ret == -ENOENT)
+ ret = -EINVAL;
goto out;
- goto retry;
+ }
+
+ if (!last_extent)
+ goto retry;
}
-next:
+
if (cur_lblock + nr_pblocks >= sis->max)
nr_pblocks = sis->max - cur_lblock;
@@ -4071,17 +4008,17 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
sector_t *span)
{
struct inode *inode = file_inode(file);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int ret;
if (!S_ISREG(inode->i_mode))
return -EINVAL;
- if (f2fs_readonly(F2FS_I_SB(inode)->sb))
+ if (f2fs_readonly(sbi->sb))
return -EROFS;
- if (f2fs_lfs_mode(F2FS_I_SB(inode))) {
- f2fs_err(F2FS_I_SB(inode),
- "Swapfile not supported in LFS mode");
+ if (f2fs_lfs_mode(sbi) && !f2fs_sb_has_blkzoned(sbi)) {
+ f2fs_err(sbi, "Swapfile not supported in LFS mode");
return -EINVAL;
}
@@ -4092,6 +4029,10 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (!f2fs_disable_compressed_file(inode))
return -EINVAL;
+ ret = filemap_fdatawrite(inode->i_mapping);
+ if (ret < 0)
+ return ret;
+
f2fs_precache_extents(inode);
ret = check_swap_activate(sis, file, span);
@@ -4100,7 +4041,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
stat_inc_swapfile_inode(inode);
set_inode_flag(inode, FI_PIN_FILE);
- f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+ f2fs_update_time(sbi, REQ_TIME);
return ret;
}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index fdbf994f1271..8b0e1e71b667 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -41,7 +41,7 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi)
total_vblocks = 0;
blks_per_sec = CAP_BLKS_PER_SEC(sbi);
hblks_per_sec = blks_per_sec / 2;
- for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
+ for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) {
vblocks = get_valid_blocks(sbi, segno, true);
dist = abs(vblocks - hblks_per_sec);
bimodal += dist * dist;
@@ -135,7 +135,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->cur_ckpt_time = sbi->cprc_info.cur_time;
si->peak_ckpt_time = sbi->cprc_info.peak_time;
spin_unlock(&sbi->cprc_info.stat_lock);
- si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
+ si->total_count = BLKS_TO_SEGS(sbi, (int)sbi->user_block_count);
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
si->valid_count = valid_user_blocks(sbi);
@@ -176,11 +176,10 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID];
si->io_skip_bggc = sbi->io_skip_bggc;
si->other_skip_bggc = sbi->other_skip_bggc;
- si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
+ si->util_free = (int)(BLKS_TO_SEGS(sbi, free_user_blocks(sbi)))
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
/ 2;
- si->util_valid = (int)(written_block_count(sbi) >>
- sbi->log_blocks_per_seg)
+ si->util_valid = (int)(BLKS_TO_SEGS(sbi, written_block_count(sbi)))
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
/ 2;
si->util_invalid = 50 - si->util_free - si->util_valid;
@@ -208,7 +207,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
if (!blks)
continue;
- if (blks == sbi->blocks_per_seg)
+ if (blks == BLKS_PER_SEG(sbi))
si->full_seg[type]++;
else
si->dirty_seg[type]++;
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 042593aed1ec..02c9355176d3 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -830,13 +830,14 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
return err;
}
-int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
+int f2fs_do_tmpfile(struct inode *inode, struct inode *dir,
+ struct f2fs_filename *fname)
{
struct page *page;
int err = 0;
f2fs_down_write(&F2FS_I(inode)->i_sem);
- page = f2fs_init_inode_metadata(inode, dir, NULL, NULL);
+ page = f2fs_init_inode_metadata(inode, dir, fname, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
@@ -995,9 +996,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
de = &d->dentry[bit_pos];
if (de->name_len == 0) {
if (found_valid_dirent || !bit_pos) {
- printk_ratelimited(
- "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.",
- KERN_WARNING, sbi->sb->s_id,
+ f2fs_warn_ratelimited(sbi,
+ "invalid namelen(0), ino:%u, run fsck to fix.",
le32_to_cpu(de->ino));
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index ad8dfac73bd4..48048fa36427 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -43,7 +43,6 @@ bool sanity_check_extent_cache(struct inode *inode)
if (!f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE) ||
!f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1,
DATA_GENERIC_ENHANCE)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix",
__func__, inode->i_ino,
ei->blk, ei->fofs, ei->len);
@@ -856,10 +855,8 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei,
goto out;
if (__is_valid_data_blkaddr(blkaddr) &&
- !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
- f2fs_bug_on(sbi, 1);
+ !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE))
return -EINVAL;
- }
out:
/*
* init block age with zero, this can happen when the block age extent
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 65294e3b0bef..fced2b7652f4 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -24,6 +24,7 @@
#include <linux/blkdev.h>
#include <linux/quotaops.h>
#include <linux/part_stat.h>
+#include <linux/rw_hint.h>
#include <crypto/hash.h>
#include <linux/fscrypt.h>
@@ -60,7 +61,9 @@ enum {
FAULT_SLAB_ALLOC,
FAULT_DQUOT_INIT,
FAULT_LOCK_OP,
- FAULT_BLKADDR,
+ FAULT_BLKADDR_VALIDITY,
+ FAULT_BLKADDR_CONSISTENCE,
+ FAULT_NO_SEGMENT,
FAULT_MAX,
};
@@ -75,6 +78,11 @@ struct f2fs_fault_info {
extern const char *f2fs_fault_name[FAULT_MAX];
#define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type))
+
+/* maximum retry count for injected failure */
+#define DEFAULT_FAILURE_RETRY_COUNT 8
+#else
+#define DEFAULT_FAILURE_RETRY_COUNT 1
#endif
/*
@@ -142,7 +150,6 @@ struct f2fs_rwsem {
struct f2fs_mount_info {
unsigned int opt;
- int write_io_size_bits; /* Write IO size bits */
block_t root_reserved_blocks; /* root reserved blocks */
kuid_t s_resuid; /* reserved blocks for uid */
kgid_t s_resgid; /* reserved blocks for gid */
@@ -829,7 +836,7 @@ struct f2fs_inode_info {
spinlock_t i_size_lock; /* protect last_disk_size */
#ifdef CONFIG_QUOTA
- struct dquot *i_dquot[MAXQUOTAS];
+ struct dquot __rcu *i_dquot[MAXQUOTAS];
/* quota space reservation, managed internally by quota code */
qsize_t i_reserved_quota;
@@ -1080,7 +1087,8 @@ struct f2fs_sm_info {
* f2fs monitors the number of several block types such as on-writeback,
* dirty dentry blocks, dirty node blocks, and dirty meta blocks.
*/
-#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
+#define WB_DATA_TYPE(p, f) \
+ (f || f2fs_is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
enum count_type {
F2FS_DIRTY_DENTS,
F2FS_DIRTY_DATA,
@@ -1110,6 +1118,7 @@ enum count_type {
* ... Only can be used with META.
*/
#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type))
+#define PAGE_TYPE_ON_MAIN(type) ((type) == DATA || (type) == NODE)
enum page_type {
DATA = 0,
NODE = 1, /* should not change this */
@@ -1204,7 +1213,6 @@ struct f2fs_io_info {
unsigned int submitted:1; /* indicate IO submission */
unsigned int in_list:1; /* indicate fio is in io_list */
unsigned int is_por:1; /* indicate IO is from recovery or not */
- unsigned int retry:1; /* need to reallocate block address */
unsigned int encrypted:1; /* indicate file is encrypted */
unsigned int post_read:1; /* require post read */
enum iostat_type io_type; /* io type */
@@ -1239,7 +1247,7 @@ struct f2fs_bio_info {
#define FDEV(i) (sbi->devs[i])
#define RDEV(i) (raw_super->devs[i])
struct f2fs_dev_info {
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct block_device *bdev;
char path[MAX_PATH_LEN];
unsigned int total_segments;
@@ -1406,18 +1414,16 @@ static inline void f2fs_clear_bit(unsigned int nr, char *addr);
* Layout A: lowest bit should be 1
* | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... |
* bit 0 PAGE_PRIVATE_NOT_POINTER
- * bit 1 PAGE_PRIVATE_DUMMY_WRITE
- * bit 2 PAGE_PRIVATE_ONGOING_MIGRATION
- * bit 3 PAGE_PRIVATE_INLINE_INODE
- * bit 4 PAGE_PRIVATE_REF_RESOURCE
- * bit 5- f2fs private data
+ * bit 1 PAGE_PRIVATE_ONGOING_MIGRATION
+ * bit 2 PAGE_PRIVATE_INLINE_INODE
+ * bit 3 PAGE_PRIVATE_REF_RESOURCE
+ * bit 4- f2fs private data
*
* Layout B: lowest bit should be 0
* page.private is a wrapped pointer.
*/
enum {
PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */
- PAGE_PRIVATE_DUMMY_WRITE, /* data page for padding aligned IO */
PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */
PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */
PAGE_PRIVATE_REF_RESOURCE, /* dirty page has referenced resources */
@@ -1564,7 +1570,6 @@ struct f2fs_sb_info {
struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */
/* keep migration IO order for LFS mode */
struct f2fs_rwsem io_order_lock;
- mempool_t *write_io_dummy; /* Dummy pages */
pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */
int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */
@@ -1810,6 +1815,37 @@ struct f2fs_sb_info {
#endif
};
+/* Definitions to access f2fs_sb_info */
+#define SEGS_TO_BLKS(sbi, segs) \
+ ((segs) << (sbi)->log_blocks_per_seg)
+#define BLKS_TO_SEGS(sbi, blks) \
+ ((blks) >> (sbi)->log_blocks_per_seg)
+
+#define BLKS_PER_SEG(sbi) ((sbi)->blocks_per_seg)
+#define BLKS_PER_SEC(sbi) (SEGS_TO_BLKS(sbi, (sbi)->segs_per_sec))
+#define SEGS_PER_SEC(sbi) ((sbi)->segs_per_sec)
+
+__printf(3, 4)
+void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, const char *fmt, ...);
+
+#define f2fs_err(sbi, fmt, ...) \
+ f2fs_printk(sbi, false, KERN_ERR fmt, ##__VA_ARGS__)
+#define f2fs_warn(sbi, fmt, ...) \
+ f2fs_printk(sbi, false, KERN_WARNING fmt, ##__VA_ARGS__)
+#define f2fs_notice(sbi, fmt, ...) \
+ f2fs_printk(sbi, false, KERN_NOTICE fmt, ##__VA_ARGS__)
+#define f2fs_info(sbi, fmt, ...) \
+ f2fs_printk(sbi, false, KERN_INFO fmt, ##__VA_ARGS__)
+#define f2fs_debug(sbi, fmt, ...) \
+ f2fs_printk(sbi, false, KERN_DEBUG fmt, ##__VA_ARGS__)
+
+#define f2fs_err_ratelimited(sbi, fmt, ...) \
+ f2fs_printk(sbi, true, KERN_ERR fmt, ##__VA_ARGS__)
+#define f2fs_warn_ratelimited(sbi, fmt, ...) \
+ f2fs_printk(sbi, true, KERN_WARNING fmt, ##__VA_ARGS__)
+#define f2fs_info_ratelimited(sbi, fmt, ...) \
+ f2fs_printk(sbi, true, KERN_INFO fmt, ##__VA_ARGS__)
+
#ifdef CONFIG_F2FS_FAULT_INJECTION
#define time_to_inject(sbi, type) __time_to_inject(sbi, type, __func__, \
__builtin_return_address(0))
@@ -1827,9 +1863,8 @@ static inline bool __time_to_inject(struct f2fs_sb_info *sbi, int type,
atomic_inc(&ffi->inject_ops);
if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) {
atomic_set(&ffi->inject_ops, 0);
- printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n",
- KERN_INFO, sbi->sb->s_id, f2fs_fault_name[type],
- func, parent_func);
+ f2fs_info_ratelimited(sbi, "inject %s in %s of %pS",
+ f2fs_fault_name[type], func, parent_func);
return true;
}
return false;
@@ -2249,9 +2284,30 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi,
return false;
}
+static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi,
+ struct inode *inode, bool cap)
+{
+ block_t avail_user_block_count;
+
+ avail_user_block_count = sbi->user_block_count -
+ sbi->current_reserved_blocks;
+
+ if (!__allow_reserved_blocks(sbi, inode, cap))
+ avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
+
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ if (avail_user_block_count > sbi->unusable_block_count)
+ avail_user_block_count -= sbi->unusable_block_count;
+ else
+ avail_user_block_count = 0;
+ }
+
+ return avail_user_block_count;
+}
+
static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool);
static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
- struct inode *inode, blkcnt_t *count)
+ struct inode *inode, blkcnt_t *count, bool partial)
{
blkcnt_t diff = 0, release = 0;
block_t avail_user_block_count;
@@ -2274,23 +2330,14 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
spin_lock(&sbi->stat_lock);
sbi->total_valid_block_count += (block_t)(*count);
- avail_user_block_count = sbi->user_block_count -
- sbi->current_reserved_blocks;
+ avail_user_block_count = get_available_block_count(sbi, inode, true);
- if (!__allow_reserved_blocks(sbi, inode, true))
- avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
-
- if (F2FS_IO_ALIGNED(sbi))
- avail_user_block_count -= sbi->blocks_per_seg *
- SM_I(sbi)->additional_reserved_segments;
-
- if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
- if (avail_user_block_count > sbi->unusable_block_count)
- avail_user_block_count -= sbi->unusable_block_count;
- else
- avail_user_block_count = 0;
- }
if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) {
+ if (!partial) {
+ spin_unlock(&sbi->stat_lock);
+ goto enospc;
+ }
+
diff = sbi->total_valid_block_count - avail_user_block_count;
if (diff > *count)
diff = *count;
@@ -2318,20 +2365,6 @@ release_quota:
return -ENOSPC;
}
-__printf(2, 3)
-void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...);
-
-#define f2fs_err(sbi, fmt, ...) \
- f2fs_printk(sbi, KERN_ERR fmt, ##__VA_ARGS__)
-#define f2fs_warn(sbi, fmt, ...) \
- f2fs_printk(sbi, KERN_WARNING fmt, ##__VA_ARGS__)
-#define f2fs_notice(sbi, fmt, ...) \
- f2fs_printk(sbi, KERN_NOTICE fmt, ##__VA_ARGS__)
-#define f2fs_info(sbi, fmt, ...) \
- f2fs_printk(sbi, KERN_INFO fmt, ##__VA_ARGS__)
-#define f2fs_debug(sbi, fmt, ...) \
- f2fs_printk(sbi, KERN_DEBUG fmt, ##__VA_ARGS__)
-
#define PAGE_PRIVATE_GET_FUNC(name, flagname) \
static inline bool page_private_##name(struct page *page) \
{ \
@@ -2360,17 +2393,14 @@ static inline void clear_page_private_##name(struct page *page) \
PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER);
PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE);
PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION);
-PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE);
PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE);
PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE);
PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION);
-PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE);
PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE);
PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE);
PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION);
-PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE);
static inline unsigned long get_page_private_data(struct page *page)
{
@@ -2504,11 +2534,8 @@ static inline int get_dirty_pages(struct inode *inode)
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
{
- unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
- unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >>
- sbi->log_blocks_per_seg;
-
- return segs / sbi->segs_per_sec;
+ return div_u64(get_pages(sbi, block_type) + BLKS_PER_SEC(sbi) - 1,
+ BLKS_PER_SEC(sbi));
}
static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
@@ -2572,7 +2599,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
if (sbi->cur_cp_pack == 2)
- start_addr += sbi->blocks_per_seg;
+ start_addr += BLKS_PER_SEG(sbi);
return start_addr;
}
@@ -2581,7 +2608,7 @@ static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi)
block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
if (sbi->cur_cp_pack == 1)
- start_addr += sbi->blocks_per_seg;
+ start_addr += BLKS_PER_SEG(sbi);
return start_addr;
}
@@ -2600,7 +2627,8 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
struct inode *inode, bool is_inode)
{
block_t valid_block_count;
- unsigned int valid_node_count, user_block_count;
+ unsigned int valid_node_count;
+ unsigned int avail_user_block_count;
int err;
if (is_inode) {
@@ -2620,21 +2648,10 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
spin_lock(&sbi->stat_lock);
- valid_block_count = sbi->total_valid_block_count +
- sbi->current_reserved_blocks + 1;
-
- if (!__allow_reserved_blocks(sbi, inode, false))
- valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks;
+ valid_block_count = sbi->total_valid_block_count + 1;
+ avail_user_block_count = get_available_block_count(sbi, inode, false);
- if (F2FS_IO_ALIGNED(sbi))
- valid_block_count += sbi->blocks_per_seg *
- SM_I(sbi)->additional_reserved_segments;
-
- user_block_count = sbi->user_block_count;
- if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
- user_block_count -= sbi->unusable_block_count;
-
- if (unlikely(valid_block_count > user_block_count)) {
+ if (unlikely(valid_block_count > avail_user_block_count)) {
spin_unlock(&sbi->stat_lock);
goto enospc;
}
@@ -3021,6 +3038,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
case FI_INLINE_DOTS:
case FI_PIN_FILE:
case FI_COMPRESS_RELEASED:
+ case FI_ATOMIC_COMMITTED:
f2fs_mark_inode_dirty_sync(inode, true);
}
}
@@ -3364,17 +3382,6 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
return is_set_ckpt_flags(sbi, CP_ERROR_FLAG);
}
-static inline bool is_dot_dotdot(const u8 *name, size_t len)
-{
- if (len == 1 && name[0] == '.')
- return true;
-
- if (len == 2 && name[0] == '.' && name[1] == '.')
- return true;
-
- return false;
-}
-
static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
size_t size, gfp_t flags)
{
@@ -3455,7 +3462,7 @@ static inline __le32 *get_dnode_addr(struct inode *inode,
sizeof((f2fs_inode)->field)) \
<= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \
-#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1)
+#define __is_large_section(sbi) (SEGS_PER_SEC(sbi) > 1)
#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META)
@@ -3464,11 +3471,9 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
static inline void verify_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type)
{
- if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) {
+ if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type))
f2fs_err(sbi, "invalid blkaddr: %u, type: %d, run fsck to fix.",
blkaddr, type);
- f2fs_bug_on(sbi, 1);
- }
}
static inline bool __is_valid_data_blkaddr(block_t blkaddr)
@@ -3570,7 +3575,8 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
struct inode *inode, nid_t ino, umode_t mode);
void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
struct inode *dir, struct inode *inode);
-int f2fs_do_tmpfile(struct inode *inode, struct inode *dir);
+int f2fs_do_tmpfile(struct inode *inode, struct inode *dir,
+ struct f2fs_filename *fname);
bool f2fs_empty_dir(struct inode *dir);
static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
@@ -3685,15 +3691,14 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable);
void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno);
-void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi);
+int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi);
void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi);
void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi);
-void f2fs_get_new_segment(struct f2fs_sb_info *sbi,
- unsigned int *newseg, bool new_sec, int dir);
-void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
+int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
unsigned int start, unsigned int end);
-void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force);
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
+int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force);
+int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi);
+int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
struct cp_control *cpc);
@@ -3714,7 +3719,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
block_t old_addr, block_t new_addr,
unsigned char version, bool recover_curseg,
bool recover_newaddr);
-void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
struct f2fs_io_info *fio);
@@ -3764,6 +3769,8 @@ struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index);
struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index);
bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type);
+bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi,
+ block_t blkaddr, int type);
int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync);
void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
@@ -3804,6 +3811,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi);
*/
int __init f2fs_init_bioset(void);
void f2fs_destroy_bioset(void);
+bool f2fs_is_cp_guaranteed(struct page *page);
int f2fs_init_bio_entry_cache(void);
void f2fs_destroy_bio_entry_cache(void);
void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
@@ -3867,6 +3875,9 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi);
block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode);
int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control);
void f2fs_build_gc_manager(struct f2fs_sb_info *sbi);
+int f2fs_gc_range(struct f2fs_sb_info *sbi,
+ unsigned int start_seg, unsigned int end_seg,
+ bool dry_run, unsigned int dry_run_sections);
int f2fs_resize_fs(struct file *filp, __u64 block_count);
int __init f2fs_create_garbage_collection_cache(void);
void f2fs_destroy_garbage_collection_cache(void);
@@ -4287,7 +4298,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
bool in_task);
void f2fs_put_page_dic(struct page *page, bool in_task);
-unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn);
+unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn,
+ unsigned int ofs_in_node);
int f2fs_init_compress_ctx(struct compress_ctx *cc);
void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse);
void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
@@ -4344,7 +4356,8 @@ static inline void f2fs_put_page_dic(struct page *page, bool in_task)
{
WARN_ON_ONCE(1);
}
-static inline unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) { return 0; }
+static inline unsigned int f2fs_cluster_blocks_are_contiguous(
+ struct dnode_of_data *dn, unsigned int ofs_in_node) { return 0; }
static inline bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { return false; }
static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { }
@@ -4401,15 +4414,24 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
- if (!f2fs_compressed_file(inode))
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
+
+ if (!f2fs_compressed_file(inode)) {
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
return true;
- if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))
+ }
+ if (f2fs_is_mmap_file(inode) ||
+ (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))) {
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
return false;
+ }
fi->i_flags &= ~F2FS_COMPR_FL;
stat_dec_compr_inode(inode);
clear_inode_flag(inode, FI_COMPRESSED_FILE);
f2fs_mark_inode_dirty_sync(inode, true);
+
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
return true;
}
@@ -4512,6 +4534,17 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
}
+static inline bool f2fs_valid_pinned_area(struct f2fs_sb_info *sbi,
+ block_t blkaddr)
+{
+ if (f2fs_sb_has_blkzoned(sbi)) {
+ int devi = f2fs_target_device_index(sbi, blkaddr);
+
+ return !bdev_is_zoned(FDEV(devi).bdev);
+ }
+ return true;
+}
+
static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi)
{
return F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW;
@@ -4613,10 +4646,36 @@ static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi)
return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb);
}
+static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi,
+ block_t blkaddr, unsigned int cnt)
+{
+ bool need_submit = false;
+ int i = 0;
+
+ do {
+ struct page *page;
+
+ page = find_get_page(META_MAPPING(sbi), blkaddr + i);
+ if (page) {
+ if (PageWriteback(page))
+ need_submit = true;
+ f2fs_put_page(page, 0);
+ }
+ } while (++i < cnt && !need_submit);
+
+ if (need_submit)
+ f2fs_submit_merged_write_cond(sbi, sbi->meta_inode,
+ NULL, 0, DATA);
+
+ truncate_inode_pages_range(META_MAPPING(sbi),
+ F2FS_BLK_TO_BYTES((loff_t)blkaddr),
+ F2FS_BLK_END_BYTES((loff_t)(blkaddr + cnt - 1)));
+}
+
static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi,
block_t blkaddr)
{
- invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr);
+ f2fs_truncate_meta_inode_pages(sbi, blkaddr, 1);
f2fs_invalidate_compress_page(sbi, blkaddr);
}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b58ab1157b7e..1761ad125f97 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -39,6 +39,7 @@
static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
+ vm_flags_t flags = vmf->vma->vm_flags;
vm_fault_t ret;
ret = filemap_fault(vmf);
@@ -46,7 +47,7 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
f2fs_update_iostat(F2FS_I_SB(inode), inode,
APP_MAPPED_READ_IO, F2FS_BLKSIZE);
- trace_f2fs_filemap_fault(inode, vmf->pgoff, vmf->vma->vm_flags, ret);
+ trace_f2fs_filemap_fault(inode, vmf->pgoff, flags, ret);
return ret;
}
@@ -394,9 +395,20 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
return f2fs_do_sync_file(file, start, end, datasync, false);
}
-static bool __found_offset(struct address_space *mapping, block_t blkaddr,
- pgoff_t index, int whence)
+static bool __found_offset(struct address_space *mapping,
+ struct dnode_of_data *dn, pgoff_t index, int whence)
{
+ block_t blkaddr = f2fs_data_blkaddr(dn);
+ struct inode *inode = mapping->host;
+ bool compressed_cluster = false;
+
+ if (f2fs_compressed_file(inode)) {
+ block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ ALIGN_DOWN(dn->ofs_in_node, F2FS_I(inode)->i_cluster_size));
+
+ compressed_cluster = first_blkaddr == COMPRESS_ADDR;
+ }
+
switch (whence) {
case SEEK_DATA:
if (__is_valid_data_blkaddr(blkaddr))
@@ -404,8 +416,12 @@ static bool __found_offset(struct address_space *mapping, block_t blkaddr,
if (blkaddr == NEW_ADDR &&
xa_get_mark(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY))
return true;
+ if (compressed_cluster)
+ return true;
break;
case SEEK_HOLE:
+ if (compressed_cluster)
+ return false;
if (blkaddr == NULL_ADDR)
return true;
break;
@@ -474,7 +490,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
goto fail;
}
- if (__found_offset(file->f_mapping, blkaddr,
+ if (__found_offset(file->f_mapping, &dn,
pgofs, whence)) {
f2fs_put_dnode(&dn);
goto found;
@@ -590,8 +606,10 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
f2fs_set_data_blkaddr(dn, NULL_ADDR);
if (__is_valid_data_blkaddr(blkaddr)) {
- if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
- DATA_GENERIC_ENHANCE))
+ if (time_to_inject(sbi, FAULT_BLKADDR_CONSISTENCE))
+ continue;
+ if (!f2fs_is_valid_blkaddr_raw(sbi, blkaddr,
+ DATA_GENERIC_ENHANCE))
continue;
if (compressed_cluster)
valid_blocks++;
@@ -818,8 +836,6 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw)
*/
if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE))
return true;
- if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi))
- return true;
if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
return true;
@@ -1192,7 +1208,6 @@ next_dnode:
!f2fs_is_valid_blkaddr(sbi, *blkaddr,
DATA_GENERIC_ENHANCE)) {
f2fs_put_dnode(&dn);
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
}
@@ -1478,7 +1493,6 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr,
DATA_GENERIC_ENHANCE)) {
ret = -EFSCORRUPTED;
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
break;
}
@@ -1662,10 +1676,12 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
}
filemap_invalidate_unlock(mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ if (ret)
+ return ret;
/* write out all moved pages, if possible */
filemap_invalidate_lock(mapping);
- filemap_write_and_wait_range(mapping, offset, LLONG_MAX);
+ ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX);
truncate_pagecache(inode, offset);
filemap_invalidate_unlock(mapping);
@@ -1731,9 +1747,11 @@ next_alloc:
f2fs_down_write(&sbi->pin_sem);
- f2fs_lock_op(sbi);
- f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
- f2fs_unlock_op(sbi);
+ err = f2fs_allocate_pinning_section(sbi);
+ if (err) {
+ f2fs_up_write(&sbi->pin_sem);
+ goto out_err;
+ }
map.m_seg_type = CURSEG_COLD_DATA_PINNED;
err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRE_DIO);
@@ -2066,7 +2084,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate)
inode_lock(inode);
- if (!f2fs_disable_compressed_file(inode)) {
+ if (!f2fs_disable_compressed_file(inode) ||
+ f2fs_is_pinned_file(inode)) {
ret = -EINVAL;
goto out;
}
@@ -2243,8 +2262,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
case F2FS_GOING_DOWN_METASYNC:
/* do checkpoint only */
ret = f2fs_sync_fs(sb, 1);
- if (ret)
+ if (ret) {
+ if (ret == -EIO)
+ ret = 0;
goto out;
+ }
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
break;
case F2FS_GOING_DOWN_NOSYNC:
@@ -2260,6 +2282,8 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
set_sbi_flag(sbi, SBI_IS_DIRTY);
/* do checkpoint only */
ret = f2fs_sync_fs(sb, 1);
+ if (ret == -EIO)
+ ret = 0;
goto out;
default:
ret = -EINVAL;
@@ -2578,7 +2602,6 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
.m_may_create = false };
struct extent_info ei = {};
pgoff_t pg_start, pg_end, next_pgofs;
- unsigned int blk_per_seg = sbi->blocks_per_seg;
unsigned int total = 0, sec_num;
block_t blk_end = 0;
bool fragmented = false;
@@ -2687,7 +2710,8 @@ do_map:
set_inode_flag(inode, FI_SKIP_WRITES);
idx = map.m_lblk;
- while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
+ while (idx < map.m_lblk + map.m_len &&
+ cnt < BLKS_PER_SEG(sbi)) {
struct page *page;
page = f2fs_get_lock_data_page(inode, idx, true);
@@ -2707,7 +2731,7 @@ do_map:
map.m_lblk = idx;
check:
- if (map.m_lblk < pg_end && cnt < blk_per_seg)
+ if (map.m_lblk < pg_end && cnt < BLKS_PER_SEG(sbi))
goto do_map;
clear_inode_flag(inode, FI_SKIP_WRITES);
@@ -2976,8 +3000,8 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
if (!f2fs_is_multi_device(sbi) || sbi->s_ndevs - 1 <= range.dev_num ||
__is_large_section(sbi)) {
- f2fs_warn(sbi, "Can't flush %u in %d for segs_per_sec %u != 1",
- range.dev_num, sbi->s_ndevs, sbi->segs_per_sec);
+ f2fs_warn(sbi, "Can't flush %u in %d for SEGS_PER_SEC %u != 1",
+ range.dev_num, sbi->s_ndevs, SEGS_PER_SEC(sbi));
return -EINVAL;
}
@@ -3183,6 +3207,7 @@ int f2fs_pin_file_control(struct inode *inode, bool inc)
static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
__u32 pin;
int ret = 0;
@@ -3192,7 +3217,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
if (!S_ISREG(inode->i_mode))
return -EINVAL;
- if (f2fs_readonly(F2FS_I_SB(inode)->sb))
+ if (f2fs_readonly(sbi->sb))
return -EROFS;
ret = mnt_want_write_file(filp);
@@ -3205,9 +3230,18 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
clear_inode_flag(inode, FI_PIN_FILE);
f2fs_i_gc_failures_write(inode, 0);
goto done;
+ } else if (f2fs_is_pinned_file(inode)) {
+ goto done;
}
- if (f2fs_should_update_outplace(inode, NULL)) {
+ if (f2fs_sb_has_blkzoned(sbi) && F2FS_HAS_BLOCKS(inode)) {
+ ret = -EFBIG;
+ goto out;
+ }
+
+ /* Let's allow file pinning on zoned device. */
+ if (!f2fs_sb_has_blkzoned(sbi) &&
+ f2fs_should_update_outplace(inode, NULL)) {
ret = -EINVAL;
goto out;
}
@@ -3229,7 +3263,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
set_inode_flag(inode, FI_PIN_FILE);
ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN];
done:
- f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+ f2fs_update_time(sbi, REQ_TIME);
out:
inode_unlock(inode);
mnt_drop_write_file(filp);
@@ -3438,10 +3472,8 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
if (!__is_valid_data_blkaddr(blkaddr))
continue;
if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr,
- DATA_GENERIC_ENHANCE))) {
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
+ DATA_GENERIC_ENHANCE)))
return -EFSCORRUPTED;
- }
}
while (count) {
@@ -3588,10 +3620,10 @@ out:
return ret;
}
-static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
+static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count,
+ unsigned int *reserved_blocks)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
- unsigned int reserved_blocks = 0;
int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
block_t blkaddr;
int i;
@@ -3603,10 +3635,8 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
if (!__is_valid_data_blkaddr(blkaddr))
continue;
if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr,
- DATA_GENERIC_ENHANCE))) {
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
+ DATA_GENERIC_ENHANCE)))
return -EFSCORRUPTED;
- }
}
while (count) {
@@ -3614,40 +3644,53 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
blkcnt_t reserved;
int ret;
- for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) {
- blkaddr = f2fs_data_blkaddr(dn);
+ for (i = 0; i < cluster_size; i++) {
+ blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ dn->ofs_in_node + i);
if (i == 0) {
- if (blkaddr == COMPRESS_ADDR)
- continue;
- dn->ofs_in_node += cluster_size;
- goto next;
+ if (blkaddr != COMPRESS_ADDR) {
+ dn->ofs_in_node += cluster_size;
+ goto next;
+ }
+ continue;
}
- if (__is_valid_data_blkaddr(blkaddr)) {
+ /*
+ * compressed cluster was not released due to it
+ * fails in release_compress_blocks(), so NEW_ADDR
+ * is a possible case.
+ */
+ if (blkaddr == NEW_ADDR ||
+ __is_valid_data_blkaddr(blkaddr)) {
compr_blocks++;
continue;
}
-
- f2fs_set_data_blkaddr(dn, NEW_ADDR);
}
reserved = cluster_size - compr_blocks;
- ret = inc_valid_block_count(sbi, dn->inode, &reserved);
- if (ret)
+
+ /* for the case all blocks in cluster were reserved */
+ if (reserved == 1)
+ goto next;
+
+ ret = inc_valid_block_count(sbi, dn->inode, &reserved, false);
+ if (unlikely(ret))
return ret;
- if (reserved != cluster_size - compr_blocks)
- return -ENOSPC;
+ for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) {
+ if (f2fs_data_blkaddr(dn) == NULL_ADDR)
+ f2fs_set_data_blkaddr(dn, NEW_ADDR);
+ }
f2fs_i_compr_blocks_update(dn->inode, compr_blocks, true);
- reserved_blocks += reserved;
+ *reserved_blocks += reserved;
next:
count -= cluster_size;
}
- return reserved_blocks;
+ return 0;
}
static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
@@ -3671,9 +3714,6 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
if (ret)
return ret;
- if (atomic_read(&F2FS_I(inode)->i_compr_blocks))
- goto out;
-
f2fs_balance_fs(sbi, true);
inode_lock(inode);
@@ -3683,6 +3723,9 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
goto unlock_inode;
}
+ if (atomic_read(&F2FS_I(inode)->i_compr_blocks))
+ goto unlock_inode;
+
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
@@ -3708,7 +3751,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
count = min(end_offset - dn.ofs_in_node, last_idx - page_idx);
count = round_up(count, F2FS_I(inode)->i_cluster_size);
- ret = reserve_compress_blocks(&dn, count);
+ ret = reserve_compress_blocks(&dn, count, &reserved_blocks);
f2fs_put_dnode(&dn);
@@ -3716,23 +3759,21 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
break;
page_idx += count;
- reserved_blocks += ret;
}
filemap_invalidate_unlock(inode->i_mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- if (ret >= 0) {
+ if (!ret) {
clear_inode_flag(inode, FI_COMPRESS_RELEASED);
inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, true);
}
unlock_inode:
inode_unlock(inode);
-out:
mnt_drop_write_file(filp);
- if (ret >= 0) {
+ if (!ret) {
ret = put_user(reserved_blocks, (u64 __user *)arg);
} else if (reserved_blocks &&
atomic_read(&F2FS_I(inode)->i_compr_blocks)) {
@@ -3877,8 +3918,6 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
DATA_GENERIC_ENHANCE)) {
ret = -EFSCORRUPTED;
f2fs_put_dnode(&dn);
- f2fs_handle_error(sbi,
- ERROR_INVALID_BLKADDR);
goto out;
}
@@ -3981,16 +4020,20 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg)
sizeof(option)))
return -EFAULT;
- if (!f2fs_compressed_file(inode) ||
- option.log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
- option.log_cluster_size > MAX_COMPRESS_LOG_SIZE ||
- option.algorithm >= COMPRESS_MAX)
+ if (option.log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
+ option.log_cluster_size > MAX_COMPRESS_LOG_SIZE ||
+ option.algorithm >= COMPRESS_MAX)
return -EINVAL;
file_start_write(filp);
inode_lock(inode);
f2fs_down_write(&F2FS_I(inode)->i_sem);
+ if (!f2fs_compressed_file(inode)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (f2fs_is_mmap_file(inode) || get_dirty_pages(inode)) {
ret = -EBUSY;
goto out;
@@ -4066,7 +4109,6 @@ static int f2fs_ioc_decompress_file(struct file *filp)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
pgoff_t page_idx = 0, last_idx;
- unsigned int blk_per_seg = sbi->blocks_per_seg;
int cluster_size = fi->i_cluster_size;
int count, ret;
@@ -4110,7 +4152,7 @@ static int f2fs_ioc_decompress_file(struct file *filp)
if (ret < 0)
break;
- if (get_dirty_pages(inode) >= blk_per_seg) {
+ if (get_dirty_pages(inode) >= BLKS_PER_SEG(sbi)) {
ret = filemap_fdatawrite(inode->i_mapping);
if (ret < 0)
break;
@@ -4145,7 +4187,6 @@ static int f2fs_ioc_compress_file(struct file *filp)
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
pgoff_t page_idx = 0, last_idx;
- unsigned int blk_per_seg = sbi->blocks_per_seg;
int cluster_size = F2FS_I(inode)->i_cluster_size;
int count, ret;
@@ -4188,7 +4229,7 @@ static int f2fs_ioc_compress_file(struct file *filp)
if (ret < 0)
break;
- if (get_dirty_pages(inode) >= blk_per_seg) {
+ if (get_dirty_pages(inode) >= BLKS_PER_SEG(sbi)) {
ret = filemap_fdatawrite(inode->i_mapping);
if (ret < 0)
break;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index a079eebfb080..8852814dab7f 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -259,7 +259,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
p->ofs_unit = 1;
} else {
p->gc_mode = select_gc_type(sbi, gc_type);
- p->ofs_unit = sbi->segs_per_sec;
+ p->ofs_unit = SEGS_PER_SEC(sbi);
if (__is_large_section(sbi)) {
p->dirty_bitmap = dirty_i->dirty_secmap;
p->max_search = count_bits(p->dirty_bitmap,
@@ -280,11 +280,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
p->max_search > sbi->max_victim_search)
p->max_search = sbi->max_victim_search;
- /* let's select beginning hot/small space first in no_heap mode*/
+ /* let's select beginning hot/small space first. */
if (f2fs_need_rand_seg(sbi))
- p->offset = get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec);
- else if (test_opt(sbi, NOHEAP) &&
- (type == CURSEG_HOT_DATA || IS_NODESEG(type)))
+ p->offset = get_random_u32_below(MAIN_SECS(sbi) *
+ SEGS_PER_SEC(sbi));
+ else if (type == CURSEG_HOT_DATA || IS_NODESEG(type))
p->offset = 0;
else
p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
@@ -295,13 +295,13 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
{
/* SSR allocates in a segment unit */
if (p->alloc_mode == SSR)
- return sbi->blocks_per_seg;
+ return BLKS_PER_SEG(sbi);
else if (p->alloc_mode == AT_SSR)
return UINT_MAX;
/* LFS */
if (p->gc_mode == GC_GREEDY)
- return 2 * sbi->blocks_per_seg * p->ofs_unit;
+ return SEGS_TO_BLKS(sbi, 2 * p->ofs_unit);
else if (p->gc_mode == GC_CB)
return UINT_MAX;
else if (p->gc_mode == GC_AT)
@@ -348,7 +348,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
mtime = div_u64(mtime, usable_segs_per_sec);
vblocks = div_u64(vblocks, usable_segs_per_sec);
- u = (vblocks * 100) >> sbi->log_blocks_per_seg;
+ u = BLKS_TO_SEGS(sbi, vblocks * 100);
/* Handle if the system time has changed by the user */
if (mtime < sit_i->min_mtime)
@@ -496,9 +496,9 @@ static void add_victim_entry(struct f2fs_sb_info *sbi,
return;
}
- for (i = 0; i < sbi->segs_per_sec; i++)
+ for (i = 0; i < SEGS_PER_SEC(sbi); i++)
mtime += get_seg_entry(sbi, start + i)->mtime;
- mtime = div_u64(mtime, sbi->segs_per_sec);
+ mtime = div_u64(mtime, SEGS_PER_SEC(sbi));
/* Handle if the system time has changed by the user */
if (mtime < sit_i->min_mtime)
@@ -599,7 +599,6 @@ static void atssr_lookup_victim(struct f2fs_sb_info *sbi,
unsigned long long age;
unsigned long long max_mtime = sit_i->dirty_max_mtime;
unsigned long long min_mtime = sit_i->dirty_min_mtime;
- unsigned int seg_blocks = sbi->blocks_per_seg;
unsigned int vblocks;
unsigned int dirty_threshold = max(am->max_candidate_count,
am->candidate_ratio *
@@ -629,7 +628,7 @@ next_node:
f2fs_bug_on(sbi, !vblocks);
/* rare case */
- if (vblocks == seg_blocks)
+ if (vblocks == BLKS_PER_SEG(sbi))
goto skip_node;
iter++;
@@ -755,7 +754,7 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result,
int ret = 0;
mutex_lock(&dirty_i->seglist_lock);
- last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec;
+ last_segment = MAIN_SECS(sbi) * SEGS_PER_SEC(sbi);
p.alloc_mode = alloc_mode;
p.age = age;
@@ -896,7 +895,7 @@ next:
else
sm->last_victim[p.gc_mode] = segno + p.ofs_unit;
sm->last_victim[p.gc_mode] %=
- (MAIN_SECS(sbi) * sbi->segs_per_sec);
+ (MAIN_SECS(sbi) * SEGS_PER_SEC(sbi));
break;
}
}
@@ -1184,7 +1183,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
.op_flags = 0,
.encrypted_page = NULL,
.in_list = 0,
- .retry = 0,
};
int err;
@@ -1197,7 +1195,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ))) {
err = -EFSCORRUPTED;
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto put_page;
}
goto got_it;
@@ -1216,7 +1213,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
DATA_GENERIC_ENHANCE))) {
err = -EFSCORRUPTED;
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto put_page;
}
got_it:
@@ -1273,7 +1269,6 @@ static int move_data_block(struct inode *inode, block_t bidx,
.op_flags = 0,
.encrypted_page = NULL,
.in_list = 0,
- .retry = 0,
};
struct dnode_of_data dn;
struct f2fs_summary sum;
@@ -1364,8 +1359,13 @@ static int move_data_block(struct inode *inode, block_t bidx,
set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
/* allocate block address */
- f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
+ err = f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
&sum, type, NULL);
+ if (err) {
+ f2fs_put_page(mpage, 1);
+ /* filesystem should shutdown, no need to recovery block */
+ goto up_out;
+ }
fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
@@ -1393,18 +1393,12 @@ static int move_data_block(struct inode *inode, block_t bidx,
fio.op_flags = REQ_SYNC;
fio.new_blkaddr = newaddr;
f2fs_submit_page_write(&fio);
- if (fio.retry) {
- err = -EAGAIN;
- if (PageWriteback(fio.encrypted_page))
- end_page_writeback(fio.encrypted_page);
- goto put_page_out;
- }
f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE);
f2fs_update_data_blkaddr(&dn, newaddr);
set_inode_flag(inode, FI_APPEND_WRITE);
-put_page_out:
+
f2fs_put_page(fio.encrypted_page, 1);
recover_block:
if (err)
@@ -1678,7 +1672,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
struct f2fs_summary_block *sum;
struct blk_plug plug;
unsigned int segno = start_segno;
- unsigned int end_segno = start_segno + sbi->segs_per_sec;
+ unsigned int end_segno = start_segno + SEGS_PER_SEC(sbi);
int seg_freed = 0, migrated = 0;
unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
SUM_TYPE_DATA : SUM_TYPE_NODE;
@@ -1686,7 +1680,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
int submitted = 0;
if (__is_large_section(sbi))
- end_segno = rounddown(end_segno, sbi->segs_per_sec);
+ end_segno = rounddown(end_segno, SEGS_PER_SEC(sbi));
/*
* zone-capacity can be less than zone-size in zoned devices,
@@ -1694,7 +1688,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
* calculate the end segno in the zone which can be garbage collected
*/
if (f2fs_sb_has_blkzoned(sbi))
- end_segno -= sbi->segs_per_sec -
+ end_segno -= SEGS_PER_SEC(sbi) -
f2fs_usable_segs_in_sec(sbi, segno);
sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type);
@@ -1983,10 +1977,43 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
init_atgc_management(sbi);
}
+int f2fs_gc_range(struct f2fs_sb_info *sbi,
+ unsigned int start_seg, unsigned int end_seg,
+ bool dry_run, unsigned int dry_run_sections)
+{
+ unsigned int segno;
+ unsigned int gc_secs = dry_run_sections;
+
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+
+ for (segno = start_seg; segno <= end_seg; segno += SEGS_PER_SEC(sbi)) {
+ struct gc_inode_list gc_list = {
+ .ilist = LIST_HEAD_INIT(gc_list.ilist),
+ .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
+ };
+
+ do_garbage_collect(sbi, segno, &gc_list, FG_GC,
+ dry_run_sections == 0);
+ put_gc_inode(&gc_list);
+
+ if (!dry_run && get_valid_blocks(sbi, segno, true))
+ return -EAGAIN;
+ if (dry_run && dry_run_sections &&
+ !get_valid_blocks(sbi, segno, true) && --gc_secs == 0)
+ break;
+
+ if (fatal_signal_pending(current))
+ return -ERESTARTSYS;
+ }
+
+ return 0;
+}
+
static int free_segment_range(struct f2fs_sb_info *sbi,
- unsigned int secs, bool gc_only)
+ unsigned int secs, bool dry_run)
{
- unsigned int segno, next_inuse, start, end;
+ unsigned int next_inuse, start, end;
struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
int gc_mode, gc_type;
int err = 0;
@@ -1994,7 +2021,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi,
/* Force block allocation for GC */
MAIN_SECS(sbi) -= secs;
- start = MAIN_SECS(sbi) * sbi->segs_per_sec;
+ start = MAIN_SECS(sbi) * SEGS_PER_SEC(sbi);
end = MAIN_SEGS(sbi) - 1;
mutex_lock(&DIRTY_I(sbi)->seglist_lock);
@@ -2008,29 +2035,15 @@ static int free_segment_range(struct f2fs_sb_info *sbi,
mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
/* Move out cursegs from the target range */
- for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++)
- f2fs_allocate_segment_for_resize(sbi, type, start, end);
-
- /* do GC to move out valid blocks in the range */
- for (segno = start; segno <= end; segno += sbi->segs_per_sec) {
- struct gc_inode_list gc_list = {
- .ilist = LIST_HEAD_INIT(gc_list.ilist),
- .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
- };
-
- do_garbage_collect(sbi, segno, &gc_list, FG_GC, true);
- put_gc_inode(&gc_list);
-
- if (!gc_only && get_valid_blocks(sbi, segno, true)) {
- err = -EAGAIN;
- goto out;
- }
- if (fatal_signal_pending(current)) {
- err = -ERESTARTSYS;
+ for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++) {
+ err = f2fs_allocate_segment_for_resize(sbi, type, start, end);
+ if (err)
goto out;
- }
}
- if (gc_only)
+
+ /* do GC to move out valid blocks in the range */
+ err = f2fs_gc_range(sbi, start, end, dry_run, 0);
+ if (err || dry_run)
goto out;
stat_inc_cp_call_count(sbi, TOTAL_CALL);
@@ -2056,7 +2069,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
int segment_count;
int segment_count_main;
long long block_count;
- int segs = secs * sbi->segs_per_sec;
+ int segs = secs * SEGS_PER_SEC(sbi);
f2fs_down_write(&sbi->sb_lock);
@@ -2069,7 +2082,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
raw_sb->segment_count = cpu_to_le32(segment_count + segs);
raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
raw_sb->block_count = cpu_to_le64(block_count +
- (long long)segs * sbi->blocks_per_seg);
+ (long long)SEGS_TO_BLKS(sbi, segs));
if (f2fs_is_multi_device(sbi)) {
int last_dev = sbi->s_ndevs - 1;
int dev_segs =
@@ -2084,8 +2097,8 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
{
- int segs = secs * sbi->segs_per_sec;
- long long blks = (long long)segs * sbi->blocks_per_seg;
+ int segs = secs * SEGS_PER_SEC(sbi);
+ long long blks = SEGS_TO_BLKS(sbi, segs);
long long user_block_count =
le64_to_cpu(F2FS_CKPT(sbi)->user_block_count);
@@ -2127,7 +2140,7 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count)
int last_dev = sbi->s_ndevs - 1;
__u64 last_segs = FDEV(last_dev).total_segments;
- if (block_count + last_segs * sbi->blocks_per_seg <=
+ if (block_count + SEGS_TO_BLKS(sbi, last_segs) <=
old_block_count)
return -EINVAL;
}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 28a00942802c..9c0d06c4d19a 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -96,7 +96,7 @@ static inline block_t free_segs_blk_count(struct f2fs_sb_info *sbi)
if (f2fs_sb_has_blkzoned(sbi))
return free_segs_blk_count_zoned(sbi);
- return free_segments(sbi) << sbi->log_blocks_per_seg;
+ return SEGS_TO_BLKS(sbi, free_segments(sbi));
}
static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
@@ -104,7 +104,7 @@ static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
block_t free_blks, ovp_blks;
free_blks = free_segs_blk_count(sbi);
- ovp_blks = overprovision_segments(sbi) << sbi->log_blocks_per_seg;
+ ovp_blks = SEGS_TO_BLKS(sbi, overprovision_segments(sbi));
if (free_blks < ovp_blks)
return 0;
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index b3bb815fc6aa..e54f8c08bda8 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -531,7 +531,6 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
}
err = f2fs_prepare_lookup(dir, dentry, &fname);
- generic_set_encrypted_ci_d_ops(dentry);
if (err == -ENOENT)
goto out_splice;
if (err)
@@ -852,7 +851,7 @@ out:
static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct file *file, umode_t mode, bool is_whiteout,
- struct inode **new_inode)
+ struct inode **new_inode, struct f2fs_filename *fname)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
@@ -880,7 +879,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
if (err)
goto out;
- err = f2fs_do_tmpfile(inode, dir);
+ err = f2fs_do_tmpfile(inode, dir, fname);
if (err)
goto release_out;
@@ -931,22 +930,24 @@ static int f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- err = __f2fs_tmpfile(idmap, dir, file, mode, false, NULL);
+ err = __f2fs_tmpfile(idmap, dir, file, mode, false, NULL, NULL);
return finish_open_simple(file, err);
}
static int f2fs_create_whiteout(struct mnt_idmap *idmap,
- struct inode *dir, struct inode **whiteout)
+ struct inode *dir, struct inode **whiteout,
+ struct f2fs_filename *fname)
{
- return __f2fs_tmpfile(idmap, dir, NULL,
- S_IFCHR | WHITEOUT_MODE, true, whiteout);
+ return __f2fs_tmpfile(idmap, dir, NULL, S_IFCHR | WHITEOUT_MODE,
+ true, whiteout, fname);
}
int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct inode **new_inode)
{
- return __f2fs_tmpfile(idmap, dir, NULL, S_IFREG, false, new_inode);
+ return __f2fs_tmpfile(idmap, dir, NULL, S_IFREG,
+ false, new_inode, NULL);
}
static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
@@ -990,7 +991,14 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
}
if (flags & RENAME_WHITEOUT) {
- err = f2fs_create_whiteout(idmap, old_dir, &whiteout);
+ struct f2fs_filename fname;
+
+ err = f2fs_setup_filename(old_dir, &old_dentry->d_name,
+ 0, &fname);
+ if (err)
+ return err;
+
+ err = f2fs_create_whiteout(idmap, old_dir, &whiteout, &fname);
if (err)
return err;
}
@@ -1105,14 +1113,11 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
iput(whiteout);
}
- if (old_is_dir) {
- if (old_dir_entry)
- f2fs_set_link(old_inode, old_dir_entry,
- old_dir_page, new_dir);
- else
- f2fs_put_page(old_dir_page, 0);
+ if (old_dir_entry)
+ f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir);
+ if (old_is_dir)
f2fs_i_links_write(old_dir, false);
- }
+
if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) {
f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO);
if (S_ISDIR(old_inode->i_mode))
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 9b546fd21010..b3de6d6cdb02 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -852,21 +852,29 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) &&
f2fs_sb_has_readonly(sbi)) {
- unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn);
+ unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
+ unsigned int ofs_in_node = dn->ofs_in_node;
+ pgoff_t fofs = index;
+ unsigned int c_len;
block_t blkaddr;
+ /* should align fofs and ofs_in_node to cluster_size */
+ if (fofs % cluster_size) {
+ fofs = round_down(fofs, cluster_size);
+ ofs_in_node = round_down(ofs_in_node, cluster_size);
+ }
+
+ c_len = f2fs_cluster_blocks_are_contiguous(dn, ofs_in_node);
if (!c_len)
goto out;
- blkaddr = f2fs_data_blkaddr(dn);
+ blkaddr = data_blkaddr(dn->inode, dn->node_page, ofs_in_node);
if (blkaddr == COMPRESS_ADDR)
blkaddr = data_blkaddr(dn->inode, dn->node_page,
- dn->ofs_in_node + 1);
+ ofs_in_node + 1);
f2fs_update_read_extent_tree_range_compressed(dn->inode,
- index, blkaddr,
- F2FS_I(dn->inode)->i_cluster_size,
- c_len);
+ fofs, blkaddr, cluster_size, c_len);
}
out:
return 0;
@@ -1919,7 +1927,7 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
for (i = 0; i < nr_folios; i++) {
struct page *page = &fbatch.folios[i]->page;
- if (!IS_DNODE(page))
+ if (!IS_INODE(page))
continue;
lock_page(page);
@@ -2841,7 +2849,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
int i, idx, last_offset, nrpages;
/* scan the node segment */
- last_offset = sbi->blocks_per_seg;
+ last_offset = BLKS_PER_SEG(sbi);
addr = START_BLOCK(sbi, segno);
sum_entry = &sum->entries[0];
@@ -3158,7 +3166,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
return 0;
- nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
+ nat_bits_addr = __start_cp_addr(sbi) + BLKS_PER_SEG(sbi) -
nm_i->nat_bits_blocks;
for (i = 0; i < nm_i->nat_bits_blocks; i++) {
struct page *page;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 5bd16a95eef8..6aea13024ac1 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -208,10 +208,10 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
block_addr = (pgoff_t)(nm_i->nat_blkaddr +
(block_off << 1) -
- (block_off & (sbi->blocks_per_seg - 1)));
+ (block_off & (BLKS_PER_SEG(sbi) - 1)));
if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
- block_addr += sbi->blocks_per_seg;
+ block_addr += BLKS_PER_SEG(sbi);
return block_addr;
}
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index d0f24ccbd1ac..e7bf15b8240a 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -354,7 +354,7 @@ static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi,
if (blkaddr + 1 == next_blkaddr)
ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS,
ra_blocks * 2);
- else if (next_blkaddr % sbi->blocks_per_seg)
+ else if (next_blkaddr % BLKS_PER_SEG(sbi))
ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS,
ra_blocks / 2);
return ra_blocks;
@@ -611,6 +611,19 @@ truncate_out:
return 0;
}
+static int f2fs_reserve_new_block_retry(struct dnode_of_data *dn)
+{
+ int i, err = 0;
+
+ for (i = DEFAULT_FAILURE_RETRY_COUNT; i > 0; i--) {
+ err = f2fs_reserve_new_block(dn);
+ if (!err)
+ break;
+ }
+
+ return err;
+}
+
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct page *page)
{
@@ -680,14 +693,12 @@ retry_dn:
if (__is_valid_data_blkaddr(src) &&
!f2fs_is_valid_blkaddr(sbi, src, META_POR)) {
err = -EFSCORRUPTED;
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto err;
}
if (__is_valid_data_blkaddr(dest) &&
!f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
err = -EFSCORRUPTED;
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto err;
}
@@ -712,14 +723,8 @@ retry_dn:
*/
if (dest == NEW_ADDR) {
f2fs_truncate_data_blocks_range(&dn, 1);
- do {
- err = f2fs_reserve_new_block(&dn);
- if (err == -ENOSPC) {
- f2fs_bug_on(sbi, 1);
- break;
- }
- } while (err &&
- IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
+
+ err = f2fs_reserve_new_block_retry(&dn);
if (err)
goto err;
continue;
@@ -727,16 +732,8 @@ retry_dn:
/* dest is valid block, try to recover from src to dest */
if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
-
if (src == NULL_ADDR) {
- do {
- err = f2fs_reserve_new_block(&dn);
- if (err == -ENOSPC) {
- f2fs_bug_on(sbi, 1);
- break;
- }
- } while (err &&
- IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
+ err = f2fs_reserve_new_block_retry(&dn);
if (err)
goto err;
}
@@ -756,8 +753,6 @@ retry_prev:
f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u",
dest, inode->i_ino, dn.ofs_in_node);
err = -EFSCORRUPTED;
- f2fs_handle_error(sbi,
- ERROR_INVALID_BLKADDR);
goto err;
}
@@ -852,7 +847,7 @@ next:
f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
}
if (!err)
- f2fs_allocate_new_segments(sbi);
+ err = f2fs_allocate_new_segments(sbi);
return err;
}
@@ -864,7 +859,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
int ret = 0;
unsigned long s_flags = sbi->sb->s_flags;
bool need_writecp = false;
- bool fix_curseg_write_pointer = false;
if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE))
f2fs_info(sbi, "recover fsync data on readonly fs");
@@ -895,8 +889,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
else
f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE);
skip:
- fix_curseg_write_pointer = !check_only || list_empty(&inode_list);
-
destroy_fsync_dnodes(&inode_list, err);
destroy_fsync_dnodes(&tmp_inode_list, err);
@@ -914,11 +906,13 @@ skip:
* and the f2fs is not read only, check and fix zoned block devices'
* write pointer consistency.
*/
- if (!err && fix_curseg_write_pointer && !f2fs_readonly(sbi->sb) &&
- f2fs_sb_has_blkzoned(sbi)) {
- err = f2fs_fix_curseg_write_pointer(sbi);
- if (!err)
- err = f2fs_check_write_pointer(sbi);
+ if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sbi->sb)) {
+ int err2 = f2fs_fix_curseg_write_pointer(sbi);
+
+ if (!err2)
+ err2 = f2fs_check_write_pointer(sbi);
+ if (err2)
+ err = err2;
ret = err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 4c8836ded90f..4fd76e867e0a 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -192,6 +192,9 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean)
if (!f2fs_is_atomic_file(inode))
return;
+ if (clean)
+ truncate_inode_pages_final(inode->i_mapping);
+
release_atomic_write_cnt(inode);
clear_inode_flag(inode, FI_ATOMIC_COMMITTED);
clear_inode_flag(inode, FI_ATOMIC_REPLACE);
@@ -201,7 +204,6 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean)
F2FS_I(inode)->atomic_write_task = NULL;
if (clean) {
- truncate_inode_pages_final(inode->i_mapping);
f2fs_i_size_write(inode, fi->original_i_size);
fi->original_i_size = 0;
}
@@ -248,7 +250,7 @@ retry:
} else {
blkcnt_t count = 1;
- err = inc_valid_block_count(sbi, inode, &count);
+ err = inc_valid_block_count(sbi, inode, &count, true);
if (err) {
f2fs_put_dnode(&dn);
return err;
@@ -334,8 +336,6 @@ static int __f2fs_commit_atomic_write(struct inode *inode)
DATA_GENERIC_ENHANCE)) {
f2fs_put_dnode(&dn);
ret = -EFSCORRUPTED;
- f2fs_handle_error(sbi,
- ERROR_INVALID_BLKADDR);
goto out;
}
@@ -400,6 +400,9 @@ int f2fs_commit_atomic_write(struct inode *inode)
*/
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
+ if (f2fs_cp_error(sbi))
+ return;
+
if (time_to_inject(sbi, FAULT_CHECKPOINT))
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT);
@@ -448,8 +451,8 @@ static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi)
unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES);
unsigned int meta = get_pages(sbi, F2FS_DIRTY_META);
unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA);
- unsigned int threshold = sbi->blocks_per_seg * factor *
- DEFAULT_DIRTY_THRESHOLD;
+ unsigned int threshold =
+ SEGS_TO_BLKS(sbi, (factor * DEFAULT_DIRTY_THRESHOLD));
unsigned int global_threshold = threshold * 3 / 2;
if (dents >= threshold || qdata >= threshold ||
@@ -872,7 +875,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi)
{
int ovp_hole_segs =
(overprovision_segments(sbi) - reserved_segments(sbi));
- block_t ovp_holes = ovp_hole_segs << sbi->log_blocks_per_seg;
+ block_t ovp_holes = SEGS_TO_BLKS(sbi, ovp_hole_segs);
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
block_t holes[2] = {0, 0}; /* DATA and NODE */
block_t unusable;
@@ -901,11 +904,16 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable)
{
int ovp_hole_segs =
(overprovision_segments(sbi) - reserved_segments(sbi));
+
+ if (F2FS_OPTION(sbi).unusable_cap_perc == 100)
+ return 0;
if (unusable > F2FS_OPTION(sbi).unusable_cap)
return -EAGAIN;
if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) &&
dirty_segments(sbi) > ovp_hole_segs)
return -EAGAIN;
+ if (has_not_enough_free_secs(sbi, 0, 0))
+ return -EAGAIN;
return 0;
}
@@ -1132,8 +1140,7 @@ static void __check_sit_bitmap(struct f2fs_sb_info *sbi,
struct seg_entry *sentry;
unsigned int segno;
block_t blk = start;
- unsigned long offset, size, max_blocks = sbi->blocks_per_seg;
- unsigned long *map;
+ unsigned long offset, size, *map;
while (blk < end) {
segno = GET_SEGNO(sbi, blk);
@@ -1143,7 +1150,7 @@ static void __check_sit_bitmap(struct f2fs_sb_info *sbi,
if (end < START_BLOCK(sbi, segno + 1))
size = GET_BLKOFF_FROM_SEG0(sbi, end);
else
- size = max_blocks;
+ size = BLKS_PER_SEG(sbi);
map = (unsigned long *)(sentry->cur_valid_map);
offset = __find_rev_next_bit(map, size, offset);
f2fs_bug_on(sbi, offset != size);
@@ -1971,9 +1978,15 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
}
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) {
+ unsigned int nofs_flags;
+ int ret;
+
trace_f2fs_issue_reset_zone(bdev, blkstart);
- return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- sector, nr_sects, GFP_NOFS);
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ sector, nr_sects);
+ memalloc_nofs_restore(nofs_flags);
+ return ret;
}
__queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen);
@@ -2042,7 +2055,6 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
bool check_only)
{
int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
- int max_blocks = sbi->blocks_per_seg;
struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
@@ -2054,8 +2066,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
struct list_head *head = &SM_I(sbi)->dcc_info->entry_list;
int i;
- if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi) ||
- !f2fs_block_unit_discard(sbi))
+ if (se->valid_blocks == BLKS_PER_SEG(sbi) ||
+ !f2fs_hw_support_discard(sbi) ||
+ !f2fs_block_unit_discard(sbi))
return false;
if (!force) {
@@ -2072,13 +2085,14 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
while (force || SM_I(sbi)->dcc_info->nr_discards <=
SM_I(sbi)->dcc_info->max_discards) {
- start = __find_rev_next_bit(dmap, max_blocks, end + 1);
- if (start >= max_blocks)
+ start = __find_rev_next_bit(dmap, BLKS_PER_SEG(sbi), end + 1);
+ if (start >= BLKS_PER_SEG(sbi))
break;
- end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
- if (force && start && end != max_blocks
- && (end - start) < cpc->trim_minlen)
+ end = __find_rev_next_zero_bit(dmap,
+ BLKS_PER_SEG(sbi), start + 1);
+ if (force && start && end != BLKS_PER_SEG(sbi) &&
+ (end - start) < cpc->trim_minlen)
continue;
if (check_only)
@@ -2160,8 +2174,8 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
start + 1);
if (section_alignment) {
- start = rounddown(start, sbi->segs_per_sec);
- end = roundup(end, sbi->segs_per_sec);
+ start = rounddown(start, SEGS_PER_SEC(sbi));
+ end = roundup(end, SEGS_PER_SEC(sbi));
}
for (i = start; i < end; i++) {
@@ -2180,7 +2194,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
if (!f2fs_sb_has_blkzoned(sbi) &&
(!f2fs_lfs_mode(sbi) || !__is_large_section(sbi))) {
f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
- (end - start) << sbi->log_blocks_per_seg);
+ SEGS_TO_BLKS(sbi, end - start));
continue;
}
next:
@@ -2189,9 +2203,9 @@ next:
if (!IS_CURSEC(sbi, secno) &&
!get_valid_blocks(sbi, start, true))
f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno),
- sbi->segs_per_sec << sbi->log_blocks_per_seg);
+ BLKS_PER_SEC(sbi));
- start = start_segno + sbi->segs_per_sec;
+ start = start_segno + SEGS_PER_SEC(sbi);
if (start < end)
goto next;
else
@@ -2210,7 +2224,7 @@ next:
find_next:
if (is_valid) {
next_pos = find_next_zero_bit_le(entry->discard_map,
- sbi->blocks_per_seg, cur_pos);
+ BLKS_PER_SEG(sbi), cur_pos);
len = next_pos - cur_pos;
if (f2fs_sb_has_blkzoned(sbi) ||
@@ -2222,13 +2236,13 @@ find_next:
total_len += len;
} else {
next_pos = find_next_bit_le(entry->discard_map,
- sbi->blocks_per_seg, cur_pos);
+ BLKS_PER_SEG(sbi), cur_pos);
}
skip:
cur_pos = next_pos;
is_valid = !is_valid;
- if (cur_pos < sbi->blocks_per_seg)
+ if (cur_pos < BLKS_PER_SEG(sbi))
goto find_next;
release_discard_addr(entry);
@@ -2245,6 +2259,12 @@ int f2fs_start_discard_thread(struct f2fs_sb_info *sbi)
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
int err = 0;
+ if (f2fs_sb_has_readonly(sbi)) {
+ f2fs_info(sbi,
+ "Skip to start discard thread for readonly image");
+ return 0;
+ }
+
if (!f2fs_realtime_discard_enable(sbi))
return 0;
@@ -2277,7 +2297,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY;
dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE;
if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
- dcc->discard_granularity = sbi->blocks_per_seg;
+ dcc->discard_granularity = BLKS_PER_SEG(sbi);
else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
dcc->discard_granularity = BLKS_PER_SEC(sbi);
@@ -2291,7 +2311,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
atomic_set(&dcc->queued_discard, 0);
atomic_set(&dcc->discard_cmd_cnt, 0);
dcc->nr_discards = 0;
- dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
+ dcc->max_discards = SEGS_TO_BLKS(sbi, MAIN_SEGS(sbi));
dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST;
dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME;
dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME;
@@ -2399,6 +2419,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
#endif
segno = GET_SEGNO(sbi, blkaddr);
+ if (segno == NULL_SEGNO)
+ return;
se = get_seg_entry(sbi, segno);
new_vblocks = se->valid_blocks + del;
@@ -2540,7 +2562,7 @@ static unsigned short f2fs_curseg_valid_blocks(struct f2fs_sb_info *sbi, int typ
struct curseg_info *curseg = CURSEG_I(sbi, type);
if (sbi->ckpt->alloc_type[type] == SSR)
- return sbi->blocks_per_seg;
+ return BLKS_PER_SEG(sbi);
return curseg->next_blkoff;
}
@@ -2628,7 +2650,7 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi,
unsigned int segno = curseg->segno + 1;
struct free_segmap_info *free_i = FREE_I(sbi);
- if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
+ if (segno < MAIN_SEGS(sbi) && segno % SEGS_PER_SEC(sbi))
return !test_bit(segno, free_i->free_segmap);
return 0;
}
@@ -2637,54 +2659,51 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi,
* Find a new segment from the free segments bitmap to right order
* This function should be returned with success, otherwise BUG
*/
-static void get_new_segment(struct f2fs_sb_info *sbi,
- unsigned int *newseg, bool new_sec, int dir)
+static int get_new_segment(struct f2fs_sb_info *sbi,
+ unsigned int *newseg, bool new_sec, bool pinning)
{
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int segno, secno, zoneno;
unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg);
unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg);
- unsigned int left_start = hint;
bool init = true;
- int go_left = 0;
int i;
+ int ret = 0;
spin_lock(&free_i->segmap_lock);
- if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
+ if (time_to_inject(sbi, FAULT_NO_SEGMENT)) {
+ ret = -ENOSPC;
+ goto out_unlock;
+ }
+
+ if (!new_sec && ((*newseg + 1) % SEGS_PER_SEC(sbi))) {
segno = find_next_zero_bit(free_i->free_segmap,
GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1);
if (segno < GET_SEG_FROM_SEC(sbi, hint + 1))
goto got_it;
}
+
+ /*
+ * If we format f2fs on zoned storage, let's try to get pinned sections
+ * from beginning of the storage, which should be a conventional one.
+ */
+ if (f2fs_sb_has_blkzoned(sbi)) {
+ segno = pinning ? 0 : max(first_zoned_segno(sbi), *newseg);
+ hint = GET_SEC_FROM_SEG(sbi, segno);
+ }
+
find_other_zone:
secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
if (secno >= MAIN_SECS(sbi)) {
- if (dir == ALLOC_RIGHT) {
- secno = find_first_zero_bit(free_i->free_secmap,
+ secno = find_first_zero_bit(free_i->free_secmap,
MAIN_SECS(sbi));
- f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi));
- } else {
- go_left = 1;
- left_start = hint - 1;
- }
- }
- if (go_left == 0)
- goto skip_left;
-
- while (test_bit(left_start, free_i->free_secmap)) {
- if (left_start > 0) {
- left_start--;
- continue;
+ if (secno >= MAIN_SECS(sbi)) {
+ ret = -ENOSPC;
+ goto out_unlock;
}
- left_start = find_first_zero_bit(free_i->free_secmap,
- MAIN_SECS(sbi));
- f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi));
- break;
}
- secno = left_start;
-skip_left:
segno = GET_SEG_FROM_SEC(sbi, secno);
zoneno = GET_ZONE_FROM_SEC(sbi, secno);
@@ -2695,21 +2714,13 @@ skip_left:
goto got_it;
if (zoneno == old_zoneno)
goto got_it;
- if (dir == ALLOC_LEFT) {
- if (!go_left && zoneno + 1 >= total_zones)
- goto got_it;
- if (go_left && zoneno == 0)
- goto got_it;
- }
for (i = 0; i < NR_CURSEG_TYPE; i++)
if (CURSEG_I(sbi, i)->zone == zoneno)
break;
if (i < NR_CURSEG_TYPE) {
/* zone is in user, try another */
- if (go_left)
- hint = zoneno * sbi->secs_per_zone - 1;
- else if (zoneno + 1 >= total_zones)
+ if (zoneno + 1 >= total_zones)
hint = 0;
else
hint = (zoneno + 1) * sbi->secs_per_zone;
@@ -2719,9 +2730,23 @@ skip_left:
got_it:
/* set it as dirty segment in free segmap */
f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
+
+ /* no free section in conventional zone */
+ if (new_sec && pinning &&
+ !f2fs_valid_pinned_area(sbi, START_BLOCK(sbi, segno))) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
__set_inuse(sbi, segno);
*newseg = segno;
+out_unlock:
spin_unlock(&free_i->segmap_lock);
+
+ if (ret == -ENOSPC) {
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT);
+ f2fs_bug_on(sbi, 1);
+ }
+ return ret;
}
static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
@@ -2730,6 +2755,10 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
struct summary_footer *sum_footer;
unsigned short seg_type = curseg->seg_type;
+ /* only happen when get_new_segment() fails */
+ if (curseg->next_segno == NULL_SEGNO)
+ return;
+
curseg->inited = true;
curseg->segno = curseg->next_segno;
curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno);
@@ -2755,9 +2784,8 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
sanity_check_seg_type(sbi, seg_type);
if (f2fs_need_rand_seg(sbi))
- return get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec);
+ return get_random_u32_below(MAIN_SECS(sbi) * SEGS_PER_SEC(sbi));
- /* if segs_per_sec is large than 1, we need to keep original policy. */
if (__is_large_section(sbi))
return curseg->segno;
@@ -2768,8 +2796,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
return 0;
- if (test_opt(sbi, NOHEAP) &&
- (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type)))
+ if (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type))
return 0;
if (SIT_I(sbi)->last_victim[ALLOC_NEXT])
@@ -2786,30 +2813,31 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
* Allocate a current working segment.
* This function always allocates a free segment in LFS manner.
*/
-static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
+static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
- unsigned short seg_type = curseg->seg_type;
unsigned int segno = curseg->segno;
- int dir = ALLOC_LEFT;
+ bool pinning = type == CURSEG_COLD_DATA_PINNED;
+ int ret;
if (curseg->inited)
- write_sum_page(sbi, curseg->sum_blk,
- GET_SUM_BLOCK(sbi, segno));
- if (seg_type == CURSEG_WARM_DATA || seg_type == CURSEG_COLD_DATA)
- dir = ALLOC_RIGHT;
-
- if (test_opt(sbi, NOHEAP))
- dir = ALLOC_RIGHT;
+ write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, segno));
segno = __get_next_segno(sbi, type);
- get_new_segment(sbi, &segno, new_sec, dir);
+ ret = get_new_segment(sbi, &segno, new_sec, pinning);
+ if (ret) {
+ if (ret == -ENOSPC)
+ curseg->segno = NULL_SEGNO;
+ return ret;
+ }
+
curseg->next_segno = segno;
reset_curseg(sbi, type, 1);
curseg->alloc_type = LFS;
if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
curseg->fragment_remained_chunk =
get_random_u32_inclusive(1, sbi->max_fragment_chunk);
+ return 0;
}
static int __next_free_blkoff(struct f2fs_sb_info *sbi,
@@ -2825,7 +2853,7 @@ static int __next_free_blkoff(struct f2fs_sb_info *sbi,
for (i = 0; i < entries; i++)
target_map[i] = ckpt_map[i] | cur_map[i];
- return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
+ return __find_rev_next_zero_bit(target_map, BLKS_PER_SEG(sbi), start);
}
static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi,
@@ -2836,14 +2864,14 @@ static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi,
bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno)
{
- return __next_free_blkoff(sbi, segno, 0) < sbi->blocks_per_seg;
+ return __next_free_blkoff(sbi, segno, 0) < BLKS_PER_SEG(sbi);
}
/*
* This function always allocates a used segment(from dirty seglist) by SSR
* manner, so it should recover the existing segment information of valid blocks
*/
-static void change_curseg(struct f2fs_sb_info *sbi, int type)
+static int change_curseg(struct f2fs_sb_info *sbi, int type)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -2868,21 +2896,23 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
if (IS_ERR(sum_page)) {
/* GC won't be able to use stale summary pages by cp_error */
memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE);
- return;
+ return PTR_ERR(sum_page);
}
sum_node = (struct f2fs_summary_block *)page_address(sum_page);
memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
f2fs_put_page(sum_page, 1);
+ return 0;
}
static int get_ssr_segment(struct f2fs_sb_info *sbi, int type,
int alloc_mode, unsigned long long age);
-static void get_atssr_segment(struct f2fs_sb_info *sbi, int type,
+static int get_atssr_segment(struct f2fs_sb_info *sbi, int type,
int target_type, int alloc_mode,
unsigned long long age)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
+ int ret = 0;
curseg->seg_type = target_type;
@@ -2890,38 +2920,41 @@ static void get_atssr_segment(struct f2fs_sb_info *sbi, int type,
struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno);
curseg->seg_type = se->type;
- change_curseg(sbi, type);
+ ret = change_curseg(sbi, type);
} else {
/* allocate cold segment by default */
curseg->seg_type = CURSEG_COLD_DATA;
- new_curseg(sbi, type, true);
+ ret = new_curseg(sbi, type, true);
}
stat_inc_seg_type(sbi, curseg);
+ return ret;
}
-static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
+static int __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC);
+ int ret = 0;
if (!sbi->am.atgc_enabled)
- return;
+ return 0;
f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&SIT_I(sbi)->sentry_lock);
- get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0);
+ ret = get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC,
+ CURSEG_COLD_DATA, SSR, 0);
up_write(&SIT_I(sbi)->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
f2fs_up_read(&SM_I(sbi)->curseg_lock);
-
+ return ret;
}
-void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi)
+int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi)
{
- __f2fs_init_atgc_curseg(sbi);
+ return __f2fs_init_atgc_curseg(sbi);
}
static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type)
@@ -3049,11 +3082,12 @@ static bool need_new_seg(struct f2fs_sb_info *sbi, int type)
return false;
}
-void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
+int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
unsigned int start, unsigned int end)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int segno;
+ int ret = 0;
f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
@@ -3064,9 +3098,9 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
goto unlock;
if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0))
- change_curseg(sbi, type);
+ ret = change_curseg(sbi, type);
else
- new_curseg(sbi, type, true);
+ ret = new_curseg(sbi, type, true);
stat_inc_seg_type(sbi, curseg);
@@ -3080,45 +3114,84 @@ unlock:
mutex_unlock(&curseg->curseg_mutex);
f2fs_up_read(&SM_I(sbi)->curseg_lock);
+ return ret;
}
-static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
+static int __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
bool new_sec, bool force)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int old_segno;
+ int err = 0;
+
+ if (type == CURSEG_COLD_DATA_PINNED && !curseg->inited)
+ goto allocate;
if (!force && curseg->inited &&
!curseg->next_blkoff &&
!get_valid_blocks(sbi, curseg->segno, new_sec) &&
!get_ckpt_valid_blocks(sbi, curseg->segno, new_sec))
- return;
+ return 0;
+allocate:
old_segno = curseg->segno;
- new_curseg(sbi, type, true);
+ err = new_curseg(sbi, type, true);
+ if (err)
+ return err;
stat_inc_seg_type(sbi, curseg);
locate_dirty_segment(sbi, old_segno);
+ return 0;
}
-void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
+int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
{
+ int ret;
+
f2fs_down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
- __allocate_new_segment(sbi, type, true, force);
+ ret = __allocate_new_segment(sbi, type, true, force);
up_write(&SIT_I(sbi)->sentry_lock);
f2fs_up_read(&SM_I(sbi)->curseg_lock);
+
+ return ret;
+}
+
+int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi)
+{
+ int err;
+ bool gc_required = true;
+
+retry:
+ f2fs_lock_op(sbi);
+ err = f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
+ f2fs_unlock_op(sbi);
+
+ if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) {
+ f2fs_down_write(&sbi->gc_lock);
+ err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk), true, 1);
+ f2fs_up_write(&sbi->gc_lock);
+
+ gc_required = false;
+ if (!err)
+ goto retry;
+ }
+
+ return err;
}
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
+int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
{
int i;
+ int err = 0;
f2fs_down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
- __allocate_new_segment(sbi, i, false, false);
+ err += __allocate_new_segment(sbi, i, false, false);
up_write(&SIT_I(sbi)->sentry_lock);
f2fs_up_read(&SM_I(sbi)->curseg_lock);
+
+ return err;
}
bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
@@ -3236,8 +3309,8 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
GET_SEGNO(sbi, end);
if (need_align) {
- start_segno = rounddown(start_segno, sbi->segs_per_sec);
- end_segno = roundup(end_segno + 1, sbi->segs_per_sec) - 1;
+ start_segno = rounddown(start_segno, SEGS_PER_SEC(sbi));
+ end_segno = roundup(end_segno + 1, SEGS_PER_SEC(sbi)) - 1;
}
cpc.reason = CP_DISCARD;
@@ -3410,7 +3483,14 @@ static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi,
get_random_u32_inclusive(1, sbi->max_fragment_hole);
}
-void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+static void reset_curseg_fields(struct curseg_info *curseg)
+{
+ curseg->inited = false;
+ curseg->segno = NULL_SEGNO;
+ curseg->next_segno = 0;
+}
+
+int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
struct f2fs_io_info *fio)
@@ -3421,12 +3501,18 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
bool from_gc = (type == CURSEG_ALL_DATA_ATGC);
struct seg_entry *se = NULL;
bool segment_full = false;
+ int ret = 0;
f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&sit_i->sentry_lock);
+ if (curseg->segno == NULL_SEGNO) {
+ ret = -ENOSPC;
+ goto out_err;
+ }
+
if (from_gc) {
f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO);
se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr));
@@ -3435,7 +3521,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
}
*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
- f2fs_bug_on(sbi, curseg->next_blkoff >= sbi->blocks_per_seg);
+ f2fs_bug_on(sbi, curseg->next_blkoff >= BLKS_PER_SEG(sbi));
f2fs_wait_discard_bio(sbi, *new_blkaddr);
@@ -3464,25 +3550,35 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
* since SSR needs latest valid block information.
*/
update_sit_entry(sbi, *new_blkaddr, 1);
- if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
- update_sit_entry(sbi, old_blkaddr, -1);
+ update_sit_entry(sbi, old_blkaddr, -1);
/*
* If the current segment is full, flush it out and replace it with a
* new segment.
*/
if (segment_full) {
+ if (type == CURSEG_COLD_DATA_PINNED &&
+ !((curseg->segno + 1) % sbi->segs_per_sec)) {
+ reset_curseg_fields(curseg);
+ goto skip_new_segment;
+ }
+
if (from_gc) {
- get_atssr_segment(sbi, type, se->type,
+ ret = get_atssr_segment(sbi, type, se->type,
AT_SSR, se->mtime);
} else {
if (need_new_seg(sbi, type))
- new_curseg(sbi, type, false);
+ ret = new_curseg(sbi, type, false);
else
- change_curseg(sbi, type);
+ ret = change_curseg(sbi, type);
stat_inc_seg_type(sbi, curseg);
}
+
+ if (ret)
+ goto out_err;
}
+
+skip_new_segment:
/*
* segment dirty status should be updated after segment allocation,
* so we just need to update status only one time after previous
@@ -3491,12 +3587,12 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr));
- if (IS_DATASEG(type))
+ if (IS_DATASEG(curseg->seg_type))
atomic64_inc(&sbi->allocated_data_blocks);
up_write(&sit_i->sentry_lock);
- if (page && IS_NODESEG(type)) {
+ if (page && IS_NODESEG(curseg->seg_type)) {
fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
f2fs_inode_chksum_set(sbi, page);
@@ -3505,9 +3601,6 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
if (fio) {
struct f2fs_bio_info *io;
- if (F2FS_IO_ALIGNED(sbi))
- fio->retry = 0;
-
INIT_LIST_HEAD(&fio->list);
fio->in_list = 1;
io = sbi->write_io[fio->type] + fio->temp;
@@ -3517,8 +3610,15 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
}
mutex_unlock(&curseg->curseg_mutex);
-
f2fs_up_read(&SM_I(sbi)->curseg_lock);
+ return 0;
+out_err:
+ *new_blkaddr = NULL_ADDR;
+ up_write(&sit_i->sentry_lock);
+ mutex_unlock(&curseg->curseg_mutex);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
+ return ret;
+
}
void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
@@ -3555,21 +3655,25 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
if (keep_order)
f2fs_down_read(&fio->sbi->io_order_lock);
-reallocate:
- f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
- &fio->new_blkaddr, sum, type, fio);
+
+ if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
+ &fio->new_blkaddr, sum, type, fio)) {
+ if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host))
+ fscrypt_finalize_bounce_page(&fio->encrypted_page);
+ if (PageWriteback(fio->page))
+ end_page_writeback(fio->page);
+ if (f2fs_in_warm_node_list(fio->sbi, fio->page))
+ f2fs_del_fsync_node_entry(fio->sbi, fio->page);
+ goto out;
+ }
if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO)
f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr);
/* writeout dirty page into bdev */
f2fs_submit_page_write(fio);
- if (fio->retry) {
- fio->old_blkaddr = fio->new_blkaddr;
- goto reallocate;
- }
f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1);
-
+out:
if (keep_order)
f2fs_up_read(&fio->sbi->io_order_lock);
}
@@ -3653,8 +3757,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
}
if (fio->post_read)
- invalidate_mapping_pages(META_MAPPING(sbi),
- fio->new_blkaddr, fio->new_blkaddr);
+ f2fs_truncate_meta_inode_pages(sbi, fio->new_blkaddr, 1);
stat_inc_inplace_blocks(fio->sbi);
@@ -3743,7 +3846,8 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
/* change the current segment */
if (segno != curseg->segno) {
curseg->next_segno = segno;
- change_curseg(sbi, type);
+ if (change_curseg(sbi, type))
+ goto out_unlock;
}
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
@@ -3769,12 +3873,14 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (recover_curseg) {
if (old_cursegno != curseg->segno) {
curseg->next_segno = old_cursegno;
- change_curseg(sbi, type);
+ if (change_curseg(sbi, type))
+ goto out_unlock;
}
curseg->next_blkoff = old_blkoff;
curseg->alloc_type = old_alloc_type;
}
+out_unlock:
up_write(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
f2fs_up_write(&SM_I(sbi)->curseg_lock);
@@ -3844,7 +3950,7 @@ void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
for (i = 0; i < len; i++)
f2fs_wait_on_block_writeback(inode, blkaddr + i);
- invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr + len - 1);
+ f2fs_truncate_meta_inode_pages(sbi, blkaddr, len);
}
static int read_compacted_summaries(struct f2fs_sb_info *sbi)
@@ -3886,7 +3992,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
seg_i->next_blkoff = blk_off;
if (seg_i->alloc_type == SSR)
- blk_off = sbi->blocks_per_seg;
+ blk_off = BLKS_PER_SEG(sbi);
for (j = 0; j < blk_off; j++) {
struct f2fs_summary *s;
@@ -3954,7 +4060,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
struct f2fs_summary *ns = &sum->entries[0];
int i;
- for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
+ for (i = 0; i < BLKS_PER_SEG(sbi); i++, ns++) {
ns->version = 0;
ns->ofs_in_node = 0;
}
@@ -4460,7 +4566,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
#endif
sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
- sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
+ sit_i->sit_blocks = SEGS_TO_BLKS(sbi, sit_segs);
sit_i->written_valid_blocks = 0;
sit_i->bitmap_size = sit_bitmap_size;
sit_i->dirty_sentries = 0;
@@ -4533,9 +4639,7 @@ static int build_curseg(struct f2fs_sb_info *sbi)
array[i].seg_type = CURSEG_COLD_DATA;
else if (i == CURSEG_ALL_DATA_ATGC)
array[i].seg_type = CURSEG_COLD_DATA;
- array[i].segno = NULL_SEGNO;
- array[i].next_blkoff = 0;
- array[i].inited = false;
+ reset_curseg_fields(&array[i]);
}
return restore_curseg_summaries(sbi);
}
@@ -4587,21 +4691,20 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks;
- if (f2fs_block_unit_discard(sbi)) {
- /* build discard map only one time */
- if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
- memset(se->discard_map, 0xff,
+ if (!f2fs_block_unit_discard(sbi))
+ goto init_discard_map_done;
+
+ /* build discard map only one time */
+ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+ memset(se->discard_map, 0xff,
SIT_VBLOCK_MAP_SIZE);
- } else {
- memcpy(se->discard_map,
- se->cur_valid_map,
+ goto init_discard_map_done;
+ }
+ memcpy(se->discard_map, se->cur_valid_map,
SIT_VBLOCK_MAP_SIZE);
- sbi->discard_blks +=
- sbi->blocks_per_seg -
+ sbi->discard_blks += BLKS_PER_SEG(sbi) -
se->valid_blocks;
- }
- }
-
+init_discard_map_done:
if (__is_large_section(sbi))
get_sec_entry(sbi, start)->valid_blocks +=
se->valid_blocks;
@@ -4741,7 +4844,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
return;
mutex_lock(&dirty_i->seglist_lock);
- for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
+ for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) {
valid_blocks = get_valid_blocks(sbi, segno, true);
secno = GET_SEC_FROM_SEG(sbi, segno);
@@ -4840,7 +4943,7 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi)
if (curseg->alloc_type == SSR)
continue;
- for (blkofs += 1; blkofs < sbi->blocks_per_seg; blkofs++) {
+ for (blkofs += 1; blkofs < BLKS_PER_SEG(sbi); blkofs++) {
if (!f2fs_test_bit(blkofs, se->cur_valid_map))
continue;
out:
@@ -4856,6 +4959,16 @@ out:
}
#ifdef CONFIG_BLK_DEV_ZONED
+static const char *f2fs_zone_status[BLK_ZONE_COND_OFFLINE + 1] = {
+ [BLK_ZONE_COND_NOT_WP] = "NOT_WP",
+ [BLK_ZONE_COND_EMPTY] = "EMPTY",
+ [BLK_ZONE_COND_IMP_OPEN] = "IMPLICIT_OPEN",
+ [BLK_ZONE_COND_EXP_OPEN] = "EXPLICIT_OPEN",
+ [BLK_ZONE_COND_CLOSED] = "CLOSED",
+ [BLK_ZONE_COND_READONLY] = "READONLY",
+ [BLK_ZONE_COND_FULL] = "FULL",
+ [BLK_ZONE_COND_OFFLINE] = "OFFLINE",
+};
static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
struct f2fs_dev_info *fdev,
@@ -4865,6 +4978,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
block_t zone_block, valid_block_cnt;
unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
int ret;
+ unsigned int nofs_flags;
if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
return 0;
@@ -4876,14 +4990,19 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
* Skip check of zones cursegs point to, since
* fix_curseg_write_pointer() checks them.
*/
- if (zone_segno >= MAIN_SEGS(sbi) ||
- IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno)))
+ if (zone_segno >= MAIN_SEGS(sbi))
return 0;
/*
* Get # of valid block of the zone.
*/
valid_block_cnt = get_valid_blocks(sbi, zone_segno, true);
+ if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) {
+ f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]",
+ zone_segno, valid_block_cnt,
+ f2fs_zone_status[zone->cond]);
+ return 0;
+ }
if ((!valid_block_cnt && zone->cond == BLK_ZONE_COND_EMPTY) ||
(valid_block_cnt && zone->cond == BLK_ZONE_COND_FULL))
@@ -4891,8 +5010,8 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
if (!valid_block_cnt) {
f2fs_notice(sbi, "Zone without valid block has non-zero write "
- "pointer. Reset the write pointer: cond[0x%x]",
- zone->cond);
+ "pointer. Reset the write pointer: cond[%s]",
+ f2fs_zone_status[zone->cond]);
ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block,
zone->len >> log_sectors_per_block);
if (ret)
@@ -4909,11 +5028,13 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
* selected for write operation until it get discarded.
*/
f2fs_notice(sbi, "Valid blocks are not aligned with write "
- "pointer: valid block[0x%x,0x%x] cond[0x%x]",
- zone_segno, valid_block_cnt, zone->cond);
+ "pointer: valid block[0x%x,0x%x] cond[%s]",
+ zone_segno, valid_block_cnt, f2fs_zone_status[zone->cond]);
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
- zone->start, zone->len, GFP_NOFS);
+ zone->start, zone->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret == -EOPNOTSUPP) {
ret = blkdev_issue_zeroout(fdev->bdev, zone->wp,
zone->len - (zone->wp - zone->start),
@@ -5119,7 +5240,7 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg(
unsigned int secno;
if (!sbi->unusable_blocks_per_sec)
- return sbi->blocks_per_seg;
+ return BLKS_PER_SEG(sbi);
secno = GET_SEC_FROM_SEG(sbi, segno);
seg_start = START_BLOCK(sbi, segno);
@@ -5134,10 +5255,10 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg(
*/
if (seg_start >= sec_cap_blkaddr)
return 0;
- if (seg_start + sbi->blocks_per_seg > sec_cap_blkaddr)
+ if (seg_start + BLKS_PER_SEG(sbi) > sec_cap_blkaddr)
return sec_cap_blkaddr - seg_start;
- return sbi->blocks_per_seg;
+ return BLKS_PER_SEG(sbi);
}
#else
int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
@@ -5163,7 +5284,7 @@ unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
if (f2fs_sb_has_blkzoned(sbi))
return f2fs_usable_zone_blks_in_seg(sbi, segno);
- return sbi->blocks_per_seg;
+ return BLKS_PER_SEG(sbi);
}
unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
@@ -5172,7 +5293,7 @@ unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
if (f2fs_sb_has_blkzoned(sbi))
return CAP_SEGS_PER_SEC(sbi);
- return sbi->segs_per_sec;
+ return SEGS_PER_SEC(sbi);
}
/*
@@ -5187,14 +5308,14 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
sit_i->min_mtime = ULLONG_MAX;
- for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
+ for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) {
unsigned int i;
unsigned long long mtime = 0;
- for (i = 0; i < sbi->segs_per_sec; i++)
+ for (i = 0; i < SEGS_PER_SEC(sbi); i++)
mtime += get_seg_entry(sbi, segno + i)->mtime;
- mtime = div_u64(mtime, sbi->segs_per_sec);
+ mtime = div_u64(mtime, SEGS_PER_SEC(sbi));
if (sit_i->min_mtime > mtime)
sit_i->min_mtime = mtime;
@@ -5233,7 +5354,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ipu_policy = BIT(F2FS_IPU_FSYNC);
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
- sm_info->min_seq_blocks = sbi->blocks_per_seg;
+ sm_info->min_seq_blocks = BLKS_PER_SEG(sbi);
sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS;
sm_info->min_ssr_sections = reserved_sections(sbi);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 8129be788bd5..e1c0f418aa11 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -48,21 +48,21 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
#define IS_CURSEC(sbi, secno) \
(((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
- (sbi)->segs_per_sec) || \
+ SEGS_PER_SEC(sbi)) || \
((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
- (sbi)->segs_per_sec) || \
+ SEGS_PER_SEC(sbi)) || \
((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
- (sbi)->segs_per_sec) || \
+ SEGS_PER_SEC(sbi)) || \
((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
- (sbi)->segs_per_sec) || \
+ SEGS_PER_SEC(sbi)) || \
((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
- (sbi)->segs_per_sec) || \
+ SEGS_PER_SEC(sbi)) || \
((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
- (sbi)->segs_per_sec) || \
+ SEGS_PER_SEC(sbi)) || \
((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \
- (sbi)->segs_per_sec) || \
+ SEGS_PER_SEC(sbi)) || \
((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \
- (sbi)->segs_per_sec))
+ SEGS_PER_SEC(sbi)))
#define MAIN_BLKADDR(sbi) \
(SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \
@@ -77,40 +77,37 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
#define TOTAL_SEGS(sbi) \
(SM_I(sbi) ? SM_I(sbi)->segment_count : \
le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count))
-#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg)
+#define TOTAL_BLKS(sbi) (SEGS_TO_BLKS(sbi, TOTAL_SEGS(sbi)))
#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \
(sbi)->log_blocks_per_seg))
#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \
- (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg))
+ (SEGS_TO_BLKS(sbi, GET_R2L_SEGNO(FREE_I(sbi), segno))))
#define NEXT_FREE_BLKADDR(sbi, curseg) \
(START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff)
#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi))
#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
- (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg)
+ (BLKS_TO_SEGS(sbi, GET_SEGOFF_FROM_SEG0(sbi, blk_addr)))
#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
- (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1))
+ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (BLKS_PER_SEG(sbi) - 1))
#define GET_SEGNO(sbi, blk_addr) \
((!__is_valid_data_blkaddr(blk_addr)) ? \
NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
-#define BLKS_PER_SEC(sbi) \
- ((sbi)->segs_per_sec * (sbi)->blocks_per_seg)
#define CAP_BLKS_PER_SEC(sbi) \
- ((sbi)->segs_per_sec * (sbi)->blocks_per_seg - \
- (sbi)->unusable_blocks_per_sec)
+ (BLKS_PER_SEC(sbi) - (sbi)->unusable_blocks_per_sec)
#define CAP_SEGS_PER_SEC(sbi) \
- ((sbi)->segs_per_sec - ((sbi)->unusable_blocks_per_sec >>\
- (sbi)->log_blocks_per_seg))
+ (SEGS_PER_SEC(sbi) - \
+ BLKS_TO_SEGS(sbi, (sbi)->unusable_blocks_per_sec))
#define GET_SEC_FROM_SEG(sbi, segno) \
- (((segno) == -1) ? -1 : (segno) / (sbi)->segs_per_sec)
+ (((segno) == -1) ? -1 : (segno) / SEGS_PER_SEC(sbi))
#define GET_SEG_FROM_SEC(sbi, secno) \
- ((secno) * (sbi)->segs_per_sec)
+ ((secno) * SEGS_PER_SEC(sbi))
#define GET_ZONE_FROM_SEC(sbi, secno) \
(((secno) == -1) ? -1 : (secno) / (sbi)->secs_per_zone)
#define GET_ZONE_FROM_SEG(sbi, segno) \
@@ -139,16 +136,6 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK)
/*
- * indicate a block allocation direction: RIGHT and LEFT.
- * RIGHT means allocating new sections towards the end of volume.
- * LEFT means the opposite direction.
- */
-enum {
- ALLOC_RIGHT = 0,
- ALLOC_LEFT
-};
-
-/*
* In the victim_sel_policy->alloc_mode, there are three block allocation modes.
* LFS writes data sequentially with cleaning operations.
* SSR (Slack Space Recycle) reuses obsolete space without cleaning operations.
@@ -364,7 +351,7 @@ static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi,
unsigned int blocks = 0;
int i;
- for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) {
+ for (i = 0; i < SEGS_PER_SEC(sbi); i++, start_segno++) {
struct seg_entry *se = get_seg_entry(sbi, start_segno);
blocks += se->ckpt_valid_blocks;
@@ -449,7 +436,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
free_i->free_segments++;
next = find_next_bit(free_i->free_segmap,
- start_segno + sbi->segs_per_sec, start_segno);
+ start_segno + SEGS_PER_SEC(sbi), start_segno);
if (next >= start_segno + usable_segs) {
clear_bit(secno, free_i->free_secmap);
free_i->free_sections++;
@@ -485,7 +472,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
if (!inmem && IS_CURSEC(sbi, secno))
goto skip_free;
next = find_next_bit(free_i->free_segmap,
- start_segno + sbi->segs_per_sec, start_segno);
+ start_segno + SEGS_PER_SEC(sbi), start_segno);
if (next >= start_segno + usable_segs) {
if (test_and_clear_bit(secno, free_i->free_secmap))
free_i->free_sections++;
@@ -573,23 +560,22 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
unsigned int node_blocks, unsigned int dent_blocks)
{
- unsigned int segno, left_blocks;
+ unsigned segno, left_blocks;
int i;
- /* check current node segment */
+ /* check current node sections in the worst case. */
for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) {
segno = CURSEG_I(sbi, i)->segno;
- left_blocks = f2fs_usable_blks_in_seg(sbi, segno) -
- get_seg_entry(sbi, segno)->ckpt_valid_blocks;
-
+ left_blocks = CAP_BLKS_PER_SEC(sbi) -
+ get_ckpt_valid_blocks(sbi, segno, true);
if (node_blocks > left_blocks)
return false;
}
- /* check current data segment */
+ /* check current data section for dentry blocks. */
segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno;
- left_blocks = f2fs_usable_blks_in_seg(sbi, segno) -
- get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+ left_blocks = CAP_BLKS_PER_SEC(sbi) -
+ get_ckpt_valid_blocks(sbi, segno, true);
if (dent_blocks > left_blocks)
return false;
return true;
@@ -638,7 +624,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
if (free_secs > upper_secs)
return false;
- else if (free_secs <= lower_secs)
+ if (free_secs <= lower_secs)
return true;
return !curseg_space;
}
@@ -793,10 +779,10 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
return -EFSCORRUPTED;
}
- if (usable_blks_per_seg < sbi->blocks_per_seg)
+ if (usable_blks_per_seg < BLKS_PER_SEG(sbi))
f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map,
- sbi->blocks_per_seg,
- usable_blks_per_seg) != sbi->blocks_per_seg);
+ BLKS_PER_SEG(sbi),
+ usable_blks_per_seg) != BLKS_PER_SEG(sbi));
/* check segment usage, and check boundary of a given segment number */
if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg
@@ -915,9 +901,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
return 0;
if (type == DATA)
- return sbi->blocks_per_seg;
+ return BLKS_PER_SEG(sbi);
else if (type == NODE)
- return 8 * sbi->blocks_per_seg;
+ return SEGS_TO_BLKS(sbi, 8);
else if (type == META)
return 8 * BIO_MAX_VECS;
else
@@ -969,3 +955,13 @@ wake_up:
dcc->discard_wake = true;
wake_up_interruptible_all(&dcc->discard_wait_queue);
}
+
+static inline unsigned int first_zoned_segno(struct f2fs_sb_info *sbi)
+{
+ int devi;
+
+ for (devi = 0; devi < sbi->s_ndevs; devi++)
+ if (bdev_is_zoned(FDEV(devi).bdev))
+ return GET_SEGNO(sbi, FDEV(devi).start_blk);
+ return 0;
+}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index d45ab0992ae5..a4bc26dfdb1a 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -44,24 +44,26 @@ static struct kmem_cache *f2fs_inode_cachep;
#ifdef CONFIG_F2FS_FAULT_INJECTION
const char *f2fs_fault_name[FAULT_MAX] = {
- [FAULT_KMALLOC] = "kmalloc",
- [FAULT_KVMALLOC] = "kvmalloc",
- [FAULT_PAGE_ALLOC] = "page alloc",
- [FAULT_PAGE_GET] = "page get",
- [FAULT_ALLOC_NID] = "alloc nid",
- [FAULT_ORPHAN] = "orphan",
- [FAULT_BLOCK] = "no more block",
- [FAULT_DIR_DEPTH] = "too big dir depth",
- [FAULT_EVICT_INODE] = "evict_inode fail",
- [FAULT_TRUNCATE] = "truncate fail",
- [FAULT_READ_IO] = "read IO error",
- [FAULT_CHECKPOINT] = "checkpoint error",
- [FAULT_DISCARD] = "discard error",
- [FAULT_WRITE_IO] = "write IO error",
- [FAULT_SLAB_ALLOC] = "slab alloc",
- [FAULT_DQUOT_INIT] = "dquot initialize",
- [FAULT_LOCK_OP] = "lock_op",
- [FAULT_BLKADDR] = "invalid blkaddr",
+ [FAULT_KMALLOC] = "kmalloc",
+ [FAULT_KVMALLOC] = "kvmalloc",
+ [FAULT_PAGE_ALLOC] = "page alloc",
+ [FAULT_PAGE_GET] = "page get",
+ [FAULT_ALLOC_NID] = "alloc nid",
+ [FAULT_ORPHAN] = "orphan",
+ [FAULT_BLOCK] = "no more block",
+ [FAULT_DIR_DEPTH] = "too big dir depth",
+ [FAULT_EVICT_INODE] = "evict_inode fail",
+ [FAULT_TRUNCATE] = "truncate fail",
+ [FAULT_READ_IO] = "read IO error",
+ [FAULT_CHECKPOINT] = "checkpoint error",
+ [FAULT_DISCARD] = "discard error",
+ [FAULT_WRITE_IO] = "write IO error",
+ [FAULT_SLAB_ALLOC] = "slab alloc",
+ [FAULT_DQUOT_INIT] = "dquot initialize",
+ [FAULT_LOCK_OP] = "lock_op",
+ [FAULT_BLKADDR_VALIDITY] = "invalid blkaddr",
+ [FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr",
+ [FAULT_NO_SEGMENT] = "no free segment",
};
void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
@@ -137,7 +139,6 @@ enum {
Opt_resgid,
Opt_resuid,
Opt_mode,
- Opt_io_size_bits,
Opt_fault_injection,
Opt_fault_type,
Opt_lazytime,
@@ -216,7 +217,6 @@ static match_table_t f2fs_tokens = {
{Opt_resgid, "resgid=%u"},
{Opt_resuid, "resuid=%u"},
{Opt_mode, "mode=%s"},
- {Opt_io_size_bits, "io_bits=%u"},
{Opt_fault_injection, "fault_injection=%u"},
{Opt_fault_type, "fault_type=%u"},
{Opt_lazytime, "lazytime"},
@@ -263,7 +263,8 @@ static match_table_t f2fs_tokens = {
{Opt_err, NULL},
};
-void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...)
+void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate,
+ const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -274,8 +275,12 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...)
level = printk_get_level(fmt);
vaf.fmt = printk_skip_level(fmt);
vaf.va = &args;
- printk("%c%cF2FS-fs (%s): %pV\n",
- KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+ if (limit_rate)
+ printk_ratelimited("%c%cF2FS-fs (%s): %pV\n",
+ KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+ else
+ printk("%c%cF2FS-fs (%s): %pV\n",
+ KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
va_end(args);
}
@@ -343,46 +348,6 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).s_resgid));
}
-static inline int adjust_reserved_segment(struct f2fs_sb_info *sbi)
-{
- unsigned int sec_blks = sbi->blocks_per_seg * sbi->segs_per_sec;
- unsigned int avg_vblocks;
- unsigned int wanted_reserved_segments;
- block_t avail_user_block_count;
-
- if (!F2FS_IO_ALIGNED(sbi))
- return 0;
-
- /* average valid block count in section in worst case */
- avg_vblocks = sec_blks / F2FS_IO_SIZE(sbi);
-
- /*
- * we need enough free space when migrating one section in worst case
- */
- wanted_reserved_segments = (F2FS_IO_SIZE(sbi) / avg_vblocks) *
- reserved_segments(sbi);
- wanted_reserved_segments -= reserved_segments(sbi);
-
- avail_user_block_count = sbi->user_block_count -
- sbi->current_reserved_blocks -
- F2FS_OPTION(sbi).root_reserved_blocks;
-
- if (wanted_reserved_segments * sbi->blocks_per_seg >
- avail_user_block_count) {
- f2fs_err(sbi, "IO align feature can't grab additional reserved segment: %u, available segments: %u",
- wanted_reserved_segments,
- avail_user_block_count >> sbi->log_blocks_per_seg);
- return -ENOSPC;
- }
-
- SM_I(sbi)->additional_reserved_segments = wanted_reserved_segments;
-
- f2fs_info(sbi, "IO align feature needs additional reserved segment: %u",
- wanted_reserved_segments);
-
- return 0;
-}
-
static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi)
{
if (!F2FS_OPTION(sbi).unusable_cap_perc)
@@ -663,7 +628,7 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str)
#ifdef CONFIG_F2FS_FS_ZSTD
static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
{
- unsigned int level;
+ int level;
int len = 4;
if (strlen(str) == len) {
@@ -677,9 +642,15 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>");
return -EINVAL;
}
- if (kstrtouint(str + 1, 10, &level))
+ if (kstrtoint(str + 1, 10, &level))
return -EINVAL;
+ /* f2fs does not support negative compress level now */
+ if (level < 0) {
+ f2fs_info(sbi, "do not support negative compress level: %d", level);
+ return -ERANGE;
+ }
+
if (!f2fs_is_compress_level_valid(COMPRESS_ZSTD, level)) {
f2fs_info(sbi, "invalid zstd compress level: %d", level);
return -EINVAL;
@@ -763,10 +734,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
clear_opt(sbi, DISCARD);
break;
case Opt_noheap:
- set_opt(sbi, NOHEAP);
- break;
case Opt_heap:
- clear_opt(sbi, NOHEAP);
+ f2fs_warn(sbi, "heap/no_heap options were deprecated");
break;
#ifdef CONFIG_F2FS_FS_XATTR
case Opt_user_xattr:
@@ -913,16 +882,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
}
kfree(name);
break;
- case Opt_io_size_bits:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) {
- f2fs_warn(sbi, "Not support %ld, larger than %d",
- BIT(arg), BIO_MAX_VECS);
- return -EINVAL;
- }
- F2FS_OPTION(sbi).write_io_size_bits = arg;
- break;
#ifdef CONFIG_F2FS_FAULT_INJECTION
case Opt_fault_injection:
if (args->from && match_int(args, &arg))
@@ -1392,12 +1351,6 @@ default_check:
}
#endif
- if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) {
- f2fs_err(sbi, "Should set mode=lfs with %luKB-sized IO",
- F2FS_IO_SIZE_KB(sbi));
- return -EINVAL;
- }
-
if (test_opt(sbi, INLINE_XATTR_SIZE)) {
int min_size, max_size;
@@ -1605,7 +1558,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
for (i = 0; i < sbi->s_ndevs; i++) {
if (i > 0)
- bdev_release(FDEV(i).bdev_handle);
+ bdev_fput(FDEV(i).bdev_file);
#ifdef CONFIG_BLK_DEV_ZONED
kvfree(FDEV(i).blkz_seq);
#endif
@@ -1718,7 +1671,6 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_destroy_page_array_cache(sbi);
f2fs_destroy_xattr_caches(sbi);
- mempool_destroy(sbi->write_io_dummy);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
kfree(F2FS_OPTION(sbi).s_qf_names[i]);
@@ -2009,10 +1961,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
} else {
seq_puts(seq, ",nodiscard");
}
- if (test_opt(sbi, NOHEAP))
- seq_puts(seq, ",no_heap");
- else
- seq_puts(seq, ",heap");
#ifdef CONFIG_F2FS_FS_XATTR
if (test_opt(sbi, XATTR_USER))
seq_puts(seq, ",user_xattr");
@@ -2078,9 +2026,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
F2FS_OPTION(sbi).s_resuid),
from_kgid_munged(&init_user_ns,
F2FS_OPTION(sbi).s_resgid));
- if (F2FS_IO_SIZE_BITS(sbi))
- seq_printf(seq, ",io_bits=%u",
- F2FS_OPTION(sbi).write_io_size_bits);
#ifdef CONFIG_F2FS_FAULT_INJECTION
if (test_opt(sbi, FAULT_INJECTION)) {
seq_printf(seq, ",fault_injection=%u",
@@ -2192,7 +2137,6 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount)
set_opt(sbi, INLINE_XATTR);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
- set_opt(sbi, NOHEAP);
set_opt(sbi, MERGE_CHECKPOINT);
F2FS_OPTION(sbi).unusable_cap = 0;
sbi->sb->s_flags |= SB_LAZYTIME;
@@ -2247,6 +2191,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
.init_gc_type = FG_GC,
.should_migrate_blocks = false,
.err_gc_skipped = true,
+ .no_bg_gc = true,
.nr_free_secs = 1 };
f2fs_down_write(&sbi->gc_lock);
@@ -2332,7 +2277,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE);
bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE);
bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT);
- bool no_io_align = !F2FS_IO_ALIGNED(sbi);
bool no_atgc = !test_opt(sbi, ATGC);
bool no_discard = !test_opt(sbi, DISCARD);
bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE);
@@ -2440,12 +2384,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) {
- err = -EINVAL;
- f2fs_warn(sbi, "switch io_bits option is not allowed");
- goto restore_opts;
- }
-
if (no_compress_cache == !!test_opt(sbi, COMPRESS_CACHE)) {
err = -EINVAL;
f2fs_warn(sbi, "switch compress_cache option is not allowed");
@@ -2768,7 +2706,7 @@ int f2fs_dquot_initialize(struct inode *inode)
return dquot_initialize(inode);
}
-static struct dquot **f2fs_get_dquots(struct inode *inode)
+static struct dquot __rcu **f2fs_get_dquots(struct inode *inode)
{
return F2FS_I(inode)->i_dquot;
}
@@ -3706,7 +3644,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
}
main_segs = le32_to_cpu(raw_super->segment_count_main);
- blocks_per_seg = sbi->blocks_per_seg;
+ blocks_per_seg = BLKS_PER_SEG(sbi);
for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs ||
@@ -3818,9 +3756,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec);
sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone);
sbi->total_sections = le32_to_cpu(raw_super->section_count);
- sbi->total_node_count =
- (le32_to_cpu(raw_super->segment_count_nat) / 2)
- * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK;
+ sbi->total_node_count = SEGS_TO_BLKS(sbi,
+ ((le32_to_cpu(raw_super->segment_count_nat) / 2) *
+ NAT_ENTRY_PER_BLOCK));
F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino);
F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino);
F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino);
@@ -3829,7 +3767,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
- sbi->migration_granularity = sbi->segs_per_sec;
+ sbi->migration_granularity = SEGS_PER_SEC(sbi);
sbi->seq_file_ra_mul = MIN_RA_MUL;
sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE;
sbi->max_fragment_hole = DEF_FRAGMENT_SIZE;
@@ -3930,11 +3868,6 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
return 0;
zone_sectors = bdev_zone_sectors(bdev);
- if (!is_power_of_2(zone_sectors)) {
- f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n");
- return -EINVAL;
- }
-
if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
SECTOR_TO_BLOCK(zone_sectors))
return -EINVAL;
@@ -4090,7 +4023,9 @@ static void f2fs_record_stop_reason(struct f2fs_sb_info *sbi)
f2fs_up_write(&sbi->sb_lock);
if (err)
- f2fs_err(sbi, "f2fs_commit_super fails to record err:%d", err);
+ f2fs_err_ratelimited(sbi,
+ "f2fs_commit_super fails to record stop_reason, err:%d",
+ err);
}
void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag)
@@ -4133,8 +4068,9 @@ static void f2fs_record_errors(struct f2fs_sb_info *sbi, unsigned char error)
err = f2fs_commit_super(sbi, false);
if (err)
- f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d",
- error, err);
+ f2fs_err_ratelimited(sbi,
+ "f2fs_commit_super fails to record errors:%u, err:%d",
+ error, err);
out_unlock:
f2fs_up_write(&sbi->sb_lock);
}
@@ -4247,7 +4183,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
for (i = 0; i < max_devices; i++) {
if (i == 0)
- FDEV(0).bdev_handle = sbi->sb->s_bdev_handle;
+ FDEV(0).bdev_file = sbi->sb->s_bdev_file;
else if (!RDEV(i).path[0])
break;
@@ -4259,22 +4195,22 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
if (i == 0) {
FDEV(i).start_blk = 0;
FDEV(i).end_blk = FDEV(i).start_blk +
- (FDEV(i).total_segments <<
- sbi->log_blocks_per_seg) - 1 +
- le32_to_cpu(raw_super->segment0_blkaddr);
+ SEGS_TO_BLKS(sbi,
+ FDEV(i).total_segments) - 1 +
+ le32_to_cpu(raw_super->segment0_blkaddr);
} else {
FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
FDEV(i).end_blk = FDEV(i).start_blk +
- (FDEV(i).total_segments <<
- sbi->log_blocks_per_seg) - 1;
- FDEV(i).bdev_handle = bdev_open_by_path(
+ SEGS_TO_BLKS(sbi,
+ FDEV(i).total_segments) - 1;
+ FDEV(i).bdev_file = bdev_file_open_by_path(
FDEV(i).path, mode, sbi->sb, NULL);
}
}
- if (IS_ERR(FDEV(i).bdev_handle))
- return PTR_ERR(FDEV(i).bdev_handle);
+ if (IS_ERR(FDEV(i).bdev_file))
+ return PTR_ERR(FDEV(i).bdev_file);
- FDEV(i).bdev = FDEV(i).bdev_handle->bdev;
+ FDEV(i).bdev = file_bdev(FDEV(i).bdev_file);
/* to release errored devices */
sbi->s_ndevs = i + 1;
@@ -4305,8 +4241,6 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
FDEV(i).total_segments,
FDEV(i).start_blk, FDEV(i).end_blk);
}
- f2fs_info(sbi,
- "IO Block Size: %8ld KB", F2FS_IO_SIZE_KB(sbi));
return 0;
}
@@ -4496,7 +4430,7 @@ try_onemore:
sb->s_time_gran = 1;
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
- memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
+ super_set_uuid(sb, (void *) raw_super->uuid, sizeof(raw_super->uuid));
sb->s_iflags |= SB_I_CGROUPWB;
/* init f2fs-specific super block info */
@@ -4519,19 +4453,10 @@ try_onemore:
if (err)
goto free_iostat;
- if (F2FS_IO_ALIGNED(sbi)) {
- sbi->write_io_dummy =
- mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0);
- if (!sbi->write_io_dummy) {
- err = -ENOMEM;
- goto free_percpu;
- }
- }
-
/* init per sbi slab cache */
err = f2fs_init_xattr_caches(sbi);
if (err)
- goto free_io_dummy;
+ goto free_percpu;
err = f2fs_init_page_array_cache(sbi);
if (err)
goto free_xattr_cache;
@@ -4619,10 +4544,6 @@ try_onemore:
goto free_nm;
}
- err = adjust_reserved_segment(sbi);
- if (err)
- goto free_nm;
-
/* For write statistics */
sbi->sectors_written_start = f2fs_get_sectors_written(sbi);
@@ -4660,6 +4581,7 @@ try_onemore:
goto free_node_inode;
}
+ generic_set_sb_d_ops(sb);
sb->s_root = d_make_root(root); /* allocate root dentry */
if (!sb->s_root) {
err = -ENOMEM;
@@ -4748,13 +4670,20 @@ reset_checkpoint:
* If the f2fs is not readonly and fsync data recovery succeeds,
* check zoned block devices' write pointer consistency.
*/
- if (!err && !f2fs_readonly(sb) && f2fs_sb_has_blkzoned(sbi)) {
- err = f2fs_check_write_pointer(sbi);
- if (err)
- goto free_meta;
+ if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sb)) {
+ int err2;
+
+ f2fs_notice(sbi, "Checking entire write pointers");
+ err2 = f2fs_check_write_pointer(sbi);
+ if (err2)
+ err = err2;
}
+ if (err)
+ goto free_meta;
- f2fs_init_inmem_curseg(sbi);
+ err = f2fs_init_inmem_curseg(sbi);
+ if (err)
+ goto sync_free_meta;
/* f2fs_recover_fsync_data() cleared this already */
clear_sbi_flag(sbi, SBI_POR_DOING);
@@ -4853,8 +4782,6 @@ free_page_array_cache:
f2fs_destroy_page_array_cache(sbi);
free_xattr_cache:
f2fs_destroy_xattr_caches(sbi);
-free_io_dummy:
- mempool_destroy(sbi->write_io_dummy);
free_percpu:
destroy_percpu_info(sbi);
free_iostat:
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index a7ec55c7bb20..a568ce96cf56 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -493,8 +493,8 @@ out:
spin_lock(&sbi->stat_lock);
if (t > (unsigned long)(sbi->user_block_count -
F2FS_OPTION(sbi).root_reserved_blocks -
- sbi->blocks_per_seg *
- SM_I(sbi)->additional_reserved_segments)) {
+ SEGS_TO_BLKS(sbi,
+ SM_I(sbi)->additional_reserved_segments))) {
spin_unlock(&sbi->stat_lock);
return -EINVAL;
}
@@ -551,7 +551,7 @@ out:
}
if (!strcmp(a->attr.name, "migration_granularity")) {
- if (t == 0 || t > sbi->segs_per_sec)
+ if (t == 0 || t > SEGS_PER_SEC(sbi))
return -EINVAL;
}
@@ -1492,6 +1492,50 @@ static int __maybe_unused discard_plist_seq_show(struct seq_file *seq,
return 0;
}
+static int __maybe_unused disk_map_seq_show(struct seq_file *seq,
+ void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int i;
+
+ seq_printf(seq, "Address Layout : %5luB Block address (# of Segments)\n",
+ F2FS_BLKSIZE);
+ seq_printf(seq, " SB : %12s\n", "0/1024B");
+ seq_printf(seq, " seg0_blkaddr : 0x%010x\n", SEG0_BLKADDR(sbi));
+ seq_printf(seq, " Checkpoint : 0x%010x (%10d)\n",
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr), 2);
+ seq_printf(seq, " SIT : 0x%010x (%10d)\n",
+ SIT_I(sbi)->sit_base_addr,
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_sit));
+ seq_printf(seq, " NAT : 0x%010x (%10d)\n",
+ NM_I(sbi)->nat_blkaddr,
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_nat));
+ seq_printf(seq, " SSA : 0x%010x (%10d)\n",
+ SM_I(sbi)->ssa_blkaddr,
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_ssa));
+ seq_printf(seq, " Main : 0x%010x (%10d)\n",
+ SM_I(sbi)->main_blkaddr,
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main));
+ seq_printf(seq, " # of Sections : %12d\n",
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count));
+ seq_printf(seq, " Segs/Sections : %12d\n",
+ SEGS_PER_SEC(sbi));
+ seq_printf(seq, " Section size : %12d MB\n",
+ SEGS_PER_SEC(sbi) << 1);
+
+ if (!f2fs_is_multi_device(sbi))
+ return 0;
+
+ seq_puts(seq, "\nDisk Map for multi devices:\n");
+ for (i = 0; i < sbi->s_ndevs; i++)
+ seq_printf(seq, "Disk:%2d (zoned=%d): 0x%010x - 0x%010x on %s\n",
+ i, bdev_is_zoned(FDEV(i).bdev),
+ FDEV(i).start_blk, FDEV(i).end_blk,
+ FDEV(i).path);
+ return 0;
+}
+
int __init f2fs_init_sysfs(void)
{
int ret;
@@ -1573,6 +1617,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
victim_bits_seq_show, sb);
proc_create_single_data("discard_plist_info", 0444, sbi->s_proc,
discard_plist_seq_show, sb);
+ proc_create_single_data("disk_map", 0444, sbi->s_proc,
+ disk_map_seq_show, sb);
return 0;
put_feature_list_kobj:
kobject_put(&sbi->s_feature_list_kobj);
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 4fc95f353a7a..f7bb0c54502c 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -258,21 +258,23 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
- struct page *page;
+ struct folio *folio;
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
- page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
- if (!page || !PageUptodate(page)) {
+ folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
+ if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
- if (page)
- put_page(page);
+ if (!IS_ERR(folio))
+ folio_put(folio);
else if (num_ra_pages > 1)
page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
- page = read_mapping_page(inode->i_mapping, index, NULL);
+ folio = read_mapping_folio(inode->i_mapping, index, NULL);
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
}
- return page;
+ return folio_file_page(folio, index);
}
static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf,
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 738e427e2d21..2af424e200b3 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -47,7 +47,7 @@ int __init fat_cache_init(void)
{
fat_cache_cachep = kmem_cache_create("fat_cache",
sizeof(struct fat_cache),
- 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+ 0, SLAB_RECLAIM_ACCOUNT,
init_once);
if (fat_cache_cachep == NULL)
return -ENOMEM;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 1fac3dabf130..d9e6fbb6f246 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -787,7 +787,7 @@ static int __init fat_init_inodecache(void)
fat_inode_cachep = kmem_cache_create("fat_inode_cache",
sizeof(struct msdos_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
init_once);
if (fat_inode_cachep == NULL)
return -ENOMEM;
@@ -1762,6 +1762,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
else /* fat 16 or 12 */
sbi->vol_id = bpb.fat16_vol_id;
+ __le32 vol_id_le = cpu_to_le32(sbi->vol_id);
+ super_set_uuid(sb, (void *) &vol_id_le, sizeof(vol_id_le));
+
sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index c52e63e10d35..509eea96a457 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -130,6 +130,12 @@ fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp,
fid->parent_i_gen = parent->i_generation;
type = FILEID_FAT_WITH_PARENT;
*lenp = FAT_FID_SIZE_WITH_PARENT;
+ } else {
+ /*
+ * We need to initialize this field because the fh is actually
+ * 12 bytes long
+ */
+ fid->parent_i_pos_hi = 0;
}
return type;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index c80a6acad742..54cc85d3338e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -27,6 +27,7 @@
#include <linux/memfd.h>
#include <linux/compat.h>
#include <linux/mount.h>
+#include <linux/rw_hint.h>
#include <linux/poll.h>
#include <asm/siginfo.h>
@@ -268,8 +269,15 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
}
#endif
-static bool rw_hint_valid(enum rw_hint hint)
+static bool rw_hint_valid(u64 hint)
{
+ BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET);
+ BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE);
+ BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT);
+ BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM);
+ BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG);
+ BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME);
+
switch (hint) {
case RWH_WRITE_LIFE_NOT_SET:
case RWH_WRITE_LIFE_NONE:
@@ -283,34 +291,40 @@ static bool rw_hint_valid(enum rw_hint hint)
}
}
-static long fcntl_rw_hint(struct file *file, unsigned int cmd,
- unsigned long arg)
+static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
+ unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
- enum rw_hint hint;
- u64 h;
+ u64 hint = READ_ONCE(inode->i_write_hint);
- switch (cmd) {
- case F_GET_RW_HINT:
- h = inode->i_write_hint;
- if (copy_to_user(argp, &h, sizeof(*argp)))
- return -EFAULT;
- return 0;
- case F_SET_RW_HINT:
- if (copy_from_user(&h, argp, sizeof(h)))
- return -EFAULT;
- hint = (enum rw_hint) h;
- if (!rw_hint_valid(hint))
- return -EINVAL;
+ if (copy_to_user(argp, &hint, sizeof(*argp)))
+ return -EFAULT;
+ return 0;
+}
- inode_lock(inode);
- inode->i_write_hint = hint;
- inode_unlock(inode);
- return 0;
- default:
+static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct inode *inode = file_inode(file);
+ u64 __user *argp = (u64 __user *)arg;
+ u64 hint;
+
+ if (copy_from_user(&hint, argp, sizeof(hint)))
+ return -EFAULT;
+ if (!rw_hint_valid(hint))
return -EINVAL;
- }
+
+ WRITE_ONCE(inode->i_write_hint, hint);
+
+ /*
+ * file->f_mapping->host may differ from inode. As an example,
+ * blkdev_open() modifies file->f_mapping.
+ */
+ if (file->f_mapping->host != inode)
+ WRITE_ONCE(file->f_mapping->host->i_write_hint, hint);
+
+ return 0;
}
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
@@ -416,8 +430,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
err = memfd_fcntl(filp, cmd, argi);
break;
case F_GET_RW_HINT:
+ err = fcntl_get_rw_hint(filp, cmd, arg);
+ break;
case F_SET_RW_HINT:
- err = fcntl_rw_hint(filp, cmd, arg);
+ err = fcntl_set_rw_hint(filp, cmd, arg);
break;
default:
break;
@@ -846,12 +862,6 @@ int send_sigurg(struct fown_struct *fown)
static DEFINE_SPINLOCK(fasync_lock);
static struct kmem_cache *fasync_cache __ro_after_init;
-static void fasync_free_rcu(struct rcu_head *head)
-{
- kmem_cache_free(fasync_cache,
- container_of(head, struct fasync_struct, fa_rcu));
-}
-
/*
* Remove a fasync entry. If successfully removed, return
* positive and clear the FASYNC flag. If no entry exists,
@@ -877,7 +887,7 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
write_unlock_irq(&fa->fa_lock);
*fp = fa->fa_next;
- call_rcu(&fa->fa_rcu, fasync_free_rcu);
+ kfree_rcu(fa, fa_rcu);
filp->f_flags &= ~FASYNC;
result = 1;
break;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 18b3ba8dc8ea..57a12614addf 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -36,7 +36,7 @@ static long do_sys_name_to_handle(const struct path *path,
if (f_handle.handle_bytes > MAX_HANDLE_SZ)
return -EINVAL;
- handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+ handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
GFP_KERNEL);
if (!handle)
return -ENOMEM;
diff --git a/fs/file_table.c b/fs/file_table.c
index b991f90571b4..4f03beed4737 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -26,7 +26,6 @@
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/task_work.h>
-#include <linux/ima.h>
#include <linux/swap.h>
#include <linux/kmemleak.h>
@@ -276,21 +275,15 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
}
/**
- * alloc_file - allocate and initialize a 'struct file'
+ * file_init_path - initialize a 'struct file' based on path
*
+ * @file: the file to set up
* @path: the (dentry, vfsmount) pair for the new file
- * @flags: O_... flags with which the new file will be opened
* @fop: the 'struct file_operations' for the new file
*/
-static struct file *alloc_file(const struct path *path, int flags,
- const struct file_operations *fop)
+static void file_init_path(struct file *file, const struct path *path,
+ const struct file_operations *fop)
{
- struct file *file;
-
- file = alloc_empty_file(flags, current_cred());
- if (IS_ERR(file))
- return file;
-
file->f_path = *path;
file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
@@ -309,22 +302,51 @@ static struct file *alloc_file(const struct path *path, int flags,
file->f_op = fop;
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(path->dentry->d_inode);
+}
+
+/**
+ * alloc_file - allocate and initialize a 'struct file'
+ *
+ * @path: the (dentry, vfsmount) pair for the new file
+ * @flags: O_... flags with which the new file will be opened
+ * @fop: the 'struct file_operations' for the new file
+ */
+static struct file *alloc_file(const struct path *path, int flags,
+ const struct file_operations *fop)
+{
+ struct file *file;
+
+ file = alloc_empty_file(flags, current_cred());
+ if (!IS_ERR(file))
+ file_init_path(file, path, fop);
return file;
}
-struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
- const char *name, int flags,
- const struct file_operations *fops)
+static inline int alloc_path_pseudo(const char *name, struct inode *inode,
+ struct vfsmount *mnt, struct path *path)
{
struct qstr this = QSTR_INIT(name, strlen(name));
+
+ path->dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
+ if (!path->dentry)
+ return -ENOMEM;
+ path->mnt = mntget(mnt);
+ d_instantiate(path->dentry, inode);
+ return 0;
+}
+
+struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
+ const char *name, int flags,
+ const struct file_operations *fops)
+{
+ int ret;
struct path path;
struct file *file;
- path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
- if (!path.dentry)
- return ERR_PTR(-ENOMEM);
- path.mnt = mntget(mnt);
- d_instantiate(path.dentry, inode);
+ ret = alloc_path_pseudo(name, inode, mnt, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
file = alloc_file(&path, flags, fops);
if (IS_ERR(file)) {
ihold(inode);
@@ -334,6 +356,30 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
}
EXPORT_SYMBOL(alloc_file_pseudo);
+struct file *alloc_file_pseudo_noaccount(struct inode *inode,
+ struct vfsmount *mnt, const char *name,
+ int flags,
+ const struct file_operations *fops)
+{
+ int ret;
+ struct path path;
+ struct file *file;
+
+ ret = alloc_path_pseudo(name, inode, mnt, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ file = alloc_empty_file_noaccount(flags, current_cred());
+ if (IS_ERR(file)) {
+ ihold(inode);
+ path_put(&path);
+ return file;
+ }
+ file_init_path(file, &path, fops);
+ return file;
+}
+EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);
+
struct file *alloc_file_clone(struct file *base, int flags,
const struct file_operations *fops)
{
@@ -367,7 +413,7 @@ static void __fput(struct file *file)
eventpoll_release(file);
locks_remove_file(file);
- ima_file_free(file);
+ security_file_release(file);
if (unlikely(file->f_flags & FASYNC)) {
if (file->f_op->fasync)
file->f_op->fasync(-1, file, 0);
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index e6e2a2185e7c..42e03b6b1cc7 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -307,7 +307,7 @@ vxfs_init(void)
vxfs_inode_cachep = kmem_cache_create_usercopy("vxfs_inode",
sizeof(struct vxfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT,
offsetof(struct vxfs_inode_info, vii_immed.vi_immed),
sizeof_field(struct vxfs_inode_info,
vii_immed.vi_immed),
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d84fcc471c6..e4f17c53ddfc 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -141,6 +141,31 @@ static void wb_wakeup(struct bdi_writeback *wb)
spin_unlock_irq(&wb->work_lock);
}
+/*
+ * This function is used when the first inode for this wb is marked dirty. It
+ * wakes-up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * start only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
+ * by delaying the wake-up.
+ *
+ * We have to be careful not to postpone flush work if it is scheduled for
+ * earlier. Thus we use queue_delayed_work().
+ */
+static void wb_wakeup_delayed(struct bdi_writeback *wb)
+{
+ unsigned long timeout;
+
+ timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+ spin_lock_irq(&wb->work_lock);
+ if (test_bit(WB_registered, &wb->state))
+ queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+ spin_unlock_irq(&wb->work_lock);
+}
+
static void finish_writeback_work(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index edb3712dcfa5..a4d6ca0b8971 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -83,8 +83,8 @@ static const struct fs_parameter_spec *fs_lookup_key(
}
/*
- * fs_parse - Parse a filesystem configuration parameter
- * @fc: The filesystem context to log errors through.
+ * __fs_parse - Parse a filesystem configuration parameter
+ * @log: The filesystem context to log errors through.
* @desc: The parameter description to use.
* @param: The parameter.
* @result: Where to place the result of the parse
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 038ed0b9aaa5..8674dbfbe59d 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -52,3 +52,14 @@ config FUSE_DAX
If you want to allow mounting a Virtio Filesystem with the "dax"
option, answer Y.
+
+config FUSE_PASSTHROUGH
+ bool "FUSE passthrough operations support"
+ default y
+ depends on FUSE_FS
+ select FS_STACK
+ help
+ This allows bypassing FUSE server by mapping specific FUSE operations
+ to be performed directly on a backing file.
+
+ If you want to allow passthrough operations, answer Y.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 0c48b35c058d..6e0228c6d0cb 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -8,6 +8,8 @@ obj-$(CONFIG_CUSE) += cuse.o
obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
+fuse-y += iomode.o
fuse-$(CONFIG_FUSE_DAX) += dax.o
+fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 284a35006462..97ac994ff78f 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -174,11 +174,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
if (!fc)
goto out;
- down_read(&fc->killsb);
- spin_lock(&fc->bg_lock);
- fc->congestion_threshold = val;
- spin_unlock(&fc->bg_lock);
- up_read(&fc->killsb);
+ WRITE_ONCE(fc->congestion_threshold, val);
fuse_conn_put(fc);
out:
return ret;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1a8f82f478cb..3ec8bb5e68ff 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1775,6 +1775,61 @@ copy_finish:
return err;
}
+/*
+ * Resending all processing queue requests.
+ *
+ * When a FUSE daemon panics and fails over, it is possible for some inflight
+ * requests to be lost and never returned. As a result, applications awaiting
+ * replies would become stuck forever. To address this, we can use notification
+ * to trigger resending of these pending requests to the FUSE daemon, ensuring
+ * they are properly processed again.
+ *
+ * Please note that this strategy is applicable only to idempotent requests or
+ * if the FUSE daemon takes careful measures to avoid processing duplicated
+ * non-idempotent requests.
+ */
+static void fuse_resend(struct fuse_conn *fc)
+{
+ struct fuse_dev *fud;
+ struct fuse_req *req, *next;
+ struct fuse_iqueue *fiq = &fc->iq;
+ LIST_HEAD(to_queue);
+ unsigned int i;
+
+ spin_lock(&fc->lock);
+ if (!fc->connected) {
+ spin_unlock(&fc->lock);
+ return;
+ }
+
+ list_for_each_entry(fud, &fc->devices, entry) {
+ struct fuse_pqueue *fpq = &fud->pq;
+
+ spin_lock(&fpq->lock);
+ for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+ list_splice_tail_init(&fpq->processing[i], &to_queue);
+ spin_unlock(&fpq->lock);
+ }
+ spin_unlock(&fc->lock);
+
+ list_for_each_entry_safe(req, next, &to_queue, list) {
+ __set_bit(FR_PENDING, &req->flags);
+ /* mark the request as resend request */
+ req->in.h.unique |= FUSE_UNIQUE_RESEND;
+ }
+
+ spin_lock(&fiq->lock);
+ /* iq and pq requests are both oldest to newest */
+ list_splice(&to_queue, &fiq->pending);
+ fiq->ops->wake_pending_and_unlock(fiq);
+}
+
+static int fuse_notify_resend(struct fuse_conn *fc)
+{
+ fuse_resend(fc);
+ return 0;
+}
+
static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
unsigned int size, struct fuse_copy_state *cs)
{
@@ -1800,6 +1855,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
case FUSE_NOTIFY_DELETE:
return fuse_notify_delete(fc, size, cs);
+ case FUSE_NOTIFY_RESEND:
+ return fuse_notify_resend(fc);
+
default:
fuse_copy_finish(cs);
return -EINVAL;
@@ -2251,43 +2309,91 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
return 0;
}
-static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
+static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
{
int res;
int oldfd;
struct fuse_dev *fud = NULL;
struct fd f;
+ if (get_user(oldfd, argp))
+ return -EFAULT;
+
+ f = fdget(oldfd);
+ if (!f.file)
+ return -EINVAL;
+
+ /*
+ * Check against file->f_op because CUSE
+ * uses the same ioctl handler.
+ */
+ if (f.file->f_op == file->f_op)
+ fud = fuse_get_dev(f.file);
+
+ res = -EINVAL;
+ if (fud) {
+ mutex_lock(&fuse_mutex);
+ res = fuse_device_clone(fud->fc, file);
+ mutex_unlock(&fuse_mutex);
+ }
+
+ fdput(f);
+ return res;
+}
+
+static long fuse_dev_ioctl_backing_open(struct file *file,
+ struct fuse_backing_map __user *argp)
+{
+ struct fuse_dev *fud = fuse_get_dev(file);
+ struct fuse_backing_map map;
+
+ if (!fud)
+ return -EPERM;
+
+ if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&map, argp, sizeof(map)))
+ return -EFAULT;
+
+ return fuse_backing_open(fud->fc, &map);
+}
+
+static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp)
+{
+ struct fuse_dev *fud = fuse_get_dev(file);
+ int backing_id;
+
+ if (!fud)
+ return -EPERM;
+
+ if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ return -EOPNOTSUPP;
+
+ if (get_user(backing_id, argp))
+ return -EFAULT;
+
+ return fuse_backing_close(fud->fc, backing_id);
+}
+
+static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+
switch (cmd) {
case FUSE_DEV_IOC_CLONE:
- if (get_user(oldfd, (__u32 __user *)arg))
- return -EFAULT;
+ return fuse_dev_ioctl_clone(file, argp);
- f = fdget(oldfd);
- if (!f.file)
- return -EINVAL;
+ case FUSE_DEV_IOC_BACKING_OPEN:
+ return fuse_dev_ioctl_backing_open(file, argp);
+
+ case FUSE_DEV_IOC_BACKING_CLOSE:
+ return fuse_dev_ioctl_backing_close(file, argp);
- /*
- * Check against file->f_op because CUSE
- * uses the same ioctl handler.
- */
- if (f.file->f_op == file->f_op)
- fud = fuse_get_dev(f.file);
-
- res = -EINVAL;
- if (fud) {
- mutex_lock(&fuse_mutex);
- res = fuse_device_clone(fud->fc, file);
- mutex_unlock(&fuse_mutex);
- }
- fdput(f);
- break;
default:
- res = -ENOTTY;
- break;
+ return -ENOTTY;
}
- return res;
}
const struct file_operations fuse_dev_operations = {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d19cbf34c634..4a6df591add6 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -391,6 +391,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
err = -EIO;
if (fuse_invalid_attr(&outarg->attr))
goto out_put_forget;
+ if (outarg->nodeid == FUSE_ROOT_ID && outarg->generation != 0) {
+ pr_warn_once("root generation should be zero\n");
+ outarg->generation = 0;
+ }
*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
&outarg->attr, ATTR_TIMEOUT(outarg),
@@ -615,7 +619,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
FUSE_ARGS(args);
struct fuse_forget_link *forget;
struct fuse_create_in inarg;
- struct fuse_open_out outopen;
+ struct fuse_open_out *outopenp;
struct fuse_entry_out outentry;
struct fuse_inode *fi;
struct fuse_file *ff;
@@ -630,7 +634,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
goto out_err;
err = -ENOMEM;
- ff = fuse_file_alloc(fm);
+ ff = fuse_file_alloc(fm, true);
if (!ff)
goto out_put_forget_req;
@@ -659,8 +663,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
args.out_numargs = 2;
args.out_args[0].size = sizeof(outentry);
args.out_args[0].value = &outentry;
- args.out_args[1].size = sizeof(outopen);
- args.out_args[1].value = &outopen;
+ /* Store outarg for fuse_finish_open() */
+ outopenp = &ff->args->open_outarg;
+ args.out_args[1].size = sizeof(*outopenp);
+ args.out_args[1].value = outopenp;
err = get_create_ext(&args, dir, entry, mode);
if (err)
@@ -676,9 +682,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
fuse_invalid_attr(&outentry.attr))
goto out_free_ff;
- ff->fh = outopen.fh;
+ ff->fh = outopenp->fh;
ff->nodeid = outentry.nodeid;
- ff->open_flags = outopen.open_flags;
+ ff->open_flags = outopenp->open_flags;
inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
&outentry.attr, ATTR_TIMEOUT(&outentry), 0);
if (!inode) {
@@ -692,13 +698,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
d_instantiate(entry, inode);
fuse_change_entry_timeout(entry, &outentry);
fuse_dir_changed(dir);
- err = finish_open(file, entry, generic_file_open);
+ err = generic_file_open(inode, file);
+ if (!err) {
+ file->private_data = ff;
+ err = finish_open(file, entry, fuse_finish_open);
+ }
if (err) {
fi = get_fuse_inode(inode);
fuse_sync_release(fi, ff, flags);
} else {
- file->private_data = ff;
- fuse_finish_open(inode, file);
if (fm->fc->atomic_o_trunc && trunc)
truncate_pagecache(inode, 0);
else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
@@ -1210,7 +1218,7 @@ static int fuse_do_statx(struct inode *inode, struct file *file,
if (((sx->mask & STATX_SIZE) && !fuse_valid_size(sx->size)) ||
((sx->mask & STATX_TYPE) && (!fuse_valid_type(sx->mode) ||
inode_wrong_type(inode, sx->mode)))) {
- make_bad_inode(inode);
+ fuse_make_bad(inode);
return -EIO;
}
@@ -1485,7 +1493,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask)
*
* 1) Local access checking ('default_permissions' mount option) based
* on file mode. This is the plain old disk filesystem permission
- * modell.
+ * model.
*
* 2) "Remote" access checking, where server is responsible for
* checking permission in each inode operation. An exception to this
@@ -1630,7 +1638,30 @@ out_err:
static int fuse_dir_open(struct inode *inode, struct file *file)
{
- return fuse_open_common(inode, file, true);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ int err;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ err = generic_file_open(inode, file);
+ if (err)
+ return err;
+
+ err = fuse_do_open(fm, get_node_id(inode), file, true);
+ if (!err) {
+ struct fuse_file *ff = file->private_data;
+
+ /*
+ * Keep handling FOPEN_STREAM and FOPEN_NONSEEKABLE for
+ * directories for backward compatibility, though it's unlikely
+ * to be useful.
+ */
+ if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE))
+ nonseekable_open(inode, file);
+ }
+
+ return err;
}
static int fuse_dir_release(struct inode *inode, struct file *file)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 148a71b8b4d0..a56e7bffd000 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -20,6 +20,7 @@
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/splice.h>
+#include <linux/task_io_accounting_ops.h>
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, int opcode,
@@ -50,13 +51,7 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
return fuse_simple_request(fm, &args);
}
-struct fuse_release_args {
- struct fuse_args args;
- struct fuse_release_in inarg;
- struct inode *inode;
-};
-
-struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release)
{
struct fuse_file *ff;
@@ -65,15 +60,15 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
return NULL;
ff->fm = fm;
- ff->release_args = kzalloc(sizeof(*ff->release_args),
- GFP_KERNEL_ACCOUNT);
- if (!ff->release_args) {
- kfree(ff);
- return NULL;
+ if (release) {
+ ff->args = kzalloc(sizeof(*ff->args), GFP_KERNEL_ACCOUNT);
+ if (!ff->args) {
+ kfree(ff);
+ return NULL;
+ }
}
INIT_LIST_HEAD(&ff->write_entry);
- mutex_init(&ff->readdir.lock);
refcount_set(&ff->count, 1);
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
@@ -85,8 +80,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
void fuse_file_free(struct fuse_file *ff)
{
- kfree(ff->release_args);
- mutex_destroy(&ff->readdir.lock);
+ kfree(ff->args);
kfree(ff);
}
@@ -105,14 +99,17 @@ static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
kfree(ra);
}
-static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
+static void fuse_file_put(struct fuse_file *ff, bool sync)
{
if (refcount_dec_and_test(&ff->count)) {
- struct fuse_args *args = &ff->release_args->args;
+ struct fuse_release_args *ra = &ff->args->release_args;
+ struct fuse_args *args = (ra ? &ra->args : NULL);
- if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) {
- /* Do nothing when client does not implement 'open' */
- fuse_release_end(ff->fm, args, 0);
+ if (ra && ra->inode)
+ fuse_file_io_release(ff, ra->inode);
+
+ if (!args) {
+ /* Do nothing when server does not implement 'open' */
} else if (sync) {
fuse_simple_request(ff->fm, args);
fuse_release_end(ff->fm, args, 0);
@@ -132,27 +129,31 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
struct fuse_conn *fc = fm->fc;
struct fuse_file *ff;
int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
+ bool open = isdir ? !fc->no_opendir : !fc->no_open;
- ff = fuse_file_alloc(fm);
+ ff = fuse_file_alloc(fm, open);
if (!ff)
return ERR_PTR(-ENOMEM);
ff->fh = 0;
/* Default for no-open */
ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
- if (isdir ? !fc->no_opendir : !fc->no_open) {
- struct fuse_open_out outarg;
+ if (open) {
+ /* Store outarg for fuse_finish_open() */
+ struct fuse_open_out *outargp = &ff->args->open_outarg;
int err;
- err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg);
+ err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp);
if (!err) {
- ff->fh = outarg.fh;
- ff->open_flags = outarg.open_flags;
-
+ ff->fh = outargp->fh;
+ ff->open_flags = outargp->open_flags;
} else if (err != -ENOSYS) {
fuse_file_free(ff);
return ERR_PTR(err);
} else {
+ /* No release needed */
+ kfree(ff->args);
+ ff->args = NULL;
if (isdir)
fc->no_opendir = 1;
else
@@ -195,40 +196,50 @@ static void fuse_link_write_file(struct file *file)
spin_unlock(&fi->lock);
}
-void fuse_finish_open(struct inode *inode, struct file *file)
+int fuse_finish_open(struct inode *inode, struct file *file)
{
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = get_fuse_conn(inode);
+ int err;
+
+ err = fuse_file_io_open(file, inode);
+ if (err)
+ return err;
if (ff->open_flags & FOPEN_STREAM)
stream_open(inode, file);
else if (ff->open_flags & FOPEN_NONSEEKABLE)
nonseekable_open(inode, file);
- if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fc->attr_version);
- i_size_write(inode, 0);
- spin_unlock(&fi->lock);
- file_update_time(file);
- fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
- }
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
+
+ return 0;
}
-int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
+static void fuse_truncate_update_attr(struct inode *inode, struct file *file)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ i_size_write(inode, 0);
+ spin_unlock(&fi->lock);
+ file_update_time(file);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+}
+
+static int fuse_open(struct inode *inode, struct file *file)
{
struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = fm->fc;
+ struct fuse_file *ff;
int err;
- bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
- fc->atomic_o_trunc &&
- fc->writeback_cache;
- bool dax_truncate = (file->f_flags & O_TRUNC) &&
- fc->atomic_o_trunc && FUSE_IS_DAX(inode);
+ bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc;
+ bool is_wb_truncate = is_truncate && fc->writeback_cache;
+ bool dax_truncate = is_truncate && FUSE_IS_DAX(inode);
if (fuse_is_bad(inode))
return -EIO;
@@ -250,16 +261,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
if (is_wb_truncate || dax_truncate)
fuse_set_nowrite(inode);
- err = fuse_do_open(fm, get_node_id(inode), file, isdir);
- if (!err)
- fuse_finish_open(inode, file);
+ err = fuse_do_open(fm, get_node_id(inode), file, false);
+ if (!err) {
+ ff = file->private_data;
+ err = fuse_finish_open(inode, file);
+ if (err)
+ fuse_sync_release(fi, ff, file->f_flags);
+ else if (is_truncate)
+ fuse_truncate_update_attr(inode, file);
+ }
if (is_wb_truncate || dax_truncate)
fuse_release_nowrite(inode);
if (!err) {
- struct fuse_file *ff = file->private_data;
-
- if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC))
+ if (is_truncate)
truncate_pagecache(inode, 0);
else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
invalidate_inode_pages2(inode->i_mapping);
@@ -274,10 +289,13 @@ out_inode_unlock:
}
static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
- unsigned int flags, int opcode)
+ unsigned int flags, int opcode, bool sync)
{
struct fuse_conn *fc = ff->fm->fc;
- struct fuse_release_args *ra = ff->release_args;
+ struct fuse_release_args *ra = &ff->args->release_args;
+
+ if (fuse_file_passthrough(ff))
+ fuse_passthrough_release(ff, fuse_inode_backing(fi));
/* Inode is NULL on error path of fuse_create_open() */
if (likely(fi)) {
@@ -292,6 +310,11 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
wake_up_interruptible_all(&ff->poll_wait);
+ if (!ra)
+ return;
+
+ /* ff->args was used for open outarg */
+ memset(ff->args, 0, sizeof(*ff->args));
ra->inarg.fh = ff->fh;
ra->inarg.flags = flags;
ra->args.in_numargs = 1;
@@ -301,23 +324,28 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
ra->args.nodeid = ff->nodeid;
ra->args.force = true;
ra->args.nocreds = true;
+
+ /*
+ * Hold inode until release is finished.
+ * From fuse_sync_release() the refcount is 1 and everything's
+ * synchronous, so we are fine with not doing igrab() here.
+ */
+ ra->inode = sync ? NULL : igrab(&fi->inode);
}
void fuse_file_release(struct inode *inode, struct fuse_file *ff,
unsigned int open_flags, fl_owner_t id, bool isdir)
{
struct fuse_inode *fi = get_fuse_inode(inode);
- struct fuse_release_args *ra = ff->release_args;
+ struct fuse_release_args *ra = &ff->args->release_args;
int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
- fuse_prepare_release(fi, ff, open_flags, opcode);
+ fuse_prepare_release(fi, ff, open_flags, opcode, false);
- if (ff->flock) {
+ if (ra && ff->flock) {
ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
}
- /* Hold inode until release is finished */
- ra->inode = igrab(inode);
/*
* Normally this will send the RELEASE request, however if
@@ -328,7 +356,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
* synchronous RELEASE is allowed (and desirable) in this case
* because the server can be trusted not to screw up.
*/
- fuse_file_put(ff, ff->fm->fc->destroy, isdir);
+ fuse_file_put(ff, ff->fm->fc->destroy);
}
void fuse_release_common(struct file *file, bool isdir)
@@ -337,11 +365,6 @@ void fuse_release_common(struct file *file, bool isdir)
(fl_owner_t) file, isdir);
}
-static int fuse_open(struct inode *inode, struct file *file)
-{
- return fuse_open_common(inode, file, false);
-}
-
static int fuse_release(struct inode *inode, struct file *file)
{
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -363,12 +386,8 @@ void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
unsigned int flags)
{
WARN_ON(refcount_read(&ff->count) > 1);
- fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
- /*
- * iput(NULL) is a no-op and since the refcount is 1 and everything's
- * synchronous, we are fine with not doing igrab() here"
- */
- fuse_file_put(ff, true, false);
+ fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true);
+ fuse_file_put(ff, true);
}
EXPORT_SYMBOL_GPL(fuse_sync_release);
@@ -634,7 +653,8 @@ static void fuse_release_user_pages(struct fuse_args_pages *ap,
for (i = 0; i < ap->num_pages; i++) {
if (should_dirty)
set_page_dirty_lock(ap->pages[i]);
- put_page(ap->pages[i]);
+ if (ap->args.is_pinned)
+ unpin_user_page(ap->pages[i]);
}
}
@@ -925,7 +945,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
put_page(page);
}
if (ia->ff)
- fuse_file_put(ia->ff, false, false);
+ fuse_file_put(ia->ff, false);
fuse_io_free(ia);
}
@@ -1299,13 +1319,93 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
return res;
}
+static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
+}
+
+/*
+ * @return true if an exclusive lock for direct IO writes is needed
+ */
+static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /* Server side has to advise that it supports parallel dio writes. */
+ if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES))
+ return true;
+
+ /*
+ * Append will need to know the eventual EOF - always needs an
+ * exclusive lock.
+ */
+ if (iocb->ki_flags & IOCB_APPEND)
+ return true;
+
+ /* Inode is in caching io mode: a shared lock is not allowed, go exclusive */
+ if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state))
+ return true;
+
+ /* Parallel dio beyond EOF is not supported, at least for now. */
+ if (fuse_io_past_eof(iocb, from))
+ return true;
+
+ return false;
+}
+
+static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
+ bool *exclusive)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct fuse_file *ff = iocb->ki_filp->private_data;
+
+ *exclusive = fuse_dio_wr_exclusive_lock(iocb, from);
+ if (*exclusive) {
+ inode_lock(inode);
+ } else {
+ inode_lock_shared(inode);
+ /*
+ * New parallel dio allowed only if inode is not in caching
+ * mode and denies new opens in caching mode. This check
+ * should be performed only after taking shared inode lock.
+ * Previous past eof check was without inode lock and might
+ * have raced, so check it again.
+ */
+ if (fuse_io_past_eof(iocb, from) ||
+ fuse_file_uncached_io_start(inode, ff, NULL) != 0) {
+ inode_unlock_shared(inode);
+ inode_lock(inode);
+ *exclusive = true;
+ }
+ }
+}
+
+static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct fuse_file *ff = iocb->ki_filp->private_data;
+
+ if (exclusive) {
+ inode_unlock(inode);
+ } else {
+ /* Allow opens in caching mode after last parallel dio end */
+ fuse_file_uncached_io_end(inode, ff);
+ inode_unlock_shared(inode);
+ }
+}
+
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
ssize_t written = 0;
struct inode *inode = mapping->host;
- ssize_t err;
+ ssize_t err, count;
struct fuse_conn *fc = get_fuse_conn(inode);
if (fc->writeback_cache) {
@@ -1327,10 +1427,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
writethrough:
inode_lock(inode);
- err = generic_write_checks(iocb, from);
+ err = count = generic_write_checks(iocb, from);
if (err <= 0)
goto out;
+ task_io_account_write(count);
+
err = file_remove_privs(file);
if (err)
goto out;
@@ -1392,10 +1494,13 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
while (nbytes < *nbytesp && ap->num_pages < max_pages) {
unsigned npages;
size_t start;
- ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages],
- *nbytesp - nbytes,
- max_pages - ap->num_pages,
- &start);
+ struct page **pt_pages;
+
+ pt_pages = &ap->pages[ap->num_pages];
+ ret = iov_iter_extract_pages(ii, &pt_pages,
+ *nbytesp - nbytes,
+ max_pages - ap->num_pages,
+ 0, &start);
if (ret < 0)
break;
@@ -1412,6 +1517,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
}
+ ap->args.is_pinned = iov_iter_extract_will_pin(ii);
ap->args.user_pages = true;
if (write)
ap->args.in_pages = true;
@@ -1558,51 +1664,17 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
return res;
}
-static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
- struct iov_iter *iter)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
-
- return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
-}
-
static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
- struct file *file = iocb->ki_filp;
- struct fuse_file *ff = file->private_data;
struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
ssize_t res;
- bool exclusive_lock =
- !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) ||
- get_fuse_conn(inode)->direct_io_allow_mmap ||
- iocb->ki_flags & IOCB_APPEND ||
- fuse_direct_write_extending_i_size(iocb, from);
-
- /*
- * Take exclusive lock if
- * - Parallel direct writes are disabled - a user space decision
- * - Parallel direct writes are enabled and i_size is being extended.
- * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP).
- * This might not be needed at all, but needs further investigation.
- */
- if (exclusive_lock)
- inode_lock(inode);
- else {
- inode_lock_shared(inode);
-
- /* A race with truncate might have come up as the decision for
- * the lock type was done without holding the lock, check again.
- */
- if (fuse_direct_write_extending_i_size(iocb, from)) {
- inode_unlock_shared(inode);
- inode_lock(inode);
- exclusive_lock = true;
- }
- }
+ bool exclusive;
+ fuse_dio_lock(iocb, from, &exclusive);
res = generic_write_checks(iocb, from);
if (res > 0) {
+ task_io_account_write(res);
if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
res = fuse_direct_IO(iocb, from);
} else {
@@ -1611,10 +1683,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
fuse_write_update_attr(inode, iocb->ki_pos, res);
}
}
- if (exclusive_lock)
- inode_unlock(inode);
- else
- inode_unlock_shared(inode);
+ fuse_dio_unlock(iocb, exclusive);
return res;
}
@@ -1631,10 +1700,13 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (FUSE_IS_DAX(inode))
return fuse_dax_read_iter(iocb, to);
- if (!(ff->open_flags & FOPEN_DIRECT_IO))
- return fuse_cache_read_iter(iocb, to);
- else
+ /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
+ if (ff->open_flags & FOPEN_DIRECT_IO)
return fuse_direct_read_iter(iocb, to);
+ else if (fuse_file_passthrough(ff))
+ return fuse_passthrough_read_iter(iocb, to);
+ else
+ return fuse_cache_read_iter(iocb, to);
}
static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -1649,10 +1721,38 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (FUSE_IS_DAX(inode))
return fuse_dax_write_iter(iocb, from);
- if (!(ff->open_flags & FOPEN_DIRECT_IO))
+ /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
+ if (ff->open_flags & FOPEN_DIRECT_IO)
+ return fuse_direct_write_iter(iocb, from);
+ else if (fuse_file_passthrough(ff))
+ return fuse_passthrough_write_iter(iocb, from);
+ else
return fuse_cache_write_iter(iocb, from);
+}
+
+static ssize_t fuse_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct fuse_file *ff = in->private_data;
+
+ /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
+ if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
+ return fuse_passthrough_splice_read(in, ppos, pipe, len, flags);
else
- return fuse_direct_write_iter(iocb, from);
+ return filemap_splice_read(in, ppos, pipe, len, flags);
+}
+
+static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
+{
+ struct fuse_file *ff = out->private_data;
+
+ /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
+ if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
+ return fuse_passthrough_splice_write(pipe, out, ppos, len, flags);
+ else
+ return iter_file_splice_write(pipe, out, ppos, len, flags);
}
static void fuse_writepage_free(struct fuse_writepage_args *wpa)
@@ -1667,7 +1767,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
__free_page(ap->pages[i]);
if (wpa->ia.ff)
- fuse_file_put(wpa->ia.ff, false, false);
+ fuse_file_put(wpa->ia.ff, false);
kfree(ap->pages);
kfree(wpa);
@@ -1909,7 +2009,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
ff = __fuse_write_file_get(fi);
err = fuse_flush_times(inode, ff);
if (ff)
- fuse_file_put(ff, false, false);
+ fuse_file_put(ff, false);
return err;
}
@@ -1947,26 +2047,26 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
rcu_read_unlock();
}
-static int fuse_writepage_locked(struct page *page)
+static int fuse_writepage_locked(struct folio *folio)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_writepage_args *wpa;
struct fuse_args_pages *ap;
- struct page *tmp_page;
+ struct folio *tmp_folio;
int error = -ENOMEM;
- set_page_writeback(page);
+ folio_start_writeback(folio);
wpa = fuse_writepage_args_alloc();
if (!wpa)
goto err;
ap = &wpa->ia.ap;
- tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
- if (!tmp_page)
+ tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
+ if (!tmp_folio)
goto err_free;
error = -EIO;
@@ -1975,21 +2075,21 @@ static int fuse_writepage_locked(struct page *page)
goto err_nofile;
fuse_writepage_add_to_bucket(fc, wpa);
- fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
+ fuse_write_args_fill(&wpa->ia, wpa->ia.ff, folio_pos(folio), 0);
- copy_highpage(tmp_page, page);
+ folio_copy(tmp_folio, folio);
wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
wpa->next = NULL;
ap->args.in_pages = true;
ap->num_pages = 1;
- ap->pages[0] = tmp_page;
+ ap->pages[0] = &tmp_folio->page;
ap->descs[0].offset = 0;
ap->descs[0].length = PAGE_SIZE;
ap->args.end = fuse_writepage_end;
wpa->inode = inode;
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
- inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
+ node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP);
spin_lock(&fi->lock);
tree_insert(&fi->writepages, wpa);
@@ -1997,48 +2097,20 @@ static int fuse_writepage_locked(struct page *page)
fuse_flush_writepages(inode);
spin_unlock(&fi->lock);
- end_page_writeback(page);
+ folio_end_writeback(folio);
return 0;
err_nofile:
- __free_page(tmp_page);
+ folio_put(tmp_folio);
err_free:
kfree(wpa);
err:
- mapping_set_error(page->mapping, error);
- end_page_writeback(page);
+ mapping_set_error(folio->mapping, error);
+ folio_end_writeback(folio);
return error;
}
-static int fuse_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct fuse_conn *fc = get_fuse_conn(page->mapping->host);
- int err;
-
- if (fuse_page_is_writeback(page->mapping->host, page->index)) {
- /*
- * ->writepages() should be called for sync() and friends. We
- * should only get here on direct reclaim and then we are
- * allowed to skip a page which is already in flight
- */
- WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
-
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
- }
-
- if (wbc->sync_mode == WB_SYNC_NONE &&
- fc->num_background >= fc->congestion_threshold)
- return AOP_WRITEPAGE_ACTIVATE;
-
- err = fuse_writepage_locked(page);
- unlock_page(page);
-
- return err;
-}
-
struct fuse_fill_wb_data {
struct fuse_writepage_args *wpa;
struct fuse_file *ff;
@@ -2307,7 +2379,7 @@ static int fuse_writepages(struct address_space *mapping,
fuse_writepages_send(&data);
}
if (data.ff)
- fuse_file_put(data.ff, false, false);
+ fuse_file_put(data.ff, false);
kfree(data.orig_pages);
out:
@@ -2401,7 +2473,7 @@ static int fuse_launder_folio(struct folio *folio)
/* Serialize with pending writeback for the same page */
fuse_wait_on_page_writeback(inode, folio->index);
- err = fuse_writepage_locked(&folio->page);
+ err = fuse_writepage_locked(folio);
if (!err)
fuse_wait_on_page_writeback(inode, folio->index);
}
@@ -2462,13 +2534,30 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fm->fc;
+ struct inode *inode = file_inode(file);
+ int rc;
/* DAX mmap is superior to direct_io mmap */
- if (FUSE_IS_DAX(file_inode(file)))
+ if (FUSE_IS_DAX(inode))
return fuse_dax_mmap(file, vma);
+ /*
+ * If inode is in passthrough io mode, because it has some file open
+ * in passthrough mode, either mmap to backing file or fail mmap,
+ * because mixing cached mmap and passthrough io mode is not allowed.
+ */
+ if (fuse_file_passthrough(ff))
+ return fuse_passthrough_mmap(file, vma);
+ else if (fuse_inode_backing(get_fuse_inode(inode)))
+ return -ENODEV;
+
+ /*
+ * FOPEN_DIRECT_IO handling is special compared to O_DIRECT,
+ * as it does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP.
+ */
if (ff->open_flags & FOPEN_DIRECT_IO) {
- /* Can't provide the coherency needed for MAP_SHARED
+ /*
+ * Can't provide the coherency needed for MAP_SHARED
* if FUSE_DIRECT_IO_ALLOW_MMAP isn't set.
*/
if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap)
@@ -2476,7 +2565,19 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
invalidate_inode_pages2(file->f_mapping);
- return generic_file_mmap(file, vma);
+ if (!(vma->vm_flags & VM_MAYSHARE)) {
+ /* MAP_PRIVATE */
+ return generic_file_mmap(file, vma);
+ }
+
+ /*
+ * First mmap of direct_io file enters caching inode io mode.
+ * Also waits for parallel dio writers to go into serial mode
+ * (exclusive instead of shared lock).
+ */
+ rc = fuse_file_cached_io_start(inode, ff);
+ if (rc)
+ return rc;
}
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
@@ -2509,14 +2610,14 @@ static int convert_fuse_file_lock(struct fuse_conn *fc,
* translate it into the caller's pid namespace.
*/
rcu_read_lock();
- fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
+ fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
rcu_read_unlock();
break;
default:
return -EIO;
}
- fl->fl_type = ffl->type;
+ fl->c.flc_type = ffl->type;
return 0;
}
@@ -2530,10 +2631,10 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file,
memset(inarg, 0, sizeof(*inarg));
inarg->fh = ff->fh;
- inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
+ inarg->owner = fuse_lock_owner_id(fc, fl->c.flc_owner);
inarg->lk.start = fl->fl_start;
inarg->lk.end = fl->fl_end;
- inarg->lk.type = fl->fl_type;
+ inarg->lk.type = fl->c.flc_type;
inarg->lk.pid = pid;
if (flock)
inarg->lk_flags |= FUSE_LK_FLOCK;
@@ -2570,8 +2671,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_lk_in inarg;
- int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
- struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
+ int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
+ struct pid *pid = fl->c.flc_type != F_UNLCK ? task_tgid(current) : NULL;
pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
int err;
@@ -2580,10 +2681,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
return -ENOLCK;
}
- /* Unlock on close is handled by the flush method */
- if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
- return 0;
-
fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
err = fuse_simple_request(fm, &args);
@@ -3213,8 +3310,8 @@ static const struct file_operations fuse_file_operations = {
.lock = fuse_file_lock,
.get_unmapped_area = thp_get_unmapped_area,
.flock = fuse_file_flock,
- .splice_read = filemap_splice_read,
- .splice_write = iter_file_splice_write,
+ .splice_read = fuse_splice_read,
+ .splice_write = fuse_splice_write,
.unlocked_ioctl = fuse_file_ioctl,
.compat_ioctl = fuse_file_compat_ioctl,
.poll = fuse_file_poll,
@@ -3225,10 +3322,10 @@ static const struct file_operations fuse_file_operations = {
static const struct address_space_operations fuse_file_aops = {
.read_folio = fuse_read_folio,
.readahead = fuse_readahead,
- .writepage = fuse_writepage,
.writepages = fuse_writepages,
.launder_folio = fuse_launder_folio,
.dirty_folio = filemap_dirty_folio,
+ .migrate_folio = filemap_migrate_folio,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
.write_begin = fuse_write_begin,
@@ -3245,7 +3342,9 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
INIT_LIST_HEAD(&fi->write_files);
INIT_LIST_HEAD(&fi->queued_writes);
fi->writectr = 0;
+ fi->iocachectr = 0;
init_waitqueue_head(&fi->page_waitq);
+ init_waitqueue_head(&fi->direct_io_waitq);
fi->writepages = RB_ROOT;
if (IS_ENABLED(CONFIG_FUSE_DAX))
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bcbe34488862..b24084b60864 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -76,6 +76,16 @@ struct fuse_submount_lookup {
struct fuse_forget_link *forget;
};
+/** Container for data related to mapping to backing file */
+struct fuse_backing {
+ struct file *file;
+ struct cred *cred;
+
+ /** refcount */
+ refcount_t count;
+ struct rcu_head rcu;
+};
+
/** FUSE inode */
struct fuse_inode {
/** Inode data */
@@ -111,7 +121,7 @@ struct fuse_inode {
u64 attr_version;
union {
- /* Write related fields (regular file only) */
+ /* read/write io cache (regular file only) */
struct {
/* Files usable in writepage. Protected by fi->lock */
struct list_head write_files;
@@ -123,9 +133,15 @@ struct fuse_inode {
* (FUSE_NOWRITE) means more writes are blocked */
int writectr;
+ /** Number of files/maps using page cache (negative: files in uncached io) */
+ int iocachectr;
+
/* Waitq for writepage completion */
wait_queue_head_t page_waitq;
+ /* waitq for direct-io completion */
+ wait_queue_head_t direct_io_waitq;
+
/* List of writepage requestst (pending or sent) */
struct rb_root writepages;
};
@@ -173,6 +189,10 @@ struct fuse_inode {
#endif
/** Submount specific lookup tracking */
struct fuse_submount_lookup *submount_lookup;
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ /** Reference to backing file in passthrough mode */
+ struct fuse_backing *fb;
+#endif
};
/** FUSE inode state bits */
@@ -187,19 +207,21 @@ enum {
FUSE_I_BAD,
/* Has btime */
FUSE_I_BTIME,
+ /* Wants or already has page cache IO */
+ FUSE_I_CACHE_IO_MODE,
};
struct fuse_conn;
struct fuse_mount;
-struct fuse_release_args;
+union fuse_file_args;
/** FUSE specific file data */
struct fuse_file {
/** Fuse connection for this file */
struct fuse_mount *fm;
- /* Argument space reserved for release */
- struct fuse_release_args *release_args;
+ /* Argument space reserved for open/release */
+ union fuse_file_args *args;
/** Kernel file handle guaranteed to be unique */
u64 kh;
@@ -221,12 +243,6 @@ struct fuse_file {
/* Readdir related */
struct {
- /*
- * Protects below fields against (crazy) parallel readdir on
- * same open file. Uncontended in the normal case.
- */
- struct mutex lock;
-
/* Dir stream position */
loff_t pos;
@@ -244,6 +260,15 @@ struct fuse_file {
/** Wait queue head for poll */
wait_queue_head_t poll_wait;
+ /** Does file hold a fi->iocachectr refcount? */
+ enum { IOM_NONE, IOM_CACHED, IOM_UNCACHED } iomode;
+
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ /** Reference to backing file in passthrough mode */
+ struct file *passthrough;
+ const struct cred *cred;
+#endif
+
/** Has flock been performed on this file? */
bool flock:1;
};
@@ -283,6 +308,7 @@ struct fuse_args {
bool page_replace:1;
bool may_block:1;
bool is_ext:1;
+ bool is_pinned:1;
struct fuse_in_arg in_args[3];
struct fuse_arg out_args[2];
void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
@@ -295,6 +321,19 @@ struct fuse_args_pages {
unsigned int num_pages;
};
+struct fuse_release_args {
+ struct fuse_args args;
+ struct fuse_release_in inarg;
+ struct inode *inode;
+};
+
+union fuse_file_args {
+ /* Used during open() */
+ struct fuse_open_out open_outarg;
+ /* Used during release() */
+ struct fuse_release_args release_args;
+};
+
#define FUSE_ARGS(args) struct fuse_args args = {}
/** The request IO state (for asynchronous processing) */
@@ -818,6 +857,12 @@ struct fuse_conn {
/* Is statx not implemented by fs? */
unsigned int no_statx:1;
+ /** Passthrough support for read/write IO */
+ unsigned int passthrough:1;
+
+ /** Maximum stack depth for passthrough backing files */
+ int max_stack_depth;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -867,6 +912,11 @@ struct fuse_conn {
/* New writepages go into this bucket */
struct fuse_sync_bucket __rcu *curr_bucket;
+
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ /** IDR for backing files ids */
+ struct idr backing_files_map;
+#endif
};
/*
@@ -940,7 +990,6 @@ static inline bool fuse_stale_inode(const struct inode *inode, int generation,
static inline void fuse_make_bad(struct inode *inode)
{
- remove_inode_hash(inode);
set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state);
}
@@ -1032,14 +1081,9 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
size_t count, int opcode);
-/**
- * Send OPEN or OPENDIR request
- */
-int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
-
-struct fuse_file *fuse_file_alloc(struct fuse_mount *fm);
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release);
void fuse_file_free(struct fuse_file *ff);
-void fuse_finish_open(struct inode *inode, struct file *file);
+int fuse_finish_open(struct inode *inode, struct file *file);
void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
unsigned int flags);
@@ -1349,11 +1393,82 @@ int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int fuse_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa);
-/* file.c */
+/* iomode.c */
+int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff);
+int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb);
+void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff);
+
+int fuse_file_io_open(struct file *file, struct inode *inode);
+void fuse_file_io_release(struct fuse_file *ff, struct inode *inode);
+/* file.c */
struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, bool isdir);
void fuse_file_release(struct inode *inode, struct fuse_file *ff,
unsigned int open_flags, fl_owner_t id, bool isdir);
+/* passthrough.c */
+static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ return READ_ONCE(fi->fb);
+#else
+ return NULL;
+#endif
+}
+
+static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi,
+ struct fuse_backing *fb)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ return xchg(&fi->fb, fb);
+#else
+ return NULL;
+#endif
+}
+
+#ifdef CONFIG_FUSE_PASSTHROUGH
+struct fuse_backing *fuse_backing_get(struct fuse_backing *fb);
+void fuse_backing_put(struct fuse_backing *fb);
+#else
+
+static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
+{
+ return NULL;
+}
+
+static inline void fuse_backing_put(struct fuse_backing *fb)
+{
+}
+#endif
+
+void fuse_backing_files_init(struct fuse_conn *fc);
+void fuse_backing_files_free(struct fuse_conn *fc);
+int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map);
+int fuse_backing_close(struct fuse_conn *fc, int backing_id);
+
+struct fuse_backing *fuse_passthrough_open(struct file *file,
+ struct inode *inode,
+ int backing_id);
+void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb);
+
+static inline struct file *fuse_file_passthrough(struct fuse_file *ff)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ return ff->passthrough;
+#else
+ return NULL;
+#endif
+}
+
+ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *iter);
+ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags);
+ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
+ struct file *out, loff_t *ppos,
+ size_t len, unsigned int flags);
+ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma);
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 516ea2979a90..3a5d88878335 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -111,6 +111,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi))
goto out_free_forget;
+ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ fuse_inode_backing_set(fi, NULL);
+
return &fi->inode;
out_free_forget:
@@ -129,6 +132,9 @@ static void fuse_free_inode(struct inode *inode)
#ifdef CONFIG_FUSE_DAX
kfree(fi->dax);
#endif
+ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ fuse_backing_put(fuse_inode_backing(fi));
+
kmem_cache_free(fuse_inode_cachep, fi);
}
@@ -469,8 +475,11 @@ retry:
} else if (fuse_stale_inode(inode, generation, attr)) {
/* nodeid was reused, any I/O on the old inode should fail */
fuse_make_bad(inode);
- iput(inode);
- goto retry;
+ if (inode != d_inode(sb->s_root)) {
+ remove_inode_hash(inode);
+ iput(inode);
+ goto retry;
+ }
}
fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
@@ -924,6 +933,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
fc->max_pages_limit = FUSE_MAX_MAX_PAGES;
+ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ fuse_backing_files_init(fc);
+
INIT_LIST_HEAD(&fc->mounts);
list_add(&fm->fc_entry, &fc->mounts);
fm->fc = fc;
@@ -954,6 +966,8 @@ void fuse_conn_put(struct fuse_conn *fc)
WARN_ON(atomic_read(&bucket->count) != 1);
kfree(bucket);
}
+ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ fuse_backing_files_free(fc);
call_rcu(&fc->rcu, delayed_release);
}
}
@@ -974,7 +988,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
attr.mode = mode;
attr.ino = FUSE_ROOT_ID;
attr.nlink = 1;
- return fuse_iget(sb, 1, 0, &attr, 0, 0);
+ return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0);
}
struct fuse_inode_handle {
@@ -1117,6 +1131,11 @@ static struct dentry *fuse_get_parent(struct dentry *child)
return parent;
}
+/* only for fid encoding; no support for file handle */
+static const struct export_operations fuse_export_fid_operations = {
+ .encode_fh = fuse_encode_fh,
+};
+
static const struct export_operations fuse_export_operations = {
.fh_to_dentry = fuse_fh_to_dentry,
.fh_to_parent = fuse_fh_to_parent,
@@ -1291,6 +1310,26 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
fc->create_supp_group = 1;
if (flags & FUSE_DIRECT_IO_ALLOW_MMAP)
fc->direct_io_allow_mmap = 1;
+ /*
+ * max_stack_depth is the max stack depth of FUSE fs,
+ * so it has to be at least 1 to support passthrough
+ * to backing files.
+ *
+ * With max_stack_depth > 1, the backing files can be
+ * on a stacked fs (e.g. overlayfs) themselves and with
+ * max_stack_depth == 1, FUSE fs can be stacked as the
+ * underlying fs of a stacked fs (e.g. overlayfs).
+ */
+ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) &&
+ (flags & FUSE_PASSTHROUGH) &&
+ arg->max_stack_depth > 0 &&
+ arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH) {
+ fc->passthrough = 1;
+ fc->max_stack_depth = arg->max_stack_depth;
+ fm->sb->s_stack_depth = arg->max_stack_depth;
+ }
+ if (flags & FUSE_NO_EXPORT_SUPPORT)
+ fm->sb->s_export_op = &fuse_export_fid_operations;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -1337,7 +1376,8 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
- FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP;
+ FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP |
+ FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
flags |= FUSE_MAP_ALIGNMENT;
@@ -1346,6 +1386,8 @@ void fuse_send_init(struct fuse_mount *fm)
#endif
if (fm->fc->auto_submounts)
flags |= FUSE_SUBMOUNTS;
+ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+ flags |= FUSE_PASSTHROUGH;
ia->in.flags = flags;
ia->in.flags2 = flags >> 32;
@@ -1496,8 +1538,8 @@ static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
.ctimensec = ctime.tv_nsec,
.mode = fi->inode.i_mode,
.nlink = fi->inode.i_nlink,
- .uid = fi->inode.i_uid.val,
- .gid = fi->inode.i_gid.val,
+ .uid = __kuid_val(fi->inode.i_uid),
+ .gid = __kgid_val(fi->inode.i_gid),
.rdev = fi->inode.i_rdev,
.blksize = 1u << fi->inode.i_blkbits,
};
@@ -1534,6 +1576,7 @@ static int fuse_fill_super_submount(struct super_block *sb,
sb->s_bdi = bdi_get(parent_sb->s_bdi);
sb->s_xattr = parent_sb->s_xattr;
+ sb->s_export_op = parent_sb->s_export_op;
sb->s_time_gran = parent_sb->s_time_gran;
sb->s_blocksize = parent_sb->s_blocksize;
sb->s_blocksize_bits = parent_sb->s_blocksize_bits;
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
new file mode 100644
index 000000000000..c653ddcf0578
--- /dev/null
+++ b/fs/fuse/iomode.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE inode io modes.
+ *
+ * Copyright (c) 2024 CTERA Networks.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+
+/*
+ * Return true if a new open in caching mode needs to wait.
+ */
+static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi)
+{
+ return READ_ONCE(fi->iocachectr) < 0 && !fuse_inode_backing(fi);
+}
+
+/*
+ * Start cached io mode.
+ *
+ * Blocks new parallel dio writes and waits for the in-progress parallel dio
+ * writes to complete.
+ */
+int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /* There are no io modes if server does not implement open */
+ if (!ff->args)
+ return 0;
+
+ spin_lock(&fi->lock);
+ /*
+ * Setting the bit advises new direct-io writes to use an exclusive
+ * lock - without it the wait below might be forever.
+ */
+ while (fuse_is_io_cache_wait(fi)) {
+ set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+ spin_unlock(&fi->lock);
+ wait_event(fi->direct_io_waitq, !fuse_is_io_cache_wait(fi));
+ spin_lock(&fi->lock);
+ }
+
+ /*
+ * Check if inode entered passthrough io mode while waiting for parallel
+ * dio write completion.
+ */
+ if (fuse_inode_backing(fi)) {
+ clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+ spin_unlock(&fi->lock);
+ return -ETXTBSY;
+ }
+
+ WARN_ON(ff->iomode == IOM_UNCACHED);
+ if (ff->iomode == IOM_NONE) {
+ ff->iomode = IOM_CACHED;
+ if (fi->iocachectr == 0)
+ set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+ fi->iocachectr++;
+ }
+ spin_unlock(&fi->lock);
+ return 0;
+}
+
+static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ WARN_ON(fi->iocachectr <= 0);
+ WARN_ON(ff->iomode != IOM_CACHED);
+ ff->iomode = IOM_NONE;
+ fi->iocachectr--;
+ if (fi->iocachectr == 0)
+ clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+ spin_unlock(&fi->lock);
+}
+
+/* Start strictly uncached io mode where cache access is not allowed */
+int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_backing *oldfb;
+ int err = 0;
+
+ spin_lock(&fi->lock);
+ /* deny conflicting backing files on same fuse inode */
+ oldfb = fuse_inode_backing(fi);
+ if (oldfb && oldfb != fb) {
+ err = -EBUSY;
+ goto unlock;
+ }
+ if (fi->iocachectr > 0) {
+ err = -ETXTBSY;
+ goto unlock;
+ }
+ WARN_ON(ff->iomode != IOM_NONE);
+ fi->iocachectr--;
+ ff->iomode = IOM_UNCACHED;
+
+ /* fuse inode holds a single refcount of backing file */
+ if (!oldfb) {
+ oldfb = fuse_inode_backing_set(fi, fb);
+ WARN_ON_ONCE(oldfb != NULL);
+ } else {
+ fuse_backing_put(fb);
+ }
+unlock:
+ spin_unlock(&fi->lock);
+ return err;
+}
+
+void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_backing *oldfb = NULL;
+
+ spin_lock(&fi->lock);
+ WARN_ON(fi->iocachectr >= 0);
+ WARN_ON(ff->iomode != IOM_UNCACHED);
+ ff->iomode = IOM_NONE;
+ fi->iocachectr++;
+ if (!fi->iocachectr) {
+ wake_up(&fi->direct_io_waitq);
+ oldfb = fuse_inode_backing_set(fi, NULL);
+ }
+ spin_unlock(&fi->lock);
+ if (oldfb)
+ fuse_backing_put(oldfb);
+}
+
+/*
+ * Open flags that are allowed in combination with FOPEN_PASSTHROUGH.
+ * A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write
+ * operations go directly to the server, but mmap is done on the backing file.
+ * FOPEN_PASSTHROUGH mode should not co-exist with any users of the fuse inode
+ * page cache, so FOPEN_KEEP_CACHE is a strange and undesired combination.
+ */
+#define FOPEN_PASSTHROUGH_MASK \
+ (FOPEN_PASSTHROUGH | FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES | \
+ FOPEN_NOFLUSH)
+
+static int fuse_file_passthrough_open(struct inode *inode, struct file *file)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_backing *fb;
+ int err;
+
+ /* Check allowed conditions for file open in passthrough mode */
+ if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) || !fc->passthrough ||
+ (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK))
+ return -EINVAL;
+
+ fb = fuse_passthrough_open(file, inode,
+ ff->args->open_outarg.backing_id);
+ if (IS_ERR(fb))
+ return PTR_ERR(fb);
+
+ /* First passthrough file open denies caching inode io mode */
+ err = fuse_file_uncached_io_start(inode, ff, fb);
+ if (!err)
+ return 0;
+
+ fuse_passthrough_release(ff, fb);
+ fuse_backing_put(fb);
+
+ return err;
+}
+
+/* Request access to submit new io to inode via open file */
+int fuse_file_io_open(struct file *file, struct inode *inode)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ int err;
+
+ /*
+ * io modes are not relevant with DAX and with server that does not
+ * implement open.
+ */
+ if (FUSE_IS_DAX(inode) || !ff->args)
+ return 0;
+
+ /*
+ * Server is expected to use FOPEN_PASSTHROUGH for all opens of an inode
+ * which is already open for passthrough.
+ */
+ err = -EINVAL;
+ if (fuse_inode_backing(fi) && !(ff->open_flags & FOPEN_PASSTHROUGH))
+ goto fail;
+
+ /*
+ * FOPEN_PARALLEL_DIRECT_WRITES requires FOPEN_DIRECT_IO.
+ */
+ if (!(ff->open_flags & FOPEN_DIRECT_IO))
+ ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES;
+
+ /*
+ * First passthrough file open denies caching inode io mode.
+ * First caching file open enters caching inode io mode.
+ *
+ * Note that if user opens a file with O_DIRECT, but server did
+ * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT,
+ * so we put the inode in caching mode to prevent parallel dio.
+ */
+ if ((ff->open_flags & FOPEN_DIRECT_IO) &&
+ !(ff->open_flags & FOPEN_PASSTHROUGH))
+ return 0;
+
+ if (ff->open_flags & FOPEN_PASSTHROUGH)
+ err = fuse_file_passthrough_open(inode, file);
+ else
+ err = fuse_file_cached_io_start(inode, ff);
+ if (err)
+ goto fail;
+
+ return 0;
+
+fail:
+ pr_debug("failed to open file in requested io mode (open_flags=0x%x, err=%i).\n",
+ ff->open_flags, err);
+ /*
+ * The file open mode determines the inode io mode.
+ * Using incorrect open mode is a server mistake, which results in
+ * user visible failure of open() with EIO error.
+ */
+ return -EIO;
+}
+
+/* No more pending io and no new io possible to inode via open/mmapped file */
+void fuse_file_io_release(struct fuse_file *ff, struct inode *inode)
+{
+ /*
+ * Last parallel dio close allows caching inode io mode.
+ * Last caching file close exits caching inode io mode.
+ */
+ switch (ff->iomode) {
+ case IOM_NONE:
+ /* Nothing to do */
+ break;
+ case IOM_UNCACHED:
+ fuse_file_uncached_io_end(inode, ff);
+ break;
+ case IOM_CACHED:
+ fuse_file_cached_io_end(inode, ff);
+ break;
+ }
+}
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
new file mode 100644
index 000000000000..1567f0323858
--- /dev/null
+++ b/fs/fuse/passthrough.c
@@ -0,0 +1,355 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE passthrough to backing file.
+ *
+ * Copyright (c) 2023 CTERA Networks.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/file.h>
+#include <linux/backing-file.h>
+#include <linux/splice.h>
+
+/*
+ * Callback installed as backing_file_ctx.accessed by the read, splice-read
+ * and mmap paths: calls fuse_invalidate_atime() on the fuse file's inode.
+ */
+static void fuse_file_accessed(struct file *file)
+{
+	fuse_invalidate_atime(file_inode(file));
+}
+
+/*
+ * Callback installed as backing_file_ctx.end_write by the write and
+ * splice-write paths: invalidates the FUSE_STATX_MODSIZE attributes of the
+ * fuse file's inode.
+ */
+static void fuse_file_modified(struct file *file)
+{
+	fuse_invalidate_attr_mask(file_inode(file), FUSE_STATX_MODSIZE);
+}
+
+/*
+ * Serve a read on a passthrough fuse file by forwarding it to the backing
+ * file.
+ *
+ * The backing-file helper receives the credentials kept in ff->cred, the
+ * fuse file as user_file, and fuse_file_accessed() as the "accessed"
+ * callback.  A zero-length request returns 0 without calling the helper.
+ */
+ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *fuse_filp = iocb->ki_filp;
+	struct fuse_file *ff = fuse_filp->private_data;
+	struct file *backing_file = fuse_file_passthrough(ff);
+	size_t nbytes = iov_iter_count(iter);
+	struct backing_file_ctx ctx = {
+		.cred = ff->cred,
+		.user_file = fuse_filp,
+		.accessed = fuse_file_accessed,
+	};
+
+	pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__,
+		 backing_file, iocb->ki_pos, nbytes);
+
+	if (nbytes == 0)
+		return 0;
+
+	return backing_file_read_iter(backing_file, iter, iocb, iocb->ki_flags,
+				      &ctx);
+}
+
+/*
+ * Serve a write on a passthrough fuse file by forwarding it to the backing
+ * file.
+ *
+ * The write to the backing file runs with the fuse inode locked, uses the
+ * credentials kept in ff->cred, and registers fuse_file_modified() as the
+ * "end_write" callback.  A zero-length request returns 0 without taking the
+ * lock.
+ */
+ssize_t fuse_passthrough_write_iter(struct kiocb *iocb,
+				    struct iov_iter *iter)
+{
+	struct file *fuse_filp = iocb->ki_filp;
+	struct inode *fuse_inode = file_inode(fuse_filp);
+	struct fuse_file *ff = fuse_filp->private_data;
+	struct file *backing_file = fuse_file_passthrough(ff);
+	size_t nbytes = iov_iter_count(iter);
+	ssize_t written;
+	struct backing_file_ctx ctx = {
+		.cred = ff->cred,
+		.user_file = fuse_filp,
+		.end_write = fuse_file_modified,
+	};
+
+	pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__,
+		 backing_file, iocb->ki_pos, nbytes);
+
+	if (nbytes == 0)
+		return 0;
+
+	inode_lock(fuse_inode);
+	written = backing_file_write_iter(backing_file, iter, iocb,
+					  iocb->ki_flags, &ctx);
+	inode_unlock(fuse_inode);
+
+	return written;
+}
+
+/*
+ * splice_read for a passthrough fuse file: splice from the backing file into
+ * @pipe using the credentials kept in ff->cred, with fuse_file_accessed()
+ * registered as the "accessed" callback.
+ */
+ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos,
+				     struct pipe_inode_info *pipe,
+				     size_t len, unsigned int flags)
+{
+	struct fuse_file *ff = in->private_data;
+	struct file *backing_file = fuse_file_passthrough(ff);
+	ssize_t spliced;
+	struct backing_file_ctx ctx = {
+		.cred = ff->cred,
+		.user_file = in,
+		.accessed = fuse_file_accessed,
+	};
+
+	pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__,
+		 backing_file, ppos ? *ppos : 0, len, flags);
+
+	spliced = backing_file_splice_read(backing_file, ppos, pipe, len,
+					   flags, &ctx);
+	return spliced;
+}
+
+/*
+ * splice_write for a passthrough fuse file: splice from @pipe into the
+ * backing file while holding the fuse inode lock, using the credentials kept
+ * in ff->cred and fuse_file_modified() as the "end_write" callback.
+ */
+ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
+				      struct file *out, loff_t *ppos,
+				      size_t len, unsigned int flags)
+{
+	struct fuse_file *ff = out->private_data;
+	struct file *backing_file = fuse_file_passthrough(ff);
+	struct inode *fuse_inode = file_inode(out);
+	ssize_t spliced;
+	struct backing_file_ctx ctx = {
+		.cred = ff->cred,
+		.user_file = out,
+		.end_write = fuse_file_modified,
+	};
+
+	pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__,
+		 backing_file, ppos ? *ppos : 0, len, flags);
+
+	inode_lock(fuse_inode);
+	spliced = backing_file_splice_write(pipe, backing_file, ppos, len,
+					    flags, &ctx);
+	inode_unlock(fuse_inode);
+
+	return spliced;
+}
+
+/*
+ * mmap for a passthrough fuse file: hand @vma to backing_file_mmap() for the
+ * backing file, using the credentials kept in ff->cred and
+ * fuse_file_accessed() as the "accessed" callback.
+ */
+ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct fuse_file *ff = file->private_data;
+	struct file *backing_file = fuse_file_passthrough(ff);
+	ssize_t rc;
+	struct backing_file_ctx ctx = {
+		.cred = ff->cred,
+		.user_file = file,
+		.accessed = fuse_file_accessed,
+	};
+
+	pr_debug("%s: backing_file=0x%p, start=%lu, end=%lu\n", __func__,
+		 backing_file, vma->vm_start, vma->vm_end);
+
+	rc = backing_file_mmap(backing_file, vma, &ctx);
+	return rc;
+}
+
+/*
+ * Take a reference on @fb.
+ *
+ * Returns @fb, or NULL when @fb is NULL or its refcount has already dropped
+ * to zero.
+ */
+struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
+{
+	if (!fb || !refcount_inc_not_zero(&fb->count))
+		return NULL;
+
+	return fb;
+}
+
+/*
+ * Release everything @fb holds: the backing file reference (if any) and the
+ * credentials, then free @fb itself via kfree_rcu().
+ */
+static void fuse_backing_free(struct fuse_backing *fb)
+{
+	struct file *backing = fb->file;
+
+	pr_debug("%s: fb=0x%p\n", __func__, fb);
+
+	if (backing)
+		fput(backing);
+	put_cred(fb->cred);
+	kfree_rcu(fb, rcu);
+}
+
+/* Drop a reference on @fb (NULL is accepted); the last put frees it. */
+void fuse_backing_put(struct fuse_backing *fb)
+{
+	if (!fb)
+		return;
+
+	if (refcount_dec_and_test(&fb->count))
+		fuse_backing_free(fb);
+}
+
+/* Initialize the per-connection idr that maps backing ids to fuse_backing. */
+void fuse_backing_files_init(struct fuse_conn *fc)
+{
+ idr_init(&fc->backing_files_map);
+}
+
+/*
+ * Allocate a backing id for @fb in fc->backing_files_map.
+ *
+ * Returns the value of idr_alloc_cyclic(): the new id (allocation starts at
+ * 1), or a negative errno on failure.
+ */
+static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb)
+{
+ int id;
+
+ /*
+ * Preload with GFP_KERNEL before taking the spinlock, so the allocation
+ * done under fc->lock can use GFP_ATOMIC.
+ */
+ idr_preload(GFP_KERNEL);
+ spin_lock(&fc->lock);
+ /* FIXME: xarray might be space inefficient */
+ id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC);
+ spin_unlock(&fc->lock);
+ idr_preload_end();
+
+ /* Ids are requested from 1 upwards; 0 is never expected here. */
+ WARN_ON_ONCE(id == 0);
+ return id;
+}
+
+/*
+ * Remove @id from fc->backing_files_map under fc->lock.
+ *
+ * Returns whatever idr_remove() returns for @id; the caller treats NULL as
+ * "no such id".  The reference count of the returned fb is not touched here.
+ */
+static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc,
+ int id)
+{
+ struct fuse_backing *fb;
+
+ spin_lock(&fc->lock);
+ fb = idr_remove(&fc->backing_files_map, id);
+ spin_unlock(&fc->lock);
+
+ return fb;
+}
+
+/*
+ * idr_for_each() callback used by fuse_backing_files_free(): free the
+ * fuse_backing stored under @id.  @data is unused.
+ *
+ * Always returns 0.
+ */
+static int fuse_backing_id_free(int id, void *p, void *data)
+{
+ struct fuse_backing *fb = p;
+
+ /* Warn if anything other than the idr still holds a reference. */
+ WARN_ON_ONCE(refcount_read(&fb->count) != 1);
+ fuse_backing_free(fb);
+ return 0;
+}
+
+/*
+ * Free every fuse_backing still registered in fc->backing_files_map, then
+ * destroy the idr itself.
+ */
+void fuse_backing_files_free(struct fuse_conn *fc)
+{
+ idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL);
+ idr_destroy(&fc->backing_files_map);
+}
+
+/*
+ * Register the file behind map->fd as a backing file of connection @fc.
+ *
+ * Fails with:
+ *   -EPERM       passthrough is not enabled on @fc or the caller lacks
+ *                CAP_SYS_ADMIN
+ *   -EINVAL      map->flags is non-zero
+ *   -EBADF       map->fd does not refer to an open file
+ *   -EOPNOTSUPP  the file lacks read_iter or write_iter
+ *   -ELOOP       the backing superblock's stack depth is not below
+ *                fc->max_stack_depth
+ *   -ENOMEM      the fuse_backing or its credentials cannot be allocated
+ * or with the error returned by fuse_backing_id_alloc().
+ *
+ * On success the new fuse_backing owns the file reference and the prepared
+ * credentials, and the backing id allocated for it is returned.  On failure
+ * the file reference is dropped and nothing stays allocated.
+ */
+int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
+{
+	struct file *file;
+	struct super_block *backing_sb;
+	struct fuse_backing *fb = NULL;
+	int res;
+
+	pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags);
+
+	/* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
+	res = -EPERM;
+	if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
+		goto out;
+
+	res = -EINVAL;
+	if (map->flags)
+		goto out;
+
+	file = fget(map->fd);
+	res = -EBADF;
+	if (!file)
+		goto out;
+
+	res = -EOPNOTSUPP;
+	if (!file->f_op->read_iter || !file->f_op->write_iter)
+		goto out_fput;
+
+	backing_sb = file_inode(file)->i_sb;
+	res = -ELOOP;
+	if (backing_sb->s_stack_depth >= fc->max_stack_depth)
+		goto out_fput;
+
+	fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL);
+	res = -ENOMEM;
+	if (!fb)
+		goto out_fput;
+
+	/*
+	 * prepare_creds() may fail.  A NULL cred must never be published in
+	 * the idr, because fuse_passthrough_open() later hands fb->cred to
+	 * backing_file_open().  res is still -ENOMEM at this point.
+	 */
+	fb->cred = prepare_creds();
+	if (!fb->cred) {
+		kfree(fb);
+		fb = NULL;
+		goto out_fput;
+	}
+
+	fb->file = file;
+	refcount_set(&fb->count, 1);
+
+	res = fuse_backing_id_alloc(fc, fb);
+	if (res < 0) {
+		/* fuse_backing_free() also drops the file reference. */
+		fuse_backing_free(fb);
+		fb = NULL;
+	}
+
+out:
+	pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res);
+
+	return res;
+
+out_fput:
+	fput(file);
+	goto out;
+}
+
+/*
+ * Unregister the backing file known as @backing_id on connection @fc and
+ * drop the reference the id map held on it.
+ *
+ * Returns 0 on success, -EPERM when passthrough is disabled or the caller
+ * lacks CAP_SYS_ADMIN, -EINVAL for a non-positive id, and -ENOENT when the
+ * id is not registered.
+ */
+int fuse_backing_close(struct fuse_conn *fc, int backing_id)
+{
+	struct fuse_backing *fb = NULL;
+	int err = 0;
+
+	pr_debug("%s: backing_id=%d\n", __func__, backing_id);
+
+	/* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
+	if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	if (backing_id <= 0) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	fb = fuse_backing_id_remove(fc, backing_id);
+	if (!fb) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	fuse_backing_put(fb);
+out:
+	pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err);
+
+	return err;
+}
+
+/*
+ * Set up passthrough of @file to the backing file registered as @backing_id.
+ *
+ * On success ff->passthrough and ff->cred are filled in and the fb object is
+ * returned with an elevated refcount, to be stored in the fuse inode.
+ * On failure an ERR_PTR() is returned: -EINVAL for a non-positive id, -ENOENT
+ * when no live fb is registered under the id, or the error reported by
+ * backing_file_open().
+ */
+struct fuse_backing *fuse_passthrough_open(struct file *file,
+					   struct inode *inode,
+					   int backing_id)
+{
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fm->fc;
+	struct fuse_backing *fb = NULL;
+	struct file *backing_file;
+	int err = 0;
+
+	if (backing_id <= 0) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Look the id up and pin the fb within one RCU read-side section. */
+	rcu_read_lock();
+	fb = fuse_backing_get(idr_find(&fc->backing_files_map, backing_id));
+	rcu_read_unlock();
+
+	if (!fb) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	/* Allocate backing file per fuse file to store fuse path */
+	backing_file = backing_file_open(&file->f_path, file->f_flags,
+					 &fb->file->f_path, fb->cred);
+	if (IS_ERR(backing_file)) {
+		err = PTR_ERR(backing_file);
+		fuse_backing_put(fb);
+		goto out;
+	}
+
+	ff->passthrough = backing_file;
+	ff->cred = get_cred(fb->cred);
+out:
+	pr_debug("%s: backing_id=%d, fb=0x%p, backing_file=0x%p, err=%i\n", __func__,
+		 backing_id, fb, ff->passthrough, err);
+
+	if (err)
+		return ERR_PTR(err);
+	return fb;
+}
+
+/*
+ * Undo the per-file part of fuse_passthrough_open(): drop the backing file
+ * and the credentials stored in @ff and clear both pointers.
+ *
+ * @fb is only used for the debug message; its reference is not dropped here.
+ */
+void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb)
+{
+ pr_debug("%s: fb=0x%p, backing_file=0x%p\n", __func__,
+ fb, ff->passthrough);
+
+ fput(ff->passthrough);
+ ff->passthrough = NULL;
+ put_cred(ff->cred);
+ ff->cred = NULL;
+}
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index c66a54d6c7d3..0377b6dc24c8 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -592,15 +592,11 @@ int fuse_readdir(struct file *file, struct dir_context *ctx)
if (fuse_is_bad(inode))
return -EIO;
- mutex_lock(&ff->readdir.lock);
-
err = UNCACHED;
if (ff->open_flags & FOPEN_CACHE_DIR)
err = fuse_readdir_cached(file, ctx);
if (err == UNCACHED)
err = fuse_readdir_uncached(file, ctx);
- mutex_unlock(&ff->readdir.lock);
-
return err;
}
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 5f1be1da92ce..322af827a232 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -16,6 +16,7 @@
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/highmem.h>
+#include <linux/cleanup.h>
#include <linux/uio.h>
#include "fuse_i.h"
@@ -31,6 +32,9 @@
static DEFINE_MUTEX(virtio_fs_mutex);
static LIST_HEAD(virtio_fs_instances);
+/* The /sys/fs/virtiofs/ kset */
+static struct kset *virtio_fs_kset;
+
enum {
VQ_HIPRIO,
VQ_REQUEST
@@ -55,7 +59,7 @@ struct virtio_fs_vq {
/* A virtio-fs device instance */
struct virtio_fs {
- struct kref refcount;
+ struct kobject kobj;
struct list_head list; /* on virtio_fs_instances */
char *tag;
struct virtio_fs_vq *vqs;
@@ -161,18 +165,40 @@ static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq)
complete(&fsvq->in_flight_zero);
}
-static void release_virtio_fs_obj(struct kref *ref)
+/*
+ * sysfs "tag" attribute: emit the tag of this virtio-fs instance followed by
+ * a newline.
+ *
+ * The tag is read from the device, so it is passed as a "%s" argument and
+ * never used as the format string itself.
+ */
+static ssize_t tag_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj);
+
+	return sysfs_emit(buf, "%s\n", fs->tag);
+}
+
+/* Read-only "tag" attribute, served by tag_show(). */
+static struct kobj_attribute virtio_fs_tag_attr = __ATTR_RO(tag);
+
+/* NULL-terminated attribute list; virtio_fs_groups is generated from it. */
+static struct attribute *virtio_fs_attrs[] = {
+ &virtio_fs_tag_attr.attr,
+ NULL
+};
+ATTRIBUTE_GROUPS(virtio_fs);
+
+static void virtio_fs_ktype_release(struct kobject *kobj)
{
- struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount);
+ struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj);
kfree(vfs->vqs);
kfree(vfs);
}
+/*
+ * kobject type of struct virtio_fs: virtio_fs_ktype_release() is the release
+ * handler, and the attributes in virtio_fs_groups are the default sysfs
+ * files of every instance.
+ */
+static const struct kobj_type virtio_fs_ktype = {
+ .release = virtio_fs_ktype_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = virtio_fs_groups,
+};
+
/* Make sure virtiofs_mutex is held */
static void virtio_fs_put(struct virtio_fs *fs)
{
- kref_put(&fs->refcount, release_virtio_fs_obj);
+ kobject_put(&fs->kobj);
}
static void virtio_fs_fiq_release(struct fuse_iqueue *fiq)
@@ -243,25 +269,46 @@ static void virtio_fs_start_all_queues(struct virtio_fs *fs)
}
/* Add a new instance to the list or return -EEXIST if tag name exists*/
-static int virtio_fs_add_instance(struct virtio_fs *fs)
+static int virtio_fs_add_instance(struct virtio_device *vdev,
+ struct virtio_fs *fs)
{
struct virtio_fs *fs2;
- bool duplicate = false;
+ int ret;
mutex_lock(&virtio_fs_mutex);
list_for_each_entry(fs2, &virtio_fs_instances, list) {
- if (strcmp(fs->tag, fs2->tag) == 0)
- duplicate = true;
+ if (strcmp(fs->tag, fs2->tag) == 0) {
+ mutex_unlock(&virtio_fs_mutex);
+ return -EEXIST;
+ }
}
- if (!duplicate)
- list_add_tail(&fs->list, &virtio_fs_instances);
+ /* Use the virtio_device's index as a unique identifier, there is no
+ * need to allocate our own identifiers because the virtio_fs instance
+ * is only visible to userspace as long as the underlying virtio_device
+ * exists.
+ */
+ fs->kobj.kset = virtio_fs_kset;
+ ret = kobject_add(&fs->kobj, NULL, "%d", vdev->index);
+ if (ret < 0) {
+ mutex_unlock(&virtio_fs_mutex);
+ return ret;
+ }
+
+ ret = sysfs_create_link(&fs->kobj, &vdev->dev.kobj, "device");
+ if (ret < 0) {
+ kobject_del(&fs->kobj);
+ mutex_unlock(&virtio_fs_mutex);
+ return ret;
+ }
+
+ list_add_tail(&fs->list, &virtio_fs_instances);
mutex_unlock(&virtio_fs_mutex);
- if (duplicate)
- return -EEXIST;
+ kobject_uevent(&fs->kobj, KOBJ_ADD);
+
return 0;
}
@@ -274,7 +321,7 @@ static struct virtio_fs *virtio_fs_find_instance(const char *tag)
list_for_each_entry(fs, &virtio_fs_instances, list) {
if (strcmp(fs->tag, tag) == 0) {
- kref_get(&fs->refcount);
+ kobject_get(&fs->kobj);
goto found;
}
}
@@ -323,6 +370,16 @@ static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs)
return -ENOMEM;
memcpy(fs->tag, tag_buf, len);
fs->tag[len] = '\0';
+
+ /* While the VIRTIO specification allows any character, newlines are
+ * awkward on mount(8) command-lines and cause problems in the sysfs
+ * "tag" attr and uevent TAG= properties. Forbid them.
+ */
+ if (strchr(fs->tag, '\n')) {
+ dev_dbg(&vdev->dev, "refusing virtiofs tag with newline character\n");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -345,7 +402,7 @@ static void virtio_fs_hiprio_done_work(struct work_struct *work)
kfree(req);
dec_in_flight_req(fsvq);
}
- } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+ } while (!virtqueue_enable_cb(vq));
spin_unlock(&fsvq->lock);
}
@@ -627,7 +684,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
list_move_tail(&req->list, &reqs);
spin_unlock(&fpq->lock);
}
- } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+ } while (!virtqueue_enable_cb(vq));
spin_unlock(&fsvq->lock);
/* End requests */
@@ -795,8 +852,11 @@ static void virtio_fs_cleanup_dax(void *data)
put_dax(dax_dev);
}
+DEFINE_FREE(cleanup_dax, struct dax_dev *, if (!IS_ERR_OR_NULL(_T)) virtio_fs_cleanup_dax(_T))
+
static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
{
+ struct dax_device *dax_dev __free(cleanup_dax) = NULL;
struct virtio_shm_region cache_reg;
struct dev_pagemap *pgmap;
bool have_cache;
@@ -804,6 +864,12 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
if (!IS_ENABLED(CONFIG_FUSE_DAX))
return 0;
+ dax_dev = alloc_dax(fs, &virtio_fs_dax_ops);
+ if (IS_ERR(dax_dev)) {
+ int rc = PTR_ERR(dax_dev);
+ return rc == -EOPNOTSUPP ? 0 : rc;
+ }
+
/* Get cache region */
have_cache = virtio_get_shm_region(vdev, &cache_reg,
(u8)VIRTIO_FS_SHMCAP_ID_CACHE);
@@ -849,10 +915,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n",
__func__, fs->window_kaddr, cache_reg.addr, cache_reg.len);
- fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops);
- if (IS_ERR(fs->dax_dev))
- return PTR_ERR(fs->dax_dev);
-
+ fs->dax_dev = no_free_ptr(dax_dev);
return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax,
fs->dax_dev);
}
@@ -865,7 +928,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
fs = kzalloc(sizeof(*fs), GFP_KERNEL);
if (!fs)
return -ENOMEM;
- kref_init(&fs->refcount);
+ kobject_init(&fs->kobj, &virtio_fs_ktype);
vdev->priv = fs;
ret = virtio_fs_read_tag(vdev, fs);
@@ -887,7 +950,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
*/
virtio_device_ready(vdev);
- ret = virtio_fs_add_instance(fs);
+ ret = virtio_fs_add_instance(vdev, fs);
if (ret < 0)
goto out_vqs;
@@ -896,11 +959,10 @@ static int virtio_fs_probe(struct virtio_device *vdev)
out_vqs:
virtio_reset_device(vdev);
virtio_fs_cleanup_vqs(vdev);
- kfree(fs->vqs);
out:
vdev->priv = NULL;
- kfree(fs);
+ kobject_put(&fs->kobj);
return ret;
}
@@ -924,6 +986,8 @@ static void virtio_fs_remove(struct virtio_device *vdev)
mutex_lock(&virtio_fs_mutex);
/* This device is going away. No one should get new reference */
list_del_init(&fs->list);
+ sysfs_remove_link(&fs->kobj, "device");
+ kobject_del(&fs->kobj);
virtio_fs_stop_all_queues(fs);
virtio_fs_drain_all_queues_locked(fs);
virtio_reset_device(vdev);
@@ -1510,21 +1574,56 @@ static struct file_system_type virtio_fs_type = {
.kill_sb = virtio_kill_sb,
};
+/*
+ * kset uevent callback: add a TAG=<tag> variable to the environment of a
+ * uevent for a virtio_fs kobject.
+ *
+ * NOTE(review): the return value of add_uevent_var() is discarded, so this
+ * reports success even if the variable could not be added - confirm that
+ * this best-effort behaviour is intended.
+ */
+static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
+{
+ const struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj);
+
+ add_uevent_var(env, "TAG=%s", fs->tag);
+ return 0;
+}
+
+/* uevent operations passed to kset_create_and_add() for the virtiofs kset. */
+static const struct kset_uevent_ops virtio_fs_uevent_ops = {
+ .uevent = virtio_fs_uevent,
+};
+
+/*
+ * Create the "virtiofs" kset below fs_kobj.
+ *
+ * Returns 0 on success or -ENOMEM when the kset cannot be created.
+ */
+static int __init virtio_fs_sysfs_init(void)
+{
+	virtio_fs_kset = kset_create_and_add("virtiofs", &virtio_fs_uevent_ops,
+					     fs_kobj);
+
+	return virtio_fs_kset ? 0 : -ENOMEM;
+}
+
+/*
+ * Unregister the kset created by virtio_fs_sysfs_init() and clear the global
+ * pointer.  Called from both the init error path and module exit.
+ */
+static void virtio_fs_sysfs_exit(void)
+{
+ kset_unregister(virtio_fs_kset);
+ virtio_fs_kset = NULL;
+}
+
static int __init virtio_fs_init(void)
{
int ret;
- ret = register_virtio_driver(&virtio_fs_driver);
+ ret = virtio_fs_sysfs_init();
if (ret < 0)
return ret;
+ ret = register_virtio_driver(&virtio_fs_driver);
+ if (ret < 0)
+ goto sysfs_exit;
+
ret = register_filesystem(&virtio_fs_type);
- if (ret < 0) {
- unregister_virtio_driver(&virtio_fs_driver);
- return ret;
- }
+ if (ret < 0)
+ goto unregister_virtio_driver;
return 0;
+
+unregister_virtio_driver:
+ unregister_virtio_driver(&virtio_fs_driver);
+sysfs_exit:
+ virtio_fs_sysfs_exit();
+ return ret;
}
module_init(virtio_fs_init);
@@ -1532,6 +1631,7 @@ static void __exit virtio_fs_exit(void)
{
unregister_filesystem(&virtio_fs_type);
unregister_virtio_driver(&virtio_fs_driver);
+ virtio_fs_sysfs_exit();
}
module_exit(virtio_fs_exit);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index d9ccfd27e4f1..aa1626955b2c 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1718,7 +1718,8 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
struct buffer_head *dibh, *bh;
struct gfs2_holder rd_gh;
unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
- u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
+ unsigned int bsize = 1 << bsize_shift;
+ u64 lblock = (offset + bsize - 1) >> bsize_shift;
__u16 start_list[GFS2_MAX_META_HEIGHT];
__u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
unsigned int start_aligned, end_aligned;
@@ -1729,7 +1730,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
u64 prev_bnr = 0;
__be64 *start, *end;
- if (offset >= maxsize) {
+ if (offset + bsize - 1 >= maxsize) {
/*
* The starting point lies beyond the allocated metadata;
* there are no blocks to deallocate.
@@ -2465,7 +2466,7 @@ out:
}
static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
- loff_t offset)
+ loff_t offset, unsigned int len)
{
int ret;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 992ca4effb50..4c42ada60ae7 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1440,10 +1440,10 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- if (!(fl->fl_flags & FL_POSIX))
+ if (!(fl->c.flc_flags & FL_POSIX))
return -ENOLCK;
if (gfs2_withdrawing_or_withdrawn(sdp)) {
- if (fl->fl_type == F_UNLCK)
+ if (lock_is_unlock(fl))
locks_lock_file_wait(file, fl);
return -EIO;
}
@@ -1451,7 +1451,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl);
else if (IS_GETLK(cmd))
return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
- else if (fl->fl_type == F_UNLCK)
+ else if (lock_is_unlock(fl))
return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
else
return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
@@ -1483,7 +1483,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
int error = 0;
int sleeptime;
- state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
+ state = lock_is_write(fl) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
flags = GL_EXACT | GL_NOPID;
if (!IS_SETLKW(cmd))
flags |= LM_FLAG_TRY_1CB;
@@ -1495,8 +1495,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
if (fl_gh->gh_state == state)
goto out;
locks_init_lock(&request);
- request.fl_type = F_UNLCK;
- request.fl_flags = FL_FLOCK;
+ request.c.flc_type = F_UNLCK;
+ request.c.flc_flags = FL_FLOCK;
locks_lock_file_wait(file, &request);
gfs2_glock_dq(fl_gh);
gfs2_holder_reinit(state, flags, fl_gh);
@@ -1557,10 +1557,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
{
- if (!(fl->fl_flags & FL_FLOCK))
+ if (!(fl->c.flc_flags & FL_FLOCK))
return -ENOLCK;
- if (fl->fl_type == F_UNLCK) {
+ if (lock_is_unlock(fl)) {
do_unflock(file, fl);
return 0;
} else {
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 79be0cdc730c..04cadc02e5a6 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -111,7 +111,6 @@ static int __init init_gfs2_fs(void)
gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
sizeof(struct gfs2_inode),
0, SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|
SLAB_ACCOUNT,
gfs2_init_inode_once);
if (!gfs2_inode_cachep)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1281e60be639..572d58e86296 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -214,7 +214,7 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf)
memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
- memcpy(&s->s_uuid, str->sb_uuid, 16);
+ super_set_uuid(s, str->sb_uuid, 16);
}
/**
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index b0cb70400996..ce9346099c72 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -30,7 +30,7 @@ struct hfsplus_wd {
* @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes
* @buf: buffer for I/O
* @data: output pointer for location of requested data
- * @opf: request op flags
+ * @opf: I/O operation type and flags
*
* The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than
* HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 6b0ba3c1efba..314834a078e9 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -255,7 +255,7 @@ static int init_inodecache(void)
hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache",
sizeof(struct hpfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
init_once);
if (hpfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d746866ae3b6..6502c7e776d1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -933,7 +933,7 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap,
unsigned int ia_valid = attr->ia_valid;
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
- error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
+ error = setattr_prepare(idmap, dentry, attr);
if (error)
return error;
@@ -950,7 +950,7 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap,
hugetlb_vmtruncate(inode, newsize);
}
- setattr_copy(&nop_mnt_idmap, inode, attr);
+ setattr_copy(idmap, inode, attr);
mark_inode_dirty(inode);
return 0;
}
@@ -985,6 +985,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
static struct inode *hugetlbfs_get_inode(struct super_block *sb,
+ struct mnt_idmap *idmap,
struct inode *dir,
umode_t mode, dev_t dev)
{
@@ -1006,7 +1007,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
inode->i_ino = get_next_ino();
- inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
+ inode_init_owner(idmap, inode, dir, mode);
lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
@@ -1050,7 +1051,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
{
struct inode *inode;
- inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
+ inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev);
if (!inode)
return -ENOSPC;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
@@ -1062,7 +1063,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
- int retval = hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry,
+ int retval = hugetlbfs_mknod(idmap, dir, dentry,
mode | S_IFDIR, 0);
if (!retval)
inc_nlink(dir);
@@ -1073,7 +1074,7 @@ static int hugetlbfs_create(struct mnt_idmap *idmap,
struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl)
{
- return hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0);
+ return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
}
static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
@@ -1082,7 +1083,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
{
struct inode *inode;
- inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
+ inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0);
if (!inode)
return -ENOSPC;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
@@ -1094,10 +1095,11 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap,
struct inode *dir, struct dentry *dentry,
const char *symname)
{
+ const umode_t mode = S_IFLNK|S_IRWXUGO;
struct inode *inode;
int error = -ENOSPC;
- inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
+ inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0);
if (inode) {
int l = strlen(symname)+1;
error = page_symlink(inode, symname, l);
@@ -1566,6 +1568,7 @@ static struct file_system_type hugetlbfs_fs_type = {
.init_fs_context = hugetlbfs_init_fs_context,
.parameters = hugetlb_fs_parameters,
.kill_sb = kill_litter_super,
+ .fs_flags = FS_ALLOW_IDMAP,
};
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
@@ -1619,7 +1622,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
}
file = ERR_PTR(-ENOSPC);
- inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0);
+ /* hugetlbfs_vfsmount[] mounts do not use idmapped mounts. */
+ inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL,
+ S_IFREG | S_IRWXUGO, 0);
if (!inode)
goto out;
if (creat_flags == HUGETLB_SHMFS_INODE)
diff --git a/fs/inode.c b/fs/inode.c
index 91048c4c9c9e..3a41f83a4ba5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -20,6 +20,7 @@
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
+#include <linux/rw_hint.h>
#include <trace/events/writeback.h>
#include "internal.h"
@@ -588,7 +589,8 @@ void dump_mapping(const struct address_space *mapping)
}
dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
- if (get_kernel_nofault(dentry, dentry_ptr)) {
+ if (get_kernel_nofault(dentry, dentry_ptr) ||
+ !dentry.d_parent || !dentry.d_name.name) {
pr_warn("aops:%ps ino:%lx invalid dentry:%px\n",
a_ops, ino, dentry_ptr);
return;
@@ -2031,7 +2033,7 @@ static int __remove_privs(struct mnt_idmap *idmap,
return notify_change(idmap, dentry, &newattrs, NULL);
}
-static int __file_remove_privs(struct file *file, unsigned int flags)
+int file_remove_privs_flags(struct file *file, unsigned int flags)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file);
@@ -2056,6 +2058,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags)
inode_has_no_xattr(inode);
return error;
}
+EXPORT_SYMBOL_GPL(file_remove_privs_flags);
/**
* file_remove_privs - remove special file privileges (suid, capabilities)
@@ -2068,7 +2071,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags)
*/
int file_remove_privs(struct file *file)
{
- return __file_remove_privs(file, 0);
+ return file_remove_privs_flags(file, 0);
}
EXPORT_SYMBOL(file_remove_privs);
@@ -2161,7 +2164,7 @@ static int file_modified_flags(struct file *file, int flags)
* Clear the security bits if the process is not being run by root.
* This keeps people from modifying setuid and setgid binaries.
*/
- ret = __file_remove_privs(file, flags);
+ ret = file_remove_privs_flags(file, flags);
if (ret)
return ret;
@@ -2285,7 +2288,7 @@ void __init inode_init(void)
sizeof(struct inode),
0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
init_once);
/* Hash may have been set up in inode_init_early */
@@ -2509,7 +2512,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode)
{
struct timespec64 now = current_time(inode);
- inode_set_ctime(inode, now.tv_sec, now.tv_nsec);
+ inode_set_ctime_to_ts(inode, now);
return now;
}
EXPORT_SYMBOL(inode_set_ctime_current);
diff --git a/fs/internal.h b/fs/internal.h
index b67406435fc0..7ca738904e34 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -183,6 +183,7 @@ extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);
+long do_ftruncate(struct file *file, loff_t length, int small);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
@@ -310,3 +311,10 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po
struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
void mnt_idmap_put(struct mnt_idmap *idmap);
+struct stashed_operations {
+ void (*put_data)(void *data);
+ int (*init_inode)(struct inode *inode, void *data);
+};
+int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
+ struct path *path);
+void stashed_dentry_prune(struct dentry *dentry);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 76cf22ac97d7..1d5abfdf0f22 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -763,6 +763,33 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp)
return err;
}
+/*
+ * FS_IOC_GETFSUUID handler: copy the superblock's UUID and its length to
+ * userspace as a struct fsuuid2.
+ *
+ * Returns -ENOIOCTLCMD when the superblock's s_uuid_len is zero, -EFAULT
+ * when the copy to @argp fails, and 0 otherwise.
+ */
+static int ioctl_getfsuuid(struct file *file, void __user *argp)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct fsuuid2 uuid_out = { .len = sb->s_uuid_len, };
+
+	if (sb->s_uuid_len == 0)
+		return -ENOIOCTLCMD;
+
+	memcpy(&uuid_out.uuid[0], &sb->s_uuid, sb->s_uuid_len);
+
+	if (copy_to_user(argp, &uuid_out, sizeof(uuid_out)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * FS_IOC_GETFSSYSFSPATH handler: report "<fs type name>/<s_sysfs_name>" and
+ * its length to userspace as a struct fs_sysfs_path.
+ *
+ * Returns -ENOIOCTLCMD when the superblock's s_sysfs_name is empty, -EFAULT
+ * when the copy to @argp fails, and 0 otherwise.
+ */
+static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct fs_sysfs_path path_out = {};
+
+	if (sb->s_sysfs_name[0] == '\0')
+		return -ENOIOCTLCMD;
+
+	path_out.len = scnprintf(path_out.name, sizeof(path_out.name), "%s/%s",
+				 sb->s_type->name, sb->s_sysfs_name);
+
+	if (copy_to_user(argp, &path_out, sizeof(path_out)))
+		return -EFAULT;
+	return 0;
+}
+
/*
* do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d.
* It's just a simple helper for sys_ioctl and compat_sys_ioctl.
@@ -845,6 +872,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
case FS_IOC_FSSETXATTR:
return ioctl_fssetxattr(filp, argp);
+ case FS_IOC_GETFSUUID:
+ return ioctl_getfsuuid(filp, argp);
+
+ case FS_IOC_GETFSSYSFSPATH:
+ return ioctl_get_fs_sysfs_path(filp, argp);
+
default:
if (S_ISREG(inode->i_mode))
return file_ioctl(filp, cmd, argp);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 093c4515b22a..4e8e41c8b3c0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
- * Copyright (C) 2016-2019 Christoph Hellwig.
+ * Copyright (C) 2016-2023 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
@@ -95,6 +95,44 @@ static inline bool ifs_block_is_dirty(struct folio *folio,
return test_bit(block + blks_per_folio, ifs->state);
}
+static unsigned ifs_find_dirty_range(struct folio *folio,
+ struct iomap_folio_state *ifs, u64 *range_start, u64 range_end)
+{
+ struct inode *inode = folio->mapping->host;
+ unsigned start_blk =
+ offset_in_folio(folio, *range_start) >> inode->i_blkbits;
+ unsigned end_blk = min_not_zero(
+ offset_in_folio(folio, range_end) >> inode->i_blkbits,
+ i_blocks_per_folio(inode, folio));
+ unsigned nblks = 1;
+
+ while (!ifs_block_is_dirty(folio, ifs, start_blk))
+ if (++start_blk == end_blk)
+ return 0;
+
+ while (start_blk + nblks < end_blk) {
+ if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks))
+ break;
+ nblks++;
+ }
+
+ *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits);
+ return nblks << inode->i_blkbits;
+}
+
+static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start,
+ u64 range_end)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ if (*range_start >= range_end)
+ return 0;
+
+ if (ifs)
+ return ifs_find_dirty_range(folio, ifs, range_start, range_end);
+ return range_end - *range_start;
+}
+
static void ifs_clear_range_dirty(struct folio *folio,
struct iomap_folio_state *ifs, size_t off, size_t len)
{
@@ -1454,15 +1492,10 @@ out_unlock:
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
- size_t len, int error)
+ size_t len)
{
struct iomap_folio_state *ifs = folio->private;
- if (error) {
- folio_set_error(folio);
- mapping_set_error(inode->i_mapping, error);
- }
-
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
@@ -1479,40 +1512,29 @@ static u32
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
{
struct inode *inode = ioend->io_inode;
- struct bio *bio = &ioend->io_inline_bio;
- struct bio *last = ioend->io_bio, *next;
- u64 start = bio->bi_iter.bi_sector;
- loff_t offset = ioend->io_offset;
- bool quiet = bio_flagged(bio, BIO_QUIET);
+ struct bio *bio = &ioend->io_bio;
+ struct folio_iter fi;
u32 folio_count = 0;
- for (bio = &ioend->io_inline_bio; bio; bio = next) {
- struct folio_iter fi;
-
- /*
- * For the last bio, bi_private points to the ioend, so we
- * need to explicitly end the iteration here.
- */
- if (bio == last)
- next = NULL;
- else
- next = bio->bi_private;
-
- /* walk all folios in bio, ending page IO on them */
- bio_for_each_folio_all(fi, bio) {
- iomap_finish_folio_write(inode, fi.folio, fi.length,
- error);
- folio_count++;
+ if (error) {
+ mapping_set_error(inode->i_mapping, error);
+ if (!bio_flagged(bio, BIO_QUIET)) {
+ pr_err_ratelimited(
+"%s: writeback error on inode %lu, offset %lld, sector %llu",
+ inode->i_sb->s_id, inode->i_ino,
+ ioend->io_offset, ioend->io_sector);
}
- bio_put(bio);
}
- /* The ioend has been freed by bio_put() */
- if (unlikely(error && !quiet)) {
- printk_ratelimited(KERN_ERR
-"%s: writeback error on inode %lu, offset %lld, sector %llu",
- inode->i_sb->s_id, inode->i_ino, offset, start);
+ /* walk all folios in bio, ending page IO on them */
+ bio_for_each_folio_all(fi, bio) {
+ if (error)
+ folio_set_error(fi.folio);
+ iomap_finish_folio_write(inode, fi.folio, fi.length);
+ folio_count++;
}
+
+ bio_put(bio); /* frees the ioend */
return folio_count;
}
@@ -1553,7 +1575,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends);
static bool
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
{
- if (ioend->io_bio->bi_status != next->io_bio->bi_status)
+ if (ioend->io_bio.bi_status != next->io_bio.bi_status)
return false;
if ((ioend->io_flags & IOMAP_F_SHARED) ^
(next->io_flags & IOMAP_F_SHARED))
@@ -1618,47 +1640,46 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends);
static void iomap_writepage_end_bio(struct bio *bio)
{
- struct iomap_ioend *ioend = bio->bi_private;
-
- iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
+ iomap_finish_ioend(iomap_ioend_from_bio(bio),
+ blk_status_to_errno(bio->bi_status));
}
/*
* Submit the final bio for an ioend.
*
* If @error is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we've marked pages for writeback
- * and unlocked them. In this situation, we need to fail the bio instead of
- * submitting it. This typically only happens on a filesystem shutdown.
+ * the submission process has failed after we've marked pages for writeback.
+ * We cannot cancel ioend directly in that case, so call the bio end I/O handler
+ * with the error status here to run the normal I/O completion handler to clear
+ * the writeback bit and let the file system process the errors.
*/
-static int
-iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
- int error)
+static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
{
- ioend->io_bio->bi_private = ioend;
- ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
+ if (!wpc->ioend)
+ return error;
+ /*
+ * Let the file systems prepare the I/O submission and hook in an I/O
+ * completion handler. This also needs to happen in case a failure
+ * happened earlier, so that the file system end I/O handler gets called
+ * to clean up.
+ */
if (wpc->ops->prepare_ioend)
- error = wpc->ops->prepare_ioend(ioend, error);
+ error = wpc->ops->prepare_ioend(wpc->ioend, error);
+
if (error) {
- /*
- * If we're failing the IO now, just mark the ioend with an
- * error and finish it. This will run IO completion immediately
- * as there is only one reference to the ioend at this point in
- * time.
- */
- ioend->io_bio->bi_status = errno_to_blk_status(error);
- bio_endio(ioend->io_bio);
- return error;
+ wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
+ bio_endio(&wpc->ioend->io_bio);
+ } else {
+ submit_bio(&wpc->ioend->io_bio);
}
- submit_bio(ioend->io_bio);
- return 0;
+ wpc->ioend = NULL;
+ return error;
}
-static struct iomap_ioend *
-iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
- loff_t offset, sector_t sector, struct writeback_control *wbc)
+static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct inode *inode, loff_t pos)
{
struct iomap_ioend *ioend;
struct bio *bio;
@@ -1666,63 +1687,42 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
REQ_OP_WRITE | wbc_to_write_flags(wbc),
GFP_NOFS, &iomap_ioend_bioset);
- bio->bi_iter.bi_sector = sector;
+ bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
+ bio->bi_end_io = iomap_writepage_end_bio;
wbc_init_bio(wbc, bio);
+ bio->bi_write_hint = inode->i_write_hint;
- ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
+ ioend = iomap_ioend_from_bio(bio);
INIT_LIST_HEAD(&ioend->io_list);
ioend->io_type = wpc->iomap.type;
ioend->io_flags = wpc->iomap.flags;
ioend->io_inode = inode;
ioend->io_size = 0;
- ioend->io_folios = 0;
- ioend->io_offset = offset;
- ioend->io_bio = bio;
- ioend->io_sector = sector;
- return ioend;
-}
-
-/*
- * Allocate a new bio, and chain the old bio to the new one.
- *
- * Note that we have to perform the chaining in this unintuitive order
- * so that the bi_private linkage is set up in the right direction for the
- * traversal in iomap_finish_ioend().
- */
-static struct bio *
-iomap_chain_bio(struct bio *prev)
-{
- struct bio *new;
-
- new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
- bio_clone_blkg_association(new, prev);
- new->bi_iter.bi_sector = bio_end_sector(prev);
+ ioend->io_offset = pos;
+ ioend->io_sector = bio->bi_iter.bi_sector;
- bio_chain(prev, new);
- bio_get(prev); /* for iomap_finish_ioend */
- submit_bio(prev);
- return new;
+ wpc->nr_folios = 0;
+ return ioend;
}
-static bool
-iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
- sector_t sector)
+static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
{
if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
(wpc->ioend->io_flags & IOMAP_F_SHARED))
return false;
if (wpc->iomap.type != wpc->ioend->io_type)
return false;
- if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
+ if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
return false;
- if (sector != bio_end_sector(wpc->ioend->io_bio))
+ if (iomap_sector(&wpc->iomap, pos) !=
+ bio_end_sector(&wpc->ioend->io_bio))
return false;
/*
* Limit ioend bio chain lengths to minimise IO completion latency. This
* also prevents long tight loops ending page writeback on all the
* folios in the ioend.
*/
- if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE)
+ if (wpc->nr_folios >= IOEND_BATCH_SIZE)
return false;
return true;
}
@@ -1730,255 +1730,238 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
/*
* Test to see if we have an existing ioend structure that we could append to
* first; otherwise finish off the current ioend and start another.
+ *
+ * If a new ioend is created and cached, the old ioend is submitted to the block
+ * layer instantly. Batching optimisations are provided by higher level block
+ * plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
*/
-static void
-iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
- struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct list_head *iolist)
+static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct folio *folio,
+ struct inode *inode, loff_t pos, unsigned len)
{
- sector_t sector = iomap_sector(&wpc->iomap, pos);
- unsigned len = i_blocksize(inode);
+ struct iomap_folio_state *ifs = folio->private;
size_t poff = offset_in_folio(folio, pos);
+ int error;
- if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) {
- if (wpc->ioend)
- list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc);
+ if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
+new_ioend:
+ error = iomap_submit_ioend(wpc, 0);
+ if (error)
+ return error;
+ wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
}
- if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) {
- wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
- bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff);
- }
+ if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
+ goto new_ioend;
if (ifs)
atomic_add(len, &ifs->write_bytes_pending);
wpc->ioend->io_size += len;
wbc_account_cgroup_owner(wbc, &folio->page, len);
+ return 0;
}
-/*
- * We implement an immediate ioend submission policy here to avoid needing to
- * chain multiple ioends and hence nest mempool allocations which can violate
- * the forward progress guarantees we need to provide. The current ioend we're
- * adding blocks to is cached in the writepage context, and if the new block
- * doesn't append to the cached ioend, it will create a new ioend and cache that
- * instead.
- *
- * If a new ioend is created and cached, the old ioend is returned and queued
- * locally for submission once the entire page is processed or an error has been
- * detected. While ioends are submitted immediately after they are completed,
- * batching optimisations are provided by higher level block plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
-static int
-iomap_writepage_map(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct inode *inode,
- struct folio *folio, u64 end_pos)
+static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct folio *folio,
+ struct inode *inode, u64 pos, unsigned dirty_len,
+ unsigned *count)
{
- struct iomap_folio_state *ifs = folio->private;
- struct iomap_ioend *ioend, *next;
- unsigned len = i_blocksize(inode);
- unsigned nblocks = i_blocks_per_folio(inode, folio);
- u64 pos = folio_pos(folio);
- int error = 0, count = 0, i;
- LIST_HEAD(submit_list);
-
- WARN_ON_ONCE(end_pos <= pos);
-
- if (!ifs && nblocks > 1) {
- ifs = ifs_alloc(inode, folio, 0);
- iomap_set_range_dirty(folio, 0, end_pos - pos);
- }
+ int error;
- WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
-
- /*
- * Walk through the folio to find areas to write back. If we
- * run off the end of the current map or find the current map
- * invalid, grab a new one.
- */
- for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
- if (ifs && !ifs_block_is_dirty(folio, ifs, i))
- continue;
+ do {
+ unsigned map_len;
- error = wpc->ops->map_blocks(wpc, inode, pos);
+ error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len);
if (error)
break;
- trace_iomap_writepage_map(inode, &wpc->iomap);
- if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
- continue;
- if (wpc->iomap.type == IOMAP_HOLE)
- continue;
- iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc,
- &submit_list);
- count++;
- }
- if (count)
- wpc->ioend->io_folios++;
+ trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap);
- WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
- WARN_ON_ONCE(!folio_test_locked(folio));
- WARN_ON_ONCE(folio_test_writeback(folio));
- WARN_ON_ONCE(folio_test_dirty(folio));
+ map_len = min_t(u64, dirty_len,
+ wpc->iomap.offset + wpc->iomap.length - pos);
+ WARN_ON_ONCE(!folio->private && map_len < dirty_len);
+
+ switch (wpc->iomap.type) {
+ case IOMAP_INLINE:
+ WARN_ON_ONCE(1);
+ error = -EIO;
+ break;
+ case IOMAP_HOLE:
+ break;
+ default:
+ error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
+ map_len);
+ if (!error)
+ (*count)++;
+ break;
+ }
+ dirty_len -= map_len;
+ pos += map_len;
+ } while (dirty_len && !error);
/*
* We cannot cancel the ioend directly here on error. We may have
* already set other pages under writeback and hence we have to run I/O
* completion to mark the error state of the pages under writeback
* appropriately.
+ *
+ * Just let the file system know what portion of the folio failed to
+ * map.
*/
- if (unlikely(error)) {
- /*
- * Let the filesystem know what portion of the current page
- * failed to map. If the page hasn't been added to ioend, it
- * won't be affected by I/O completion and we must unlock it
- * now.
- */
- if (wpc->ops->discard_folio)
- wpc->ops->discard_folio(folio, pos);
- if (!count) {
- folio_unlock(folio);
- goto done;
- }
- }
-
- /*
- * We can have dirty bits set past end of file in page_mkwrite path
- * while mapping the last partial folio. Hence it's better to clear
- * all the dirty bits in the folio here.
- */
- iomap_clear_range_dirty(folio, 0, folio_size(folio));
- folio_start_writeback(folio);
- folio_unlock(folio);
-
- /*
- * Preserve the original error if there was one; catch
- * submission errors here and propagate into subsequent ioend
- * submissions.
- */
- list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
- int error2;
-
- list_del_init(&ioend->io_list);
- error2 = iomap_submit_ioend(wpc, ioend, error);
- if (error2 && !error)
- error = error2;
- }
-
- /*
- * We can end up here with no error and nothing to write only if we race
- * with a partial page truncate on a sub-page block sized filesystem.
- */
- if (!count)
- folio_end_writeback(folio);
-done:
- mapping_set_error(inode->i_mapping, error);
+ if (error && wpc->ops->discard_folio)
+ wpc->ops->discard_folio(folio, pos);
return error;
}
/*
- * Write out a dirty page.
+ * Check interaction of the folio with the file end.
*
- * For delalloc space on the page, we need to allocate space and flush it.
- * For unwritten space on the page, we need to start the conversion to
- * regular allocated space.
+ * If the folio is entirely beyond i_size, return false. If it straddles
+ * i_size, adjust end_pos and zero all data beyond i_size.
*/
-static int iomap_do_writepage(struct folio *folio,
- struct writeback_control *wbc, void *data)
+static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
+ u64 *end_pos)
{
- struct iomap_writepage_ctx *wpc = data;
- struct inode *inode = folio->mapping->host;
- u64 end_pos, isize;
-
- trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio));
+ u64 isize = i_size_read(inode);
- /*
- * Refuse to write the folio out if we're called from reclaim context.
- *
- * This avoids stack overflows when called from deeply used stacks in
- * random callers for direct reclaim or memcg reclaim. We explicitly
- * allow reclaim from kswapd as the stack usage there is relatively low.
- *
- * This should never happen except in the case of a VM regression so
- * warn about it.
- */
- if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
- PF_MEMALLOC))
- goto redirty;
-
- /*
- * Is this folio beyond the end of the file?
- *
- * The folio index is less than the end_index, adjust the end_pos
- * to the highest offset that this folio should represent.
- * -----------------------------------------------------
- * | file mapping | <EOF> |
- * -----------------------------------------------------
- * | Page ... | Page N-2 | Page N-1 | Page N | |
- * ^--------------------------------^----------|--------
- * | desired writeback range | see else |
- * ---------------------------------^------------------|
- */
- isize = i_size_read(inode);
- end_pos = folio_pos(folio) + folio_size(folio);
- if (end_pos > isize) {
- /*
- * Check whether the page to write out is beyond or straddles
- * i_size or not.
- * -------------------------------------------------------
- * | file mapping | <EOF> |
- * -------------------------------------------------------
- * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
- * ^--------------------------------^-----------|---------
- * | | Straddles |
- * ---------------------------------^-----------|--------|
- */
+ if (*end_pos > isize) {
size_t poff = offset_in_folio(folio, isize);
pgoff_t end_index = isize >> PAGE_SHIFT;
/*
- * Skip the page if it's fully outside i_size, e.g.
- * due to a truncate operation that's in progress. We've
- * cleaned this page and truncate will finish things off for
- * us.
+ * If the folio is entirely outside of i_size, skip it.
+ *
+ * This can happen due to a truncate operation that is in
+ * progress and in that case truncate will finish it off once
+ * we've dropped the folio lock.
*
- * Note that the end_index is unsigned long. If the given
- * offset is greater than 16TB on a 32-bit system then if we
- * checked if the page is fully outside i_size with
- * "if (page->index >= end_index + 1)", "end_index + 1" would
- * overflow and evaluate to 0. Hence this page would be
+ * Note that the pgoff_t used for end_index is an unsigned long.
+ * If the given offset is greater than 16TB on a 32-bit system,
+ * then if we checked if the folio is fully outside i_size with
+ * "if (folio->index >= end_index + 1)", "end_index + 1" would
+ * overflow and evaluate to 0. Hence this folio would be
* redirtied and written out repeatedly, which would result in
* an infinite loop; the user program performing this operation
* would hang. Instead, we can detect this situation by
- * checking if the page is totally beyond i_size or if its
+ * checking if the folio is totally beyond i_size or if its
* offset is just equal to the EOF.
*/
if (folio->index > end_index ||
(folio->index == end_index && poff == 0))
- goto unlock;
+ return false;
/*
- * The page straddles i_size. It must be zeroed out on each
- * and every writepage invocation because it may be mmapped.
- * "A file is mapped in multiples of the page size. For a file
- * that is not a multiple of the page size, the remaining
- * memory is zeroed when mapped, and writes to that region are
- * not written out to the file."
+ * The folio straddles i_size.
+ *
+ * It must be zeroed out on each and every writepage invocation
+ * because it may be mmapped:
+ *
+ * A file is mapped in multiples of the page size. For a
+ * file that is not a multiple of the page size, the
+ * remaining memory is zeroed when mapped, and writes to that
+ * region are not written out to the file.
+ *
+ * Also adjust the writeback range to skip all blocks entirely
+ * beyond i_size.
*/
folio_zero_segment(folio, poff, folio_size(folio));
- end_pos = isize;
+ *end_pos = round_up(isize, i_blocksize(inode));
+ }
+
+ return true;
+}
+
+static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct folio *folio)
+{
+ struct iomap_folio_state *ifs = folio->private;
+ struct inode *inode = folio->mapping->host;
+ u64 pos = folio_pos(folio);
+ u64 end_pos = pos + folio_size(folio);
+ unsigned count = 0;
+ int error = 0;
+ u32 rlen;
+
+ WARN_ON_ONCE(!folio_test_locked(folio));
+ WARN_ON_ONCE(folio_test_dirty(folio));
+ WARN_ON_ONCE(folio_test_writeback(folio));
+
+ trace_iomap_writepage(inode, pos, folio_size(folio));
+
+ if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) {
+ folio_unlock(folio);
+ return 0;
+ }
+ WARN_ON_ONCE(end_pos <= pos);
+
+ if (i_blocks_per_folio(inode, folio) > 1) {
+ if (!ifs) {
+ ifs = ifs_alloc(inode, folio, 0);
+ iomap_set_range_dirty(folio, 0, end_pos - pos);
+ }
+
+ /*
+ * Keep the I/O completion handler from clearing the writeback
+ * bit until we have submitted all blocks by adding a bias to
+ * ifs->write_bytes_pending, which is dropped after submitting
+ * all blocks.
+ */
+ WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
+ atomic_inc(&ifs->write_bytes_pending);
}
- return iomap_writepage_map(wpc, wbc, inode, folio, end_pos);
+ /*
+ * Set the writeback bit ASAP, as the I/O completion for the single
+ * block per folio case can happen as soon as we're submitting the bio.
+ */
+ folio_start_writeback(folio);
-redirty:
- folio_redirty_for_writepage(wbc, folio);
-unlock:
+ /*
+ * Walk through the folio to find dirty areas to write back.
+ */
+ while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
+ error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
+ pos, rlen, &count);
+ if (error)
+ break;
+ pos += rlen;
+ }
+
+ if (count)
+ wpc->nr_folios++;
+
+ /*
+ * We can have dirty bits set past end of file in page_mkwrite path
+ * while mapping the last partial folio. Hence it's better to clear
+ * all the dirty bits in the folio here.
+ */
+ iomap_clear_range_dirty(folio, 0, folio_size(folio));
+
+ /*
+ * Usually the writeback bit is cleared by the I/O completion handler.
+ * But we may end up either not actually writing any blocks, or (when
+ * there are multiple blocks in a folio) all I/O might have finished
+ * already at this point. In that case we need to clear the writeback
+ * bit ourselves right after unlocking the page.
+ */
folio_unlock(folio);
- return 0;
+ if (ifs) {
+ if (atomic_dec_and_test(&ifs->write_bytes_pending))
+ folio_end_writeback(folio);
+ } else {
+ if (!count)
+ folio_end_writeback(folio);
+ }
+ mapping_set_error(inode->i_mapping, error);
+ return error;
+}
+
+static int iomap_do_writepage(struct folio *folio,
+ struct writeback_control *wbc, void *data)
+{
+ return iomap_writepage_map(data, wbc, folio);
}
int
@@ -1988,18 +1971,24 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
{
int ret;
+ /*
+ * Writeback from reclaim context should never happen except in the case
+ * of a VM regression so warn about it and refuse to write the data.
+ */
+ if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) ==
+ PF_MEMALLOC))
+ return -EIO;
+
wpc->ops = ops;
ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
- if (!wpc->ioend)
- return ret;
- return iomap_submit_ioend(wpc, wpc->ioend, ret);
+ return iomap_submit_ioend(wpc, ret);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
static int __init iomap_init(void)
{
return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
- offsetof(struct iomap_ioend, io_inline_bio),
+ offsetof(struct iomap_ioend, io_bio),
BIOSET_NEED_BVECS);
}
fs_initcall(iomap_init);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index bcd3f8cf5ea4..f3b43d223a46 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -380,6 +380,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
+ bio->bi_write_hint = inode->i_write_hint;
bio->bi_ioprio = dio->iocb->ki_ioprio;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index c16fd55f5595..0a991c4ce87d 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -154,7 +154,48 @@ DEFINE_EVENT(iomap_class, name, \
TP_ARGS(inode, iomap))
DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
DEFINE_IOMAP_EVENT(iomap_iter_srcmap);
-DEFINE_IOMAP_EVENT(iomap_writepage_map);
+
+TRACE_EVENT(iomap_writepage_map,
+ TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len,
+ struct iomap *iomap),
+ TP_ARGS(inode, pos, dirty_len, iomap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, ino)
+ __field(u64, pos)
+ __field(u64, dirty_len)
+ __field(u64, addr)
+ __field(loff_t, offset)
+ __field(u64, length)
+ __field(u16, type)
+ __field(u16, flags)
+ __field(dev_t, bdev)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->pos = pos;
+ __entry->dirty_len = dirty_len;
+ __entry->addr = iomap->addr;
+ __entry->offset = iomap->offset;
+ __entry->length = iomap->length;
+ __entry->type = iomap->type;
+ __entry->flags = iomap->flags;
+ __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx "
+ "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ MAJOR(__entry->bdev), MINOR(__entry->bdev),
+ __entry->pos,
+ __entry->dirty_len,
+ __entry->addr,
+ __entry->offset,
+ __entry->length,
+ __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
+ __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
+);
TRACE_EVENT(iomap_iter,
TP_PROTO(struct iomap_iter *iter, const void *ops,
@@ -165,6 +206,7 @@ TRACE_EVENT(iomap_iter,
__field(u64, ino)
__field(loff_t, pos)
__field(u64, length)
+ __field(s64, processed)
__field(unsigned int, flags)
__field(const void *, ops)
__field(unsigned long, caller)
@@ -174,15 +216,17 @@ TRACE_EVENT(iomap_iter,
__entry->ino = iter->inode->i_ino;
__entry->pos = iter->pos;
__entry->length = iomap_length(iter);
+ __entry->processed = iter->processed;
__entry->flags = iter->flags;
__entry->ops = ops;
__entry->caller = caller;
),
- TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx flags %s (0x%x) ops %ps caller %pS",
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pos,
__entry->length,
+ __entry->processed,
__print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
__entry->flags,
__entry->ops,
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3e4d53e26f94..2a616a9f289d 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -93,7 +93,7 @@ static int __init init_inodecache(void)
isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
sizeof(struct iso_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
init_once);
if (!isofs_inode_cachep)
return -ENOMEM;
@@ -908,8 +908,22 @@ root_found:
* we then decide whether to use the Joliet descriptor.
*/
inode = isofs_iget(s, sbi->s_firstdatazone, 0);
- if (IS_ERR(inode))
- goto out_no_root;
+
+ /*
+ * Fix for broken CDs with a corrupt root inode but a correct Joliet
+ * root directory.
+ */
+ if (IS_ERR(inode)) {
+ if (joliet_level && sbi->s_firstdatazone != first_data_zone) {
+ printk(KERN_NOTICE
+ "ISOFS: root inode is unusable. "
+ "Disabling Rock Ridge and switching to Joliet.");
+ sbi->s_rock = 0;
+ inode = NULL;
+ } else {
+ goto out_no_root;
+ }
+ }
/*
* Fix for broken CDs with Rock Ridge and empty ISO root directory but
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index f99591a634b4..aede1be4dc0c 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -387,7 +387,7 @@ static int __init init_jffs2_fs(void)
jffs2_inode_cachep = kmem_cache_create("jffs2_i",
sizeof(struct jffs2_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
jffs2_i_init_once);
if (!jffs2_inode_cachep) {
pr_err("error: Failed to initialise inode cache\n");
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index dd4264aa9bed..10934f9a11be 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -92,7 +92,7 @@ struct jfs_inode_info {
} link;
} u;
#ifdef CONFIG_QUOTA
- struct dquot *i_dquot[MAXQUOTAS];
+ struct dquot __rcu *i_dquot[MAXQUOTAS];
#endif
u32 dev; /* will die when we get wide dev_t */
struct inode vfs_inode;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index cb6d1fda66a7..9609349e92e5 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1058,7 +1058,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync)
int lmLogOpen(struct super_block *sb)
{
int rc;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct jfs_log *log;
struct jfs_sb_info *sbi = JFS_SBI(sb);
@@ -1070,7 +1070,7 @@ int lmLogOpen(struct super_block *sb)
mutex_lock(&jfs_log_mutex);
list_for_each_entry(log, &jfs_external_logs, journal_list) {
- if (log->bdev_handle->bdev->bd_dev == sbi->logdev) {
+ if (file_bdev(log->bdev_file)->bd_dev == sbi->logdev) {
if (!uuid_equal(&log->uuid, &sbi->loguuid)) {
jfs_warn("wrong uuid on JFS journal");
mutex_unlock(&jfs_log_mutex);
@@ -1100,14 +1100,14 @@ int lmLogOpen(struct super_block *sb)
* file systems to log may have n-to-1 relationship;
*/
- bdev_handle = bdev_open_by_dev(sbi->logdev,
+ bdev_file = bdev_file_open_by_dev(sbi->logdev,
BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL);
- if (IS_ERR(bdev_handle)) {
- rc = PTR_ERR(bdev_handle);
+ if (IS_ERR(bdev_file)) {
+ rc = PTR_ERR(bdev_file);
goto free;
}
- log->bdev_handle = bdev_handle;
+ log->bdev_file = bdev_file;
uuid_copy(&log->uuid, &sbi->loguuid);
/*
@@ -1141,7 +1141,7 @@ journal_found:
lbmLogShutdown(log);
close: /* close external log device */
- bdev_release(bdev_handle);
+ bdev_fput(bdev_file);
free: /* free log descriptor */
mutex_unlock(&jfs_log_mutex);
@@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb)
init_waitqueue_head(&log->syncwait);
set_bit(log_INLINELOG, &log->flag);
- log->bdev_handle = sb->s_bdev_handle;
+ log->bdev_file = sb->s_bdev_file;
log->base = addressPXD(&JFS_SBI(sb)->logpxd);
log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >>
(L2LOGPSIZE - sb->s_blocksize_bits);
@@ -1436,7 +1436,7 @@ int lmLogClose(struct super_block *sb)
{
struct jfs_sb_info *sbi = JFS_SBI(sb);
struct jfs_log *log = sbi->log;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
int rc = 0;
jfs_info("lmLogClose: log:0x%p", log);
@@ -1482,10 +1482,10 @@ int lmLogClose(struct super_block *sb)
* external log as separate logical volume
*/
list_del(&log->journal_list);
- bdev_handle = log->bdev_handle;
+ bdev_file = log->bdev_file;
rc = lmLogShutdown(log);
- bdev_release(bdev_handle);
+ bdev_fput(bdev_file);
kfree(log);
@@ -1972,7 +1972,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bp->l_flag |= lbmREAD;
- bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_READ, GFP_NOFS);
+ bio = bio_alloc(file_bdev(log->bdev_file), 1, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
__bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
@@ -2115,7 +2115,7 @@ static void lbmStartIO(struct lbuf * bp)
jfs_info("lbmStartIO");
if (!log->no_integrity)
- bdev = log->bdev_handle->bdev;
+ bdev = file_bdev(log->bdev_file);
bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC,
GFP_NOFS);
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index 84aa2d253907..8b8994e48cd0 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -356,7 +356,7 @@ struct jfs_log {
* before writing syncpt.
*/
struct list_head journal_list; /* Global list */
- struct bdev_handle *bdev_handle; /* 4: log lv pointer */
+ struct file *bdev_file; /* 4: log lv pointer */
int serial; /* 4: log mount serial number */
s64 base; /* @8: log extent address (inline log ) */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 9b5c6a20b30c..98f9a432c336 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -431,7 +431,7 @@ int updateSuper(struct super_block *sb, uint state)
if (state == FM_MOUNT) {
/* record log's dev_t and mount serial number */
j_sb->s_logdev = cpu_to_le32(
- new_encode_dev(sbi->log->bdev_handle->bdev->bd_dev));
+ new_encode_dev(file_bdev(sbi->log->bdev_file)->bd_dev));
j_sb->s_logserial = cpu_to_le32(sbi->log->serial);
} else if (state == FM_CLEAN) {
/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 8d8e556bd610..e1be21ca5d6e 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -824,7 +824,7 @@ out:
return len - towrite;
}
-static struct dquot **jfs_get_dquots(struct inode *inode)
+static struct dquot __rcu **jfs_get_dquots(struct inode *inode)
{
return JFS_IP(inode)->i_dquot;
}
@@ -932,7 +932,7 @@ static int __init init_jfs_fs(void)
jfs_inode_cachep =
kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
+ 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
offsetof(struct jfs_inode_info, i_inline_all),
sizeof_field(struct jfs_inode_info, i_inline_all),
init_once);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index bce1d7ac95ca..458519e416fe 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -529,6 +529,20 @@ void kernfs_get(struct kernfs_node *kn)
}
EXPORT_SYMBOL_GPL(kernfs_get);
+static void kernfs_free_rcu(struct rcu_head *rcu)
+{
+ struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu);
+
+ kfree_const(kn->name);
+
+ if (kn->iattr) {
+ simple_xattrs_free(&kn->iattr->xattrs, NULL);
+ kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
+ }
+
+ kmem_cache_free(kernfs_node_cache, kn);
+}
+
/**
* kernfs_put - put a reference count on a kernfs_node
* @kn: the target kernfs_node
@@ -557,16 +571,11 @@ void kernfs_put(struct kernfs_node *kn)
if (kernfs_type(kn) == KERNFS_LINK)
kernfs_put(kn->symlink.target_kn);
- kfree_const(kn->name);
-
- if (kn->iattr) {
- simple_xattrs_free(&kn->iattr->xattrs, NULL);
- kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
- }
spin_lock(&kernfs_idr_lock);
idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
spin_unlock(&kernfs_idr_lock);
- kmem_cache_free(kernfs_node_cache, kn);
+
+ call_rcu(&kn->rcu, kernfs_free_rcu);
kn = parent;
if (kn) {
@@ -575,7 +584,7 @@ void kernfs_put(struct kernfs_node *kn)
} else {
/* just released the root kn, free @root too */
idr_destroy(&root->ino_idr);
- kfree(root);
+ kfree_rcu(root, rcu);
}
}
EXPORT_SYMBOL_GPL(kernfs_put);
@@ -715,7 +724,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
ino_t ino = kernfs_id_ino(id);
u32 gen = kernfs_id_gen(id);
- spin_lock(&kernfs_idr_lock);
+ rcu_read_lock();
kn = idr_find(&root->ino_idr, (u32)ino);
if (!kn)
@@ -739,10 +748,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count)))
goto err_unlock;
- spin_unlock(&kernfs_idr_lock);
+ rcu_read_unlock();
return kn;
err_unlock:
- spin_unlock(&kernfs_idr_lock);
+ rcu_read_unlock();
return NULL;
}
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index ffa4565c275a..8502ef68459b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -483,9 +483,11 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
goto out_put;
rc = 0;
- of->mmapped = true;
- of_on(of)->nr_mmapped++;
- of->vm_ops = vma->vm_ops;
+ if (!of->mmapped) {
+ of->mmapped = true;
+ of_on(of)->nr_mmapped++;
+ of->vm_ops = vma->vm_ops;
+ }
vma->vm_ops = &kernfs_vm_ops;
out_put:
kernfs_put_active(of->kn);
@@ -634,11 +636,18 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
* each file a separate locking class. Let's differentiate on
* whether the file has mmap or not for now.
*
- * Both paths of the branch look the same. They're supposed to
+ * For similar reasons, writable and readonly files are given different
+ * lockdep keys, because the writable file /sys/power/resume may call vfs
+ * lookup helpers for arbitrary paths and readonly files can be read by
+ * overlayfs from vfs helpers when sysfs is a lower layer of overlayfs.
+ *
+ * All three cases look the same. They're supposed to
* look that way and give @of->mutex different static lockdep keys.
*/
if (has_mmap)
mutex_init(&of->mutex);
+ else if (file->f_mode & FMODE_WRITE)
+ mutex_init(&of->mutex);
else
mutex_init(&of->mutex);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 237f2764b941..b42ee6547cdc 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -49,6 +49,8 @@ struct kernfs_root {
struct rw_semaphore kernfs_rwsem;
struct rw_semaphore kernfs_iattr_rwsem;
struct rw_semaphore kernfs_supers_rwsem;
+
+ struct rcu_head rcu;
};
/* +1 to avoid triggering overflow warning when negating it */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 0c93cad0f0ac..e29f4edf9572 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -358,7 +358,9 @@ int kernfs_get_tree(struct fs_context *fc)
}
sb->s_flags |= SB_ACTIVE;
- uuid_gen(&sb->s_uuid);
+ uuid_t uuid;
+ uuid_gen(&uuid);
+ super_set_uuid(sb, uuid.b, sizeof(uuid));
down_write(&root->kernfs_supers_rwsem);
list_add(&info->node, &info->root->supers);
diff --git a/fs/libfs.c b/fs/libfs.c
index eec6031b0155..3a6f2cb364f8 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -23,6 +23,7 @@
#include <linux/fsnotify.h>
#include <linux/unicode.h>
#include <linux/fscrypt.h>
+#include <linux/pidfs.h>
#include <linux/uaccess.h>
@@ -240,17 +241,22 @@ const struct inode_operations simple_dir_inode_operations = {
};
EXPORT_SYMBOL(simple_dir_inode_operations);
-static void offset_set(struct dentry *dentry, u32 offset)
+/* 0 is '.', 1 is '..', so always start with offset 2 or more */
+enum {
+ DIR_OFFSET_MIN = 2,
+};
+
+static void offset_set(struct dentry *dentry, long offset)
{
- dentry->d_fsdata = (void *)((uintptr_t)(offset));
+ dentry->d_fsdata = (void *)offset;
}
-static u32 dentry2offset(struct dentry *dentry)
+static long dentry2offset(struct dentry *dentry)
{
- return (u32)((uintptr_t)(dentry->d_fsdata));
+ return (long)dentry->d_fsdata;
}
-static struct lock_class_key simple_offset_xa_lock;
+static struct lock_class_key simple_offset_lock_class;
/**
* simple_offset_init - initialize an offset_ctx
@@ -259,11 +265,9 @@ static struct lock_class_key simple_offset_xa_lock;
*/
void simple_offset_init(struct offset_ctx *octx)
{
- xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1);
- lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock);
-
- /* 0 is '.', 1 is '..', so always start with offset 2 */
- octx->next_offset = 2;
+ mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE);
+ lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class);
+ octx->next_offset = DIR_OFFSET_MIN;
}
/**
@@ -271,20 +275,19 @@ void simple_offset_init(struct offset_ctx *octx)
* @octx: directory offset ctx to be updated
* @dentry: new dentry being added
*
- * Returns zero on success. @so_ctx and the dentry offset are updated.
+ * Returns zero on success. @octx and the dentry's offset are updated.
* Otherwise, a negative errno value is returned.
*/
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
{
- static const struct xa_limit limit = XA_LIMIT(2, U32_MAX);
- u32 offset;
+ unsigned long offset;
int ret;
if (dentry2offset(dentry) != 0)
return -EBUSY;
- ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit,
- &octx->next_offset, GFP_KERNEL);
+ ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN,
+ LONG_MAX, &octx->next_offset, GFP_KERNEL);
if (ret < 0)
return ret;
@@ -300,17 +303,49 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
*/
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
{
- u32 offset;
+ long offset;
offset = dentry2offset(dentry);
if (offset == 0)
return;
- xa_erase(&octx->xa, offset);
+ mtree_erase(&octx->mt, offset);
offset_set(dentry, 0);
}
/**
+ * simple_offset_empty - Check if a dentry can be unlinked
+ * @dentry: dentry to be tested
+ *
+ * Returns 0 if @dentry is a non-empty directory; otherwise returns 1.
+ */
+int simple_offset_empty(struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ struct offset_ctx *octx;
+ struct dentry *child;
+ unsigned long index;
+ int ret = 1;
+
+ if (!inode || !S_ISDIR(inode->i_mode))
+ return ret;
+
+ index = DIR_OFFSET_MIN;
+ octx = inode->i_op->get_offset_ctx(inode);
+ mt_for_each(&octx->mt, child, index, LONG_MAX) {
+ spin_lock(&child->d_lock);
+ if (simple_positive(child)) {
+ spin_unlock(&child->d_lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&child->d_lock);
+ }
+
+ return ret;
+}
+
+/**
* simple_offset_rename_exchange - exchange rename with directory offsets
* @old_dir: parent of dentry being moved
* @old_dentry: dentry being moved
@@ -327,8 +362,8 @@ int simple_offset_rename_exchange(struct inode *old_dir,
{
struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
- u32 old_index = dentry2offset(old_dentry);
- u32 new_index = dentry2offset(new_dentry);
+ long old_index = dentry2offset(old_dentry);
+ long new_index = dentry2offset(new_dentry);
int ret;
simple_offset_remove(old_ctx, old_dentry);
@@ -354,9 +389,9 @@ int simple_offset_rename_exchange(struct inode *old_dir,
out_restore:
offset_set(old_dentry, old_index);
- xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL);
+ mtree_store(&old_ctx->mt, old_index, old_dentry, GFP_KERNEL);
offset_set(new_dentry, new_index);
- xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL);
+ mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL);
return ret;
}
@@ -369,7 +404,7 @@ out_restore:
*/
void simple_offset_destroy(struct offset_ctx *octx)
{
- xa_destroy(&octx->xa);
+ mtree_destroy(&octx->mt);
}
/**
@@ -399,15 +434,16 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
/* In this case, ->private_data is protected by f_pos_lock */
file->private_data = NULL;
- return vfs_setpos(file, offset, U32_MAX);
+ return vfs_setpos(file, offset, LONG_MAX);
}
-static struct dentry *offset_find_next(struct xa_state *xas)
+static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset)
{
+ MA_STATE(mas, &octx->mt, offset, offset);
struct dentry *child, *found = NULL;
rcu_read_lock();
- child = xas_next_entry(xas, U32_MAX);
+ child = mas_find(&mas, LONG_MAX);
if (!child)
goto out;
spin_lock(&child->d_lock);
@@ -421,8 +457,8 @@ out:
static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
{
- u32 offset = dentry2offset(dentry);
struct inode *inode = d_inode(dentry);
+ long offset = dentry2offset(dentry);
return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,
inode->i_ino, fs_umode_to_dtype(inode->i_mode));
@@ -430,12 +466,11 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
{
- struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
- XA_STATE(xas, &so_ctx->xa, ctx->pos);
+ struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
struct dentry *dentry;
while (true) {
- dentry = offset_find_next(&xas);
+ dentry = offset_find_next(octx, ctx->pos);
if (!dentry)
return ERR_PTR(-ENOENT);
@@ -444,8 +479,8 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
break;
}
+ ctx->pos = dentry2offset(dentry) + 1;
dput(dentry);
- ctx->pos = xas.xa_index + 1;
}
return NULL;
}
@@ -481,7 +516,7 @@ static int offset_readdir(struct file *file, struct dir_context *ctx)
return 0;
/* In this case, ->private_data is protected by f_pos_lock */
- if (ctx->pos == 2)
+ if (ctx->pos == DIR_OFFSET_MIN)
file->private_data = NULL;
else if (file->private_data == ERR_PTR(-ENOENT))
return 0;
@@ -1580,7 +1615,7 @@ EXPORT_SYMBOL(alloc_anon_inode);
* All arguments are ignored and it just returns -EINVAL.
*/
int
-simple_nosetlease(struct file *filp, int arg, struct file_lock **flp,
+simple_nosetlease(struct file *filp, int arg, struct file_lease **flp,
void **priv)
{
return -EINVAL;
@@ -1704,16 +1739,28 @@ bool is_empty_dir_inode(struct inode *inode)
static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name)
{
- const struct dentry *parent = READ_ONCE(dentry->d_parent);
- const struct inode *dir = READ_ONCE(parent->d_inode);
- const struct super_block *sb = dentry->d_sb;
- const struct unicode_map *um = sb->s_encoding;
- struct qstr qstr = QSTR_INIT(str, len);
+ const struct dentry *parent;
+ const struct inode *dir;
char strbuf[DNAME_INLINE_LEN];
- int ret;
+ struct qstr qstr;
+
+ /*
+ * Attempt a case-sensitive match first. It is cheaper and
+ * should cover most lookups, including all the sane
+ * applications that expect a case-sensitive filesystem.
+ *
+ * This comparison is safe under RCU because the caller
+ * guarantees the consistency between str and len. See
+ * __d_lookup_rcu_op_compare() for details.
+ */
+ if (len == name->len && !memcmp(str, name->name, len))
+ return 0;
+ parent = READ_ONCE(dentry->d_parent);
+ dir = READ_ONCE(parent->d_inode);
if (!dir || !IS_CASEFOLDED(dir))
- goto fallback;
+ return 1;
+
/*
* If the dentry name is stored in-line, then it may be concurrently
* modified by a rename. If this happens, the VFS will eventually retry
@@ -1724,20 +1771,14 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
if (len <= DNAME_INLINE_LEN - 1) {
memcpy(strbuf, str, len);
strbuf[len] = 0;
- qstr.name = strbuf;
+ str = strbuf;
/* prevent compiler from optimizing out the temporary buffer */
barrier();
}
- ret = utf8_strncasecmp(um, name, &qstr);
- if (ret >= 0)
- return ret;
+ qstr.len = len;
+ qstr.name = str;
- if (sb_has_strict_encoding(sb))
- return -EINVAL;
-fallback:
- if (len != name->len)
- return 1;
- return !!memcmp(str, name->name, len);
+ return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
}
/**
@@ -1752,7 +1793,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
const struct inode *dir = READ_ONCE(dentry->d_inode);
struct super_block *sb = dentry->d_sb;
const struct unicode_map *um = sb->s_encoding;
- int ret = 0;
+ int ret;
if (!dir || !IS_CASEFOLDED(dir))
return 0;
@@ -1766,73 +1807,45 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
static const struct dentry_operations generic_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
.d_compare = generic_ci_d_compare,
-};
-#endif
-
#ifdef CONFIG_FS_ENCRYPTION
-static const struct dentry_operations generic_encrypted_dentry_ops = {
.d_revalidate = fscrypt_d_revalidate,
+#endif
};
#endif
-#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
-static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
- .d_hash = generic_ci_d_hash,
- .d_compare = generic_ci_d_compare,
+#ifdef CONFIG_FS_ENCRYPTION
+static const struct dentry_operations generic_encrypted_dentry_ops = {
.d_revalidate = fscrypt_d_revalidate,
};
#endif
/**
- * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry
- * @dentry: dentry to set ops on
- *
- * Casefolded directories need d_hash and d_compare set, so that the dentries
- * contained in them are handled case-insensitively. Note that these operations
- * are needed on the parent directory rather than on the dentries in it, and
- * while the casefolding flag can be toggled on and off on an empty directory,
- * dentry_operations can't be changed later. As a result, if the filesystem has
- * casefolding support enabled at all, we have to give all dentries the
- * casefolding operations even if their inode doesn't have the casefolding flag
- * currently (and thus the casefolding ops would be no-ops for now).
+ * generic_set_sb_d_ops - helper for choosing the set of
+ * filesystem-wide dentry operations for the enabled features
+ * @sb: superblock to be configured
*
- * Encryption works differently in that the only dentry operation it needs is
- * d_revalidate, which it only needs on dentries that have the no-key name flag.
- * The no-key flag can't be set "later", so we don't have to worry about that.
- *
- * Finally, to maximize compatibility with overlayfs (which isn't compatible
- * with certain dentry operations) and to avoid taking an unnecessary
- * performance hit, we use custom dentry_operations for each possible
- * combination rather than always installing all operations.
+ * Filesystems supporting casefolding and/or fscrypt can call this
+ * helper at mount-time to configure sb->s_d_op to the best set of dentry
+ * operations required for the enabled features. The helper must be
+ * called after these have been configured, but before the root dentry
+ * is created.
*/
-void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
+void generic_set_sb_d_ops(struct super_block *sb)
{
-#ifdef CONFIG_FS_ENCRYPTION
- bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME;
-#endif
#if IS_ENABLED(CONFIG_UNICODE)
- bool needs_ci_ops = dentry->d_sb->s_encoding;
-#endif
-#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
- if (needs_encrypt_ops && needs_ci_ops) {
- d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
+ if (sb->s_encoding) {
+ sb->s_d_op = &generic_ci_dentry_ops;
return;
}
#endif
#ifdef CONFIG_FS_ENCRYPTION
- if (needs_encrypt_ops) {
- d_set_d_op(dentry, &generic_encrypted_dentry_ops);
- return;
- }
-#endif
-#if IS_ENABLED(CONFIG_UNICODE)
- if (needs_ci_ops) {
- d_set_d_op(dentry, &generic_ci_dentry_ops);
+ if (sb->s_cop) {
+ sb->s_d_op = &generic_encrypted_dentry_ops;
return;
}
#endif
}
-EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);
+EXPORT_SYMBOL(generic_set_sb_d_ops);
/**
* inode_maybe_inc_iversion - increments i_version
@@ -1973,3 +1986,147 @@ struct timespec64 simple_inode_init_ts(struct inode *inode)
return ts;
}
EXPORT_SYMBOL(simple_inode_init_ts);
+
+static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
+{
+ struct dentry *dentry;
+
+ guard(rcu)();
+ dentry = READ_ONCE(stashed);
+ if (!dentry)
+ return NULL;
+ if (!lockref_get_not_dead(&dentry->d_lockref))
+ return NULL;
+ return dentry;
+}
+
+static struct dentry *prepare_anon_dentry(struct dentry **stashed,
+ struct super_block *sb,
+ void *data)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+ const struct stashed_operations *sops = sb->s_fs_info;
+ int ret;
+
+ inode = new_inode_pseudo(sb);
+ if (!inode) {
+ sops->put_data(data);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ inode->i_flags |= S_IMMUTABLE;
+ inode->i_mode = S_IFREG;
+ simple_inode_init_ts(inode);
+
+ ret = sops->init_inode(inode, data);
+ if (ret < 0) {
+ iput(inode);
+ return ERR_PTR(ret);
+ }
+
+ /* Notice when this is changed. */
+ WARN_ON_ONCE(!S_ISREG(inode->i_mode));
+ WARN_ON_ONCE(!IS_IMMUTABLE(inode));
+
+ dentry = d_alloc_anon(sb);
+ if (!dentry) {
+ iput(inode);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Store address of location where dentry's supposed to be stashed. */
+ dentry->d_fsdata = stashed;
+
+ /* @data is now owned by the fs */
+ d_instantiate(dentry, inode);
+ return dentry;
+}
+
+static struct dentry *stash_dentry(struct dentry **stashed,
+ struct dentry *dentry)
+{
+ guard(rcu)();
+ for (;;) {
+ struct dentry *old;
+
+ /* Assume any old dentry was cleared out. */
+ old = cmpxchg(stashed, NULL, dentry);
+ if (likely(!old))
+ return dentry;
+
+ /* Check if somebody else installed a reusable dentry. */
+ if (lockref_get_not_dead(&old->d_lockref))
+ return old;
+
+ /* There's an old dead dentry there, try to take it over. */
+ if (likely(try_cmpxchg(stashed, &old, dentry)))
+ return dentry;
+ }
+}
+
+/**
+ * path_from_stashed - create path from stashed or new dentry
+ * @stashed: where to retrieve or stash dentry
+ * @mnt: mnt of the filesystem to use
+ * @data: data to store in inode->i_private
+ * @path: path to create
+ *
+ * The function tries to retrieve a stashed dentry from @stashed. If the dentry
+ * is still valid then it will be reused. If the dentry isn't reusable the function
+ * will allocate a new dentry and inode. It will then check again whether it
+ * can reuse an existing dentry in case one has been added in the meantime or
+ * update @stashed with the newly added dentry.
+ *
+ * Special-purpose helper for nsfs and pidfs.
+ *
+ * Return: On success zero and on failure a negative error is returned.
+ */
+int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
+ struct path *path)
+{
+ struct dentry *dentry;
+ const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
+
+ /* See if dentry can be reused. */
+ path->dentry = get_stashed_dentry(*stashed);
+ if (path->dentry) {
+ sops->put_data(data);
+ goto out_path;
+ }
+
+ /* Allocate a new dentry. */
+ dentry = prepare_anon_dentry(stashed, mnt->mnt_sb, data);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ /* Added a new dentry. @data is now owned by the filesystem. */
+ path->dentry = stash_dentry(stashed, dentry);
+ if (path->dentry != dentry)
+ dput(dentry);
+
+out_path:
+ WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
+ WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
+ path->mnt = mntget(mnt);
+ return 0;
+}
+
+void stashed_dentry_prune(struct dentry *dentry)
+{
+ struct dentry **stashed = dentry->d_fsdata;
+ struct inode *inode = d_inode(dentry);
+
+ if (WARN_ON_ONCE(!stashed))
+ return;
+
+ if (!inode)
+ return;
+
+ /*
+ * Only replace our own @dentry as someone else might've
+ * already cleared out @dentry and stashed their own
+ * dentry in there.
+ */
+ cmpxchg(stashed, dentry, NULL);
+}
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 8161667c976f..527458db4525 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -243,7 +243,7 @@ static void encode_nlm4_holder(struct xdr_stream *xdr,
u64 l_offset, l_len;
__be32 *p;
- encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_RDLCK);
encode_int32(xdr, lock->svid);
encode_netobj(xdr, lock->oh.data, lock->oh.len);
@@ -270,7 +270,7 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
goto out_overflow;
exclusive = be32_to_cpup(p++);
lock->svid = be32_to_cpup(p);
- fl->fl_pid = (pid_t)lock->svid;
+ fl->c.flc_pid = (pid_t)lock->svid;
error = decode_netobj(xdr, &lock->oh);
if (unlikely(error))
@@ -280,8 +280,8 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
if (unlikely(p == NULL))
goto out_overflow;
- fl->fl_flags = FL_POSIX;
- fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
+ fl->c.flc_flags = FL_POSIX;
+ fl->c.flc_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
p = xdr_decode_hyper(p, &l_offset);
xdr_decode_hyper(p, &l_len);
nlm4svc_set_file_lock_range(fl, l_offset, l_len);
@@ -357,7 +357,7 @@ static void nlm4_xdr_enc_testargs(struct rpc_rqst *req,
const struct nlm_lock *lock = &args->lock;
encode_cookie(xdr, &args->cookie);
- encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
encode_nlm4_lock(xdr, lock);
}
@@ -380,7 +380,7 @@ static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req,
encode_cookie(xdr, &args->cookie);
encode_bool(xdr, args->block);
- encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
encode_nlm4_lock(xdr, lock);
encode_bool(xdr, args->reclaim);
encode_int32(xdr, args->state);
@@ -403,7 +403,7 @@ static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req,
encode_cookie(xdr, &args->cookie);
encode_bool(xdr, args->block);
- encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
encode_nlm4_lock(xdr, lock);
}
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 5d85715be763..a7e0519ec024 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -185,7 +185,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
continue;
if (!rpc_cmp_addr(nlm_addr(block->b_host), addr))
continue;
- if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)), fh) != 0)
+ if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->c.flc_file)), fh) != 0)
continue;
/* Alright, we found a lock. Set the return status
* and wake up the caller
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index fba6c7fa7474..cebcc283b7ce 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -133,7 +133,8 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
char *nodename = req->a_host->h_rpcclnt->cl_nodename;
nlmclnt_next_cookie(&argp->cookie);
- memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh));
+ memcpy(&lock->fh, NFS_FH(file_inode(fl->c.flc_file)),
+ sizeof(struct nfs_fh));
lock->caller = nodename;
lock->oh.data = req->a_owner;
lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s",
@@ -142,7 +143,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
lock->svid = fl->fl_u.nfs_fl.owner->pid;
lock->fl.fl_start = fl->fl_start;
lock->fl.fl_end = fl->fl_end;
- lock->fl.fl_type = fl->fl_type;
+ lock->fl.c.flc_type = fl->c.flc_type;
}
static void nlmclnt_release_lockargs(struct nlm_rqst *req)
@@ -182,7 +183,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *dat
call->a_callback_data = data;
if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
- if (fl->fl_type != F_UNLCK) {
+ if (fl->c.flc_type != F_UNLCK) {
call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
status = nlmclnt_lock(call, fl);
} else
@@ -432,13 +433,14 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
{
int status;
- status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_TEST);
+ status = nlmclnt_call(nfs_file_cred(fl->c.flc_file), req,
+ NLMPROC_TEST);
if (status < 0)
goto out;
switch (req->a_res.status) {
case nlm_granted:
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
break;
case nlm_lck_denied:
/*
@@ -446,8 +448,8 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
*/
fl->fl_start = req->a_res.lock.fl.fl_start;
fl->fl_end = req->a_res.lock.fl.fl_end;
- fl->fl_type = req->a_res.lock.fl.fl_type;
- fl->fl_pid = -req->a_res.lock.fl.fl_pid;
+ fl->c.flc_type = req->a_res.lock.fl.c.flc_type;
+ fl->c.flc_pid = -req->a_res.lock.fl.c.flc_pid;
break;
default:
status = nlm_stat_to_errno(req->a_res.status);
@@ -485,14 +487,15 @@ static const struct file_lock_operations nlmclnt_lock_ops = {
static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host)
{
fl->fl_u.nfs_fl.state = 0;
- fl->fl_u.nfs_fl.owner = nlmclnt_find_lockowner(host, fl->fl_owner);
+ fl->fl_u.nfs_fl.owner = nlmclnt_find_lockowner(host,
+ fl->c.flc_owner);
INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list);
fl->fl_ops = &nlmclnt_lock_ops;
}
static int do_vfs_lock(struct file_lock *fl)
{
- return locks_lock_file_wait(fl->fl_file, fl);
+ return locks_lock_file_wait(fl->c.flc_file, fl);
}
/*
@@ -518,12 +521,12 @@ static int do_vfs_lock(struct file_lock *fl)
static int
nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
{
- const struct cred *cred = nfs_file_cred(fl->fl_file);
+ const struct cred *cred = nfs_file_cred(fl->c.flc_file);
struct nlm_host *host = req->a_host;
struct nlm_res *resp = &req->a_res;
struct nlm_wait block;
- unsigned char fl_flags = fl->fl_flags;
- unsigned char fl_type;
+ unsigned char flags = fl->c.flc_flags;
+ unsigned char type;
__be32 b_status;
int status = -ENOLCK;
@@ -531,9 +534,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
goto out;
req->a_args.state = nsm_local_state;
- fl->fl_flags |= FL_ACCESS;
+ fl->c.flc_flags |= FL_ACCESS;
status = do_vfs_lock(fl);
- fl->fl_flags = fl_flags;
+ fl->c.flc_flags = flags;
if (status < 0)
goto out;
@@ -591,11 +594,11 @@ again:
goto again;
}
/* Ensure the resulting lock will get added to granted list */
- fl->fl_flags |= FL_SLEEP;
+ fl->c.flc_flags |= FL_SLEEP;
if (do_vfs_lock(fl) < 0)
printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
up_read(&host->h_rwsem);
- fl->fl_flags = fl_flags;
+ fl->c.flc_flags = flags;
status = 0;
}
if (status < 0)
@@ -605,7 +608,7 @@ again:
* cases NLM_LCK_DENIED is returned for a permanent error. So
* turn it into an ENOLCK.
*/
- if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP))
+ if (resp->status == nlm_lck_denied && (flags & FL_SLEEP))
status = -ENOLCK;
else
status = nlm_stat_to_errno(resp->status);
@@ -622,13 +625,13 @@ out_unlock:
req->a_host->h_addrlen, req->a_res.status);
dprintk("lockd: lock attempt ended in fatal error.\n"
" Attempting to unlock.\n");
- fl_type = fl->fl_type;
- fl->fl_type = F_UNLCK;
+ type = fl->c.flc_type;
+ fl->c.flc_type = F_UNLCK;
down_read(&host->h_rwsem);
do_vfs_lock(fl);
up_read(&host->h_rwsem);
- fl->fl_type = fl_type;
- fl->fl_flags = fl_flags;
+ fl->c.flc_type = type;
+ fl->c.flc_flags = flags;
nlmclnt_async_call(cred, req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
return status;
}
@@ -651,12 +654,14 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl,
nlmclnt_setlockargs(req, fl);
req->a_args.reclaim = 1;
- status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_LOCK);
+ status = nlmclnt_call(nfs_file_cred(fl->c.flc_file), req,
+ NLMPROC_LOCK);
if (status >= 0 && req->a_res.status == nlm_granted)
return 0;
printk(KERN_WARNING "lockd: failed to reclaim lock for pid %d "
- "(errno %d, status %d)\n", fl->fl_pid,
+ "(errno %d, status %d)\n",
+ fl->c.flc_pid,
status, ntohl(req->a_res.status));
/*
@@ -683,26 +688,26 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
struct nlm_host *host = req->a_host;
struct nlm_res *resp = &req->a_res;
int status;
- unsigned char fl_flags = fl->fl_flags;
+ unsigned char flags = fl->c.flc_flags;
/*
* Note: the server is supposed to either grant us the unlock
* request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
* case, we want to unlock.
*/
- fl->fl_flags |= FL_EXISTS;
+ fl->c.flc_flags |= FL_EXISTS;
down_read(&host->h_rwsem);
status = do_vfs_lock(fl);
up_read(&host->h_rwsem);
- fl->fl_flags = fl_flags;
+ fl->c.flc_flags = flags;
if (status == -ENOENT) {
status = 0;
goto out;
}
refcount_inc(&req->a_count);
- status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
- NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
+ status = nlmclnt_async_call(nfs_file_cred(fl->c.flc_file), req,
+ NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
if (status < 0)
goto out;
@@ -795,8 +800,8 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
req->a_args.block = block;
refcount_inc(&req->a_count);
- status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
- NLMPROC_CANCEL, &nlmclnt_cancel_ops);
+ status = nlmclnt_async_call(nfs_file_cred(fl->c.flc_file), req,
+ NLMPROC_CANCEL, &nlmclnt_cancel_ops);
if (status == 0 && req->a_res.status == nlm_lck_denied)
status = -ENOLCK;
nlmclnt_release_call(req);
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 4df62f635529..a3e97278b997 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -238,7 +238,7 @@ static void encode_nlm_holder(struct xdr_stream *xdr,
u32 l_offset, l_len;
__be32 *p;
- encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_RDLCK);
encode_int32(xdr, lock->svid);
encode_netobj(xdr, lock->oh.data, lock->oh.len);
@@ -265,7 +265,7 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
goto out_overflow;
exclusive = be32_to_cpup(p++);
lock->svid = be32_to_cpup(p);
- fl->fl_pid = (pid_t)lock->svid;
+ fl->c.flc_pid = (pid_t)lock->svid;
error = decode_netobj(xdr, &lock->oh);
if (unlikely(error))
@@ -275,8 +275,8 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
if (unlikely(p == NULL))
goto out_overflow;
- fl->fl_flags = FL_POSIX;
- fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
+ fl->c.flc_flags = FL_POSIX;
+ fl->c.flc_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
l_offset = be32_to_cpup(p++);
l_len = be32_to_cpup(p);
end = l_offset + l_len - 1;
@@ -357,7 +357,7 @@ static void nlm_xdr_enc_testargs(struct rpc_rqst *req,
const struct nlm_lock *lock = &args->lock;
encode_cookie(xdr, &args->cookie);
- encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
encode_nlm_lock(xdr, lock);
}
@@ -380,7 +380,7 @@ static void nlm_xdr_enc_lockargs(struct rpc_rqst *req,
encode_cookie(xdr, &args->cookie);
encode_bool(xdr, args->block);
- encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
encode_nlm_lock(xdr, lock);
encode_bool(xdr, args->reclaim);
encode_int32(xdr, args->state);
@@ -403,7 +403,7 @@ static void nlm_xdr_enc_cancargs(struct rpc_rqst *req,
encode_cookie(xdr, &args->cookie);
encode_bool(xdr, args->block);
- encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+ encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
encode_nlm_lock(xdr, lock);
}
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index ce5862482097..ab8042a5b895 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -710,8 +710,6 @@ static const struct svc_version *nlmsvc_version[] = {
#endif
};
-static struct svc_stat nlmsvc_stats;
-
#define NLM_NRVERS ARRAY_SIZE(nlmsvc_version)
static struct svc_program nlmsvc_program = {
.pg_prog = NLM_PROGRAM, /* program number */
@@ -719,7 +717,6 @@ static struct svc_program nlmsvc_program = {
.pg_vers = nlmsvc_version, /* version table */
.pg_name = "lockd", /* service name */
.pg_class = "nfsd", /* share authentication with nfsd */
- .pg_stats = &nlmsvc_stats, /* stats table */
.pg_authenticate = &lockd_authenticate, /* export authentication */
.pg_init_request = svc_generic_init_request,
.pg_rpcbind_set = svc_generic_rpcbind_set,
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index b72023a6b4c1..8a72c418cdcc 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -52,16 +52,16 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
*filp = file;
/* Set up the missing parts of the file_lock structure */
- lock->fl.fl_flags = FL_POSIX;
- lock->fl.fl_file = file->f_file[mode];
- lock->fl.fl_pid = current->tgid;
+ lock->fl.c.flc_flags = FL_POSIX;
+ lock->fl.c.flc_file = file->f_file[mode];
+ lock->fl.c.flc_pid = current->tgid;
lock->fl.fl_start = (loff_t)lock->lock_start;
lock->fl.fl_end = lock->lock_len ?
(loff_t)(lock->lock_start + lock->lock_len - 1) :
OFFSET_MAX;
lock->fl.fl_lmops = &nlmsvc_lock_operations;
nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
- if (!lock->fl.fl_owner) {
+ if (!lock->fl.c.flc_owner) {
/* lockowner allocation has failed */
nlmsvc_release_host(host);
return nlm_lck_denied_nolocks;
@@ -106,7 +106,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
- test_owner = argp->lock.fl.fl_owner;
+ test_owner = argp->lock.fl.c.flc_owner;
/* Now check for conflicting locks */
resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie);
if (resp->status == nlm_drop_reply)
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 2dc10900ad1c..1f2149db10f2 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -150,16 +150,17 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock)
struct file_lock *fl;
dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n",
- file, lock->fl.fl_pid,
+ file, lock->fl.c.flc_pid,
(long long)lock->fl.fl_start,
- (long long)lock->fl.fl_end, lock->fl.fl_type);
+ (long long)lock->fl.fl_end,
+ lock->fl.c.flc_type);
spin_lock(&nlm_blocked_lock);
list_for_each_entry(block, &nlm_blocked, b_list) {
fl = &block->b_call->a_args.lock.fl;
dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n",
- block->b_file, fl->fl_pid,
+ block->b_file, fl->c.flc_pid,
(long long)fl->fl_start,
- (long long)fl->fl_end, fl->fl_type,
+ (long long)fl->fl_end, fl->c.flc_type,
nlmdbg_cookie2a(&block->b_call->a_args.cookie));
if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) {
kref_get(&block->b_count);
@@ -244,7 +245,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
goto failed_free;
/* Set notifier function for VFS, and init args */
- call->a_args.lock.fl.fl_flags |= FL_SLEEP;
+ call->a_args.lock.fl.c.flc_flags |= FL_SLEEP;
call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations;
nlmclnt_next_cookie(&call->a_args.cookie);
@@ -402,14 +403,14 @@ static struct nlm_lockowner *nlmsvc_find_lockowner(struct nlm_host *host, pid_t
void
nlmsvc_release_lockowner(struct nlm_lock *lock)
{
- if (lock->fl.fl_owner)
- nlmsvc_put_lockowner(lock->fl.fl_owner);
+ if (lock->fl.c.flc_owner)
+ nlmsvc_put_lockowner(lock->fl.c.flc_owner);
}
void nlmsvc_locks_init_private(struct file_lock *fl, struct nlm_host *host,
pid_t pid)
{
- fl->fl_owner = nlmsvc_find_lockowner(host, pid);
+ fl->c.flc_owner = nlmsvc_find_lockowner(host, pid);
}
/*
@@ -425,7 +426,7 @@ static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock)
/* set default data area */
call->a_args.lock.oh.data = call->a_owner;
- call->a_args.lock.svid = ((struct nlm_lockowner *)lock->fl.fl_owner)->pid;
+ call->a_args.lock.svid = ((struct nlm_lockowner *) lock->fl.c.flc_owner)->pid;
if (lock->oh.len > NLMCLNT_OHSIZE) {
void *data = kmalloc(lock->oh.len, GFP_KERNEL);
@@ -489,7 +490,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n",
inode->i_sb->s_id, inode->i_ino,
- lock->fl.fl_type, lock->fl.fl_pid,
+ lock->fl.c.flc_type,
+ lock->fl.c.flc_pid,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end,
wait);
@@ -512,7 +514,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
lock = &block->b_call->a_args.lock;
} else
- lock->fl.fl_flags &= ~FL_SLEEP;
+ lock->fl.c.flc_flags &= ~FL_SLEEP;
if (block->b_flags & B_QUEUED) {
dprintk("lockd: nlmsvc_lock deferred block %p flags %d\n",
@@ -560,10 +562,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
spin_unlock(&nlm_blocked_lock);
if (!wait)
- lock->fl.fl_flags &= ~FL_SLEEP;
+ lock->fl.c.flc_flags &= ~FL_SLEEP;
mode = lock_to_openmode(&lock->fl);
error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL);
- lock->fl.fl_flags &= ~FL_SLEEP;
+ lock->fl.c.flc_flags &= ~FL_SLEEP;
dprintk("lockd: vfs_lock_file returned %d\n", error);
switch (error) {
@@ -616,7 +618,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
nlmsvc_file_inode(file)->i_sb->s_id,
nlmsvc_file_inode(file)->i_ino,
- lock->fl.fl_type,
+ lock->fl.c.flc_type,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
@@ -636,19 +638,19 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
- if (lock->fl.fl_type == F_UNLCK) {
+ if (lock->fl.c.flc_type == F_UNLCK) {
ret = nlm_granted;
goto out;
}
dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n",
- lock->fl.fl_type, (long long)lock->fl.fl_start,
+ lock->fl.c.flc_type, (long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
conflock->caller = "somehost"; /* FIXME */
conflock->len = strlen(conflock->caller);
conflock->oh.len = 0; /* don't return OH info */
- conflock->svid = lock->fl.fl_pid;
- conflock->fl.fl_type = lock->fl.fl_type;
+ conflock->svid = lock->fl.c.flc_pid;
+ conflock->fl.c.flc_type = lock->fl.c.flc_type;
conflock->fl.fl_start = lock->fl.fl_start;
conflock->fl.fl_end = lock->fl.fl_end;
locks_release_private(&lock->fl);
@@ -673,21 +675,21 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock)
dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n",
nlmsvc_file_inode(file)->i_sb->s_id,
nlmsvc_file_inode(file)->i_ino,
- lock->fl.fl_pid,
+ lock->fl.c.flc_pid,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
/* First, cancel any lock that might be there */
nlmsvc_cancel_blocked(net, file, lock);
- lock->fl.fl_type = F_UNLCK;
- lock->fl.fl_file = file->f_file[O_RDONLY];
- if (lock->fl.fl_file)
- error = vfs_lock_file(lock->fl.fl_file, F_SETLK,
+ lock->fl.c.flc_type = F_UNLCK;
+ lock->fl.c.flc_file = file->f_file[O_RDONLY];
+ if (lock->fl.c.flc_file)
+ error = vfs_lock_file(lock->fl.c.flc_file, F_SETLK,
&lock->fl, NULL);
- lock->fl.fl_file = file->f_file[O_WRONLY];
- if (lock->fl.fl_file)
- error |= vfs_lock_file(lock->fl.fl_file, F_SETLK,
+ lock->fl.c.flc_file = file->f_file[O_WRONLY];
+ if (lock->fl.c.flc_file)
+ error |= vfs_lock_file(lock->fl.c.flc_file, F_SETLK,
&lock->fl, NULL);
return (error < 0)? nlm_lck_denied_nolocks : nlm_granted;
@@ -710,7 +712,7 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n",
nlmsvc_file_inode(file)->i_sb->s_id,
nlmsvc_file_inode(file)->i_ino,
- lock->fl.fl_pid,
+ lock->fl.c.flc_pid,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
@@ -863,12 +865,12 @@ nlmsvc_grant_blocked(struct nlm_block *block)
/* vfs_lock_file() can mangle fl_start and fl_end, but we need
* them unchanged for the GRANT_MSG
*/
- lock->fl.fl_flags |= FL_SLEEP;
+ lock->fl.c.flc_flags |= FL_SLEEP;
fl_start = lock->fl.fl_start;
fl_end = lock->fl.fl_end;
mode = lock_to_openmode(&lock->fl);
error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL);
- lock->fl.fl_flags &= ~FL_SLEEP;
+ lock->fl.c.flc_flags &= ~FL_SLEEP;
lock->fl.fl_start = fl_start;
lock->fl.fl_end = fl_end;
@@ -993,8 +995,8 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status)
/* Client doesn't want it, just unlock it */
nlmsvc_unlink_block(block);
fl = &block->b_call->a_args.lock.fl;
- fl->fl_type = F_UNLCK;
- error = vfs_lock_file(fl->fl_file, F_SETLK, fl, NULL);
+ fl->c.flc_type = F_UNLCK;
+ error = vfs_lock_file(fl->c.flc_file, F_SETLK, fl, NULL);
if (error)
pr_warn("lockd: unable to unlock lock rejected by client!\n");
break;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 32784f508c81..a03220e66ce0 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -77,12 +77,12 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Set up the missing parts of the file_lock structure */
mode = lock_to_openmode(&lock->fl);
- lock->fl.fl_flags = FL_POSIX;
- lock->fl.fl_file = file->f_file[mode];
- lock->fl.fl_pid = current->tgid;
+ lock->fl.c.flc_flags = FL_POSIX;
+ lock->fl.c.flc_file = file->f_file[mode];
+ lock->fl.c.flc_pid = current->tgid;
lock->fl.fl_lmops = &nlmsvc_lock_operations;
nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
- if (!lock->fl.fl_owner) {
+ if (!lock->fl.c.flc_owner) {
/* lockowner allocation has failed */
nlmsvc_release_host(host);
return nlm_lck_denied_nolocks;
@@ -127,7 +127,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
- test_owner = argp->lock.fl.fl_owner;
+ test_owner = argp->lock.fl.c.flc_owner;
/* Now check for conflicting locks */
resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie));
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index e3b6229e7ae5..9103896164f6 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -73,7 +73,7 @@ static inline unsigned int file_hash(struct nfs_fh *f)
int lock_to_openmode(struct file_lock *lock)
{
- return (lock->fl_type == F_WRLCK) ? O_WRONLY : O_RDONLY;
+ return lock_is_write(lock) ? O_WRONLY : O_RDONLY;
}
/*
@@ -181,18 +181,18 @@ static int nlm_unlock_files(struct nlm_file *file, const struct file_lock *fl)
struct file_lock lock;
locks_init_lock(&lock);
- lock.fl_type = F_UNLCK;
+ lock.c.flc_type = F_UNLCK;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
- lock.fl_owner = fl->fl_owner;
- lock.fl_pid = fl->fl_pid;
- lock.fl_flags = FL_POSIX;
+ lock.c.flc_owner = fl->c.flc_owner;
+ lock.c.flc_pid = fl->c.flc_pid;
+ lock.c.flc_flags = FL_POSIX;
- lock.fl_file = file->f_file[O_RDONLY];
- if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL))
+ lock.c.flc_file = file->f_file[O_RDONLY];
+ if (lock.c.flc_file && vfs_lock_file(lock.c.flc_file, F_SETLK, &lock, NULL))
goto out_err;
- lock.fl_file = file->f_file[O_WRONLY];
- if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL))
+ lock.c.flc_file = file->f_file[O_WRONLY];
+ if (lock.c.flc_file && vfs_lock_file(lock.c.flc_file, F_SETLK, &lock, NULL))
goto out_err;
return 0;
out_err:
@@ -218,14 +218,14 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
again:
file->f_locks = 0;
spin_lock(&flctx->flc_lock);
- list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+ for_each_file_lock(fl, &flctx->flc_posix) {
if (fl->fl_lmops != &nlmsvc_lock_operations)
continue;
/* update current lock count */
file->f_locks++;
- lockhost = ((struct nlm_lockowner *)fl->fl_owner)->host;
+ lockhost = ((struct nlm_lockowner *) fl->c.flc_owner)->host;
if (match(lockhost, host)) {
spin_unlock(&flctx->flc_lock);
@@ -272,7 +272,7 @@ nlm_file_inuse(struct nlm_file *file)
if (flctx && !list_empty_careful(&flctx->flc_posix)) {
spin_lock(&flctx->flc_lock);
- list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+ for_each_file_lock(fl, &flctx->flc_posix) {
if (fl->fl_lmops == &nlmsvc_lock_operations) {
spin_unlock(&flctx->flc_lock);
return 1;
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 2fb5748dae0c..adfcce2bf11b 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -88,8 +88,8 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
return false;
locks_init_lock(fl);
- fl->fl_flags = FL_POSIX;
- fl->fl_type = F_RDLCK;
+ fl->c.flc_flags = FL_POSIX;
+ fl->c.flc_type = F_RDLCK;
end = start + len - 1;
fl->fl_start = s32_to_loff_t(start);
if (len == 0 || end < 0)
@@ -107,7 +107,7 @@ svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock)
s32 start, len;
/* exclusive */
- if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0)
+ if (xdr_stream_encode_bool(xdr, fl->c.flc_type != F_RDLCK) < 0)
return false;
if (xdr_stream_encode_u32(xdr, lock->svid) < 0)
return false;
@@ -164,7 +164,7 @@ nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
if (exclusive)
- argp->lock.fl.fl_type = F_WRLCK;
+ argp->lock.fl.c.flc_type = F_WRLCK;
return true;
}
@@ -184,7 +184,7 @@ nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
if (exclusive)
- argp->lock.fl.fl_type = F_WRLCK;
+ argp->lock.fl.c.flc_type = F_WRLCK;
if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0)
return false;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
@@ -209,7 +209,7 @@ nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
if (exclusive)
- argp->lock.fl.fl_type = F_WRLCK;
+ argp->lock.fl.c.flc_type = F_WRLCK;
return true;
}
@@ -223,7 +223,7 @@ nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
- argp->lock.fl.fl_type = F_UNLCK;
+ argp->lock.fl.c.flc_type = F_UNLCK;
return true;
}
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 5fcbf30cd275..3d28b9c3ed15 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -89,8 +89,8 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
return false;
locks_init_lock(fl);
- fl->fl_flags = FL_POSIX;
- fl->fl_type = F_RDLCK;
+ fl->c.flc_flags = FL_POSIX;
+ fl->c.flc_type = F_RDLCK;
nlm4svc_set_file_lock_range(fl, lock->lock_start, lock->lock_len);
return true;
}
@@ -102,7 +102,7 @@ svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock)
s64 start, len;
/* exclusive */
- if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0)
+ if (xdr_stream_encode_bool(xdr, fl->c.flc_type != F_RDLCK) < 0)
return false;
if (xdr_stream_encode_u32(xdr, lock->svid) < 0)
return false;
@@ -159,7 +159,7 @@ nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
if (exclusive)
- argp->lock.fl.fl_type = F_WRLCK;
+ argp->lock.fl.c.flc_type = F_WRLCK;
return true;
}
@@ -179,7 +179,7 @@ nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
if (exclusive)
- argp->lock.fl.fl_type = F_WRLCK;
+ argp->lock.fl.c.flc_type = F_WRLCK;
if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0)
return false;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
@@ -204,7 +204,7 @@ nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
if (exclusive)
- argp->lock.fl.fl_type = F_WRLCK;
+ argp->lock.fl.c.flc_type = F_WRLCK;
return true;
}
@@ -218,7 +218,7 @@ nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
return false;
- argp->lock.fl.fl_type = F_UNLCK;
+ argp->lock.fl.c.flc_type = F_UNLCK;
return true;
}
diff --git a/fs/locks.c b/fs/locks.c
index cc7c117ee192..90c8746874de 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -48,7 +48,6 @@
* children.
*
*/
-
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
@@ -70,24 +69,28 @@
#include <linux/uaccess.h>
-#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
-#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
-#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
-#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
-#define IS_REMOTELCK(fl) (fl->fl_pid <= 0)
+static struct file_lock *file_lock(struct file_lock_core *flc)
+{
+ return container_of(flc, struct file_lock, c);
+}
+
+static struct file_lease *file_lease(struct file_lock_core *flc)
+{
+ return container_of(flc, struct file_lease, c);
+}
-static bool lease_breaking(struct file_lock *fl)
+static bool lease_breaking(struct file_lease *fl)
{
- return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
+ return fl->c.flc_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
}
-static int target_leasetype(struct file_lock *fl)
+static int target_leasetype(struct file_lease *fl)
{
- if (fl->fl_flags & FL_UNLOCK_PENDING)
+ if (fl->c.flc_flags & FL_UNLOCK_PENDING)
return F_UNLCK;
- if (fl->fl_flags & FL_DOWNGRADE_PENDING)
+ if (fl->c.flc_flags & FL_DOWNGRADE_PENDING)
return F_RDLCK;
- return fl->fl_type;
+ return fl->c.flc_type;
}
static int leases_enable = 1;
@@ -168,6 +171,7 @@ static DEFINE_SPINLOCK(blocked_lock_lock);
static struct kmem_cache *flctx_cache __ro_after_init;
static struct kmem_cache *filelock_cache __ro_after_init;
+static struct kmem_cache *filelease_cache __ro_after_init;
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
@@ -204,11 +208,12 @@ out:
static void
locks_dump_ctx_list(struct list_head *list, char *list_type)
{
- struct file_lock *fl;
+ struct file_lock_core *flc;
- list_for_each_entry(fl, list, fl_list) {
- pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
- }
+ list_for_each_entry(flc, list, flc_list)
+ pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
+ list_type, flc->flc_owner, flc->flc_flags,
+ flc->flc_type, flc->flc_pid);
}
static void
@@ -229,19 +234,19 @@ locks_check_ctx_lists(struct inode *inode)
}
static void
-locks_check_ctx_file_list(struct file *filp, struct list_head *list,
- char *list_type)
+locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_type)
{
- struct file_lock *fl;
+ struct file_lock_core *flc;
struct inode *inode = file_inode(filp);
- list_for_each_entry(fl, list, fl_list)
- if (fl->fl_file == filp)
+ list_for_each_entry(flc, list, flc_list)
+ if (flc->flc_file == filp)
pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx "
" fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
list_type, MAJOR(inode->i_sb->s_dev),
MINOR(inode->i_sb->s_dev), inode->i_ino,
- fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
+ flc->flc_owner, flc->flc_flags,
+ flc->flc_type, flc->flc_pid);
}
void
@@ -255,13 +260,13 @@ locks_free_lock_context(struct inode *inode)
}
}
-static void locks_init_lock_heads(struct file_lock *fl)
+static void locks_init_lock_heads(struct file_lock_core *flc)
{
- INIT_HLIST_NODE(&fl->fl_link);
- INIT_LIST_HEAD(&fl->fl_list);
- INIT_LIST_HEAD(&fl->fl_blocked_requests);
- INIT_LIST_HEAD(&fl->fl_blocked_member);
- init_waitqueue_head(&fl->fl_wait);
+ INIT_HLIST_NODE(&flc->flc_link);
+ INIT_LIST_HEAD(&flc->flc_list);
+ INIT_LIST_HEAD(&flc->flc_blocked_requests);
+ INIT_LIST_HEAD(&flc->flc_blocked_member);
+ init_waitqueue_head(&flc->flc_wait);
}
/* Allocate an empty lock structure. */
@@ -270,19 +275,33 @@ struct file_lock *locks_alloc_lock(void)
struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL);
if (fl)
- locks_init_lock_heads(fl);
+ locks_init_lock_heads(&fl->c);
return fl;
}
EXPORT_SYMBOL_GPL(locks_alloc_lock);
+/* Allocate an empty lock structure. */
+struct file_lease *locks_alloc_lease(void)
+{
+ struct file_lease *fl = kmem_cache_zalloc(filelease_cache, GFP_KERNEL);
+
+ if (fl)
+ locks_init_lock_heads(&fl->c);
+
+ return fl;
+}
+EXPORT_SYMBOL_GPL(locks_alloc_lease);
+
void locks_release_private(struct file_lock *fl)
{
- BUG_ON(waitqueue_active(&fl->fl_wait));
- BUG_ON(!list_empty(&fl->fl_list));
- BUG_ON(!list_empty(&fl->fl_blocked_requests));
- BUG_ON(!list_empty(&fl->fl_blocked_member));
- BUG_ON(!hlist_unhashed(&fl->fl_link));
+ struct file_lock_core *flc = &fl->c;
+
+ BUG_ON(waitqueue_active(&flc->flc_wait));
+ BUG_ON(!list_empty(&flc->flc_list));
+ BUG_ON(!list_empty(&flc->flc_blocked_requests));
+ BUG_ON(!list_empty(&flc->flc_blocked_member));
+ BUG_ON(!hlist_unhashed(&flc->flc_link));
if (fl->fl_ops) {
if (fl->fl_ops->fl_release_private)
@@ -292,8 +311,8 @@ void locks_release_private(struct file_lock *fl)
if (fl->fl_lmops) {
if (fl->fl_lmops->lm_put_owner) {
- fl->fl_lmops->lm_put_owner(fl->fl_owner);
- fl->fl_owner = NULL;
+ fl->fl_lmops->lm_put_owner(flc->flc_owner);
+ flc->flc_owner = NULL;
}
fl->fl_lmops = NULL;
}
@@ -309,16 +328,15 @@ EXPORT_SYMBOL_GPL(locks_release_private);
* %true: @owner has at least one blocker
* %false: @owner has no blockers
*/
-bool locks_owner_has_blockers(struct file_lock_context *flctx,
- fl_owner_t owner)
+bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner)
{
- struct file_lock *fl;
+ struct file_lock_core *flc;
spin_lock(&flctx->flc_lock);
- list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
- if (fl->fl_owner != owner)
+ list_for_each_entry(flc, &flctx->flc_posix, flc_list) {
+ if (flc->flc_owner != owner)
continue;
- if (!list_empty(&fl->fl_blocked_requests)) {
+ if (!list_empty(&flc->flc_blocked_requests)) {
spin_unlock(&flctx->flc_lock);
return true;
}
@@ -336,35 +354,52 @@ void locks_free_lock(struct file_lock *fl)
}
EXPORT_SYMBOL(locks_free_lock);
+/* Free a lease which is not in use. */
+void locks_free_lease(struct file_lease *fl)
+{
+ kmem_cache_free(filelease_cache, fl);
+}
+EXPORT_SYMBOL(locks_free_lease);
+
static void
locks_dispose_list(struct list_head *dispose)
{
- struct file_lock *fl;
+ struct file_lock_core *flc;
while (!list_empty(dispose)) {
- fl = list_first_entry(dispose, struct file_lock, fl_list);
- list_del_init(&fl->fl_list);
- locks_free_lock(fl);
+ flc = list_first_entry(dispose, struct file_lock_core, flc_list);
+ list_del_init(&flc->flc_list);
+ if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
+ locks_free_lease(file_lease(flc));
+ else
+ locks_free_lock(file_lock(flc));
}
}
void locks_init_lock(struct file_lock *fl)
{
memset(fl, 0, sizeof(struct file_lock));
- locks_init_lock_heads(fl);
+ locks_init_lock_heads(&fl->c);
}
EXPORT_SYMBOL(locks_init_lock);
+void locks_init_lease(struct file_lease *fl)
+{
+ memset(fl, 0, sizeof(*fl));
+ locks_init_lock_heads(&fl->c);
+}
+EXPORT_SYMBOL(locks_init_lease);
+
/*
* Initialize a new lock from an existing file_lock structure.
*/
void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
{
- new->fl_owner = fl->fl_owner;
- new->fl_pid = fl->fl_pid;
- new->fl_file = NULL;
- new->fl_flags = fl->fl_flags;
- new->fl_type = fl->fl_type;
+ new->c.flc_owner = fl->c.flc_owner;
+ new->c.flc_pid = fl->c.flc_pid;
+ new->c.flc_file = NULL;
+ new->c.flc_flags = fl->c.flc_flags;
+ new->c.flc_type = fl->c.flc_type;
new->fl_start = fl->fl_start;
new->fl_end = fl->fl_end;
new->fl_lmops = fl->fl_lmops;
@@ -372,7 +407,7 @@ void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
if (fl->fl_lmops) {
if (fl->fl_lmops->lm_get_owner)
- fl->fl_lmops->lm_get_owner(fl->fl_owner);
+ fl->fl_lmops->lm_get_owner(fl->c.flc_owner);
}
}
EXPORT_SYMBOL(locks_copy_conflock);
@@ -384,7 +419,7 @@ void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
locks_copy_conflock(new, fl);
- new->fl_file = fl->fl_file;
+ new->c.flc_file = fl->c.flc_file;
new->fl_ops = fl->fl_ops;
if (fl->fl_ops) {
@@ -400,15 +435,17 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
/*
* As ctx->flc_lock is held, new requests cannot be added to
- * ->fl_blocked_requests, so we don't need a lock to check if it
+ * ->flc_blocked_requests, so we don't need a lock to check if it
* is empty.
*/
- if (list_empty(&fl->fl_blocked_requests))
+ if (list_empty(&fl->c.flc_blocked_requests))
return;
spin_lock(&blocked_lock_lock);
- list_splice_init(&fl->fl_blocked_requests, &new->fl_blocked_requests);
- list_for_each_entry(f, &new->fl_blocked_requests, fl_blocked_member)
- f->fl_blocker = new;
+ list_splice_init(&fl->c.flc_blocked_requests,
+ &new->c.flc_blocked_requests);
+ list_for_each_entry(f, &new->c.flc_blocked_requests,
+ c.flc_blocked_member)
+ f->c.flc_blocker = &new->c;
spin_unlock(&blocked_lock_lock);
}
@@ -429,21 +466,21 @@ static void flock_make_lock(struct file *filp, struct file_lock *fl, int type)
{
locks_init_lock(fl);
- fl->fl_file = filp;
- fl->fl_owner = filp;
- fl->fl_pid = current->tgid;
- fl->fl_flags = FL_FLOCK;
- fl->fl_type = type;
+ fl->c.flc_file = filp;
+ fl->c.flc_owner = filp;
+ fl->c.flc_pid = current->tgid;
+ fl->c.flc_flags = FL_FLOCK;
+ fl->c.flc_type = type;
fl->fl_end = OFFSET_MAX;
}
-static int assign_type(struct file_lock *fl, int type)
+static int assign_type(struct file_lock_core *flc, int type)
{
switch (type) {
case F_RDLCK:
case F_WRLCK:
case F_UNLCK:
- fl->fl_type = type;
+ flc->flc_type = type;
break;
default:
return -EINVAL;
@@ -488,14 +525,14 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
} else
fl->fl_end = OFFSET_MAX;
- fl->fl_owner = current->files;
- fl->fl_pid = current->tgid;
- fl->fl_file = filp;
- fl->fl_flags = FL_POSIX;
+ fl->c.flc_owner = current->files;
+ fl->c.flc_pid = current->tgid;
+ fl->c.flc_file = filp;
+ fl->c.flc_flags = FL_POSIX;
fl->fl_ops = NULL;
fl->fl_lmops = NULL;
- return assign_type(fl, l->l_type);
+ return assign_type(&fl->c, l->l_type);
}
/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
@@ -516,16 +553,16 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
/* default lease lock manager operations */
static bool
-lease_break_callback(struct file_lock *fl)
+lease_break_callback(struct file_lease *fl)
{
kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG);
return false;
}
static void
-lease_setup(struct file_lock *fl, void **priv)
+lease_setup(struct file_lease *fl, void **priv)
{
- struct file *filp = fl->fl_file;
+ struct file *filp = fl->c.flc_file;
struct fasync_struct *fa = *priv;
/*
@@ -539,7 +576,7 @@ lease_setup(struct file_lock *fl, void **priv)
__f_setown(filp, task_pid(current), PIDTYPE_TGID, 0);
}
-static const struct lock_manager_operations lease_manager_ops = {
+static const struct lease_manager_operations lease_manager_ops = {
.lm_break = lease_break_callback,
.lm_change = lease_modify,
.lm_setup = lease_setup,
@@ -548,27 +585,24 @@ static const struct lock_manager_operations lease_manager_ops = {
/*
* Initialize a lease, use the default lock manager operations
*/
-static int lease_init(struct file *filp, int type, struct file_lock *fl)
+static int lease_init(struct file *filp, int type, struct file_lease *fl)
{
- if (assign_type(fl, type) != 0)
+ if (assign_type(&fl->c, type) != 0)
return -EINVAL;
- fl->fl_owner = filp;
- fl->fl_pid = current->tgid;
+ fl->c.flc_owner = filp;
+ fl->c.flc_pid = current->tgid;
- fl->fl_file = filp;
- fl->fl_flags = FL_LEASE;
- fl->fl_start = 0;
- fl->fl_end = OFFSET_MAX;
- fl->fl_ops = NULL;
+ fl->c.flc_file = filp;
+ fl->c.flc_flags = FL_LEASE;
fl->fl_lmops = &lease_manager_ops;
return 0;
}
/* Allocate a file_lock initialised to this type of lease */
-static struct file_lock *lease_alloc(struct file *filp, int type)
+static struct file_lease *lease_alloc(struct file *filp, int type)
{
- struct file_lock *fl = locks_alloc_lock();
+ struct file_lease *fl = locks_alloc_lease();
int error = -ENOMEM;
if (fl == NULL)
@@ -576,7 +610,7 @@ static struct file_lock *lease_alloc(struct file *filp, int type)
error = lease_init(filp, type, fl);
if (error) {
- locks_free_lock(fl);
+ locks_free_lease(fl);
return ERR_PTR(error);
}
return fl;
@@ -593,26 +627,26 @@ static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2)
/*
* Check whether two locks have the same owner.
*/
-static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
+static int posix_same_owner(struct file_lock_core *fl1, struct file_lock_core *fl2)
{
- return fl1->fl_owner == fl2->fl_owner;
+ return fl1->flc_owner == fl2->flc_owner;
}
/* Must be called with the flc_lock held! */
-static void locks_insert_global_locks(struct file_lock *fl)
+static void locks_insert_global_locks(struct file_lock_core *flc)
{
struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list);
percpu_rwsem_assert_held(&file_rwsem);
spin_lock(&fll->lock);
- fl->fl_link_cpu = smp_processor_id();
- hlist_add_head(&fl->fl_link, &fll->hlist);
+ flc->flc_link_cpu = smp_processor_id();
+ hlist_add_head(&flc->flc_link, &fll->hlist);
spin_unlock(&fll->lock);
}
/* Must be called with the flc_lock held! */
-static void locks_delete_global_locks(struct file_lock *fl)
+static void locks_delete_global_locks(struct file_lock_core *flc)
{
struct file_lock_list_struct *fll;
@@ -623,33 +657,33 @@ static void locks_delete_global_locks(struct file_lock *fl)
* is done while holding the flc_lock, and new insertions into the list
* also require that it be held.
*/
- if (hlist_unhashed(&fl->fl_link))
+ if (hlist_unhashed(&flc->flc_link))
return;
- fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu);
+ fll = per_cpu_ptr(&file_lock_list, flc->flc_link_cpu);
spin_lock(&fll->lock);
- hlist_del_init(&fl->fl_link);
+ hlist_del_init(&flc->flc_link);
spin_unlock(&fll->lock);
}
static unsigned long
-posix_owner_key(struct file_lock *fl)
+posix_owner_key(struct file_lock_core *flc)
{
- return (unsigned long)fl->fl_owner;
+ return (unsigned long) flc->flc_owner;
}
-static void locks_insert_global_blocked(struct file_lock *waiter)
+static void locks_insert_global_blocked(struct file_lock_core *waiter)
{
lockdep_assert_held(&blocked_lock_lock);
- hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
+ hash_add(blocked_hash, &waiter->flc_link, posix_owner_key(waiter));
}
-static void locks_delete_global_blocked(struct file_lock *waiter)
+static void locks_delete_global_blocked(struct file_lock_core *waiter)
{
lockdep_assert_held(&blocked_lock_lock);
- hash_del(&waiter->fl_link);
+ hash_del(&waiter->flc_link);
}
/* Remove waiter from blocker's block list.
@@ -657,41 +691,39 @@ static void locks_delete_global_blocked(struct file_lock *waiter)
*
* Must be called with blocked_lock_lock held.
*/
-static void __locks_delete_block(struct file_lock *waiter)
+static void __locks_unlink_block(struct file_lock_core *waiter)
{
locks_delete_global_blocked(waiter);
- list_del_init(&waiter->fl_blocked_member);
+ list_del_init(&waiter->flc_blocked_member);
}
-static void __locks_wake_up_blocks(struct file_lock *blocker)
+static void __locks_wake_up_blocks(struct file_lock_core *blocker)
{
- while (!list_empty(&blocker->fl_blocked_requests)) {
- struct file_lock *waiter;
+ while (!list_empty(&blocker->flc_blocked_requests)) {
+ struct file_lock_core *waiter;
+ struct file_lock *fl;
- waiter = list_first_entry(&blocker->fl_blocked_requests,
- struct file_lock, fl_blocked_member);
- __locks_delete_block(waiter);
- if (waiter->fl_lmops && waiter->fl_lmops->lm_notify)
- waiter->fl_lmops->lm_notify(waiter);
+ waiter = list_first_entry(&blocker->flc_blocked_requests,
+ struct file_lock_core, flc_blocked_member);
+
+ fl = file_lock(waiter);
+ __locks_unlink_block(waiter);
+ if ((waiter->flc_flags & (FL_POSIX | FL_FLOCK)) &&
+ fl->fl_lmops && fl->fl_lmops->lm_notify)
+ fl->fl_lmops->lm_notify(fl);
else
- wake_up(&waiter->fl_wait);
+ locks_wake_up(fl);
/*
- * The setting of fl_blocker to NULL marks the "done"
+ * The setting of flc_blocker to NULL marks the "done"
* point in deleting a block. Paired with acquire at the top
* of locks_delete_block().
*/
- smp_store_release(&waiter->fl_blocker, NULL);
+ smp_store_release(&waiter->flc_blocker, NULL);
}
}
-/**
- * locks_delete_block - stop waiting for a file lock
- * @waiter: the lock which was waiting
- *
- * lockd/nfsd need to disconnect the lock while working on it.
- */
-int locks_delete_block(struct file_lock *waiter)
+static int __locks_delete_block(struct file_lock_core *waiter)
{
int status = -ENOENT;
@@ -716,24 +748,35 @@ int locks_delete_block(struct file_lock *waiter)
* no new locks can be inserted into its fl_blocked_requests list, and
* can avoid doing anything further if the list is empty.
*/
- if (!smp_load_acquire(&waiter->fl_blocker) &&
- list_empty(&waiter->fl_blocked_requests))
+ if (!smp_load_acquire(&waiter->flc_blocker) &&
+ list_empty(&waiter->flc_blocked_requests))
return status;
spin_lock(&blocked_lock_lock);
- if (waiter->fl_blocker)
+ if (waiter->flc_blocker)
status = 0;
__locks_wake_up_blocks(waiter);
- __locks_delete_block(waiter);
+ __locks_unlink_block(waiter);
/*
* The setting of fl_blocker to NULL marks the "done" point in deleting
* a block. Paired with acquire at the top of this function.
*/
- smp_store_release(&waiter->fl_blocker, NULL);
+ smp_store_release(&waiter->flc_blocker, NULL);
spin_unlock(&blocked_lock_lock);
return status;
}
+
+/**
+ * locks_delete_block - stop waiting for a file lock
+ * @waiter: the lock which was waiting
+ *
+ * lockd/nfsd need to disconnect the lock while working on it.
+ */
+int locks_delete_block(struct file_lock *waiter)
+{
+ return __locks_delete_block(&waiter->c);
+}
EXPORT_SYMBOL(locks_delete_block);
/* Insert waiter into blocker's block list.
@@ -751,26 +794,28 @@ EXPORT_SYMBOL(locks_delete_block);
* waiters, and add beneath any waiter that blocks the new waiter.
* Thus wakeups don't happen until needed.
*/
-static void __locks_insert_block(struct file_lock *blocker,
- struct file_lock *waiter,
- bool conflict(struct file_lock *,
- struct file_lock *))
+static void __locks_insert_block(struct file_lock_core *blocker,
+ struct file_lock_core *waiter,
+ bool conflict(struct file_lock_core *,
+ struct file_lock_core *))
{
- struct file_lock *fl;
- BUG_ON(!list_empty(&waiter->fl_blocked_member));
+ struct file_lock_core *flc;
+ BUG_ON(!list_empty(&waiter->flc_blocked_member));
new_blocker:
- list_for_each_entry(fl, &blocker->fl_blocked_requests, fl_blocked_member)
- if (conflict(fl, waiter)) {
- blocker = fl;
+ list_for_each_entry(flc, &blocker->flc_blocked_requests, flc_blocked_member)
+ if (conflict(flc, waiter)) {
+ blocker = flc;
goto new_blocker;
}
- waiter->fl_blocker = blocker;
- list_add_tail(&waiter->fl_blocked_member, &blocker->fl_blocked_requests);
- if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
+ waiter->flc_blocker = blocker;
+ list_add_tail(&waiter->flc_blocked_member,
+ &blocker->flc_blocked_requests);
+
+ if ((blocker->flc_flags & (FL_POSIX|FL_OFDLCK)) == FL_POSIX)
locks_insert_global_blocked(waiter);
- /* The requests in waiter->fl_blocked are known to conflict with
+ /* The requests in waiter->flc_blocked are known to conflict with
* waiter, but might not conflict with blocker, or the requests
* and lock which block it. So they all need to be woken.
*/
@@ -778,10 +823,10 @@ new_blocker:
}
/* Must be called with flc_lock held. */
-static void locks_insert_block(struct file_lock *blocker,
- struct file_lock *waiter,
- bool conflict(struct file_lock *,
- struct file_lock *))
+static void locks_insert_block(struct file_lock_core *blocker,
+ struct file_lock_core *waiter,
+ bool conflict(struct file_lock_core *,
+ struct file_lock_core *))
{
spin_lock(&blocked_lock_lock);
__locks_insert_block(blocker, waiter, conflict);
@@ -793,7 +838,7 @@ static void locks_insert_block(struct file_lock *blocker,
*
* Must be called with the inode->flc_lock held!
*/
-static void locks_wake_up_blocks(struct file_lock *blocker)
+static void locks_wake_up_blocks(struct file_lock_core *blocker)
{
/*
* Avoid taking global lock if list is empty. This is safe since new
@@ -802,7 +847,7 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
* fl_blocked_requests list does not require the flc_lock, so we must
* recheck list_empty() after acquiring the blocked_lock_lock.
*/
- if (list_empty(&blocker->fl_blocked_requests))
+ if (list_empty(&blocker->flc_blocked_requests))
return;
spin_lock(&blocked_lock_lock);
@@ -811,39 +856,39 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
}
static void
-locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before)
+locks_insert_lock_ctx(struct file_lock_core *fl, struct list_head *before)
{
- list_add_tail(&fl->fl_list, before);
+ list_add_tail(&fl->flc_list, before);
locks_insert_global_locks(fl);
}
static void
-locks_unlink_lock_ctx(struct file_lock *fl)
+locks_unlink_lock_ctx(struct file_lock_core *fl)
{
locks_delete_global_locks(fl);
- list_del_init(&fl->fl_list);
+ list_del_init(&fl->flc_list);
locks_wake_up_blocks(fl);
}
static void
-locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose)
+locks_delete_lock_ctx(struct file_lock_core *fl, struct list_head *dispose)
{
locks_unlink_lock_ctx(fl);
if (dispose)
- list_add(&fl->fl_list, dispose);
+ list_add(&fl->flc_list, dispose);
else
- locks_free_lock(fl);
+ locks_free_lock(file_lock(fl));
}
/* Determine if lock sys_fl blocks lock caller_fl. Common functionality
* checks for shared/exclusive status of overlapping locks.
*/
-static bool locks_conflict(struct file_lock *caller_fl,
- struct file_lock *sys_fl)
+static bool locks_conflict(struct file_lock_core *caller_flc,
+ struct file_lock_core *sys_flc)
{
- if (sys_fl->fl_type == F_WRLCK)
+ if (sys_flc->flc_type == F_WRLCK)
return true;
- if (caller_fl->fl_type == F_WRLCK)
+ if (caller_flc->flc_type == F_WRLCK)
return true;
return false;
}
@@ -851,20 +896,23 @@ static bool locks_conflict(struct file_lock *caller_fl,
/* Determine if lock sys_fl blocks lock caller_fl. POSIX specific
* checking before calling the locks_conflict().
*/
-static bool posix_locks_conflict(struct file_lock *caller_fl,
- struct file_lock *sys_fl)
+static bool posix_locks_conflict(struct file_lock_core *caller_flc,
+ struct file_lock_core *sys_flc)
{
+ struct file_lock *caller_fl = file_lock(caller_flc);
+ struct file_lock *sys_fl = file_lock(sys_flc);
+
/* POSIX locks owned by the same process do not conflict with
* each other.
*/
- if (posix_same_owner(caller_fl, sys_fl))
+ if (posix_same_owner(caller_flc, sys_flc))
return false;
/* Check whether they overlap */
if (!locks_overlap(caller_fl, sys_fl))
return false;
- return locks_conflict(caller_fl, sys_fl);
+ return locks_conflict(caller_flc, sys_flc);
}
/* Determine if lock sys_fl blocks lock caller_fl. Used on xx_GETLK
@@ -873,28 +921,31 @@ static bool posix_locks_conflict(struct file_lock *caller_fl,
static bool posix_test_locks_conflict(struct file_lock *caller_fl,
struct file_lock *sys_fl)
{
+ struct file_lock_core *caller = &caller_fl->c;
+ struct file_lock_core *sys = &sys_fl->c;
+
/* F_UNLCK checks any locks on the same fd. */
- if (caller_fl->fl_type == F_UNLCK) {
- if (!posix_same_owner(caller_fl, sys_fl))
+ if (lock_is_unlock(caller_fl)) {
+ if (!posix_same_owner(caller, sys))
return false;
return locks_overlap(caller_fl, sys_fl);
}
- return posix_locks_conflict(caller_fl, sys_fl);
+ return posix_locks_conflict(caller, sys);
}
/* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific
* checking before calling the locks_conflict().
*/
-static bool flock_locks_conflict(struct file_lock *caller_fl,
- struct file_lock *sys_fl)
+static bool flock_locks_conflict(struct file_lock_core *caller_flc,
+ struct file_lock_core *sys_flc)
{
/* FLOCK locks referring to the same filp do not conflict with
* each other.
*/
- if (caller_fl->fl_file == sys_fl->fl_file)
+ if (caller_flc->flc_file == sys_flc->flc_file)
return false;
- return locks_conflict(caller_fl, sys_fl);
+ return locks_conflict(caller_flc, sys_flc);
}
void
@@ -908,13 +959,13 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
ctx = locks_inode_context(inode);
if (!ctx || list_empty_careful(&ctx->flc_posix)) {
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
return;
}
retry:
spin_lock(&ctx->flc_lock);
- list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
+ list_for_each_entry(cfl, &ctx->flc_posix, c.flc_list) {
if (!posix_test_locks_conflict(fl, cfl))
continue;
if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable
@@ -930,7 +981,7 @@ retry:
locks_copy_conflock(fl, cfl);
goto out;
}
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
out:
spin_unlock(&ctx->flc_lock);
return;
@@ -972,25 +1023,27 @@ EXPORT_SYMBOL(posix_test_lock);
#define MAX_DEADLK_ITERATIONS 10
-/* Find a lock that the owner of the given block_fl is blocking on. */
-static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
+/* Find a lock that the owner of the given @blocker is blocking on. */
+static struct file_lock_core *what_owner_is_waiting_for(struct file_lock_core *blocker)
{
- struct file_lock *fl;
+ struct file_lock_core *flc;
- hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
- if (posix_same_owner(fl, block_fl)) {
- while (fl->fl_blocker)
- fl = fl->fl_blocker;
- return fl;
+ hash_for_each_possible(blocked_hash, flc, flc_link, posix_owner_key(blocker)) {
+ if (posix_same_owner(flc, blocker)) {
+ while (flc->flc_blocker)
+ flc = flc->flc_blocker;
+ return flc;
}
}
return NULL;
}
/* Must be called with the blocked_lock_lock held! */
-static int posix_locks_deadlock(struct file_lock *caller_fl,
- struct file_lock *block_fl)
+static bool posix_locks_deadlock(struct file_lock *caller_fl,
+ struct file_lock *block_fl)
{
+ struct file_lock_core *caller = &caller_fl->c;
+ struct file_lock_core *blocker = &block_fl->c;
int i = 0;
lockdep_assert_held(&blocked_lock_lock);
@@ -999,16 +1052,16 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
* This deadlock detector can't reasonably detect deadlocks with
* FL_OFDLCK locks, since they aren't owned by a process, per-se.
*/
- if (IS_OFDLCK(caller_fl))
- return 0;
+ if (caller->flc_flags & FL_OFDLCK)
+ return false;
- while ((block_fl = what_owner_is_waiting_for(block_fl))) {
+ while ((blocker = what_owner_is_waiting_for(blocker))) {
if (i++ > MAX_DEADLK_ITERATIONS)
- return 0;
- if (posix_same_owner(caller_fl, block_fl))
- return 1;
+ return false;
+ if (posix_same_owner(caller, blocker))
+ return true;
}
- return 0;
+ return false;
}
/* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
@@ -1027,14 +1080,14 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
bool found = false;
LIST_HEAD(dispose);
- ctx = locks_get_lock_context(inode, request->fl_type);
+ ctx = locks_get_lock_context(inode, request->c.flc_type);
if (!ctx) {
- if (request->fl_type != F_UNLCK)
+ if (request->c.flc_type != F_UNLCK)
return -ENOMEM;
- return (request->fl_flags & FL_EXISTS) ? -ENOENT : 0;
+ return (request->c.flc_flags & FL_EXISTS) ? -ENOENT : 0;
}
- if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
+ if (!(request->c.flc_flags & FL_ACCESS) && (request->c.flc_type != F_UNLCK)) {
new_fl = locks_alloc_lock();
if (!new_fl)
return -ENOMEM;
@@ -1042,41 +1095,41 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
- if (request->fl_flags & FL_ACCESS)
+ if (request->c.flc_flags & FL_ACCESS)
goto find_conflict;
- list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
- if (request->fl_file != fl->fl_file)
+ list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) {
+ if (request->c.flc_file != fl->c.flc_file)
continue;
- if (request->fl_type == fl->fl_type)
+ if (request->c.flc_type == fl->c.flc_type)
goto out;
found = true;
- locks_delete_lock_ctx(fl, &dispose);
+ locks_delete_lock_ctx(&fl->c, &dispose);
break;
}
- if (request->fl_type == F_UNLCK) {
- if ((request->fl_flags & FL_EXISTS) && !found)
+ if (lock_is_unlock(request)) {
+ if ((request->c.flc_flags & FL_EXISTS) && !found)
error = -ENOENT;
goto out;
}
find_conflict:
- list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
- if (!flock_locks_conflict(request, fl))
+ list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) {
+ if (!flock_locks_conflict(&request->c, &fl->c))
continue;
error = -EAGAIN;
- if (!(request->fl_flags & FL_SLEEP))
+ if (!(request->c.flc_flags & FL_SLEEP))
goto out;
error = FILE_LOCK_DEFERRED;
- locks_insert_block(fl, request, flock_locks_conflict);
+ locks_insert_block(&fl->c, &request->c, flock_locks_conflict);
goto out;
}
- if (request->fl_flags & FL_ACCESS)
+ if (request->c.flc_flags & FL_ACCESS)
goto out;
locks_copy_lock(new_fl, request);
locks_move_blocks(new_fl, request);
- locks_insert_lock_ctx(new_fl, &ctx->flc_flock);
+ locks_insert_lock_ctx(&new_fl->c, &ctx->flc_flock);
new_fl = NULL;
error = 0;
@@ -1105,9 +1158,9 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
void *owner;
void (*func)(void);
- ctx = locks_get_lock_context(inode, request->fl_type);
+ ctx = locks_get_lock_context(inode, request->c.flc_type);
if (!ctx)
- return (request->fl_type == F_UNLCK) ? 0 : -ENOMEM;
+ return lock_is_unlock(request) ? 0 : -ENOMEM;
/*
* We may need two file_lock structures for this operation,
@@ -1115,8 +1168,8 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
*
* In some cases we can be sure, that no new locks will be needed
*/
- if (!(request->fl_flags & FL_ACCESS) &&
- (request->fl_type != F_UNLCK ||
+ if (!(request->c.flc_flags & FL_ACCESS) &&
+ (request->c.flc_type != F_UNLCK ||
request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
new_fl = locks_alloc_lock();
new_fl2 = locks_alloc_lock();
@@ -1130,9 +1183,9 @@ retry:
* there are any, either return error or put the request on the
* blocker's list of waiters and the global blocked_hash.
*/
- if (request->fl_type != F_UNLCK) {
- list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
- if (!posix_locks_conflict(request, fl))
+ if (request->c.flc_type != F_UNLCK) {
+ list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) {
+ if (!posix_locks_conflict(&request->c, &fl->c))
continue;
if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable
&& (*fl->fl_lmops->lm_lock_expirable)(fl)) {
@@ -1148,7 +1201,7 @@ retry:
if (conflock)
locks_copy_conflock(conflock, fl);
error = -EAGAIN;
- if (!(request->fl_flags & FL_SLEEP))
+ if (!(request->c.flc_flags & FL_SLEEP))
goto out;
/*
* Deadlock detection and insertion into the blocked
@@ -1160,10 +1213,10 @@ retry:
* Ensure that we don't find any locks blocked on this
* request during deadlock detection.
*/
- __locks_wake_up_blocks(request);
+ __locks_wake_up_blocks(&request->c);
if (likely(!posix_locks_deadlock(request, fl))) {
error = FILE_LOCK_DEFERRED;
- __locks_insert_block(fl, request,
+ __locks_insert_block(&fl->c, &request->c,
posix_locks_conflict);
}
spin_unlock(&blocked_lock_lock);
@@ -1173,22 +1226,22 @@ retry:
/* If we're just looking for a conflict, we're done. */
error = 0;
- if (request->fl_flags & FL_ACCESS)
+ if (request->c.flc_flags & FL_ACCESS)
goto out;
/* Find the first old lock with the same owner as the new lock */
- list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
- if (posix_same_owner(request, fl))
+ list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) {
+ if (posix_same_owner(&request->c, &fl->c))
break;
}
/* Process locks with this owner. */
- list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) {
- if (!posix_same_owner(request, fl))
+ list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, c.flc_list) {
+ if (!posix_same_owner(&request->c, &fl->c))
break;
/* Detect adjacent or overlapping regions (if same lock type) */
- if (request->fl_type == fl->fl_type) {
+ if (request->c.flc_type == fl->c.flc_type) {
/* In all comparisons of start vs end, use
* "start - 1" rather than "end + 1". If end
* is OFFSET_MAX, end + 1 will become negative.
@@ -1215,7 +1268,7 @@ retry:
else
request->fl_end = fl->fl_end;
if (added) {
- locks_delete_lock_ctx(fl, &dispose);
+ locks_delete_lock_ctx(&fl->c, &dispose);
continue;
}
request = fl;
@@ -1228,7 +1281,7 @@ retry:
continue;
if (fl->fl_start > request->fl_end)
break;
- if (request->fl_type == F_UNLCK)
+ if (lock_is_unlock(request))
added = true;
if (fl->fl_start < request->fl_start)
left = fl;
@@ -1244,7 +1297,7 @@ retry:
* one (This may happen several times).
*/
if (added) {
- locks_delete_lock_ctx(fl, &dispose);
+ locks_delete_lock_ctx(&fl->c, &dispose);
continue;
}
/*
@@ -1261,8 +1314,9 @@ retry:
locks_move_blocks(new_fl, request);
request = new_fl;
new_fl = NULL;
- locks_insert_lock_ctx(request, &fl->fl_list);
- locks_delete_lock_ctx(fl, &dispose);
+ locks_insert_lock_ctx(&request->c,
+ &fl->c.flc_list);
+ locks_delete_lock_ctx(&fl->c, &dispose);
added = true;
}
}
@@ -1279,8 +1333,8 @@ retry:
error = 0;
if (!added) {
- if (request->fl_type == F_UNLCK) {
- if (request->fl_flags & FL_EXISTS)
+ if (lock_is_unlock(request)) {
+ if (request->c.flc_flags & FL_EXISTS)
error = -ENOENT;
goto out;
}
@@ -1291,7 +1345,7 @@ retry:
}
locks_copy_lock(new_fl, request);
locks_move_blocks(new_fl, request);
- locks_insert_lock_ctx(new_fl, &fl->fl_list);
+ locks_insert_lock_ctx(&new_fl->c, &fl->c.flc_list);
fl = new_fl;
new_fl = NULL;
}
@@ -1303,14 +1357,14 @@ retry:
left = new_fl2;
new_fl2 = NULL;
locks_copy_lock(left, right);
- locks_insert_lock_ctx(left, &fl->fl_list);
+ locks_insert_lock_ctx(&left->c, &fl->c.flc_list);
}
right->fl_start = request->fl_end + 1;
- locks_wake_up_blocks(right);
+ locks_wake_up_blocks(&right->c);
}
if (left) {
left->fl_end = request->fl_start - 1;
- locks_wake_up_blocks(left);
+ locks_wake_up_blocks(&left->c);
}
out:
spin_unlock(&ctx->flc_lock);
@@ -1364,8 +1418,8 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
error = posix_lock_inode(inode, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait,
- list_empty(&fl->fl_blocked_member));
+ error = wait_event_interruptible(fl->c.flc_wait,
+ list_empty(&fl->c.flc_blocked_member));
if (error)
break;
}
@@ -1373,37 +1427,37 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
return error;
}
-static void lease_clear_pending(struct file_lock *fl, int arg)
+static void lease_clear_pending(struct file_lease *fl, int arg)
{
switch (arg) {
case F_UNLCK:
- fl->fl_flags &= ~FL_UNLOCK_PENDING;
+ fl->c.flc_flags &= ~FL_UNLOCK_PENDING;
fallthrough;
case F_RDLCK:
- fl->fl_flags &= ~FL_DOWNGRADE_PENDING;
+ fl->c.flc_flags &= ~FL_DOWNGRADE_PENDING;
}
}
/* We already had a lease on this file; just change its type */
-int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
+int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose)
{
- int error = assign_type(fl, arg);
+ int error = assign_type(&fl->c, arg);
if (error)
return error;
lease_clear_pending(fl, arg);
- locks_wake_up_blocks(fl);
+ locks_wake_up_blocks(&fl->c);
if (arg == F_UNLCK) {
- struct file *filp = fl->fl_file;
+ struct file *filp = fl->c.flc_file;
f_delown(filp);
filp->f_owner.signum = 0;
- fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync);
+ fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync);
if (fl->fl_fasync != NULL) {
printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
fl->fl_fasync = NULL;
}
- locks_delete_lock_ctx(fl, dispose);
+ locks_delete_lock_ctx(&fl->c, dispose);
}
return 0;
}
@@ -1420,11 +1474,11 @@ static bool past_time(unsigned long then)
static void time_out_leases(struct inode *inode, struct list_head *dispose)
{
struct file_lock_context *ctx = inode->i_flctx;
- struct file_lock *fl, *tmp;
+ struct file_lease *fl, *tmp;
lockdep_assert_held(&ctx->flc_lock);
- list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
+ list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) {
trace_time_out_leases(inode, fl);
if (past_time(fl->fl_downgrade_time))
lease_modify(fl, F_RDLCK, dispose);
@@ -1433,38 +1487,40 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
}
}
-static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
+static bool leases_conflict(struct file_lock_core *lc, struct file_lock_core *bc)
{
bool rc;
+ struct file_lease *lease = file_lease(lc);
+ struct file_lease *breaker = file_lease(bc);
if (lease->fl_lmops->lm_breaker_owns_lease
&& lease->fl_lmops->lm_breaker_owns_lease(lease))
return false;
- if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) {
+ if ((bc->flc_flags & FL_LAYOUT) != (lc->flc_flags & FL_LAYOUT)) {
rc = false;
goto trace;
}
- if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) {
+ if ((bc->flc_flags & FL_DELEG) && (lc->flc_flags & FL_LEASE)) {
rc = false;
goto trace;
}
- rc = locks_conflict(breaker, lease);
+ rc = locks_conflict(bc, lc);
trace:
trace_leases_conflict(rc, lease, breaker);
return rc;
}
static bool
-any_leases_conflict(struct inode *inode, struct file_lock *breaker)
+any_leases_conflict(struct inode *inode, struct file_lease *breaker)
{
struct file_lock_context *ctx = inode->i_flctx;
- struct file_lock *fl;
+ struct file_lock_core *flc;
lockdep_assert_held(&ctx->flc_lock);
- list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
- if (leases_conflict(fl, breaker))
+ list_for_each_entry(flc, &ctx->flc_lease, flc_list) {
+ if (leases_conflict(flc, &breaker->c))
return true;
}
return false;
@@ -1487,7 +1543,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
int error = 0;
struct file_lock_context *ctx;
- struct file_lock *new_fl, *fl, *tmp;
+ struct file_lease *new_fl, *fl, *tmp;
unsigned long break_time;
int want_write = (mode & O_ACCMODE) != O_RDONLY;
LIST_HEAD(dispose);
@@ -1495,7 +1551,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
if (IS_ERR(new_fl))
return PTR_ERR(new_fl);
- new_fl->fl_flags = type;
+ new_fl->c.flc_flags = type;
/* typically we will check that ctx is non-NULL before calling */
ctx = locks_inode_context(inode);
@@ -1519,22 +1575,22 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
break_time++; /* so that 0 means no break time */
}
- list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
- if (!leases_conflict(fl, new_fl))
+ list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) {
+ if (!leases_conflict(&fl->c, &new_fl->c))
continue;
if (want_write) {
- if (fl->fl_flags & FL_UNLOCK_PENDING)
+ if (fl->c.flc_flags & FL_UNLOCK_PENDING)
continue;
- fl->fl_flags |= FL_UNLOCK_PENDING;
+ fl->c.flc_flags |= FL_UNLOCK_PENDING;
fl->fl_break_time = break_time;
} else {
if (lease_breaking(fl))
continue;
- fl->fl_flags |= FL_DOWNGRADE_PENDING;
+ fl->c.flc_flags |= FL_DOWNGRADE_PENDING;
fl->fl_downgrade_time = break_time;
}
if (fl->fl_lmops->lm_break(fl))
- locks_delete_lock_ctx(fl, &dispose);
+ locks_delete_lock_ctx(&fl->c, &dispose);
}
if (list_empty(&ctx->flc_lease))
@@ -1547,26 +1603,26 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
}
restart:
- fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list);
+ fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list);
break_time = fl->fl_break_time;
if (break_time != 0)
break_time -= jiffies;
if (break_time == 0)
break_time++;
- locks_insert_block(fl, new_fl, leases_conflict);
+ locks_insert_block(&fl->c, &new_fl->c, leases_conflict);
trace_break_lease_block(inode, new_fl);
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
- error = wait_event_interruptible_timeout(new_fl->fl_wait,
- list_empty(&new_fl->fl_blocked_member),
- break_time);
+ error = wait_event_interruptible_timeout(new_fl->c.flc_wait,
+ list_empty(&new_fl->c.flc_blocked_member),
+ break_time);
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
trace_break_lease_unblock(inode, new_fl);
- locks_delete_block(new_fl);
+ __locks_delete_block(&new_fl->c);
if (error >= 0) {
/*
* Wait for the next conflicting lease that has not been
@@ -1583,7 +1639,7 @@ out:
percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
free_lock:
- locks_free_lock(new_fl);
+ locks_free_lease(new_fl);
return error;
}
EXPORT_SYMBOL(__break_lease);
@@ -1601,14 +1657,14 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time)
{
bool has_lease = false;
struct file_lock_context *ctx;
- struct file_lock *fl;
+ struct file_lock_core *flc;
ctx = locks_inode_context(inode);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
spin_lock(&ctx->flc_lock);
- fl = list_first_entry_or_null(&ctx->flc_lease,
- struct file_lock, fl_list);
- if (fl && (fl->fl_type == F_WRLCK))
+ flc = list_first_entry_or_null(&ctx->flc_lease,
+ struct file_lock_core, flc_list);
+ if (flc && flc->flc_type == F_WRLCK)
has_lease = true;
spin_unlock(&ctx->flc_lock);
}
@@ -1643,7 +1699,7 @@ EXPORT_SYMBOL(lease_get_mtime);
*/
int fcntl_getlease(struct file *filp)
{
- struct file_lock *fl;
+ struct file_lease *fl;
struct inode *inode = file_inode(filp);
struct file_lock_context *ctx;
int type = F_UNLCK;
@@ -1654,8 +1710,8 @@ int fcntl_getlease(struct file *filp)
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
- list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
- if (fl->fl_file != filp)
+ list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
+ if (fl->c.flc_file != filp)
continue;
type = target_leasetype(fl);
break;
@@ -1715,12 +1771,12 @@ check_conflicting_open(struct file *filp, const int arg, int flags)
}
static int
-generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **priv)
+generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **priv)
{
- struct file_lock *fl, *my_fl = NULL, *lease;
+ struct file_lease *fl, *my_fl = NULL, *lease;
struct inode *inode = file_inode(filp);
struct file_lock_context *ctx;
- bool is_deleg = (*flp)->fl_flags & FL_DELEG;
+ bool is_deleg = (*flp)->c.flc_flags & FL_DELEG;
int error;
LIST_HEAD(dispose);
@@ -1746,7 +1802,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
- error = check_conflicting_open(filp, arg, lease->fl_flags);
+ error = check_conflicting_open(filp, arg, lease->c.flc_flags);
if (error)
goto out;
@@ -1759,9 +1815,9 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
* except for this filp.
*/
error = -EAGAIN;
- list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
- if (fl->fl_file == filp &&
- fl->fl_owner == lease->fl_owner) {
+ list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
+ if (fl->c.flc_file == filp &&
+ fl->c.flc_owner == lease->c.flc_owner) {
my_fl = fl;
continue;
}
@@ -1776,7 +1832,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
* Modifying our existing lease is OK, but no getting a
* new lease if someone else is opening for write:
*/
- if (fl->fl_flags & FL_UNLOCK_PENDING)
+ if (fl->c.flc_flags & FL_UNLOCK_PENDING)
goto out;
}
@@ -1792,7 +1848,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
if (!leases_enable)
goto out;
- locks_insert_lock_ctx(lease, &ctx->flc_lease);
+ locks_insert_lock_ctx(&lease->c, &ctx->flc_lease);
/*
* The check in break_lease() is lockless. It's possible for another
* open to race in after we did the earlier check for a conflicting
@@ -1803,9 +1859,9 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
* precedes these checks.
*/
smp_mb();
- error = check_conflicting_open(filp, arg, lease->fl_flags);
+ error = check_conflicting_open(filp, arg, lease->c.flc_flags);
if (error) {
- locks_unlink_lock_ctx(lease);
+ locks_unlink_lock_ctx(&lease->c);
goto out;
}
@@ -1826,7 +1882,7 @@ out:
static int generic_delete_lease(struct file *filp, void *owner)
{
int error = -EAGAIN;
- struct file_lock *fl, *victim = NULL;
+ struct file_lease *fl, *victim = NULL;
struct inode *inode = file_inode(filp);
struct file_lock_context *ctx;
LIST_HEAD(dispose);
@@ -1839,9 +1895,9 @@ static int generic_delete_lease(struct file *filp, void *owner)
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
- list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
- if (fl->fl_file == filp &&
- fl->fl_owner == owner) {
+ list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
+ if (fl->c.flc_file == filp &&
+ fl->c.flc_owner == owner) {
victim = fl;
break;
}
@@ -1866,21 +1922,9 @@ static int generic_delete_lease(struct file *filp, void *owner)
* The (input) flp->fl_lmops->lm_break function is required
* by break_lease().
*/
-int generic_setlease(struct file *filp, int arg, struct file_lock **flp,
+int generic_setlease(struct file *filp, int arg, struct file_lease **flp,
void **priv)
{
- struct inode *inode = file_inode(filp);
- vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(filp), inode);
- int error;
-
- if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE))
- return -EACCES;
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
- error = security_file_lock(filp, arg);
- if (error)
- return error;
-
switch (arg) {
case F_UNLCK:
return generic_delete_lease(filp, *priv);
@@ -1913,7 +1957,7 @@ lease_notifier_chain_init(void)
}
static inline void
-setlease_notifier(int arg, struct file_lock *lease)
+setlease_notifier(int arg, struct file_lease *lease)
{
if (arg != F_UNLCK)
srcu_notifier_call_chain(&lease_notifier_chain, arg, lease);
@@ -1931,6 +1975,19 @@ void lease_unregister_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(lease_unregister_notifier);
+
+int
+kernel_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv)
+{
+ if (lease)
+ setlease_notifier(arg, *lease);
+ if (filp->f_op->setlease)
+ return filp->f_op->setlease(filp, arg, lease, priv);
+ else
+ return generic_setlease(filp, arg, lease, priv);
+}
+EXPORT_SYMBOL_GPL(kernel_setlease);
+
/**
* vfs_setlease - sets a lease on an open file
* @filp: file pointer
@@ -1949,20 +2006,26 @@ EXPORT_SYMBOL_GPL(lease_unregister_notifier);
* may be NULL if the lm_setup operation doesn't require it.
*/
int
-vfs_setlease(struct file *filp, int arg, struct file_lock **lease, void **priv)
+vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv)
{
- if (lease)
- setlease_notifier(arg, *lease);
- if (filp->f_op->setlease)
- return filp->f_op->setlease(filp, arg, lease, priv);
- else
- return generic_setlease(filp, arg, lease, priv);
+ struct inode *inode = file_inode(filp);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(filp), inode);
+ int error;
+
+ if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE))
+ return -EACCES;
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+ error = security_file_lock(filp, arg);
+ if (error)
+ return error;
+ return kernel_setlease(filp, arg, lease, priv);
}
EXPORT_SYMBOL_GPL(vfs_setlease);
static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg)
{
- struct file_lock *fl;
+ struct file_lease *fl;
struct fasync_struct *new;
int error;
@@ -1972,14 +2035,14 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg)
new = fasync_alloc();
if (!new) {
- locks_free_lock(fl);
+ locks_free_lease(fl);
return -ENOMEM;
}
new->fa_fd = fd;
error = vfs_setlease(filp, arg, &fl, (void **)&new);
if (fl)
- locks_free_lock(fl);
+ locks_free_lease(fl);
if (new)
fasync_free(new);
return error;
@@ -2017,8 +2080,8 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
error = flock_lock_inode(inode, fl);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait,
- list_empty(&fl->fl_blocked_member));
+ error = wait_event_interruptible(fl->c.flc_wait,
+ list_empty(&fl->c.flc_blocked_member));
if (error)
break;
}
@@ -2036,7 +2099,7 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
int res = 0;
- switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
+ switch (fl->c.flc_flags & (FL_POSIX|FL_FLOCK)) {
case FL_POSIX:
res = posix_lock_inode_wait(inode, fl);
break;
@@ -2098,13 +2161,13 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
flock_make_lock(f.file, &fl, type);
- error = security_file_lock(f.file, fl.fl_type);
+ error = security_file_lock(f.file, fl.c.flc_type);
if (error)
goto out_putf;
can_sleep = !(cmd & LOCK_NB);
if (can_sleep)
- fl.fl_flags |= FL_SLEEP;
+ fl.c.flc_flags |= FL_SLEEP;
if (f.file->f_op->flock)
error = f.file->f_op->flock(f.file,
@@ -2130,7 +2193,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
*/
int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
- WARN_ON_ONCE(filp != fl->fl_file);
+ WARN_ON_ONCE(filp != fl->c.flc_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, F_GETLK, fl);
posix_test_lock(filp, fl);
@@ -2145,25 +2208,28 @@ EXPORT_SYMBOL_GPL(vfs_test_lock);
*
* Used to translate a fl_pid into a namespace virtual pid number
*/
-static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns)
+static pid_t locks_translate_pid(struct file_lock_core *fl, struct pid_namespace *ns)
{
pid_t vnr;
struct pid *pid;
- if (IS_OFDLCK(fl))
+ if (fl->flc_flags & FL_OFDLCK)
return -1;
- if (IS_REMOTELCK(fl))
- return fl->fl_pid;
+
+ /* Remote locks report a negative pid value */
+ if (fl->flc_pid <= 0)
+ return fl->flc_pid;
+
/*
* If the flock owner process is dead and its pid has been already
* freed, the translation below won't work, but we still want to show
* flock owner pid number in init pidns.
*/
if (ns == &init_pid_ns)
- return (pid_t)fl->fl_pid;
+ return (pid_t) fl->flc_pid;
rcu_read_lock();
- pid = find_pid_ns(fl->fl_pid, &init_pid_ns);
+ pid = find_pid_ns(fl->flc_pid, &init_pid_ns);
vnr = pid_nr_ns(pid, ns);
rcu_read_unlock();
return vnr;
@@ -2171,7 +2237,7 @@ static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns)
static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
{
- flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current));
+ flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current));
#if BITS_PER_LONG == 32
/*
* Make sure we can represent the posix lock via
@@ -2186,19 +2252,19 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
fl->fl_end - fl->fl_start + 1;
flock->l_whence = 0;
- flock->l_type = fl->fl_type;
+ flock->l_type = fl->c.flc_type;
return 0;
}
#if BITS_PER_LONG == 32
static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
{
- flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current));
+ flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current));
flock->l_start = fl->fl_start;
flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
fl->fl_end - fl->fl_start + 1;
flock->l_whence = 0;
- flock->l_type = fl->fl_type;
+ flock->l_type = fl->c.flc_type;
}
#endif
@@ -2227,16 +2293,16 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
if (flock->l_pid != 0)
goto out;
- fl->fl_flags |= FL_OFDLCK;
- fl->fl_owner = filp;
+ fl->c.flc_flags |= FL_OFDLCK;
+ fl->c.flc_owner = filp;
}
error = vfs_test_lock(filp, fl);
if (error)
goto out;
- flock->l_type = fl->fl_type;
- if (fl->fl_type != F_UNLCK) {
+ flock->l_type = fl->c.flc_type;
+ if (fl->c.flc_type != F_UNLCK) {
error = posix_lock_to_flock(flock, fl);
if (error)
goto out;
@@ -2283,7 +2349,7 @@ out:
*/
int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
{
- WARN_ON_ONCE(filp != fl->fl_file);
+ WARN_ON_ONCE(filp != fl->c.flc_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, cmd, fl);
else
@@ -2296,7 +2362,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
{
int error;
- error = security_file_lock(filp, fl->fl_type);
+ error = security_file_lock(filp, fl->c.flc_type);
if (error)
return error;
@@ -2304,8 +2370,8 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
error = vfs_lock_file(filp, cmd, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait,
- list_empty(&fl->fl_blocked_member));
+ error = wait_event_interruptible(fl->c.flc_wait,
+ list_empty(&fl->c.flc_blocked_member));
if (error)
break;
}
@@ -2318,13 +2384,13 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
static int
check_fmode_for_setlk(struct file_lock *fl)
{
- switch (fl->fl_type) {
+ switch (fl->c.flc_type) {
case F_RDLCK:
- if (!(fl->fl_file->f_mode & FMODE_READ))
+ if (!(fl->c.flc_file->f_mode & FMODE_READ))
return -EBADF;
break;
case F_WRLCK:
- if (!(fl->fl_file->f_mode & FMODE_WRITE))
+ if (!(fl->c.flc_file->f_mode & FMODE_WRITE))
return -EBADF;
}
return 0;
@@ -2363,8 +2429,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
cmd = F_SETLK;
- file_lock->fl_flags |= FL_OFDLCK;
- file_lock->fl_owner = filp;
+ file_lock->c.flc_flags |= FL_OFDLCK;
+ file_lock->c.flc_owner = filp;
break;
case F_OFD_SETLKW:
error = -EINVAL;
@@ -2372,11 +2438,11 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
cmd = F_SETLKW;
- file_lock->fl_flags |= FL_OFDLCK;
- file_lock->fl_owner = filp;
+ file_lock->c.flc_flags |= FL_OFDLCK;
+ file_lock->c.flc_owner = filp;
fallthrough;
case F_SETLKW:
- file_lock->fl_flags |= FL_SLEEP;
+ file_lock->c.flc_flags |= FL_SLEEP;
}
error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2386,8 +2452,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
* lock that was just acquired. There is no need to do that when we're
* unlocking though, or for OFD locks.
*/
- if (!error && file_lock->fl_type != F_UNLCK &&
- !(file_lock->fl_flags & FL_OFDLCK)) {
+ if (!error && file_lock->c.flc_type != F_UNLCK &&
+ !(file_lock->c.flc_flags & FL_OFDLCK)) {
struct files_struct *files = current->files;
/*
* We need that spin_lock here - it prevents reordering between
@@ -2398,7 +2464,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
f = files_lookup_fd_locked(files, fd);
spin_unlock(&files->file_lock);
if (f != filp) {
- file_lock->fl_type = F_UNLCK;
+ file_lock->c.flc_type = F_UNLCK;
error = do_lock_file_wait(filp, cmd, file_lock);
WARN_ON_ONCE(error);
error = -EBADF;
@@ -2437,16 +2503,16 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock)
if (flock->l_pid != 0)
goto out;
- fl->fl_flags |= FL_OFDLCK;
- fl->fl_owner = filp;
+ fl->c.flc_flags |= FL_OFDLCK;
+ fl->c.flc_owner = filp;
}
error = vfs_test_lock(filp, fl);
if (error)
goto out;
- flock->l_type = fl->fl_type;
- if (fl->fl_type != F_UNLCK)
+ flock->l_type = fl->c.flc_type;
+ if (fl->c.flc_type != F_UNLCK)
posix_lock_to_flock64(flock, fl);
out:
@@ -2486,8 +2552,8 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
cmd = F_SETLK64;
- file_lock->fl_flags |= FL_OFDLCK;
- file_lock->fl_owner = filp;
+ file_lock->c.flc_flags |= FL_OFDLCK;
+ file_lock->c.flc_owner = filp;
break;
case F_OFD_SETLKW:
error = -EINVAL;
@@ -2495,11 +2561,11 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
cmd = F_SETLKW64;
- file_lock->fl_flags |= FL_OFDLCK;
- file_lock->fl_owner = filp;
+ file_lock->c.flc_flags |= FL_OFDLCK;
+ file_lock->c.flc_owner = filp;
fallthrough;
case F_SETLKW64:
- file_lock->fl_flags |= FL_SLEEP;
+ file_lock->c.flc_flags |= FL_SLEEP;
}
error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2509,8 +2575,8 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
* lock that was just acquired. There is no need to do that when we're
* unlocking though, or for OFD locks.
*/
- if (!error && file_lock->fl_type != F_UNLCK &&
- !(file_lock->fl_flags & FL_OFDLCK)) {
+ if (!error && file_lock->c.flc_type != F_UNLCK &&
+ !(file_lock->c.flc_flags & FL_OFDLCK)) {
struct files_struct *files = current->files;
/*
* We need that spin_lock here - it prevents reordering between
@@ -2521,7 +2587,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
f = files_lookup_fd_locked(files, fd);
spin_unlock(&files->file_lock);
if (f != filp) {
- file_lock->fl_type = F_UNLCK;
+ file_lock->c.flc_type = F_UNLCK;
error = do_lock_file_wait(filp, cmd, file_lock);
WARN_ON_ONCE(error);
error = -EBADF;
@@ -2555,13 +2621,13 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
return;
locks_init_lock(&lock);
- lock.fl_type = F_UNLCK;
- lock.fl_flags = FL_POSIX | FL_CLOSE;
+ lock.c.flc_type = F_UNLCK;
+ lock.c.flc_flags = FL_POSIX | FL_CLOSE;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
- lock.fl_owner = owner;
- lock.fl_pid = current->tgid;
- lock.fl_file = filp;
+ lock.c.flc_owner = owner;
+ lock.c.flc_pid = current->tgid;
+ lock.c.flc_file = filp;
lock.fl_ops = NULL;
lock.fl_lmops = NULL;
@@ -2584,7 +2650,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
return;
flock_make_lock(filp, &fl, F_UNLCK);
- fl.fl_flags |= FL_CLOSE;
+ fl.c.flc_flags |= FL_CLOSE;
if (filp->f_op->flock)
filp->f_op->flock(filp, F_SETLKW, &fl);
@@ -2599,7 +2665,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
static void
locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
{
- struct file_lock *fl, *tmp;
+ struct file_lease *fl, *tmp;
LIST_HEAD(dispose);
if (list_empty(&ctx->flc_lease))
@@ -2607,8 +2673,8 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
- list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
- if (filp == fl->fl_file)
+ list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list)
+ if (filp == fl->c.flc_file)
lease_modify(fl, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
@@ -2652,7 +2718,7 @@ void locks_remove_file(struct file *filp)
*/
int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
- WARN_ON_ONCE(filp != fl->fl_file);
+ WARN_ON_ONCE(filp != fl->c.flc_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, F_CANCELLK, fl);
return 0;
@@ -2691,69 +2757,73 @@ struct locks_iterator {
loff_t li_pos;
};
-static void lock_get_status(struct seq_file *f, struct file_lock *fl,
+static void lock_get_status(struct seq_file *f, struct file_lock_core *flc,
loff_t id, char *pfx, int repeat)
{
struct inode *inode = NULL;
- unsigned int fl_pid;
+ unsigned int pid;
struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
- int type;
+ int type = flc->flc_type;
+ struct file_lock *fl = file_lock(flc);
+
+ pid = locks_translate_pid(flc, proc_pidns);
- fl_pid = locks_translate_pid(fl, proc_pidns);
/*
* If lock owner is dead (and pid is freed) or not visible in current
* pidns, zero is shown as a pid value. Check lock info from
* init_pid_ns to get saved lock pid value.
*/
-
- if (fl->fl_file != NULL)
- inode = file_inode(fl->fl_file);
+ if (flc->flc_file != NULL)
+ inode = file_inode(flc->flc_file);
seq_printf(f, "%lld: ", id);
if (repeat)
seq_printf(f, "%*s", repeat - 1 + (int)strlen(pfx), pfx);
- if (IS_POSIX(fl)) {
- if (fl->fl_flags & FL_ACCESS)
+ if (flc->flc_flags & FL_POSIX) {
+ if (flc->flc_flags & FL_ACCESS)
seq_puts(f, "ACCESS");
- else if (IS_OFDLCK(fl))
+ else if (flc->flc_flags & FL_OFDLCK)
seq_puts(f, "OFDLCK");
else
seq_puts(f, "POSIX ");
seq_printf(f, " %s ",
(inode == NULL) ? "*NOINODE*" : "ADVISORY ");
- } else if (IS_FLOCK(fl)) {
+ } else if (flc->flc_flags & FL_FLOCK) {
seq_puts(f, "FLOCK ADVISORY ");
- } else if (IS_LEASE(fl)) {
- if (fl->fl_flags & FL_DELEG)
+ } else if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) {
+ struct file_lease *lease = file_lease(flc);
+
+ type = target_leasetype(lease);
+
+ if (flc->flc_flags & FL_DELEG)
seq_puts(f, "DELEG ");
else
seq_puts(f, "LEASE ");
- if (lease_breaking(fl))
+ if (lease_breaking(lease))
seq_puts(f, "BREAKING ");
- else if (fl->fl_file)
+ else if (flc->flc_file)
seq_puts(f, "ACTIVE ");
else
seq_puts(f, "BREAKER ");
} else {
seq_puts(f, "UNKNOWN UNKNOWN ");
}
- type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
(type == F_RDLCK) ? "READ" : "UNLCK");
if (inode) {
/* userspace relies on this representation of dev_t */
- seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
+ seq_printf(f, "%d %02x:%02x:%lu ", pid,
MAJOR(inode->i_sb->s_dev),
MINOR(inode->i_sb->s_dev), inode->i_ino);
} else {
- seq_printf(f, "%d <none>:0 ", fl_pid);
+ seq_printf(f, "%d <none>:0 ", pid);
}
- if (IS_POSIX(fl)) {
+ if (flc->flc_flags & FL_POSIX) {
if (fl->fl_end == OFFSET_MAX)
seq_printf(f, "%Ld EOF\n", fl->fl_start);
else
@@ -2763,17 +2833,18 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
}
}
-static struct file_lock *get_next_blocked_member(struct file_lock *node)
+static struct file_lock_core *get_next_blocked_member(struct file_lock_core *node)
{
- struct file_lock *tmp;
+ struct file_lock_core *tmp;
/* NULL node or root node */
- if (node == NULL || node->fl_blocker == NULL)
+ if (node == NULL || node->flc_blocker == NULL)
return NULL;
/* Next member in the linked list could be itself */
- tmp = list_next_entry(node, fl_blocked_member);
- if (list_entry_is_head(tmp, &node->fl_blocker->fl_blocked_requests, fl_blocked_member)
+ tmp = list_next_entry(node, flc_blocked_member);
+ if (list_entry_is_head(tmp, &node->flc_blocker->flc_blocked_requests,
+ flc_blocked_member)
|| tmp == node) {
return NULL;
}
@@ -2784,18 +2855,18 @@ static struct file_lock *get_next_blocked_member(struct file_lock *node)
static int locks_show(struct seq_file *f, void *v)
{
struct locks_iterator *iter = f->private;
- struct file_lock *cur, *tmp;
+ struct file_lock_core *cur, *tmp;
struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
int level = 0;
- cur = hlist_entry(v, struct file_lock, fl_link);
+ cur = hlist_entry(v, struct file_lock_core, flc_link);
if (locks_translate_pid(cur, proc_pidns) == 0)
return 0;
- /* View this crossed linked list as a binary tree, the first member of fl_blocked_requests
- * is the left child of current node, the next silibing in fl_blocked_member is the
- * right child, we can alse get the parent of current node from fl_blocker, so this
+ /* View this crossed linked list as a binary tree, the first member of flc_blocked_requests
+ * is the left child of current node, the next sibling in flc_blocked_member is the
+ * right child, we can also get the parent of current node from flc_blocker, so this
* question becomes traversal of a binary tree
*/
while (cur != NULL) {
@@ -2804,17 +2875,18 @@ static int locks_show(struct seq_file *f, void *v)
else
lock_get_status(f, cur, iter->li_pos, "", level);
- if (!list_empty(&cur->fl_blocked_requests)) {
+ if (!list_empty(&cur->flc_blocked_requests)) {
/* Turn left */
- cur = list_first_entry_or_null(&cur->fl_blocked_requests,
- struct file_lock, fl_blocked_member);
+ cur = list_first_entry_or_null(&cur->flc_blocked_requests,
+ struct file_lock_core,
+ flc_blocked_member);
level++;
} else {
/* Turn right */
tmp = get_next_blocked_member(cur);
/* Fall back to parent node */
- while (tmp == NULL && cur->fl_blocker != NULL) {
- cur = cur->fl_blocker;
+ while (tmp == NULL && cur->flc_blocker != NULL) {
+ cur = cur->flc_blocker;
level--;
tmp = get_next_blocked_member(cur);
}
@@ -2829,14 +2901,13 @@ static void __show_fd_locks(struct seq_file *f,
struct list_head *head, int *id,
struct file *filp, struct files_struct *files)
{
- struct file_lock *fl;
+ struct file_lock_core *fl;
- list_for_each_entry(fl, head, fl_list) {
+ list_for_each_entry(fl, head, flc_list) {
- if (filp != fl->fl_file)
+ if (filp != fl->flc_file)
continue;
- if (fl->fl_owner != files &&
- fl->fl_owner != filp)
+ if (fl->flc_owner != files && fl->flc_owner != filp)
continue;
(*id)++;
@@ -2915,6 +2986,9 @@ static int __init filelock_init(void)
filelock_cache = kmem_cache_create("file_lock_cache",
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
+ filelease_cache = kmem_cache_create("file_lease_cache",
+ sizeof(struct file_lease), 0, SLAB_PANIC, NULL);
+
for_each_possible_cpu(i) {
struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 82aa7a35db26..e60a840999aa 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -426,9 +426,7 @@ EXPORT_SYMBOL(mb_cache_destroy);
static int __init mbcache_init(void)
{
- mb_entry_cache = kmem_cache_create("mbcache",
- sizeof(struct mb_cache_entry), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+ mb_entry_cache = KMEM_CACHE(mb_cache_entry, SLAB_RECLAIM_ACCOUNT);
if (!mb_entry_cache)
return -ENOMEM;
return 0;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 73f37f298087..7cbd2b9f4d11 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -87,7 +87,7 @@ static int __init init_inodecache(void)
minix_inode_cachep = kmem_cache_create("minix_inode_cache",
sizeof(struct minix_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
init_once);
if (minix_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 64c5205e2b5e..3c60f1eaca61 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -214,7 +214,7 @@ static int copy_mnt_idmap(struct uid_gid_map *map_from,
* anything at all.
*/
if (nr_extents == 0)
- return 0;
+ return -EINVAL;
/*
* Here we know that nr_extents is greater than zero which means
diff --git a/fs/mpage.c b/fs/mpage.c
index 738882e0766d..fa8b99a199fa 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -605,6 +605,7 @@ alloc_new:
GFP_NOFS);
bio->bi_iter.bi_sector = first_block << (blkbits - 9);
wbc_init_bio(wbc, bio);
+ bio->bi_write_hint = inode->i_write_hint;
}
/*
diff --git a/fs/namei.c b/fs/namei.c
index 9342fa6a38c2..c5b2a25be7d0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -17,8 +17,8 @@
#include <linux/init.h>
#include <linux/export.h>
-#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/wordpart.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/namei.h>
@@ -27,7 +27,6 @@
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/security.h>
-#include <linux/ima.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
@@ -2680,10 +2679,8 @@ static int lookup_one_common(struct mnt_idmap *idmap,
if (!len)
return -EACCES;
- if (unlikely(name[0] == '.')) {
- if (len < 2 || (len == 2 && name[1] == '.'))
- return -EACCES;
- }
+ if (is_dot_dotdot(name, len))
+ return -EACCES;
while (len--) {
unsigned int c = *(const unsigned char *)name++;
@@ -3644,7 +3641,7 @@ static int do_open(struct nameidata *nd,
if (!error && !(file->f_mode & FMODE_OPENED))
error = vfs_open(&nd->path, file);
if (!error)
- error = ima_file_check(file, op->acc_mode);
+ error = security_file_post_open(file, op->acc_mode);
if (!error && do_truncate)
error = handle_truncate(idmap, file);
if (unlikely(error > 0)) {
@@ -3707,7 +3704,7 @@ static int vfs_tmpfile(struct mnt_idmap *idmap,
inode->i_state |= I_LINKABLE;
spin_unlock(&inode->i_lock);
}
- ima_post_create_tmpfile(idmap, inode);
+ security_inode_post_create_tmpfile(idmap, inode);
return 0;
}
@@ -4054,7 +4051,7 @@ retry:
error = vfs_create(idmap, path.dentry->d_inode,
dentry, mode, true);
if (!error)
- ima_post_path_mknod(idmap, dentry);
+ security_path_post_mknod(idmap, dentry);
break;
case S_IFCHR: case S_IFBLK:
error = vfs_mknod(idmap, path.dentry->d_inode,
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index ad572f7ee897..43a651ed8264 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -83,8 +83,10 @@ static int fscache_begin_operation(struct netfs_cache_resources *cres,
cres->debug_id = cookie->debug_id;
cres->inval_counter = cookie->inval_counter;
- if (!fscache_begin_cookie_access(cookie, why))
+ if (!fscache_begin_cookie_access(cookie, why)) {
+ cres->cache_priv = NULL;
return -ENOBUFS;
+ }
again:
spin_lock(&cookie->lock);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index b4294a8aa2d4..f1eeb4914199 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -108,7 +108,7 @@ struct pnfs_block_dev {
struct pnfs_block_dev *children;
u64 chunk_size;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
u64 disk_offset;
u64 pr_key;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index c97ebc42ec0f..93ef7f864980 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -25,17 +25,17 @@ bl_free_device(struct pnfs_block_dev *dev)
} else {
if (dev->pr_registered) {
const struct pr_ops *ops =
- dev->bdev_handle->bdev->bd_disk->fops->pr_ops;
+ file_bdev(dev->bdev_file)->bd_disk->fops->pr_ops;
int error;
- error = ops->pr_register(dev->bdev_handle->bdev,
+ error = ops->pr_register(file_bdev(dev->bdev_file),
dev->pr_key, 0, false);
if (error)
pr_err("failed to unregister PR key.\n");
}
- if (dev->bdev_handle)
- bdev_release(dev->bdev_handle);
+ if (dev->bdev_file)
+ fput(dev->bdev_file);
}
}
@@ -169,7 +169,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
map->start = dev->start;
map->len = dev->len;
map->disk_offset = dev->disk_offset;
- map->bdev = dev->bdev_handle->bdev;
+ map->bdev = file_bdev(dev->bdev_file);
return true;
}
@@ -236,26 +236,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
dev_t dev;
dev = bl_resolve_deviceid(server, v, gfp_mask);
if (!dev)
return -EIO;
- bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
NULL, NULL);
- if (IS_ERR(bdev_handle)) {
+ if (IS_ERR(bdev_file)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
- MAJOR(dev), MINOR(dev), PTR_ERR(bdev_handle));
- return PTR_ERR(bdev_handle);
+ MAJOR(dev), MINOR(dev), PTR_ERR(bdev_file));
+ return PTR_ERR(bdev_file);
}
- d->bdev_handle = bdev_handle;
- d->len = bdev_nr_bytes(bdev_handle->bdev);
+ d->bdev_file = bdev_file;
+ d->len = bdev_nr_bytes(file_bdev(bdev_file));
d->map = bl_map_simple;
printk(KERN_INFO "pNFS: using block device %s\n",
- bdev_handle->bdev->bd_disk->disk_name);
+ file_bdev(bdev_file)->bd_disk->disk_name);
return 0;
}
@@ -300,10 +300,10 @@ bl_validate_designator(struct pnfs_block_volume *v)
}
}
-static struct bdev_handle *
+static struct file *
bl_open_path(struct pnfs_block_volume *v, const char *prefix)
{
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
const char *devname;
devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN",
@@ -311,15 +311,15 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix)
if (!devname)
return ERR_PTR(-ENOMEM);
- bdev_handle = bdev_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ bdev_file = bdev_file_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE,
NULL, NULL);
- if (IS_ERR(bdev_handle)) {
+ if (IS_ERR(bdev_file)) {
pr_warn("pNFS: failed to open device %s (%ld)\n",
- devname, PTR_ERR(bdev_handle));
+ devname, PTR_ERR(bdev_file));
}
kfree(devname);
- return bdev_handle;
+ return bdev_file;
}
static int
@@ -327,7 +327,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
const struct pr_ops *ops;
int error;
@@ -340,14 +340,14 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
* On other distributions like Debian, the default SCSI by-id path will
* point to the dm-multipath device if one exists.
*/
- bdev_handle = bl_open_path(v, "dm-uuid-mpath-0x");
- if (IS_ERR(bdev_handle))
- bdev_handle = bl_open_path(v, "wwn-0x");
- if (IS_ERR(bdev_handle))
- return PTR_ERR(bdev_handle);
- d->bdev_handle = bdev_handle;
-
- d->len = bdev_nr_bytes(d->bdev_handle->bdev);
+ bdev_file = bl_open_path(v, "dm-uuid-mpath-0x");
+ if (IS_ERR(bdev_file))
+ bdev_file = bl_open_path(v, "wwn-0x");
+ if (IS_ERR(bdev_file))
+ return PTR_ERR(bdev_file);
+ d->bdev_file = bdev_file;
+
+ d->len = bdev_nr_bytes(file_bdev(d->bdev_file));
d->map = bl_map_simple;
d->pr_key = v->scsi.pr_key;
@@ -355,20 +355,20 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return -ENODEV;
pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
- d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key);
+ file_bdev(d->bdev_file)->bd_disk->disk_name, d->pr_key);
- ops = d->bdev_handle->bdev->bd_disk->fops->pr_ops;
+ ops = file_bdev(d->bdev_file)->bd_disk->fops->pr_ops;
if (!ops) {
pr_err("pNFS: block device %s does not support reservations.",
- d->bdev_handle->bdev->bd_disk->disk_name);
+ file_bdev(d->bdev_file)->bd_disk->disk_name);
error = -EINVAL;
goto out_blkdev_put;
}
- error = ops->pr_register(d->bdev_handle->bdev, 0, d->pr_key, true);
+ error = ops->pr_register(file_bdev(d->bdev_file), 0, d->pr_key, true);
if (error) {
pr_err("pNFS: failed to register key for block device %s.",
- d->bdev_handle->bdev->bd_disk->disk_name);
+ file_bdev(d->bdev_file)->bd_disk->disk_name);
goto out_blkdev_put;
}
@@ -376,7 +376,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return 0;
out_blkdev_put:
- bdev_release(d->bdev_handle);
+ fput(d->bdev_file);
return error;
}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 760d27dd7225..8adfcd4c8c1a 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -356,15 +356,12 @@ static const struct svc_version *nfs4_callback_version[] = {
[4] = &nfs4_callback_version4,
};
-static struct svc_stat nfs4_callback_stats;
-
static struct svc_program nfs4_callback_program = {
.pg_prog = NFS4_CALLBACK, /* RPC service number */
.pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */
.pg_vers = nfs4_callback_version, /* version table */
.pg_name = "NFSv4 callback", /* service name */
.pg_class = "nfs", /* authentication class */
- .pg_stats = &nfs4_callback_stats,
.pg_authenticate = nfs_callback_authenticate,
.pg_init_request = svc_generic_init_request,
.pg_rpcbind_set = svc_generic_rpcbind_set,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index fbdc9ca80f71..de77848ae654 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -73,14 +73,9 @@ const struct rpc_program nfs_program = {
.number = NFS_PROGRAM,
.nrvers = ARRAY_SIZE(nfs_version),
.version = nfs_version,
- .stats = &nfs_rpcstat,
.pipe_dir_name = NFS_PIPE_DIRNAME,
};
-struct rpc_stat nfs_rpcstat = {
- .program = &nfs_program
-};
-
static struct nfs_subversion *find_nfs_version(unsigned int version)
{
struct nfs_subversion *nfs;
@@ -502,6 +497,7 @@ int nfs_create_rpc_client(struct nfs_client *clp,
const struct nfs_client_initdata *cl_init,
rpc_authflavor_t flavor)
{
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
struct rpc_clnt *clnt = NULL;
struct rpc_create_args args = {
.net = clp->cl_net,
@@ -513,6 +509,7 @@ int nfs_create_rpc_client(struct nfs_client *clp,
.servername = clp->cl_hostname,
.nodename = cl_init->nodename,
.program = &nfs_program,
+ .stats = &nn->rpcstats,
.version = clp->rpc_ops->version,
.authflavor = flavor,
.cred = cl_init->cred,
@@ -1182,6 +1179,8 @@ void nfs_clients_init(struct net *net)
#endif
spin_lock_init(&nn->nfs_client_lock);
nn->boot_time = ktime_get_real();
+ memset(&nn->rpcstats, 0, sizeof(nn->rpcstats));
+ nn->rpcstats.program = &nfs_program;
nfs_netns_sysfs_setup(nn, net);
}
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index fa1a14def45c..6bace5fece04 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -156,8 +156,8 @@ static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_state
list = &flctx->flc_posix;
spin_lock(&flctx->flc_lock);
restart:
- list_for_each_entry(fl, list, fl_list) {
- if (nfs_file_open_context(fl->fl_file)->state != state)
+ for_each_file_lock(fl, list) {
+ if (nfs_file_open_context(fl->c.flc_file)->state != state)
continue;
spin_unlock(&flctx->flc_lock);
status = nfs4_lock_delegation_recall(fl, state, stateid);
@@ -181,7 +181,6 @@ static int nfs_delegation_claim_opens(struct inode *inode,
struct nfs_open_context *ctx;
struct nfs4_state_owner *sp;
struct nfs4_state *state;
- unsigned int seq;
int err;
again:
@@ -202,12 +201,9 @@ again:
sp = state->owner;
/* Block nfs4_proc_unlck */
mutex_lock(&sp->so_delegreturn_mutex);
- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
err = nfs4_open_delegation_recall(ctx, state, stateid);
if (!err)
err = nfs_delegation_claim_locks(state, stateid);
- if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
- err = -EAGAIN;
mutex_unlock(&sp->so_delegreturn_mutex);
put_nfs_open_context(ctx);
if (err != 0)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index c03926a1cc73..bb2f583eb28b 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -606,6 +606,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
trace_nfs_direct_commit_complete(dreq);
+ spin_lock(&dreq->lock);
if (status < 0) {
/* Errors in commit are fatal */
dreq->error = status;
@@ -613,6 +614,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
} else {
status = dreq->error;
}
+ spin_unlock(&dreq->lock);
nfs_init_cinfo_from_dreq(&cinfo, dreq);
@@ -625,7 +627,10 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
spin_unlock(&dreq->lock);
nfs_release_request(req);
} else if (!nfs_write_match_verf(verf, req)) {
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_lock(&dreq->lock);
+ if (dreq->flags == 0)
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
/*
* Despite the reboot, the write was successful,
* so reset wb_nio.
@@ -667,10 +672,17 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
LIST_HEAD(mds_list);
nfs_init_cinfo_from_dreq(&cinfo, dreq);
+ nfs_commit_begin(cinfo.mds);
nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
- if (res < 0) /* res == -ENOMEM */
- nfs_direct_write_reschedule(dreq);
+ if (res < 0) { /* res == -ENOMEM */
+ spin_lock(&dreq->lock);
+ if (dreq->flags == 0)
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ }
+ if (nfs_commit_end(cinfo.mds))
+ nfs_direct_write_complete(dreq);
}
static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
@@ -1037,8 +1049,7 @@ int __init nfs_init_directcache(void)
{
nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
sizeof(struct nfs_direct_req),
- 0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ 0, SLAB_RECLAIM_ACCOUNT,
NULL);
if (nfs_direct_cachep == NULL)
return -ENOMEM;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8577ccf621f5..407c6e15afe2 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -720,15 +720,15 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
{
struct inode *inode = filp->f_mapping->host;
int status = 0;
- unsigned int saved_type = fl->fl_type;
+ unsigned int saved_type = fl->c.flc_type;
/* Try local locking first */
posix_test_lock(filp, fl);
- if (fl->fl_type != F_UNLCK) {
+ if (fl->c.flc_type != F_UNLCK) {
/* found a conflict */
goto out;
}
- fl->fl_type = saved_type;
+ fl->c.flc_type = saved_type;
if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
goto out_noconflict;
@@ -740,7 +740,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
out:
return status;
out_noconflict:
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
goto out;
}
@@ -765,7 +765,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
* If we're signalled while cleaning up locks on process exit, we
* still need to complete the unlock.
*/
- if (status < 0 && !(fl->fl_flags & FL_CLOSE))
+ if (status < 0 && !(fl->c.flc_flags & FL_CLOSE))
return status;
}
@@ -832,12 +832,12 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
int is_local = 0;
dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n",
- filp, fl->fl_type, fl->fl_flags,
+ filp, fl->c.flc_type, fl->c.flc_flags,
(long long)fl->fl_start, (long long)fl->fl_end);
nfs_inc_stats(inode, NFSIOS_VFSLOCK);
- if (fl->fl_flags & FL_RECLAIM)
+ if (fl->c.flc_flags & FL_RECLAIM)
return -ENOGRACE;
if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
@@ -851,7 +851,7 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
if (IS_GETLK(cmd))
ret = do_getlk(filp, cmd, fl, is_local);
- else if (fl->fl_type == F_UNLCK)
+ else if (lock_is_unlock(fl))
ret = do_unlk(filp, cmd, fl, is_local);
else
ret = do_setlk(filp, cmd, fl, is_local);
@@ -869,16 +869,16 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
int is_local = 0;
dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n",
- filp, fl->fl_type, fl->fl_flags);
+ filp, fl->c.flc_type, fl->c.flc_flags);
- if (!(fl->fl_flags & FL_FLOCK))
+ if (!(fl->c.flc_flags & FL_FLOCK))
return -ENOLCK;
if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
is_local = 1;
/* We're simulating flock() locks using posix locks on the server */
- if (fl->fl_type == F_UNLCK)
+ if (lock_is_unlock(fl))
return do_unlk(filp, cmd, fl, is_local);
return do_setlk(filp, cmd, fl, is_local);
}
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index acf4b88889dc..4fa304fa5bc4 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -35,6 +35,7 @@
#include "../internal.h"
#include "../nfs4session.h"
#include "filelayout.h"
+#include "../nfs4trace.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
@@ -172,6 +173,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
if (!dsaddr->ds_list[i])
goto out_err_drain_dsaddrs;
+ trace_fl_getdevinfo(server, &pdev->dev_id, dsaddr->ds_list[i]->ds_remotestr);
/* If DS was already in cache, free ds addrs */
while (!list_empty(&dsaddrs)) {
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index ef817a0475ff..3e724cb7ef01 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -2016,7 +2016,7 @@ static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg)
for (idx = 0; idx < flseg->mirror_array_cnt; idx++) {
mirror = flseg->mirror_array[idx];
mirror_ds = mirror->mirror_ds;
- if (!mirror_ds)
+ if (IS_ERR_OR_NULL(mirror_ds))
continue;
ds = mirror->mirror_ds->ds;
if (!ds)
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 853e8d609bb3..d0a0956f8a13 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -652,6 +652,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
ctx->fscache_uniq = NULL;
break;
case Opt_fscache:
+ trace_nfs_mount_assign(param->key, param->string);
ctx->options |= NFS_OPTION_FSCACHE;
kfree(ctx->fscache_uniq);
ctx->fscache_uniq = param->string;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 2d1bfee225c3..ddc1ee031955 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -301,11 +301,11 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
struct inode *inode = sreq->rreq->inode;
struct nfs_open_context *ctx = sreq->rreq->netfs_priv;
struct page *page;
+ unsigned long idx;
int err;
pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT;
pgoff_t last = ((sreq->start + sreq->len -
sreq->transferred - 1) >> PAGE_SHIFT);
- XA_STATE(xas, &sreq->rreq->mapping->i_pages, start);
nfs_pageio_init_read(&pgio, inode, false,
&nfs_async_read_completion_ops);
@@ -316,19 +316,14 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
pgio.pg_netfs = netfs; /* used in completion */
- xas_lock(&xas);
- xas_for_each(&xas, page, last) {
+ xa_for_each_range(&sreq->rreq->mapping->i_pages, idx, page, start, last) {
/* nfs_read_add_folio() may schedule() due to pNFS layout and other RPCs */
- xas_pause(&xas);
- xas_unlock(&xas);
err = nfs_read_add_folio(&pgio, ctx, page_folio(page));
if (err < 0) {
netfs->error = err;
goto out;
}
- xas_lock(&xas);
}
- xas_unlock(&xas);
out:
nfs_pageio_complete_read(&pgio);
nfs_netfs_put(netfs);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ebb8d60e1152..c709c296ea9a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2372,7 +2372,7 @@ static int __init nfs_init_inodecache(void)
nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
sizeof(struct nfs_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
init_once);
if (nfs_inode_cachep == NULL)
return -ENOMEM;
@@ -2426,12 +2426,16 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
static int nfs_net_init(struct net *net)
{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
nfs_clients_init(net);
+ rpc_proc_register(net, &nn->rpcstats);
return nfs_fs_proc_net_init(net);
}
static void nfs_net_exit(struct net *net)
{
+ rpc_proc_unregister(net, "nfs");
nfs_fs_proc_net_exit(net);
nfs_clients_exit(net);
}
@@ -2486,15 +2490,12 @@ static int __init init_nfs_fs(void)
if (err)
goto out1;
- rpc_proc_register(&init_net, &nfs_rpcstat);
-
err = register_nfs_fs();
if (err)
goto out0;
return 0;
out0:
- rpc_proc_unregister(&init_net, "nfs");
nfs_destroy_directcache();
out1:
nfs_destroy_writepagecache();
@@ -2524,7 +2525,6 @@ static void __exit exit_nfs_fs(void)
nfs_destroy_inodecache();
nfs_destroy_nfspagecache();
unregister_pernet_subsys(&nfs_net_ops);
- rpc_proc_unregister(&init_net, "nfs");
unregister_nfs_fs();
nfs_fs_proc_exit();
nfsiod_stop();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e3722ce6722e..06253695fe53 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -449,8 +449,6 @@ int nfs_try_get_tree(struct fs_context *);
int nfs_get_tree_common(struct fs_context *);
void nfs_kill_super(struct super_block *);
-extern struct rpc_stat nfs_rpcstat;
-
extern int __init register_nfs_fs(void);
extern void __exit unregister_nfs_fs(void);
extern bool nfs_sb_active(struct super_block *sb);
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index c8374f74dce1..a68b21603ea9 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -9,6 +9,7 @@
#include <linux/nfs4.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <linux/sunrpc/stats.h>
struct bl_dev_msg {
int32_t status;
@@ -34,6 +35,7 @@ struct nfs_net {
struct nfs_netns_client *nfs_client;
spinlock_t nfs_client_lock;
ktime_t boot_time;
+ struct rpc_stat rpcstats;
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *proc_nfsfs;
#endif
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 674c012868b1..b0c8a39c2bbd 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -111,6 +111,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
cl_init.hostname = buf;
switch (ds_proto) {
+ case XPRT_TRANSPORT_RDMA:
case XPRT_TRANSPORT_TCP:
case XPRT_TRANSPORT_TCP_TLS:
if (mds_clp->cl_nconnect > 1)
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 2de66e4e8280..cbbe3f0193b8 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -963,7 +963,7 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
struct nfs_open_context *ctx = nfs_file_open_context(filp);
int status;
- if (fl->fl_flags & FL_CLOSE) {
+ if (fl->c.flc_flags & FL_CLOSE) {
l_ctx = nfs_get_lock_context(ctx);
if (IS_ERR(l_ctx))
l_ctx = NULL;
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index b59876b01a1e..0282d93c8bcc 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -55,11 +55,14 @@ int nfs42_proc_removexattr(struct inode *inode, const char *name);
* They would be 7 bytes long in the eventual buffer ("user.x\0"), and
* 8 bytes long XDR-encoded.
*
- * Include the trailing eof word as well.
+ * Include the trailing eof word as well and make the result a multiple
+ * of 4 bytes.
*/
static inline u32 nfs42_listxattr_xdrsize(u32 buflen)
{
- return ((buflen / (XATTR_USER_PREFIX_LEN + 2)) * 8) + 4;
+ u32 size = 8 * buflen / (XATTR_USER_PREFIX_LEN + 2) + 4;
+
+ return (size + 3) & ~3;
}
#endif /* CONFIG_NFS_V4_2 */
#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index 49aaf28a6950..b6e3d8f77b91 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -1017,7 +1017,7 @@ int __init nfs4_xattr_cache_init(void)
nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache",
sizeof(struct nfs4_xattr_cache), 0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ (SLAB_RECLAIM_ACCOUNT),
nfs4_xattr_cache_init_once);
if (nfs4_xattr_cache_cachep == NULL)
return -ENOMEM;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 581698f1b7b2..7024230f0d1d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -120,7 +120,6 @@ struct nfs4_state_owner {
unsigned long so_flags;
struct list_head so_states;
struct nfs_seqid_counter so_seqid;
- seqcount_spinlock_t so_reclaim_seqcount;
struct mutex so_delegreturn_mutex;
};
@@ -330,7 +329,7 @@ extern int update_open_stateid(struct nfs4_state *state,
const nfs4_stateid *deleg_stateid,
fmode_t fmode);
extern int nfs4_proc_setlease(struct file *file, int arg,
- struct file_lock **lease, void **priv);
+ struct file_lease **lease, void **priv);
extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
struct nfs_fsinfo *fsinfo);
extern void nfs4_update_changeattr(struct inode *dir,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 11e3a285594c..84573df5cf5a 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -924,6 +924,7 @@ static int nfs4_set_client(struct nfs_server *server,
else
cl_init.max_connect = max_connect;
switch (proto) {
+ case XPRT_TRANSPORT_RDMA:
case XPRT_TRANSPORT_TCP:
case XPRT_TRANSPORT_TCP_TLS:
cl_init.nconnect = nconnect;
@@ -1000,6 +1001,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
cl_init.hostname = buf;
switch (ds_proto) {
+ case XPRT_TRANSPORT_RDMA:
case XPRT_TRANSPORT_TCP:
case XPRT_TRANSPORT_TCP_TLS:
if (mds_clp->cl_nconnect > 1) {
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e238abc78a13..1cd9652f3c28 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -439,7 +439,7 @@ void nfs42_ssc_unregister_ops(void)
}
#endif /* CONFIG_NFS_V4_2 */
-static int nfs4_setlease(struct file *file, int arg, struct file_lock **lease,
+static int nfs4_setlease(struct file *file, int arg, struct file_lease **lease,
void **priv)
{
return nfs4_proc_setlease(file, arg, lease, priv);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 23819a756508..ea390db94b62 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3069,10 +3069,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
fmode_t acc_mode = _nfs4_ctx_to_accessmode(ctx);
struct inode *dir = d_inode(opendata->dir);
unsigned long dir_verifier;
- unsigned int seq;
int ret;
- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
dir_verifier = nfs_save_change_attribute(dir);
ret = _nfs4_proc_open(opendata, ctx);
@@ -3125,11 +3123,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
if (ret != 0)
goto out;
- if (d_inode(dentry) == state->inode) {
+ if (d_inode(dentry) == state->inode)
nfs_inode_attach_open_context(ctx);
- if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
- nfs4_schedule_stateid_recovery(server, state);
- }
out:
if (!opendata->cancelled) {
@@ -6800,7 +6795,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
switch (status) {
case 0:
- request->fl_type = F_UNLCK;
+ request->c.flc_type = F_UNLCK;
break;
case -NFS4ERR_DENIED:
status = 0;
@@ -7018,8 +7013,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
/* Ensure this is an unlock - when canceling a lock, the
* canceled lock is passed in, and it won't be an unlock.
*/
- fl->fl_type = F_UNLCK;
- if (fl->fl_flags & FL_CLOSE)
+ fl->c.flc_type = F_UNLCK;
+ if (fl->c.flc_flags & FL_CLOSE)
set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags);
data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid);
@@ -7045,11 +7040,11 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
struct rpc_task *task;
struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
int status = 0;
- unsigned char fl_flags = request->fl_flags;
+ unsigned char saved_flags = request->c.flc_flags;
status = nfs4_set_lock_state(state, request);
/* Unlock _before_ we do the RPC call */
- request->fl_flags |= FL_EXISTS;
+ request->c.flc_flags |= FL_EXISTS;
/* Exclude nfs_delegation_claim_locks() */
mutex_lock(&sp->so_delegreturn_mutex);
/* Exclude nfs4_reclaim_open_stateid() - note nesting! */
@@ -7073,14 +7068,16 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
status = -ENOMEM;
if (IS_ERR(seqid))
goto out;
- task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
+ task = nfs4_do_unlck(request,
+ nfs_file_open_context(request->c.flc_file),
+ lsp, seqid);
status = PTR_ERR(task);
if (IS_ERR(task))
goto out;
status = rpc_wait_for_completion_task(task);
rpc_put_task(task);
out:
- request->fl_flags = fl_flags;
+ request->c.flc_flags = saved_flags;
trace_nfs4_unlock(request, state, F_SETLK, status);
return status;
}
@@ -7191,7 +7188,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
data->timestamp);
if (data->arg.new_lock && !data->cancelled) {
- data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
+ data->fl.c.flc_flags &= ~(FL_SLEEP | FL_ACCESS);
if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0)
goto out_restart;
}
@@ -7292,7 +7289,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE))
task_setup_data.flags |= RPC_TASK_MOVEABLE;
- data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
+ data = nfs4_alloc_lockdata(fl,
+ nfs_file_open_context(fl->c.flc_file),
fl->fl_u.nfs4_fl.owner, GFP_KERNEL);
if (data == NULL)
return -ENOMEM;
@@ -7398,10 +7396,10 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
{
struct nfs_inode *nfsi = NFS_I(state->inode);
struct nfs4_state_owner *sp = state->owner;
- unsigned char fl_flags = request->fl_flags;
+ unsigned char flags = request->c.flc_flags;
int status;
- request->fl_flags |= FL_ACCESS;
+ request->c.flc_flags |= FL_ACCESS;
status = locks_lock_inode_wait(state->inode, request);
if (status < 0)
goto out;
@@ -7410,7 +7408,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
/* Yes: cache locks! */
/* ...but avoid races with delegation recall... */
- request->fl_flags = fl_flags & ~FL_SLEEP;
+ request->c.flc_flags = flags & ~FL_SLEEP;
status = locks_lock_inode_wait(state->inode, request);
up_read(&nfsi->rwsem);
mutex_unlock(&sp->so_delegreturn_mutex);
@@ -7420,7 +7418,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
mutex_unlock(&sp->so_delegreturn_mutex);
status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
out:
- request->fl_flags = fl_flags;
+ request->c.flc_flags = flags;
return status;
}
@@ -7562,7 +7560,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
if (!(IS_SETLK(cmd) || IS_SETLKW(cmd)))
return -EINVAL;
- if (request->fl_type == F_UNLCK) {
+ if (lock_is_unlock(request)) {
if (state != NULL)
return nfs4_proc_unlck(state, cmd, request);
return 0;
@@ -7571,7 +7569,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
if (state == NULL)
return -ENOLCK;
- if ((request->fl_flags & FL_POSIX) &&
+ if ((request->c.flc_flags & FL_POSIX) &&
!test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
return -ENOLCK;
@@ -7579,7 +7577,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
* Don't rely on the VFS having checked the file open mode,
* since it won't do this for flock() locks.
*/
- switch (request->fl_type) {
+ switch (request->c.flc_type) {
case F_RDLCK:
if (!(filp->f_mode & FMODE_READ))
return -EBADF;
@@ -7601,7 +7599,7 @@ static int nfs4_delete_lease(struct file *file, void **priv)
return generic_setlease(file, F_UNLCK, NULL, priv);
}
-static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease,
+static int nfs4_add_lease(struct file *file, int arg, struct file_lease **lease,
void **priv)
{
struct inode *inode = file_inode(file);
@@ -7619,7 +7617,7 @@ static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease,
return -EAGAIN;
}
-int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease,
+int nfs4_proc_setlease(struct file *file, int arg, struct file_lease **lease,
void **priv)
{
switch (arg) {
@@ -8970,10 +8968,12 @@ try_again:
return;
status = task->tk_status;
- if (status == 0)
+ if (status == 0) {
status = nfs4_detect_session_trunking(adata->clp,
task->tk_msg.rpc_resp, xprt);
-
+ trace_nfs4_trunked_exchange_id(adata->clp,
+ xprt->address_strings[RPC_DISPLAY_ADDR], status);
+ }
if (status == 0)
rpc_clnt_xprt_switch_add_xprt(clnt, xprt);
else if (status != -NFS4ERR_DELAY && rpc_clnt_xprt_switch_has_addr(clnt,
@@ -10615,29 +10615,33 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
{
ssize_t error, error2, error3;
+ size_t left = size;
- error = generic_listxattr(dentry, list, size);
+ error = generic_listxattr(dentry, list, left);
if (error < 0)
return error;
if (list) {
list += error;
- size -= error;
+ left -= error;
}
- error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size);
+ error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, left);
if (error2 < 0)
return error2;
if (list) {
list += error2;
- size -= error2;
+ left -= error2;
}
- error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, size);
+ error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, left);
if (error3 < 0)
return error3;
- return error + error2 + error3;
+ error += error2 + error3;
+ if (size && error > size)
+ return -ERANGE;
+ return error;
}
static void nfs4_enable_swap(struct inode *inode)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 9a5d911a7edc..662e86ea3a2d 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -513,7 +513,6 @@ nfs4_alloc_state_owner(struct nfs_server *server,
nfs4_init_seqid_counter(&sp->so_seqid);
atomic_set(&sp->so_count, 1);
INIT_LIST_HEAD(&sp->so_lru);
- seqcount_spinlock_init(&sp->so_reclaim_seqcount, &sp->so_lock);
mutex_init(&sp->so_delegreturn_mutex);
return sp;
}
@@ -847,15 +846,15 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
*/
static struct nfs4_lock_state *
__nfs4_find_lock_state(struct nfs4_state *state,
- fl_owner_t fl_owner, fl_owner_t fl_owner2)
+ fl_owner_t owner, fl_owner_t owner2)
{
struct nfs4_lock_state *pos, *ret = NULL;
list_for_each_entry(pos, &state->lock_states, ls_locks) {
- if (pos->ls_owner == fl_owner) {
+ if (pos->ls_owner == owner) {
ret = pos;
break;
}
- if (pos->ls_owner == fl_owner2)
+ if (pos->ls_owner == owner2)
ret = pos;
}
if (ret)
@@ -868,7 +867,7 @@ __nfs4_find_lock_state(struct nfs4_state *state,
* exists, return an uninitialized one.
*
*/
-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t owner)
{
struct nfs4_lock_state *lsp;
struct nfs_server *server = state->owner->so_server;
@@ -879,7 +878,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
nfs4_init_seqid_counter(&lsp->ls_seqid);
refcount_set(&lsp->ls_count, 1);
lsp->ls_state = state;
- lsp->ls_owner = fl_owner;
+ lsp->ls_owner = owner;
lsp->ls_seqid.owner_id = ida_alloc(&server->lockowner_id, GFP_KERNEL_ACCOUNT);
if (lsp->ls_seqid.owner_id < 0)
goto out_free;
@@ -980,7 +979,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
if (fl->fl_ops != NULL)
return 0;
- lsp = nfs4_get_lock_state(state, fl->fl_owner);
+ lsp = nfs4_get_lock_state(state, fl->c.flc_owner);
if (lsp == NULL)
return -ENOMEM;
fl->fl_u.nfs4_fl.owner = lsp;
@@ -993,7 +992,7 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
const struct nfs_lock_context *l_ctx)
{
struct nfs4_lock_state *lsp;
- fl_owner_t fl_owner, fl_flock_owner;
+ fl_owner_t owner, fl_flock_owner;
int ret = -ENOENT;
if (l_ctx == NULL)
@@ -1002,11 +1001,11 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
goto out;
- fl_owner = l_ctx->lockowner;
+ owner = l_ctx->lockowner;
fl_flock_owner = l_ctx->open_context->flock_owner;
spin_lock(&state->state_lock);
- lsp = __nfs4_find_lock_state(state, fl_owner, fl_flock_owner);
+ lsp = __nfs4_find_lock_state(state, owner, fl_flock_owner);
if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
ret = -EIO;
else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
@@ -1529,8 +1528,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
down_write(&nfsi->rwsem);
spin_lock(&flctx->flc_lock);
restart:
- list_for_each_entry(fl, list, fl_list) {
- if (nfs_file_open_context(fl->fl_file)->state != state)
+ for_each_file_lock(fl, list) {
+ if (nfs_file_open_context(fl->c.flc_file)->state != state)
continue;
spin_unlock(&flctx->flc_lock);
status = ops->recover_lock(state, fl);
@@ -1667,7 +1666,6 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp,
* server that doesn't support a grace period.
*/
spin_lock(&sp->so_lock);
- raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
restart:
list_for_each_entry(state, &sp->so_states, open_states) {
if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
@@ -1735,7 +1733,6 @@ restart:
spin_lock(&sp->so_lock);
goto restart;
}
- raw_write_seqcount_end(&sp->so_reclaim_seqcount);
spin_unlock(&sp->so_lock);
#ifdef CONFIG_NFS_V4_2
if (found_ssc_copy_state)
@@ -1745,7 +1742,6 @@ restart:
out_err:
nfs4_put_open_state(state);
spin_lock(&sp->so_lock);
- raw_write_seqcount_end(&sp->so_reclaim_seqcount);
spin_unlock(&sp->so_lock);
return status;
}
@@ -1928,9 +1924,12 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov
struct nfs_server *server;
struct rb_node *pos;
LIST_HEAD(freeme);
- int status = 0;
int lost_locks = 0;
+ int status;
+ status = nfs4_begin_drain_session(clp);
+ if (status < 0)
+ return status;
restart:
rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
@@ -2694,6 +2693,9 @@ static void nfs4_state_manager(struct nfs_client *clp)
/* Detect expired delegations... */
if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) {
section = "detect expired delegations";
+ status = nfs4_begin_drain_session(clp);
+ if (status < 0)
+ goto out_error;
nfs_reap_expired_delegations(clp);
continue;
}
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index d09bcfd7db89..8da5a9c000f4 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -145,6 +145,7 @@ static int do_nfs4_mount(struct nfs_server *server,
const char *export_path)
{
struct nfs_fs_context *root_ctx;
+ struct nfs_fs_context *ctx;
struct fs_context *root_fc;
struct vfsmount *root_mnt;
struct dentry *dentry;
@@ -157,6 +158,12 @@ static int do_nfs4_mount(struct nfs_server *server,
.dirfd = -1,
};
+ struct fs_parameter param_fsc = {
+ .key = "fsc",
+ .type = fs_value_is_string,
+ .dirfd = -1,
+ };
+
if (IS_ERR(server))
return PTR_ERR(server);
@@ -168,9 +175,26 @@ static int do_nfs4_mount(struct nfs_server *server,
kfree(root_fc->source);
root_fc->source = NULL;
+ ctx = nfs_fc2context(fc);
root_ctx = nfs_fc2context(root_fc);
root_ctx->internal = true;
root_ctx->server = server;
+
+ if (ctx->fscache_uniq) {
+ len = strlen(ctx->fscache_uniq);
+ param_fsc.size = len;
+ param_fsc.string = kmemdup_nul(ctx->fscache_uniq, len, GFP_KERNEL);
+ if (param_fsc.string == NULL) {
+ put_fs_context(root_fc);
+ return -ENOMEM;
+ }
+ ret = vfs_parse_fs_param(root_fc, &param_fsc);
+ kfree(param_fsc.string);
+ if (ret < 0) {
+ put_fs_context(root_fc);
+ return ret;
+ }
+ }
/* We leave export_path unset as it's not used to find the root. */
len = strlen(hostname) + 5;
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
index d9ac556bebcf..d22c6670f770 100644
--- a/fs/nfs/nfs4trace.c
+++ b/fs/nfs/nfs4trace.c
@@ -28,4 +28,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist);
EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_read_error);
EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_write_error);
EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_commit_error);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(fl_getdevinfo);
#endif
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index d27919d7241d..10985a4b8259 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -77,6 +77,36 @@ DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session);
DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence);
DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete);
+TRACE_EVENT(nfs4_trunked_exchange_id,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const char *addr,
+ int error
+ ),
+
+ TP_ARGS(clp, addr, error),
+
+ TP_STRUCT__entry(
+ __string(main_addr, clp->cl_hostname)
+ __string(trunk_addr, addr)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0;
+ __assign_str(main_addr, clp->cl_hostname);
+ __assign_str(trunk_addr, addr);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) main_addr=%s trunk_addr=%s",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __get_str(main_addr),
+ __get_str(trunk_addr)
+ )
+);
+
TRACE_EVENT(nfs4_sequence_done,
TP_PROTO(
const struct nfs4_session *session,
@@ -699,7 +729,7 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
__entry->error = error < 0 ? -error : 0;
__entry->cmd = cmd;
- __entry->type = request->fl_type;
+ __entry->type = request->c.flc_type;
__entry->start = request->fl_start;
__entry->end = request->fl_end;
__entry->dev = inode->i_sb->s_dev;
@@ -771,7 +801,7 @@ TRACE_EVENT(nfs4_set_lock,
__entry->error = error < 0 ? -error : 0;
__entry->cmd = cmd;
- __entry->type = request->fl_type;
+ __entry->type = request->c.flc_type;
__entry->start = request->fl_start;
__entry->end = request->fl_end;
__entry->dev = inode->i_sb->s_dev;
@@ -1991,6 +2021,34 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_status,
DEFINE_PNFS_DEVICEID_STATUS(nfs4_getdeviceinfo);
DEFINE_PNFS_DEVICEID_STATUS(nfs4_find_deviceid);
+TRACE_EVENT(fl_getdevinfo,
+ TP_PROTO(
+ const struct nfs_server *server,
+ const struct nfs4_deviceid *deviceid,
+ char *ds_remotestr
+ ),
+ TP_ARGS(server, deviceid, ds_remotestr),
+
+ TP_STRUCT__entry(
+ __string(mds_addr, server->nfs_client->cl_hostname)
+ __array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE)
+ __string(ds_ips, ds_remotestr)
+ ),
+
+ TP_fast_assign(
+ __assign_str(mds_addr, server->nfs_client->cl_hostname);
+ __assign_str(ds_ips, ds_remotestr);
+ memcpy(__entry->deviceid, deviceid->data,
+ NFS4_DEVICEID4_SIZE);
+ ),
+ TP_printk(
+ "deviceid=%s, mds_addr=%s, ds_ips=%s",
+ __print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE),
+ __get_str(mds_addr),
+ __get_str(ds_ips)
+ )
+);
+
DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
TP_PROTO(
const struct nfs_pgio_header *hdr
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 69406e60f391..1416099dfcd1 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1305,7 +1305,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
static inline int nfs4_lock_type(struct file_lock *fl, int block)
{
- if (fl->fl_type == F_RDLCK)
+ if (lock_is_read(fl))
return block ? NFS4_READW_LT : NFS4_READ_LT;
return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT;
}
@@ -5052,10 +5052,10 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
fl->fl_end = fl->fl_start + (loff_t)length - 1;
if (length == ~(uint64_t)0)
fl->fl_end = OFFSET_MAX;
- fl->fl_type = F_WRLCK;
+ fl->c.flc_type = F_WRLCK;
if (type & 1)
- fl->fl_type = F_RDLCK;
- fl->fl_pid = 0;
+ fl->c.flc_type = F_RDLCK;
+ fl->c.flc_pid = 0;
}
p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 7600100ba26f..432612d22437 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -175,10 +175,10 @@ static int __init root_nfs_cat(char *dest, const char *src,
size_t len = strlen(dest);
if (len && dest[len - 1] != ',')
- if (strlcat(dest, ",", destlen) > destlen)
+ if (strlcat(dest, ",", destlen) >= destlen)
return -1;
- if (strlcat(dest, src, destlen) > destlen)
+ if (strlcat(dest, src, destlen) >= destlen)
return -1;
return 0;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0c0fed1ecd0b..a5cc6199127f 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1999,6 +1999,14 @@ pnfs_update_layout(struct inode *ino,
}
lookup_again:
+ if (!nfs4_valid_open_stateid(ctx->state)) {
+ trace_pnfs_update_layout(ino, pos, count,
+ iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_INVALID_OPEN);
+ lseg = ERR_PTR(-EIO);
+ goto out;
+ }
+
lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp));
if (IS_ERR(lseg))
goto out;
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index afd23910f3bf..88e061bd711b 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -919,6 +919,8 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
list_for_each_entry(da, &ds->ds_addrs, da_node) {
+ char servername[48];
+
dprintk("%s: DS %s: trying address %s\n",
__func__, ds->ds_remotestr, da->da_remotestr);
@@ -929,6 +931,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
.dstaddr = (struct sockaddr *)&da->da_addr,
.addrlen = da->da_addrlen,
.servername = clp->cl_hostname,
+ .xprtsec = clp->cl_xprtsec,
};
struct nfs4_add_xprt_data xprtdata = {
.clp = clp,
@@ -938,10 +941,45 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
.data = &xprtdata,
};
- if (da->da_transport != clp->cl_proto)
+ if (da->da_transport != clp->cl_proto &&
+ clp->cl_proto != XPRT_TRANSPORT_TCP_TLS)
continue;
+ if (da->da_transport == XPRT_TRANSPORT_TCP &&
+ mds_srv->nfs_client->cl_proto ==
+ XPRT_TRANSPORT_TCP_TLS) {
+ struct sockaddr *addr =
+ (struct sockaddr *)&da->da_addr;
+ struct sockaddr_in *sin =
+ (struct sockaddr_in *)&da->da_addr;
+ struct sockaddr_in6 *sin6 =
+ (struct sockaddr_in6 *)&da->da_addr;
+
+ /* for NFS with TLS we need to supply a correct
+ * servername of the trunked transport, not the
+ * servername of the main transport stored in
+ * clp->cl_hostname. Also set the protocol to
+ * indicate that TLS is to be used
+ */
+ servername[0] = '\0';
+ switch(addr->sa_family) {
+ case AF_INET:
+ snprintf(servername, sizeof(servername),
+ "%pI4", &sin->sin_addr.s_addr);
+ break;
+ case AF_INET6:
+ snprintf(servername, sizeof(servername),
+ "%pI6", &sin6->sin6_addr);
+ break;
+ default:
+ /* do not consider this address */
+ continue;
+ }
+ xprt_args.ident = XPRT_TRANSPORT_TCP_TLS;
+ xprt_args.servername = servername;
+ }
if (da->da_addr.ss_family != clp->cl_addr.ss_family)
continue;
+
/**
* Test this address for session trunking and
* add as an alias
@@ -953,6 +991,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
if (xprtdata.cred)
put_cred(xprtdata.cred);
} else {
+ if (da->da_transport == XPRT_TRANSPORT_TCP &&
+ mds_srv->nfs_client->cl_proto ==
+ XPRT_TRANSPORT_TCP_TLS)
+ da->da_transport = XPRT_TRANSPORT_TCP_TLS;
clp = nfs4_set_ds_client(mds_srv,
&da->da_addr,
da->da_addrlen,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 7dc21a48e3e7..a142287d86f6 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -305,6 +305,8 @@ int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
new = nfs_page_create_from_folio(ctx, folio, 0, aligned_len);
if (IS_ERR(new)) {
error = PTR_ERR(new);
+ if (nfs_netfs_folio_unlock(folio))
+ folio_unlock(folio);
goto out;
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 075b31c93f87..dc03f98f7616 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -516,8 +516,16 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
else
nfs_show_nfsv4_options(m, nfss, showdefaults);
- if (nfss->options & NFS_OPTION_FSCACHE)
+ if (nfss->options & NFS_OPTION_FSCACHE) {
+#ifdef CONFIG_NFS_FSCACHE
+ if (nfss->fscache_uniq)
+ seq_printf(m, ",fsc=%s", nfss->fscache_uniq);
+ else
+ seq_puts(m, ",fsc");
+#else
seq_puts(m, ",fsc");
+#endif
+ }
if (nfss->options & NFS_OPTION_MIGRATION)
seq_puts(m, ",migration");
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index bb79d3a886ae..5de85d725fb9 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -667,10 +667,6 @@ static int nfs_writepage_locked(struct folio *folio,
struct inode *inode = folio_file_mapping(folio)->host;
int err;
- if (wbc->sync_mode == WB_SYNC_NONE &&
- NFS_SERVER(inode)->write_congested)
- return AOP_WRITEPAGE_ACTIVATE;
-
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_pageio_init_write(&pgio, inode, 0, false,
&nfs_async_write_completion_ops);
@@ -1301,7 +1297,7 @@ static bool
is_whole_file_wrlock(struct file_lock *fl)
{
return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX &&
- fl->fl_type == F_WRLCK;
+ lock_is_write(fl);
}
/* If we know the page is up to date, and we're not using byte range locks (or
@@ -1335,13 +1331,13 @@ static int nfs_can_extend_write(struct file *file, struct folio *folio,
spin_lock(&flctx->flc_lock);
if (!list_empty(&flctx->flc_posix)) {
fl = list_first_entry(&flctx->flc_posix, struct file_lock,
- fl_list);
+ c.flc_list);
if (is_whole_file_wrlock(fl))
ret = 1;
} else if (!list_empty(&flctx->flc_flock)) {
fl = list_first_entry(&flctx->flc_flock, struct file_lock,
- fl_list);
- if (fl->fl_type == F_WRLCK)
+ c.flc_list);
+ if (lock_is_write(fl))
ret = 1;
}
spin_unlock(&flctx->flc_lock);
@@ -1650,7 +1646,7 @@ static int wait_on_commit(struct nfs_mds_commit_info *cinfo)
!atomic_read(&cinfo->rpcs_out));
}
-static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
+void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
{
atomic_inc(&cinfo->rpcs_out);
}
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 46fd74d91ea9..3c040c81c77d 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -328,10 +328,10 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode,
}
static void
-nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
+nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
{
struct nfs4_client *clp = ls->ls_stid.sc_client;
- struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev;
+ struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
nfsd4_scsi_pr_key(clp), 0, true);
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 4cbe0434cbb8..66a05fefae98 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -80,8 +80,6 @@ enum {
int nfsd_drc_slab_create(void);
void nfsd_drc_slab_free(void);
-int nfsd_net_reply_cache_init(struct nfsd_net *nn);
-void nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
int nfsd_reply_cache_init(struct nfsd_net *);
void nfsd_reply_cache_shutdown(struct nfsd_net *);
int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 9cb7f0c33df5..ddd3e0d9cfa6 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -61,13 +61,10 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age);
static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions);
struct nfsd_fcache_disposal {
- struct work_struct work;
spinlock_t lock;
struct list_head freeme;
};
-static struct workqueue_struct *nfsd_filecache_wq __read_mostly;
-
static struct kmem_cache *nfsd_file_slab;
static struct kmem_cache *nfsd_file_mark_slab;
static struct list_lru nfsd_file_lru;
@@ -283,7 +280,7 @@ nfsd_file_free(struct nfsd_file *nf)
nfsd_file_mark_put(nf->nf_mark);
if (nf->nf_file) {
nfsd_file_check_write_error(nf);
- filp_close(nf->nf_file, NULL);
+ nfsd_filp_close(nf->nf_file);
}
/*
@@ -421,7 +418,37 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
spin_lock(&l->lock);
list_move_tail(&nf->nf_lru, &l->freeme);
spin_unlock(&l->lock);
- queue_work(nfsd_filecache_wq, &l->work);
+ svc_wake_up(nn->nfsd_serv);
+ }
+}
+
+/**
+ * nfsd_file_net_dispose - deal with nfsd_files waiting to be disposed.
+ * @nn: nfsd_net in which to find files to be disposed.
+ *
+ * When files held open for nfsv3 are removed from the filecache, whether
+ * due to memory pressure or garbage collection, they are queued to
+ * a per-net-ns queue. This function completes the disposal, either
+ * directly or by waking another nfsd thread to help with the work.
+ */
+void nfsd_file_net_dispose(struct nfsd_net *nn)
+{
+ struct nfsd_fcache_disposal *l = nn->fcache_disposal;
+
+ if (!list_empty(&l->freeme)) {
+ LIST_HEAD(dispose);
+ int i;
+
+ spin_lock(&l->lock);
+ for (i = 0; i < 8 && !list_empty(&l->freeme); i++)
+ list_move(l->freeme.next, &dispose);
+ spin_unlock(&l->lock);
+ if (!list_empty(&l->freeme))
+ /* Wake up another thread to share the work
+ * *before* doing any actual disposing.
+ */
+ svc_wake_up(nn->nfsd_serv);
+ nfsd_file_dispose_list(&dispose);
}
}
@@ -631,28 +658,6 @@ nfsd_file_close_inode_sync(struct inode *inode)
list_del_init(&nf->nf_lru);
nfsd_file_free(nf);
}
- flush_delayed_fput();
-}
-
-/**
- * nfsd_file_delayed_close - close unused nfsd_files
- * @work: dummy
- *
- * Scrape the freeme list for this nfsd_net, and then dispose of them
- * all.
- */
-static void
-nfsd_file_delayed_close(struct work_struct *work)
-{
- LIST_HEAD(head);
- struct nfsd_fcache_disposal *l = container_of(work,
- struct nfsd_fcache_disposal, work);
-
- spin_lock(&l->lock);
- list_splice_init(&l->freeme, &head);
- spin_unlock(&l->lock);
-
- nfsd_file_dispose_list(&head);
}
static int
@@ -662,8 +667,8 @@ nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
struct file_lock *fl = data;
/* Only close files for F_SETLEASE leases */
- if (fl->fl_flags & FL_LEASE)
- nfsd_file_close_inode(file_inode(fl->fl_file));
+ if (fl->c.flc_flags & FL_LEASE)
+ nfsd_file_close_inode(file_inode(fl->c.flc_file));
return 0;
}
@@ -717,25 +722,18 @@ nfsd_file_cache_init(void)
return ret;
ret = -ENOMEM;
- nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", WQ_UNBOUND, 0);
- if (!nfsd_filecache_wq)
- goto out;
-
- nfsd_file_slab = kmem_cache_create("nfsd_file",
- sizeof(struct nfsd_file), 0, 0, NULL);
+ nfsd_file_slab = KMEM_CACHE(nfsd_file, 0);
if (!nfsd_file_slab) {
pr_err("nfsd: unable to create nfsd_file_slab\n");
goto out_err;
}
- nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark",
- sizeof(struct nfsd_file_mark), 0, 0, NULL);
+ nfsd_file_mark_slab = KMEM_CACHE(nfsd_file_mark, 0);
if (!nfsd_file_mark_slab) {
pr_err("nfsd: unable to create nfsd_file_mark_slab\n");
goto out_err;
}
-
ret = list_lru_init(&nfsd_file_lru);
if (ret) {
pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret);
@@ -785,8 +783,6 @@ out_err:
nfsd_file_slab = NULL;
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- destroy_workqueue(nfsd_filecache_wq);
- nfsd_filecache_wq = NULL;
rhltable_destroy(&nfsd_file_rhltable);
goto out;
}
@@ -832,7 +828,6 @@ nfsd_alloc_fcache_disposal(void)
l = kmalloc(sizeof(*l), GFP_KERNEL);
if (!l)
return NULL;
- INIT_WORK(&l->work, nfsd_file_delayed_close);
spin_lock_init(&l->lock);
INIT_LIST_HEAD(&l->freeme);
return l;
@@ -841,7 +836,6 @@ nfsd_alloc_fcache_disposal(void)
static void
nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l)
{
- cancel_work_sync(&l->work);
nfsd_file_dispose_list(&l->freeme);
kfree(l);
}
@@ -910,8 +904,6 @@ nfsd_file_cache_shutdown(void)
fsnotify_wait_marks_destroyed();
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- destroy_workqueue(nfsd_filecache_wq);
- nfsd_filecache_wq = NULL;
rhltable_destroy(&nfsd_file_rhltable);
for_each_possible_cpu(i) {
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index e54165a3224f..c61884def906 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -56,6 +56,7 @@ void nfsd_file_cache_shutdown_net(struct net *net);
void nfsd_file_put(struct nfsd_file *nf);
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
void nfsd_file_close_inode_sync(struct inode *inode);
+void nfsd_file_net_dispose(struct nfsd_net *nn);
bool nfsd_file_is_cached(struct inode *inode);
__be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **nfp);
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 74b4360779a1..d4be519b5734 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -11,8 +11,10 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <linux/filelock.h>
+#include <linux/nfs4.h>
#include <linux/percpu_counter.h>
#include <linux/siphash.h>
+#include <linux/sunrpc/stats.h>
/* Hash tables for nfs4_clientid state */
#define CLIENT_HASH_BITS 4
@@ -26,10 +28,22 @@ struct nfsd4_client_tracking_ops;
enum {
/* cache misses due only to checksum comparison failures */
- NFSD_NET_PAYLOAD_MISSES,
+ NFSD_STATS_PAYLOAD_MISSES,
/* amount of memory (in bytes) currently consumed by the DRC */
- NFSD_NET_DRC_MEM_USAGE,
- NFSD_NET_COUNTERS_NUM
+ NFSD_STATS_DRC_MEM_USAGE,
+ NFSD_STATS_RC_HITS, /* repcache hits */
+ NFSD_STATS_RC_MISSES, /* repcache misses */
+ NFSD_STATS_RC_NOCACHE, /* uncached reqs */
+ NFSD_STATS_FH_STALE, /* FH stale error */
+ NFSD_STATS_IO_READ, /* bytes returned to read requests */
+ NFSD_STATS_IO_WRITE, /* bytes passed in write requests */
+#ifdef CONFIG_NFSD_V4
+ NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */
+ NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP,
+#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op))
+ NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */
+#endif
+ NFSD_STATS_COUNTERS_NUM
};
/*
@@ -164,7 +178,10 @@ struct nfsd_net {
atomic_t num_drc_entries;
/* Per-netns stats counters */
- struct percpu_counter counter[NFSD_NET_COUNTERS_NUM];
+ struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM];
+
+ /* sunrpc svc stats */
+ struct svc_stat nfsd_svcstats;
/* longest hash chain seen */
unsigned int longest_chain;
@@ -192,6 +209,10 @@ struct nfsd_net {
atomic_t nfsd_courtesy_clients;
struct shrinker *nfsd_client_shrinker;
struct work_struct nfsd_shrinker_work;
+
+ /* last time an admin-revoke happened for NFSv4.0 */
+ time64_t nfs40_last_revoke;
+
};
/* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index b78eceebd945..dfcc957e460d 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -71,13 +71,15 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp)
struct nfsd_attrs attrs = {
.na_iattr = &argp->attrs,
};
+ const struct timespec64 *guardtime = NULL;
dprintk("nfsd: SETATTR(3) %s\n",
SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs,
- argp->check_guard, argp->guardtime);
+ if (argp->check_guard)
+ guardtime = &argp->guardtime;
+ resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs, guardtime);
return rpc_success;
}
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f32128955ec8..a7a07470c1f8 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -295,17 +295,14 @@ svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
static bool
svcxdr_decode_sattrguard3(struct xdr_stream *xdr, struct nfsd3_sattrargs *args)
{
- __be32 *p;
u32 check;
if (xdr_stream_decode_bool(xdr, &check) < 0)
return false;
if (check) {
- p = xdr_inline_decode(xdr, XDR_UNIT * 2);
- if (!p)
+ if (!svcxdr_decode_nfstime3(xdr, &args->guardtime))
return false;
args->check_guard = 1;
- args->guardtime = be32_to_cpup(p);
} else
args->check_guard = 0;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 926c29879c6a..87c9547989f6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -45,7 +45,7 @@
#define NFSDDBG_FACILITY NFSDDBG_PROC
-static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
+static void nfsd4_mark_cb_fault(struct nfs4_client *clp);
#define NFSPROC4_CB_NULL 0
#define NFSPROC4_CB_COMPOUND 1
@@ -85,7 +85,21 @@ static void encode_uint32(struct xdr_stream *xdr, u32 n)
static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap,
size_t len)
{
- WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0);
+ xdr_stream_encode_uint32_array(xdr, bitmap, len);
+}
+
+static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs4_cb_fattr *fattr)
+{
+ fattr->ncf_cb_change = 0;
+ fattr->ncf_cb_fsize = 0;
+ if (bitmap[0] & FATTR4_WORD0_CHANGE)
+ if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_change) < 0)
+ return -NFSERR_BAD_XDR;
+ if (bitmap[0] & FATTR4_WORD0_SIZE)
+ if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_fsize) < 0)
+ return -NFSERR_BAD_XDR;
+ return 0;
}
static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op)
@@ -334,6 +348,30 @@ encode_cb_recallany4args(struct xdr_stream *xdr,
}
/*
+ * CB_GETATTR4args
+ * struct CB_GETATTR4args {
+ * nfs_fh4 fh;
+ * bitmap4 attr_request;
+ * };
+ *
+ * The size and change attributes are the only ones
+ * guaranteed to be serviced by the client.
+ */
+static void
+encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
+ struct nfs4_cb_fattr *fattr)
+{
+ struct nfs4_delegation *dp =
+ container_of(fattr, struct nfs4_delegation, dl_cb_fattr);
+ struct knfsd_fh *fh = &dp->dl_stid.sc_file->fi_fhandle;
+
+ encode_nfs_cb_opnum4(xdr, OP_CB_GETATTR);
+ encode_nfs_fh4(xdr, fh);
+ encode_bitmap4(xdr, fattr->ncf_cb_bmap, ARRAY_SIZE(fattr->ncf_cb_bmap));
+ hdr->nops++;
+}
+
+/*
* CB_SEQUENCE4args
*
* struct CB_SEQUENCE4args {
@@ -469,6 +507,26 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
}
/*
+ * 20.1. Operation 3: CB_GETATTR - Get Attributes
+ */
+static void nfs4_xdr_enc_cb_getattr(struct rpc_rqst *req,
+ struct xdr_stream *xdr, const void *data)
+{
+ const struct nfsd4_callback *cb = data;
+ struct nfs4_cb_fattr *ncf =
+ container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+ struct nfs4_cb_compound_hdr hdr = {
+ .ident = cb->cb_clp->cl_cb_ident,
+ .minorversion = cb->cb_clp->cl_minorversion,
+ };
+
+ encode_cb_compound4args(xdr, &hdr);
+ encode_cb_sequence4args(xdr, cb, &hdr);
+ encode_cb_getattr4args(xdr, &hdr, ncf);
+ encode_cb_nops(&hdr);
+}
+
+/*
* 20.2. Operation 4: CB_RECALL - Recall a Delegation
*/
static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -524,6 +582,42 @@ static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
}
/*
+ * 20.1. Operation 3: CB_GETATTR - Get Attributes
+ */
+static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfsd4_callback *cb = data;
+ struct nfs4_cb_compound_hdr hdr;
+ int status;
+ u32 bitmap[3] = {0};
+ u32 attrlen;
+ struct nfs4_cb_fattr *ncf =
+ container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+
+ status = decode_cb_compound4res(xdr, &hdr);
+ if (unlikely(status))
+ return status;
+
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status || cb->cb_seq_status))
+ return status;
+
+ status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status);
+ if (status)
+ return status;
+ if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0)
+ return -NFSERR_BAD_XDR;
+ if (xdr_stream_decode_u32(xdr, &attrlen) < 0)
+ return -NFSERR_BAD_XDR;
+ if (attrlen > (sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize)))
+ return -NFSERR_BAD_XDR;
+ status = decode_cb_fattr4(xdr, bitmap, ncf);
+ return status;
+}
+
+/*
* 20.2. Operation 4: CB_RECALL - Recall a Delegation
*/
static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
@@ -674,7 +768,7 @@ static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req,
const struct nfsd4_callback *cb = data;
const struct nfsd4_blocked_lock *nbl =
container_of(cb, struct nfsd4_blocked_lock, nbl_cb);
- struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.fl_owner;
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.c.flc_owner;
struct nfs4_cb_compound_hdr hdr = {
.ident = 0,
.minorversion = cb->cb_clp->cl_minorversion,
@@ -831,6 +925,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = {
PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock),
PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload),
PROC(CB_RECALL_ANY, COMPOUND, cb_recall_any, cb_recall_any),
+ PROC(CB_GETATTR, COMPOUND, cb_getattr, cb_getattr),
};
static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)];
@@ -887,7 +982,16 @@ static struct workqueue_struct *callback_wq;
static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
{
- return queue_work(callback_wq, &cb->cb_work);
+ trace_nfsd_cb_queue(cb->cb_clp, cb);
+ return queue_delayed_work(callback_wq, &cb->cb_work, 0);
+}
+
+static void nfsd4_queue_cb_delayed(struct nfsd4_callback *cb,
+ unsigned long msecs)
+{
+ trace_nfsd_cb_queue(cb->cb_clp, cb);
+ queue_delayed_work(callback_wq, &cb->cb_work,
+ msecs_to_jiffies(msecs));
}
static void nfsd41_cb_inflight_begin(struct nfs4_client *clp)
@@ -999,18 +1103,18 @@ static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate)
{
if (clp->cl_cb_state != newstate) {
clp->cl_cb_state = newstate;
- trace_nfsd_cb_state(clp);
+ trace_nfsd_cb_new_state(clp);
}
}
-static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
+static void nfsd4_mark_cb_down(struct nfs4_client *clp)
{
if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
return;
nfsd4_mark_cb_state(clp, NFSD4_CB_DOWN);
}
-static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
+static void nfsd4_mark_cb_fault(struct nfs4_client *clp)
{
if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
return;
@@ -1022,7 +1126,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
if (task->tk_status)
- nfsd4_mark_cb_down(clp, task->tk_status);
+ nfsd4_mark_cb_down(clp);
else
nfsd4_mark_cb_state(clp, NFSD4_CB_UP);
}
@@ -1106,6 +1210,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
{
struct nfs4_client *clp = cb->cb_clp;
+ trace_nfsd_cb_destroy(clp, cb);
nfsd41_cb_release_slot(cb);
if (cb->cb_ops && cb->cb_ops->release)
cb->cb_ops->release(cb);
@@ -1158,6 +1263,8 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
if (!cb->cb_holds_slot)
goto need_restart;
+ /* This is the operation status code for CB_SEQUENCE */
+ trace_nfsd_cb_seq_status(task, cb);
switch (cb->cb_seq_status) {
case 0:
/*
@@ -1171,13 +1278,23 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
break;
case -ESERVERFAULT:
++session->se_cb_seq_nr;
- fallthrough;
+ nfsd4_mark_cb_fault(cb->cb_clp);
+ ret = false;
+ break;
case 1:
+ /*
+ * cb_seq_status remains 1 if an RPC Reply was never
+ * received. NFSD can't know if the client processed
+ * the CB_SEQUENCE operation. Ask the client to send a
+ * DESTROY_SESSION to recover.
+ */
+ fallthrough;
case -NFS4ERR_BADSESSION:
- nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
+ nfsd4_mark_cb_fault(cb->cb_clp);
ret = false;
- break;
+ goto need_restart;
case -NFS4ERR_DELAY:
+ cb->cb_seq_status = 1;
if (!rpc_restart_call(task))
goto out;
@@ -1192,14 +1309,11 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
}
break;
default:
- nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
- dprintk("%s: unprocessed error %d\n", __func__,
- cb->cb_seq_status);
+ nfsd4_mark_cb_fault(cb->cb_clp);
}
-
nfsd41_cb_release_slot(cb);
- dprintk("%s: freed slot, new seqid=%d\n", __func__,
- clp->cl_cb_session->se_cb_seq_nr);
+
+ trace_nfsd_cb_free_slot(task, cb);
if (RPC_SIGNALLED(task))
goto need_restart;
@@ -1211,6 +1325,7 @@ retry_nowait:
goto out;
need_restart:
if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) {
+ trace_nfsd_cb_restart(clp, cb);
task->tk_status = 0;
cb->cb_need_restart = true;
}
@@ -1240,7 +1355,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
case -EIO:
case -ETIMEDOUT:
case -EACCES:
- nfsd4_mark_cb_down(clp, task->tk_status);
+ nfsd4_mark_cb_down(clp);
}
break;
default:
@@ -1295,12 +1410,13 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
nfsd41_cb_inflight_wait_complete(clp);
}
-/* requires cl_lock: */
static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
{
struct nfsd4_session *s;
struct nfsd4_conn *c;
+ lockdep_assert_held(&clp->cl_lock);
+
list_for_each_entry(s, &clp->cl_sessions, se_perclnt) {
list_for_each_entry(c, &s->se_conns, cn_persession) {
if (c->cn_flags & NFS4_CDFC4_BACK)
@@ -1324,11 +1440,14 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
struct nfsd4_conn *c;
int err;
+ trace_nfsd_cb_bc_update(clp, cb);
+
/*
* This is either an update, or the client dying; in either case,
* kill the old client:
*/
if (clp->cl_cb_client) {
+ trace_nfsd_cb_bc_shutdown(clp, cb);
rpc_shutdown_client(clp->cl_cb_client);
clp->cl_cb_client = NULL;
put_cred(clp->cl_cb_cred);
@@ -1340,13 +1459,15 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
}
if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
return;
+
spin_lock(&clp->cl_lock);
/*
* Only serialized callback code is allowed to clear these
* flags; main nfsd code can only set them:
*/
- BUG_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK));
+ WARN_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK));
clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
+
memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
c = __nfsd4_find_backchannel(clp);
if (c) {
@@ -1358,7 +1479,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
err = setup_callback_client(clp, &conn, ses);
if (err) {
- nfsd4_mark_cb_down(clp, err);
+ nfsd4_mark_cb_down(clp);
if (c)
svc_xprt_put(c->cn_xprt);
return;
@@ -1369,25 +1490,28 @@ static void
nfsd4_run_cb_work(struct work_struct *work)
{
struct nfsd4_callback *cb =
- container_of(work, struct nfsd4_callback, cb_work);
+ container_of(work, struct nfsd4_callback, cb_work.work);
struct nfs4_client *clp = cb->cb_clp;
struct rpc_clnt *clnt;
int flags;
- if (cb->cb_need_restart) {
- cb->cb_need_restart = false;
- } else {
- if (cb->cb_ops && cb->cb_ops->prepare)
- cb->cb_ops->prepare(cb);
- }
+ trace_nfsd_cb_start(clp);
if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
nfsd4_process_cb_update(cb);
clnt = clp->cl_cb_client;
if (!clnt) {
- /* Callback channel broken, or client killed; give up: */
- nfsd41_destroy_cb(cb);
+ if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
+ nfsd41_destroy_cb(cb);
+ else {
+ /*
+ * XXX: Ideally, we could wait for the client to
+ * reconnect, but I haven't figured out how
+ * to do that yet.
+ */
+ nfsd4_queue_cb_delayed(cb, 25);
+ }
return;
}
@@ -1400,6 +1524,12 @@ nfsd4_run_cb_work(struct work_struct *work)
return;
}
+ if (cb->cb_need_restart) {
+ cb->cb_need_restart = false;
+ } else {
+ if (cb->cb_ops && cb->cb_ops->prepare)
+ cb->cb_ops->prepare(cb);
+ }
cb->cb_msg.rpc_cred = clp->cl_cb_cred;
flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,
@@ -1414,8 +1544,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
cb->cb_msg.rpc_argp = cb;
cb->cb_msg.rpc_resp = cb;
cb->cb_ops = ops;
- INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
- cb->cb_seq_status = 1;
+ INIT_DELAYED_WORK(&cb->cb_work, nfsd4_run_cb_work);
cb->cb_status = 0;
cb->cb_need_restart = false;
cb->cb_holds_slot = false;
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 5e8096bc5eaa..4f3072b5979a 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -25,7 +25,7 @@ static struct kmem_cache *nfs4_layout_cache;
static struct kmem_cache *nfs4_layout_stateid_cache;
static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
-static const struct lock_manager_operations nfsd4_layouts_lm_ops;
+static const struct lease_manager_operations nfsd4_layouts_lm_ops;
const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
#ifdef CONFIG_NFSD_FLEXFILELAYOUT
@@ -152,6 +152,23 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
#endif
}
+void nfsd4_close_layout(struct nfs4_layout_stateid *ls)
+{
+ struct nfsd_file *fl;
+
+ spin_lock(&ls->ls_stid.sc_file->fi_lock);
+ fl = ls->ls_file;
+ ls->ls_file = NULL;
+ spin_unlock(&ls->ls_stid.sc_file->fi_lock);
+
+ if (fl) {
+ if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
+ kernel_setlease(fl->nf_file, F_UNLCK, NULL,
+ (void **)&ls);
+ nfsd_file_put(fl);
+ }
+}
+
static void
nfsd4_free_layout_stateid(struct nfs4_stid *stid)
{
@@ -169,9 +186,7 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
list_del_init(&ls->ls_perfile);
spin_unlock(&fp->fi_lock);
- if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
- vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls);
- nfsd_file_put(ls->ls_file);
+ nfsd4_close_layout(ls);
if (ls->ls_recalled)
atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
@@ -182,27 +197,26 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
static int
nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
{
- struct file_lock *fl;
+ struct file_lease *fl;
int status;
if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
return 0;
- fl = locks_alloc_lock();
+ fl = locks_alloc_lease();
if (!fl)
return -ENOMEM;
- locks_init_lock(fl);
+ locks_init_lease(fl);
fl->fl_lmops = &nfsd4_layouts_lm_ops;
- fl->fl_flags = FL_LAYOUT;
- fl->fl_type = F_RDLCK;
- fl->fl_end = OFFSET_MAX;
- fl->fl_owner = ls;
- fl->fl_pid = current->tgid;
- fl->fl_file = ls->ls_file->nf_file;
-
- status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
+ fl->c.flc_flags = FL_LAYOUT;
+ fl->c.flc_type = F_RDLCK;
+ fl->c.flc_owner = ls;
+ fl->c.flc_pid = current->tgid;
+ fl->c.flc_file = ls->ls_file->nf_file;
+
+ status = kernel_setlease(fl->c.flc_file, fl->c.flc_type, &fl, NULL);
if (status) {
- locks_free_lock(fl);
+ locks_free_lease(fl);
return status;
}
BUG_ON(fl != NULL);
@@ -236,7 +250,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
NFSPROC4_CLNT_CB_LAYOUT);
- if (parent->sc_type == NFS4_DELEG_STID)
+ if (parent->sc_type == SC_TYPE_DELEG)
ls->ls_file = nfsd_file_get(fp->fi_deleg_file);
else
ls->ls_file = find_any_file(fp);
@@ -250,7 +264,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
}
spin_lock(&clp->cl_lock);
- stp->sc_type = NFS4_LAYOUT_STID;
+ stp->sc_type = SC_TYPE_LAYOUT;
list_add(&ls->ls_perclnt, &clp->cl_lo_states);
spin_unlock(&clp->cl_lock);
@@ -269,13 +283,13 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
{
struct nfs4_layout_stateid *ls;
struct nfs4_stid *stid;
- unsigned char typemask = NFS4_LAYOUT_STID;
+ unsigned short typemask = SC_TYPE_LAYOUT;
__be32 status;
if (create)
- typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
+ typemask |= (SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG);
- status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
+ status = nfsd4_lookup_stateid(cstate, stateid, typemask, 0, &stid,
net_generic(SVC_NET(rqstp), nfsd_net_id));
if (status)
goto out;
@@ -286,7 +300,7 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
goto out_put_stid;
}
- if (stid->sc_type != NFS4_LAYOUT_STID) {
+ if (stid->sc_type != SC_TYPE_LAYOUT) {
ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
nfs4_put_stid(stid);
@@ -518,7 +532,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
lrp->lrs_present = true;
} else {
trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid);
- nfs4_unhash_stid(&ls->ls_stid);
+ ls->ls_stid.sc_status |= SC_STATUS_CLOSED;
lrp->lrs_present = false;
}
spin_unlock(&ls->ls_lock);
@@ -605,7 +619,7 @@ nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
}
static void
-nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
+nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
{
struct nfs4_client *clp = ls->ls_stid.sc_client;
char addr_str[INET6_ADDRSTRLEN];
@@ -627,7 +641,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
argv[0] = (char *)nfsd_recall_failed;
argv[1] = addr_str;
- argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id;
+ argv[2] = file->nf_file->f_path.mnt->mnt_sb->s_id;
argv[3] = NULL;
error = call_usermodehelper(nfsd_recall_failed, argv, envp,
@@ -657,6 +671,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
struct nfsd_net *nn;
ktime_t now, cutoff;
const struct nfsd4_layout_ops *ops;
+ struct nfsd_file *fl;
trace_nfsd_cb_layout_done(&ls->ls_stid.sc_stateid, task);
switch (task->tk_status) {
@@ -688,12 +703,17 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
* Unknown error or non-responding client, we'll need to fence.
*/
trace_nfsd_layout_recall_fail(&ls->ls_stid.sc_stateid);
-
- ops = nfsd4_layout_ops[ls->ls_layout_type];
- if (ops->fence_client)
- ops->fence_client(ls);
- else
- nfsd4_cb_layout_fail(ls);
+ rcu_read_lock();
+ fl = nfsd_file_get(ls->ls_file);
+ rcu_read_unlock();
+ if (fl) {
+ ops = nfsd4_layout_ops[ls->ls_layout_type];
+ if (ops->fence_client)
+ ops->fence_client(ls, fl);
+ else
+ nfsd4_cb_layout_fail(ls, fl);
+ nfsd_file_put(fl);
+ }
return 1;
case -NFS4ERR_NOMATCHING_LAYOUT:
trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid);
@@ -723,7 +743,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
};
static bool
-nfsd4_layout_lm_break(struct file_lock *fl)
+nfsd4_layout_lm_break(struct file_lease *fl)
{
/*
* We don't want the locks code to timeout the lease for us;
@@ -731,19 +751,19 @@ nfsd4_layout_lm_break(struct file_lock *fl)
* in time:
*/
fl->fl_break_time = 0;
- nfsd4_recall_file_layout(fl->fl_owner);
+ nfsd4_recall_file_layout(fl->c.flc_owner);
return false;
}
static int
-nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
+nfsd4_layout_lm_change(struct file_lease *onlist, int arg,
struct list_head *dispose)
{
BUG_ON(!(arg & F_UNLCK));
return lease_modify(onlist, arg, dispose);
}
-static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
+static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
.lm_break = nfsd4_layout_lm_break,
.lm_change = nfsd4_layout_lm_change,
};
@@ -756,13 +776,11 @@ nfsd4_init_pnfs(void)
for (i = 0; i < DEVID_HASH_SIZE; i++)
INIT_LIST_HEAD(&nfsd_devid_hash[i]);
- nfs4_layout_cache = kmem_cache_create("nfs4_layout",
- sizeof(struct nfs4_layout), 0, 0, NULL);
+ nfs4_layout_cache = KMEM_CACHE(nfs4_layout, 0);
if (!nfs4_layout_cache)
return -ENOMEM;
- nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
- sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
+ nfs4_layout_stateid_cache = KMEM_CACHE(nfs4_layout_stateid, 0);
if (!nfs4_layout_stateid_cache) {
kmem_cache_destroy(nfs4_layout_cache);
return -ENOMEM;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 14712fa08f76..2927b1263f08 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1143,6 +1143,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
};
struct inode *inode;
__be32 status = nfs_ok;
+ bool save_no_wcc;
int err;
if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
@@ -1168,8 +1169,10 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
- status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs,
- 0, (time64_t)0);
+ save_no_wcc = cstate->current_fh.fh_no_wcc;
+ cstate->current_fh.fh_no_wcc = true;
+ status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs, NULL);
+ cstate->current_fh.fh_no_wcc = save_no_wcc;
if (!status)
status = nfserrno(attrs.na_labelerr);
if (!status)
@@ -2490,10 +2493,10 @@ nfsd4_proc_null(struct svc_rqst *rqstp)
return rpc_success;
}
-static inline void nfsd4_increment_op_stats(u32 opnum)
+static inline void nfsd4_increment_op_stats(struct nfsd_net *nn, u32 opnum)
{
if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP)
- percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_NFS4_OP(opnum)]);
+ percpu_counter_inc(&nn->counter[NFSD_STATS_NFS4_OP(opnum)]);
}
static const struct nfsd4_operation nfsd4_ops[];
@@ -2768,7 +2771,7 @@ encode_op:
status, nfsd4_op_name(op->opnum));
nfsd4_cstate_clear_replay(cstate);
- nfsd4_increment_op_stats(op->opnum);
+ nfsd4_increment_op_stats(nn, op->opnum);
}
fh_put(current_fh);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 7d6c657e0409..84d4093ca713 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -87,6 +87,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
void nfsd4_end_grace(struct nfsd_net *nn);
static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps);
static void nfsd4_file_hash_remove(struct nfs4_file *fi);
+static void deleg_reaper(struct nfsd_net *nn);
/* Locking: */
@@ -127,6 +128,7 @@ static void free_session(struct nfsd4_session *);
static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops;
static struct workqueue_struct *laundry_wq;
@@ -318,6 +320,7 @@ free_nbl(struct kref *kref)
struct nfsd4_blocked_lock *nbl;
nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref);
+ locks_release_private(&nbl->nbl_lock);
kfree(nbl);
}
@@ -325,7 +328,6 @@ static void
free_blocked_lock(struct nfsd4_blocked_lock *nbl)
{
locks_delete_block(&nbl->nbl_lock);
- locks_release_private(&nbl->nbl_lock);
kref_put(&nbl->nbl_kref, free_nbl);
}
@@ -1189,6 +1191,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
dp->dl_recalled = false;
nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
&nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
+ nfsd4_init_cb(&dp->dl_cb_fattr.ncf_getattr, dp->dl_stid.sc_client,
+ &nfsd4_cb_getattr_ops, NFSPROC4_CLNT_CB_GETATTR);
+ dp->dl_cb_fattr.ncf_file_modified = false;
+ dp->dl_cb_fattr.ncf_cb_bmap[0] = FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE;
get_nfs4_file(fp);
dp->dl_stid.sc_file = fp;
return dp;
@@ -1210,6 +1216,8 @@ nfs4_put_stid(struct nfs4_stid *s)
return;
}
idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
+ if (s->sc_status & SC_STATUS_ADMIN_REVOKED)
+ atomic_dec(&s->sc_client->cl_admin_revoked);
nfs4_free_cpntf_statelist(clp->net, s);
spin_unlock(&clp->cl_lock);
s->sc_free(s);
@@ -1249,7 +1257,7 @@ static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp)
WARN_ON_ONCE(!fp->fi_delegees);
- vfs_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp);
+ kernel_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp);
put_deleg_file(fp);
}
@@ -1260,11 +1268,6 @@ static void destroy_unhashed_deleg(struct nfs4_delegation *dp)
nfs4_put_stid(&dp->dl_stid);
}
-void nfs4_unhash_stid(struct nfs4_stid *s)
-{
- s->sc_type = 0;
-}
-
/**
* nfs4_delegation_exists - Discover if this delegation already exists
* @clp: a pointer to the nfs4_client we're granting a delegation to
@@ -1312,11 +1315,12 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
lockdep_assert_held(&state_lock);
lockdep_assert_held(&fp->fi_lock);
+ lockdep_assert_held(&clp->cl_lock);
if (nfs4_delegation_exists(clp, fp))
return -EAGAIN;
refcount_inc(&dp->dl_stid.sc_count);
- dp->dl_stid.sc_type = NFS4_DELEG_STID;
+ dp->dl_stid.sc_type = SC_TYPE_DELEG;
list_add(&dp->dl_perfile, &fp->fi_delegations);
list_add(&dp->dl_perclnt, &clp->cl_delegations);
return 0;
@@ -1328,7 +1332,7 @@ static bool delegation_hashed(struct nfs4_delegation *dp)
}
static bool
-unhash_delegation_locked(struct nfs4_delegation *dp)
+unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask)
{
struct nfs4_file *fp = dp->dl_stid.sc_file;
@@ -1337,7 +1341,13 @@ unhash_delegation_locked(struct nfs4_delegation *dp)
if (!delegation_hashed(dp))
return false;
- dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID;
+ if (statusmask == SC_STATUS_REVOKED &&
+ dp->dl_stid.sc_client->cl_minorversion == 0)
+ statusmask = SC_STATUS_CLOSED;
+ dp->dl_stid.sc_status |= statusmask;
+ if (statusmask & SC_STATUS_ADMIN_REVOKED)
+ atomic_inc(&dp->dl_stid.sc_client->cl_admin_revoked);
+
/* Ensure that deleg break won't try to requeue it */
++dp->dl_time;
spin_lock(&fp->fi_lock);
@@ -1353,7 +1363,7 @@ static void destroy_delegation(struct nfs4_delegation *dp)
bool unhashed;
spin_lock(&state_lock);
- unhashed = unhash_delegation_locked(dp);
+ unhashed = unhash_delegation_locked(dp, SC_STATUS_CLOSED);
spin_unlock(&state_lock);
if (unhashed)
destroy_unhashed_deleg(dp);
@@ -1367,9 +1377,9 @@ static void revoke_delegation(struct nfs4_delegation *dp)
trace_nfsd_stid_revoke(&dp->dl_stid);
- if (clp->cl_minorversion) {
+ if (dp->dl_stid.sc_status &
+ (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) {
spin_lock(&clp->cl_lock);
- dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
refcount_inc(&dp->dl_stid.sc_count);
list_add(&dp->dl_recall_lru, &clp->cl_revoked);
spin_unlock(&clp->cl_lock);
@@ -1377,8 +1387,8 @@ static void revoke_delegation(struct nfs4_delegation *dp)
destroy_unhashed_deleg(dp);
}
-/*
- * SETCLIENTID state
+/*
+ * SETCLIENTID state
*/
static unsigned int clientid_hashval(u32 id)
@@ -1531,6 +1541,8 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp,
}
idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
+ if (s->sc_status & SC_STATUS_ADMIN_REVOKED)
+ atomic_dec(&s->sc_client->cl_admin_revoked);
list_add(&stp->st_locks, reaplist);
}
@@ -1541,7 +1553,7 @@ static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp)
if (!unhash_ol_stateid(stp))
return false;
list_del_init(&stp->st_locks);
- nfs4_unhash_stid(&stp->st_stid);
+ stp->st_stid.sc_status |= SC_STATUS_CLOSED;
return true;
}
@@ -1599,7 +1611,7 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
while (!list_empty(&open_stp->st_locks)) {
stp = list_entry(open_stp->st_locks.next,
struct nfs4_ol_stateid, st_locks);
- WARN_ON(!unhash_lock_stateid(stp));
+ unhash_lock_stateid(stp);
put_ol_stateid_locked(stp, reaplist);
}
}
@@ -1620,6 +1632,7 @@ static void release_open_stateid(struct nfs4_ol_stateid *stp)
LIST_HEAD(reaplist);
spin_lock(&stp->st_stid.sc_client->cl_lock);
+ stp->st_stid.sc_status |= SC_STATUS_CLOSED;
if (unhash_open_stateid(stp, &reaplist))
put_ol_stateid_locked(stp, &reaplist);
spin_unlock(&stp->st_stid.sc_client->cl_lock);
@@ -1675,6 +1688,136 @@ static void release_openowner(struct nfs4_openowner *oo)
nfs4_put_stateowner(&oo->oo_owner);
}
+static struct nfs4_stid *find_one_sb_stid(struct nfs4_client *clp,
+ struct super_block *sb,
+ unsigned int sc_types)
+{
+ unsigned long id, tmp;
+ struct nfs4_stid *stid;
+
+ spin_lock(&clp->cl_lock);
+ idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id)
+ if ((stid->sc_type & sc_types) &&
+ stid->sc_status == 0 &&
+ stid->sc_file->fi_inode->i_sb == sb) {
+ refcount_inc(&stid->sc_count);
+ break;
+ }
+ spin_unlock(&clp->cl_lock);
+ return stid;
+}
+
+/**
+ * nfsd4_revoke_states - revoke all nfsv4 states associated with given filesystem
+ * @net: used to identify instance of nfsd (there is one per net namespace)
+ * @sb: super_block used to identify target filesystem
+ *
+ * All nfs4 states (open, lock, delegation, layout) held by the server instance
+ * and associated with a file on the given filesystem will be revoked resulting
+ * in any files being closed and so all references from nfsd to the filesystem
+ * being released. Thus nfsd will no longer prevent the filesystem from being
+ * unmounted.
+ *
+ * The clients which own the states will subsequently be notified that the
+ * states have been "admin-revoked".
+ */
+void nfsd4_revoke_states(struct net *net, struct super_block *sb)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ unsigned int idhashval;
+ unsigned int sc_types;
+
+ sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG | SC_TYPE_LAYOUT;
+
+ spin_lock(&nn->client_lock);
+ for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) {
+ struct list_head *head = &nn->conf_id_hashtbl[idhashval];
+ struct nfs4_client *clp;
+ retry:
+ list_for_each_entry(clp, head, cl_idhash) {
+ struct nfs4_stid *stid = find_one_sb_stid(clp, sb,
+ sc_types);
+ if (stid) {
+ struct nfs4_ol_stateid *stp;
+ struct nfs4_delegation *dp;
+ struct nfs4_layout_stateid *ls;
+
+ spin_unlock(&nn->client_lock);
+ switch (stid->sc_type) {
+ case SC_TYPE_OPEN:
+ stp = openlockstateid(stid);
+ mutex_lock_nested(&stp->st_mutex,
+ OPEN_STATEID_MUTEX);
+
+ spin_lock(&clp->cl_lock);
+ if (stid->sc_status == 0) {
+ stid->sc_status |=
+ SC_STATUS_ADMIN_REVOKED;
+ atomic_inc(&clp->cl_admin_revoked);
+ spin_unlock(&clp->cl_lock);
+ release_all_access(stp);
+ } else
+ spin_unlock(&clp->cl_lock);
+ mutex_unlock(&stp->st_mutex);
+ break;
+ case SC_TYPE_LOCK:
+ stp = openlockstateid(stid);
+ mutex_lock_nested(&stp->st_mutex,
+ LOCK_STATEID_MUTEX);
+ spin_lock(&clp->cl_lock);
+ if (stid->sc_status == 0) {
+ struct nfs4_lockowner *lo =
+ lockowner(stp->st_stateowner);
+ struct nfsd_file *nf;
+
+ stid->sc_status |=
+ SC_STATUS_ADMIN_REVOKED;
+ atomic_inc(&clp->cl_admin_revoked);
+ spin_unlock(&clp->cl_lock);
+ nf = find_any_file(stp->st_stid.sc_file);
+ if (nf) {
+ get_file(nf->nf_file);
+ filp_close(nf->nf_file,
+ (fl_owner_t)lo);
+ nfsd_file_put(nf);
+ }
+ release_all_access(stp);
+ } else
+ spin_unlock(&clp->cl_lock);
+ mutex_unlock(&stp->st_mutex);
+ break;
+ case SC_TYPE_DELEG:
+ dp = delegstateid(stid);
+ spin_lock(&state_lock);
+ if (!unhash_delegation_locked(
+ dp, SC_STATUS_ADMIN_REVOKED))
+ dp = NULL;
+ spin_unlock(&state_lock);
+ if (dp)
+ revoke_delegation(dp);
+ break;
+ case SC_TYPE_LAYOUT:
+ ls = layoutstateid(stid);
+ nfsd4_close_layout(ls);
+ break;
+ }
+ nfs4_put_stid(stid);
+ spin_lock(&nn->client_lock);
+ if (clp->cl_minorversion == 0)
+ /* Allow cleanup after a lease period.
+ * store_release ensures cleanup will
+ * see any newly revoked states if it
+ * sees the time updated.
+ */
+ nn->nfs40_last_revoke =
+ ktime_get_boottime_seconds();
+ goto retry;
+ }
+ }
+ }
+ spin_unlock(&nn->client_lock);
+}
+
static inline int
hash_sessionid(struct nfs4_sessionid *sessionid)
{
@@ -2228,7 +2371,7 @@ __destroy_client(struct nfs4_client *clp)
spin_lock(&state_lock);
while (!list_empty(&clp->cl_delegations)) {
dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
- WARN_ON(!unhash_delegation_locked(dp));
+ unhash_delegation_locked(dp, SC_STATUS_CLOSED);
list_add(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&state_lock);
@@ -2460,14 +2603,16 @@ find_stateid_locked(struct nfs4_client *cl, stateid_t *t)
}
static struct nfs4_stid *
-find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
+find_stateid_by_type(struct nfs4_client *cl, stateid_t *t,
+ unsigned short typemask, unsigned short ok_states)
{
struct nfs4_stid *s;
spin_lock(&cl->cl_lock);
s = find_stateid_locked(cl, t);
if (s != NULL) {
- if (typemask & s->sc_type)
+ if ((s->sc_status & ~ok_states) == 0 &&
+ (typemask & s->sc_type))
refcount_inc(&s->sc_count);
else
s = NULL;
@@ -2487,9 +2632,9 @@ static struct nfs4_client *get_nfsdfs_clp(struct inode *inode)
static void seq_quote_mem(struct seq_file *m, char *data, int len)
{
- seq_printf(m, "\"");
+ seq_puts(m, "\"");
seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\");
- seq_printf(m, "\"");
+ seq_puts(m, "\"");
}
static const char *cb_state2str(int state)
@@ -2530,20 +2675,22 @@ static int client_info_show(struct seq_file *m, void *v)
seq_puts(m, "status: unconfirmed\n");
seq_printf(m, "seconds from last renew: %lld\n",
ktime_get_boottime_seconds() - clp->cl_time);
- seq_printf(m, "name: ");
+ seq_puts(m, "name: ");
seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len);
seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion);
if (clp->cl_nii_domain.data) {
- seq_printf(m, "Implementation domain: ");
+ seq_puts(m, "Implementation domain: ");
seq_quote_mem(m, clp->cl_nii_domain.data,
clp->cl_nii_domain.len);
- seq_printf(m, "\nImplementation name: ");
+ seq_puts(m, "\nImplementation name: ");
seq_quote_mem(m, clp->cl_nii_name.data, clp->cl_nii_name.len);
seq_printf(m, "\nImplementation time: [%lld, %ld]\n",
clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec);
}
seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state));
seq_printf(m, "callback address: %pISpc\n", &clp->cl_cb_conn.cb_addr);
+ seq_printf(m, "admin-revoked states: %d\n",
+ atomic_read(&clp->cl_admin_revoked));
drop_client(clp);
return 0;
@@ -2602,7 +2749,7 @@ static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f)
static void nfs4_show_owner(struct seq_file *s, struct nfs4_stateowner *oo)
{
- seq_printf(s, "owner: ");
+ seq_puts(s, "owner: ");
seq_quote_mem(s, oo->so_owner.data, oo->so_owner.len);
}
@@ -2620,20 +2767,13 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
struct nfs4_stateowner *oo;
unsigned int access, deny;
- if (st->sc_type != NFS4_OPEN_STID && st->sc_type != NFS4_LOCK_STID)
- return 0; /* XXX: or SEQ_SKIP? */
ols = openlockstateid(st);
oo = ols->st_stateowner;
nf = st->sc_file;
- spin_lock(&nf->fi_lock);
- file = find_any_file_locked(nf);
- if (!file)
- goto out;
-
- seq_printf(s, "- ");
+ seq_puts(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
- seq_printf(s, ": { type: open, ");
+ seq_puts(s, ": { type: open, ");
access = bmap_to_share_mode(ols->st_access_bmap);
deny = bmap_to_share_mode(ols->st_deny_bmap);
@@ -2645,14 +2785,19 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
deny & NFS4_SHARE_ACCESS_READ ? "r" : "-",
deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-");
- nfs4_show_superblock(s, file);
- seq_printf(s, ", ");
- nfs4_show_fname(s, file);
- seq_printf(s, ", ");
- nfs4_show_owner(s, oo);
- seq_printf(s, " }\n");
-out:
+ spin_lock(&nf->fi_lock);
+ file = find_any_file_locked(nf);
+ if (file) {
+ nfs4_show_superblock(s, file);
+ seq_puts(s, ", ");
+ nfs4_show_fname(s, file);
+ seq_puts(s, ", ");
+ }
spin_unlock(&nf->fi_lock);
+ nfs4_show_owner(s, oo);
+ if (st->sc_status & SC_STATUS_ADMIN_REVOKED)
+ seq_puts(s, ", admin-revoked");
+ seq_puts(s, " }\n");
return 0;
}
@@ -2666,30 +2811,31 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
ols = openlockstateid(st);
oo = ols->st_stateowner;
nf = st->sc_file;
- spin_lock(&nf->fi_lock);
- file = find_any_file_locked(nf);
- if (!file)
- goto out;
- seq_printf(s, "- ");
+ seq_puts(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
- seq_printf(s, ": { type: lock, ");
+ seq_puts(s, ": { type: lock, ");
- /*
- * Note: a lock stateid isn't really the same thing as a lock,
- * it's the locking state held by one owner on a file, and there
- * may be multiple (or no) lock ranges associated with it.
- * (Same for the matter is true of open stateids.)
- */
+ spin_lock(&nf->fi_lock);
+ file = find_any_file_locked(nf);
+ if (file) {
+ /*
+ * Note: a lock stateid isn't really the same thing as a lock,
+ * it's the locking state held by one owner on a file, and there
+ * may be multiple (or no) lock ranges associated with it.
+ * (For that matter, the same is true of open stateids.)
+ */
- nfs4_show_superblock(s, file);
- /* XXX: open stateid? */
- seq_printf(s, ", ");
- nfs4_show_fname(s, file);
- seq_printf(s, ", ");
+ nfs4_show_superblock(s, file);
+ /* XXX: open stateid? */
+ seq_puts(s, ", ");
+ nfs4_show_fname(s, file);
+ seq_puts(s, ", ");
+ }
nfs4_show_owner(s, oo);
- seq_printf(s, " }\n");
-out:
+ if (st->sc_status & SC_STATUS_ADMIN_REVOKED)
+ seq_puts(s, ", admin-revoked");
+ seq_puts(s, " }\n");
spin_unlock(&nf->fi_lock);
return 0;
}
@@ -2702,27 +2848,28 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
ds = delegstateid(st);
nf = st->sc_file;
- spin_lock(&nf->fi_lock);
- file = nf->fi_deleg_file;
- if (!file)
- goto out;
- seq_printf(s, "- ");
+ seq_puts(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
- seq_printf(s, ": { type: deleg, ");
+ seq_puts(s, ": { type: deleg, ");
- /* Kinda dead code as long as we only support read delegs: */
- seq_printf(s, "access: %s, ",
- ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w");
+ seq_printf(s, "access: %s",
+ ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w");
/* XXX: lease time, whether it's being recalled. */
- nfs4_show_superblock(s, file);
- seq_printf(s, ", ");
- nfs4_show_fname(s, file);
- seq_printf(s, " }\n");
-out:
+ spin_lock(&nf->fi_lock);
+ file = nf->fi_deleg_file;
+ if (file) {
+ seq_puts(s, ", ");
+ nfs4_show_superblock(s, file);
+ seq_puts(s, ", ");
+ nfs4_show_fname(s, file);
+ }
spin_unlock(&nf->fi_lock);
+ if (st->sc_status & SC_STATUS_ADMIN_REVOKED)
+ seq_puts(s, ", admin-revoked");
+ seq_puts(s, " }\n");
return 0;
}
@@ -2732,18 +2879,25 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st)
struct nfsd_file *file;
ls = container_of(st, struct nfs4_layout_stateid, ls_stid);
- file = ls->ls_file;
- seq_printf(s, "- ");
+ seq_puts(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
- seq_printf(s, ": { type: layout, ");
+ seq_puts(s, ": { type: layout");
/* XXX: What else would be useful? */
- nfs4_show_superblock(s, file);
- seq_printf(s, ", ");
- nfs4_show_fname(s, file);
- seq_printf(s, " }\n");
+ spin_lock(&ls->ls_stid.sc_file->fi_lock);
+ file = ls->ls_file;
+ if (file) {
+ seq_puts(s, ", ");
+ nfs4_show_superblock(s, file);
+ seq_puts(s, ", ");
+ nfs4_show_fname(s, file);
+ }
+ spin_unlock(&ls->ls_stid.sc_file->fi_lock);
+ if (st->sc_status & SC_STATUS_ADMIN_REVOKED)
+ seq_puts(s, ", admin-revoked");
+ seq_puts(s, " }\n");
return 0;
}
@@ -2753,13 +2907,13 @@ static int states_show(struct seq_file *s, void *v)
struct nfs4_stid *st = v;
switch (st->sc_type) {
- case NFS4_OPEN_STID:
+ case SC_TYPE_OPEN:
return nfs4_show_open(s, st);
- case NFS4_LOCK_STID:
+ case SC_TYPE_LOCK:
return nfs4_show_lock(s, st);
- case NFS4_DELEG_STID:
+ case SC_TYPE_DELEG:
return nfs4_show_deleg(s, st);
- case NFS4_LAYOUT_STID:
+ case SC_TYPE_LAYOUT:
return nfs4_show_layout(s, st);
default:
return 0; /* XXX: or SEQ_SKIP? */
@@ -2888,12 +3042,38 @@ static void
nfsd4_cb_recall_any_release(struct nfsd4_callback *cb)
{
struct nfs4_client *clp = cb->cb_clp;
- struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
- spin_lock(&nn->client_lock);
clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
- put_client_renew_locked(clp);
- spin_unlock(&nn->client_lock);
+ drop_client(clp);
+}
+
+static int
+nfsd4_cb_getattr_done(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+ struct nfs4_cb_fattr *ncf =
+ container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+
+ ncf->ncf_cb_status = task->tk_status;
+ switch (task->tk_status) {
+ case -NFS4ERR_DELAY:
+ rpc_delay(task, 2 * HZ);
+ return 0;
+ default:
+ return 1;
+ }
+}
+
+static void
+nfsd4_cb_getattr_release(struct nfsd4_callback *cb)
+{
+ struct nfs4_cb_fattr *ncf =
+ container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+ struct nfs4_delegation *dp =
+ container_of(ncf, struct nfs4_delegation, dl_cb_fattr);
+
+ nfs4_put_stid(&dp->dl_stid);
+ clear_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags);
+ wake_up_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY);
}
static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = {
@@ -2901,6 +3081,25 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = {
.release = nfsd4_cb_recall_any_release,
};
+static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops = {
+ .done = nfsd4_cb_getattr_done,
+ .release = nfsd4_cb_getattr_release,
+};
+
+static void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf)
+{
+ struct nfs4_delegation *dp =
+ container_of(ncf, struct nfs4_delegation, dl_cb_fattr);
+
+ if (test_and_set_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags))
+ return;
+ /* set to proper status when nfsd4_cb_getattr_done runs */
+ ncf->ncf_cb_status = NFS4ERR_IO;
+
+ refcount_inc(&dp->dl_stid.sc_count);
+ nfsd4_run_cb(&ncf->ncf_getattr);
+}
+
static struct nfs4_client *create_client(struct xdr_netobj name,
struct svc_rqst *rqstp, nfs4_verifier *verf)
{
@@ -3414,6 +3613,9 @@ out_new:
new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0];
new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1];
+ /* Contrived initial CREATE_SESSION response */
+ new->cl_cs_slot.sl_status = nfserr_seq_misordered;
+
add_to_unconfirmed(new);
swap(new, conf);
out_copy:
@@ -3584,10 +3786,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
struct nfsd4_create_session *cr_ses = &u->create_session;
struct sockaddr *sa = svc_addr(rqstp);
struct nfs4_client *conf, *unconf;
+ struct nfsd4_clid_slot *cs_slot;
struct nfs4_client *old = NULL;
struct nfsd4_session *new;
struct nfsd4_conn *conn;
- struct nfsd4_clid_slot *cs_slot = NULL;
__be32 status = 0;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
@@ -3611,53 +3813,63 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out_free_session;
spin_lock(&nn->client_lock);
+
+ /* RFC 8881 Section 18.36.4 Phase 1: Client record look-up. */
unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
conf = find_confirmed_client(&cr_ses->clientid, true, nn);
- WARN_ON_ONCE(conf && unconf);
+ if (!conf && !unconf) {
+ status = nfserr_stale_clientid;
+ goto out_free_conn;
+ }
+ /* RFC 8881 Section 18.36.4 Phase 2: Sequence ID processing. */
+ if (conf)
+ cs_slot = &conf->cl_cs_slot;
+ else
+ cs_slot = &unconf->cl_cs_slot;
+ status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
+ switch (status) {
+ case nfs_ok:
+ cs_slot->sl_seqid++;
+ cr_ses->seqid = cs_slot->sl_seqid;
+ break;
+ case nfserr_replay_cache:
+ status = nfsd4_replay_create_session(cr_ses, cs_slot);
+ fallthrough;
+ case nfserr_jukebox:
+ /* The server MUST NOT cache NFS4ERR_DELAY */
+ goto out_free_conn;
+ default:
+ goto out_cache_error;
+ }
+
+ /* RFC 8881 Section 18.36.4 Phase 3: Client ID confirmation. */
if (conf) {
status = nfserr_wrong_cred;
if (!nfsd4_mach_creds_match(conf, rqstp))
- goto out_free_conn;
- cs_slot = &conf->cl_cs_slot;
- status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
- if (status) {
- if (status == nfserr_replay_cache)
- status = nfsd4_replay_create_session(cr_ses, cs_slot);
- goto out_free_conn;
- }
- } else if (unconf) {
+ goto out_cache_error;
+ } else {
status = nfserr_clid_inuse;
if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
!rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
trace_nfsd_clid_cred_mismatch(unconf, rqstp);
- goto out_free_conn;
+ goto out_cache_error;
}
status = nfserr_wrong_cred;
if (!nfsd4_mach_creds_match(unconf, rqstp))
- goto out_free_conn;
- cs_slot = &unconf->cl_cs_slot;
- status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
- if (status) {
- /* an unconfirmed replay returns misordered */
- status = nfserr_seq_misordered;
- goto out_free_conn;
- }
+ goto out_cache_error;
old = find_confirmed_client_by_name(&unconf->cl_name, nn);
if (old) {
status = mark_client_expired_locked(old);
- if (status) {
- old = NULL;
- goto out_free_conn;
- }
+ if (status)
+ goto out_expired_error;
trace_nfsd_clid_replaced(&old->cl_clientid);
}
move_to_confirmed(unconf);
conf = unconf;
- } else {
- status = nfserr_stale_clientid;
- goto out_free_conn;
}
+
+ /* RFC 8881 Section 18.36.4 Phase 4: Session creation. */
status = nfs_ok;
/* Persistent sessions are not supported */
cr_ses->flags &= ~SESSION4_PERSIST;
@@ -3669,8 +3881,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
NFS4_MAX_SESSIONID_LEN);
- cs_slot->sl_seqid++;
- cr_ses->seqid = cs_slot->sl_seqid;
/* cache solo and embedded create sessions under the client_lock */
nfsd4_cache_create_session(cr_ses, cs_slot, status);
@@ -3683,6 +3893,20 @@ nfsd4_create_session(struct svc_rqst *rqstp,
if (old)
expire_client(old);
return status;
+
+out_expired_error:
+ old = NULL;
+ /*
+ * Revert the slot seq_nr change so the server will process
+ * the client's resend instead of returning a cached response.
+ */
+ if (status == nfserr_jukebox) {
+ cs_slot->sl_seqid--;
+ cr_ses->seqid = cs_slot->sl_seqid;
+ goto out_free_conn;
+ }
+out_cache_error:
+ nfsd4_cache_create_session(cr_ses, cs_slot, status);
out_free_conn:
spin_unlock(&nn->client_lock);
free_conn(conn);
@@ -4058,6 +4282,9 @@ out:
}
if (!list_empty(&clp->cl_revoked))
seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
+ if (atomic_read(&clp->cl_admin_revoked))
+ seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED;
+ trace_nfsd_seq4_status(rqstp, seq);
out_no_session:
if (conn)
free_conn(conn);
@@ -4352,32 +4579,25 @@ nfsd4_free_slabs(void)
int
nfsd4_init_slabs(void)
{
- client_slab = kmem_cache_create("nfsd4_clients",
- sizeof(struct nfs4_client), 0, 0, NULL);
+ client_slab = KMEM_CACHE(nfs4_client, 0);
if (client_slab == NULL)
goto out;
- openowner_slab = kmem_cache_create("nfsd4_openowners",
- sizeof(struct nfs4_openowner), 0, 0, NULL);
+ openowner_slab = KMEM_CACHE(nfs4_openowner, 0);
if (openowner_slab == NULL)
goto out_free_client_slab;
- lockowner_slab = kmem_cache_create("nfsd4_lockowners",
- sizeof(struct nfs4_lockowner), 0, 0, NULL);
+ lockowner_slab = KMEM_CACHE(nfs4_lockowner, 0);
if (lockowner_slab == NULL)
goto out_free_openowner_slab;
- file_slab = kmem_cache_create("nfsd4_files",
- sizeof(struct nfs4_file), 0, 0, NULL);
+ file_slab = KMEM_CACHE(nfs4_file, 0);
if (file_slab == NULL)
goto out_free_lockowner_slab;
- stateid_slab = kmem_cache_create("nfsd4_stateids",
- sizeof(struct nfs4_ol_stateid), 0, 0, NULL);
+ stateid_slab = KMEM_CACHE(nfs4_ol_stateid, 0);
if (stateid_slab == NULL)
goto out_free_file_slab;
- deleg_slab = kmem_cache_create("nfsd4_delegations",
- sizeof(struct nfs4_delegation), 0, 0, NULL);
+ deleg_slab = KMEM_CACHE(nfs4_delegation, 0);
if (deleg_slab == NULL)
goto out_free_stateid_slab;
- odstate_slab = kmem_cache_create("nfsd4_odstate",
- sizeof(struct nfs4_clnt_odstate), 0, 0, NULL);
+ odstate_slab = KMEM_CACHE(nfs4_clnt_odstate, 0);
if (odstate_slab == NULL)
goto out_free_deleg_slab;
return 0;
@@ -4531,7 +4751,8 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
continue;
if (local->st_stateowner != &oo->oo_owner)
continue;
- if (local->st_stid.sc_type == NFS4_OPEN_STID) {
+ if (local->st_stid.sc_type == SC_TYPE_OPEN &&
+ !local->st_stid.sc_status) {
ret = local;
refcount_inc(&ret->st_stid.sc_count);
break;
@@ -4540,22 +4761,75 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
return ret;
}
-static __be32
-nfsd4_verify_open_stid(struct nfs4_stid *s)
+static void nfsd4_drop_revoked_stid(struct nfs4_stid *s)
+ __releases(&s->sc_client->cl_lock)
{
- __be32 ret = nfs_ok;
+ struct nfs4_client *cl = s->sc_client;
+ LIST_HEAD(reaplist);
+ struct nfs4_ol_stateid *stp;
+ struct nfs4_delegation *dp;
+ bool unhashed;
switch (s->sc_type) {
- default:
+ case SC_TYPE_OPEN:
+ stp = openlockstateid(s);
+ if (unhash_open_stateid(stp, &reaplist))
+ put_ol_stateid_locked(stp, &reaplist);
+ spin_unlock(&cl->cl_lock);
+ free_ol_stateid_reaplist(&reaplist);
break;
- case 0:
- case NFS4_CLOSED_STID:
- case NFS4_CLOSED_DELEG_STID:
- ret = nfserr_bad_stateid;
+ case SC_TYPE_LOCK:
+ stp = openlockstateid(s);
+ unhashed = unhash_lock_stateid(stp);
+ spin_unlock(&cl->cl_lock);
+ if (unhashed)
+ nfs4_put_stid(s);
break;
- case NFS4_REVOKED_DELEG_STID:
- ret = nfserr_deleg_revoked;
+ case SC_TYPE_DELEG:
+ dp = delegstateid(s);
+ list_del_init(&dp->dl_recall_lru);
+ spin_unlock(&cl->cl_lock);
+ nfs4_put_stid(s);
+ break;
+ default:
+ spin_unlock(&cl->cl_lock);
+ }
+}
+
+static void nfsd40_drop_revoked_stid(struct nfs4_client *cl,
+ stateid_t *stid)
+{
+ /* NFSv4.0 has no way for the client to tell the server
+ * that it can forget an admin-revoked stateid.
+ * So we keep it around until the first time that the
+ * client uses it, and drop it the first time
+ * nfserr_admin_revoked is returned.
+ * For v4.1 and later we wait until explicitly told
+ * to free the stateid.
+ */
+ if (cl->cl_minorversion == 0) {
+ struct nfs4_stid *st;
+
+ spin_lock(&cl->cl_lock);
+ st = find_stateid_locked(cl, stid);
+ if (st)
+ nfsd4_drop_revoked_stid(st);
+ else
+ spin_unlock(&cl->cl_lock);
}
+}
+
+static __be32
+nfsd4_verify_open_stid(struct nfs4_stid *s)
+{
+ __be32 ret = nfs_ok;
+
+ if (s->sc_status & SC_STATUS_ADMIN_REVOKED)
+ ret = nfserr_admin_revoked;
+ else if (s->sc_status & SC_STATUS_REVOKED)
+ ret = nfserr_deleg_revoked;
+ else if (s->sc_status & SC_STATUS_CLOSED)
+ ret = nfserr_bad_stateid;
return ret;
}
@@ -4567,6 +4841,10 @@ nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp)
mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX);
ret = nfsd4_verify_open_stid(&stp->st_stid);
+ if (ret == nfserr_admin_revoked)
+ nfsd40_drop_revoked_stid(stp->st_stid.sc_client,
+ &stp->st_stid.sc_stateid);
+
if (ret != nfs_ok)
mutex_unlock(&stp->st_mutex);
return ret;
@@ -4641,7 +4919,7 @@ retry:
open->op_stp = NULL;
refcount_inc(&stp->st_stid.sc_count);
- stp->st_stid.sc_type = NFS4_OPEN_STID;
+ stp->st_stid.sc_type = SC_TYPE_OPEN;
INIT_LIST_HEAD(&stp->st_locks);
stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner);
get_nfs4_file(fp);
@@ -4868,9 +5146,9 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task);
- if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID ||
- dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID)
- return 1;
+ if (dp->dl_stid.sc_status)
+ /* CLOSED or REVOKED */
+ return 1;
switch (task->tk_status) {
case 0:
@@ -4922,9 +5200,9 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
/* Called from break_lease() with flc_lock held. */
static bool
-nfsd_break_deleg_cb(struct file_lock *fl)
+nfsd_break_deleg_cb(struct file_lease *fl)
{
- struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
+ struct nfs4_delegation *dp = (struct nfs4_delegation *) fl->c.flc_owner;
struct nfs4_file *fp = dp->dl_stid.sc_file;
struct nfs4_client *clp = dp->dl_stid.sc_client;
struct nfsd_net *nn;
@@ -4958,9 +5236,9 @@ nfsd_break_deleg_cb(struct file_lock *fl)
* %true: Lease conflict was resolved
* %false: Lease conflict was not resolved.
*/
-static bool nfsd_breaker_owns_lease(struct file_lock *fl)
+static bool nfsd_breaker_owns_lease(struct file_lease *fl)
{
- struct nfs4_delegation *dl = fl->fl_owner;
+ struct nfs4_delegation *dl = fl->c.flc_owner;
struct svc_rqst *rqst;
struct nfs4_client *clp;
@@ -4975,10 +5253,10 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl)
}
static int
-nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
+nfsd_change_deleg_cb(struct file_lease *onlist, int arg,
struct list_head *dispose)
{
- struct nfs4_delegation *dp = (struct nfs4_delegation *)onlist->fl_owner;
+ struct nfs4_delegation *dp = (struct nfs4_delegation *) onlist->c.flc_owner;
struct nfs4_client *clp = dp->dl_stid.sc_client;
if (arg & F_UNLCK) {
@@ -4989,7 +5267,7 @@ nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
return -EAGAIN;
}
-static const struct lock_manager_operations nfsd_lease_mng_ops = {
+static const struct lease_manager_operations nfsd_lease_mng_ops = {
.lm_breaker_owns_lease = nfsd_breaker_owns_lease,
.lm_break = nfsd_break_deleg_cb,
.lm_change = nfsd_change_deleg_cb,
@@ -5113,12 +5391,12 @@ static int share_access_to_flags(u32 share_access)
return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
}
-static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s)
+static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl,
+ stateid_t *s)
{
struct nfs4_stid *ret;
- ret = find_stateid_by_type(cl, s,
- NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID);
+ ret = find_stateid_by_type(cl, s, SC_TYPE_DELEG, SC_STATUS_REVOKED);
if (!ret)
return NULL;
return delegstateid(ret);
@@ -5141,10 +5419,15 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
deleg = find_deleg_stateid(cl, &open->op_delegate_stateid);
if (deleg == NULL)
goto out;
- if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) {
+ if (deleg->dl_stid.sc_status & SC_STATUS_ADMIN_REVOKED) {
nfs4_put_stid(&deleg->dl_stid);
- if (cl->cl_minorversion)
- status = nfserr_deleg_revoked;
+ status = nfserr_admin_revoked;
+ goto out;
+ }
+ if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) {
+ nfs4_put_stid(&deleg->dl_stid);
+ nfsd40_drop_revoked_stid(cl, &open->op_delegate_stateid);
+ status = nfserr_deleg_revoked;
goto out;
}
flags = share_access_to_flags(open->op_share_access);
@@ -5189,7 +5472,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
return 0;
if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
return nfserr_inval;
- return nfsd_setattr(rqstp, fh, &attrs, 0, (time64_t)0);
+ return nfsd_setattr(rqstp, fh, &attrs, NULL);
}
static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
@@ -5329,21 +5612,20 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
}
-static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
+static struct file_lease *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
int flag)
{
- struct file_lock *fl;
+ struct file_lease *fl;
- fl = locks_alloc_lock();
+ fl = locks_alloc_lease();
if (!fl)
return NULL;
fl->fl_lmops = &nfsd_lease_mng_ops;
- fl->fl_flags = FL_DELEG;
- fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
- fl->fl_end = OFFSET_MAX;
- fl->fl_owner = (fl_owner_t)dp;
- fl->fl_pid = current->tgid;
- fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file;
+ fl->c.flc_flags = FL_DELEG;
+ fl->c.flc_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
+ fl->c.flc_owner = (fl_owner_t)dp;
+ fl->c.flc_pid = current->tgid;
+ fl->c.flc_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file;
return fl;
}
@@ -5461,7 +5743,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate;
struct nfs4_delegation *dp;
struct nfsd_file *nf = NULL;
- struct file_lock *fl;
+ struct file_lease *fl;
u32 dl_type;
/*
@@ -5531,9 +5813,10 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
if (!fl)
goto out_clnt_odstate;
- status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL);
+ status = kernel_setlease(fp->fi_deleg_file->nf_file,
+ fl->c.flc_type, &fl, NULL);
if (fl)
- locks_free_lock(fl);
+ locks_free_lease(fl);
if (status)
goto out_clnt_odstate;
@@ -5560,9 +5843,11 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
goto out_unlock;
spin_lock(&state_lock);
+ spin_lock(&clp->cl_lock);
spin_lock(&fp->fi_lock);
status = hash_delegation_locked(dp, fp);
spin_unlock(&fp->fi_lock);
+ spin_unlock(&clp->cl_lock);
spin_unlock(&state_lock);
if (status)
@@ -5570,7 +5855,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
return dp;
out_unlock:
- vfs_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp);
+ kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp);
out_clnt_odstate:
put_clnt_odstate(dp->dl_clnt_odstate);
nfs4_put_stid(&dp->dl_stid);
@@ -5634,6 +5919,8 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
struct svc_fh *parent = NULL;
int cb_up;
int status = 0;
+ struct kstat stat;
+ struct path path;
cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
open->op_recall = false;
@@ -5671,6 +5958,18 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) {
open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE;
trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid);
+ path.mnt = currentfh->fh_export->ex_path.mnt;
+ path.dentry = currentfh->fh_dentry;
+ if (vfs_getattr(&path, &stat,
+ (STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE),
+ AT_STATX_SYNC_AS_STAT)) {
+ nfs4_put_stid(&dp->dl_stid);
+ destroy_delegation(dp);
+ goto out_no_deleg;
+ }
+ dp->dl_cb_fattr.ncf_cur_fsize = stat.size;
+ dp->dl_cb_fattr.ncf_initial_cinfo =
+ nfsd4_change_attribute(&stat, d_inode(currentfh->fh_dentry));
} else {
open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid);
@@ -5774,7 +6073,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
} else {
status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true);
if (status) {
- stp->st_stid.sc_type = NFS4_CLOSED_STID;
release_open_stateid(stp);
mutex_unlock(&stp->st_mutex);
goto out;
@@ -6128,6 +6426,43 @@ nfs4_process_client_reaplist(struct list_head *reaplist)
}
}
+static void nfs40_clean_admin_revoked(struct nfsd_net *nn,
+ struct laundry_time *lt)
+{
+ struct nfs4_client *clp;
+
+ spin_lock(&nn->client_lock);
+ if (nn->nfs40_last_revoke == 0 ||
+ nn->nfs40_last_revoke > lt->cutoff) {
+ spin_unlock(&nn->client_lock);
+ return;
+ }
+ nn->nfs40_last_revoke = 0;
+
+retry:
+ list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+ unsigned long id, tmp;
+ struct nfs4_stid *stid;
+
+ if (atomic_read(&clp->cl_admin_revoked) == 0)
+ continue;
+
+ spin_lock(&clp->cl_lock);
+ idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id)
+ if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) {
+ refcount_inc(&stid->sc_count);
+ spin_unlock(&nn->client_lock);
+ /* this function drops ->cl_lock */
+ nfsd4_drop_revoked_stid(stid);
+ nfs4_put_stid(stid);
+ spin_lock(&nn->client_lock);
+ goto retry;
+ }
+ spin_unlock(&clp->cl_lock);
+ }
+ spin_unlock(&nn->client_lock);
+}
+
static time64_t
nfs4_laundromat(struct nfsd_net *nn)
{
@@ -6161,12 +6496,14 @@ nfs4_laundromat(struct nfsd_net *nn)
nfs4_get_client_reaplist(nn, &reaplist, &lt);
nfs4_process_client_reaplist(&reaplist);
+ nfs40_clean_admin_revoked(nn, &lt);
+
spin_lock(&state_lock);
list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
if (!state_expired(&lt, dp->dl_time))
break;
- WARN_ON(!unhash_delegation_locked(dp));
+ unhash_delegation_locked(dp, SC_STATUS_REVOKED);
list_add(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&state_lock);
@@ -6225,6 +6562,8 @@ nfs4_laundromat(struct nfsd_net *nn)
/* service the server-to-server copy delayed unmount list */
nfsd4_ssc_expire_umount(nn);
#endif
+ if (atomic_long_read(&num_delegations) >= max_delegations)
+ deleg_reaper(nn);
out:
return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
}
@@ -6274,7 +6613,7 @@ deleg_reaper(struct nfsd_net *nn)
list_add(&clp->cl_ra_cblist, &cblist);
/* release in nfsd4_cb_recall_any_release */
- atomic_inc(&clp->cl_rpc_users);
+ kref_get(&clp->cl_nfsdfs.cl_ref);
set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
clp->cl_ra_time = ktime_get_boottime_seconds();
}
@@ -6286,6 +6625,8 @@ deleg_reaper(struct nfsd_net *nn)
list_del_init(&clp->cl_ra_cblist);
clp->cl_ra->ra_keep = 0;
clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG);
+ clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) |
+ BIT(RCA4_TYPE_MASK_WDATA_DLG);
trace_nfsd_cb_recall_any(clp->cl_ra);
nfsd4_run_cb(&clp->cl_ra->ra_cb);
}
@@ -6379,6 +6720,9 @@ static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_sti
if (ret == nfs_ok)
ret = check_stateid_generation(in, &s->sc_stateid, has_session);
spin_unlock(&s->sc_lock);
+ if (ret == nfserr_admin_revoked)
+ nfsd40_drop_revoked_stid(s->sc_client,
+ &s->sc_stateid);
return ret;
}
@@ -6405,32 +6749,33 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
status = nfsd4_stid_check_stateid_generation(stateid, s, 1);
if (status)
goto out_unlock;
+ status = nfsd4_verify_open_stid(s);
+ if (status)
+ goto out_unlock;
+
switch (s->sc_type) {
- case NFS4_DELEG_STID:
+ case SC_TYPE_DELEG:
status = nfs_ok;
break;
- case NFS4_REVOKED_DELEG_STID:
- status = nfserr_deleg_revoked;
- break;
- case NFS4_OPEN_STID:
- case NFS4_LOCK_STID:
+ case SC_TYPE_OPEN:
+ case SC_TYPE_LOCK:
status = nfsd4_check_openowner_confirmed(openlockstateid(s));
break;
default:
printk("unknown stateid type %x\n", s->sc_type);
- fallthrough;
- case NFS4_CLOSED_STID:
- case NFS4_CLOSED_DELEG_STID:
status = nfserr_bad_stateid;
}
out_unlock:
spin_unlock(&cl->cl_lock);
+ if (status == nfserr_admin_revoked)
+ nfsd40_drop_revoked_stid(cl, stateid);
return status;
}
__be32
nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
- stateid_t *stateid, unsigned char typemask,
+ stateid_t *stateid,
+ unsigned short typemask, unsigned short statusmask,
struct nfs4_stid **s, struct nfsd_net *nn)
{
__be32 status;
@@ -6441,10 +6786,15 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
* only return revoked delegations if explicitly asked.
* otherwise we report revoked or bad_stateid status.
*/
- if (typemask & NFS4_REVOKED_DELEG_STID)
+ if (statusmask & SC_STATUS_REVOKED)
return_revoked = true;
- else if (typemask & NFS4_DELEG_STID)
- typemask |= NFS4_REVOKED_DELEG_STID;
+ if (typemask & SC_TYPE_DELEG)
+ /* Always allow REVOKED for DELEG so we can
+ * return the appropriate error.
+ */
+ statusmask |= SC_STATUS_REVOKED;
+
+ statusmask |= SC_STATUS_ADMIN_REVOKED;
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) ||
CLOSE_STATEID(stateid))
@@ -6457,14 +6807,17 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
}
if (status)
return status;
- stid = find_stateid_by_type(cstate->clp, stateid, typemask);
+ stid = find_stateid_by_type(cstate->clp, stateid, typemask, statusmask);
if (!stid)
return nfserr_bad_stateid;
- if ((stid->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) {
+ if ((stid->sc_status & SC_STATUS_REVOKED) && !return_revoked) {
nfs4_put_stid(stid);
- if (cstate->minorversion)
- return nfserr_deleg_revoked;
- return nfserr_bad_stateid;
+ return nfserr_deleg_revoked;
+ }
+ if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) {
+ nfsd40_drop_revoked_stid(cstate->clp, stateid);
+ nfs4_put_stid(stid);
+ return nfserr_admin_revoked;
}
*s = stid;
return nfs_ok;
@@ -6475,17 +6828,17 @@ nfs4_find_file(struct nfs4_stid *s, int flags)
{
struct nfsd_file *ret = NULL;
- if (!s)
+ if (!s || s->sc_status)
return NULL;
switch (s->sc_type) {
- case NFS4_DELEG_STID:
+ case SC_TYPE_DELEG:
spin_lock(&s->sc_file->fi_lock);
ret = nfsd_file_get(s->sc_file->fi_deleg_file);
spin_unlock(&s->sc_file->fi_lock);
break;
- case NFS4_OPEN_STID:
- case NFS4_LOCK_STID:
+ case SC_TYPE_OPEN:
+ case SC_TYPE_LOCK:
if (flags & RD_STATE)
ret = find_readable_file(s->sc_file);
else
@@ -6598,7 +6951,8 @@ static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st,
goto out;
*stid = find_stateid_by_type(found, &cps->cp_p_stateid,
- NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID);
+ SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK,
+ 0);
if (*stid)
status = nfs_ok;
else
@@ -6655,8 +7009,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
}
status = nfsd4_lookup_stateid(cstate, stateid,
- NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
- &s, nn);
+ SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK,
+ 0, &s, nn);
if (status == nfserr_bad_stateid)
status = find_cpntf_state(nn, stateid, &s);
if (status)
@@ -6667,16 +7021,13 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
goto out;
switch (s->sc_type) {
- case NFS4_DELEG_STID:
+ case SC_TYPE_DELEG:
status = nfs4_check_delegmode(delegstateid(s), flags);
break;
- case NFS4_OPEN_STID:
- case NFS4_LOCK_STID:
+ case SC_TYPE_OPEN:
+ case SC_TYPE_LOCK:
status = nfs4_check_olstateid(openlockstateid(s), flags);
break;
- default:
- status = nfserr_bad_stateid;
- break;
}
if (status)
goto out;
@@ -6755,34 +7106,39 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
spin_lock(&cl->cl_lock);
s = find_stateid_locked(cl, stateid);
- if (!s)
+ if (!s || s->sc_status & SC_STATUS_CLOSED)
goto out_unlock;
+ if (s->sc_status & SC_STATUS_ADMIN_REVOKED) {
+ nfsd4_drop_revoked_stid(s);
+ ret = nfs_ok;
+ goto out;
+ }
spin_lock(&s->sc_lock);
switch (s->sc_type) {
- case NFS4_DELEG_STID:
+ case SC_TYPE_DELEG:
+ if (s->sc_status & SC_STATUS_REVOKED) {
+ spin_unlock(&s->sc_lock);
+ dp = delegstateid(s);
+ list_del_init(&dp->dl_recall_lru);
+ spin_unlock(&cl->cl_lock);
+ nfs4_put_stid(s);
+ ret = nfs_ok;
+ goto out;
+ }
ret = nfserr_locks_held;
break;
- case NFS4_OPEN_STID:
+ case SC_TYPE_OPEN:
ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
if (ret)
break;
ret = nfserr_locks_held;
break;
- case NFS4_LOCK_STID:
+ case SC_TYPE_LOCK:
spin_unlock(&s->sc_lock);
refcount_inc(&s->sc_count);
spin_unlock(&cl->cl_lock);
ret = nfsd4_free_lock_stateid(stateid, s);
goto out;
- case NFS4_REVOKED_DELEG_STID:
- spin_unlock(&s->sc_lock);
- dp = delegstateid(s);
- list_del_init(&dp->dl_recall_lru);
- spin_unlock(&cl->cl_lock);
- nfs4_put_stid(s);
- ret = nfs_ok;
- goto out;
- /* Default falls through and returns nfserr_bad_stateid */
}
spin_unlock(&s->sc_lock);
out_unlock:
@@ -6824,6 +7180,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
* @seqid: seqid (provided by client)
* @stateid: stateid (provided by client)
* @typemask: mask of allowable types for this operation
+ * @statusmask: mask of allowed states: 0 or SC_STATUS_CLOSED
* @stpp: return pointer for the stateid found
* @nn: net namespace for request
*
@@ -6833,7 +7190,8 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
*/
static __be32
nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
- stateid_t *stateid, char typemask,
+ stateid_t *stateid,
+ unsigned short typemask, unsigned short statusmask,
struct nfs4_ol_stateid **stpp,
struct nfsd_net *nn)
{
@@ -6844,7 +7202,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
trace_nfsd_preprocess(seqid, stateid);
*stpp = NULL;
- status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
+ status = nfsd4_lookup_stateid(cstate, stateid,
+ typemask, statusmask, &s, nn);
if (status)
return status;
stp = openlockstateid(s);
@@ -6866,7 +7225,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
struct nfs4_ol_stateid *stp;
status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
- NFS4_OPEN_STID, &stp, nn);
+ SC_TYPE_OPEN, 0, &stp, nn);
if (status)
return status;
oo = openowner(stp->st_stateowner);
@@ -6897,8 +7256,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status;
status = nfs4_preprocess_seqid_op(cstate,
- oc->oc_seqid, &oc->oc_req_stateid,
- NFS4_OPEN_STID, &stp, nn);
+ oc->oc_seqid, &oc->oc_req_stateid,
+ SC_TYPE_OPEN, 0, &stp, nn);
if (status)
goto out;
oo = openowner(stp->st_stateowner);
@@ -7028,18 +7387,20 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- dprintk("NFSD: nfsd4_close on file %pd\n",
+ dprintk("NFSD: nfsd4_close on file %pd\n",
cstate->current_fh.fh_dentry);
status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
- &close->cl_stateid,
- NFS4_OPEN_STID|NFS4_CLOSED_STID,
- &stp, nn);
+ &close->cl_stateid,
+ SC_TYPE_OPEN, SC_STATUS_CLOSED,
+ &stp, nn);
nfsd4_bump_seqid(cstate, status);
if (status)
- goto out;
+ goto out;
- stp->st_stid.sc_type = NFS4_CLOSED_STID;
+ spin_lock(&stp->st_stid.sc_client->cl_lock);
+ stp->st_stid.sc_status |= SC_STATUS_CLOSED;
+ spin_unlock(&stp->st_stid.sc_client->cl_lock);
/*
* Technically we don't _really_ have to increment or copy it, since
@@ -7081,7 +7442,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
return status;
- status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn);
+ status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn);
if (status)
goto out;
dp = delegstateid(s);
@@ -7148,7 +7509,7 @@ nfsd4_lm_put_owner(fl_owner_t owner)
static bool
nfsd4_lm_lock_expirable(struct file_lock *cfl)
{
- struct nfs4_lockowner *lo = (struct nfs4_lockowner *)cfl->fl_owner;
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *) cfl->c.flc_owner;
struct nfs4_client *clp = lo->lo_owner.so_client;
struct nfsd_net *nn;
@@ -7170,7 +7531,7 @@ nfsd4_lm_expire_lock(void)
static void
nfsd4_lm_notify(struct file_lock *fl)
{
- struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner;
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *) fl->c.flc_owner;
struct net *net = lo->lo_owner.so_client->net;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct nfsd4_blocked_lock *nbl = container_of(fl,
@@ -7207,7 +7568,7 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
struct nfs4_lockowner *lo;
if (fl->fl_lmops == &nfsd_posix_mng_ops) {
- lo = (struct nfs4_lockowner *) fl->fl_owner;
+ lo = (struct nfs4_lockowner *) fl->c.flc_owner;
xdr_netobj_dup(&deny->ld_owner, &lo->lo_owner.so_owner,
GFP_KERNEL);
if (!deny->ld_owner.data)
@@ -7226,7 +7587,7 @@ nevermind:
if (fl->fl_end != NFS4_MAX_UINT64)
deny->ld_length = fl->fl_end - fl->fl_start + 1;
deny->ld_type = NFS4_READ_LT;
- if (fl->fl_type != F_RDLCK)
+ if (fl->c.flc_type != F_RDLCK)
deny->ld_type = NFS4_WRITE_LT;
}
@@ -7348,7 +7709,7 @@ retry:
if (retstp)
goto out_found;
refcount_inc(&stp->st_stid.sc_count);
- stp->st_stid.sc_type = NFS4_LOCK_STID;
+ stp->st_stid.sc_type = SC_TYPE_LOCK;
stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner);
get_nfs4_file(fp);
stp->st_stid.sc_file = fp;
@@ -7492,8 +7853,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
int lkflg;
int err;
bool new = false;
- unsigned char fl_type;
- unsigned int fl_flags = FL_POSIX;
+ unsigned char type;
+ unsigned int flags = FL_POSIX;
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -7535,9 +7896,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&lock_stp, &new);
} else {
status = nfs4_preprocess_seqid_op(cstate,
- lock->lk_old_lock_seqid,
- &lock->lk_old_lock_stateid,
- NFS4_LOCK_STID, &lock_stp, nn);
+ lock->lk_old_lock_seqid,
+ &lock->lk_old_lock_stateid,
+ SC_TYPE_LOCK, 0, &lock_stp,
+ nn);
}
if (status)
goto out;
@@ -7556,14 +7918,14 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
if (lock->lk_reclaim)
- fl_flags |= FL_RECLAIM;
+ flags |= FL_RECLAIM;
fp = lock_stp->st_stid.sc_file;
switch (lock->lk_type) {
case NFS4_READW_LT:
if (nfsd4_has_session(cstate) ||
exportfs_lock_op_is_async(sb->s_export_op))
- fl_flags |= FL_SLEEP;
+ flags |= FL_SLEEP;
fallthrough;
case NFS4_READ_LT:
spin_lock(&fp->fi_lock);
@@ -7571,12 +7933,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (nf)
get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
spin_unlock(&fp->fi_lock);
- fl_type = F_RDLCK;
+ type = F_RDLCK;
break;
case NFS4_WRITEW_LT:
if (nfsd4_has_session(cstate) ||
exportfs_lock_op_is_async(sb->s_export_op))
- fl_flags |= FL_SLEEP;
+ flags |= FL_SLEEP;
fallthrough;
case NFS4_WRITE_LT:
spin_lock(&fp->fi_lock);
@@ -7584,7 +7946,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (nf)
get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
spin_unlock(&fp->fi_lock);
- fl_type = F_WRLCK;
+ type = F_WRLCK;
break;
default:
status = nfserr_inval;
@@ -7604,7 +7966,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* on those filesystems:
*/
if (!exportfs_lock_op_is_async(sb->s_export_op))
- fl_flags &= ~FL_SLEEP;
+ flags &= ~FL_SLEEP;
nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
if (!nbl) {
@@ -7614,11 +7976,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
file_lock = &nbl->nbl_lock;
- file_lock->fl_type = fl_type;
- file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
- file_lock->fl_pid = current->tgid;
- file_lock->fl_file = nf->nf_file;
- file_lock->fl_flags = fl_flags;
+ file_lock->c.flc_type = type;
+ file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
+ file_lock->c.flc_pid = current->tgid;
+ file_lock->c.flc_file = nf->nf_file;
+ file_lock->c.flc_flags = flags;
file_lock->fl_lmops = &nfsd_posix_mng_ops;
file_lock->fl_start = lock->lk_offset;
file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
@@ -7631,7 +7993,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- if (fl_flags & FL_SLEEP) {
+ if (flags & FL_SLEEP) {
nbl->nbl_time = ktime_get_boottime_seconds();
spin_lock(&nn->blocked_locks_lock);
list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked);
@@ -7668,7 +8030,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
out:
if (nbl) {
/* dequeue it if we queued it before */
- if (fl_flags & FL_SLEEP) {
+ if (flags & FL_SLEEP) {
spin_lock(&nn->blocked_locks_lock);
if (!list_empty(&nbl->nbl_list) &&
!list_empty(&nbl->nbl_lru)) {
@@ -7736,9 +8098,9 @@ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct
err = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
if (err)
goto out;
- lock->fl_file = nf->nf_file;
+ lock->c.flc_file = nf->nf_file;
err = nfserrno(vfs_test_lock(nf->nf_file, lock));
- lock->fl_file = NULL;
+ lock->c.flc_file = NULL;
out:
inode_unlock(inode);
nfsd_file_put(nf);
@@ -7783,11 +8145,11 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
switch (lockt->lt_type) {
case NFS4_READ_LT:
case NFS4_READW_LT:
- file_lock->fl_type = F_RDLCK;
+ file_lock->c.flc_type = F_RDLCK;
break;
case NFS4_WRITE_LT:
case NFS4_WRITEW_LT:
- file_lock->fl_type = F_WRLCK;
+ file_lock->c.flc_type = F_WRLCK;
break;
default:
dprintk("NFSD: nfs4_lockt: bad lock type!\n");
@@ -7797,9 +8159,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
lo = find_lockowner_str(cstate->clp, &lockt->lt_owner);
if (lo)
- file_lock->fl_owner = (fl_owner_t)lo;
- file_lock->fl_pid = current->tgid;
- file_lock->fl_flags = FL_POSIX;
+ file_lock->c.flc_owner = (fl_owner_t)lo;
+ file_lock->c.flc_pid = current->tgid;
+ file_lock->c.flc_flags = FL_POSIX;
file_lock->fl_start = lockt->lt_offset;
file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
@@ -7810,7 +8172,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
- if (file_lock->fl_type != F_UNLCK) {
+ if (file_lock->c.flc_type != F_UNLCK) {
status = nfserr_denied;
nfs4_set_lock_denied(file_lock, &lockt->lt_denied);
}
@@ -7850,8 +8212,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_inval;
status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
- &locku->lu_stateid, NFS4_LOCK_STID,
- &stp, nn);
+ &locku->lu_stateid, SC_TYPE_LOCK, 0,
+ &stp, nn);
if (status)
goto out;
nf = find_any_file(stp->st_stid.sc_file);
@@ -7866,11 +8228,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto put_file;
}
- file_lock->fl_type = F_UNLCK;
- file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner));
- file_lock->fl_pid = current->tgid;
- file_lock->fl_file = nf->nf_file;
- file_lock->fl_flags = FL_POSIX;
+ file_lock->c.flc_type = F_UNLCK;
+ file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner));
+ file_lock->c.flc_pid = current->tgid;
+ file_lock->c.flc_file = nf->nf_file;
+ file_lock->c.flc_flags = FL_POSIX;
file_lock->fl_lmops = &nfsd_posix_mng_ops;
file_lock->fl_start = locku->lu_offset;
@@ -7927,8 +8289,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
if (flctx && !list_empty_careful(&flctx->flc_posix)) {
spin_lock(&flctx->flc_lock);
- list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
- if (fl->fl_owner == (fl_owner_t)lowner) {
+ for_each_file_lock(fl, &flctx->flc_posix) {
+ if (fl->c.flc_owner == (fl_owner_t)lowner) {
status = true;
break;
}
@@ -7996,7 +8358,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
stp = list_first_entry(&lo->lo_owner.so_stateids,
struct nfs4_ol_stateid,
st_perstateowner);
- WARN_ON(!unhash_lock_stateid(stp));
+ unhash_lock_stateid(stp);
put_ol_stateid_locked(stp, &reaplist);
}
spin_unlock(&clp->cl_lock);
@@ -8289,7 +8651,7 @@ nfs4_state_shutdown_net(struct net *net)
spin_lock(&state_lock);
list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- WARN_ON(!unhash_delegation_locked(dp));
+ unhash_delegation_locked(dp, SC_STATUS_CLOSED);
list_add(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&state_lock);
@@ -8431,6 +8793,8 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
* nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
* @rqstp: RPC transaction context
* @inode: file to be checked for a conflict
+ * @modified: return true if file was modified
+ * @size: new size of file if modified is true
*
* This function is called when there is a conflict between a write
* delegation and a change/size GETATTR from another client. The server
@@ -8439,27 +8803,30 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
* delegation before replying to the GETATTR. See RFC 8881 section
* 18.7.4.
*
- * The current implementation does not support CB_GETATTR yet. However
- * this can avoid recalling the delegation could be added in follow up
- * work.
- *
* Returns 0 if there is no conflict; otherwise an nfs_stat
* code is returned.
*/
__be32
-nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
+nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode,
+ bool *modified, u64 *size)
{
__be32 status;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct file_lock_context *ctx;
- struct file_lock *fl;
+ struct file_lease *fl;
struct nfs4_delegation *dp;
+ struct iattr attrs;
+ struct nfs4_cb_fattr *ncf;
+ *modified = false;
ctx = locks_inode_context(inode);
if (!ctx)
return 0;
spin_lock(&ctx->flc_lock);
- list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
- if (fl->fl_flags == FL_LAYOUT)
+ for_each_file_lock(fl, &ctx->flc_lease) {
+ unsigned char type = fl->c.flc_type;
+
+ if (fl->c.flc_flags == FL_LAYOUT)
continue;
if (fl->fl_lmops != &nfsd_lease_mng_ops) {
/*
@@ -8467,23 +8834,49 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
* we are done; there isn't any write delegation
* on this inode
*/
- if (fl->fl_type == F_RDLCK)
+ if (type == F_RDLCK)
break;
goto break_lease;
}
- if (fl->fl_type == F_WRLCK) {
- dp = fl->fl_owner;
+ if (type == F_WRLCK) {
+ dp = fl->c.flc_owner;
if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) {
spin_unlock(&ctx->flc_lock);
return 0;
}
break_lease:
+ nfsd_stats_wdeleg_getattr_inc(nn);
+ dp = fl->c.flc_owner;
+ ncf = &dp->dl_cb_fattr;
+ nfs4_cb_getattr(&dp->dl_cb_fattr);
spin_unlock(&ctx->flc_lock);
- nfsd_stats_wdeleg_getattr_inc();
- status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
- if (status != nfserr_jukebox ||
- !nfsd_wait_for_delegreturn(rqstp, inode))
- return status;
+ wait_on_bit_timeout(&ncf->ncf_cb_flags, CB_GETATTR_BUSY,
+ TASK_INTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT);
+ if (ncf->ncf_cb_status) {
+ /* Recall delegation only if client didn't respond */
+ status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
+ if (status != nfserr_jukebox ||
+ !nfsd_wait_for_delegreturn(rqstp, inode))
+ return status;
+ }
+ if (!ncf->ncf_file_modified &&
+ (ncf->ncf_initial_cinfo != ncf->ncf_cb_change ||
+ ncf->ncf_cur_fsize != ncf->ncf_cb_fsize))
+ ncf->ncf_file_modified = true;
+ if (ncf->ncf_file_modified) {
+ /*
+ * Per section 10.4.3 of RFC 8881, the server would
+ * not update the file's metadata with the client's
+ * modified size
+ */
+ attrs.ia_mtime = attrs.ia_ctime = current_time(inode);
+ attrs.ia_valid = ATTR_MTIME | ATTR_CTIME;
+ setattr_copy(&nop_mnt_idmap, inode, &attrs);
+ mark_inode_dirty(inode);
+ ncf->ncf_cur_fsize = ncf->ncf_cb_fsize;
+ *size = ncf->ncf_cur_fsize;
+ *modified = true;
+ }
return 0;
}
break;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c719c475a068..1955481832e0 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3490,11 +3490,13 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
struct dentry *dentry, const u32 *bmval,
int ignore_crossmnt)
{
+ DECLARE_BITMAP(attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops));
struct nfsd4_fattr_args args;
struct svc_fh *tempfh = NULL;
int starting_len = xdr->buf->len;
__be32 *attrlen_p, status;
int attrlen_offset;
+ u32 attrmask[3];
int err;
struct nfsd4_compoundres *resp = rqstp->rq_resp;
u32 minorversion = resp->cstate.minorversion;
@@ -3502,11 +3504,9 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
.mnt = exp->ex_path.mnt,
.dentry = dentry,
};
- union {
- u32 attrmask[3];
- unsigned long mask[2];
- } u;
unsigned long bit;
+ bool file_modified = false;
+ u64 size = 0;
WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1);
WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval));
@@ -3519,21 +3519,21 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
/*
* Make a local copy of the attribute bitmap that can be modified.
*/
- memset(&u, 0, sizeof(u));
- u.attrmask[0] = bmval[0];
- u.attrmask[1] = bmval[1];
- u.attrmask[2] = bmval[2];
+ attrmask[0] = bmval[0];
+ attrmask[1] = bmval[1];
+ attrmask[2] = bmval[2];
args.rdattr_err = 0;
if (exp->ex_fslocs.migrated) {
- status = fattr_handle_absent_fs(&u.attrmask[0], &u.attrmask[1],
- &u.attrmask[2], &args.rdattr_err);
+ status = fattr_handle_absent_fs(&attrmask[0], &attrmask[1],
+ &attrmask[2], &args.rdattr_err);
if (status)
goto out;
}
args.size = 0;
- if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
- status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry));
+ if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
+ status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry),
+ &file_modified, &size);
if (status)
goto out;
}
@@ -3543,20 +3543,23 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
AT_STATX_SYNC_AS_STAT);
if (err)
goto out_nfserr;
- args.size = args.stat.size;
+ if (file_modified)
+ args.size = size;
+ else
+ args.size = args.stat.size;
if (!(args.stat.result_mask & STATX_BTIME))
/* underlying FS does not offer btime so we can't share it */
- u.attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
- if ((u.attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
+ attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
+ if ((attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
- (u.attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
+ (attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
FATTR4_WORD1_SPACE_TOTAL))) {
err = vfs_statfs(&path, &args.statfs);
if (err)
goto out_nfserr;
}
- if ((u.attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) &&
+ if ((attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) &&
!fhp) {
tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
status = nfserr_jukebox;
@@ -3571,10 +3574,10 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
args.fhp = fhp;
args.acl = NULL;
- if (u.attrmask[0] & FATTR4_WORD0_ACL) {
+ if (attrmask[0] & FATTR4_WORD0_ACL) {
err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl);
if (err == -EOPNOTSUPP)
- u.attrmask[0] &= ~FATTR4_WORD0_ACL;
+ attrmask[0] &= ~FATTR4_WORD0_ACL;
else if (err == -EINVAL) {
status = nfserr_attrnotsupp;
goto out;
@@ -3586,17 +3589,17 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
args.context = NULL;
- if ((u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) ||
- u.attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
+ if ((attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) ||
+ attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
if (exp->ex_flags & NFSEXP_SECURITY_LABEL)
err = security_inode_getsecctx(d_inode(dentry),
&args.context, &args.contextlen);
else
err = -EOPNOTSUPP;
args.contextsupport = (err == 0);
- if (u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) {
+ if (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) {
if (err == -EOPNOTSUPP)
- u.attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
else if (err)
goto out_nfserr;
}
@@ -3604,8 +3607,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
/* attrmask */
- status = nfsd4_encode_bitmap4(xdr, u.attrmask[0],
- u.attrmask[1], u.attrmask[2]);
+ status = nfsd4_encode_bitmap4(xdr, attrmask[0], attrmask[1],
+ attrmask[2]);
if (status)
goto out;
@@ -3614,7 +3617,9 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
attrlen_p = xdr_reserve_space(xdr, XDR_UNIT);
if (!attrlen_p)
goto out_resource;
- for_each_set_bit(bit, (const unsigned long *)&u.mask,
+ bitmap_from_arr32(attr_bitmap, attrmask,
+ ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops));
+ for_each_set_bit(bit, attr_bitmap,
ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) {
status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args);
if (status != nfs_ok)
@@ -5386,16 +5391,11 @@ nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs,
/*
* If the cookie is larger than the maximum number we can fit
- * in either the buffer we just got back from vfs_listxattr, or,
- * XDR-encoded, in the return buffer, it's invalid.
+ * in the buffer we just got back from vfs_listxattr, it's invalid.
*/
if (cookie > (listxattrs->lsxa_len) / (XATTR_USER_PREFIX_LEN + 2))
return nfserr_badcookie;
- if (cookie > (listxattrs->lsxa_maxcount /
- (XDR_QUADLEN(XATTR_USER_PREFIX_LEN + 2) + 4)))
- return nfserr_badcookie;
-
*offsetp = (u32)cookie;
return 0;
}
@@ -5412,6 +5412,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
u64 cookie;
char *sp;
__be32 status, tmp;
+ __be64 wire_cookie;
__be32 *p;
u32 nuser;
@@ -5427,7 +5428,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
*/
cookie_offset = xdr->buf->len;
count_offset = cookie_offset + 8;
- p = xdr_reserve_space(xdr, 12);
+ p = xdr_reserve_space(xdr, XDR_UNIT * 3);
if (!p) {
status = nfserr_resource;
goto out;
@@ -5438,7 +5439,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
sp = listxattrs->lsxa_buf;
nuser = 0;
- xdrleft = listxattrs->lsxa_maxcount;
+ /* Bytes left is maxcount - 8 (cookie) - 4 (array count) */
+ xdrleft = listxattrs->lsxa_maxcount - XDR_UNIT * 3;
while (left > 0 && xdrleft > 0) {
slen = strlen(sp);
@@ -5451,7 +5453,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
slen -= XATTR_USER_PREFIX_LEN;
xdrlen = 4 + ((slen + 3) & ~3);
- if (xdrlen > xdrleft) {
+ /* Check if both entry and eof can fit in the XDR buffer */
+ if (xdrlen + XDR_UNIT > xdrleft) {
if (count == 0) {
/*
* Can't even fit the first attribute name.
@@ -5503,7 +5506,8 @@ wreof:
cookie = offset + count;
- write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &cookie, 8);
+ wire_cookie = cpu_to_be64(cookie);
+ write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &wire_cookie, 8);
tmp = cpu_to_be32(count);
write_bytes_to_xdr_buf(xdr->buf, count_offset, &tmp, 4);
out:
@@ -5727,27 +5731,24 @@ release:
rqstp->rq_next_page = xdr->page_ptr + 1;
}
-/*
- * Encode the reply stored in the stateowner reply cache
- *
- * XDR note: do not encode rp->rp_buflen: the buffer contains the
- * previously sent already encoded operation.
+/**
+ * nfsd4_encode_replay - encode a result stored in the stateowner reply cache
+ * @xdr: send buffer's XDR stream
+ * @op: operation being replayed
+ *
+ * @op->replay->rp_buf contains the previously-sent already-encoded result.
*/
-void
-nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op)
+void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op)
{
- __be32 *p;
struct nfs4_replay *rp = op->replay;
- p = xdr_reserve_space(xdr, 8 + rp->rp_buflen);
- if (!p) {
- WARN_ON_ONCE(1);
- return;
- }
- *p++ = cpu_to_be32(op->opnum);
- *p++ = rp->rp_status; /* already xdr'ed */
+ trace_nfsd_stateowner_replay(op->opnum, rp);
- p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen);
+ if (xdr_stream_encode_u32(xdr, op->opnum) != XDR_UNIT)
+ return;
+ if (xdr_stream_encode_be32(xdr, rp->rp_status) != XDR_UNIT)
+ return;
+ xdr_stream_encode_opaque_fixed(xdr, rp->rp_buf, rp->rp_buflen);
}
void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 5c1a4a0aa605..ba9d326b3de6 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -166,8 +166,7 @@ nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp,
int nfsd_drc_slab_create(void)
{
- drc_slab = kmem_cache_create("nfsd_drc",
- sizeof(struct nfsd_cacherep), 0, 0, NULL);
+ drc_slab = KMEM_CACHE(nfsd_cacherep, 0);
return drc_slab ? 0: -ENOMEM;
}
@@ -176,27 +175,6 @@ void nfsd_drc_slab_free(void)
kmem_cache_destroy(drc_slab);
}
-/**
- * nfsd_net_reply_cache_init - per net namespace reply cache set-up
- * @nn: nfsd_net being initialized
- *
- * Returns zero on succes; otherwise a negative errno is returned.
- */
-int nfsd_net_reply_cache_init(struct nfsd_net *nn)
-{
- return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM);
-}
-
-/**
- * nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down
- * @nn: nfsd_net being freed
- *
- */
-void nfsd_net_reply_cache_destroy(struct nfsd_net *nn)
-{
- nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM);
-}
-
int nfsd_reply_cache_init(struct nfsd_net *nn)
{
unsigned int hashsize;
@@ -501,7 +479,7 @@ out:
int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
unsigned int len, struct nfsd_cacherep **cacherep)
{
- struct nfsd_net *nn;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct nfsd_cacherep *rp, *found;
__wsum csum;
struct nfsd_drc_bucket *b;
@@ -510,7 +488,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
int rtn = RC_DOIT;
if (type == RC_NOCACHE) {
- nfsd_stats_rc_nocache_inc();
+ nfsd_stats_rc_nocache_inc(nn);
goto out;
}
@@ -520,7 +498,6 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
* Since the common case is a cache miss followed by an insert,
* preallocate an entry.
*/
- nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
rp = nfsd_cacherep_alloc(rqstp, csum, nn);
if (!rp)
goto out;
@@ -537,7 +514,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
nfsd_cacherep_dispose(&dispose);
- nfsd_stats_rc_misses_inc();
+ nfsd_stats_rc_misses_inc(nn);
atomic_inc(&nn->num_drc_entries);
nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp));
goto out;
@@ -545,7 +522,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
found_entry:
/* We found a matching entry which is either in progress or done. */
nfsd_reply_cache_free_locked(NULL, rp, nn);
- nfsd_stats_rc_hits_inc();
+ nfsd_stats_rc_hits_inc(nn);
rtn = RC_DROPIT;
rp = found;
@@ -687,15 +664,15 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
atomic_read(&nn->num_drc_entries));
seq_printf(m, "hash buckets: %u\n", 1 << nn->maskbits);
seq_printf(m, "mem usage: %lld\n",
- percpu_counter_sum_positive(&nn->counter[NFSD_NET_DRC_MEM_USAGE]));
+ percpu_counter_sum_positive(&nn->counter[NFSD_STATS_DRC_MEM_USAGE]));
seq_printf(m, "cache hits: %lld\n",
- percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]));
+ percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS]));
seq_printf(m, "cache misses: %lld\n",
- percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]));
+ percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES]));
seq_printf(m, "not cached: %lld\n",
- percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]));
+ percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE]));
seq_printf(m, "payload misses: %lld\n",
- percpu_counter_sum_positive(&nn->counter[NFSD_NET_PAYLOAD_MISSES]));
+ percpu_counter_sum_positive(&nn->counter[NFSD_STATS_PAYLOAD_MISSES]));
seq_printf(m, "longest chain len: %u\n", nn->longest_chain);
seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize);
return 0;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index f206ca32e7f5..ecd18bffeebc 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -281,6 +281,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
* 3. Is that directory the root of an exported file system?
*/
error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb);
+ nfsd4_revoke_states(netns(file), path.dentry->d_sb);
path_put(&path);
return error;
@@ -1671,14 +1672,17 @@ static __net_init int nfsd_net_init(struct net *net)
retval = nfsd_idmap_init(net);
if (retval)
goto out_idmap_error;
- retval = nfsd_net_reply_cache_init(nn);
+ retval = nfsd_stat_counters_init(nn);
if (retval)
goto out_repcache_error;
+ memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats));
+ nn->nfsd_svcstats.program = &nfsd_program;
nn->nfsd_versions = NULL;
nn->nfsd4_minorversions = NULL;
nfsd4_init_leases_net(nn);
get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
seqlock_init(&nn->writeverf_lock);
+ nfsd_proc_stat_init(net);
return 0;
@@ -1699,7 +1703,8 @@ static __net_exit void nfsd_net_exit(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- nfsd_net_reply_cache_destroy(nn);
+ nfsd_proc_stat_shutdown(net);
+ nfsd_stat_counters_destroy(nn);
nfsd_idmap_shutdown(net);
nfsd_export_shutdown(net);
nfsd_netns_free_versions(nn);
@@ -1722,12 +1727,9 @@ static int __init init_nfsd(void)
retval = nfsd4_init_pnfs();
if (retval)
goto out_free_slabs;
- retval = nfsd_stat_init(); /* Statistics */
- if (retval)
- goto out_free_pnfs;
retval = nfsd_drc_slab_create();
if (retval)
- goto out_free_stat;
+ goto out_free_pnfs;
nfsd_lockd_init(); /* lockd->nfsd callbacks */
retval = create_proc_exports_entry();
if (retval)
@@ -1761,8 +1763,6 @@ out_free_exports:
out_free_lockd:
nfsd_lockd_shutdown();
nfsd_drc_slab_free();
-out_free_stat:
- nfsd_stat_shutdown();
out_free_pnfs:
nfsd4_exit_pnfs();
out_free_slabs:
@@ -1780,7 +1780,6 @@ static void __exit exit_nfsd(void)
nfsd_drc_slab_free();
remove_proc_entry("fs/nfs/exports", NULL);
remove_proc_entry("fs/nfs", NULL);
- nfsd_stat_shutdown();
nfsd_lockd_shutdown();
nfsd4_free_slabs();
nfsd4_exit_pnfs();
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 304e9728b929..16c5a05f340e 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -86,6 +86,7 @@ extern struct mutex nfsd_mutex;
extern spinlock_t nfsd_drc_lock;
extern unsigned long nfsd_drc_max_mem;
extern unsigned long nfsd_drc_mem_used;
+extern atomic_t nfsd_th_cnt; /* number of available threads */
extern const struct seq_operations nfs_exports_op;
@@ -274,6 +275,7 @@ void nfsd_lockd_shutdown(void);
#define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE)
#define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD)
#define nfserr_badname cpu_to_be32(NFSERR_BADNAME)
+#define nfserr_admin_revoked cpu_to_be32(NFS4ERR_ADMIN_REVOKED)
#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN)
#define nfserr_locked cpu_to_be32(NFSERR_LOCKED)
#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC)
@@ -365,6 +367,7 @@ void nfsd_lockd_shutdown(void);
#define NFSD_CLIENT_MAX_TRIM_PER_RUN 128
#define NFS4_CLIENTS_PER_GB 1024
#define NFSD_DELEGRETURN_TIMEOUT (HZ / 34) /* 30ms */
+#define NFSD_CB_GETATTR_TIMEOUT NFSD_DELEGRETURN_TIMEOUT
/*
* The following attributes are currently not supported by the NFSv4 server:
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index dbfa0ac13564..40fecf7b224f 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -327,6 +327,7 @@ out:
__be32
fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
{
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct svc_export *exp = NULL;
struct dentry *dentry;
__be32 error;
@@ -395,7 +396,7 @@ skip_pseudoflavor_check:
out:
trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error);
if (error == nfserr_stale)
- nfsd_stats_fh_stale_inc(exp);
+ nfsd_stats_fh_stale_inc(nn, exp);
return error;
}
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index a7315928a760..36370b957b63 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -103,7 +103,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp)
}
}
- resp->status = nfsd_setattr(rqstp, fhp, &attrs, 0, (time64_t)0);
+ resp->status = nfsd_setattr(rqstp, fhp, &attrs, NULL);
if (resp->status != nfs_ok)
goto out;
@@ -390,8 +390,8 @@ nfsd_proc_create(struct svc_rqst *rqstp)
*/
attr->ia_valid &= ATTR_SIZE;
if (attr->ia_valid)
- resp->status = nfsd_setattr(rqstp, newfhp, &attrs, 0,
- (time64_t)0);
+ resp->status = nfsd_setattr(rqstp, newfhp, &attrs,
+ NULL);
}
out_unlock:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index a667802e08e7..c0d17b92b249 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -34,6 +34,7 @@
#define NFSDDBG_FACILITY NFSDDBG_SVC
+atomic_t nfsd_th_cnt = ATOMIC_INIT(0);
extern struct svc_program nfsd_program;
static int nfsd(void *vrqstp);
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
@@ -80,7 +81,6 @@ unsigned long nfsd_drc_max_mem;
unsigned long nfsd_drc_mem_used;
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
-static struct svc_stat nfsd_acl_svcstats;
static const struct svc_version *nfsd_acl_version[] = {
# if defined(CONFIG_NFSD_V2_ACL)
[2] = &nfsd_acl_version2,
@@ -99,15 +99,11 @@ static struct svc_program nfsd_acl_program = {
.pg_vers = nfsd_acl_version,
.pg_name = "nfsacl",
.pg_class = "nfsd",
- .pg_stats = &nfsd_acl_svcstats,
.pg_authenticate = &svc_set_client,
.pg_init_request = nfsd_acl_init_request,
.pg_rpcbind_set = nfsd_acl_rpcbind_set,
};
-static struct svc_stat nfsd_acl_svcstats = {
- .program = &nfsd_acl_program,
-};
#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
static const struct svc_version *nfsd_version[] = {
@@ -132,7 +128,6 @@ struct svc_program nfsd_program = {
.pg_vers = nfsd_version, /* version table */
.pg_name = "nfsd", /* program name */
.pg_class = "nfsd", /* authentication class */
- .pg_stats = &nfsd_svcstats, /* version table */
.pg_authenticate = &svc_set_client, /* export authentication */
.pg_init_request = nfsd_init_request,
.pg_rpcbind_set = nfsd_rpcbind_set,
@@ -666,7 +661,8 @@ int nfsd_create_serv(struct net *net)
if (nfsd_max_blksize == 0)
nfsd_max_blksize = nfsd_get_default_max_blksize();
nfsd_reset_versions(nn);
- serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd);
+ serv = svc_create_pooled(&nfsd_program, &nn->nfsd_svcstats,
+ nfsd_max_blksize, nfsd);
if (serv == NULL)
return -ENOMEM;
@@ -929,7 +925,7 @@ nfsd(void *vrqstp)
current->fs->umask = 0;
- atomic_inc(&nfsdstats.th_cnt);
+ atomic_inc(&nfsd_th_cnt);
set_freezable();
@@ -941,9 +937,11 @@ nfsd(void *vrqstp)
rqstp->rq_server->sv_maxconn = nn->max_connections;
svc_recv(rqstp);
+
+ nfsd_file_net_dispose(nn);
}
- atomic_dec(&nfsdstats.th_cnt);
+ atomic_dec(&nfsd_th_cnt);
out:
/* Release the thread */
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index de1e0dfed06a..925817f66917 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -37,7 +37,8 @@ struct nfsd4_layout_ops {
__be32 (*proc_layoutcommit)(struct inode *inode,
struct nfsd4_layoutcommit *lcp);
- void (*fence_client)(struct nfs4_layout_stateid *ls);
+ void (*fence_client)(struct nfs4_layout_stateid *ls,
+ struct nfsd_file *file);
};
extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
@@ -72,11 +73,13 @@ void nfsd4_setup_layout_type(struct svc_export *exp);
void nfsd4_return_all_client_layouts(struct nfs4_client *);
void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
struct nfs4_file *fp);
+void nfsd4_close_layout(struct nfs4_layout_stateid *ls);
int nfsd4_init_pnfs(void);
void nfsd4_exit_pnfs(void);
#else
struct nfs4_client;
struct nfs4_file;
+struct nfs4_layout_stateid;
static inline void nfsd4_setup_layout_type(struct svc_export *exp)
{
@@ -89,6 +92,9 @@ static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
struct nfs4_file *fp)
{
}
+static inline void nfsd4_close_layout(struct nfs4_layout_stateid *ls)
+{
+}
static inline void nfsd4_exit_pnfs(void)
{
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 41bdc913fa71..01c6f3445646 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -68,7 +68,7 @@ struct nfsd4_callback {
struct nfs4_client *cb_clp;
struct rpc_message cb_msg;
const struct nfsd4_callback_ops *cb_ops;
- struct work_struct cb_work;
+ struct delayed_work cb_work;
int cb_seq_status;
int cb_status;
bool cb_need_restart;
@@ -88,17 +88,34 @@ struct nfsd4_callback_ops {
*/
struct nfs4_stid {
refcount_t sc_count;
-#define NFS4_OPEN_STID 1
-#define NFS4_LOCK_STID 2
-#define NFS4_DELEG_STID 4
-/* For an open stateid kept around *only* to process close replays: */
-#define NFS4_CLOSED_STID 8
+
+ /* A new stateid is added to the cl_stateids idr early before it
+ * is fully initialised. Its sc_type is then zero. After
+ * initialisation the sc_type is set under cl_lock, and then
+ * never changes.
+ */
+#define SC_TYPE_OPEN BIT(0)
+#define SC_TYPE_LOCK BIT(1)
+#define SC_TYPE_DELEG BIT(2)
+#define SC_TYPE_LAYOUT BIT(3)
+ unsigned short sc_type;
+
+/* state_lock protects sc_status for delegation stateids.
+ * ->cl_lock protects sc_status for open and lock stateids.
+ * ->st_mutex also protects sc_status for open stateids.
+ * ->ls_lock protects sc_status for layout stateids.
+ */
+/*
+ * For an open stateid kept around *only* to process close replays.
+ * For deleg stateid, kept in idr until last reference is dropped.
+ */
+#define SC_STATUS_CLOSED BIT(0)
/* For a deleg stateid kept around only to process free_stateid's: */
-#define NFS4_REVOKED_DELEG_STID 16
-#define NFS4_CLOSED_DELEG_STID 32
-#define NFS4_LAYOUT_STID 64
+#define SC_STATUS_REVOKED BIT(1)
+#define SC_STATUS_ADMIN_REVOKED BIT(2)
+ unsigned short sc_status;
+
struct list_head sc_cp_list;
- unsigned char sc_type;
stateid_t sc_stateid;
spinlock_t sc_lock;
struct nfs4_client *sc_client;
@@ -117,6 +134,24 @@ struct nfs4_cpntf_state {
time64_t cpntf_time; /* last time stateid used */
};
+struct nfs4_cb_fattr {
+ struct nfsd4_callback ncf_getattr;
+ u32 ncf_cb_status;
+ u32 ncf_cb_bmap[1];
+
+ /* from CB_GETATTR reply */
+ u64 ncf_cb_change;
+ u64 ncf_cb_fsize;
+
+ unsigned long ncf_cb_flags;
+ bool ncf_file_modified;
+ u64 ncf_initial_cinfo;
+ u64 ncf_cur_fsize;
+};
+
+/* bits for ncf_cb_flags */
+#define CB_GETATTR_BUSY 0
+
/*
* Represents a delegation stateid. The nfs4_client holds references to these
* and they are put when it is being destroyed or when the delegation is
@@ -150,6 +185,9 @@ struct nfs4_delegation {
int dl_retries;
struct nfsd4_callback dl_recall;
bool dl_recalled;
+
+ /* for CB_GETATTR */
+ struct nfs4_cb_fattr dl_cb_fattr;
};
#define cb_to_delegation(cb) \
@@ -317,8 +355,9 @@ enum {
* 0. If they are not renewed within a lease period, they become eligible for
* destruction by the laundromat.
*
- * These objects can also be destroyed prematurely by the fault injection code,
- * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates.
+ * These objects can also be destroyed if the client sends certain forms of
+ * SETCLIENTID or EXCHANGE_ID operations.
+ *
* Care is taken *not* to do this however when the objects have an elevated
* refcount.
*
@@ -326,7 +365,7 @@ enum {
*
* o Each nfs4_clients is also hashed by name (the opaque quantity initially
* sent by the client to identify itself).
- *
+ *
* o cl_perclient list is used to ensure no dangling stateowner references
* when we expire the nfs4_client
*/
@@ -351,6 +390,7 @@ struct nfs4_client {
clientid_t cl_clientid; /* generated by server */
nfs4_verifier cl_confirm; /* generated by server */
u32 cl_minorversion;
+ atomic_t cl_admin_revoked; /* count of admin-revoked states */
/* NFSv4.1 client implementation id: */
struct xdr_netobj cl_nii_domain;
struct xdr_netobj cl_nii_name;
@@ -640,6 +680,7 @@ enum nfsd4_cb_op {
NFSPROC4_CLNT_CB_SEQUENCE,
NFSPROC4_CLNT_CB_NOTIFY_LOCK,
NFSPROC4_CLNT_CB_RECALL_ANY,
+ NFSPROC4_CLNT_CB_GETATTR,
};
/* Returns true iff a is later than b: */
@@ -672,15 +713,15 @@ extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
stateid_t *stateid, int flags, struct nfsd_file **filp,
struct nfs4_stid **cstid);
__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
- stateid_t *stateid, unsigned char typemask,
- struct nfs4_stid **s, struct nfsd_net *nn);
+ stateid_t *stateid, unsigned short typemask,
+ unsigned short statusmask,
+ struct nfs4_stid **s, struct nfsd_net *nn);
struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab,
void (*sc_free)(struct nfs4_stid *));
int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy);
void nfs4_free_copy_state(struct nfsd4_copy *copy);
struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn,
struct nfs4_stid *p_stid);
-void nfs4_unhash_stid(struct nfs4_stid *s);
void nfs4_put_stid(struct nfs4_stid *s);
void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
@@ -714,6 +755,14 @@ static inline void get_nfs4_file(struct nfs4_file *fi)
}
struct nfsd_file *find_any_file(struct nfs4_file *f);
+#ifdef CONFIG_NFSD_V4
+void nfsd4_revoke_states(struct net *net, struct super_block *sb);
+#else
+static inline void nfsd4_revoke_states(struct net *net, struct super_block *sb)
+{
+}
+#endif
+
/* grace period management */
void nfsd4_end_grace(struct nfsd_net *nn);
@@ -732,5 +781,5 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
}
extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
- struct inode *inode);
+ struct inode *inode, bool *file_modified, u64 *size);
#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 12d79f5d4eb1..be52fb1e928e 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -27,25 +27,22 @@
#include "nfsd.h"
-struct nfsd_stats nfsdstats;
-struct svc_stat nfsd_svcstats = {
- .program = &nfsd_program,
-};
-
static int nfsd_show(struct seq_file *seq, void *v)
{
+ struct net *net = pde_data(file_inode(seq->file));
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
int i;
seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n",
- percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]),
- percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]),
- percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]),
- percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_FH_STALE]),
- percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_READ]),
- percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE]));
+ percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS]),
+ percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES]),
+ percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE]),
+ percpu_counter_sum_positive(&nn->counter[NFSD_STATS_FH_STALE]),
+ percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_READ]),
+ percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE]));
/* thread usage: */
- seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt));
+ seq_printf(seq, "th %u 0", atomic_read(&nfsd_th_cnt));
/* deprecated thread usage histogram stats */
for (i = 0; i < 10; i++)
@@ -55,7 +52,7 @@ static int nfsd_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n");
/* show my rpc info */
- svc_seq_show(seq, &nfsd_svcstats);
+ svc_seq_show(seq, &nn->nfsd_svcstats);
#ifdef CONFIG_NFSD_V4
/* Show count for individual nfsv4 operations */
@@ -63,10 +60,10 @@ static int nfsd_show(struct seq_file *seq, void *v)
seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1);
for (i = 0; i <= LAST_NFS4_OP; i++) {
seq_printf(seq, " %lld",
- percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)]));
+ percpu_counter_sum_positive(&nn->counter[NFSD_STATS_NFS4_OP(i)]));
}
seq_printf(seq, "\nwdeleg_getattr %lld",
- percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]));
+ percpu_counter_sum_positive(&nn->counter[NFSD_STATS_WDELEG_GETATTR]));
seq_putc(seq, '\n');
#endif
@@ -108,31 +105,24 @@ void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num)
percpu_counter_destroy(&counters[i]);
}
-static int nfsd_stat_counters_init(void)
+int nfsd_stat_counters_init(struct nfsd_net *nn)
{
- return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
+ return nfsd_percpu_counters_init(nn->counter, NFSD_STATS_COUNTERS_NUM);
}
-static void nfsd_stat_counters_destroy(void)
+void nfsd_stat_counters_destroy(struct nfsd_net *nn)
{
- nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
+ nfsd_percpu_counters_destroy(nn->counter, NFSD_STATS_COUNTERS_NUM);
}
-int nfsd_stat_init(void)
+void nfsd_proc_stat_init(struct net *net)
{
- int err;
-
- err = nfsd_stat_counters_init();
- if (err)
- return err;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops);
-
- return 0;
+ svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops);
}
-void nfsd_stat_shutdown(void)
+void nfsd_proc_stat_shutdown(struct net *net)
{
- nfsd_stat_counters_destroy();
- svc_proc_unregister(&init_net, "nfsd");
+ svc_proc_unregister(net, "nfsd");
}
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index 14f50c660b61..d2753e975dfd 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -10,94 +10,72 @@
#include <uapi/linux/nfsd/stats.h>
#include <linux/percpu_counter.h>
-
-enum {
- NFSD_STATS_RC_HITS, /* repcache hits */
- NFSD_STATS_RC_MISSES, /* repcache misses */
- NFSD_STATS_RC_NOCACHE, /* uncached reqs */
- NFSD_STATS_FH_STALE, /* FH stale error */
- NFSD_STATS_IO_READ, /* bytes returned to read requests */
- NFSD_STATS_IO_WRITE, /* bytes passed in write requests */
-#ifdef CONFIG_NFSD_V4
- NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */
- NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP,
-#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op))
- NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */
-#endif
- NFSD_STATS_COUNTERS_NUM
-};
-
-struct nfsd_stats {
- struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM];
-
- atomic_t th_cnt; /* number of available threads */
-};
-
-extern struct nfsd_stats nfsdstats;
-
-extern struct svc_stat nfsd_svcstats;
-
int nfsd_percpu_counters_init(struct percpu_counter *counters, int num);
void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num);
void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num);
-int nfsd_stat_init(void);
-void nfsd_stat_shutdown(void);
+int nfsd_stat_counters_init(struct nfsd_net *nn);
+void nfsd_stat_counters_destroy(struct nfsd_net *nn);
+void nfsd_proc_stat_init(struct net *net);
+void nfsd_proc_stat_shutdown(struct net *net);
-static inline void nfsd_stats_rc_hits_inc(void)
+static inline void nfsd_stats_rc_hits_inc(struct nfsd_net *nn)
{
- percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_HITS]);
+ percpu_counter_inc(&nn->counter[NFSD_STATS_RC_HITS]);
}
-static inline void nfsd_stats_rc_misses_inc(void)
+static inline void nfsd_stats_rc_misses_inc(struct nfsd_net *nn)
{
- percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_MISSES]);
+ percpu_counter_inc(&nn->counter[NFSD_STATS_RC_MISSES]);
}
-static inline void nfsd_stats_rc_nocache_inc(void)
+static inline void nfsd_stats_rc_nocache_inc(struct nfsd_net *nn)
{
- percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]);
+ percpu_counter_inc(&nn->counter[NFSD_STATS_RC_NOCACHE]);
}
-static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp)
+static inline void nfsd_stats_fh_stale_inc(struct nfsd_net *nn,
+ struct svc_export *exp)
{
- percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]);
+ percpu_counter_inc(&nn->counter[NFSD_STATS_FH_STALE]);
if (exp && exp->ex_stats)
percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]);
}
-static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount)
+static inline void nfsd_stats_io_read_add(struct nfsd_net *nn,
+ struct svc_export *exp, s64 amount)
{
- percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount);
+ percpu_counter_add(&nn->counter[NFSD_STATS_IO_READ], amount);
if (exp && exp->ex_stats)
percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount);
}
-static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount)
+static inline void nfsd_stats_io_write_add(struct nfsd_net *nn,
+ struct svc_export *exp, s64 amount)
{
- percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount);
+ percpu_counter_add(&nn->counter[NFSD_STATS_IO_WRITE], amount);
if (exp && exp->ex_stats)
percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount);
}
static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn)
{
- percpu_counter_inc(&nn->counter[NFSD_NET_PAYLOAD_MISSES]);
+ percpu_counter_inc(&nn->counter[NFSD_STATS_PAYLOAD_MISSES]);
}
static inline void nfsd_stats_drc_mem_usage_add(struct nfsd_net *nn, s64 amount)
{
- percpu_counter_add(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount);
+ percpu_counter_add(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount);
}
static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount)
{
- percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount);
+ percpu_counter_sub(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount);
}
#ifdef CONFIG_NFSD_V4
-static inline void nfsd_stats_wdeleg_getattr_inc(void)
+static inline void nfsd_stats_wdeleg_getattr_inc(struct nfsd_net *nn)
{
- percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]);
+ percpu_counter_inc(&nn->counter[NFSD_STATS_WDELEG_GETATTR]);
}
#endif
#endif /* _NFSD_STATS_H */
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index d1e8cf079b0f..1cd2076210b1 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -9,8 +9,10 @@
#define _NFSD_TRACE_H
#include <linux/tracepoint.h>
+#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/xprt.h>
#include <trace/misc/nfs.h>
+#include <trace/misc/sunrpc.h>
#include "export.h"
#include "nfsfh.h"
@@ -102,7 +104,7 @@ TRACE_EVENT(nfsd_compound,
TP_fast_assign(
__entry->xid = be32_to_cpu(rqst->rq_xid);
__entry->opcnt = opcnt;
- __assign_str_len(tag, tag, taglen);
+ __assign_str(tag, tag);
),
TP_printk("xid=0x%08x opcnt=%u tag=%s",
__entry->xid, __entry->opcnt, __get_str(tag)
@@ -483,7 +485,7 @@ TRACE_EVENT(nfsd_dirent,
TP_fast_assign(
__entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0;
__entry->ino = ino;
- __assign_str_len(name, name, namlen)
+ __assign_str(name, name);
),
TP_printk("fh_hash=0x%08x ino=%llu name=%s",
__entry->fh_hash, __entry->ino, __get_str(name)
@@ -641,23 +643,18 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \
DEFINE_STATESEQID_EVENT(preprocess);
DEFINE_STATESEQID_EVENT(open_confirm);
-TRACE_DEFINE_ENUM(NFS4_OPEN_STID);
-TRACE_DEFINE_ENUM(NFS4_LOCK_STID);
-TRACE_DEFINE_ENUM(NFS4_DELEG_STID);
-TRACE_DEFINE_ENUM(NFS4_CLOSED_STID);
-TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID);
-TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID);
-TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID);
-
#define show_stid_type(x) \
__print_flags(x, "|", \
- { NFS4_OPEN_STID, "OPEN" }, \
- { NFS4_LOCK_STID, "LOCK" }, \
- { NFS4_DELEG_STID, "DELEG" }, \
- { NFS4_CLOSED_STID, "CLOSED" }, \
- { NFS4_REVOKED_DELEG_STID, "REVOKED" }, \
- { NFS4_CLOSED_DELEG_STID, "CLOSED_DELEG" }, \
- { NFS4_LAYOUT_STID, "LAYOUT" })
+ { SC_TYPE_OPEN, "OPEN" }, \
+ { SC_TYPE_LOCK, "LOCK" }, \
+ { SC_TYPE_DELEG, "DELEG" }, \
+ { SC_TYPE_LAYOUT, "LAYOUT" })
+
+#define show_stid_status(x) \
+ __print_flags(x, "|", \
+ { SC_STATUS_CLOSED, "CLOSED" }, \
+ { SC_STATUS_REVOKED, "REVOKED" }, \
+ { SC_STATUS_ADMIN_REVOKED, "ADMIN_REVOKED" })
DECLARE_EVENT_CLASS(nfsd_stid_class,
TP_PROTO(
@@ -666,6 +663,7 @@ DECLARE_EVENT_CLASS(nfsd_stid_class,
TP_ARGS(stid),
TP_STRUCT__entry(
__field(unsigned long, sc_type)
+ __field(unsigned long, sc_status)
__field(int, sc_count)
__field(u32, cl_boot)
__field(u32, cl_id)
@@ -676,16 +674,18 @@ DECLARE_EVENT_CLASS(nfsd_stid_class,
const stateid_t *stp = &stid->sc_stateid;
__entry->sc_type = stid->sc_type;
+ __entry->sc_status = stid->sc_status;
__entry->sc_count = refcount_read(&stid->sc_count);
__entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
__entry->cl_id = stp->si_opaque.so_clid.cl_id;
__entry->si_id = stp->si_opaque.so_id;
__entry->si_generation = stp->si_generation;
),
- TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s",
+ TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s state=%s",
__entry->cl_boot, __entry->cl_id,
__entry->si_id, __entry->si_generation,
- __entry->sc_count, show_stid_type(__entry->sc_type)
+ __entry->sc_count, show_stid_type(__entry->sc_type),
+ show_stid_status(__entry->sc_status)
)
);
@@ -696,6 +696,59 @@ DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name, \
DEFINE_STID_EVENT(revoke);
+TRACE_EVENT(nfsd_stateowner_replay,
+ TP_PROTO(
+ u32 opnum,
+ const struct nfs4_replay *rp
+ ),
+ TP_ARGS(opnum, rp),
+ TP_STRUCT__entry(
+ __field(unsigned long, status)
+ __field(u32, opnum)
+ ),
+ TP_fast_assign(
+ __entry->status = be32_to_cpu(rp->rp_status);
+ __entry->opnum = opnum;
+ ),
+ TP_printk("opnum=%u status=%lu",
+ __entry->opnum, __entry->status)
+);
+
+TRACE_EVENT_CONDITION(nfsd_seq4_status,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct nfsd4_sequence *sequence
+ ),
+ TP_ARGS(rqstp, sequence),
+ TP_CONDITION(sequence->status_flags),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __field(u32, xid)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, seqno)
+ __field(u32, reserved)
+ __field(unsigned long, status_flags)
+ ),
+ TP_fast_assign(
+ const struct nfsd4_sessionid *sid =
+ (struct nfsd4_sessionid *)&sequence->sessionid;
+
+ __entry->netns_ino = SVC_NET(rqstp)->ns.inum;
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->cl_boot = sid->clientid.cl_boot;
+ __entry->cl_id = sid->clientid.cl_id;
+ __entry->seqno = sid->sequence;
+ __entry->reserved = sid->reserved;
+ __entry->status_flags = sequence->status_flags;
+ ),
+ TP_printk("xid=0x%08x sessionid=%08x:%08x:%08x:%08x status_flags=%s",
+ __entry->xid, __entry->cl_boot, __entry->cl_id,
+ __entry->seqno, __entry->reserved,
+ show_nfs4_seq4_status(__entry->status_flags)
+ )
+);
+
DECLARE_EVENT_CLASS(nfsd_clientid_class,
TP_PROTO(const clientid_t *clid),
TP_ARGS(clid),
@@ -843,7 +896,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class,
__array(unsigned char, addr, sizeof(struct sockaddr_in6))
__field(unsigned long, flavor)
__array(unsigned char, verifier, NFS4_VERIFIER_SIZE)
- __string_len(name, name, clp->cl_name.len)
+ __string_len(name, clp->cl_name.data, clp->cl_name.len)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
@@ -853,7 +906,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class,
__entry->flavor = clp->cl_cred.cr_flavor;
memcpy(__entry->verifier, (void *)&clp->cl_verifier,
NFS4_VERIFIER_SIZE);
- __assign_str_len(name, clp->cl_name.data, clp->cl_name.len);
+ __assign_str(name, clp->cl_name.data);
),
TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x",
__entry->addr, __get_str(name),
@@ -1334,7 +1387,8 @@ DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name, \
TP_PROTO(const struct nfs4_client *clp), \
TP_ARGS(clp))
-DEFINE_NFSD_CB_EVENT(state);
+DEFINE_NFSD_CB_EVENT(start);
+DEFINE_NFSD_CB_EVENT(new_state);
DEFINE_NFSD_CB_EVENT(probe);
DEFINE_NFSD_CB_EVENT(lost);
DEFINE_NFSD_CB_EVENT(shutdown);
@@ -1405,6 +1459,128 @@ TRACE_EVENT(nfsd_cb_setup_err,
__entry->error)
);
+DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+ const struct nfsd4_callback *cb
+ ),
+ TP_ARGS(clp, cb),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(const void *, cb)
+ __field(bool, need_restart)
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ __entry->cb = cb;
+ __entry->need_restart = cb->cb_need_restart;
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x cb=%p%s",
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
+ __entry->cb, __entry->need_restart ?
+ " (need restart)" : " (first try)"
+ )
+);
+
+#define DEFINE_NFSD_CB_LIFETIME_EVENT(name) \
+DEFINE_EVENT(nfsd_cb_lifetime_class, nfsd_cb_##name, \
+ TP_PROTO( \
+ const struct nfs4_client *clp, \
+ const struct nfsd4_callback *cb \
+ ), \
+ TP_ARGS(clp, cb))
+
+DEFINE_NFSD_CB_LIFETIME_EVENT(queue);
+DEFINE_NFSD_CB_LIFETIME_EVENT(destroy);
+DEFINE_NFSD_CB_LIFETIME_EVENT(restart);
+DEFINE_NFSD_CB_LIFETIME_EVENT(bc_update);
+DEFINE_NFSD_CB_LIFETIME_EVENT(bc_shutdown);
+
+TRACE_EVENT(nfsd_cb_seq_status,
+ TP_PROTO(
+ const struct rpc_task *task,
+ const struct nfsd4_callback *cb
+ ),
+ TP_ARGS(task, cb),
+ TP_STRUCT__entry(
+ __field(unsigned int, task_id)
+ __field(unsigned int, client_id)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, seqno)
+ __field(u32, reserved)
+ __field(int, tk_status)
+ __field(int, seq_status)
+ ),
+ TP_fast_assign(
+ const struct nfs4_client *clp = cb->cb_clp;
+ const struct nfsd4_session *session = clp->cl_cb_session;
+ const struct nfsd4_sessionid *sid =
+ (struct nfsd4_sessionid *)&session->se_sessionid;
+
+ __entry->task_id = task->tk_pid;
+ __entry->client_id = task->tk_client ?
+ task->tk_client->cl_clid : -1;
+ __entry->cl_boot = sid->clientid.cl_boot;
+ __entry->cl_id = sid->clientid.cl_id;
+ __entry->seqno = sid->sequence;
+ __entry->reserved = sid->reserved;
+ __entry->tk_status = task->tk_status;
+ __entry->seq_status = cb->cb_seq_status;
+ ),
+ TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+ " sessionid=%08x:%08x:%08x:%08x tk_status=%d seq_status=%d\n",
+ __entry->task_id, __entry->client_id,
+ __entry->cl_boot, __entry->cl_id,
+ __entry->seqno, __entry->reserved,
+ __entry->tk_status, __entry->seq_status
+ )
+);
+
+TRACE_EVENT(nfsd_cb_free_slot,
+ TP_PROTO(
+ const struct rpc_task *task,
+ const struct nfsd4_callback *cb
+ ),
+ TP_ARGS(task, cb),
+ TP_STRUCT__entry(
+ __field(unsigned int, task_id)
+ __field(unsigned int, client_id)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, seqno)
+ __field(u32, reserved)
+ __field(u32, slot_seqno)
+ ),
+ TP_fast_assign(
+ const struct nfs4_client *clp = cb->cb_clp;
+ const struct nfsd4_session *session = clp->cl_cb_session;
+ const struct nfsd4_sessionid *sid =
+ (struct nfsd4_sessionid *)&session->se_sessionid;
+
+ __entry->task_id = task->tk_pid;
+ __entry->client_id = task->tk_client ?
+ task->tk_client->cl_clid : -1;
+ __entry->cl_boot = sid->clientid.cl_boot;
+ __entry->cl_id = sid->clientid.cl_id;
+ __entry->seqno = sid->sequence;
+ __entry->reserved = sid->reserved;
+ __entry->slot_seqno = session->se_cb_seq_nr;
+ ),
+ TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+ " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u\n",
+ __entry->task_id, __entry->client_id,
+ __entry->cl_boot, __entry->cl_id,
+ __entry->seqno, __entry->reserved,
+ __entry->slot_seqno
+ )
+);
+
TRACE_EVENT_CONDITION(nfsd_cb_recall,
TP_PROTO(
const struct nfs4_stid *stid
@@ -1800,7 +1976,7 @@ TRACE_EVENT(nfsd_ctl_time,
TP_fast_assign(
__entry->netns_ino = net->ns.inum;
__entry->time = time;
- __assign_str_len(name, name, namelen);
+ __assign_str(name, name);
),
TP_printk("file=%s time=%d\n",
__get_str(name), __entry->time
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b7c7a9273ea0..2e41eb4c3cec 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -25,7 +25,6 @@
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
#include <linux/jhash.h>
-#include <linux/ima.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
@@ -476,7 +475,6 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap)
* @rqstp: controlling RPC transaction
* @fhp: filehandle of target
* @attr: attributes to set
- * @check_guard: set to 1 if guardtime is a valid timestamp
* @guardtime: do not act if ctime.tv_sec does not match this timestamp
*
* This call may adjust the contents of @attr (in particular, this
@@ -488,8 +486,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap)
*/
__be32
nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct nfsd_attrs *attr,
- int check_guard, time64_t guardtime)
+ struct nfsd_attrs *attr, const struct timespec64 *guardtime)
{
struct dentry *dentry;
struct inode *inode;
@@ -497,7 +494,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
int accmode = NFSD_MAY_SATTR;
umode_t ftype = 0;
__be32 err;
- int host_err;
+ int host_err = 0;
bool get_write_count;
bool size_change = (iap->ia_valid & ATTR_SIZE);
int retries;
@@ -538,9 +535,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
nfsd_sanitize_attrs(inode, iap);
- if (check_guard && guardtime != inode_get_ctime_sec(inode))
- return nfserr_notsync;
-
/*
* The size case is special, it changes the file in addition to the
* attributes, and file systems don't expect it to be mixed with
@@ -555,6 +549,19 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
inode_lock(inode);
+ err = fh_fill_pre_attrs(fhp);
+ if (err)
+ goto out_unlock;
+
+ if (guardtime) {
+ struct timespec64 ctime = inode_get_ctime(inode);
+ if ((u32)guardtime->tv_sec != (u32)ctime.tv_sec ||
+ guardtime->tv_nsec != ctime.tv_nsec) {
+ err = nfserr_notsync;
+ goto out_fill_attrs;
+ }
+ }
+
for (retries = 1;;) {
struct iattr attrs;
@@ -582,13 +589,23 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
attr->na_aclerr = set_posix_acl(&nop_mnt_idmap,
dentry, ACL_TYPE_DEFAULT,
attr->na_dpacl);
+out_fill_attrs:
+ /*
+ * RFC 1813 Section 3.3.2 does not mandate that an NFS server
+ * returns wcc_data for SETATTR. Some client implementations
+ * depend on receiving wcc_data, however, to sort out partial
+ * updates (e.g., the client requested that size and mode be
+ * modified, but the server changed only the file mode).
+ */
+ fh_fill_post_attrs(fhp);
+out_unlock:
inode_unlock(inode);
if (size_change)
put_write_access(inode);
out:
if (!host_err)
host_err = commit_metadata(fhp);
- return nfserrno(host_err);
+ return err != 0 ? err : nfserrno(host_err);
}
#if defined(CONFIG_NFSD_V4)
@@ -877,7 +894,7 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
goto out;
}
- host_err = ima_file_check(file, may_flags);
+ host_err = security_file_post_open(file, may_flags);
if (host_err) {
fput(file);
goto out;
@@ -1002,7 +1019,9 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned long *count, u32 *eof, ssize_t host_err)
{
if (host_err >= 0) {
- nfsd_stats_io_read_add(fhp->fh_export, host_err);
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ nfsd_stats_io_read_add(nn, fhp->fh_export, host_err);
*eof = nfsd_eof_on_read(file, offset, host_err, *count);
*count = host_err;
fsnotify_access(file);
@@ -1185,7 +1204,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
goto out_nfserr;
}
*cnt = host_err;
- nfsd_stats_io_write_add(exp, *cnt);
+ nfsd_stats_io_write_add(nn, exp, *cnt);
fsnotify_modify(file);
host_err = filemap_check_wb_err(file->f_mapping, since);
if (host_err < 0)
@@ -1404,7 +1423,7 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
* if the attributes have not changed.
*/
if (iap->ia_valid)
- status = nfsd_setattr(rqstp, resfhp, attrs, 0, (time64_t)0);
+ status = nfsd_setattr(rqstp, resfhp, attrs, NULL);
else
status = nfserrno(commit_metadata(resfhp));
@@ -1833,7 +1852,7 @@ retry:
trap = lock_rename(tdentry, fdentry);
if (IS_ERR(trap)) {
err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
- goto out;
+ goto out_want_write;
}
err = fh_fill_pre_attrs(ffhp);
if (err != nfs_ok)
@@ -1903,13 +1922,14 @@ retry:
}
out_unlock:
unlock_rename(tdentry, fdentry);
+out_want_write:
fh_drop_write(ffhp);
/*
- * If the target dentry has cached open files, then we need to try to
- * close them prior to doing the rename. Flushing delayed fput
- * shouldn't be done with locks held however, so we delay it until this
- * point and then reattempt the whole shebang.
+ * If the target dentry has cached open files, then we need to
+ * try to close them prior to doing the rename. Final fput
+ * shouldn't be done with locks held however, so we delay it
+ * until this point and then reattempt the whole shebang.
*/
if (close_cached) {
close_cached = false;
@@ -2177,11 +2197,43 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
if (err == nfserr_eof || err == nfserr_toosmall)
err = nfs_ok; /* can still be found in ->err */
out_close:
- fput(file);
+ nfsd_filp_close(file);
out:
return err;
}
+/**
+ * nfsd_filp_close: close a file synchronously
+ * @fp: the file to close
+ *
+ * nfsd_filp_close() is similar in behaviour to filp_close().
+ * The difference is that if this is the final close on the
+ * file, then that finalisation happens immediately, rather than
+ * being handed over to a work_queue, as is the case for
+ * filp_close().
+ * When a user-space process closes a file (even when using
+ * filp_close()) the finalisation happens before returning to
+ * userspace, so it is effectively synchronous. When a kernel thread
+ * uses filp_close(), on the other hand, the handling is completely
+ * asynchronous. This means that any cost imposed by that finalisation
+ * is not imposed on the nfsd thread, and nfsd could potentially
+ * close files more quickly than the work queue finalises the close,
+ * which would lead to unbounded growth in the queue.
+ *
+ * In some contexts it is not safe to synchronously wait for
+ * close finalisation (see comment for __fput_sync()), but nfsd
+ * does not match those contexts. In particular it does not, at the
+ * time that this function is called, hold any locks and no finalisation
+ * of any file, socket, or device driver would have any cause to wait
+ * for nfsd to make progress.
+ */
+void nfsd_filp_close(struct file *fp)
+{
+ get_file(fp);
+ filp_close(fp, NULL);
+ __fput_sync(fp);
+}
+
/*
* Get file system stats
* N.B. After this call fhp needs an fh_put
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 702fbc4483bf..c60fdb6200fd 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -69,7 +69,7 @@ __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
const char *, unsigned int,
struct svc_export **, struct dentry **);
__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *,
- struct nfsd_attrs *, int, time64_t);
+ struct nfsd_attrs *, const struct timespec64 *);
int nfsd_mountpoint(struct dentry *, struct svc_export *);
#ifdef CONFIG_NFSD_V4
__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
@@ -148,6 +148,8 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
struct dentry *, int);
+void nfsd_filp_close(struct file *fp);
+
static inline int fh_want_write(struct svc_fh *fh)
{
int ret;
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 03fe4e21306c..522067b7fd75 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -14,7 +14,7 @@ struct nfsd3_sattrargs {
struct svc_fh fh;
struct iattr attrs;
int check_guard;
- time64_t guardtime;
+ struct timespec64 guardtime;
};
struct nfsd3_diropargs {
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index 0d39af1b00a0..e8b00309c449 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -54,3 +54,21 @@
#define NFS4_dec_cb_recall_any_sz (cb_compound_dec_hdr_sz + \
cb_sequence_dec_sz + \
op_dec_sz)
+
+/*
+ * 1: CB_GETATTR opcode (32-bit)
+ * N: file_handle
+ * 1: number of entries in attribute array (32-bit)
+ * 1: entry 0 in attribute array (32-bit)
+ */
+#define NFS4_enc_cb_getattr_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ 1 + enc_nfs4_fh_sz + 1 + 1)
+/*
+ * 4: fattr_bitmap_maxsz
+ * 1: attribute array len
+ * 2: change attr (64-bit)
+ * 2: size (64-bit)
+ */
+#define NFS4_dec_cb_getattr_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + 4 + 1 + 2 + 2 + op_dec_sz)
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 7342de296ec3..89caef7513db 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -525,54 +525,55 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
if (ret < 0)
return ret;
- desc_kaddr = kmap(desc_bh->b_page);
+ desc_kaddr = kmap_local_page(desc_bh->b_page);
desc = nilfs_palloc_block_get_group_desc(
inode, group, desc_bh, desc_kaddr);
n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
maxgroup);
- for (j = 0; j < n; j++, desc++, group++) {
+ for (j = 0; j < n; j++, desc++, group++, group_offset = 0) {
lock = nilfs_mdt_bgl_lock(inode, group);
- if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) {
- ret = nilfs_palloc_get_bitmap_block(
- inode, group, 1, &bitmap_bh);
- if (ret < 0)
- goto out_desc;
- bitmap_kaddr = kmap(bitmap_bh->b_page);
- bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
- pos = nilfs_palloc_find_available_slot(
- bitmap, group_offset,
- entries_per_group, lock);
- if (pos >= 0) {
- /* found a free entry */
- nilfs_palloc_group_desc_add_entries(
- desc, lock, -1);
- req->pr_entry_nr =
- entries_per_group * group + pos;
- kunmap(desc_bh->b_page);
- kunmap(bitmap_bh->b_page);
-
- req->pr_desc_bh = desc_bh;
- req->pr_bitmap_bh = bitmap_bh;
- return 0;
- }
- kunmap(bitmap_bh->b_page);
- brelse(bitmap_bh);
+ if (nilfs_palloc_group_desc_nfrees(desc, lock) == 0)
+ continue;
+
+ kunmap_local(desc_kaddr);
+ ret = nilfs_palloc_get_bitmap_block(inode, group, 1,
+ &bitmap_bh);
+ if (unlikely(ret < 0)) {
+ brelse(desc_bh);
+ return ret;
}
- group_offset = 0;
+ desc_kaddr = kmap_local_page(desc_bh->b_page);
+ desc = nilfs_palloc_block_get_group_desc(
+ inode, group, desc_bh, desc_kaddr);
+
+ bitmap_kaddr = kmap_local_page(bitmap_bh->b_page);
+ bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
+ pos = nilfs_palloc_find_available_slot(
+ bitmap, group_offset, entries_per_group, lock);
+ kunmap_local(bitmap_kaddr);
+ if (pos >= 0)
+ goto found;
+
+ brelse(bitmap_bh);
}
- kunmap(desc_bh->b_page);
+ kunmap_local(desc_kaddr);
brelse(desc_bh);
}
/* no entries left */
return -ENOSPC;
- out_desc:
- kunmap(desc_bh->b_page);
- brelse(desc_bh);
- return ret;
+found:
+ /* found a free entry */
+ nilfs_palloc_group_desc_add_entries(desc, lock, -1);
+ req->pr_entry_nr = entries_per_group * group + pos;
+ kunmap_local(desc_kaddr);
+
+ req->pr_desc_bh = desc_bh;
+ req->pr_bitmap_bh = bitmap_bh;
+ return 0;
}
/**
@@ -606,10 +607,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
spinlock_t *lock;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
- desc_kaddr = kmap(req->pr_desc_bh->b_page);
+ desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page);
desc = nilfs_palloc_block_get_group_desc(inode, group,
req->pr_desc_bh, desc_kaddr);
- bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
+ bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
lock = nilfs_mdt_bgl_lock(inode, group);
@@ -621,8 +622,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
- kunmap(req->pr_bitmap_bh->b_page);
- kunmap(req->pr_desc_bh->b_page);
+ kunmap_local(bitmap_kaddr);
+ kunmap_local(desc_kaddr);
mark_buffer_dirty(req->pr_desc_bh);
mark_buffer_dirty(req->pr_bitmap_bh);
@@ -647,10 +648,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
spinlock_t *lock;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
- desc_kaddr = kmap(req->pr_desc_bh->b_page);
+ desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page);
desc = nilfs_palloc_block_get_group_desc(inode, group,
req->pr_desc_bh, desc_kaddr);
- bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
+ bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
lock = nilfs_mdt_bgl_lock(inode, group);
@@ -662,8 +663,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
- kunmap(req->pr_bitmap_bh->b_page);
- kunmap(req->pr_desc_bh->b_page);
+ kunmap_local(bitmap_kaddr);
+ kunmap_local(desc_kaddr);
brelse(req->pr_bitmap_bh);
brelse(req->pr_desc_bh);
@@ -755,7 +756,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
/* Get the first entry number of the group */
group_min_nr = (__u64)group * epg;
- bitmap_kaddr = kmap(bitmap_bh->b_page);
+ bitmap_kaddr = kmap_local_page(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
lock = nilfs_mdt_bgl_lock(inode, group);
@@ -801,7 +802,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
entry_start = rounddown(group_offset, epb);
} while (true);
- kunmap(bitmap_bh->b_page);
+ kunmap_local(bitmap_kaddr);
mark_buffer_dirty(bitmap_bh);
brelse(bitmap_bh);
@@ -815,11 +816,11 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
inode->i_ino);
}
- desc_kaddr = kmap_atomic(desc_bh->b_page);
+ desc_kaddr = kmap_local_page(desc_bh->b_page);
desc = nilfs_palloc_block_get_group_desc(
inode, group, desc_bh, desc_kaddr);
nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n);
- kunmap_atomic(desc_kaddr);
+ kunmap_local(desc_kaddr);
mark_buffer_dirty(desc_bh);
nilfs_mdt_mark_dirty(inode);
brelse(desc_bh);
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 7a8f166f2c8d..383f0afa2cea 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -548,13 +548,10 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
*/
void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
{
- down_write(&bmap->b_sem);
memcpy(raw_inode->i_bmap, bmap->b_u.u_data,
NILFS_INODE_BMAP_SIZE * sizeof(__le64));
if (bmap->b_inode->i_ino == NILFS_DAT_INO)
bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
-
- up_write(&bmap->b_sem);
}
void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 13592e82eaf6..65659fa0372e 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -724,7 +724,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
dat = nilfs_bmap_get_dat(btree);
ret = nilfs_dat_translate(dat, ptr, &blocknr);
if (ret < 0)
- goto out;
+ goto dat_error;
ptr = blocknr;
}
cnt = 1;
@@ -743,7 +743,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
if (dat) {
ret = nilfs_dat_translate(dat, ptr2, &blocknr);
if (ret < 0)
- goto out;
+ goto dat_error;
ptr2 = blocknr;
}
if (ptr2 != ptr + cnt || ++cnt == maxblocks)
@@ -781,6 +781,11 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
out:
nilfs_btree_free_path(path);
return ret;
+
+ dat_error:
+ if (ret == -ENOENT)
+ ret = -EINVAL; /* Notify bmap layer of metadata corruption */
+ goto out;
}
static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 39136637f715..69a5cced1e84 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -28,7 +28,7 @@ nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno)
{
__u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
- do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
+ tcno = div64_ul(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
return (unsigned long)tcno;
}
@@ -187,35 +187,90 @@ static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
}
/**
- * nilfs_cpfile_get_checkpoint - get a checkpoint
- * @cpfile: inode of checkpoint file
- * @cno: checkpoint number
- * @create: create flag
- * @cpp: pointer to a checkpoint
- * @bhp: pointer to a buffer head
- *
- * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint
- * specified by @cno. A new checkpoint will be created if @cno is the current
- * checkpoint number and @create is nonzero.
- *
- * Return Value: On success, 0 is returned, and the checkpoint and the
- * buffer head of the buffer on which the checkpoint is located are stored in
- * the place pointed by @cpp and @bhp, respectively. On error, one of the
- * following negative error codes is returned.
+ * nilfs_cpfile_read_checkpoint - read a checkpoint entry in cpfile
+ * @cpfile: checkpoint file inode
+ * @cno: number of checkpoint entry to read
+ * @root: nilfs root object
+ * @ifile: ifile's inode to read and attach to @root
*
- * %-EIO - I/O error.
+ * This function imports checkpoint information from the checkpoint file and
+ * stores it to the inode file given by @ifile and the nilfs root object
+ * given by @root.
*
- * %-ENOMEM - Insufficient amount of memory available.
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-EINVAL - Invalid checkpoint.
+ * * %-ENOMEM - Insufficient memory available.
+ * * %-EIO - I/O error (including metadata corruption).
+ */
+int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno,
+ struct nilfs_root *root, struct inode *ifile)
+{
+ struct buffer_head *cp_bh;
+ struct nilfs_checkpoint *cp;
+ void *kaddr;
+ int ret;
+
+ if (cno < 1 || cno > nilfs_mdt_cno(cpfile))
+ return -EINVAL;
+
+ down_read(&NILFS_MDT(cpfile)->mi_sem);
+ ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
+ if (unlikely(ret < 0)) {
+ if (ret == -ENOENT)
+ ret = -EINVAL;
+ goto out_sem;
+ }
+
+ kaddr = kmap_local_page(cp_bh->b_page);
+ cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+ if (nilfs_checkpoint_invalid(cp)) {
+ ret = -EINVAL;
+ goto put_cp;
+ }
+
+ ret = nilfs_read_inode_common(ifile, &cp->cp_ifile_inode);
+ if (unlikely(ret)) {
+ /*
+ * Since this inode is on a checkpoint entry, treat errors
+ * as metadata corruption.
+ */
+ nilfs_err(cpfile->i_sb,
+ "ifile inode (checkpoint number=%llu) corrupted",
+ (unsigned long long)cno);
+ ret = -EIO;
+ goto put_cp;
+ }
+
+ /* Configure the nilfs root object */
+ atomic64_set(&root->inodes_count, le64_to_cpu(cp->cp_inodes_count));
+ atomic64_set(&root->blocks_count, le64_to_cpu(cp->cp_blocks_count));
+ root->ifile = ifile;
+
+put_cp:
+ kunmap_local(kaddr);
+ brelse(cp_bh);
+out_sem:
+ up_read(&NILFS_MDT(cpfile)->mi_sem);
+ return ret;
+}
+
+/**
+ * nilfs_cpfile_create_checkpoint - create a checkpoint entry on cpfile
+ * @cpfile: checkpoint file inode
+ * @cno: number of checkpoint to set up
*
- * %-ENOENT - No such checkpoint.
+ * This function creates a checkpoint with the number specified by @cno on
+ * cpfile. If the specified checkpoint entry already exists due to a past
+ * failure, it will be reused without returning an error.
+ * In either case, the buffer of the block containing the checkpoint entry
+ * and the cpfile inode are made dirty for inclusion in the write log.
*
- * %-EINVAL - invalid checkpoint.
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-ENOMEM - Insufficient memory available.
+ * * %-EIO - I/O error (including metadata corruption).
+ * * %-EROFS - Read only filesystem
*/
-int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
- __u64 cno,
- int create,
- struct nilfs_checkpoint **cpp,
- struct buffer_head **bhp)
+int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno)
{
struct buffer_head *header_bh, *cp_bh;
struct nilfs_cpfile_header *header;
@@ -223,70 +278,128 @@ int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
void *kaddr;
int ret;
- if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) ||
- (cno < nilfs_mdt_cno(cpfile) && create)))
- return -EINVAL;
+ if (WARN_ON_ONCE(cno < 1))
+ return -EIO;
down_write(&NILFS_MDT(cpfile)->mi_sem);
-
ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
- if (ret < 0)
+ if (unlikely(ret < 0)) {
+ if (ret == -ENOENT) {
+ nilfs_error(cpfile->i_sb,
+ "checkpoint creation failed due to metadata corruption.");
+ ret = -EIO;
+ }
goto out_sem;
- ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh);
- if (ret < 0)
+ }
+ ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 1, &cp_bh);
+ if (unlikely(ret < 0))
goto out_header;
- kaddr = kmap(cp_bh->b_page);
+
+ kaddr = kmap_local_page(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
if (nilfs_checkpoint_invalid(cp)) {
- if (!create) {
- kunmap(cp_bh->b_page);
- brelse(cp_bh);
- ret = -ENOENT;
- goto out_header;
- }
/* a newly-created checkpoint */
nilfs_checkpoint_clear_invalid(cp);
if (!nilfs_cpfile_is_in_first(cpfile, cno))
nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
kaddr, 1);
- mark_buffer_dirty(cp_bh);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh,
kaddr);
le64_add_cpu(&header->ch_ncheckpoints, 1);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(header_bh);
- nilfs_mdt_mark_dirty(cpfile);
+ } else {
+ kunmap_local(kaddr);
}
- if (cpp != NULL)
- *cpp = cp;
- *bhp = cp_bh;
+ /* Force the buffer and the inode to become dirty */
+ mark_buffer_dirty(cp_bh);
+ brelse(cp_bh);
+ nilfs_mdt_mark_dirty(cpfile);
- out_header:
+out_header:
brelse(header_bh);
- out_sem:
+out_sem:
up_write(&NILFS_MDT(cpfile)->mi_sem);
return ret;
}
/**
- * nilfs_cpfile_put_checkpoint - put a checkpoint
- * @cpfile: inode of checkpoint file
- * @cno: checkpoint number
- * @bh: buffer head
+ * nilfs_cpfile_finalize_checkpoint - fill in a checkpoint entry in cpfile
+ * @cpfile: checkpoint file inode
+ * @cno: checkpoint number
+ * @root: nilfs root object
+ * @blkinc: number of blocks added by this checkpoint
+ * @ctime: checkpoint creation time
+ * @minor: minor checkpoint flag
+ *
+ * This function completes the checkpoint entry numbered by @cno in the
+ * cpfile with the data given by the arguments @root, @blkinc, @ctime, and
+ * @minor.
*
- * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint
- * specified by @cno. @bh must be the buffer head which has been returned by
- * a previous call to nilfs_cpfile_get_checkpoint() with @cno.
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-ENOMEM - Insufficient memory available.
+ * * %-EIO - I/O error (including metadata corruption).
*/
-void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
- struct buffer_head *bh)
+int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno,
+ struct nilfs_root *root, __u64 blkinc,
+ time64_t ctime, bool minor)
{
- kunmap(bh->b_page);
- brelse(bh);
+ struct buffer_head *cp_bh;
+ struct nilfs_checkpoint *cp;
+ void *kaddr;
+ int ret;
+
+ if (WARN_ON_ONCE(cno < 1))
+ return -EIO;
+
+ down_write(&NILFS_MDT(cpfile)->mi_sem);
+ ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
+ if (unlikely(ret < 0)) {
+ if (ret == -ENOENT)
+ goto error;
+ goto out_sem;
+ }
+
+ kaddr = kmap_local_page(cp_bh->b_page);
+ cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+ if (unlikely(nilfs_checkpoint_invalid(cp))) {
+ kunmap_local(kaddr);
+ brelse(cp_bh);
+ goto error;
+ }
+
+ cp->cp_snapshot_list.ssl_next = 0;
+ cp->cp_snapshot_list.ssl_prev = 0;
+ cp->cp_inodes_count = cpu_to_le64(atomic64_read(&root->inodes_count));
+ cp->cp_blocks_count = cpu_to_le64(atomic64_read(&root->blocks_count));
+ cp->cp_nblk_inc = cpu_to_le64(blkinc);
+ cp->cp_create = cpu_to_le64(ctime);
+ cp->cp_cno = cpu_to_le64(cno);
+
+ if (minor)
+ nilfs_checkpoint_set_minor(cp);
+ else
+ nilfs_checkpoint_clear_minor(cp);
+
+ nilfs_write_inode_common(root->ifile, &cp->cp_ifile_inode);
+ nilfs_bmap_write(NILFS_I(root->ifile)->i_bmap, &cp->cp_ifile_inode);
+
+ kunmap_local(kaddr);
+ brelse(cp_bh);
+out_sem:
+ up_write(&NILFS_MDT(cpfile)->mi_sem);
+ return ret;
+
+error:
+ nilfs_error(cpfile->i_sb,
+ "checkpoint finalization failed due to metadata corruption.");
+ ret = -EIO;
+ goto out_sem;
}
/**
@@ -347,7 +460,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
continue;
}
- kaddr = kmap_atomic(cp_bh->b_page);
+ kaddr = kmap_local_page(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(
cpfile, cno, cp_bh, kaddr);
nicps = 0;
@@ -369,7 +482,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
cpfile, cp_bh, kaddr, nicps);
if (count == 0) {
/* make hole */
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(cp_bh);
ret =
nilfs_cpfile_delete_checkpoint_block(
@@ -384,18 +497,18 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
}
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(cp_bh);
}
if (tnicps > 0) {
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh,
kaddr);
le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
mark_buffer_dirty(header_bh);
nilfs_mdt_mark_dirty(cpfile);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
}
brelse(header_bh);
@@ -447,7 +560,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
}
ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
if (!nilfs_checkpoint_invalid(cp)) {
@@ -457,7 +570,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
n++;
}
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh);
}
@@ -491,10 +604,10 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
ret = nilfs_cpfile_get_header_block(cpfile, &bh);
if (ret < 0)
goto out;
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
curr = le64_to_cpu(header->ch_snapshot_list.ssl_next);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh);
if (curr == 0) {
ret = 0;
@@ -512,7 +625,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
ret = 0; /* No snapshots (started from a hole block) */
goto out;
}
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
while (n < nci) {
cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);
curr = ~(__u64)0; /* Terminator */
@@ -528,7 +641,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);
if (curr_blkoff != next_blkoff) {
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh);
ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,
0, &bh);
@@ -536,12 +649,12 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
WARN_ON(ret == -ENOENT);
goto out;
}
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
}
curr = next;
curr_blkoff = next_blkoff;
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh);
*cnop = curr;
ret = n;
@@ -650,24 +763,24 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(cp_bh->b_page);
+ kaddr = kmap_local_page(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
if (nilfs_checkpoint_invalid(cp)) {
ret = -ENOENT;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
goto out_cp;
}
if (nilfs_checkpoint_snapshot(cp)) {
ret = 0;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
goto out_cp;
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
if (ret < 0)
goto out_cp;
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
list = &header->ch_snapshot_list;
curr_bh = header_bh;
@@ -679,13 +792,13 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev);
curr = prev;
if (curr_blkoff != prev_blkoff) {
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(curr_bh);
ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr,
0, &curr_bh);
if (ret < 0)
goto out_header;
- kaddr = kmap_atomic(curr_bh->b_page);
+ kaddr = kmap_local_page(curr_bh->b_page);
}
curr_blkoff = prev_blkoff;
cp = nilfs_cpfile_block_get_checkpoint(
@@ -693,7 +806,7 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
list = &cp->cp_snapshot_list;
prev = le64_to_cpu(list->ssl_prev);
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (prev != 0) {
ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
@@ -705,29 +818,29 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
get_bh(prev_bh);
}
- kaddr = kmap_atomic(curr_bh->b_page);
+ kaddr = kmap_local_page(curr_bh->b_page);
list = nilfs_cpfile_block_get_snapshot_list(
cpfile, curr, curr_bh, kaddr);
list->ssl_prev = cpu_to_le64(cno);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(cp_bh->b_page);
+ kaddr = kmap_local_page(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr);
cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev);
nilfs_checkpoint_set_snapshot(cp);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(prev_bh->b_page);
+ kaddr = kmap_local_page(prev_bh->b_page);
list = nilfs_cpfile_block_get_snapshot_list(
cpfile, prev, prev_bh, kaddr);
list->ssl_next = cpu_to_le64(cno);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
le64_add_cpu(&header->ch_nsnapshots, 1);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(prev_bh);
mark_buffer_dirty(curr_bh);
@@ -768,23 +881,23 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(cp_bh->b_page);
+ kaddr = kmap_local_page(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
if (nilfs_checkpoint_invalid(cp)) {
ret = -ENOENT;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
goto out_cp;
}
if (!nilfs_checkpoint_snapshot(cp)) {
ret = 0;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
goto out_cp;
}
list = &cp->cp_snapshot_list;
next = le64_to_cpu(list->ssl_next);
prev = le64_to_cpu(list->ssl_prev);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
if (ret < 0)
@@ -808,29 +921,29 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
get_bh(prev_bh);
}
- kaddr = kmap_atomic(next_bh->b_page);
+ kaddr = kmap_local_page(next_bh->b_page);
list = nilfs_cpfile_block_get_snapshot_list(
cpfile, next, next_bh, kaddr);
list->ssl_prev = cpu_to_le64(prev);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(prev_bh->b_page);
+ kaddr = kmap_local_page(prev_bh->b_page);
list = nilfs_cpfile_block_get_snapshot_list(
cpfile, prev, prev_bh, kaddr);
list->ssl_next = cpu_to_le64(next);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(cp_bh->b_page);
+ kaddr = kmap_local_page(cp_bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
cp->cp_snapshot_list.ssl_next = cpu_to_le64(0);
cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0);
nilfs_checkpoint_clear_snapshot(cp);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
le64_add_cpu(&header->ch_nsnapshots, -1);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(next_bh);
mark_buffer_dirty(prev_bh);
@@ -889,13 +1002,13 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
if (ret < 0)
goto out;
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
if (nilfs_checkpoint_invalid(cp))
ret = -ENOENT;
else
ret = nilfs_checkpoint_snapshot(cp);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh);
out:
@@ -972,12 +1085,12 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
ret = nilfs_cpfile_get_header_block(cpfile, &bh);
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
cpstat->cs_cno = nilfs_mdt_cno(cpfile);
cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints);
cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh);
out_sem:
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index edabb2dc5756..f5b1d59289eb 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -16,10 +16,12 @@
#include <linux/nilfs2_ondisk.h> /* nilfs_inode, nilfs_checkpoint */
-int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
- struct nilfs_checkpoint **,
- struct buffer_head **);
-void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
+int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno,
+ struct nilfs_root *root, struct inode *ifile);
+int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno);
+int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno,
+ struct nilfs_root *root, __u64 blkinc,
+ time64_t ctime, bool minor);
int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64);
int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 9cf6ba58f585..180fc8d36213 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -91,13 +91,13 @@ void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req)
struct nilfs_dat_entry *entry;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+ kaddr = kmap_local_page(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
entry->de_end = cpu_to_le64(NILFS_CNO_MAX);
entry->de_blocknr = cpu_to_le64(0);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
nilfs_palloc_commit_alloc_entry(dat, req);
nilfs_dat_commit_entry(dat, req);
@@ -115,13 +115,13 @@ static void nilfs_dat_commit_free(struct inode *dat,
struct nilfs_dat_entry *entry;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+ kaddr = kmap_local_page(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
entry->de_end = cpu_to_le64(NILFS_CNO_MIN);
entry->de_blocknr = cpu_to_le64(0);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
nilfs_dat_commit_entry(dat, req);
@@ -145,12 +145,12 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
struct nilfs_dat_entry *entry;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+ kaddr = kmap_local_page(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
entry->de_blocknr = cpu_to_le64(blocknr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
nilfs_dat_commit_entry(dat, req);
}
@@ -167,12 +167,12 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
if (ret < 0)
return ret;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+ kaddr = kmap_local_page(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
start = le64_to_cpu(entry->de_start);
blocknr = le64_to_cpu(entry->de_blocknr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (blocknr == 0) {
ret = nilfs_palloc_prepare_free_entry(dat, req);
@@ -202,7 +202,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
sector_t blocknr;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+ kaddr = kmap_local_page(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
end = start = le64_to_cpu(entry->de_start);
@@ -212,7 +212,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
}
entry->de_end = cpu_to_le64(end);
blocknr = le64_to_cpu(entry->de_blocknr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (blocknr == 0)
nilfs_dat_commit_free(dat, req);
@@ -227,12 +227,12 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
sector_t blocknr;
void *kaddr;
- kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+ kaddr = kmap_local_page(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
start = le64_to_cpu(entry->de_start);
blocknr = le64_to_cpu(entry->de_blocknr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (start == nilfs_mdt_cno(dat) && blocknr == 0)
nilfs_palloc_abort_free_entry(dat, req);
@@ -362,7 +362,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
}
}
- kaddr = kmap_atomic(entry_bh->b_page);
+ kaddr = kmap_local_page(entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
nilfs_crit(dat->i_sb,
@@ -370,13 +370,13 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
__func__, (unsigned long long)vblocknr,
(unsigned long long)le64_to_cpu(entry->de_start),
(unsigned long long)le64_to_cpu(entry->de_end));
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(entry_bh);
return -EINVAL;
}
WARN_ON(blocknr == 0);
entry->de_blocknr = cpu_to_le64(blocknr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(entry_bh);
nilfs_mdt_mark_dirty(dat);
@@ -426,7 +426,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
}
}
- kaddr = kmap_atomic(entry_bh->b_page);
+ kaddr = kmap_local_page(entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
blocknr = le64_to_cpu(entry->de_blocknr);
if (blocknr == 0) {
@@ -436,7 +436,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
*blocknrp = blocknr;
out:
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(entry_bh);
return ret;
}
@@ -457,10 +457,10 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz,
0, &entry_bh);
if (ret < 0)
return ret;
- kaddr = kmap_atomic(entry_bh->b_page);
+ kaddr = kmap_local_page(entry_bh->b_page);
/* last virtual block number in this block */
first = vinfo->vi_vblocknr;
- do_div(first, entries_per_block);
+ first = div64_ul(first, entries_per_block);
first *= entries_per_block;
last = first + entries_per_block - 1;
for (j = i, n = 0;
@@ -473,7 +473,7 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz,
vinfo->vi_end = le64_to_cpu(entry->de_end);
vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr);
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(entry_bh);
}
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 4c85914f2abc..893ab36824cc 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -66,7 +66,7 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
dat = nilfs_bmap_get_dat(direct);
ret = nilfs_dat_translate(dat, ptr, &blocknr);
if (ret < 0)
- return ret;
+ goto dat_error;
ptr = blocknr;
}
@@ -79,7 +79,7 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
if (dat) {
ret = nilfs_dat_translate(dat, ptr2, &blocknr);
if (ret < 0)
- return ret;
+ goto dat_error;
ptr2 = blocknr;
}
if (ptr2 != ptr + cnt)
@@ -87,6 +87,11 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
}
*ptrp = ptr;
return cnt;
+
+ dat_error:
+ if (ret == -ENOENT)
+ ret = -EINVAL; /* Notify bmap layer of metadata corruption */
+ return ret;
}
static __u64
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index a8a4bc8490b4..612e609158b5 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -15,6 +15,7 @@
#include "mdt.h"
#include "alloc.h"
#include "ifile.h"
+#include "cpfile.h"
/**
* struct nilfs_ifile_info - on-memory private data of ifile
@@ -115,11 +116,11 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
return ret;
}
- kaddr = kmap_atomic(req.pr_entry_bh->b_page);
+ kaddr = kmap_local_page(req.pr_entry_bh->b_page);
raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr,
req.pr_entry_bh, kaddr);
raw_inode->i_flags = 0;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(req.pr_entry_bh);
brelse(req.pr_entry_bh);
@@ -173,14 +174,18 @@ int nilfs_ifile_count_free_inodes(struct inode *ifile,
* nilfs_ifile_read - read or get ifile inode
* @sb: super block instance
* @root: root object
+ * @cno: number of checkpoint entry to read
* @inode_size: size of an inode
- * @raw_inode: on-disk ifile inode
- * @inodep: buffer to store the inode
+ *
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-EINVAL - Invalid checkpoint.
+ * * %-ENOMEM - Insufficient memory available.
+ * * %-EIO - I/O error (including metadata corruption).
*/
int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
- size_t inode_size, struct nilfs_inode *raw_inode,
- struct inode **inodep)
+ __u64 cno, size_t inode_size)
{
+ struct the_nilfs *nilfs;
struct inode *ifile;
int err;
@@ -201,13 +206,13 @@ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache);
- err = nilfs_read_inode_common(ifile, raw_inode);
+ nilfs = sb->s_fs_info;
+ err = nilfs_cpfile_read_checkpoint(nilfs->ns_cpfile, cno, root, ifile);
if (err)
goto failed;
unlock_new_inode(ifile);
out:
- *inodep = ifile;
return 0;
failed:
iget_failed(ifile);
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 35c5273f4821..625545cc2a98 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -21,15 +21,14 @@
static inline struct nilfs_inode *
nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
{
- void *kaddr = kmap(ibh->b_page);
+ void *kaddr = kmap_local_page(ibh->b_page);
return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr);
}
-static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino,
- struct buffer_head *ibh)
+static inline void nilfs_ifile_unmap_inode(struct nilfs_inode *raw_inode)
{
- kunmap(ibh->b_page);
+ kunmap_local(raw_inode);
}
int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
@@ -39,7 +38,6 @@ int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *);
int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
- size_t inode_size, struct nilfs_inode *raw_inode,
- struct inode **inodep);
+ __u64 cno, size_t inode_size);
#endif /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 9c334c722fc1..7340a01d80e1 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -112,7 +112,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
"%s (ino=%lu): a race condition while inserting a data block at offset=%llu",
__func__, inode->i_ino,
(unsigned long long)blkoff);
- err = 0;
+ err = -EAGAIN;
}
nilfs_transaction_abort(inode->i_sb);
goto out;
@@ -520,7 +520,7 @@ static int __nilfs_read_inode(struct super_block *sb,
inode, inode->i_mode,
huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
}
- nilfs_ifile_unmap_inode(root->ifile, ino, bh);
+ nilfs_ifile_unmap_inode(raw_inode);
brelse(bh);
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
nilfs_set_inode_flags(inode);
@@ -529,7 +529,7 @@ static int __nilfs_read_inode(struct super_block *sb,
return 0;
failed_unmap:
- nilfs_ifile_unmap_inode(root->ifile, ino, bh);
+ nilfs_ifile_unmap_inode(raw_inode);
brelse(bh);
bad_inode:
@@ -759,8 +759,18 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode)
return s_inode;
}
+/**
+ * nilfs_write_inode_common - export common inode information to on-disk inode
+ * @inode: inode object
+ * @raw_inode: on-disk inode
+ *
+ * This function writes standard information from the on-memory inode @inode
+ * to @raw_inode on ifile, cpfile or a super root block. Since inode bmap
+ * data is not exported, nilfs_bmap_write() must be called separately during
+ * log writing.
+ */
void nilfs_write_inode_common(struct inode *inode,
- struct nilfs_inode *raw_inode, int has_bmap)
+ struct nilfs_inode *raw_inode)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -778,21 +788,6 @@ void nilfs_write_inode_common(struct inode *inode,
raw_inode->i_flags = cpu_to_le32(ii->i_flags);
raw_inode->i_generation = cpu_to_le32(inode->i_generation);
- if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) {
- struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
-
- /* zero-fill unused portion in the case of super root block */
- raw_inode->i_xattr = 0;
- raw_inode->i_pad = 0;
- memset((void *)raw_inode + sizeof(*raw_inode), 0,
- nilfs->ns_inode_size - sizeof(*raw_inode));
- }
-
- if (has_bmap)
- nilfs_bmap_write(ii->i_bmap, raw_inode);
- else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
- raw_inode->i_device_code =
- cpu_to_le64(huge_encode_dev(inode->i_rdev));
/*
* When extending inode, nilfs->ns_inode_size should be checked
* for substitutions of appended fields.
@@ -813,14 +808,13 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
if (flags & I_DIRTY_DATASYNC)
set_bit(NILFS_I_INODE_SYNC, &ii->i_state);
- nilfs_write_inode_common(inode, raw_inode, 0);
- /*
- * XXX: call with has_bmap = 0 is a workaround to avoid
- * deadlock of bmap. This delays update of i_bmap to just
- * before writing.
- */
+ nilfs_write_inode_common(inode, raw_inode);
+
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+ raw_inode->i_device_code =
+ cpu_to_le64(huge_encode_dev(inode->i_rdev));
- nilfs_ifile_unmap_inode(ifile, ino, ibh);
+ nilfs_ifile_unmap_inode(raw_inode);
}
#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index cfb6aca5ec38..f1a01c191cf5 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1111,7 +1111,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize;
minseg = range[0] + segbytes - 1;
- do_div(minseg, segbytes);
+ minseg = div64_ul(minseg, segbytes);
if (range[1] < 4096)
goto out;
@@ -1120,7 +1120,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
if (maxseg < segbytes)
goto out;
- do_div(maxseg, segbytes);
+ maxseg = div64_ul(maxseg, segbytes);
maxseg--;
ret = nilfs_sufile_set_alloc_range(nilfs->ns_sufile, minseg, maxseg);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index e45c01a559c0..4f792a0ad0f0 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -47,12 +47,12 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
set_buffer_mapped(bh);
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
memset(kaddr + bh_offset(bh), 0, i_blocksize(inode));
if (init_block)
init_block(inode, bh, kaddr);
flush_dcache_page(bh->b_page);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 98cffaf0ac12..2e29b98ba8ba 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -256,7 +256,8 @@ extern struct inode *nilfs_new_inode(struct inode *, umode_t);
extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern void nilfs_set_inode_flags(struct inode *);
extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
-extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
+void nilfs_write_inode_common(struct inode *inode,
+ struct nilfs_inode *raw_inode);
struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
unsigned long ino);
struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 5c2eba1987bd..14e470fb8870 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -103,11 +103,11 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
struct page *spage = sbh->b_page, *dpage = dbh->b_page;
struct buffer_head *bh;
- kaddr0 = kmap_atomic(spage);
- kaddr1 = kmap_atomic(dpage);
+ kaddr0 = kmap_local_page(spage);
+ kaddr1 = kmap_local_page(dpage);
memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
- kunmap_atomic(kaddr1);
- kunmap_atomic(kaddr0);
+ kunmap_local(kaddr1);
+ kunmap_local(kaddr0);
dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
dbh->b_blocknr = sbh->b_blocknr;
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index a9b8d77c8c1d..49a70c68bf3c 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -482,9 +482,9 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
if (unlikely(!bh_org))
return -EIO;
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
memcpy(kaddr + from, bh_org->b_data, bh_org->b_size);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh_org);
return 0;
}
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 6e59dc19a732..dc431b4c34c9 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -220,9 +220,9 @@ static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
crc = crc32_le(crc, bh->b_data, bh->b_size);
}
list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
}
raw_sum->ss_datasum = cpu_to_le32(crc);
}
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2bfb08052d39..aa5290cb7467 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -880,76 +880,6 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
nilfs_mdt_clear_dirty(nilfs->ns_dat);
}
-static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
-{
- struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
- struct buffer_head *bh_cp;
- struct nilfs_checkpoint *raw_cp;
- int err;
-
- /* XXX: this interface will be changed */
- err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
- &raw_cp, &bh_cp);
- if (likely(!err)) {
- /*
- * The following code is duplicated with cpfile. But, it is
- * needed to collect the checkpoint even if it was not newly
- * created.
- */
- mark_buffer_dirty(bh_cp);
- nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
- nilfs_cpfile_put_checkpoint(
- nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
- } else if (err == -EINVAL || err == -ENOENT) {
- nilfs_error(sci->sc_super,
- "checkpoint creation failed due to metadata corruption.");
- err = -EIO;
- }
- return err;
-}
-
-static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
-{
- struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
- struct buffer_head *bh_cp;
- struct nilfs_checkpoint *raw_cp;
- int err;
-
- err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
- &raw_cp, &bh_cp);
- if (unlikely(err)) {
- if (err == -EINVAL || err == -ENOENT) {
- nilfs_error(sci->sc_super,
- "checkpoint finalization failed due to metadata corruption.");
- err = -EIO;
- }
- goto failed_ibh;
- }
- raw_cp->cp_snapshot_list.ssl_next = 0;
- raw_cp->cp_snapshot_list.ssl_prev = 0;
- raw_cp->cp_inodes_count =
- cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count));
- raw_cp->cp_blocks_count =
- cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count));
- raw_cp->cp_nblk_inc =
- cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
- raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
- raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno);
-
- if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
- nilfs_checkpoint_clear_minor(raw_cp);
- else
- nilfs_checkpoint_set_minor(raw_cp);
-
- nilfs_write_inode_common(sci->sc_root->ifile,
- &raw_cp->cp_ifile_inode, 1);
- nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
- return 0;
-
- failed_ibh:
- return err;
-}
-
static void nilfs_fill_in_file_bmap(struct inode *ifile,
struct nilfs_inode_info *ii)
@@ -963,7 +893,7 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile,
raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino,
ibh);
nilfs_bmap_write(ii->i_bmap, raw_inode);
- nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh);
+ nilfs_ifile_unmap_inode(raw_inode);
}
}
@@ -977,6 +907,33 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci)
}
}
+/**
+ * nilfs_write_root_mdt_inode - export root metadata inode information to
+ * the on-disk inode
+ * @inode: inode object of the root metadata file
+ * @raw_inode: on-disk inode
+ *
+ * nilfs_write_root_mdt_inode() writes inode information and bmap data of
+ * @inode to the inode area of the metadata file allocated on the super root
+ * block created to finalize the log. Since super root blocks are configured
+ * each time, this function zero-fills the unused area of @raw_inode.
+ */
+static void nilfs_write_root_mdt_inode(struct inode *inode,
+ struct nilfs_inode *raw_inode)
+{
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+
+ nilfs_write_inode_common(inode, raw_inode);
+
+ /* zero-fill unused portion of raw_inode */
+ raw_inode->i_xattr = 0;
+ raw_inode->i_pad = 0;
+ memset((void *)raw_inode + sizeof(*raw_inode), 0,
+ nilfs->ns_inode_size - sizeof(*raw_inode));
+
+ nilfs_bmap_write(NILFS_I(inode)->i_bmap, raw_inode);
+}
+
static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
struct the_nilfs *nilfs)
{
@@ -998,12 +955,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
raw_sr->sr_flags = 0;
- nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr +
- NILFS_SR_DAT_OFFSET(isz), 1);
- nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
- NILFS_SR_CPFILE_OFFSET(isz), 1);
- nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
- NILFS_SR_SUFILE_OFFSET(isz), 1);
+ nilfs_write_root_mdt_inode(nilfs->ns_dat, (void *)raw_sr +
+ NILFS_SR_DAT_OFFSET(isz));
+ nilfs_write_root_mdt_inode(nilfs->ns_cpfile, (void *)raw_sr +
+ NILFS_SR_CPFILE_OFFSET(isz));
+ nilfs_write_root_mdt_inode(nilfs->ns_sufile, (void *)raw_sr +
+ NILFS_SR_SUFILE_OFFSET(isz));
+
memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz);
set_buffer_uptodate(bh_sr);
unlock_buffer(bh_sr);
@@ -1230,7 +1188,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
break;
nilfs_sc_cstage_inc(sci);
/* Creating a checkpoint */
- err = nilfs_segctor_create_checkpoint(sci);
+ err = nilfs_cpfile_create_checkpoint(nilfs->ns_cpfile,
+ nilfs->ns_cno);
if (unlikely(err))
break;
fallthrough;
@@ -2101,7 +2060,11 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
if (mode == SC_LSEG_SR &&
nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) {
- err = nilfs_segctor_fill_in_checkpoint(sci);
+ err = nilfs_cpfile_finalize_checkpoint(
+ nilfs->ns_cpfile, nilfs->ns_cno, sci->sc_root,
+ sci->sc_nblk_inc + sci->sc_nblk_this_inc,
+ sci->sc_seg_ctime,
+ !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags));
if (unlikely(err))
goto failed_to_write;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 0a8119456c21..6748218be7c5 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -48,7 +48,7 @@ nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum)
{
__u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
- do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
+ t = div64_ul(t, nilfs_sufile_segment_usages_per_block(sufile));
return (unsigned long)t;
}
@@ -107,11 +107,11 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
struct nilfs_sufile_header *header;
void *kaddr;
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
le64_add_cpu(&header->sh_ncleansegs, ncleanadd);
le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(header_bh);
}
@@ -315,10 +315,10 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
ret = nilfs_sufile_get_header_block(sufile, &header_bh);
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
last_alloc = le64_to_cpu(header->sh_last_alloc);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
nsegments = nilfs_sufile_get_nsegments(sufile);
maxsegnum = sui->allocmax;
@@ -352,7 +352,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
&su_bh);
if (ret < 0)
goto out_header;
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(
sufile, segnum, su_bh, kaddr);
@@ -363,14 +363,14 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
continue;
/* found a clean segment */
nilfs_segment_usage_set_dirty(su);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
le64_add_cpu(&header->sh_ncleansegs, -1);
le64_add_cpu(&header->sh_ndirtysegs, 1);
header->sh_last_alloc = cpu_to_le64(segnum);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
sui->ncleansegs--;
mark_buffer_dirty(header_bh);
@@ -384,7 +384,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
goto out_header;
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(su_bh);
}
@@ -406,16 +406,16 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
struct nilfs_segment_usage *su;
void *kaddr;
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (unlikely(!nilfs_segment_usage_clean(su))) {
nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean",
__func__, (unsigned long long)segnum);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
return;
}
nilfs_segment_usage_set_dirty(su);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
nilfs_sufile_mod_counter(header_bh, -1, 1);
NILFS_SUI(sufile)->ncleansegs--;
@@ -432,11 +432,11 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
void *kaddr;
int clean, dirty;
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) &&
su->su_nblocks == cpu_to_le32(0)) {
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
return;
}
clean = nilfs_segment_usage_clean(su);
@@ -446,7 +446,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
su->su_lastmod = cpu_to_le64(0);
su->su_nblocks = cpu_to_le32(0);
su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY));
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
NILFS_SUI(sufile)->ncleansegs -= clean;
@@ -463,12 +463,12 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
void *kaddr;
int sudirty;
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (nilfs_segment_usage_clean(su)) {
nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean",
__func__, (unsigned long long)segnum);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
return;
}
if (unlikely(nilfs_segment_usage_error(su)))
@@ -481,7 +481,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
(unsigned long long)segnum);
nilfs_segment_usage_set_clean(su);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(su_bh);
nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
@@ -509,12 +509,12 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
if (ret)
goto out_sem;
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
if (unlikely(nilfs_segment_usage_error(su))) {
struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(bh);
if (nilfs_segment_is_active(nilfs, segnum)) {
nilfs_error(sufile->i_sb,
@@ -532,7 +532,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
ret = -EIO;
} else {
nilfs_segment_usage_set_dirty(su);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(sufile);
brelse(bh);
@@ -562,7 +562,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
if (modtime) {
/*
@@ -573,7 +573,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
su->su_lastmod = cpu_to_le64(modtime);
}
su->su_nblocks = cpu_to_le32(nblocks);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(sufile);
@@ -614,7 +614,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
if (ret < 0)
goto out_sem;
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
@@ -624,7 +624,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
spin_lock(&nilfs->ns_last_segment_lock);
sustat->ss_prot_seq = nilfs->ns_prot_seq;
spin_unlock(&nilfs->ns_last_segment_lock);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(header_bh);
out_sem:
@@ -640,15 +640,15 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
void *kaddr;
int suclean;
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (nilfs_segment_usage_error(su)) {
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
return;
}
suclean = nilfs_segment_usage_clean(su);
nilfs_segment_usage_set_error(su);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (suclean) {
nilfs_sufile_mod_counter(header_bh, -1, 0);
@@ -717,7 +717,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
/* hole */
continue;
}
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(
sufile, segnum, su_bh, kaddr);
su2 = su;
@@ -726,7 +726,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
~BIT(NILFS_SEGMENT_USAGE_ERROR)) ||
nilfs_segment_is_active(nilfs, segnum + j)) {
ret = -EBUSY;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(su_bh);
goto out_header;
}
@@ -738,7 +738,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
nc++;
}
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (nc > 0) {
mark_buffer_dirty(su_bh);
ncleaned += nc;
@@ -823,10 +823,10 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs)
sui->allocmin = 0;
}
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
mark_buffer_dirty(header_bh);
nilfs_mdt_mark_dirty(sufile);
@@ -891,7 +891,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
continue;
}
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(
sufile, segnum, su_bh, kaddr);
for (j = 0; j < n;
@@ -904,7 +904,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
si->sui_flags |=
BIT(NILFS_SEGMENT_USAGE_ACTIVE);
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(su_bh);
}
ret = nsegs;
@@ -973,7 +973,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
goto out_header;
for (;;) {
- kaddr = kmap_atomic(bh->b_page);
+ kaddr = kmap_local_page(bh->b_page);
su = nilfs_sufile_block_get_segment_usage(
sufile, sup->sup_segnum, bh, kaddr);
@@ -1010,7 +1010,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags);
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
sup = (void *)sup + supsz;
if (sup >= supend)
@@ -1115,7 +1115,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
continue;
}
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
su_bh, kaddr);
for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
@@ -1145,7 +1145,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
}
if (nblocks >= minlen) {
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
@@ -1157,7 +1157,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
}
ndiscarded += nblocks;
- kaddr = kmap_atomic(su_bh->b_page);
+ kaddr = kmap_local_page(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(
sufile, segnum, su_bh, kaddr);
}
@@ -1166,7 +1166,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
start = seg_start;
nblocks = seg_end - seg_start + 1;
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
put_bh(su_bh);
}
@@ -1246,10 +1246,10 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
goto failed;
sui = NILFS_SUI(sufile);
- kaddr = kmap_atomic(header_bh->b_page);
+ kaddr = kmap_local_page(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
brelse(header_bh);
sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index df8674173b22..ac24ed109ce9 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -448,7 +448,7 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize)
sb2off = NILFS_SB2_OFFSET_BYTES(newsize);
newnsegs = sb2off >> nilfs->ns_blocksize_bits;
- do_div(newnsegs, nilfs->ns_blocks_per_segment);
+ newnsegs = div64_ul(newnsegs, nilfs->ns_blocks_per_segment);
ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs);
up_write(&nilfs->ns_segctor_sem);
@@ -544,8 +544,6 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
{
struct the_nilfs *nilfs = sb->s_fs_info;
struct nilfs_root *root;
- struct nilfs_checkpoint *raw_cp;
- struct buffer_head *bh_cp;
int err = -ENOMEM;
root = nilfs_find_or_create_root(
@@ -557,38 +555,19 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
goto reuse; /* already attached checkpoint */
down_read(&nilfs->ns_segctor_sem);
- err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
- &bh_cp);
+ err = nilfs_ifile_read(sb, root, cno, nilfs->ns_inode_size);
up_read(&nilfs->ns_segctor_sem);
- if (unlikely(err)) {
- if (err == -ENOENT || err == -EINVAL) {
- nilfs_err(sb,
- "Invalid checkpoint (checkpoint number=%llu)",
- (unsigned long long)cno);
- err = -EINVAL;
- }
+ if (unlikely(err))
goto failed;
- }
-
- err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size,
- &raw_cp->cp_ifile_inode, &root->ifile);
- if (err)
- goto failed_bh;
-
- atomic64_set(&root->inodes_count,
- le64_to_cpu(raw_cp->cp_inodes_count));
- atomic64_set(&root->blocks_count,
- le64_to_cpu(raw_cp->cp_blocks_count));
-
- nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
reuse:
*rootp = root;
return 0;
- failed_bh:
- nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
failed:
+ if (err == -EINVAL)
+ nilfs_err(sb, "Invalid checkpoint (checkpoint number=%llu)",
+ (unsigned long long)cno);
nilfs_put_root(root);
return err;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 71400496ed36..2ae2c1bbf6d1 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -413,7 +413,7 @@ static u64 nilfs_max_segment_count(struct the_nilfs *nilfs)
{
u64 max_count = U64_MAX;
- do_div(max_count, nilfs->ns_blocks_per_segment);
+ max_count = div64_ul(max_count, nilfs->ns_blocks_per_segment);
return min_t(u64, max_count, ULONG_MAX);
}
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 1e4def21811e..224bccaab4cc 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -228,8 +228,10 @@ static int fanotify_get_response(struct fsnotify_group *group,
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- ret = wait_event_killable(group->fanotify_data.access_waitq,
- event->state == FAN_EVENT_ANSWERED);
+ ret = wait_event_state(group->fanotify_data.access_waitq,
+ event->state == FAN_EVENT_ANSWERED,
+ (TASK_KILLABLE|TASK_FREEZABLE));
+
/* Signal pending? */
if (ret < 0) {
spin_lock(&group->notification_lock);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 8bfd690e9f10..2fc105a72a8f 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -141,7 +141,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
}
/* Are inode/sb/mount interested in parent and name info with this event? */
-static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt,
+static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask,
__u32 mask)
{
__u32 marks_mask = 0;
@@ -160,13 +160,22 @@ static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt,
/* Did either inode/sb/mount subscribe for events with parent/name? */
marks_mask |= fsnotify_parent_needed_mask(inode->i_fsnotify_mask);
marks_mask |= fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask);
- if (mnt)
- marks_mask |= fsnotify_parent_needed_mask(mnt->mnt_fsnotify_mask);
+ marks_mask |= fsnotify_parent_needed_mask(mnt_mask);
/* Did they subscribe for this event with parent/name info? */
return mask & marks_mask;
}
+/* Are there any inode/mount/sb objects that are interested in this event? */
+static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
+ __u32 mask)
+{
+ __u32 marks_mask = inode->i_fsnotify_mask | mnt_mask |
+ inode->i_sb->s_fsnotify_mask;
+
+ return mask & marks_mask & ALL_FSNOTIFY_EVENTS;
+}
+
/*
* Notify this dentry's parent about a child's events with child name info
* if parent is watching or if inode/sb/mount are interested in events with
@@ -179,7 +188,7 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
int data_type)
{
const struct path *path = fsnotify_data_path(data, data_type);
- struct mount *mnt = path ? real_mount(path->mnt) : NULL;
+ __u32 mnt_mask = path ? real_mount(path->mnt)->mnt_fsnotify_mask : 0;
struct inode *inode = d_inode(dentry);
struct dentry *parent;
bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED;
@@ -190,16 +199,13 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
struct qstr *file_name = NULL;
int ret = 0;
- /*
- * Do inode/sb/mount care about parent and name info on non-dir?
- * Do they care about any event at all?
- */
- if (!inode->i_fsnotify_marks && !inode->i_sb->s_fsnotify_marks &&
- (!mnt || !mnt->mnt_fsnotify_marks) && !parent_watched)
+ /* Optimize the likely case of nobody watching this path */
+ if (likely(!parent_watched &&
+ !fsnotify_object_watched(inode, mnt_mask, mask)))
return 0;
parent = NULL;
- parent_needed = fsnotify_event_needs_parent(inode, mnt, mask);
+ parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask);
if (!parent_watched && !parent_needed)
goto notify;
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 34e1e3e36733..07e22a15ef02 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -27,26 +27,17 @@ static const struct file_operations ns_file_operations = {
static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
{
struct inode *inode = d_inode(dentry);
- const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+ struct ns_common *ns = inode->i_private;
+ const struct proc_ns_operations *ns_ops = ns->ops;
return dynamic_dname(buffer, buflen, "%s:[%lu]",
ns_ops->name, inode->i_ino);
}
-static void ns_prune_dentry(struct dentry *dentry)
-{
- struct inode *inode = d_inode(dentry);
- if (inode) {
- struct ns_common *ns = inode->i_private;
- atomic_long_set(&ns->stashed, 0);
- }
-}
-
-const struct dentry_operations ns_dentry_operations =
-{
- .d_prune = ns_prune_dentry,
+const struct dentry_operations ns_dentry_operations = {
.d_delete = always_delete_dentry,
.d_dname = ns_dname,
+ .d_prune = stashed_dentry_prune,
};
static void nsfs_evict(struct inode *inode)
@@ -56,67 +47,16 @@ static void nsfs_evict(struct inode *inode)
ns->ops->put(ns);
}
-static int __ns_get_path(struct path *path, struct ns_common *ns)
-{
- struct vfsmount *mnt = nsfs_mnt;
- struct dentry *dentry;
- struct inode *inode;
- unsigned long d;
-
- rcu_read_lock();
- d = atomic_long_read(&ns->stashed);
- if (!d)
- goto slow;
- dentry = (struct dentry *)d;
- if (!lockref_get_not_dead(&dentry->d_lockref))
- goto slow;
- rcu_read_unlock();
- ns->ops->put(ns);
-got_it:
- path->mnt = mntget(mnt);
- path->dentry = dentry;
- return 0;
-slow:
- rcu_read_unlock();
- inode = new_inode_pseudo(mnt->mnt_sb);
- if (!inode) {
- ns->ops->put(ns);
- return -ENOMEM;
- }
- inode->i_ino = ns->inum;
- simple_inode_init_ts(inode);
- inode->i_flags |= S_IMMUTABLE;
- inode->i_mode = S_IFREG | S_IRUGO;
- inode->i_fop = &ns_file_operations;
- inode->i_private = ns;
-
- dentry = d_make_root(inode); /* not the normal use, but... */
- if (!dentry)
- return -ENOMEM;
- dentry->d_fsdata = (void *)ns->ops;
- d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
- if (d) {
- d_delete(dentry); /* make sure ->d_prune() does nothing */
- dput(dentry);
- cpu_relax();
- return -EAGAIN;
- }
- goto got_it;
-}
-
int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
void *private_data)
{
- int ret;
+ struct ns_common *ns;
- do {
- struct ns_common *ns = ns_get_cb(private_data);
- if (!ns)
- return -ENOENT;
- ret = __ns_get_path(path, ns);
- } while (ret == -EAGAIN);
+ ns = ns_get_cb(private_data);
+ if (!ns)
+ return -ENOENT;
- return ret;
+ return path_from_stashed(&ns->stashed, nsfs_mnt, ns, path);
}
struct ns_get_path_task_args {
@@ -146,6 +86,7 @@ int open_related_ns(struct ns_common *ns,
struct ns_common *(*get_ns)(struct ns_common *ns))
{
struct path path = {};
+ struct ns_common *relative;
struct file *f;
int err;
int fd;
@@ -154,19 +95,14 @@ int open_related_ns(struct ns_common *ns,
if (fd < 0)
return fd;
- do {
- struct ns_common *relative;
-
- relative = get_ns(ns);
- if (IS_ERR(relative)) {
- put_unused_fd(fd);
- return PTR_ERR(relative);
- }
-
- err = __ns_get_path(&path, relative);
- } while (err == -EAGAIN);
+ relative = get_ns(ns);
+ if (IS_ERR(relative)) {
+ put_unused_fd(fd);
+ return PTR_ERR(relative);
+ }
- if (err) {
+ err = path_from_stashed(&relative->stashed, nsfs_mnt, relative, &path);
+ if (err < 0) {
put_unused_fd(fd);
return err;
}
@@ -249,7 +185,8 @@ bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino)
static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+ const struct ns_common *ns = inode->i_private;
+ const struct proc_ns_operations *ns_ops = ns->ops;
seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
return 0;
@@ -261,6 +198,28 @@ static const struct super_operations nsfs_ops = {
.show_path = nsfs_show_path,
};
+static int nsfs_init_inode(struct inode *inode, void *data)
+{
+ struct ns_common *ns = data;
+
+ inode->i_private = data;
+ inode->i_mode |= S_IRUGO;
+ inode->i_fop = &ns_file_operations;
+ inode->i_ino = ns->inum;
+ return 0;
+}
+
+static void nsfs_put_data(void *data)
+{
+ struct ns_common *ns = data;
+ ns->ops->put(ns);
+}
+
+static const struct stashed_operations nsfs_stashed_ops = {
+ .init_inode = nsfs_init_inode,
+ .put_data = nsfs_put_data,
+};
+
static int nsfs_init_fs_context(struct fs_context *fc)
{
struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
@@ -268,6 +227,7 @@ static int nsfs_init_fs_context(struct fs_context *fc)
return -ENOMEM;
ctx->ops = &nsfs_ops;
ctx->dops = &ns_dentry_operations;
+ fc->s_fs_info = (void *)&nsfs_stashed_ops;
return 0;
}
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
deleted file mode 100644
index 7b2509741735..000000000000
--- a/fs/ntfs/Kconfig
+++ /dev/null
@@ -1,81 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config NTFS_FS
- tristate "NTFS file system support"
- select BUFFER_HEAD
- select NLS
- help
- NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
-
- Saying Y or M here enables read support. There is partial, but
- safe, write support available. For write support you must also
- say Y to "NTFS write support" below.
-
- There are also a number of user-space tools available, called
- ntfsprogs. These include ntfsundelete and ntfsresize, that work
- without NTFS support enabled in the kernel.
-
- This is a rewrite from scratch of Linux NTFS support and replaced
- the old NTFS code starting with Linux 2.5.11. A backport to
- the Linux 2.4 kernel series is separately available as a patch
- from the project web site.
-
- For more information see <file:Documentation/filesystems/ntfs.rst>
- and <http://www.linux-ntfs.org/>.
-
- To compile this file system support as a module, choose M here: the
- module will be called ntfs.
-
- If you are not using Windows NT, 2000, XP or 2003 in addition to
- Linux on your computer it is safe to say N.
-
-config NTFS_DEBUG
- bool "NTFS debugging support"
- depends on NTFS_FS
- help
- If you are experiencing any problems with the NTFS file system, say
- Y here. This will result in additional consistency checks to be
- performed by the driver as well as additional debugging messages to
- be written to the system log. Note that debugging messages are
- disabled by default. To enable them, supply the option debug_msgs=1
- at the kernel command line when booting the kernel or as an option
- to insmod when loading the ntfs module. Once the driver is active,
- you can enable debugging messages by doing (as root):
- echo 1 > /proc/sys/fs/ntfs-debug
- Replacing the "1" with "0" would disable debug messages.
-
- If you leave debugging messages disabled, this results in little
- overhead, but enabling debug messages results in very significant
- slowdown of the system.
-
- When reporting bugs, please try to have available a full dump of
- debugging messages while the misbehaviour was occurring.
-
-config NTFS_RW
- bool "NTFS write support"
- depends on NTFS_FS
- depends on PAGE_SIZE_LESS_THAN_64KB
- help
- This enables the partial, but safe, write support in the NTFS driver.
-
- The only supported operation is overwriting existing files, without
- changing the file length. No file or directory creation, deletion or
- renaming is possible. Note only non-resident files can be written to
- so you may find that some very small files (<500 bytes or so) cannot
- be written to.
-
- While we cannot guarantee that it will not damage any data, we have
- so far not received a single report where the driver would have
- damaged someones data so we assume it is perfectly safe to use.
-
- Note: While write support is safe in this version (a rewrite from
- scratch of the NTFS support), it should be noted that the old NTFS
- write support, included in Linux 2.5.10 and before (since 1997),
- is not safe.
-
- This is currently useful with TopologiLinux. TopologiLinux is run
- on top of any DOS/Microsoft Windows system without partitioning your
- hard disk. Unlike other Linux distributions TopologiLinux does not
- need its own partition. For more information see
- <http://topologi-linux.sourceforge.net/>
-
- It is perfectly safe to say N here.
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
deleted file mode 100644
index 3e736572ed00..000000000000
--- a/fs/ntfs/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# Rules for making the NTFS driver.
-
-obj-$(CONFIG_NTFS_FS) += ntfs.o
-
-ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
- index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
- unistr.o upcase.o
-
-ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
-
-ccflags-y := -DNTFS_VERSION=\"2.1.32\"
-ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG
-ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW
-
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
deleted file mode 100644
index 2d01517a2d59..000000000000
--- a/fs/ntfs/aops.c
+++ /dev/null
@@ -1,1744 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * aops.c - NTFS kernel address space operations and page cache handling.
- *
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-#include <linux/buffer_head.h>
-#include <linux/writeback.h>
-#include <linux/bit_spinlock.h>
-#include <linux/bio.h>
-
-#include "aops.h"
-#include "attrib.h"
-#include "debug.h"
-#include "inode.h"
-#include "mft.h"
-#include "runlist.h"
-#include "types.h"
-#include "ntfs.h"
-
-/**
- * ntfs_end_buffer_async_read - async io completion for reading attributes
- * @bh: buffer head on which io is completed
- * @uptodate: whether @bh is now uptodate or not
- *
- * Asynchronous I/O completion handler for reading pages belonging to the
- * attribute address space of an inode. The inodes can either be files or
- * directories or they can be fake inodes describing some attribute.
- *
- * If NInoMstProtected(), perform the post read mst fixups when all IO on the
- * page has been completed and mark the page uptodate or set the error bit on
- * the page. To determine the size of the records that need fixing up, we
- * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs
- * record size, and index_block_size_bits, to the log(base 2) of the ntfs
- * record size.
- */
-static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
-{
- unsigned long flags;
- struct buffer_head *first, *tmp;
- struct page *page;
- struct inode *vi;
- ntfs_inode *ni;
- int page_uptodate = 1;
-
- page = bh->b_page;
- vi = page->mapping->host;
- ni = NTFS_I(vi);
-
- if (likely(uptodate)) {
- loff_t i_size;
- s64 file_ofs, init_size;
-
- set_buffer_uptodate(bh);
-
- file_ofs = ((s64)page->index << PAGE_SHIFT) +
- bh_offset(bh);
- read_lock_irqsave(&ni->size_lock, flags);
- init_size = ni->initialized_size;
- i_size = i_size_read(vi);
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (unlikely(init_size > i_size)) {
- /* Race with shrinking truncate. */
- init_size = i_size;
- }
- /* Check for the current buffer head overflowing. */
- if (unlikely(file_ofs + bh->b_size > init_size)) {
- int ofs;
- void *kaddr;
-
- ofs = 0;
- if (file_ofs < init_size)
- ofs = init_size - file_ofs;
- kaddr = kmap_atomic(page);
- memset(kaddr + bh_offset(bh) + ofs, 0,
- bh->b_size - ofs);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
- }
- } else {
- clear_buffer_uptodate(bh);
- SetPageError(page);
- ntfs_error(ni->vol->sb, "Buffer I/O error, logical block "
- "0x%llx.", (unsigned long long)bh->b_blocknr);
- }
- first = page_buffers(page);
- spin_lock_irqsave(&first->b_uptodate_lock, flags);
- clear_buffer_async_read(bh);
- unlock_buffer(bh);
- tmp = bh;
- do {
- if (!buffer_uptodate(tmp))
- page_uptodate = 0;
- if (buffer_async_read(tmp)) {
- if (likely(buffer_locked(tmp)))
- goto still_busy;
- /* Async buffers must be locked. */
- BUG();
- }
- tmp = tmp->b_this_page;
- } while (tmp != bh);
- spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- /*
- * If none of the buffers had errors then we can set the page uptodate,
- * but we first have to perform the post read mst fixups, if the
- * attribute is mst protected, i.e. if NInoMstProteced(ni) is true.
- * Note we ignore fixup errors as those are detected when
- * map_mft_record() is called which gives us per record granularity
- * rather than per page granularity.
- */
- if (!NInoMstProtected(ni)) {
- if (likely(page_uptodate && !PageError(page)))
- SetPageUptodate(page);
- } else {
- u8 *kaddr;
- unsigned int i, recs;
- u32 rec_size;
-
- rec_size = ni->itype.index.block_size;
- recs = PAGE_SIZE / rec_size;
- /* Should have been verified before we got here... */
- BUG_ON(!recs);
- kaddr = kmap_atomic(page);
- for (i = 0; i < recs; i++)
- post_read_mst_fixup((NTFS_RECORD*)(kaddr +
- i * rec_size), rec_size);
- kunmap_atomic(kaddr);
- flush_dcache_page(page);
- if (likely(page_uptodate && !PageError(page)))
- SetPageUptodate(page);
- }
- unlock_page(page);
- return;
-still_busy:
- spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- return;
-}
-
-/**
- * ntfs_read_block - fill a @folio of an address space with data
- * @folio: page cache folio to fill with data
- *
- * We read each buffer asynchronously and when all buffers are read in, our io
- * completion handler ntfs_end_buffer_read_async(), if required, automatically
- * applies the mst fixups to the folio before finally marking it uptodate and
- * unlocking it.
- *
- * We only enforce allocated_size limit because i_size is checked for in
- * generic_file_read().
- *
- * Return 0 on success and -errno on error.
- *
- * Contains an adapted version of fs/buffer.c::block_read_full_folio().
- */
-static int ntfs_read_block(struct folio *folio)
-{
- loff_t i_size;
- VCN vcn;
- LCN lcn;
- s64 init_size;
- struct inode *vi;
- ntfs_inode *ni;
- ntfs_volume *vol;
- runlist_element *rl;
- struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
- sector_t iblock, lblock, zblock;
- unsigned long flags;
- unsigned int blocksize, vcn_ofs;
- int i, nr;
- unsigned char blocksize_bits;
-
- vi = folio->mapping->host;
- ni = NTFS_I(vi);
- vol = ni->vol;
-
- /* $MFT/$DATA must have its complete runlist in memory at all times. */
- BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));
-
- blocksize = vol->sb->s_blocksize;
- blocksize_bits = vol->sb->s_blocksize_bits;
-
- head = folio_buffers(folio);
- if (!head)
- head = create_empty_buffers(folio, blocksize, 0);
- bh = head;
-
- /*
- * We may be racing with truncate. To avoid some of the problems we
- * now take a snapshot of the various sizes and use those for the whole
- * of the function. In case of an extending truncate it just means we
- * may leave some buffers unmapped which are now allocated. This is
- * not a problem since these buffers will just get mapped when a write
- * occurs. In case of a shrinking truncate, we will detect this later
- * on due to the runlist being incomplete and if the folio is being
- * fully truncated, truncate will throw it away as soon as we unlock
- * it so no need to worry what we do with it.
- */
- iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
- read_lock_irqsave(&ni->size_lock, flags);
- lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
- init_size = ni->initialized_size;
- i_size = i_size_read(vi);
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (unlikely(init_size > i_size)) {
- /* Race with shrinking truncate. */
- init_size = i_size;
- }
- zblock = (init_size + blocksize - 1) >> blocksize_bits;
-
- /* Loop through all the buffers in the folio. */
- rl = NULL;
- nr = i = 0;
- do {
- int err = 0;
-
- if (unlikely(buffer_uptodate(bh)))
- continue;
- if (unlikely(buffer_mapped(bh))) {
- arr[nr++] = bh;
- continue;
- }
- bh->b_bdev = vol->sb->s_bdev;
- /* Is the block within the allowed limits? */
- if (iblock < lblock) {
- bool is_retry = false;
-
- /* Convert iblock into corresponding vcn and offset. */
- vcn = (VCN)iblock << blocksize_bits >>
- vol->cluster_size_bits;
- vcn_ofs = ((VCN)iblock << blocksize_bits) &
- vol->cluster_size_mask;
- if (!rl) {
-lock_retry_remap:
- down_read(&ni->runlist.lock);
- rl = ni->runlist.rl;
- }
- if (likely(rl != NULL)) {
- /* Seek to element containing target vcn. */
- while (rl->length && rl[1].vcn <= vcn)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
- } else
- lcn = LCN_RL_NOT_MAPPED;
- /* Successful remap. */
- if (lcn >= 0) {
- /* Setup buffer head to correct block. */
- bh->b_blocknr = ((lcn << vol->cluster_size_bits)
- + vcn_ofs) >> blocksize_bits;
- set_buffer_mapped(bh);
- /* Only read initialized data blocks. */
- if (iblock < zblock) {
- arr[nr++] = bh;
- continue;
- }
- /* Fully non-initialized data block, zero it. */
- goto handle_zblock;
- }
- /* It is a hole, need to zero it. */
- if (lcn == LCN_HOLE)
- goto handle_hole;
- /* If first try and runlist unmapped, map and retry. */
- if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
- is_retry = true;
- /*
- * Attempt to map runlist, dropping lock for
- * the duration.
- */
- up_read(&ni->runlist.lock);
- err = ntfs_map_runlist(ni, vcn);
- if (likely(!err))
- goto lock_retry_remap;
- rl = NULL;
- } else if (!rl)
- up_read(&ni->runlist.lock);
- /*
- * If buffer is outside the runlist, treat it as a
- * hole. This can happen due to concurrent truncate
- * for example.
- */
- if (err == -ENOENT || lcn == LCN_ENOENT) {
- err = 0;
- goto handle_hole;
- }
- /* Hard error, zero out region. */
- if (!err)
- err = -EIO;
- bh->b_blocknr = -1;
- folio_set_error(folio);
- ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
- "attribute type 0x%x, vcn 0x%llx, "
- "offset 0x%x because its location on "
- "disk could not be determined%s "
- "(error code %i).", ni->mft_no,
- ni->type, (unsigned long long)vcn,
- vcn_ofs, is_retry ? " even after "
- "retrying" : "", err);
- }
- /*
- * Either iblock was outside lblock limits or
- * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion
- * of the folio and set the buffer uptodate.
- */
-handle_hole:
- bh->b_blocknr = -1UL;
- clear_buffer_mapped(bh);
-handle_zblock:
- folio_zero_range(folio, i * blocksize, blocksize);
- if (likely(!err))
- set_buffer_uptodate(bh);
- } while (i++, iblock++, (bh = bh->b_this_page) != head);
-
- /* Release the lock if we took it. */
- if (rl)
- up_read(&ni->runlist.lock);
-
- /* Check we have at least one buffer ready for i/o. */
- if (nr) {
- struct buffer_head *tbh;
-
- /* Lock the buffers. */
- for (i = 0; i < nr; i++) {
- tbh = arr[i];
- lock_buffer(tbh);
- tbh->b_end_io = ntfs_end_buffer_async_read;
- set_buffer_async_read(tbh);
- }
- /* Finally, start i/o on the buffers. */
- for (i = 0; i < nr; i++) {
- tbh = arr[i];
- if (likely(!buffer_uptodate(tbh)))
- submit_bh(REQ_OP_READ, tbh);
- else
- ntfs_end_buffer_async_read(tbh, 1);
- }
- return 0;
- }
- /* No i/o was scheduled on any of the buffers. */
- if (likely(!folio_test_error(folio)))
- folio_mark_uptodate(folio);
- else /* Signal synchronous i/o error. */
- nr = -EIO;
- folio_unlock(folio);
- return nr;
-}
-
-/**
- * ntfs_read_folio - fill a @folio of a @file with data from the device
- * @file: open file to which the folio @folio belongs or NULL
- * @folio: page cache folio to fill with data
- *
- * For non-resident attributes, ntfs_read_folio() fills the @folio of the open
- * file @file by calling the ntfs version of the generic block_read_full_folio()
- * function, ntfs_read_block(), which in turn creates and reads in the buffers
- * associated with the folio asynchronously.
- *
- * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the
- * data from the mft record (which at this stage is most likely in memory) and
- * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
- * even if the mft record is not cached at this point in time, we need to wait
- * for it to be read in before we can do the copy.
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_read_folio(struct file *file, struct folio *folio)
-{
- struct page *page = &folio->page;
- loff_t i_size;
- struct inode *vi;
- ntfs_inode *ni, *base_ni;
- u8 *addr;
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *mrec;
- unsigned long flags;
- u32 attr_len;
- int err = 0;
-
-retry_readpage:
- BUG_ON(!PageLocked(page));
- vi = page->mapping->host;
- i_size = i_size_read(vi);
- /* Is the page fully outside i_size? (truncate in progress) */
- if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
- PAGE_SHIFT)) {
- zero_user(page, 0, PAGE_SIZE);
- ntfs_debug("Read outside i_size - truncated?");
- goto done;
- }
- /*
- * This can potentially happen because we clear PageUptodate() during
- * ntfs_writepage() of MstProtected() attributes.
- */
- if (PageUptodate(page)) {
- unlock_page(page);
- return 0;
- }
- ni = NTFS_I(vi);
- /*
- * Only $DATA attributes can be encrypted and only unnamed $DATA
- * attributes can be compressed. Index root can have the flags set but
- * this means to create compressed/encrypted files, not that the
- * attribute is compressed/encrypted. Note we need to check for
- * AT_INDEX_ALLOCATION since this is the type of both directory and
- * index inodes.
- */
- if (ni->type != AT_INDEX_ALLOCATION) {
- /* If attribute is encrypted, deny access, just like NT4. */
- if (NInoEncrypted(ni)) {
- BUG_ON(ni->type != AT_DATA);
- err = -EACCES;
- goto err_out;
- }
- /* Compressed data streams are handled in compress.c. */
- if (NInoNonResident(ni) && NInoCompressed(ni)) {
- BUG_ON(ni->type != AT_DATA);
- BUG_ON(ni->name_len);
- return ntfs_read_compressed_block(page);
- }
- }
- /* NInoNonResident() == NInoIndexAllocPresent() */
- if (NInoNonResident(ni)) {
- /* Normal, non-resident data stream. */
- return ntfs_read_block(folio);
- }
- /*
- * Attribute is resident, implying it is not compressed or encrypted.
- * This also means the attribute is smaller than an mft record and
- * hence smaller than a page, so can simply zero out any pages with
- * index above 0. Note the attribute can actually be marked compressed
- * but if it is resident the actual data is not compressed so we are
- * ok to ignore the compressed flag here.
- */
- if (unlikely(page->index > 0)) {
- zero_user(page, 0, PAGE_SIZE);
- goto done;
- }
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- /* Map, pin, and lock the mft record. */
- mrec = map_mft_record(base_ni);
- if (IS_ERR(mrec)) {
- err = PTR_ERR(mrec);
- goto err_out;
- }
- /*
- * If a parallel write made the attribute non-resident, drop the mft
- * record and retry the read_folio.
- */
- if (unlikely(NInoNonResident(ni))) {
- unmap_mft_record(base_ni);
- goto retry_readpage;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto unm_err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err))
- goto put_unm_err_out;
- attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
- read_lock_irqsave(&ni->size_lock, flags);
- if (unlikely(attr_len > ni->initialized_size))
- attr_len = ni->initialized_size;
- i_size = i_size_read(vi);
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (unlikely(attr_len > i_size)) {
- /* Race with shrinking truncate. */
- attr_len = i_size;
- }
- addr = kmap_atomic(page);
- /* Copy the data to the page. */
- memcpy(addr, (u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset),
- attr_len);
- /* Zero the remainder of the page. */
- memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
- flush_dcache_page(page);
- kunmap_atomic(addr);
-put_unm_err_out:
- ntfs_attr_put_search_ctx(ctx);
-unm_err_out:
- unmap_mft_record(base_ni);
-done:
- SetPageUptodate(page);
-err_out:
- unlock_page(page);
- return err;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_write_block - write a @folio to the backing store
- * @folio: page cache folio to write out
- * @wbc: writeback control structure
- *
- * This function is for writing folios belonging to non-resident, non-mst
- * protected attributes to their backing store.
- *
- * For a folio with buffers, map and write the dirty buffers asynchronously
- * under folio writeback. For a folio without buffers, create buffers for the
- * folio, then proceed as above.
- *
- * If a folio doesn't have buffers the folio dirty state is definitive. If
- * a folio does have buffers, the folio dirty state is just a hint,
- * and the buffer dirty state is definitive. (A hint which has rules:
- * dirty buffers against a clean folio is illegal. Other combinations are
- * legal and need to be handled. In particular a dirty folio containing
- * clean buffers for example.)
- *
- * Return 0 on success and -errno on error.
- *
- * Based on ntfs_read_block() and __block_write_full_folio().
- */
-static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc)
-{
- VCN vcn;
- LCN lcn;
- s64 initialized_size;
- loff_t i_size;
- sector_t block, dblock, iblock;
- struct inode *vi;
- ntfs_inode *ni;
- ntfs_volume *vol;
- runlist_element *rl;
- struct buffer_head *bh, *head;
- unsigned long flags;
- unsigned int blocksize, vcn_ofs;
- int err;
- bool need_end_writeback;
- unsigned char blocksize_bits;
-
- vi = folio->mapping->host;
- ni = NTFS_I(vi);
- vol = ni->vol;
-
- ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
- "0x%lx.", ni->mft_no, ni->type, folio->index);
-
- BUG_ON(!NInoNonResident(ni));
- BUG_ON(NInoMstProtected(ni));
- blocksize = vol->sb->s_blocksize;
- blocksize_bits = vol->sb->s_blocksize_bits;
- head = folio_buffers(folio);
- if (!head) {
- BUG_ON(!folio_test_uptodate(folio));
- head = create_empty_buffers(folio, blocksize,
- (1 << BH_Uptodate) | (1 << BH_Dirty));
- }
- bh = head;
-
- /* NOTE: Different naming scheme to ntfs_read_block()! */
-
- /* The first block in the folio. */
- block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
-
- read_lock_irqsave(&ni->size_lock, flags);
- i_size = i_size_read(vi);
- initialized_size = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
-
- /* The first out of bounds block for the data size. */
- dblock = (i_size + blocksize - 1) >> blocksize_bits;
-
- /* The last (fully or partially) initialized block. */
- iblock = initialized_size >> blocksize_bits;
-
- /*
- * Be very careful. We have no exclusion from block_dirty_folio
- * here, and the (potentially unmapped) buffers may become dirty at
- * any time. If a buffer becomes dirty here after we've inspected it
- * then we just miss that fact, and the folio stays dirty.
- *
- * Buffers outside i_size may be dirtied by block_dirty_folio;
- * handle that here by just cleaning them.
- */
-
- /*
- * Loop through all the buffers in the folio, mapping all the dirty
- * buffers to disk addresses and handling any aliases from the
- * underlying block device's mapping.
- */
- rl = NULL;
- err = 0;
- do {
- bool is_retry = false;
-
- if (unlikely(block >= dblock)) {
- /*
- * Mapped buffers outside i_size will occur, because
- * this folio can be outside i_size when there is a
- * truncate in progress. The contents of such buffers
- * were zeroed by ntfs_writepage().
- *
- * FIXME: What about the small race window where
- * ntfs_writepage() has not done any clearing because
- * the folio was within i_size but before we get here,
- * vmtruncate() modifies i_size?
- */
- clear_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- continue;
- }
-
- /* Clean buffers are not written out, so no need to map them. */
- if (!buffer_dirty(bh))
- continue;
-
- /* Make sure we have enough initialized size. */
- if (unlikely((block >= iblock) &&
- (initialized_size < i_size))) {
- /*
- * If this folio is fully outside initialized
- * size, zero out all folios between the current
- * initialized size and the current folio. Just
- * use ntfs_read_folio() to do the zeroing
- * transparently.
- */
- if (block > iblock) {
- // TODO:
- // For each folio do:
- // - read_cache_folio()
- // Again for each folio do:
- // - wait_on_folio_locked()
- // - Check (folio_test_uptodate(folio) &&
- // !folio_test_error(folio))
- // Update initialized size in the attribute and
- // in the inode.
- // Again, for each folio do:
- // block_dirty_folio();
- // folio_put()
- // We don't need to wait on the writes.
- // Update iblock.
- }
- /*
- * The current folio straddles initialized size. Zero
- * all non-uptodate buffers and set them uptodate (and
- * dirty?). Note, there aren't any non-uptodate buffers
- * if the folio is uptodate.
- * FIXME: For an uptodate folio, the buffers may need to
- * be written out because they were not initialized on
- * disk before.
- */
- if (!folio_test_uptodate(folio)) {
- // TODO:
- // Zero any non-uptodate buffers up to i_size.
- // Set them uptodate and dirty.
- }
- // TODO:
- // Update initialized size in the attribute and in the
- // inode (up to i_size).
- // Update iblock.
- // FIXME: This is inefficient. Try to batch the two
- // size changes to happen in one go.
- ntfs_error(vol->sb, "Writing beyond initialized size "
- "is not supported yet. Sorry.");
- err = -EOPNOTSUPP;
- break;
- // Do NOT set_buffer_new() BUT DO clear buffer range
- // outside write request range.
- // set_buffer_uptodate() on complete buffers as well as
- // set_buffer_dirty().
- }
-
- /* No need to map buffers that are already mapped. */
- if (buffer_mapped(bh))
- continue;
-
- /* Unmapped, dirty buffer. Need to map it. */
- bh->b_bdev = vol->sb->s_bdev;
-
- /* Convert block into corresponding vcn and offset. */
- vcn = (VCN)block << blocksize_bits;
- vcn_ofs = vcn & vol->cluster_size_mask;
- vcn >>= vol->cluster_size_bits;
- if (!rl) {
-lock_retry_remap:
- down_read(&ni->runlist.lock);
- rl = ni->runlist.rl;
- }
- if (likely(rl != NULL)) {
- /* Seek to element containing target vcn. */
- while (rl->length && rl[1].vcn <= vcn)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
- } else
- lcn = LCN_RL_NOT_MAPPED;
- /* Successful remap. */
- if (lcn >= 0) {
- /* Setup buffer head to point to correct block. */
- bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
- vcn_ofs) >> blocksize_bits;
- set_buffer_mapped(bh);
- continue;
- }
- /* It is a hole, need to instantiate it. */
- if (lcn == LCN_HOLE) {
- u8 *kaddr;
- unsigned long *bpos, *bend;
-
- /* Check if the buffer is zero. */
- kaddr = kmap_local_folio(folio, bh_offset(bh));
- bpos = (unsigned long *)kaddr;
- bend = (unsigned long *)(kaddr + blocksize);
- do {
- if (unlikely(*bpos))
- break;
- } while (likely(++bpos < bend));
- kunmap_local(kaddr);
- if (bpos == bend) {
- /*
- * Buffer is zero and sparse, no need to write
- * it.
- */
- bh->b_blocknr = -1;
- clear_buffer_dirty(bh);
- continue;
- }
- // TODO: Instantiate the hole.
- // clear_buffer_new(bh);
- // clean_bdev_bh_alias(bh);
- ntfs_error(vol->sb, "Writing into sparse regions is "
- "not supported yet. Sorry.");
- err = -EOPNOTSUPP;
- break;
- }
- /* If first try and runlist unmapped, map and retry. */
- if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
- is_retry = true;
- /*
- * Attempt to map runlist, dropping lock for
- * the duration.
- */
- up_read(&ni->runlist.lock);
- err = ntfs_map_runlist(ni, vcn);
- if (likely(!err))
- goto lock_retry_remap;
- rl = NULL;
- } else if (!rl)
- up_read(&ni->runlist.lock);
- /*
- * If buffer is outside the runlist, truncate has cut it out
- * of the runlist. Just clean and clear the buffer and set it
- * uptodate so it can get discarded by the VM.
- */
- if (err == -ENOENT || lcn == LCN_ENOENT) {
- bh->b_blocknr = -1;
- clear_buffer_dirty(bh);
- folio_zero_range(folio, bh_offset(bh), blocksize);
- set_buffer_uptodate(bh);
- err = 0;
- continue;
- }
- /* Failed to map the buffer, even after retrying. */
- if (!err)
- err = -EIO;
- bh->b_blocknr = -1;
- ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
- "attribute type 0x%x, vcn 0x%llx, offset 0x%x "
- "because its location on disk could not be "
- "determined%s (error code %i).", ni->mft_no,
- ni->type, (unsigned long long)vcn,
- vcn_ofs, is_retry ? " even after "
- "retrying" : "", err);
- break;
- } while (block++, (bh = bh->b_this_page) != head);
-
- /* Release the lock if we took it. */
- if (rl)
- up_read(&ni->runlist.lock);
-
- /* For the error case, need to reset bh to the beginning. */
- bh = head;
-
- /* Just an optimization, so ->read_folio() is not called later. */
- if (unlikely(!folio_test_uptodate(folio))) {
- int uptodate = 1;
- do {
- if (!buffer_uptodate(bh)) {
- uptodate = 0;
- bh = head;
- break;
- }
- } while ((bh = bh->b_this_page) != head);
- if (uptodate)
- folio_mark_uptodate(folio);
- }
-
- /* Setup all mapped, dirty buffers for async write i/o. */
- do {
- if (buffer_mapped(bh) && buffer_dirty(bh)) {
- lock_buffer(bh);
- if (test_clear_buffer_dirty(bh)) {
- BUG_ON(!buffer_uptodate(bh));
- mark_buffer_async_write(bh);
- } else
- unlock_buffer(bh);
- } else if (unlikely(err)) {
- /*
- * For the error case. The buffer may have been set
- * dirty during attachment to a dirty folio.
- */
- if (err != -ENOMEM)
- clear_buffer_dirty(bh);
- }
- } while ((bh = bh->b_this_page) != head);
-
- if (unlikely(err)) {
- // TODO: Remove the -EOPNOTSUPP check later on...
- if (unlikely(err == -EOPNOTSUPP))
- err = 0;
- else if (err == -ENOMEM) {
- ntfs_warning(vol->sb, "Error allocating memory. "
- "Redirtying folio so we try again "
- "later.");
- /*
- * Put the folio back on mapping->dirty_pages, but
- * leave its buffer's dirty state as-is.
- */
- folio_redirty_for_writepage(wbc, folio);
- err = 0;
- } else
- folio_set_error(folio);
- }
-
- BUG_ON(folio_test_writeback(folio));
- folio_start_writeback(folio); /* Keeps try_to_free_buffers() away. */
-
- /* Submit the prepared buffers for i/o. */
- need_end_writeback = true;
- do {
- struct buffer_head *next = bh->b_this_page;
- if (buffer_async_write(bh)) {
- submit_bh(REQ_OP_WRITE, bh);
- need_end_writeback = false;
- }
- bh = next;
- } while (bh != head);
- folio_unlock(folio);
-
- /* If no i/o was started, need to end writeback here. */
- if (unlikely(need_end_writeback))
- folio_end_writeback(folio);
-
- ntfs_debug("Done.");
- return err;
-}
-
-/**
- * ntfs_write_mst_block - write a @page to the backing store
- * @page: page cache page to write out
- * @wbc: writeback control structure
- *
- * This function is for writing pages belonging to non-resident, mst protected
- * attributes to their backing store. The only supported attributes are index
- * allocation and $MFT/$DATA. Both directory inodes and index inodes are
- * supported for the index allocation case.
- *
- * The page must remain locked for the duration of the write because we apply
- * the mst fixups, write, and then undo the fixups, so if we were to unlock the
- * page before undoing the fixups, any other user of the page will see the
- * page contents as corrupt.
- *
- * We clear the page uptodate flag for the duration of the function to ensure
- * exclusion for the $MFT/$DATA case against someone mapping an mft record we
- * are about to apply the mst fixups to.
- *
- * Return 0 on success and -errno on error.
- *
- * Based on ntfs_write_block(), ntfs_mft_writepage(), and
- * write_mft_record_nolock().
- */
-static int ntfs_write_mst_block(struct page *page,
- struct writeback_control *wbc)
-{
- sector_t block, dblock, rec_block;
- struct inode *vi = page->mapping->host;
- ntfs_inode *ni = NTFS_I(vi);
- ntfs_volume *vol = ni->vol;
- u8 *kaddr;
- unsigned int rec_size = ni->itype.index.block_size;
- ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE];
- struct buffer_head *bh, *head, *tbh, *rec_start_bh;
- struct buffer_head *bhs[MAX_BUF_PER_PAGE];
- runlist_element *rl;
- int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
- unsigned bh_size, rec_size_bits;
- bool sync, is_mft, page_is_dirty, rec_is_dirty;
- unsigned char bh_size_bits;
-
- if (WARN_ON(rec_size < NTFS_BLOCK_SIZE))
- return -EINVAL;
-
- ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
- "0x%lx.", vi->i_ino, ni->type, page->index);
- BUG_ON(!NInoNonResident(ni));
- BUG_ON(!NInoMstProtected(ni));
- is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
- /*
- * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
- * in its page cache were to be marked dirty. However this should
- * never happen with the current driver and considering we do not
- * handle this case here we do want to BUG(), at least for now.
- */
- BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
- (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
- bh_size = vol->sb->s_blocksize;
- bh_size_bits = vol->sb->s_blocksize_bits;
- max_bhs = PAGE_SIZE / bh_size;
- BUG_ON(!max_bhs);
- BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
-
- /* Were we called for sync purposes? */
- sync = (wbc->sync_mode == WB_SYNC_ALL);
-
- /* Make sure we have mapped buffers. */
- bh = head = page_buffers(page);
- BUG_ON(!bh);
-
- rec_size_bits = ni->itype.index.block_size_bits;
- BUG_ON(!(PAGE_SIZE >> rec_size_bits));
- bhs_per_rec = rec_size >> bh_size_bits;
- BUG_ON(!bhs_per_rec);
-
- /* The first block in the page. */
- rec_block = block = (sector_t)page->index <<
- (PAGE_SHIFT - bh_size_bits);
-
- /* The first out of bounds block for the data size. */
- dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;
-
- rl = NULL;
- err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
- page_is_dirty = rec_is_dirty = false;
- rec_start_bh = NULL;
- do {
- bool is_retry = false;
-
- if (likely(block < rec_block)) {
- if (unlikely(block >= dblock)) {
- clear_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- continue;
- }
- /*
- * This block is not the first one in the record. We
- * ignore the buffer's dirty state because we could
- * have raced with a parallel mark_ntfs_record_dirty().
- */
- if (!rec_is_dirty)
- continue;
- if (unlikely(err2)) {
- if (err2 != -ENOMEM)
- clear_buffer_dirty(bh);
- continue;
- }
- } else /* if (block == rec_block) */ {
- BUG_ON(block > rec_block);
- /* This block is the first one in the record. */
- rec_block += bhs_per_rec;
- err2 = 0;
- if (unlikely(block >= dblock)) {
- clear_buffer_dirty(bh);
- continue;
- }
- if (!buffer_dirty(bh)) {
- /* Clean records are not written out. */
- rec_is_dirty = false;
- continue;
- }
- rec_is_dirty = true;
- rec_start_bh = bh;
- }
- /* Need to map the buffer if it is not mapped already. */
- if (unlikely(!buffer_mapped(bh))) {
- VCN vcn;
- LCN lcn;
- unsigned int vcn_ofs;
-
- bh->b_bdev = vol->sb->s_bdev;
- /* Obtain the vcn and offset of the current block. */
- vcn = (VCN)block << bh_size_bits;
- vcn_ofs = vcn & vol->cluster_size_mask;
- vcn >>= vol->cluster_size_bits;
- if (!rl) {
-lock_retry_remap:
- down_read(&ni->runlist.lock);
- rl = ni->runlist.rl;
- }
- if (likely(rl != NULL)) {
- /* Seek to element containing target vcn. */
- while (rl->length && rl[1].vcn <= vcn)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
- } else
- lcn = LCN_RL_NOT_MAPPED;
- /* Successful remap. */
- if (likely(lcn >= 0)) {
- /* Setup buffer head to correct block. */
- bh->b_blocknr = ((lcn <<
- vol->cluster_size_bits) +
- vcn_ofs) >> bh_size_bits;
- set_buffer_mapped(bh);
- } else {
- /*
- * Remap failed. Retry to map the runlist once
- * unless we are working on $MFT which always
- * has the whole of its runlist in memory.
- */
- if (!is_mft && !is_retry &&
- lcn == LCN_RL_NOT_MAPPED) {
- is_retry = true;
- /*
- * Attempt to map runlist, dropping
- * lock for the duration.
- */
- up_read(&ni->runlist.lock);
- err2 = ntfs_map_runlist(ni, vcn);
- if (likely(!err2))
- goto lock_retry_remap;
- if (err2 == -ENOMEM)
- page_is_dirty = true;
- lcn = err2;
- } else {
- err2 = -EIO;
- if (!rl)
- up_read(&ni->runlist.lock);
- }
- /* Hard error. Abort writing this record. */
- if (!err || err == -ENOMEM)
- err = err2;
- bh->b_blocknr = -1;
- ntfs_error(vol->sb, "Cannot write ntfs record "
- "0x%llx (inode 0x%lx, "
- "attribute type 0x%x) because "
- "its location on disk could "
- "not be determined (error "
- "code %lli).",
- (long long)block <<
- bh_size_bits >>
- vol->mft_record_size_bits,
- ni->mft_no, ni->type,
- (long long)lcn);
- /*
- * If this is not the first buffer, remove the
- * buffers in this record from the list of
- * buffers to write and clear their dirty bit
- * if not error -ENOMEM.
- */
- if (rec_start_bh != bh) {
- while (bhs[--nr_bhs] != rec_start_bh)
- ;
- if (err2 != -ENOMEM) {
- do {
- clear_buffer_dirty(
- rec_start_bh);
- } while ((rec_start_bh =
- rec_start_bh->
- b_this_page) !=
- bh);
- }
- }
- continue;
- }
- }
- BUG_ON(!buffer_uptodate(bh));
- BUG_ON(nr_bhs >= max_bhs);
- bhs[nr_bhs++] = bh;
- } while (block++, (bh = bh->b_this_page) != head);
- if (unlikely(rl))
- up_read(&ni->runlist.lock);
- /* If there were no dirty buffers, we are done. */
- if (!nr_bhs)
- goto done;
- /* Map the page so we can access its contents. */
- kaddr = kmap(page);
- /* Clear the page uptodate flag whilst the mst fixups are applied. */
- BUG_ON(!PageUptodate(page));
- ClearPageUptodate(page);
- for (i = 0; i < nr_bhs; i++) {
- unsigned int ofs;
-
- /* Skip buffers which are not at the beginning of records. */
- if (i % bhs_per_rec)
- continue;
- tbh = bhs[i];
- ofs = bh_offset(tbh);
- if (is_mft) {
- ntfs_inode *tni;
- unsigned long mft_no;
-
- /* Get the mft record number. */
- mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
- >> rec_size_bits;
- /* Check whether to write this mft record. */
- tni = NULL;
- if (!ntfs_may_write_mft_record(vol, mft_no,
- (MFT_RECORD*)(kaddr + ofs), &tni)) {
- /*
- * The record should not be written. This
- * means we need to redirty the page before
- * returning.
- */
- page_is_dirty = true;
- /*
- * Remove the buffers in this mft record from
- * the list of buffers to write.
- */
- do {
- bhs[i] = NULL;
- } while (++i % bhs_per_rec);
- continue;
- }
- /*
- * The record should be written. If a locked ntfs
- * inode was returned, add it to the array of locked
- * ntfs inodes.
- */
- if (tni)
- locked_nis[nr_locked_nis++] = tni;
- }
- /* Apply the mst protection fixups. */
- err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
- rec_size);
- if (unlikely(err2)) {
- if (!err || err == -ENOMEM)
- err = -EIO;
- ntfs_error(vol->sb, "Failed to apply mst fixups "
- "(inode 0x%lx, attribute type 0x%x, "
- "page index 0x%lx, page offset 0x%x)!"
- " Unmount and run chkdsk.", vi->i_ino,
- ni->type, page->index, ofs);
- /*
- * Mark all the buffers in this record clean as we do
- * not want to write corrupt data to disk.
- */
- do {
- clear_buffer_dirty(bhs[i]);
- bhs[i] = NULL;
- } while (++i % bhs_per_rec);
- continue;
- }
- nr_recs++;
- }
- /* If no records are to be written out, we are done. */
- if (!nr_recs)
- goto unm_done;
- flush_dcache_page(page);
- /* Lock buffers and start synchronous write i/o on them. */
- for (i = 0; i < nr_bhs; i++) {
- tbh = bhs[i];
- if (!tbh)
- continue;
- if (!trylock_buffer(tbh))
- BUG();
- /* The buffer dirty state is now irrelevant, just clean it. */
- clear_buffer_dirty(tbh);
- BUG_ON(!buffer_uptodate(tbh));
- BUG_ON(!buffer_mapped(tbh));
- get_bh(tbh);
- tbh->b_end_io = end_buffer_write_sync;
- submit_bh(REQ_OP_WRITE, tbh);
- }
- /* Synchronize the mft mirror now if not @sync. */
- if (is_mft && !sync)
- goto do_mirror;
-do_wait:
- /* Wait on i/o completion of buffers. */
- for (i = 0; i < nr_bhs; i++) {
- tbh = bhs[i];
- if (!tbh)
- continue;
- wait_on_buffer(tbh);
- if (unlikely(!buffer_uptodate(tbh))) {
- ntfs_error(vol->sb, "I/O error while writing ntfs "
- "record buffer (inode 0x%lx, "
- "attribute type 0x%x, page index "
- "0x%lx, page offset 0x%lx)! Unmount "
- "and run chkdsk.", vi->i_ino, ni->type,
- page->index, bh_offset(tbh));
- if (!err || err == -ENOMEM)
- err = -EIO;
- /*
- * Set the buffer uptodate so the page and buffer
- * states do not become out of sync.
- */
- set_buffer_uptodate(tbh);
- }
- }
- /* If @sync, now synchronize the mft mirror. */
- if (is_mft && sync) {
-do_mirror:
- for (i = 0; i < nr_bhs; i++) {
- unsigned long mft_no;
- unsigned int ofs;
-
- /*
- * Skip buffers which are not at the beginning of
- * records.
- */
- if (i % bhs_per_rec)
- continue;
- tbh = bhs[i];
- /* Skip removed buffers (and hence records). */
- if (!tbh)
- continue;
- ofs = bh_offset(tbh);
- /* Get the mft record number. */
- mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
- >> rec_size_bits;
- if (mft_no < vol->mftmirr_size)
- ntfs_sync_mft_mirror(vol, mft_no,
- (MFT_RECORD*)(kaddr + ofs),
- sync);
- }
- if (!sync)
- goto do_wait;
- }
- /* Remove the mst protection fixups again. */
- for (i = 0; i < nr_bhs; i++) {
- if (!(i % bhs_per_rec)) {
- tbh = bhs[i];
- if (!tbh)
- continue;
- post_write_mst_fixup((NTFS_RECORD*)(kaddr +
- bh_offset(tbh)));
- }
- }
- flush_dcache_page(page);
-unm_done:
- /* Unlock any locked inodes. */
- while (nr_locked_nis-- > 0) {
- ntfs_inode *tni, *base_tni;
-
- tni = locked_nis[nr_locked_nis];
- /* Get the base inode. */
- mutex_lock(&tni->extent_lock);
- if (tni->nr_extents >= 0)
- base_tni = tni;
- else {
- base_tni = tni->ext.base_ntfs_ino;
- BUG_ON(!base_tni);
- }
- mutex_unlock(&tni->extent_lock);
- ntfs_debug("Unlocking %s inode 0x%lx.",
- tni == base_tni ? "base" : "extent",
- tni->mft_no);
- mutex_unlock(&tni->mrec_lock);
- atomic_dec(&tni->count);
- iput(VFS_I(base_tni));
- }
- SetPageUptodate(page);
- kunmap(page);
-done:
- if (unlikely(err && err != -ENOMEM)) {
- /*
- * Set page error if there is only one ntfs record in the page.
- * Otherwise we would loose per-record granularity.
- */
- if (ni->itype.index.block_size == PAGE_SIZE)
- SetPageError(page);
- NVolSetErrors(vol);
- }
- if (page_is_dirty) {
- ntfs_debug("Page still contains one or more dirty ntfs "
- "records. Redirtying the page starting at "
- "record 0x%lx.", page->index <<
- (PAGE_SHIFT - rec_size_bits));
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- } else {
- /*
- * Keep the VM happy. This must be done otherwise the
- * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
- * the page is clean.
- */
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
- end_page_writeback(page);
- }
- if (likely(!err))
- ntfs_debug("Done.");
- return err;
-}
-
-/**
- * ntfs_writepage - write a @page to the backing store
- * @page: page cache page to write out
- * @wbc: writeback control structure
- *
- * This is called from the VM when it wants to have a dirty ntfs page cache
- * page cleaned. The VM has already locked the page and marked it clean.
- *
- * For non-resident attributes, ntfs_writepage() writes the @page by calling
- * the ntfs version of the generic block_write_full_folio() function,
- * ntfs_write_block(), which in turn if necessary creates and writes the
- * buffers associated with the page asynchronously.
- *
- * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
- * the data to the mft record (which at this stage is most likely in memory).
- * The mft record is then marked dirty and written out asynchronously via the
- * vfs inode dirty code path for the inode the mft record belongs to or via the
- * vm page dirty code path for the page the mft record is in.
- *
- * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio().
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct folio *folio = page_folio(page);
- loff_t i_size;
- struct inode *vi = folio->mapping->host;
- ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
- char *addr;
- ntfs_attr_search_ctx *ctx = NULL;
- MFT_RECORD *m = NULL;
- u32 attr_len;
- int err;
-
-retry_writepage:
- BUG_ON(!folio_test_locked(folio));
- i_size = i_size_read(vi);
- /* Is the folio fully outside i_size? (truncate in progress) */
- if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >>
- PAGE_SHIFT)) {
- /*
- * The folio may have dirty, unmapped buffers. Make them
- * freeable here, so the page does not leak.
- */
- block_invalidate_folio(folio, 0, folio_size(folio));
- folio_unlock(folio);
- ntfs_debug("Write outside i_size - truncated?");
- return 0;
- }
- /*
- * Only $DATA attributes can be encrypted and only unnamed $DATA
- * attributes can be compressed. Index root can have the flags set but
- * this means to create compressed/encrypted files, not that the
- * attribute is compressed/encrypted. Note we need to check for
- * AT_INDEX_ALLOCATION since this is the type of both directory and
- * index inodes.
- */
- if (ni->type != AT_INDEX_ALLOCATION) {
- /* If file is encrypted, deny access, just like NT4. */
- if (NInoEncrypted(ni)) {
- folio_unlock(folio);
- BUG_ON(ni->type != AT_DATA);
- ntfs_debug("Denying write access to encrypted file.");
- return -EACCES;
- }
- /* Compressed data streams are handled in compress.c. */
- if (NInoNonResident(ni) && NInoCompressed(ni)) {
- BUG_ON(ni->type != AT_DATA);
- BUG_ON(ni->name_len);
- // TODO: Implement and replace this with
- // return ntfs_write_compressed_block(page);
- folio_unlock(folio);
- ntfs_error(vi->i_sb, "Writing to compressed files is "
- "not supported yet. Sorry.");
- return -EOPNOTSUPP;
- }
- // TODO: Implement and remove this check.
- if (NInoNonResident(ni) && NInoSparse(ni)) {
- folio_unlock(folio);
- ntfs_error(vi->i_sb, "Writing to sparse files is not "
- "supported yet. Sorry.");
- return -EOPNOTSUPP;
- }
- }
- /* NInoNonResident() == NInoIndexAllocPresent() */
- if (NInoNonResident(ni)) {
- /* We have to zero every time due to mmap-at-end-of-file. */
- if (folio->index >= (i_size >> PAGE_SHIFT)) {
- /* The folio straddles i_size. */
- unsigned int ofs = i_size & (folio_size(folio) - 1);
- folio_zero_segment(folio, ofs, folio_size(folio));
- }
- /* Handle mst protected attributes. */
- if (NInoMstProtected(ni))
- return ntfs_write_mst_block(page, wbc);
- /* Normal, non-resident data stream. */
- return ntfs_write_block(folio, wbc);
- }
- /*
- * Attribute is resident, implying it is not compressed, encrypted, or
- * mst protected. This also means the attribute is smaller than an mft
- * record and hence smaller than a folio, so can simply return error on
- * any folios with index above 0. Note the attribute can actually be
- * marked compressed but if it is resident the actual data is not
- * compressed so we are ok to ignore the compressed flag here.
- */
- BUG_ON(folio_buffers(folio));
- BUG_ON(!folio_test_uptodate(folio));
- if (unlikely(folio->index > 0)) {
- ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0. "
- "Aborting write.", folio->index);
- BUG_ON(folio_test_writeback(folio));
- folio_start_writeback(folio);
- folio_unlock(folio);
- folio_end_writeback(folio);
- return -EIO;
- }
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- /* Map, pin, and lock the mft record. */
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- ctx = NULL;
- goto err_out;
- }
- /*
- * If a parallel write made the attribute non-resident, drop the mft
- * record and retry the writepage.
- */
- if (unlikely(NInoNonResident(ni))) {
- unmap_mft_record(base_ni);
- goto retry_writepage;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err))
- goto err_out;
- /*
- * Keep the VM happy. This must be done otherwise
- * PAGECACHE_TAG_DIRTY remains set even though the folio is clean.
- */
- BUG_ON(folio_test_writeback(folio));
- folio_start_writeback(folio);
- folio_unlock(folio);
- attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
- i_size = i_size_read(vi);
- if (unlikely(attr_len > i_size)) {
- /* Race with shrinking truncate or a failed truncate. */
- attr_len = i_size;
- /*
- * If the truncate failed, fix it up now. If a concurrent
- * truncate, we do its job, so it does not have to do anything.
- */
- err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr,
- attr_len);
- /* Shrinking cannot fail. */
- BUG_ON(err);
- }
- addr = kmap_local_folio(folio, 0);
- /* Copy the data from the folio to the mft record. */
- memcpy((u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset),
- addr, attr_len);
- /* Zero out of bounds area in the page cache folio. */
- memset(addr + attr_len, 0, folio_size(folio) - attr_len);
- kunmap_local(addr);
- flush_dcache_folio(folio);
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- /* We are done with the folio. */
- folio_end_writeback(folio);
- /* Finally, mark the mft record dirty, so it gets written back. */
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- return 0;
-err_out:
- if (err == -ENOMEM) {
- ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
- "page so we try again later.");
- /*
- * Put the folio back on mapping->dirty_pages, but leave its
- * buffers' dirty state as-is.
- */
- folio_redirty_for_writepage(wbc, folio);
- err = 0;
- } else {
- ntfs_error(vi->i_sb, "Resident attribute write failed with "
- "error %i.", err);
- folio_set_error(folio);
- NVolSetErrors(ni->vol);
- }
- folio_unlock(folio);
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- return err;
-}
-
-#endif /* NTFS_RW */
-
-/**
- * ntfs_bmap - map logical file block to physical device block
- * @mapping: address space mapping to which the block to be mapped belongs
- * @block: logical block to map to its physical device block
- *
- * For regular, non-resident files (i.e. not compressed and not encrypted), map
- * the logical @block belonging to the file described by the address space
- * mapping @mapping to its physical device block.
- *
- * The size of the block is equal to the @s_blocksize field of the super block
- * of the mounted file system which is guaranteed to be smaller than or equal
- * to the cluster size thus the block is guaranteed to fit entirely inside the
- * cluster which means we do not need to care how many contiguous bytes are
- * available after the beginning of the block.
- *
- * Return the physical device block if the mapping succeeded or 0 if the block
- * is sparse or there was an error.
- *
- * Note: This is a problem if someone tries to run bmap() on $Boot system file
- * as that really is in block zero but there is nothing we can do. bmap() is
- * just broken in that respect (just like it cannot distinguish sparse from
- * not available or error).
- */
-static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
-{
- s64 ofs, size;
- loff_t i_size;
- LCN lcn;
- unsigned long blocksize, flags;
- ntfs_inode *ni = NTFS_I(mapping->host);
- ntfs_volume *vol = ni->vol;
- unsigned delta;
- unsigned char blocksize_bits, cluster_size_shift;
-
- ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.",
- ni->mft_no, (unsigned long long)block);
- if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) {
- ntfs_error(vol->sb, "BMAP does not make sense for %s "
- "attributes, returning 0.",
- (ni->type != AT_DATA) ? "non-data" :
- (!NInoNonResident(ni) ? "resident" :
- "encrypted"));
- return 0;
- }
- /* None of these can happen. */
- BUG_ON(NInoCompressed(ni));
- BUG_ON(NInoMstProtected(ni));
- blocksize = vol->sb->s_blocksize;
- blocksize_bits = vol->sb->s_blocksize_bits;
- ofs = (s64)block << blocksize_bits;
- read_lock_irqsave(&ni->size_lock, flags);
- size = ni->initialized_size;
- i_size = i_size_read(VFS_I(ni));
- read_unlock_irqrestore(&ni->size_lock, flags);
- /*
- * If the offset is outside the initialized size or the block straddles
- * the initialized size then pretend it is a hole unless the
- * initialized size equals the file size.
- */
- if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size)))
- goto hole;
- cluster_size_shift = vol->cluster_size_bits;
- down_read(&ni->runlist.lock);
- lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false);
- up_read(&ni->runlist.lock);
- if (unlikely(lcn < LCN_HOLE)) {
- /*
- * Step down to an integer to avoid gcc doing a long long
- * comparision in the switch when we know @lcn is between
- * LCN_HOLE and LCN_EIO (i.e. -1 to -5).
- *
- * Otherwise older gcc (at least on some architectures) will
- * try to use __cmpdi2() which is of course not available in
- * the kernel.
- */
- switch ((int)lcn) {
- case LCN_ENOENT:
- /*
- * If the offset is out of bounds then pretend it is a
- * hole.
- */
- goto hole;
- case LCN_ENOMEM:
- ntfs_error(vol->sb, "Not enough memory to complete "
- "mapping for inode 0x%lx. "
- "Returning 0.", ni->mft_no);
- break;
- default:
- ntfs_error(vol->sb, "Failed to complete mapping for "
- "inode 0x%lx. Run chkdsk. "
- "Returning 0.", ni->mft_no);
- break;
- }
- return 0;
- }
- if (lcn < 0) {
- /* It is a hole. */
-hole:
- ntfs_debug("Done (returning hole).");
- return 0;
- }
- /*
- * The block is really allocated and fullfils all our criteria.
- * Convert the cluster to units of block size and return the result.
- */
- delta = ofs & vol->cluster_size_mask;
- if (unlikely(sizeof(block) < sizeof(lcn))) {
- block = lcn = ((lcn << cluster_size_shift) + delta) >>
- blocksize_bits;
- /* If the block number was truncated return 0. */
- if (unlikely(block != lcn)) {
- ntfs_error(vol->sb, "Physical block 0x%llx is too "
- "large to be returned, returning 0.",
- (long long)lcn);
- return 0;
- }
- } else
- block = ((lcn << cluster_size_shift) + delta) >>
- blocksize_bits;
- ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn);
- return block;
-}
-
-/*
- * ntfs_normal_aops - address space operations for normal inodes and attributes
- *
- * Note these are not used for compressed or mst protected inodes and
- * attributes.
- */
-const struct address_space_operations ntfs_normal_aops = {
- .read_folio = ntfs_read_folio,
-#ifdef NTFS_RW
- .writepage = ntfs_writepage,
- .dirty_folio = block_dirty_folio,
-#endif /* NTFS_RW */
- .bmap = ntfs_bmap,
- .migrate_folio = buffer_migrate_folio,
- .is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_folio = generic_error_remove_folio,
-};
-
-/*
- * ntfs_compressed_aops - address space operations for compressed inodes
- */
-const struct address_space_operations ntfs_compressed_aops = {
- .read_folio = ntfs_read_folio,
-#ifdef NTFS_RW
- .writepage = ntfs_writepage,
- .dirty_folio = block_dirty_folio,
-#endif /* NTFS_RW */
- .migrate_folio = buffer_migrate_folio,
- .is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_folio = generic_error_remove_folio,
-};
-
-/*
- * ntfs_mst_aops - general address space operations for mst protecteed inodes
- * and attributes
- */
-const struct address_space_operations ntfs_mst_aops = {
- .read_folio = ntfs_read_folio, /* Fill page with data. */
-#ifdef NTFS_RW
- .writepage = ntfs_writepage, /* Write dirty page to disk. */
- .dirty_folio = filemap_dirty_folio,
-#endif /* NTFS_RW */
- .migrate_folio = buffer_migrate_folio,
- .is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_folio = generic_error_remove_folio,
-};
-
-#ifdef NTFS_RW
-
-/**
- * mark_ntfs_record_dirty - mark an ntfs record dirty
- * @page: page containing the ntfs record to mark dirty
- * @ofs: byte offset within @page at which the ntfs record begins
- *
- * Set the buffers and the page in which the ntfs record is located dirty.
- *
- * The latter also marks the vfs inode the ntfs record belongs to dirty
- * (I_DIRTY_PAGES only).
- *
- * If the page does not have buffers, we create them and set them uptodate.
- * The page may not be locked which is why we need to handle the buffers under
- * the mapping->i_private_lock. Once the buffers are marked dirty we no longer
- * need the lock since try_to_free_buffers() does not free dirty buffers.
- */
-void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
- struct address_space *mapping = page->mapping;
- ntfs_inode *ni = NTFS_I(mapping->host);
- struct buffer_head *bh, *head, *buffers_to_free = NULL;
- unsigned int end, bh_size, bh_ofs;
-
- BUG_ON(!PageUptodate(page));
- end = ofs + ni->itype.index.block_size;
- bh_size = VFS_I(ni)->i_sb->s_blocksize;
- spin_lock(&mapping->i_private_lock);
- if (unlikely(!page_has_buffers(page))) {
- spin_unlock(&mapping->i_private_lock);
- bh = head = alloc_page_buffers(page, bh_size, true);
- spin_lock(&mapping->i_private_lock);
- if (likely(!page_has_buffers(page))) {
- struct buffer_head *tail;
-
- do {
- set_buffer_uptodate(bh);
- tail = bh;
- bh = bh->b_this_page;
- } while (bh);
- tail->b_this_page = head;
- attach_page_private(page, head);
- } else
- buffers_to_free = bh;
- }
- bh = head = page_buffers(page);
- BUG_ON(!bh);
- do {
- bh_ofs = bh_offset(bh);
- if (bh_ofs + bh_size <= ofs)
- continue;
- if (unlikely(bh_ofs >= end))
- break;
- set_buffer_dirty(bh);
- } while ((bh = bh->b_this_page) != head);
- spin_unlock(&mapping->i_private_lock);
- filemap_dirty_folio(mapping, page_folio(page));
- if (unlikely(buffers_to_free)) {
- do {
- bh = buffers_to_free->b_this_page;
- free_buffer_head(buffers_to_free);
- buffers_to_free = bh;
- } while (buffers_to_free);
- }
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
deleted file mode 100644
index 8d0958a149cb..000000000000
--- a/fs/ntfs/aops.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * aops.h - Defines for NTFS kernel address space operations and page cache
- * handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_AOPS_H
-#define _LINUX_NTFS_AOPS_H
-
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-#include <linux/fs.h>
-
-#include "inode.h"
-
-/**
- * ntfs_unmap_page - release a page that was mapped using ntfs_map_page()
- * @page: the page to release
- *
- * Unpin, unmap and release a page that was obtained from ntfs_map_page().
- */
-static inline void ntfs_unmap_page(struct page *page)
-{
- kunmap(page);
- put_page(page);
-}
-
-/**
- * ntfs_map_page - map a page into accessible memory, reading it if necessary
- * @mapping: address space for which to obtain the page
- * @index: index into the page cache for @mapping of the page to map
- *
- * Read a page from the page cache of the address space @mapping at position
- * @index, where @index is in units of PAGE_SIZE, and not in bytes.
- *
- * If the page is not in memory it is loaded from disk first using the
- * read_folio method defined in the address space operations of @mapping
- * and the page is added to the page cache of @mapping in the process.
- *
- * If the page belongs to an mst protected attribute and it is marked as such
- * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no
- * error checking is performed. This means the caller has to verify whether
- * the ntfs record(s) contained in the page are valid or not using one of the
- * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are
- * expecting to see. (For details of the macros, see fs/ntfs/layout.h.)
- *
- * If the page is in high memory it is mapped into memory directly addressible
- * by the kernel.
- *
- * Finally the page count is incremented, thus pinning the page into place.
- *
- * The above means that page_address(page) can be used on all pages obtained
- * with ntfs_map_page() to get the kernel virtual address of the page.
- *
- * When finished with the page, the caller has to call ntfs_unmap_page() to
- * unpin, unmap and release the page.
- *
- * Note this does not grant exclusive access. If such is desired, the caller
- * must provide it independently of the ntfs_{un}map_page() calls by using
- * a {rw_}semaphore or other means of serialization. A spin lock cannot be
- * used as ntfs_map_page() can block.
- *
- * The unlocked and uptodate page is returned on success or an encoded error
- * on failure. Caller has to test for error using the IS_ERR() macro on the
- * return value. If that evaluates to 'true', the negative error code can be
- * obtained using PTR_ERR() on the return value of ntfs_map_page().
- */
-static inline struct page *ntfs_map_page(struct address_space *mapping,
- unsigned long index)
-{
- struct page *page = read_mapping_page(mapping, index, NULL);
-
- if (!IS_ERR(page))
- kmap(page);
- return page;
-}
-
-#ifdef NTFS_RW
-
-extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_AOPS_H */
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
deleted file mode 100644
index f79408f9127a..000000000000
--- a/fs/ntfs/attrib.c
+++ /dev/null
@@ -1,2624 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/buffer_head.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/writeback.h>
-
-#include "attrib.h"
-#include "debug.h"
-#include "layout.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "ntfs.h"
-#include "types.h"
-
-/**
- * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode
- * @ni: ntfs inode for which to map (part of) a runlist
- * @vcn: map runlist part containing this vcn
- * @ctx: active attribute search context if present or NULL if not
- *
- * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record. This is needed when ntfs_map_runlist_nolock() encounters unmapped
- * runlist fragments and allows their mapping. If you do not have the mft
- * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock()
- * will perform the necessary mapping and unmapping.
- *
- * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and
- * restores it before returning. Thus, @ctx will be left pointing to the same
- * attribute on return as on entry. However, the actual pointers in @ctx may
- * point to different memory locations on return, so you must remember to reset
- * any cached pointers from the @ctx, i.e. after the call to
- * ntfs_map_runlist_nolock(), you will probably want to do:
- * m = ctx->mrec;
- * a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- *
- * Return 0 on success and -errno on error. There is one special error code
- * which is not an error as such. This is -ENOENT. It means that @vcn is out
- * of bounds of the runlist.
- *
- * Note the runlist can be NULL after this function returns if @vcn is zero and
- * the attribute has zero allocated size, i.e. there simply is no runlist.
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- * is no longer valid, i.e. you need to either call
- * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- * In that case PTR_ERR(@ctx->mrec) will give you the error code for
- * why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- * and is locked on return. Note the runlist will be modified.
- * - If @ctx is NULL, the base mft record of @ni must not be mapped on
- * entry and it will be left unmapped on return.
- * - If @ctx is not NULL, the base mft record must be mapped on entry
- * and it will be left mapped on return.
- */
-int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
-{
- VCN end_vcn;
- unsigned long flags;
- ntfs_inode *base_ni;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- runlist_element *rl;
- struct page *put_this_page = NULL;
- int err = 0;
- bool ctx_is_temporary, ctx_needs_reset;
- ntfs_attr_search_ctx old_ctx = { NULL, };
-
- ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
- (unsigned long long)vcn);
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- if (!ctx) {
- ctx_is_temporary = ctx_needs_reset = true;
- m = map_mft_record(base_ni);
- if (IS_ERR(m))
- return PTR_ERR(m);
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- } else {
- VCN allocated_size_vcn;
-
- BUG_ON(IS_ERR(ctx->mrec));
- a = ctx->attr;
- BUG_ON(!a->non_resident);
- ctx_is_temporary = false;
- end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
- read_lock_irqsave(&ni->size_lock, flags);
- allocated_size_vcn = ni->allocated_size >>
- ni->vol->cluster_size_bits;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (!a->data.non_resident.lowest_vcn && end_vcn <= 0)
- end_vcn = allocated_size_vcn - 1;
- /*
- * If we already have the attribute extent containing @vcn in
- * @ctx, no need to look it up again. We slightly cheat in
- * that if vcn exceeds the allocated size, we will refuse to
- * map the runlist below, so there is definitely no need to get
- * the right attribute extent.
- */
- if (vcn >= allocated_size_vcn || (a->type == ni->type &&
- a->name_length == ni->name_len &&
- !memcmp((u8*)a + le16_to_cpu(a->name_offset),
- ni->name, ni->name_len) &&
- sle64_to_cpu(a->data.non_resident.lowest_vcn)
- <= vcn && end_vcn >= vcn))
- ctx_needs_reset = false;
- else {
- /* Save the old search context. */
- old_ctx = *ctx;
- /*
- * If the currently mapped (extent) inode is not the
- * base inode we will unmap it when we reinitialize the
- * search context which means we need to get a
- * reference to the page containing the mapped mft
- * record so we do not accidentally drop changes to the
- * mft record when it has not been marked dirty yet.
- */
- if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino !=
- old_ctx.base_ntfs_ino) {
- put_this_page = old_ctx.ntfs_ino->page;
- get_page(put_this_page);
- }
- /*
- * Reinitialize the search context so we can lookup the
- * needed attribute extent.
- */
- ntfs_attr_reinit_search_ctx(ctx);
- ctx_needs_reset = true;
- }
- }
- if (ctx_needs_reset) {
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, vcn, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- BUG_ON(!ctx->attr->non_resident);
- }
- a = ctx->attr;
- /*
- * Only decompress the mapping pairs if @vcn is inside it. Otherwise
- * we get into problems when we try to map an out of bounds vcn because
- * we then try to map the already mapped runlist fragment and
- * ntfs_mapping_pairs_decompress() fails.
- */
- end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
- if (unlikely(vcn && vcn >= end_vcn)) {
- err = -ENOENT;
- goto err_out;
- }
- rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl);
- if (IS_ERR(rl))
- err = PTR_ERR(rl);
- else
- ni->runlist.rl = rl;
-err_out:
- if (ctx_is_temporary) {
- if (likely(ctx))
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- } else if (ctx_needs_reset) {
- /*
- * If there is no attribute list, restoring the search context
- * is accomplished simply by copying the saved context back over
- * the caller supplied context. If there is an attribute list,
- * things are more complicated as we need to deal with mapping
- * of mft records and resulting potential changes in pointers.
- */
- if (NInoAttrList(base_ni)) {
- /*
- * If the currently mapped (extent) inode is not the
- * one we had before, we need to unmap it and map the
- * old one.
- */
- if (ctx->ntfs_ino != old_ctx.ntfs_ino) {
- /*
- * If the currently mapped inode is not the
- * base inode, unmap it.
- */
- if (ctx->base_ntfs_ino && ctx->ntfs_ino !=
- ctx->base_ntfs_ino) {
- unmap_extent_mft_record(ctx->ntfs_ino);
- ctx->mrec = ctx->base_mrec;
- BUG_ON(!ctx->mrec);
- }
- /*
- * If the old mapped inode is not the base
- * inode, map it.
- */
- if (old_ctx.base_ntfs_ino &&
- old_ctx.ntfs_ino !=
- old_ctx.base_ntfs_ino) {
-retry_map:
- ctx->mrec = map_mft_record(
- old_ctx.ntfs_ino);
- /*
- * Something bad has happened. If out
- * of memory retry till it succeeds.
- * Any other errors are fatal and we
- * return the error code in ctx->mrec.
- * Let the caller deal with it... We
- * just need to fudge things so the
- * caller can reinit and/or put the
- * search context safely.
- */
- if (IS_ERR(ctx->mrec)) {
- if (PTR_ERR(ctx->mrec) ==
- -ENOMEM) {
- schedule();
- goto retry_map;
- } else
- old_ctx.ntfs_ino =
- old_ctx.
- base_ntfs_ino;
- }
- }
- }
- /* Update the changed pointers in the saved context. */
- if (ctx->mrec != old_ctx.mrec) {
- if (!IS_ERR(ctx->mrec))
- old_ctx.attr = (ATTR_RECORD*)(
- (u8*)ctx->mrec +
- ((u8*)old_ctx.attr -
- (u8*)old_ctx.mrec));
- old_ctx.mrec = ctx->mrec;
- }
- }
- /* Restore the search context to the saved one. */
- *ctx = old_ctx;
- /*
- * We drop the reference on the page we took earlier. In the
- * case that IS_ERR(ctx->mrec) is true this means we might lose
- * some changes to the mft record that had been made between
- * the last time it was marked dirty/written out and now. This
- * at this stage is not a problem as the mapping error is fatal
- * enough that the mft record cannot be written out anyway and
- * the caller is very likely to shutdown the whole inode
- * immediately and mark the volume dirty for chkdsk to pick up
- * the pieces anyway.
- */
- if (put_this_page)
- put_page(put_this_page);
- }
- return err;
-}
-
-/**
- * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode
- * @ni: ntfs inode for which to map (part of) a runlist
- * @vcn: map runlist part containing this vcn
- *
- * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
- *
- * Return 0 on success and -errno on error. There is one special error code
- * which is not an error as such. This is -ENOENT. It means that @vcn is out
- * of bounds of the runlist.
- *
- * Locking: - The runlist must be unlocked on entry and is unlocked on return.
- * - This function takes the runlist lock for writing and may modify
- * the runlist.
- */
-int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
-{
- int err = 0;
-
- down_write(&ni->runlist.lock);
- /* Make sure someone else didn't do the work while we were sleeping. */
- if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
- LCN_RL_NOT_MAPPED))
- err = ntfs_map_runlist_nolock(ni, vcn, NULL);
- up_write(&ni->runlist.lock);
- return err;
-}
-
-/**
- * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode
- * @ni: ntfs inode of the attribute whose runlist to search
- * @vcn: vcn to convert
- * @write_locked: true if the runlist is locked for writing
- *
- * Find the virtual cluster number @vcn in the runlist of the ntfs attribute
- * described by the ntfs inode @ni and return the corresponding logical cluster
- * number (lcn).
- *
- * If the @vcn is not mapped yet, the attempt is made to map the attribute
- * extent containing the @vcn and the vcn to lcn conversion is retried.
- *
- * If @write_locked is true the caller has locked the runlist for writing and
- * if false for reading.
- *
- * Since lcns must be >= 0, we use negative return codes with special meaning:
- *
- * Return code Meaning / Description
- * ==========================================
- * LCN_HOLE Hole / not allocated on disk.
- * LCN_ENOENT There is no such vcn in the runlist, i.e. @vcn is out of bounds.
- * LCN_ENOMEM Not enough memory to map runlist.
- * LCN_EIO Critical error (runlist/file is corrupt, i/o error, etc).
- *
- * Locking: - The runlist must be locked on entry and is left locked on return.
- * - If @write_locked is 'false', i.e. the runlist is locked for reading,
- * the lock may be dropped inside the function so you cannot rely on
- * the runlist still being the same when this function returns.
- */
-LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
- const bool write_locked)
-{
- LCN lcn;
- unsigned long flags;
- bool is_retry = false;
-
- BUG_ON(!ni);
- ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
- ni->mft_no, (unsigned long long)vcn,
- write_locked ? "write" : "read");
- BUG_ON(!NInoNonResident(ni));
- BUG_ON(vcn < 0);
- if (!ni->runlist.rl) {
- read_lock_irqsave(&ni->size_lock, flags);
- if (!ni->allocated_size) {
- read_unlock_irqrestore(&ni->size_lock, flags);
- return LCN_ENOENT;
- }
- read_unlock_irqrestore(&ni->size_lock, flags);
- }
-retry_remap:
- /* Convert vcn to lcn. If that fails map the runlist and retry once. */
- lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn);
- if (likely(lcn >= LCN_HOLE)) {
- ntfs_debug("Done, lcn 0x%llx.", (long long)lcn);
- return lcn;
- }
- if (lcn != LCN_RL_NOT_MAPPED) {
- if (lcn != LCN_ENOENT)
- lcn = LCN_EIO;
- } else if (!is_retry) {
- int err;
-
- if (!write_locked) {
- up_read(&ni->runlist.lock);
- down_write(&ni->runlist.lock);
- if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) !=
- LCN_RL_NOT_MAPPED)) {
- up_write(&ni->runlist.lock);
- down_read(&ni->runlist.lock);
- goto retry_remap;
- }
- }
- err = ntfs_map_runlist_nolock(ni, vcn, NULL);
- if (!write_locked) {
- up_write(&ni->runlist.lock);
- down_read(&ni->runlist.lock);
- }
- if (likely(!err)) {
- is_retry = true;
- goto retry_remap;
- }
- if (err == -ENOENT)
- lcn = LCN_ENOENT;
- else if (err == -ENOMEM)
- lcn = LCN_ENOMEM;
- else
- lcn = LCN_EIO;
- }
- if (lcn != LCN_ENOENT)
- ntfs_error(ni->vol->sb, "Failed with error code %lli.",
- (long long)lcn);
- return lcn;
-}
-
-/**
- * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode
- * @ni: ntfs inode describing the runlist to search
- * @vcn: vcn to find
- * @ctx: active attribute search context if present or NULL if not
- *
- * Find the virtual cluster number @vcn in the runlist described by the ntfs
- * inode @ni and return the address of the runlist element containing the @vcn.
- *
- * If the @vcn is not mapped yet, the attempt is made to map the attribute
- * extent containing the @vcn and the vcn to lcn conversion is retried.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record. This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped
- * runlist fragments and allows their mapping. If you do not have the mft
- * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock()
- * will perform the necessary mapping and unmapping.
- *
- * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and
- * restores it before returning. Thus, @ctx will be left pointing to the same
- * attribute on return as on entry. However, the actual pointers in @ctx may
- * point to different memory locations on return, so you must remember to reset
- * any cached pointers from the @ctx, i.e. after the call to
- * ntfs_attr_find_vcn_nolock(), you will probably want to do:
- * m = ctx->mrec;
- * a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- * Note you need to distinguish between the lcn of the returned runlist element
- * being >= 0 and LCN_HOLE. In the later case you have to return zeroes on
- * read and allocate clusters on write.
- *
- * Return the runlist element containing the @vcn on success and
- * ERR_PTR(-errno) on error. You need to test the return value with IS_ERR()
- * to decide if the return is success or failure and PTR_ERR() to get to the
- * error code if IS_ERR() is true.
- *
- * The possible error return codes are:
- * -ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds.
- * -ENOMEM - Not enough memory to map runlist.
- * -EIO - Critical error (runlist/file is corrupt, i/o error, etc).
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- * is no longer valid, i.e. you need to either call
- * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- * In that case PTR_ERR(@ctx->mrec) will give you the error code for
- * why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- * and is locked on return. Note the runlist may be modified when
- * needed runlist fragments need to be mapped.
- * - If @ctx is NULL, the base mft record of @ni must not be mapped on
- * entry and it will be left unmapped on return.
- * - If @ctx is not NULL, the base mft record must be mapped on entry
- * and it will be left mapped on return.
- */
-runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
- ntfs_attr_search_ctx *ctx)
-{
- unsigned long flags;
- runlist_element *rl;
- int err = 0;
- bool is_retry = false;
-
- BUG_ON(!ni);
- ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.",
- ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out");
- BUG_ON(!NInoNonResident(ni));
- BUG_ON(vcn < 0);
- if (!ni->runlist.rl) {
- read_lock_irqsave(&ni->size_lock, flags);
- if (!ni->allocated_size) {
- read_unlock_irqrestore(&ni->size_lock, flags);
- return ERR_PTR(-ENOENT);
- }
- read_unlock_irqrestore(&ni->size_lock, flags);
- }
-retry_remap:
- rl = ni->runlist.rl;
- if (likely(rl && vcn >= rl[0].vcn)) {
- while (likely(rl->length)) {
- if (unlikely(vcn < rl[1].vcn)) {
- if (likely(rl->lcn >= LCN_HOLE)) {
- ntfs_debug("Done.");
- return rl;
- }
- break;
- }
- rl++;
- }
- if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) {
- if (likely(rl->lcn == LCN_ENOENT))
- err = -ENOENT;
- else
- err = -EIO;
- }
- }
- if (!err && !is_retry) {
- /*
- * If the search context is invalid we cannot map the unmapped
- * region.
- */
- if (IS_ERR(ctx->mrec))
- err = PTR_ERR(ctx->mrec);
- else {
- /*
- * The @vcn is in an unmapped region, map the runlist
- * and retry.
- */
- err = ntfs_map_runlist_nolock(ni, vcn, ctx);
- if (likely(!err)) {
- is_retry = true;
- goto retry_remap;
- }
- }
- if (err == -EINVAL)
- err = -EIO;
- } else if (!err)
- err = -EIO;
- if (err != -ENOENT)
- ntfs_error(ni->vol->sb, "Failed with error code %i.", err);
- return ERR_PTR(err);
-}
-
-/**
- * ntfs_attr_find - find (next) attribute in mft record
- * @type: attribute type to find
- * @name: attribute name to find (optional, i.e. NULL means don't care)
- * @name_len: attribute name length (only needed if @name present)
- * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
- * @val: attribute value to find (optional, resident attributes only)
- * @val_len: attribute value length
- * @ctx: search context with mft record and attribute to search from
- *
- * You should not need to call this function directly. Use ntfs_attr_lookup()
- * instead.
- *
- * ntfs_attr_find() takes a search context @ctx as parameter and searches the
- * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an
- * attribute of @type, optionally @name and @val.
- *
- * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will
- * point to the found attribute.
- *
- * If the attribute is not found, ntfs_attr_find() returns -ENOENT and
- * @ctx->attr will point to the attribute before which the attribute being
- * searched for would need to be inserted if such an action were to be desired.
- *
- * On actual error, ntfs_attr_find() returns -EIO. In this case @ctx->attr is
- * undefined and in particular do not rely on it not changing.
- *
- * If @ctx->is_first is 'true', the search begins with @ctx->attr itself. If it
- * is 'false', the search begins after @ctx->attr.
- *
- * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and
- * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record
- * @ctx->mrec belongs. This is so we can get at the ntfs volume and hence at
- * the upcase table. If @ic is CASE_SENSITIVE, the comparison is case
- * sensitive. When @name is present, @name_len is the @name length in Unicode
- * characters.
- *
- * If @name is not present (NULL), we assume that the unnamed attribute is
- * being searched for.
- *
- * Finally, the resident attribute value @val is looked for, if present. If
- * @val is not present (NULL), @val_len is ignored.
- *
- * ntfs_attr_find() only searches the specified mft record and it ignores the
- * presence of an attribute list attribute (unless it is the one being searched
- * for, obviously). If you need to take attribute lists into consideration,
- * use ntfs_attr_lookup() instead (see below). This also means that you cannot
- * use ntfs_attr_find() to search for extent records of non-resident
- * attributes, as extents with lowest_vcn != 0 are usually described by the
- * attribute list attribute only. - Note that it is possible that the first
- * extent is only in the attribute list while the last extent is in the base
- * mft record, so do not rely on being able to find the first extent in the
- * base mft record.
- *
- * Warning: Never use @val when looking for attribute types which can be
- * non-resident as this most likely will result in a crash!
- */
-static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
- const u32 name_len, const IGNORE_CASE_BOOL ic,
- const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
-{
- ATTR_RECORD *a;
- ntfs_volume *vol = ctx->ntfs_ino->vol;
- ntfschar *upcase = vol->upcase;
- u32 upcase_len = vol->upcase_len;
-
- /*
- * Iterate over attributes in mft record starting at @ctx->attr, or the
- * attribute following that, if @ctx->is_first is 'true'.
- */
- if (ctx->is_first) {
- a = ctx->attr;
- ctx->is_first = false;
- } else
- a = (ATTR_RECORD*)((u8*)ctx->attr +
- le32_to_cpu(ctx->attr->length));
- for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
- u8 *mrec_end = (u8 *)ctx->mrec +
- le32_to_cpu(ctx->mrec->bytes_allocated);
- u8 *name_end;
-
- /* check whether ATTR_RECORD wrap */
- if ((u8 *)a < (u8 *)ctx->mrec)
- break;
-
- /* check whether Attribute Record Header is within bounds */
- if ((u8 *)a > mrec_end ||
- (u8 *)a + sizeof(ATTR_RECORD) > mrec_end)
- break;
-
- /* check whether ATTR_RECORD's name is within bounds */
- name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
- a->name_length * sizeof(ntfschar);
- if (name_end > mrec_end)
- break;
-
- ctx->attr = a;
- if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
- a->type == AT_END))
- return -ENOENT;
- if (unlikely(!a->length))
- break;
-
- /* check whether ATTR_RECORD's length wrap */
- if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a)
- break;
- /* check whether ATTR_RECORD's length is within bounds */
- if ((u8 *)a + le32_to_cpu(a->length) > mrec_end)
- break;
-
- if (a->type != type)
- continue;
- /*
- * If @name is present, compare the two names. If @name is
- * missing, assume we want an unnamed attribute.
- */
- if (!name) {
- /* The search failed if the found attribute is named. */
- if (a->name_length)
- return -ENOENT;
- } else if (!ntfs_are_names_equal(name, name_len,
- (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)),
- a->name_length, ic, upcase, upcase_len)) {
- register int rc;
-
- rc = ntfs_collate_names(name, name_len,
- (ntfschar*)((u8*)a +
- le16_to_cpu(a->name_offset)),
- a->name_length, 1, IGNORE_CASE,
- upcase, upcase_len);
- /*
- * If @name collates before a->name, there is no
- * matching attribute.
- */
- if (rc == -1)
- return -ENOENT;
- /* If the strings are not equal, continue search. */
- if (rc)
- continue;
- rc = ntfs_collate_names(name, name_len,
- (ntfschar*)((u8*)a +
- le16_to_cpu(a->name_offset)),
- a->name_length, 1, CASE_SENSITIVE,
- upcase, upcase_len);
- if (rc == -1)
- return -ENOENT;
- if (rc)
- continue;
- }
- /*
- * The names match or @name not present and attribute is
- * unnamed. If no @val specified, we have found the attribute
- * and are done.
- */
- if (!val)
- return 0;
- /* @val is present; compare values. */
- else {
- register int rc;
-
- rc = memcmp(val, (u8*)a + le16_to_cpu(
- a->data.resident.value_offset),
- min_t(u32, val_len, le32_to_cpu(
- a->data.resident.value_length)));
- /*
- * If @val collates before the current attribute's
- * value, there is no matching attribute.
- */
- if (!rc) {
- register u32 avl;
-
- avl = le32_to_cpu(
- a->data.resident.value_length);
- if (val_len == avl)
- return 0;
- if (val_len < avl)
- return -ENOENT;
- } else if (rc < 0)
- return -ENOENT;
- }
- }
- ntfs_error(vol->sb, "Inode is corrupt. Run chkdsk.");
- NVolSetErrors(vol);
- return -EIO;
-}
-
-/**
- * load_attribute_list - load an attribute list into memory
- * @vol: ntfs volume from which to read
- * @runlist: runlist of the attribute list
- * @al_start: destination buffer
- * @size: size of the destination buffer in bytes
- * @initialized_size: initialized size of the attribute list
- *
- * Walk the runlist @runlist and load all clusters from it copying them into
- * the linear buffer @al. The maximum number of bytes copied to @al is @size
- * bytes. Note, @size does not need to be a multiple of the cluster size. If
- * @initialized_size is less than @size, the region in @al between
- * @initialized_size and @size will be zeroed and not read from disk.
- *
- * Return 0 on success or -errno on error.
- */
-int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start,
- const s64 size, const s64 initialized_size)
-{
- LCN lcn;
- u8 *al = al_start;
- u8 *al_end = al + initialized_size;
- runlist_element *rl;
- struct buffer_head *bh;
- struct super_block *sb;
- unsigned long block_size;
- unsigned long block, max_block;
- int err = 0;
- unsigned char block_size_bits;
-
- ntfs_debug("Entering.");
- if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 ||
- initialized_size > size)
- return -EINVAL;
- if (!initialized_size) {
- memset(al, 0, size);
- return 0;
- }
- sb = vol->sb;
- block_size = sb->s_blocksize;
- block_size_bits = sb->s_blocksize_bits;
- down_read(&runlist->lock);
- rl = runlist->rl;
- if (!rl) {
- ntfs_error(sb, "Cannot read attribute list since runlist is "
- "missing.");
- goto err_out;
- }
- /* Read all clusters specified by the runlist one run at a time. */
- while (rl->length) {
- lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn);
- ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
- (unsigned long long)rl->vcn,
- (unsigned long long)lcn);
- /* The attribute list cannot be sparse. */
- if (lcn < 0) {
- ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed. Cannot "
- "read attribute list.");
- goto err_out;
- }
- block = lcn << vol->cluster_size_bits >> block_size_bits;
- /* Read the run from device in chunks of block_size bytes. */
- max_block = block + (rl->length << vol->cluster_size_bits >>
- block_size_bits);
- ntfs_debug("max_block = 0x%lx.", max_block);
- do {
- ntfs_debug("Reading block = 0x%lx.", block);
- bh = sb_bread(sb, block);
- if (!bh) {
- ntfs_error(sb, "sb_bread() failed. Cannot "
- "read attribute list.");
- goto err_out;
- }
- if (al + block_size >= al_end)
- goto do_final;
- memcpy(al, bh->b_data, block_size);
- brelse(bh);
- al += block_size;
- } while (++block < max_block);
- rl++;
- }
- if (initialized_size < size) {
-initialize:
- memset(al_start + initialized_size, 0, size - initialized_size);
- }
-done:
- up_read(&runlist->lock);
- return err;
-do_final:
- if (al < al_end) {
- /*
- * Partial block.
- *
- * Note: The attribute list can be smaller than its allocation
- * by multiple clusters. This has been encountered by at least
- * two people running Windows XP, thus we cannot do any
- * truncation sanity checking here. (AIA)
- */
- memcpy(al, bh->b_data, al_end - al);
- brelse(bh);
- if (initialized_size < size)
- goto initialize;
- goto done;
- }
- brelse(bh);
- /* Real overflow! */
- ntfs_error(sb, "Attribute list buffer overflow. Read attribute list "
- "is truncated.");
-err_out:
- err = -EIO;
- goto done;
-}
-
-/**
- * ntfs_external_attr_find - find an attribute in the attribute list of an inode
- * @type: attribute type to find
- * @name: attribute name to find (optional, i.e. NULL means don't care)
- * @name_len: attribute name length (only needed if @name present)
- * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
- * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only)
- * @val: attribute value to find (optional, resident attributes only)
- * @val_len: attribute value length
- * @ctx: search context with mft record and attribute to search from
- *
- * You should not need to call this function directly. Use ntfs_attr_lookup()
- * instead.
- *
- * Find an attribute by searching the attribute list for the corresponding
- * attribute list entry. Having found the entry, map the mft record if the
- * attribute is in a different mft record/inode, ntfs_attr_find() the attribute
- * in there and return it.
- *
- * On first search @ctx->ntfs_ino must be the base mft record and @ctx must
- * have been obtained from a call to ntfs_attr_get_search_ctx(). On subsequent
- * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is
- * then the base inode).
- *
- * After finishing with the attribute/mft record you need to call
- * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
- * mapped inodes, etc).
- *
- * If the attribute is found, ntfs_external_attr_find() returns 0 and
- * @ctx->attr will point to the found attribute. @ctx->mrec will point to the
- * mft record in which @ctx->attr is located and @ctx->al_entry will point to
- * the attribute list entry for the attribute.
- *
- * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and
- * @ctx->attr will point to the attribute in the base mft record before which
- * the attribute being searched for would need to be inserted if such an action
- * were to be desired. @ctx->mrec will point to the mft record in which
- * @ctx->attr is located and @ctx->al_entry will point to the attribute list
- * entry of the attribute before which the attribute being searched for would
- * need to be inserted if such an action were to be desired.
- *
- * Thus to insert the not found attribute, one wants to add the attribute to
- * @ctx->mrec (the base mft record) and if there is not enough space, the
- * attribute should be placed in a newly allocated extent mft record. The
- * attribute list entry for the inserted attribute should be inserted in the
- * attribute list attribute at @ctx->al_entry.
- *
- * On actual error, ntfs_external_attr_find() returns -EIO. In this case
- * @ctx->attr is undefined and in particular do not rely on it not changing.
- */
-static int ntfs_external_attr_find(const ATTR_TYPE type,
- const ntfschar *name, const u32 name_len,
- const IGNORE_CASE_BOOL ic, const VCN lowest_vcn,
- const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
-{
- ntfs_inode *base_ni, *ni;
- ntfs_volume *vol;
- ATTR_LIST_ENTRY *al_entry, *next_al_entry;
- u8 *al_start, *al_end;
- ATTR_RECORD *a;
- ntfschar *al_name;
- u32 al_name_len;
- int err = 0;
- static const char *es = " Unmount and run chkdsk.";
-
- ni = ctx->ntfs_ino;
- base_ni = ctx->base_ntfs_ino;
- ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type);
- if (!base_ni) {
- /* First call happens with the base mft record. */
- base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino;
- ctx->base_mrec = ctx->mrec;
- }
- if (ni == base_ni)
- ctx->base_attr = ctx->attr;
- if (type == AT_END)
- goto not_found;
- vol = base_ni->vol;
- al_start = base_ni->attr_list;
- al_end = al_start + base_ni->attr_list_size;
- if (!ctx->al_entry)
- ctx->al_entry = (ATTR_LIST_ENTRY*)al_start;
- /*
- * Iterate over entries in attribute list starting at @ctx->al_entry,
- * or the entry following that, if @ctx->is_first is 'true'.
- */
- if (ctx->is_first) {
- al_entry = ctx->al_entry;
- ctx->is_first = false;
- } else
- al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry +
- le16_to_cpu(ctx->al_entry->length));
- for (;; al_entry = next_al_entry) {
- /* Out of bounds check. */
- if ((u8*)al_entry < base_ni->attr_list ||
- (u8*)al_entry > al_end)
- break; /* Inode is corrupt. */
- ctx->al_entry = al_entry;
- /* Catch the end of the attribute list. */
- if ((u8*)al_entry == al_end)
- goto not_found;
- if (!al_entry->length)
- break;
- if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
- le16_to_cpu(al_entry->length) > al_end)
- break;
- next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
- le16_to_cpu(al_entry->length));
- if (le32_to_cpu(al_entry->type) > le32_to_cpu(type))
- goto not_found;
- if (type != al_entry->type)
- continue;
- /*
- * If @name is present, compare the two names. If @name is
- * missing, assume we want an unnamed attribute.
- */
- al_name_len = al_entry->name_length;
- al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset);
- if (!name) {
- if (al_name_len)
- goto not_found;
- } else if (!ntfs_are_names_equal(al_name, al_name_len, name,
- name_len, ic, vol->upcase, vol->upcase_len)) {
- register int rc;
-
- rc = ntfs_collate_names(name, name_len, al_name,
- al_name_len, 1, IGNORE_CASE,
- vol->upcase, vol->upcase_len);
- /*
- * If @name collates before al_name, there is no
- * matching attribute.
- */
- if (rc == -1)
- goto not_found;
- /* If the strings are not equal, continue search. */
- if (rc)
- continue;
- /*
- * FIXME: Reverse engineering showed 0, IGNORE_CASE but
- * that is inconsistent with ntfs_attr_find(). The
- * subsequent rc checks were also different. Perhaps I
- * made a mistake in one of the two. Need to recheck
- * which is correct or at least see what is going on...
- * (AIA)
- */
- rc = ntfs_collate_names(name, name_len, al_name,
- al_name_len, 1, CASE_SENSITIVE,
- vol->upcase, vol->upcase_len);
- if (rc == -1)
- goto not_found;
- if (rc)
- continue;
- }
- /*
- * The names match or @name not present and attribute is
- * unnamed. Now check @lowest_vcn. Continue search if the
- * next attribute list entry still fits @lowest_vcn. Otherwise
- * we have reached the right one or the search has failed.
- */
- if (lowest_vcn && (u8*)next_al_entry >= al_start &&
- (u8*)next_al_entry + 6 < al_end &&
- (u8*)next_al_entry + le16_to_cpu(
- next_al_entry->length) <= al_end &&
- sle64_to_cpu(next_al_entry->lowest_vcn) <=
- lowest_vcn &&
- next_al_entry->type == al_entry->type &&
- next_al_entry->name_length == al_name_len &&
- ntfs_are_names_equal((ntfschar*)((u8*)
- next_al_entry +
- next_al_entry->name_offset),
- next_al_entry->name_length,
- al_name, al_name_len, CASE_SENSITIVE,
- vol->upcase, vol->upcase_len))
- continue;
- if (MREF_LE(al_entry->mft_reference) == ni->mft_no) {
- if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) {
- ntfs_error(vol->sb, "Found stale mft "
- "reference in attribute list "
- "of base inode 0x%lx.%s",
- base_ni->mft_no, es);
- err = -EIO;
- break;
- }
- } else { /* Mft references do not match. */
- /* If there is a mapped record unmap it first. */
- if (ni != base_ni)
- unmap_extent_mft_record(ni);
- /* Do we want the base record back? */
- if (MREF_LE(al_entry->mft_reference) ==
- base_ni->mft_no) {
- ni = ctx->ntfs_ino = base_ni;
- ctx->mrec = ctx->base_mrec;
- } else {
- /* We want an extent record. */
- ctx->mrec = map_extent_mft_record(base_ni,
- le64_to_cpu(
- al_entry->mft_reference), &ni);
- if (IS_ERR(ctx->mrec)) {
- ntfs_error(vol->sb, "Failed to map "
- "extent mft record "
- "0x%lx of base inode "
- "0x%lx.%s",
- MREF_LE(al_entry->
- mft_reference),
- base_ni->mft_no, es);
- err = PTR_ERR(ctx->mrec);
- if (err == -ENOENT)
- err = -EIO;
- /* Cause @ctx to be sanitized below. */
- ni = NULL;
- break;
- }
- ctx->ntfs_ino = ni;
- }
- ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
- le16_to_cpu(ctx->mrec->attrs_offset));
- }
- /*
- * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the
- * mft record containing the attribute represented by the
- * current al_entry.
- */
- /*
- * We could call into ntfs_attr_find() to find the right
- * attribute in this mft record but this would be less
- * efficient and not quite accurate as ntfs_attr_find() ignores
- * the attribute instance numbers for example which become
- * important when one plays with attribute lists. Also,
- * because a proper match has been found in the attribute list
- * entry above, the comparison can now be optimized. So it is
- * worth re-implementing a simplified ntfs_attr_find() here.
- */
- a = ctx->attr;
- /*
- * Use a manual loop so we can still use break and continue
- * with the same meanings as above.
- */
-do_next_attr_loop:
- if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
- le32_to_cpu(ctx->mrec->bytes_allocated))
- break;
- if (a->type == AT_END)
- break;
- if (!a->length)
- break;
- if (al_entry->instance != a->instance)
- goto do_next_attr;
- /*
- * If the type and/or the name are mismatched between the
- * attribute list entry and the attribute record, there is
- * corruption so we break and return error EIO.
- */
- if (al_entry->type != a->type)
- break;
- if (!ntfs_are_names_equal((ntfschar*)((u8*)a +
- le16_to_cpu(a->name_offset)), a->name_length,
- al_name, al_name_len, CASE_SENSITIVE,
- vol->upcase, vol->upcase_len))
- break;
- ctx->attr = a;
- /*
- * If no @val specified or @val specified and it matches, we
- * have found it!
- */
- if (!val || (!a->non_resident && le32_to_cpu(
- a->data.resident.value_length) == val_len &&
- !memcmp((u8*)a +
- le16_to_cpu(a->data.resident.value_offset),
- val, val_len))) {
- ntfs_debug("Done, found.");
- return 0;
- }
-do_next_attr:
- /* Proceed to the next attribute in the current mft record. */
- a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
- goto do_next_attr_loop;
- }
- if (!err) {
- ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt "
- "attribute list attribute.%s", base_ni->mft_no,
- es);
- err = -EIO;
- }
- if (ni != base_ni) {
- if (ni)
- unmap_extent_mft_record(ni);
- ctx->ntfs_ino = base_ni;
- ctx->mrec = ctx->base_mrec;
- ctx->attr = ctx->base_attr;
- }
- if (err != -ENOMEM)
- NVolSetErrors(vol);
- return err;
-not_found:
- /*
- * If we were looking for AT_END, we reset the search context @ctx and
- * use ntfs_attr_find() to seek to the end of the base mft record.
- */
- if (type == AT_END) {
- ntfs_attr_reinit_search_ctx(ctx);
- return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len,
- ctx);
- }
- /*
- * The attribute was not found. Before we return, we want to ensure
- * @ctx->mrec and @ctx->attr indicate the position at which the
- * attribute should be inserted in the base mft record. Since we also
- * want to preserve @ctx->al_entry we cannot reinitialize the search
- * context using ntfs_attr_reinit_search_ctx() as this would set
- * @ctx->al_entry to NULL. Thus we do the necessary bits manually (see
- * ntfs_attr_init_search_ctx() below). Note, we _only_ preserve
- * @ctx->al_entry as the remaining fields (base_*) are identical to
- * their non base_ counterparts and we cannot set @ctx->base_attr
- * correctly yet as we do not know what @ctx->attr will be set to by
- * the call to ntfs_attr_find() below.
- */
- if (ni != base_ni)
- unmap_extent_mft_record(ni);
- ctx->mrec = ctx->base_mrec;
- ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
- le16_to_cpu(ctx->mrec->attrs_offset));
- ctx->is_first = true;
- ctx->ntfs_ino = base_ni;
- ctx->base_ntfs_ino = NULL;
- ctx->base_mrec = NULL;
- ctx->base_attr = NULL;
- /*
- * In case there are multiple matches in the base mft record, need to
- * keep enumerating until we get an attribute not found response (or
- * another error), otherwise we would keep returning the same attribute
- * over and over again and all programs using us for enumeration would
- * lock up in a tight loop.
- */
- do {
- err = ntfs_attr_find(type, name, name_len, ic, val, val_len,
- ctx);
- } while (!err);
- ntfs_debug("Done, not found.");
- return err;
-}
-
-/**
- * ntfs_attr_lookup - find an attribute in an ntfs inode
- * @type: attribute type to find
- * @name: attribute name to find (optional, i.e. NULL means don't care)
- * @name_len: attribute name length (only needed if @name present)
- * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
- * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only)
- * @val: attribute value to find (optional, resident attributes only)
- * @val_len: attribute value length
- * @ctx: search context with mft record and attribute to search from
- *
- * Find an attribute in an ntfs inode. On first search @ctx->ntfs_ino must
- * be the base mft record and @ctx must have been obtained from a call to
- * ntfs_attr_get_search_ctx().
- *
- * This function transparently handles attribute lists and @ctx is used to
- * continue searches where they were left off at.
- *
- * After finishing with the attribute/mft record you need to call
- * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
- * mapped inodes, etc).
- *
- * Return 0 if the search was successful and -errno if not.
- *
- * When 0, @ctx->attr is the found attribute and it is in mft record
- * @ctx->mrec. If an attribute list attribute is present, @ctx->al_entry is
- * the attribute list entry of the found attribute.
- *
- * When -ENOENT, @ctx->attr is the attribute which collates just after the
- * attribute being searched for, i.e. if one wants to add the attribute to the
- * mft record this is the correct place to insert it into. If an attribute
- * list attribute is present, @ctx->al_entry is the attribute list entry which
- * collates just after the attribute list entry of the attribute being searched
- * for, i.e. if one wants to add the attribute to the mft record this is the
- * correct place to insert its attribute list entry into.
- *
- * When -errno != -ENOENT, an error occurred during the lookup. @ctx->attr is
- * then undefined and in particular you should not rely on it not changing.
- */
-int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
- const u32 name_len, const IGNORE_CASE_BOOL ic,
- const VCN lowest_vcn, const u8 *val, const u32 val_len,
- ntfs_attr_search_ctx *ctx)
-{
- ntfs_inode *base_ni;
-
- ntfs_debug("Entering.");
- BUG_ON(IS_ERR(ctx->mrec));
- if (ctx->base_ntfs_ino)
- base_ni = ctx->base_ntfs_ino;
- else
- base_ni = ctx->ntfs_ino;
- /* Sanity check, just for debugging really. */
- BUG_ON(!base_ni);
- if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST)
- return ntfs_attr_find(type, name, name_len, ic, val, val_len,
- ctx);
- return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn,
- val, val_len, ctx);
-}
-
-/**
- * ntfs_attr_init_search_ctx - initialize an attribute search context
- * @ctx: attribute search context to initialize
- * @ni: ntfs inode with which to initialize the search context
- * @mrec: mft record with which to initialize the search context
- *
- * Initialize the attribute search context @ctx with @ni and @mrec.
- */
-static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx,
- ntfs_inode *ni, MFT_RECORD *mrec)
-{
- *ctx = (ntfs_attr_search_ctx) {
- .mrec = mrec,
- /* Sanity checks are performed elsewhere. */
- .attr = (ATTR_RECORD*)((u8*)mrec +
- le16_to_cpu(mrec->attrs_offset)),
- .is_first = true,
- .ntfs_ino = ni,
- };
-}
-
-/**
- * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context
- * @ctx: attribute search context to reinitialize
- *
- * Reinitialize the attribute search context @ctx, unmapping an associated
- * extent mft record if present, and initialize the search context again.
- *
- * This is used when a search for a new attribute is being started to reset
- * the search context to the beginning.
- */
-void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx)
-{
- if (likely(!ctx->base_ntfs_ino)) {
- /* No attribute list. */
- ctx->is_first = true;
- /* Sanity checks are performed elsewhere. */
- ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
- le16_to_cpu(ctx->mrec->attrs_offset));
- /*
- * This needs resetting due to ntfs_external_attr_find() which
- * can leave it set despite having zeroed ctx->base_ntfs_ino.
- */
- ctx->al_entry = NULL;
- return;
- } /* Attribute list. */
- if (ctx->ntfs_ino != ctx->base_ntfs_ino)
- unmap_extent_mft_record(ctx->ntfs_ino);
- ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec);
- return;
-}
-
-/**
- * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context
- * @ni: ntfs inode with which to initialize the search context
- * @mrec: mft record with which to initialize the search context
- *
- * Allocate a new attribute search context, initialize it with @ni and @mrec,
- * and return it. Return NULL if allocation failed.
- */
-ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec)
-{
- ntfs_attr_search_ctx *ctx;
-
- ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS);
- if (ctx)
- ntfs_attr_init_search_ctx(ctx, ni, mrec);
- return ctx;
-}
-
-/**
- * ntfs_attr_put_search_ctx - release an attribute search context
- * @ctx: attribute search context to free
- *
- * Release the attribute search context @ctx, unmapping an associated extent
- * mft record if present.
- */
-void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx)
-{
- if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino)
- unmap_extent_mft_record(ctx->ntfs_ino);
- kmem_cache_free(ntfs_attr_ctx_cache, ctx);
- return;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file
- * @vol: ntfs volume to which the attribute belongs
- * @type: attribute type which to find
- *
- * Search for the attribute definition record corresponding to the attribute
- * @type in the $AttrDef system file.
- *
- * Return the attribute type definition record if found and NULL if not found.
- */
-static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol,
- const ATTR_TYPE type)
-{
- ATTR_DEF *ad;
-
- BUG_ON(!vol->attrdef);
- BUG_ON(!type);
- for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef <
- vol->attrdef_size && ad->type; ++ad) {
- /* We have not found it yet, carry on searching. */
- if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type)))
- continue;
- /* We found the attribute; return it. */
- if (likely(ad->type == type))
- return ad;
- /* We have gone too far already. No point in continuing. */
- break;
- }
- /* Attribute not found. */
- ntfs_debug("Attribute type 0x%x not found in $AttrDef.",
- le32_to_cpu(type));
- return NULL;
-}
-
-/**
- * ntfs_attr_size_bounds_check - check a size of an attribute type for validity
- * @vol: ntfs volume to which the attribute belongs
- * @type: attribute type which to check
- * @size: size which to check
- *
- * Check whether the @size in bytes is valid for an attribute of @type on the
- * ntfs volume @vol. This information is obtained from $AttrDef system file.
- *
- * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not
- * listed in $AttrDef.
- */
-int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type,
- const s64 size)
-{
- ATTR_DEF *ad;
-
- BUG_ON(size < 0);
- /*
- * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not
- * listed in $AttrDef.
- */
- if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024))
- return -ERANGE;
- /* Get the $AttrDef entry for the attribute @type. */
- ad = ntfs_attr_find_in_attrdef(vol, type);
- if (unlikely(!ad))
- return -ENOENT;
- /* Do the bounds check. */
- if (((sle64_to_cpu(ad->min_size) > 0) &&
- size < sle64_to_cpu(ad->min_size)) ||
- ((sle64_to_cpu(ad->max_size) > 0) && size >
- sle64_to_cpu(ad->max_size)))
- return -ERANGE;
- return 0;
-}
-
-/**
- * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident
- * @vol: ntfs volume to which the attribute belongs
- * @type: attribute type which to check
- *
- * Check whether the attribute of @type on the ntfs volume @vol is allowed to
- * be non-resident. This information is obtained from $AttrDef system file.
- *
- * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and
- * -ENOENT if the attribute is not listed in $AttrDef.
- */
-int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type)
-{
- ATTR_DEF *ad;
-
- /* Find the attribute definition record in $AttrDef. */
- ad = ntfs_attr_find_in_attrdef(vol, type);
- if (unlikely(!ad))
- return -ENOENT;
- /* Check the flags and return the result. */
- if (ad->flags & ATTR_DEF_RESIDENT)
- return -EPERM;
- return 0;
-}
-
-/**
- * ntfs_attr_can_be_resident - check if an attribute can be resident
- * @vol: ntfs volume to which the attribute belongs
- * @type: attribute type which to check
- *
- * Check whether the attribute of @type on the ntfs volume @vol is allowed to
- * be resident. This information is derived from our ntfs knowledge and may
- * not be completely accurate, especially when user defined attributes are
- * present. Basically we allow everything to be resident except for index
- * allocation and $EA attributes.
- *
- * Return 0 if the attribute is allowed to be non-resident and -EPERM if not.
- *
- * Warning: In the system file $MFT the attribute $Bitmap must be non-resident
- * otherwise windows will not boot (blue screen of death)! We cannot
- * check for this here as we do not know which inode's $Bitmap is
- * being asked about so the caller needs to special case this.
- */
-int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type)
-{
- if (type == AT_INDEX_ALLOCATION)
- return -EPERM;
- return 0;
-}
-
-/**
- * ntfs_attr_record_resize - resize an attribute record
- * @m: mft record containing attribute record
- * @a: attribute record to resize
- * @new_size: new size in bytes to which to resize the attribute record @a
- *
- * Resize the attribute record @a, i.e. the resident part of the attribute, in
- * the mft record @m to @new_size bytes.
- *
- * Return 0 on success and -errno on error. The following error codes are
- * defined:
- * -ENOSPC - Not enough space in the mft record @m to perform the resize.
- *
- * Note: On error, no modifications have been performed whatsoever.
- *
- * Warning: If you make a record smaller without having copied all the data you
- * are interested in the data may be overwritten.
- */
-int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
-{
- ntfs_debug("Entering for new_size %u.", new_size);
- /* Align to 8 bytes if it is not already done. */
- if (new_size & 7)
- new_size = (new_size + 7) & ~7;
- /* If the actual attribute length has changed, move things around. */
- if (new_size != le32_to_cpu(a->length)) {
- u32 new_muse = le32_to_cpu(m->bytes_in_use) -
- le32_to_cpu(a->length) + new_size;
- /* Not enough space in this mft record. */
- if (new_muse > le32_to_cpu(m->bytes_allocated))
- return -ENOSPC;
- /* Move attributes following @a to their new location. */
- memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length),
- le32_to_cpu(m->bytes_in_use) - ((u8*)a -
- (u8*)m) - le32_to_cpu(a->length));
- /* Adjust @m to reflect the change in used space. */
- m->bytes_in_use = cpu_to_le32(new_muse);
- /* Adjust @a to reflect the new size. */
- if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length))
- a->length = cpu_to_le32(new_size);
- }
- return 0;
-}
-
-/**
- * ntfs_resident_attr_value_resize - resize the value of a resident attribute
- * @m: mft record containing attribute record
- * @a: attribute record whose value to resize
- * @new_size: new size in bytes to which to resize the attribute value of @a
- *
- * Resize the value of the attribute @a in the mft record @m to @new_size bytes.
- * If the value is made bigger, the newly allocated space is cleared.
- *
- * Return 0 on success and -errno on error. The following error codes are
- * defined:
- * -ENOSPC - Not enough space in the mft record @m to perform the resize.
- *
- * Note: On error, no modifications have been performed whatsoever.
- *
- * Warning: If you make a record smaller without having copied all the data you
- * are interested in the data may be overwritten.
- */
-int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
- const u32 new_size)
-{
- u32 old_size;
-
- /* Resize the resident part of the attribute record. */
- if (ntfs_attr_record_resize(m, a,
- le16_to_cpu(a->data.resident.value_offset) + new_size))
- return -ENOSPC;
- /*
- * The resize succeeded! If we made the attribute value bigger, clear
- * the area between the old size and @new_size.
- */
- old_size = le32_to_cpu(a->data.resident.value_length);
- if (new_size > old_size)
- memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
- old_size, 0, new_size - old_size);
- /* Finally update the length of the attribute value. */
- a->data.resident.value_length = cpu_to_le32(new_size);
- return 0;
-}
-
-/**
- * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
- * @ni: ntfs inode describing the attribute to convert
- * @data_size: size of the resident data to copy to the non-resident attribute
- *
- * Convert the resident ntfs attribute described by the ntfs inode @ni to a
- * non-resident one.
- *
- * @data_size must be equal to the attribute value size. This is needed since
- * we need to know the size before we can map the mft record and our callers
- * always know it. The reason we cannot simply read the size from the vfs
- * inode i_size is that this is not necessarily uptodate. This happens when
- * ntfs_attr_make_non_resident() is called in the ->truncate call path(s).
- *
- * Return 0 on success and -errno on error. The following error return codes
- * are defined:
- * -EPERM - The attribute is not allowed to be non-resident.
- * -ENOMEM - Not enough memory.
- * -ENOSPC - Not enough disk space.
- * -EINVAL - Attribute not defined on the volume.
- * -EIO - I/o error or other error.
- * Note that -ENOSPC is also returned in the case that there is not enough
- * space in the mft record to do the conversion. This can happen when the mft
- * record is already very full. The caller is responsible for trying to make
- * space in the mft record and trying again. FIXME: Do we need a separate
- * error return code for this kind of -ENOSPC or is it always worth trying
- * again in case the attribute may then fit in a resident state so no need to
- * make it non-resident at all? Ho-hum... (AIA)
- *
- * NOTE to self: No changes in the attribute list are required to move from
- * a resident to a non-resident attribute.
- *
- * Locking: - The caller must hold i_mutex on the inode.
- */
-int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
-{
- s64 new_size;
- struct inode *vi = VFS_I(ni);
- ntfs_volume *vol = ni->vol;
- ntfs_inode *base_ni;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- ntfs_attr_search_ctx *ctx;
- struct page *page;
- runlist_element *rl;
- u8 *kaddr;
- unsigned long flags;
- int mp_size, mp_ofs, name_ofs, arec_size, err, err2;
- u32 attr_size;
- u8 old_res_attr_flags;
-
- /* Check that the attribute is allowed to be non-resident. */
- err = ntfs_attr_can_be_non_resident(vol, ni->type);
- if (unlikely(err)) {
- if (err == -EPERM)
- ntfs_debug("Attribute is not allowed to be "
- "non-resident.");
- else
- ntfs_debug("Attribute not defined on the NTFS "
- "volume!");
- return err;
- }
- /*
- * FIXME: Compressed and encrypted attributes are not supported when
- * writing and we should never have gotten here for them.
- */
- BUG_ON(NInoCompressed(ni));
- BUG_ON(NInoEncrypted(ni));
- /*
- * The size needs to be aligned to a cluster boundary for allocation
- * purposes.
- */
- new_size = (data_size + vol->cluster_size - 1) &
- ~(vol->cluster_size - 1);
- if (new_size > 0) {
- /*
- * Will need the page later and since the page lock nests
- * outside all ntfs locks, we need to get the page now.
- */
- page = find_or_create_page(vi->i_mapping, 0,
- mapping_gfp_mask(vi->i_mapping));
- if (unlikely(!page))
- return -ENOMEM;
- /* Start by allocating clusters to hold the attribute value. */
- rl = ntfs_cluster_alloc(vol, 0, new_size >>
- vol->cluster_size_bits, -1, DATA_ZONE, true);
- if (IS_ERR(rl)) {
- err = PTR_ERR(rl);
- ntfs_debug("Failed to allocate cluster%s, error code "
- "%i.", (new_size >>
- vol->cluster_size_bits) > 1 ? "s" : "",
- err);
- goto page_err_out;
- }
- } else {
- rl = NULL;
- page = NULL;
- }
- /* Determine the size of the mapping pairs array. */
- mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1);
- if (unlikely(mp_size < 0)) {
- err = mp_size;
- ntfs_debug("Failed to get size for mapping pairs array, error "
- "code %i.", err);
- goto rl_err_out;
- }
- down_write(&ni->runlist.lock);
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- ctx = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- m = ctx->mrec;
- a = ctx->attr;
- BUG_ON(NInoNonResident(ni));
- BUG_ON(a->non_resident);
- /*
- * Calculate new offsets for the name and the mapping pairs array.
- */
- if (NInoSparse(ni) || NInoCompressed(ni))
- name_ofs = (offsetof(ATTR_REC,
- data.non_resident.compressed_size) +
- sizeof(a->data.non_resident.compressed_size) +
- 7) & ~7;
- else
- name_ofs = (offsetof(ATTR_REC,
- data.non_resident.compressed_size) + 7) & ~7;
- mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
- /*
- * Determine the size of the resident part of the now non-resident
- * attribute record.
- */
- arec_size = (mp_ofs + mp_size + 7) & ~7;
- /*
- * If the page is not uptodate bring it uptodate by copying from the
- * attribute value.
- */
- attr_size = le32_to_cpu(a->data.resident.value_length);
- BUG_ON(attr_size != data_size);
- if (page && !PageUptodate(page)) {
- kaddr = kmap_atomic(page);
- memcpy(kaddr, (u8*)a +
- le16_to_cpu(a->data.resident.value_offset),
- attr_size);
- memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size);
- kunmap_atomic(kaddr);
- flush_dcache_page(page);
- SetPageUptodate(page);
- }
- /* Backup the attribute flag. */
- old_res_attr_flags = a->data.resident.flags;
- /* Resize the resident part of the attribute record. */
- err = ntfs_attr_record_resize(m, a, arec_size);
- if (unlikely(err))
- goto err_out;
- /*
- * Convert the resident part of the attribute record to describe a
- * non-resident attribute.
- */
- a->non_resident = 1;
- /* Move the attribute name if it exists and update the offset. */
- if (a->name_length)
- memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
- a->name_length * sizeof(ntfschar));
- a->name_offset = cpu_to_le16(name_ofs);
- /* Setup the fields specific to non-resident attributes. */
- a->data.non_resident.lowest_vcn = 0;
- a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >>
- vol->cluster_size_bits);
- a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs);
- memset(&a->data.non_resident.reserved, 0,
- sizeof(a->data.non_resident.reserved));
- a->data.non_resident.allocated_size = cpu_to_sle64(new_size);
- a->data.non_resident.data_size =
- a->data.non_resident.initialized_size =
- cpu_to_sle64(attr_size);
- if (NInoSparse(ni) || NInoCompressed(ni)) {
- a->data.non_resident.compression_unit = 0;
- if (NInoCompressed(ni) || vol->major_ver < 3)
- a->data.non_resident.compression_unit = 4;
- a->data.non_resident.compressed_size =
- a->data.non_resident.allocated_size;
- } else
- a->data.non_resident.compression_unit = 0;
- /* Generate the mapping pairs array into the attribute record. */
- err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs,
- arec_size - mp_ofs, rl, 0, -1, NULL);
- if (unlikely(err)) {
- ntfs_debug("Failed to build mapping pairs, error code %i.",
- err);
- goto undo_err_out;
- }
- /* Setup the in-memory attribute structure to be non-resident. */
- ni->runlist.rl = rl;
- write_lock_irqsave(&ni->size_lock, flags);
- ni->allocated_size = new_size;
- if (NInoSparse(ni) || NInoCompressed(ni)) {
- ni->itype.compressed.size = ni->allocated_size;
- if (a->data.non_resident.compression_unit) {
- ni->itype.compressed.block_size = 1U << (a->data.
- non_resident.compression_unit +
- vol->cluster_size_bits);
- ni->itype.compressed.block_size_bits =
- ffs(ni->itype.compressed.block_size) -
- 1;
- ni->itype.compressed.block_clusters = 1U <<
- a->data.non_resident.compression_unit;
- } else {
- ni->itype.compressed.block_size = 0;
- ni->itype.compressed.block_size_bits = 0;
- ni->itype.compressed.block_clusters = 0;
- }
- vi->i_blocks = ni->itype.compressed.size >> 9;
- } else
- vi->i_blocks = ni->allocated_size >> 9;
- write_unlock_irqrestore(&ni->size_lock, flags);
- /*
- * This needs to be last since the address space operations ->read_folio
- * and ->writepage can run concurrently with us as they are not
- * serialized on i_mutex. Note, we are not allowed to fail once we flip
- * this switch, which is another reason to do this last.
- */
- NInoSetNonResident(ni);
- /* Mark the mft record dirty, so it gets written back. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
- if (page) {
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
- }
- ntfs_debug("Done.");
- return 0;
-undo_err_out:
- /* Convert the attribute back into a resident attribute. */
- a->non_resident = 0;
- /* Move the attribute name if it exists and update the offset. */
- name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) +
- sizeof(a->data.resident.reserved) + 7) & ~7;
- if (a->name_length)
- memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
- a->name_length * sizeof(ntfschar));
- mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
- a->name_offset = cpu_to_le16(name_ofs);
- arec_size = (mp_ofs + attr_size + 7) & ~7;
- /* Resize the resident part of the attribute record. */
- err2 = ntfs_attr_record_resize(m, a, arec_size);
- if (unlikely(err2)) {
- /*
- * This cannot happen (well if memory corruption is at work it
- * could happen in theory), but deal with it as well as we can.
- * If the old size is too small, truncate the attribute,
- * otherwise simply give it a larger allocated size.
- * FIXME: Should check whether chkdsk complains when the
- * allocated size is much bigger than the resident value size.
- */
- arec_size = le32_to_cpu(a->length);
- if ((mp_ofs + attr_size) > arec_size) {
- err2 = attr_size;
- attr_size = arec_size - mp_ofs;
- ntfs_error(vol->sb, "Failed to undo partial resident "
- "to non-resident attribute "
- "conversion. Truncating inode 0x%lx, "
- "attribute type 0x%x from %i bytes to "
- "%i bytes to maintain metadata "
- "consistency. THIS MEANS YOU ARE "
- "LOSING %i BYTES DATA FROM THIS %s.",
- vi->i_ino,
- (unsigned)le32_to_cpu(ni->type),
- err2, attr_size, err2 - attr_size,
- ((ni->type == AT_DATA) &&
- !ni->name_len) ? "FILE": "ATTRIBUTE");
- write_lock_irqsave(&ni->size_lock, flags);
- ni->initialized_size = attr_size;
- i_size_write(vi, attr_size);
- write_unlock_irqrestore(&ni->size_lock, flags);
- }
- }
- /* Setup the fields specific to resident attributes. */
- a->data.resident.value_length = cpu_to_le32(attr_size);
- a->data.resident.value_offset = cpu_to_le16(mp_ofs);
- a->data.resident.flags = old_res_attr_flags;
- memset(&a->data.resident.reserved, 0,
- sizeof(a->data.resident.reserved));
- /* Copy the data from the page back to the attribute value. */
- if (page) {
- kaddr = kmap_atomic(page);
- memcpy((u8*)a + mp_ofs, kaddr, attr_size);
- kunmap_atomic(kaddr);
- }
- /* Setup the allocated size in the ntfs inode in case it changed. */
- write_lock_irqsave(&ni->size_lock, flags);
- ni->allocated_size = arec_size - mp_ofs;
- write_unlock_irqrestore(&ni->size_lock, flags);
- /* Mark the mft record dirty, so it gets written back. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
-err_out:
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- ni->runlist.rl = NULL;
- up_write(&ni->runlist.lock);
-rl_err_out:
- if (rl) {
- if (ntfs_cluster_free_from_rl(vol, rl) < 0) {
- ntfs_error(vol->sb, "Failed to release allocated "
- "cluster(s) in error code path. Run "
- "chkdsk to recover the lost "
- "cluster(s).");
- NVolSetErrors(vol);
- }
- ntfs_free(rl);
-page_err_out:
- unlock_page(page);
- put_page(page);
- }
- if (err == -EINVAL)
- err = -EIO;
- return err;
-}
-
-/**
- * ntfs_attr_extend_allocation - extend the allocated space of an attribute
- * @ni: ntfs inode of the attribute whose allocation to extend
- * @new_alloc_size: new size in bytes to which to extend the allocation to
- * @new_data_size: new size in bytes to which to extend the data to
- * @data_start: beginning of region which is required to be non-sparse
- *
- * Extend the allocated space of an attribute described by the ntfs inode @ni
- * to @new_alloc_size bytes. If @data_start is -1, the whole extension may be
- * implemented as a hole in the file (as long as both the volume and the ntfs
- * inode @ni have sparse support enabled). If @data_start is >= 0, then the
- * region between the old allocated size and @data_start - 1 may be made sparse
- * but the regions between @data_start and @new_alloc_size must be backed by
- * actual clusters.
- *
- * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size
- * of the attribute is extended to @new_data_size. Note that the i_size of the
- * vfs inode is not updated. Only the data size in the base attribute record
- * is updated. The caller has to update i_size separately if this is required.
- * WARNING: It is a BUG() for @new_data_size to be smaller than the old data
- * size as well as for @new_data_size to be greater than @new_alloc_size.
- *
- * For resident attributes this involves resizing the attribute record and if
- * necessary moving it and/or other attributes into extent mft records and/or
- * converting the attribute to a non-resident attribute which in turn involves
- * extending the allocation of a non-resident attribute as described below.
- *
- * For non-resident attributes this involves allocating clusters in the data
- * zone on the volume (except for regions that are being made sparse) and
- * extending the run list to describe the allocated clusters as well as
- * updating the mapping pairs array of the attribute. This in turn involves
- * resizing the attribute record and if necessary moving it and/or other
- * attributes into extent mft records and/or splitting the attribute record
- * into multiple extent attribute records.
- *
- * Also, the attribute list attribute is updated if present and in some of the
- * above cases (the ones where extent mft records/attributes come into play),
- * an attribute list attribute is created if not already present.
- *
- * Return the new allocated size on success and -errno on error. In the case
- * that an error is encountered but a partial extension at least up to
- * @data_start (if present) is possible, the allocation is partially extended
- * and this is returned. This means the caller must check the returned size to
- * determine if the extension was partial. If @data_start is -1 then partial
- * allocations are not performed.
- *
- * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA.
- *
- * Locking: This function takes the runlist lock of @ni for writing as well as
- * locking the mft record of the base ntfs inode. These locks are maintained
- * throughout execution of the function. These locks are required so that the
- * attribute can be resized safely and so that it can for example be converted
- * from resident to non-resident safely.
- *
- * TODO: At present attribute list attribute handling is not implemented.
- *
- * TODO: At present it is not safe to call this function for anything other
- * than the $DATA attribute(s) of an uncompressed and unencrypted file.
- */
-s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
- const s64 new_data_size, const s64 data_start)
-{
- VCN vcn;
- s64 ll, allocated_size, start = data_start;
- struct inode *vi = VFS_I(ni);
- ntfs_volume *vol = ni->vol;
- ntfs_inode *base_ni;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- ntfs_attr_search_ctx *ctx;
- runlist_element *rl, *rl2;
- unsigned long flags;
- int err, mp_size;
- u32 attr_len = 0; /* Silence stupid gcc warning. */
- bool mp_rebuilt;
-
-#ifdef DEBUG
- read_lock_irqsave(&ni->size_lock, flags);
- allocated_size = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
- "old_allocated_size 0x%llx, "
- "new_allocated_size 0x%llx, new_data_size 0x%llx, "
- "data_start 0x%llx.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type),
- (unsigned long long)allocated_size,
- (unsigned long long)new_alloc_size,
- (unsigned long long)new_data_size,
- (unsigned long long)start);
-#endif
-retry_extend:
- /*
- * For non-resident attributes, @start and @new_size need to be aligned
- * to cluster boundaries for allocation purposes.
- */
- if (NInoNonResident(ni)) {
- if (start > 0)
- start &= ~(s64)vol->cluster_size_mask;
- new_alloc_size = (new_alloc_size + vol->cluster_size - 1) &
- ~(s64)vol->cluster_size_mask;
- }
- BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size);
- /* Check if new size is allowed in $AttrDef. */
- err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size);
- if (unlikely(err)) {
- /* Only emit errors when the write will fail completely. */
- read_lock_irqsave(&ni->size_lock, flags);
- allocated_size = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (start < 0 || start >= allocated_size) {
- if (err == -ERANGE) {
- ntfs_error(vol->sb, "Cannot extend allocation "
- "of inode 0x%lx, attribute "
- "type 0x%x, because the new "
- "allocation would exceed the "
- "maximum allowed size for "
- "this attribute type.",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type));
- } else {
- ntfs_error(vol->sb, "Cannot extend allocation "
- "of inode 0x%lx, attribute "
- "type 0x%x, because this "
- "attribute type is not "
- "defined on the NTFS volume. "
- "Possible corruption! You "
- "should run chkdsk!",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type));
- }
- }
- /* Translate error code to be POSIX conformant for write(2). */
- if (err == -ERANGE)
- err = -EFBIG;
- else
- err = -EIO;
- return err;
- }
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- /*
- * We will be modifying both the runlist (if non-resident) and the mft
- * record so lock them both down.
- */
- down_write(&ni->runlist.lock);
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- ctx = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- read_lock_irqsave(&ni->size_lock, flags);
- allocated_size = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- /*
- * If non-resident, seek to the last extent. If resident, there is
- * only one extent, so seek to that.
- */
- vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits :
- 0;
- /*
- * Abort if someone did the work whilst we waited for the locks. If we
- * just converted the attribute from resident to non-resident it is
- * likely that exactly this has happened already. We cannot quite
- * abort if we need to update the data size.
- */
- if (unlikely(new_alloc_size <= allocated_size)) {
- ntfs_debug("Allocated size already exceeds requested size.");
- new_alloc_size = allocated_size;
- if (new_data_size < 0)
- goto done;
- /*
- * We want the first attribute extent so that we can update the
- * data size.
- */
- vcn = 0;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, vcn, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- m = ctx->mrec;
- a = ctx->attr;
- /* Use goto to reduce indentation. */
- if (a->non_resident)
- goto do_non_resident_extend;
- BUG_ON(NInoNonResident(ni));
- /* The total length of the attribute value. */
- attr_len = le32_to_cpu(a->data.resident.value_length);
- /*
- * Extend the attribute record to be able to store the new attribute
- * size. ntfs_attr_record_resize() will not do anything if the size is
- * not changing.
- */
- if (new_alloc_size < vol->mft_record_size &&
- !ntfs_attr_record_resize(m, a,
- le16_to_cpu(a->data.resident.value_offset) +
- new_alloc_size)) {
- /* The resize succeeded! */
- write_lock_irqsave(&ni->size_lock, flags);
- ni->allocated_size = le32_to_cpu(a->length) -
- le16_to_cpu(a->data.resident.value_offset);
- write_unlock_irqrestore(&ni->size_lock, flags);
- if (new_data_size >= 0) {
- BUG_ON(new_data_size < attr_len);
- a->data.resident.value_length =
- cpu_to_le32((u32)new_data_size);
- }
- goto flush_done;
- }
- /*
- * We have to drop all the locks so we can call
- * ntfs_attr_make_non_resident(). This could be optimised by try-
- * locking the first page cache page and only if that fails dropping
- * the locks, locking the page, and redoing all the locking and
- * lookups. While this would be a huge optimisation, it is not worth
- * it as this is definitely a slow code path.
- */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
- /*
- * Not enough space in the mft record, try to make the attribute
- * non-resident and if successful restart the extension process.
- */
- err = ntfs_attr_make_non_resident(ni, attr_len);
- if (likely(!err))
- goto retry_extend;
- /*
- * Could not make non-resident. If this is due to this not being
- * permitted for this attribute type or there not being enough space,
- * try to make other attributes non-resident. Otherwise fail.
- */
- if (unlikely(err != -EPERM && err != -ENOSPC)) {
- /* Only emit errors when the write will fail completely. */
- read_lock_irqsave(&ni->size_lock, flags);
- allocated_size = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Cannot extend allocation of "
- "inode 0x%lx, attribute type 0x%x, "
- "because the conversion from resident "
- "to non-resident attribute failed "
- "with error code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- if (err != -ENOMEM)
- err = -EIO;
- goto conv_err_out;
- }
- /* TODO: Not implemented from here, abort. */
- read_lock_irqsave(&ni->size_lock, flags);
- allocated_size = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (start < 0 || start >= allocated_size) {
- if (err == -ENOSPC)
- ntfs_error(vol->sb, "Not enough space in the mft "
- "record/on disk for the non-resident "
- "attribute value. This case is not "
- "implemented yet.");
- else /* if (err == -EPERM) */
- ntfs_error(vol->sb, "This attribute type may not be "
- "non-resident. This case is not "
- "implemented yet.");
- }
- err = -EOPNOTSUPP;
- goto conv_err_out;
-#if 0
- // TODO: Attempt to make other attributes non-resident.
- if (!err)
- goto do_resident_extend;
- /*
- * Both the attribute list attribute and the standard information
- * attribute must remain in the base inode. Thus, if this is one of
- * these attributes, we have to try to move other attributes out into
- * extent mft records instead.
- */
- if (ni->type == AT_ATTRIBUTE_LIST ||
- ni->type == AT_STANDARD_INFORMATION) {
- // TODO: Attempt to move other attributes into extent mft
- // records.
- err = -EOPNOTSUPP;
- if (!err)
- goto do_resident_extend;
- goto err_out;
- }
- // TODO: Attempt to move this attribute to an extent mft record, but
- // only if it is not already the only attribute in an mft record in
- // which case there would be nothing to gain.
- err = -EOPNOTSUPP;
- if (!err)
- goto do_resident_extend;
- /* There is nothing we can do to make enough space. )-: */
- goto err_out;
-#endif
-do_non_resident_extend:
- BUG_ON(!NInoNonResident(ni));
- if (new_alloc_size == allocated_size) {
- BUG_ON(vcn);
- goto alloc_done;
- }
- /*
- * If the data starts after the end of the old allocation, this is a
- * $DATA attribute and sparse attributes are enabled on the volume and
- * for this inode, then create a sparse region between the old
- * allocated size and the start of the data. Otherwise simply proceed
- * with filling the whole space between the old allocated size and the
- * new allocated size with clusters.
- */
- if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA ||
- !NVolSparseEnabled(vol) || NInoSparseDisabled(ni))
- goto skip_sparse;
- // TODO: This is not implemented yet. We just fill in with real
- // clusters for now...
- ntfs_debug("Inserting holes is not-implemented yet. Falling back to "
- "allocating real clusters instead.");
-skip_sparse:
- rl = ni->runlist.rl;
- if (likely(rl)) {
- /* Seek to the end of the runlist. */
- while (rl->length)
- rl++;
- }
- /* If this attribute extent is not mapped, map it now. */
- if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED ||
- (rl->lcn == LCN_ENOENT && rl > ni->runlist.rl &&
- (rl-1)->lcn == LCN_RL_NOT_MAPPED))) {
- if (!rl && !allocated_size)
- goto first_alloc;
- rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
- if (IS_ERR(rl)) {
- err = PTR_ERR(rl);
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Cannot extend allocation "
- "of inode 0x%lx, attribute "
- "type 0x%x, because the "
- "mapping of a runlist "
- "fragment failed with error "
- "code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type),
- err);
- if (err != -ENOMEM)
- err = -EIO;
- goto err_out;
- }
- ni->runlist.rl = rl;
- /* Seek to the end of the runlist. */
- while (rl->length)
- rl++;
- }
- /*
- * We now know the runlist of the last extent is mapped and @rl is at
- * the end of the runlist. We want to begin allocating clusters
- * starting at the last allocated cluster to reduce fragmentation. If
- * there are no valid LCNs in the attribute we let the cluster
- * allocator choose the starting cluster.
- */
- /* If the last LCN is a hole or simillar seek back to last real LCN. */
- while (rl->lcn < 0 && rl > ni->runlist.rl)
- rl--;
-first_alloc:
- // FIXME: Need to implement partial allocations so at least part of the
- // write can be performed when start >= 0. (Needed for POSIX write(2)
- // conformance.)
- rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits,
- (new_alloc_size - allocated_size) >>
- vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ?
- rl->lcn + rl->length : -1, DATA_ZONE, true);
- if (IS_ERR(rl2)) {
- err = PTR_ERR(rl2);
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Cannot extend allocation of "
- "inode 0x%lx, attribute type 0x%x, "
- "because the allocation of clusters "
- "failed with error code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- if (err != -ENOMEM && err != -ENOSPC)
- err = -EIO;
- goto err_out;
- }
- rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
- if (IS_ERR(rl)) {
- err = PTR_ERR(rl);
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Cannot extend allocation of "
- "inode 0x%lx, attribute type 0x%x, "
- "because the runlist merge failed "
- "with error code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- if (err != -ENOMEM)
- err = -EIO;
- if (ntfs_cluster_free_from_rl(vol, rl2)) {
- ntfs_error(vol->sb, "Failed to release allocated "
- "cluster(s) in error code path. Run "
- "chkdsk to recover the lost "
- "cluster(s).");
- NVolSetErrors(vol);
- }
- ntfs_free(rl2);
- goto err_out;
- }
- ni->runlist.rl = rl;
- ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size -
- allocated_size) >> vol->cluster_size_bits);
- /* Find the runlist element with which the attribute extent starts. */
- ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
- rl2 = ntfs_rl_find_vcn_nolock(rl, ll);
- BUG_ON(!rl2);
- BUG_ON(!rl2->length);
- BUG_ON(rl2->lcn < LCN_HOLE);
- mp_rebuilt = false;
- /* Get the size for the new mapping pairs array for this extent. */
- mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
- if (unlikely(mp_size <= 0)) {
- err = mp_size;
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Cannot extend allocation of "
- "inode 0x%lx, attribute type 0x%x, "
- "because determining the size for the "
- "mapping pairs failed with error code "
- "%i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- err = -EIO;
- goto undo_alloc;
- }
- /* Extend the attribute record to fit the bigger mapping pairs array. */
- attr_len = le32_to_cpu(a->length);
- err = ntfs_attr_record_resize(m, a, mp_size +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
- if (unlikely(err)) {
- BUG_ON(err != -ENOSPC);
- // TODO: Deal with this by moving this extent to a new mft
- // record or by starting a new extent in a new mft record,
- // possibly by extending this extent partially and filling it
- // and creating a new extent for the remainder, or by making
- // other attributes non-resident and/or by moving other
- // attributes out of this mft record.
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Not enough space in the mft "
- "record for the extended attribute "
- "record. This case is not "
- "implemented yet.");
- err = -EOPNOTSUPP;
- goto undo_alloc;
- }
- mp_rebuilt = true;
- /* Generate the mapping pairs array directly into the attr record. */
- err = ntfs_mapping_pairs_build(vol, (u8*)a +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
- mp_size, rl2, ll, -1, NULL);
- if (unlikely(err)) {
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Cannot extend allocation of "
- "inode 0x%lx, attribute type 0x%x, "
- "because building the mapping pairs "
- "failed with error code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- err = -EIO;
- goto undo_alloc;
- }
- /* Update the highest_vcn. */
- a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
- vol->cluster_size_bits) - 1);
- /*
- * We now have extended the allocated size of the attribute. Reflect
- * this in the ntfs_inode structure and the attribute record.
- */
- if (a->data.non_resident.lowest_vcn) {
- /*
- * We are not in the first attribute extent, switch to it, but
- * first ensure the changes will make it to disk later.
- */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_reinit_search_ctx(ctx);
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err))
- goto restore_undo_alloc;
- /* @m is not used any more so no need to set it. */
- a = ctx->attr;
- }
- write_lock_irqsave(&ni->size_lock, flags);
- ni->allocated_size = new_alloc_size;
- a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
- /*
- * FIXME: This would fail if @ni is a directory, $MFT, or an index,
- * since those can have sparse/compressed set. For example can be
- * set compressed even though it is not compressed itself and in that
- * case the bit means that files are to be created compressed in the
- * directory... At present this is ok as this code is only called for
- * regular files, and only for their $DATA attribute(s).
- * FIXME: The calculation is wrong if we created a hole above. For now
- * it does not matter as we never create holes.
- */
- if (NInoSparse(ni) || NInoCompressed(ni)) {
- ni->itype.compressed.size += new_alloc_size - allocated_size;
- a->data.non_resident.compressed_size =
- cpu_to_sle64(ni->itype.compressed.size);
- vi->i_blocks = ni->itype.compressed.size >> 9;
- } else
- vi->i_blocks = new_alloc_size >> 9;
- write_unlock_irqrestore(&ni->size_lock, flags);
-alloc_done:
- if (new_data_size >= 0) {
- BUG_ON(new_data_size <
- sle64_to_cpu(a->data.non_resident.data_size));
- a->data.non_resident.data_size = cpu_to_sle64(new_data_size);
- }
-flush_done:
- /* Ensure the changes make it to disk. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
-done:
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
- ntfs_debug("Done, new_allocated_size 0x%llx.",
- (unsigned long long)new_alloc_size);
- return new_alloc_size;
-restore_undo_alloc:
- if (start < 0 || start >= allocated_size)
- ntfs_error(vol->sb, "Cannot complete extension of allocation "
- "of inode 0x%lx, attribute type 0x%x, because "
- "lookup of first attribute extent failed with "
- "error code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- if (err == -ENOENT)
- err = -EIO;
- ntfs_attr_reinit_search_ctx(ctx);
- if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE,
- allocated_size >> vol->cluster_size_bits, NULL, 0,
- ctx)) {
- ntfs_error(vol->sb, "Failed to find last attribute extent of "
- "attribute in error code path. Run chkdsk to "
- "recover.");
- write_lock_irqsave(&ni->size_lock, flags);
- ni->allocated_size = new_alloc_size;
- /*
- * FIXME: This would fail if @ni is a directory... See above.
- * FIXME: The calculation is wrong if we created a hole above.
- * For now it does not matter as we never create holes.
- */
- if (NInoSparse(ni) || NInoCompressed(ni)) {
- ni->itype.compressed.size += new_alloc_size -
- allocated_size;
- vi->i_blocks = ni->itype.compressed.size >> 9;
- } else
- vi->i_blocks = new_alloc_size >> 9;
- write_unlock_irqrestore(&ni->size_lock, flags);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
- /*
- * The only thing that is now wrong is the allocated size of the
- * base attribute extent which chkdsk should be able to fix.
- */
- NVolSetErrors(vol);
- return err;
- }
- ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64(
- (allocated_size >> vol->cluster_size_bits) - 1);
-undo_alloc:
- ll = allocated_size >> vol->cluster_size_bits;
- if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) {
- ntfs_error(vol->sb, "Failed to release allocated cluster(s) "
- "in error code path. Run chkdsk to recover "
- "the lost cluster(s).");
- NVolSetErrors(vol);
- }
- m = ctx->mrec;
- a = ctx->attr;
- /*
- * If the runlist truncation fails and/or the search context is no
- * longer valid, we cannot resize the attribute record or build the
- * mapping pairs array thus we mark the inode bad so that no access to
- * the freed clusters can happen.
- */
- if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) {
- ntfs_error(vol->sb, "Failed to %s in error code path. Run "
- "chkdsk to recover.", IS_ERR(m) ?
- "restore attribute search context" :
- "truncate attribute runlist");
- NVolSetErrors(vol);
- } else if (mp_rebuilt) {
- if (ntfs_attr_record_resize(m, a, attr_len)) {
- ntfs_error(vol->sb, "Failed to restore attribute "
- "record in error code path. Run "
- "chkdsk to recover.");
- NVolSetErrors(vol);
- } else /* if (success) */ {
- if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
- a->data.non_resident.
- mapping_pairs_offset), attr_len -
- le16_to_cpu(a->data.non_resident.
- mapping_pairs_offset), rl2, ll, -1,
- NULL)) {
- ntfs_error(vol->sb, "Failed to restore "
- "mapping pairs array in error "
- "code path. Run chkdsk to "
- "recover.");
- NVolSetErrors(vol);
- }
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- }
- }
-err_out:
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
-conv_err_out:
- ntfs_debug("Failed. Returning error code %i.", err);
- return err;
-}
-
-/**
- * ntfs_attr_set - fill (a part of) an attribute with a byte
- * @ni: ntfs inode describing the attribute to fill
- * @ofs: offset inside the attribute at which to start to fill
- * @cnt: number of bytes to fill
- * @val: the unsigned 8-bit value with which to fill the attribute
- *
- * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at
- * byte offset @ofs inside the attribute with the constant byte @val.
- *
- * This function is effectively like memset() applied to an ntfs attribute.
- * Note this function actually only operates on the page cache pages belonging
- * to the ntfs attribute and it marks them dirty after doing the memset().
- * Thus it relies on the vm dirty page write code paths to cause the modified
- * pages to be written to the mft record/disk.
- *
- * Return 0 on success and -errno on error. An error code of -ESPIPE means
- * that @ofs + @cnt were outside the end of the attribute and no write was
- * performed.
- */
-int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
-{
- ntfs_volume *vol = ni->vol;
- struct address_space *mapping;
- struct page *page;
- u8 *kaddr;
- pgoff_t idx, end;
- unsigned start_ofs, end_ofs, size;
-
- ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.",
- (long long)ofs, (long long)cnt, val);
- BUG_ON(ofs < 0);
- BUG_ON(cnt < 0);
- if (!cnt)
- goto done;
- /*
- * FIXME: Compressed and encrypted attributes are not supported when
- * writing and we should never have gotten here for them.
- */
- BUG_ON(NInoCompressed(ni));
- BUG_ON(NInoEncrypted(ni));
- mapping = VFS_I(ni)->i_mapping;
- /* Work out the starting index and page offset. */
- idx = ofs >> PAGE_SHIFT;
- start_ofs = ofs & ~PAGE_MASK;
- /* Work out the ending index and page offset. */
- end = ofs + cnt;
- end_ofs = end & ~PAGE_MASK;
- /* If the end is outside the inode size return -ESPIPE. */
- if (unlikely(end > i_size_read(VFS_I(ni)))) {
- ntfs_error(vol->sb, "Request exceeds end of attribute.");
- return -ESPIPE;
- }
- end >>= PAGE_SHIFT;
- /* If there is a first partial page, need to do it the slow way. */
- if (start_ofs) {
- page = read_mapping_page(mapping, idx, NULL);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to read first partial "
- "page (error, index 0x%lx).", idx);
- return PTR_ERR(page);
- }
- /*
- * If the last page is the same as the first page, need to
- * limit the write to the end offset.
- */
- size = PAGE_SIZE;
- if (idx == end)
- size = end_ofs;
- kaddr = kmap_atomic(page);
- memset(kaddr + start_ofs, val, size - start_ofs);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
- set_page_dirty(page);
- put_page(page);
- balance_dirty_pages_ratelimited(mapping);
- cond_resched();
- if (idx == end)
- goto done;
- idx++;
- }
- /* Do the whole pages the fast way. */
- for (; idx < end; idx++) {
- /* Find or create the current page. (The page is locked.) */
- page = grab_cache_page(mapping, idx);
- if (unlikely(!page)) {
- ntfs_error(vol->sb, "Insufficient memory to grab "
- "page (index 0x%lx).", idx);
- return -ENOMEM;
- }
- kaddr = kmap_atomic(page);
- memset(kaddr, val, PAGE_SIZE);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
- /*
- * If the page has buffers, mark them uptodate since buffer
- * state and not page state is definitive in 2.6 kernels.
- */
- if (page_has_buffers(page)) {
- struct buffer_head *bh, *head;
-
- bh = head = page_buffers(page);
- do {
- set_buffer_uptodate(bh);
- } while ((bh = bh->b_this_page) != head);
- }
- /* Now that buffers are uptodate, set the page uptodate, too. */
- SetPageUptodate(page);
- /*
- * Set the page and all its buffers dirty and mark the inode
- * dirty, too. The VM will write the page later on.
- */
- set_page_dirty(page);
- /* Finally unlock and release the page. */
- unlock_page(page);
- put_page(page);
- balance_dirty_pages_ratelimited(mapping);
- cond_resched();
- }
- /* If there is a last partial page, need to do it the slow way. */
- if (end_ofs) {
- page = read_mapping_page(mapping, idx, NULL);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to read last partial page "
- "(error, index 0x%lx).", idx);
- return PTR_ERR(page);
- }
- kaddr = kmap_atomic(page);
- memset(kaddr, val, end_ofs);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
- set_page_dirty(page);
- put_page(page);
- balance_dirty_pages_ratelimited(mapping);
- cond_resched();
- }
-done:
- ntfs_debug("Done.");
- return 0;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
deleted file mode 100644
index fe0890d3d072..000000000000
--- a/fs/ntfs/attrib.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * attrib.h - Defines for attribute handling in NTFS Linux kernel driver.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_ATTRIB_H
-#define _LINUX_NTFS_ATTRIB_H
-
-#include "endian.h"
-#include "types.h"
-#include "layout.h"
-#include "inode.h"
-#include "runlist.h"
-#include "volume.h"
-
-/**
- * ntfs_attr_search_ctx - used in attribute search functions
- * @mrec: buffer containing mft record to search
- * @attr: attribute record in @mrec where to begin/continue search
- * @is_first: if true ntfs_attr_lookup() begins search with @attr, else after
- *
- * Structure must be initialized to zero before the first call to one of the
- * attribute search functions. Initialize @mrec to point to the mft record to
- * search, and @attr to point to the first attribute within @mrec (not necessary
- * if calling the _first() functions), and set @is_first to 'true' (not necessary
- * if calling the _first() functions).
- *
- * If @is_first is 'true', the search begins with @attr. If @is_first is 'false',
- * the search begins after @attr. This is so that, after the first call to one
- * of the search attribute functions, we can call the function again, without
- * any modification of the search context, to automagically get the next
- * matching attribute.
- */
-typedef struct {
- MFT_RECORD *mrec;
- ATTR_RECORD *attr;
- bool is_first;
- ntfs_inode *ntfs_ino;
- ATTR_LIST_ENTRY *al_entry;
- ntfs_inode *base_ntfs_ino;
- MFT_RECORD *base_mrec;
- ATTR_RECORD *base_attr;
-} ntfs_attr_search_ctx;
-
-extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn,
- ntfs_attr_search_ctx *ctx);
-extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
-
-extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
- const bool write_locked);
-
-extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni,
- const VCN vcn, ntfs_attr_search_ctx *ctx);
-
-int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
- const u32 name_len, const IGNORE_CASE_BOOL ic,
- const VCN lowest_vcn, const u8 *val, const u32 val_len,
- ntfs_attr_search_ctx *ctx);
-
-extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start,
- const s64 size, const s64 initialized_size);
-
-static inline s64 ntfs_attr_size(const ATTR_RECORD *a)
-{
- if (!a->non_resident)
- return (s64)le32_to_cpu(a->data.resident.value_length);
- return sle64_to_cpu(a->data.non_resident.data_size);
-}
-
-extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx);
-extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni,
- MFT_RECORD *mrec);
-extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx);
-
-#ifdef NTFS_RW
-
-extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol,
- const ATTR_TYPE type, const s64 size);
-extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol,
- const ATTR_TYPE type);
-extern int ntfs_attr_can_be_resident(const ntfs_volume *vol,
- const ATTR_TYPE type);
-
-extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
-extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
- const u32 new_size);
-
-extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size);
-
-extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
- const s64 new_data_size, const s64 data_start);
-
-extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt,
- const u8 val);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_ATTRIB_H */
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
deleted file mode 100644
index 0675b2400873..000000000000
--- a/fs/ntfs/bitmap.c
+++ /dev/null
@@ -1,179 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * bitmap.c - NTFS kernel bitmap handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/pagemap.h>
-
-#include "bitmap.h"
-#include "debug.h"
-#include "aops.h"
-#include "ntfs.h"
-
-/**
- * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
- * @vi: vfs inode describing the bitmap
- * @start_bit: first bit to set
- * @count: number of bits to set
- * @value: value to set the bits to (i.e. 0 or 1)
- * @is_rollback: if 'true' this is a rollback operation
- *
- * Set @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi to @value, where @value is either 0 or 1.
- *
- * @is_rollback should always be 'false', it is for internal use to rollback
- * errors. You probably want to use ntfs_bitmap_set_bits_in_run() instead.
- *
- * Return 0 on success and -errno on error.
- */
-int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
- const s64 count, const u8 value, const bool is_rollback)
-{
- s64 cnt = count;
- pgoff_t index, end_index;
- struct address_space *mapping;
- struct page *page;
- u8 *kaddr;
- int pos, len;
- u8 bit;
-
- BUG_ON(!vi);
- ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, "
- "value %u.%s", vi->i_ino, (unsigned long long)start_bit,
- (unsigned long long)cnt, (unsigned int)value,
- is_rollback ? " (rollback)" : "");
- BUG_ON(start_bit < 0);
- BUG_ON(cnt < 0);
- BUG_ON(value > 1);
- /*
- * Calculate the indices for the pages containing the first and last
- * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively.
- */
- index = start_bit >> (3 + PAGE_SHIFT);
- end_index = (start_bit + cnt - 1) >> (3 + PAGE_SHIFT);
-
- /* Get the page containing the first bit (@start_bit). */
- mapping = vi->i_mapping;
- page = ntfs_map_page(mapping, index);
- if (IS_ERR(page)) {
- if (!is_rollback)
- ntfs_error(vi->i_sb, "Failed to map first page (error "
- "%li), aborting.", PTR_ERR(page));
- return PTR_ERR(page);
- }
- kaddr = page_address(page);
-
- /* Set @pos to the position of the byte containing @start_bit. */
- pos = (start_bit >> 3) & ~PAGE_MASK;
-
- /* Calculate the position of @start_bit in the first byte. */
- bit = start_bit & 7;
-
- /* If the first byte is partial, modify the appropriate bits in it. */
- if (bit) {
- u8 *byte = kaddr + pos;
- while ((bit & 7) && cnt) {
- cnt--;
- if (value)
- *byte |= 1 << bit++;
- else
- *byte &= ~(1 << bit++);
- }
- /* If we are done, unmap the page and return success. */
- if (!cnt)
- goto done;
-
- /* Update @pos to the new position. */
- pos++;
- }
- /*
- * Depending on @value, modify all remaining whole bytes in the page up
- * to @cnt.
- */
- len = min_t(s64, cnt >> 3, PAGE_SIZE - pos);
- memset(kaddr + pos, value ? 0xff : 0, len);
- cnt -= len << 3;
-
- /* Update @len to point to the first not-done byte in the page. */
- if (cnt < 8)
- len += pos;
-
- /* If we are not in the last page, deal with all subsequent pages. */
- while (index < end_index) {
- BUG_ON(cnt <= 0);
-
- /* Update @index and get the next page. */
- flush_dcache_page(page);
- set_page_dirty(page);
- ntfs_unmap_page(page);
- page = ntfs_map_page(mapping, ++index);
- if (IS_ERR(page))
- goto rollback;
- kaddr = page_address(page);
- /*
- * Depending on @value, modify all remaining whole bytes in the
- * page up to @cnt.
- */
- len = min_t(s64, cnt >> 3, PAGE_SIZE);
- memset(kaddr, value ? 0xff : 0, len);
- cnt -= len << 3;
- }
- /*
- * The currently mapped page is the last one. If the last byte is
- * partial, modify the appropriate bits in it. Note, @len is the
- * position of the last byte inside the page.
- */
- if (cnt) {
- u8 *byte;
-
- BUG_ON(cnt > 7);
-
- bit = cnt;
- byte = kaddr + len;
- while (bit--) {
- if (value)
- *byte |= 1 << bit;
- else
- *byte &= ~(1 << bit);
- }
- }
-done:
- /* We are done. Unmap the page and return success. */
- flush_dcache_page(page);
- set_page_dirty(page);
- ntfs_unmap_page(page);
- ntfs_debug("Done.");
- return 0;
-rollback:
- /*
- * Current state:
- * - no pages are mapped
- * - @count - @cnt is the number of bits that have been modified
- */
- if (is_rollback)
- return PTR_ERR(page);
- if (count != cnt)
- pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt,
- value ? 0 : 1, true);
- else
- pos = 0;
- if (!pos) {
- /* Rollback was successful. */
- ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
- "%li), aborting.", PTR_ERR(page));
- } else {
- /* Rollback failed. */
- ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
- "%li) and rollback failed (error %i). "
- "Aborting and leaving inconsistent metadata. "
- "Unmount and run chkdsk.", PTR_ERR(page), pos);
- NVolSetErrors(NTFS_SB(vi->i_sb));
- }
- return PTR_ERR(page);
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h
deleted file mode 100644
index 9dd2224ca9c4..000000000000
--- a/fs/ntfs/bitmap.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * bitmap.h - Defines for NTFS kernel bitmap handling. Part of the Linux-NTFS
- * project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_BITMAP_H
-#define _LINUX_NTFS_BITMAP_H
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-
-#include "types.h"
-
-extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
- const s64 count, const u8 value, const bool is_rollback);
-
-/**
- * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
- * @vi: vfs inode describing the bitmap
- * @start_bit: first bit to set
- * @count: number of bits to set
- * @value: value to set the bits to (i.e. 0 or 1)
- *
- * Set @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi to @value, where @value is either 0 or 1.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi,
- const s64 start_bit, const s64 count, const u8 value)
-{
- return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value,
- false);
-}
-
-/**
- * ntfs_bitmap_set_run - set a run of bits in a bitmap
- * @vi: vfs inode describing the bitmap
- * @start_bit: first bit to set
- * @count: number of bits to set
- *
- * Set @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit,
- const s64 count)
-{
- return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1);
-}
-
-/**
- * ntfs_bitmap_clear_run - clear a run of bits in a bitmap
- * @vi: vfs inode describing the bitmap
- * @start_bit: first bit to clear
- * @count: number of bits to clear
- *
- * Clear @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit,
- const s64 count)
-{
- return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0);
-}
-
-/**
- * ntfs_bitmap_set_bit - set a bit in a bitmap
- * @vi: vfs inode describing the bitmap
- * @bit: bit to set
- *
- * Set bit @bit in the bitmap described by the vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit)
-{
- return ntfs_bitmap_set_run(vi, bit, 1);
-}
-
-/**
- * ntfs_bitmap_clear_bit - clear a bit in a bitmap
- * @vi: vfs inode describing the bitmap
- * @bit: bit to clear
- *
- * Clear bit @bit in the bitmap described by the vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit)
-{
- return ntfs_bitmap_clear_run(vi, bit, 1);
-}
-
-#endif /* NTFS_RW */
-
-#endif /* defined _LINUX_NTFS_BITMAP_H */
diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c
deleted file mode 100644
index 3ab6ec96abfe..000000000000
--- a/fs/ntfs/collate.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * collate.c - NTFS kernel collation handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#include "collate.h"
-#include "debug.h"
-#include "ntfs.h"
-
-static int ntfs_collate_binary(ntfs_volume *vol,
- const void *data1, const int data1_len,
- const void *data2, const int data2_len)
-{
- int rc;
-
- ntfs_debug("Entering.");
- rc = memcmp(data1, data2, min(data1_len, data2_len));
- if (!rc && (data1_len != data2_len)) {
- if (data1_len < data2_len)
- rc = -1;
- else
- rc = 1;
- }
- ntfs_debug("Done, returning %i", rc);
- return rc;
-}
-
-static int ntfs_collate_ntofs_ulong(ntfs_volume *vol,
- const void *data1, const int data1_len,
- const void *data2, const int data2_len)
-{
- int rc;
- u32 d1, d2;
-
- ntfs_debug("Entering.");
- // FIXME: We don't really want to bug here.
- BUG_ON(data1_len != data2_len);
- BUG_ON(data1_len != 4);
- d1 = le32_to_cpup(data1);
- d2 = le32_to_cpup(data2);
- if (d1 < d2)
- rc = -1;
- else {
- if (d1 == d2)
- rc = 0;
- else
- rc = 1;
- }
- ntfs_debug("Done, returning %i", rc);
- return rc;
-}
-
-typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int,
- const void *, const int);
-
-static ntfs_collate_func_t ntfs_do_collate0x0[3] = {
- ntfs_collate_binary,
- NULL/*ntfs_collate_file_name*/,
- NULL/*ntfs_collate_unicode_string*/,
-};
-
-static ntfs_collate_func_t ntfs_do_collate0x1[4] = {
- ntfs_collate_ntofs_ulong,
- NULL/*ntfs_collate_ntofs_sid*/,
- NULL/*ntfs_collate_ntofs_security_hash*/,
- NULL/*ntfs_collate_ntofs_ulongs*/,
-};
-
-/**
- * ntfs_collate - collate two data items using a specified collation rule
- * @vol: ntfs volume to which the data items belong
- * @cr: collation rule to use when comparing the items
- * @data1: first data item to collate
- * @data1_len: length in bytes of @data1
- * @data2: second data item to collate
- * @data2_len: length in bytes of @data2
- *
- * Collate the two data items @data1 and @data2 using the collation rule @cr
- * and return -1, 0, ir 1 if @data1 is found, respectively, to collate before,
- * to match, or to collate after @data2.
- *
- * For speed we use the collation rule @cr as an index into two tables of
- * function pointers to call the appropriate collation function.
- */
-int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
- const void *data1, const int data1_len,
- const void *data2, const int data2_len) {
- int i;
-
- ntfs_debug("Entering.");
- /*
- * FIXME: At the moment we only support COLLATION_BINARY and
- * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now.
- */
- BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG);
- i = le32_to_cpu(cr);
- BUG_ON(i < 0);
- if (i <= 0x02)
- return ntfs_do_collate0x0[i](vol, data1, data1_len,
- data2, data2_len);
- BUG_ON(i < 0x10);
- i -= 0x10;
- if (likely(i <= 3))
- return ntfs_do_collate0x1[i](vol, data1, data1_len,
- data2, data2_len);
- BUG();
- return 0;
-}
diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h
deleted file mode 100644
index f2255619b4f4..000000000000
--- a/fs/ntfs/collate.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * collate.h - Defines for NTFS kernel collation handling. Part of the
- * Linux-NTFS project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_COLLATE_H
-#define _LINUX_NTFS_COLLATE_H
-
-#include "types.h"
-#include "volume.h"
-
-static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
- int i;
-
- /*
- * FIXME: At the moment we only support COLLATION_BINARY and
- * COLLATION_NTOFS_ULONG, so we return false for everything else for
- * now.
- */
- if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG))
- return false;
- i = le32_to_cpu(cr);
- if (likely(((i >= 0) && (i <= 0x02)) ||
- ((i >= 0x10) && (i <= 0x13))))
- return true;
- return false;
-}
-
-extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
- const void *data1, const int data1_len,
- const void *data2, const int data2_len);
-
-#endif /* _LINUX_NTFS_COLLATE_H */
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
deleted file mode 100644
index 761aaa0195d6..000000000000
--- a/fs/ntfs/compress.c
+++ /dev/null
@@ -1,950 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * compress.c - NTFS kernel compressed attributes handling.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/blkdev.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include "attrib.h"
-#include "inode.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/**
- * ntfs_compression_constants - enum of constants used in the compression code
- */
-typedef enum {
- /* Token types and access mask. */
- NTFS_SYMBOL_TOKEN = 0,
- NTFS_PHRASE_TOKEN = 1,
- NTFS_TOKEN_MASK = 1,
-
- /* Compression sub-block constants. */
- NTFS_SB_SIZE_MASK = 0x0fff,
- NTFS_SB_SIZE = 0x1000,
- NTFS_SB_IS_COMPRESSED = 0x8000,
-
- /*
- * The maximum compression block size is by definition 16 * the cluster
- * size, with the maximum supported cluster size being 4kiB. Thus the
- * maximum compression buffer size is 64kiB, so we use this when
- * initializing the compression buffer.
- */
- NTFS_MAX_CB_SIZE = 64 * 1024,
-} ntfs_compression_constants;
-
-/*
- * ntfs_compression_buffer - one buffer for the decompression engine
- */
-static u8 *ntfs_compression_buffer;
-
-/*
- * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer
- */
-static DEFINE_SPINLOCK(ntfs_cb_lock);
-
-/**
- * allocate_compression_buffers - allocate the decompression buffers
- *
- * Caller has to hold the ntfs_lock mutex.
- *
- * Return 0 on success or -ENOMEM if the allocations failed.
- */
-int allocate_compression_buffers(void)
-{
- BUG_ON(ntfs_compression_buffer);
-
- ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE);
- if (!ntfs_compression_buffer)
- return -ENOMEM;
- return 0;
-}
-
-/**
- * free_compression_buffers - free the decompression buffers
- *
- * Caller has to hold the ntfs_lock mutex.
- */
-void free_compression_buffers(void)
-{
- BUG_ON(!ntfs_compression_buffer);
- vfree(ntfs_compression_buffer);
- ntfs_compression_buffer = NULL;
-}
-
-/**
- * zero_partial_compressed_page - zero out of bounds compressed page region
- */
-static void zero_partial_compressed_page(struct page *page,
- const s64 initialized_size)
-{
- u8 *kp = page_address(page);
- unsigned int kp_ofs;
-
- ntfs_debug("Zeroing page region outside initialized size.");
- if (((s64)page->index << PAGE_SHIFT) >= initialized_size) {
- clear_page(kp);
- return;
- }
- kp_ofs = initialized_size & ~PAGE_MASK;
- memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs);
- return;
-}
-
-/**
- * handle_bounds_compressed_page - test for&handle out of bounds compressed page
- */
-static inline void handle_bounds_compressed_page(struct page *page,
- const loff_t i_size, const s64 initialized_size)
-{
- if ((page->index >= (initialized_size >> PAGE_SHIFT)) &&
- (initialized_size < i_size))
- zero_partial_compressed_page(page, initialized_size);
- return;
-}
-
-/**
- * ntfs_decompress - decompress a compression block into an array of pages
- * @dest_pages: destination array of pages
- * @completed_pages: scratch space to track completed pages
- * @dest_index: current index into @dest_pages (IN/OUT)
- * @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT)
- * @dest_max_index: maximum index into @dest_pages (IN)
- * @dest_max_ofs: maximum offset within @dest_pages[@dest_max_index] (IN)
- * @xpage: the target page (-1 if none) (IN)
- * @xpage_done: set to 1 if xpage was completed successfully (IN/OUT)
- * @cb_start: compression block to decompress (IN)
- * @cb_size: size of compression block @cb_start in bytes (IN)
- * @i_size: file size when we started the read (IN)
- * @initialized_size: initialized file size when we started the read (IN)
- *
- * The caller must have disabled preemption. ntfs_decompress() reenables it when
- * the critical section is finished.
- *
- * This decompresses the compression block @cb_start into the array of
- * destination pages @dest_pages starting at index @dest_index into @dest_pages
- * and at offset @dest_pos into the page @dest_pages[@dest_index].
- *
- * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1.
- * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified.
- *
- * @cb_start is a pointer to the compression block which needs decompressing
- * and @cb_size is the size of @cb_start in bytes (8-64kiB).
- *
- * Return 0 if success or -EOVERFLOW on error in the compressed stream.
- * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was
- * completed during the decompression of the compression block (@cb_start).
- *
- * Warning: This function *REQUIRES* PAGE_SIZE >= 4096 or it will blow up
- * unpredicatbly! You have been warned!
- *
- * Note to hackers: This function may not sleep until it has finished accessing
- * the compression block @cb_start as it is a per-CPU buffer.
- */
-static int ntfs_decompress(struct page *dest_pages[], int completed_pages[],
- int *dest_index, int *dest_ofs, const int dest_max_index,
- const int dest_max_ofs, const int xpage, char *xpage_done,
- u8 *const cb_start, const u32 cb_size, const loff_t i_size,
- const s64 initialized_size)
-{
- /*
- * Pointers into the compressed data, i.e. the compression block (cb),
- * and the therein contained sub-blocks (sb).
- */
- u8 *cb_end = cb_start + cb_size; /* End of cb. */
- u8 *cb = cb_start; /* Current position in cb. */
- u8 *cb_sb_start; /* Beginning of the current sb in the cb. */
- u8 *cb_sb_end; /* End of current sb / beginning of next sb. */
-
- /* Variables for uncompressed data / destination. */
- struct page *dp; /* Current destination page being worked on. */
- u8 *dp_addr; /* Current pointer into dp. */
- u8 *dp_sb_start; /* Start of current sub-block in dp. */
- u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start +
- NTFS_SB_SIZE). */
- u16 do_sb_start; /* @dest_ofs when starting this sub-block. */
- u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start +
- NTFS_SB_SIZE). */
-
- /* Variables for tag and token parsing. */
- u8 tag; /* Current tag. */
- int token; /* Loop counter for the eight tokens in tag. */
- int nr_completed_pages = 0;
-
- /* Default error code. */
- int err = -EOVERFLOW;
-
- ntfs_debug("Entering, cb_size = 0x%x.", cb_size);
-do_next_sb:
- ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.",
- cb - cb_start);
- /*
- * Have we reached the end of the compression block or the end of the
- * decompressed data? The latter can happen for example if the current
- * position in the compression block is one byte before its end so the
- * first two checks do not detect it.
- */
- if (cb == cb_end || !le16_to_cpup((le16*)cb) ||
- (*dest_index == dest_max_index &&
- *dest_ofs == dest_max_ofs)) {
- int i;
-
- ntfs_debug("Completed. Returning success (0).");
- err = 0;
-return_error:
- /* We can sleep from now on, so we drop lock. */
- spin_unlock(&ntfs_cb_lock);
- /* Second stage: finalize completed pages. */
- if (nr_completed_pages > 0) {
- for (i = 0; i < nr_completed_pages; i++) {
- int di = completed_pages[i];
-
- dp = dest_pages[di];
- /*
- * If we are outside the initialized size, zero
- * the out of bounds page range.
- */
- handle_bounds_compressed_page(dp, i_size,
- initialized_size);
- flush_dcache_page(dp);
- kunmap(dp);
- SetPageUptodate(dp);
- unlock_page(dp);
- if (di == xpage)
- *xpage_done = 1;
- else
- put_page(dp);
- dest_pages[di] = NULL;
- }
- }
- return err;
- }
-
- /* Setup offsets for the current sub-block destination. */
- do_sb_start = *dest_ofs;
- do_sb_end = do_sb_start + NTFS_SB_SIZE;
-
- /* Check that we are still within allowed boundaries. */
- if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs)
- goto return_overflow;
-
- /* Does the minimum size of a compressed sb overflow valid range? */
- if (cb + 6 > cb_end)
- goto return_overflow;
-
- /* Setup the current sub-block source pointers and validate range. */
- cb_sb_start = cb;
- cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK)
- + 3;
- if (cb_sb_end > cb_end)
- goto return_overflow;
-
- /* Get the current destination page. */
- dp = dest_pages[*dest_index];
- if (!dp) {
- /* No page present. Skip decompression of this sub-block. */
- cb = cb_sb_end;
-
- /* Advance destination position to next sub-block. */
- *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK;
- if (!*dest_ofs && (++*dest_index > dest_max_index))
- goto return_overflow;
- goto do_next_sb;
- }
-
- /* We have a valid destination page. Setup the destination pointers. */
- dp_addr = (u8*)page_address(dp) + do_sb_start;
-
- /* Now, we are ready to process the current sub-block (sb). */
- if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) {
- ntfs_debug("Found uncompressed sub-block.");
- /* This sb is not compressed, just copy it into destination. */
-
- /* Advance source position to first data byte. */
- cb += 2;
-
- /* An uncompressed sb must be full size. */
- if (cb_sb_end - cb != NTFS_SB_SIZE)
- goto return_overflow;
-
- /* Copy the block and advance the source position. */
- memcpy(dp_addr, cb, NTFS_SB_SIZE);
- cb += NTFS_SB_SIZE;
-
- /* Advance destination position to next sub-block. */
- *dest_ofs += NTFS_SB_SIZE;
- if (!(*dest_ofs &= ~PAGE_MASK)) {
-finalize_page:
- /*
- * First stage: add current page index to array of
- * completed pages.
- */
- completed_pages[nr_completed_pages++] = *dest_index;
- if (++*dest_index > dest_max_index)
- goto return_overflow;
- }
- goto do_next_sb;
- }
- ntfs_debug("Found compressed sub-block.");
- /* This sb is compressed, decompress it into destination. */
-
- /* Setup destination pointers. */
- dp_sb_start = dp_addr;
- dp_sb_end = dp_sb_start + NTFS_SB_SIZE;
-
- /* Forward to the first tag in the sub-block. */
- cb += 2;
-do_next_tag:
- if (cb == cb_sb_end) {
- /* Check if the decompressed sub-block was not full-length. */
- if (dp_addr < dp_sb_end) {
- int nr_bytes = do_sb_end - *dest_ofs;
-
- ntfs_debug("Filling incomplete sub-block with "
- "zeroes.");
- /* Zero remainder and update destination position. */
- memset(dp_addr, 0, nr_bytes);
- *dest_ofs += nr_bytes;
- }
- /* We have finished the current sub-block. */
- if (!(*dest_ofs &= ~PAGE_MASK))
- goto finalize_page;
- goto do_next_sb;
- }
-
- /* Check we are still in range. */
- if (cb > cb_sb_end || dp_addr > dp_sb_end)
- goto return_overflow;
-
- /* Get the next tag and advance to first token. */
- tag = *cb++;
-
- /* Parse the eight tokens described by the tag. */
- for (token = 0; token < 8; token++, tag >>= 1) {
- u16 lg, pt, length, max_non_overlap;
- register u16 i;
- u8 *dp_back_addr;
-
- /* Check if we are done / still in range. */
- if (cb >= cb_sb_end || dp_addr > dp_sb_end)
- break;
-
- /* Determine token type and parse appropriately.*/
- if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) {
- /*
- * We have a symbol token, copy the symbol across, and
- * advance the source and destination positions.
- */
- *dp_addr++ = *cb++;
- ++*dest_ofs;
-
- /* Continue with the next token. */
- continue;
- }
-
- /*
- * We have a phrase token. Make sure it is not the first tag in
- * the sb as this is illegal and would confuse the code below.
- */
- if (dp_addr == dp_sb_start)
- goto return_overflow;
-
- /*
- * Determine the number of bytes to go back (p) and the number
- * of bytes to copy (l). We use an optimized algorithm in which
- * we first calculate log2(current destination position in sb),
- * which allows determination of l and p in O(1) rather than
- * O(n). We just need an arch-optimized log2() function now.
- */
- lg = 0;
- for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1)
- lg++;
-
- /* Get the phrase token into i. */
- pt = le16_to_cpup((le16*)cb);
-
- /*
- * Calculate starting position of the byte sequence in
- * the destination using the fact that p = (pt >> (12 - lg)) + 1
- * and make sure we don't go too far back.
- */
- dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1;
- if (dp_back_addr < dp_sb_start)
- goto return_overflow;
-
- /* Now calculate the length of the byte sequence. */
- length = (pt & (0xfff >> lg)) + 3;
-
- /* Advance destination position and verify it is in range. */
- *dest_ofs += length;
- if (*dest_ofs > do_sb_end)
- goto return_overflow;
-
- /* The number of non-overlapping bytes. */
- max_non_overlap = dp_addr - dp_back_addr;
-
- if (length <= max_non_overlap) {
- /* The byte sequence doesn't overlap, just copy it. */
- memcpy(dp_addr, dp_back_addr, length);
-
- /* Advance destination pointer. */
- dp_addr += length;
- } else {
- /*
- * The byte sequence does overlap, copy non-overlapping
- * part and then do a slow byte by byte copy for the
- * overlapping part. Also, advance the destination
- * pointer.
- */
- memcpy(dp_addr, dp_back_addr, max_non_overlap);
- dp_addr += max_non_overlap;
- dp_back_addr += max_non_overlap;
- length -= max_non_overlap;
- while (length--)
- *dp_addr++ = *dp_back_addr++;
- }
-
- /* Advance source position and continue with the next token. */
- cb += 2;
- }
-
- /* No tokens left in the current tag. Continue with the next tag. */
- goto do_next_tag;
-
-return_overflow:
- ntfs_error(NULL, "Failed. Returning -EOVERFLOW.");
- goto return_error;
-}
-
-/**
- * ntfs_read_compressed_block - read a compressed block into the page cache
- * @page: locked page in the compression block(s) we need to read
- *
- * When we are called the page has already been verified to be locked and the
- * attribute is known to be non-resident, not encrypted, but compressed.
- *
- * 1. Determine which compression block(s) @page is in.
- * 2. Get hold of all pages corresponding to this/these compression block(s).
- * 3. Read the (first) compression block.
- * 4. Decompress it into the corresponding pages.
- * 5. Throw the compressed data away and proceed to 3. for the next compression
- * block or return success if no more compression blocks left.
- *
- * Warning: We have to be careful what we do about existing pages. They might
- * have been written to so that we would lose data if we were to just overwrite
- * them with the out-of-date uncompressed data.
- *
- * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at
- * the end of the file I think. We need to detect this case and zero the out
- * of bounds remainder of the page in question and mark it as handled. At the
- * moment we would just return -EIO on such a page. This bug will only become
- * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte
- * clusters so is probably not going to be seen by anyone. Still this should
- * be fixed. (AIA)
- *
- * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in
- * handling sparse and compressed cbs. (AIA)
- *
- * FIXME: At the moment we don't do any zeroing out in the case that
- * initialized_size is less than data_size. This should be safe because of the
- * nature of the compression algorithm used. Just in case we check and output
- * an error message in read inode if the two sizes are not equal for a
- * compressed file. (AIA)
- */
-int ntfs_read_compressed_block(struct page *page)
-{
- loff_t i_size;
- s64 initialized_size;
- struct address_space *mapping = page->mapping;
- ntfs_inode *ni = NTFS_I(mapping->host);
- ntfs_volume *vol = ni->vol;
- struct super_block *sb = vol->sb;
- runlist_element *rl;
- unsigned long flags, block_size = sb->s_blocksize;
- unsigned char block_size_bits = sb->s_blocksize_bits;
- u8 *cb, *cb_pos, *cb_end;
- struct buffer_head **bhs;
- unsigned long offset, index = page->index;
- u32 cb_size = ni->itype.compressed.block_size;
- u64 cb_size_mask = cb_size - 1UL;
- VCN vcn;
- LCN lcn;
- /* The first wanted vcn (minimum alignment is PAGE_SIZE). */
- VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >>
- vol->cluster_size_bits;
- /*
- * The first vcn after the last wanted vcn (minimum alignment is again
- * PAGE_SIZE.
- */
- VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1)
- & ~cb_size_mask) >> vol->cluster_size_bits;
- /* Number of compression blocks (cbs) in the wanted vcn range. */
- unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits
- >> ni->itype.compressed.block_size_bits;
- /*
- * Number of pages required to store the uncompressed data from all
- * compression blocks (cbs) overlapping @page. Due to alignment
- * guarantees of start_vcn and end_vcn, no need to round up here.
- */
- unsigned int nr_pages = (end_vcn - start_vcn) <<
- vol->cluster_size_bits >> PAGE_SHIFT;
- unsigned int xpage, max_page, cur_page, cur_ofs, i;
- unsigned int cb_clusters, cb_max_ofs;
- int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0;
- struct page **pages;
- int *completed_pages;
- unsigned char xpage_done = 0;
-
- ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = "
- "%i.", index, cb_size, nr_pages);
- /*
- * Bad things happen if we get here for anything that is not an
- * unnamed $DATA attribute.
- */
- BUG_ON(ni->type != AT_DATA);
- BUG_ON(ni->name_len);
-
- pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
- completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS);
-
- /* Allocate memory to store the buffer heads we need. */
- bhs_size = cb_size / block_size * sizeof(struct buffer_head *);
- bhs = kmalloc(bhs_size, GFP_NOFS);
-
- if (unlikely(!pages || !bhs || !completed_pages)) {
- kfree(bhs);
- kfree(pages);
- kfree(completed_pages);
- unlock_page(page);
- ntfs_error(vol->sb, "Failed to allocate internal buffers.");
- return -ENOMEM;
- }
-
- /*
- * We have already been given one page, this is the one we must do.
- * Once again, the alignment guarantees keep it simple.
- */
- offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT;
- xpage = index - offset;
- pages[xpage] = page;
- /*
- * The remaining pages need to be allocated and inserted into the page
- * cache, alignment guarantees keep all the below much simpler. (-8
- */
- read_lock_irqsave(&ni->size_lock, flags);
- i_size = i_size_read(VFS_I(ni));
- initialized_size = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) -
- offset;
- /* Is the page fully outside i_size? (truncate in progress) */
- if (xpage >= max_page) {
- kfree(bhs);
- kfree(pages);
- kfree(completed_pages);
- zero_user(page, 0, PAGE_SIZE);
- ntfs_debug("Compressed read outside i_size - truncated?");
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
- }
- if (nr_pages < max_page)
- max_page = nr_pages;
- for (i = 0; i < max_page; i++, offset++) {
- if (i != xpage)
- pages[i] = grab_cache_page_nowait(mapping, offset);
- page = pages[i];
- if (page) {
- /*
- * We only (re)read the page if it isn't already read
- * in and/or dirty or we would be losing data or at
- * least wasting our time.
- */
- if (!PageDirty(page) && (!PageUptodate(page) ||
- PageError(page))) {
- ClearPageError(page);
- kmap(page);
- continue;
- }
- unlock_page(page);
- put_page(page);
- pages[i] = NULL;
- }
- }
-
- /*
- * We have the runlist, and all the destination pages we need to fill.
- * Now read the first compression block.
- */
- cur_page = 0;
- cur_ofs = 0;
- cb_clusters = ni->itype.compressed.block_clusters;
-do_next_cb:
- nr_cbs--;
- nr_bhs = 0;
-
- /* Read all cb buffer heads one cluster at a time. */
- rl = NULL;
- for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn;
- vcn++) {
- bool is_retry = false;
-
- if (!rl) {
-lock_retry_remap:
- down_read(&ni->runlist.lock);
- rl = ni->runlist.rl;
- }
- if (likely(rl != NULL)) {
- /* Seek to element containing target vcn. */
- while (rl->length && rl[1].vcn <= vcn)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
- } else
- lcn = LCN_RL_NOT_MAPPED;
- ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
- (unsigned long long)vcn,
- (unsigned long long)lcn);
- if (lcn < 0) {
- /*
- * When we reach the first sparse cluster we have
- * finished with the cb.
- */
- if (lcn == LCN_HOLE)
- break;
- if (is_retry || lcn != LCN_RL_NOT_MAPPED)
- goto rl_err;
- is_retry = true;
- /*
- * Attempt to map runlist, dropping lock for the
- * duration.
- */
- up_read(&ni->runlist.lock);
- if (!ntfs_map_runlist(ni, vcn))
- goto lock_retry_remap;
- goto map_rl_err;
- }
- block = lcn << vol->cluster_size_bits >> block_size_bits;
- /* Read the lcn from device in chunks of block_size bytes. */
- max_block = block + (vol->cluster_size >> block_size_bits);
- do {
- ntfs_debug("block = 0x%x.", block);
- if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block))))
- goto getblk_err;
- nr_bhs++;
- } while (++block < max_block);
- }
-
- /* Release the lock if we took it. */
- if (rl)
- up_read(&ni->runlist.lock);
-
- /* Setup and initiate io on all buffer heads. */
- for (i = 0; i < nr_bhs; i++) {
- struct buffer_head *tbh = bhs[i];
-
- if (!trylock_buffer(tbh))
- continue;
- if (unlikely(buffer_uptodate(tbh))) {
- unlock_buffer(tbh);
- continue;
- }
- get_bh(tbh);
- tbh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, tbh);
- }
-
- /* Wait for io completion on all buffer heads. */
- for (i = 0; i < nr_bhs; i++) {
- struct buffer_head *tbh = bhs[i];
-
- if (buffer_uptodate(tbh))
- continue;
- wait_on_buffer(tbh);
- /*
- * We need an optimization barrier here, otherwise we start
- * hitting the below fixup code when accessing a loopback
- * mounted ntfs partition. This indicates either there is a
- * race condition in the loop driver or, more likely, gcc
- * overoptimises the code without the barrier and it doesn't
- * do the Right Thing(TM).
- */
- barrier();
- if (unlikely(!buffer_uptodate(tbh))) {
- ntfs_warning(vol->sb, "Buffer is unlocked but not "
- "uptodate! Unplugging the disk queue "
- "and rescheduling.");
- get_bh(tbh);
- io_schedule();
- put_bh(tbh);
- if (unlikely(!buffer_uptodate(tbh)))
- goto read_err;
- ntfs_warning(vol->sb, "Buffer is now uptodate. Good.");
- }
- }
-
- /*
- * Get the compression buffer. We must not sleep any more
- * until we are finished with it.
- */
- spin_lock(&ntfs_cb_lock);
- cb = ntfs_compression_buffer;
-
- BUG_ON(!cb);
-
- cb_pos = cb;
- cb_end = cb + cb_size;
-
- /* Copy the buffer heads into the contiguous buffer. */
- for (i = 0; i < nr_bhs; i++) {
- memcpy(cb_pos, bhs[i]->b_data, block_size);
- cb_pos += block_size;
- }
-
- /* Just a precaution. */
- if (cb_pos + 2 <= cb + cb_size)
- *(u16*)cb_pos = 0;
-
- /* Reset cb_pos back to the beginning. */
- cb_pos = cb;
-
- /* We now have both source (if present) and destination. */
- ntfs_debug("Successfully read the compression block.");
-
- /* The last page and maximum offset within it for the current cb. */
- cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size;
- cb_max_ofs = cb_max_page & ~PAGE_MASK;
- cb_max_page >>= PAGE_SHIFT;
-
- /* Catch end of file inside a compression block. */
- if (cb_max_page > max_page)
- cb_max_page = max_page;
-
- if (vcn == start_vcn - cb_clusters) {
- /* Sparse cb, zero out page range overlapping the cb. */
- ntfs_debug("Found sparse compression block.");
- /* We can sleep from now on, so we drop lock. */
- spin_unlock(&ntfs_cb_lock);
- if (cb_max_ofs)
- cb_max_page--;
- for (; cur_page < cb_max_page; cur_page++) {
- page = pages[cur_page];
- if (page) {
- if (likely(!cur_ofs))
- clear_page(page_address(page));
- else
- memset(page_address(page) + cur_ofs, 0,
- PAGE_SIZE -
- cur_ofs);
- flush_dcache_page(page);
- kunmap(page);
- SetPageUptodate(page);
- unlock_page(page);
- if (cur_page == xpage)
- xpage_done = 1;
- else
- put_page(page);
- pages[cur_page] = NULL;
- }
- cb_pos += PAGE_SIZE - cur_ofs;
- cur_ofs = 0;
- if (cb_pos >= cb_end)
- break;
- }
- /* If we have a partial final page, deal with it now. */
- if (cb_max_ofs && cb_pos < cb_end) {
- page = pages[cur_page];
- if (page)
- memset(page_address(page) + cur_ofs, 0,
- cb_max_ofs - cur_ofs);
- /*
- * No need to update cb_pos at this stage:
- * cb_pos += cb_max_ofs - cur_ofs;
- */
- cur_ofs = cb_max_ofs;
- }
- } else if (vcn == start_vcn) {
- /* We can't sleep so we need two stages. */
- unsigned int cur2_page = cur_page;
- unsigned int cur_ofs2 = cur_ofs;
- u8 *cb_pos2 = cb_pos;
-
- ntfs_debug("Found uncompressed compression block.");
- /* Uncompressed cb, copy it to the destination pages. */
- /*
- * TODO: As a big optimization, we could detect this case
- * before we read all the pages and use block_read_full_folio()
- * on all full pages instead (we still have to treat partial
- * pages especially but at least we are getting rid of the
- * synchronous io for the majority of pages.
- * Or if we choose not to do the read-ahead/-behind stuff, we
- * could just return block_read_full_folio(pages[xpage]) as long
- * as PAGE_SIZE <= cb_size.
- */
- if (cb_max_ofs)
- cb_max_page--;
- /* First stage: copy data into destination pages. */
- for (; cur_page < cb_max_page; cur_page++) {
- page = pages[cur_page];
- if (page)
- memcpy(page_address(page) + cur_ofs, cb_pos,
- PAGE_SIZE - cur_ofs);
- cb_pos += PAGE_SIZE - cur_ofs;
- cur_ofs = 0;
- if (cb_pos >= cb_end)
- break;
- }
- /* If we have a partial final page, deal with it now. */
- if (cb_max_ofs && cb_pos < cb_end) {
- page = pages[cur_page];
- if (page)
- memcpy(page_address(page) + cur_ofs, cb_pos,
- cb_max_ofs - cur_ofs);
- cb_pos += cb_max_ofs - cur_ofs;
- cur_ofs = cb_max_ofs;
- }
- /* We can sleep from now on, so drop lock. */
- spin_unlock(&ntfs_cb_lock);
- /* Second stage: finalize pages. */
- for (; cur2_page < cb_max_page; cur2_page++) {
- page = pages[cur2_page];
- if (page) {
- /*
- * If we are outside the initialized size, zero
- * the out of bounds page range.
- */
- handle_bounds_compressed_page(page, i_size,
- initialized_size);
- flush_dcache_page(page);
- kunmap(page);
- SetPageUptodate(page);
- unlock_page(page);
- if (cur2_page == xpage)
- xpage_done = 1;
- else
- put_page(page);
- pages[cur2_page] = NULL;
- }
- cb_pos2 += PAGE_SIZE - cur_ofs2;
- cur_ofs2 = 0;
- if (cb_pos2 >= cb_end)
- break;
- }
- } else {
- /* Compressed cb, decompress it into the destination page(s). */
- unsigned int prev_cur_page = cur_page;
-
- ntfs_debug("Found compressed compression block.");
- err = ntfs_decompress(pages, completed_pages, &cur_page,
- &cur_ofs, cb_max_page, cb_max_ofs, xpage,
- &xpage_done, cb_pos, cb_size - (cb_pos - cb),
- i_size, initialized_size);
- /*
- * We can sleep from now on, lock already dropped by
- * ntfs_decompress().
- */
- if (err) {
- ntfs_error(vol->sb, "ntfs_decompress() failed in inode "
- "0x%lx with error code %i. Skipping "
- "this compression block.",
- ni->mft_no, -err);
- /* Release the unfinished pages. */
- for (; prev_cur_page < cur_page; prev_cur_page++) {
- page = pages[prev_cur_page];
- if (page) {
- flush_dcache_page(page);
- kunmap(page);
- unlock_page(page);
- if (prev_cur_page != xpage)
- put_page(page);
- pages[prev_cur_page] = NULL;
- }
- }
- }
- }
-
- /* Release the buffer heads. */
- for (i = 0; i < nr_bhs; i++)
- brelse(bhs[i]);
-
- /* Do we have more work to do? */
- if (nr_cbs)
- goto do_next_cb;
-
- /* We no longer need the list of buffer heads. */
- kfree(bhs);
-
- /* Clean up if we have any pages left. Should never happen. */
- for (cur_page = 0; cur_page < max_page; cur_page++) {
- page = pages[cur_page];
- if (page) {
- ntfs_error(vol->sb, "Still have pages left! "
- "Terminating them with extreme "
- "prejudice. Inode 0x%lx, page index "
- "0x%lx.", ni->mft_no, page->index);
- flush_dcache_page(page);
- kunmap(page);
- unlock_page(page);
- if (cur_page != xpage)
- put_page(page);
- pages[cur_page] = NULL;
- }
- }
-
- /* We no longer need the list of pages. */
- kfree(pages);
- kfree(completed_pages);
-
- /* If we have completed the requested page, we return success. */
- if (likely(xpage_done))
- return 0;
-
- ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
- "EOVERFLOW" : (!err ? "EIO" : "unknown error"));
- return err < 0 ? err : -EIO;
-
-read_err:
- ntfs_error(vol->sb, "IO error while reading compressed data.");
- /* Release the buffer heads. */
- for (i = 0; i < nr_bhs; i++)
- brelse(bhs[i]);
- goto err_out;
-
-map_rl_err:
- ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read "
- "compression block.");
- goto err_out;
-
-rl_err:
- up_read(&ni->runlist.lock);
- ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read "
- "compression block.");
- goto err_out;
-
-getblk_err:
- up_read(&ni->runlist.lock);
- ntfs_error(vol->sb, "getblk() failed. Cannot read compression block.");
-
-err_out:
- kfree(bhs);
- for (i = cur_page; i < max_page; i++) {
- page = pages[i];
- if (page) {
- flush_dcache_page(page);
- kunmap(page);
- unlock_page(page);
- if (i != xpage)
- put_page(page);
- }
- }
- kfree(pages);
- kfree(completed_pages);
- return -EIO;
-}
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
deleted file mode 100644
index a3c1c5656f8f..000000000000
--- a/fs/ntfs/debug.c
+++ /dev/null
@@ -1,159 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include "debug.h"
-
-/**
- * __ntfs_warning - output a warning to the syslog
- * @function: name of function outputting the warning
- * @sb: super block of mounted ntfs filesystem
- * @fmt: warning string containing format specifications
- * @...: a variable number of arguments specified in @fmt
- *
- * Outputs a warning to the syslog for the mounted ntfs filesystem described
- * by @sb.
- *
- * @fmt and the corresponding @... is printf style format string containing
- * the warning string and the corresponding format arguments, respectively.
- *
- * @function is the name of the function from which __ntfs_warning is being
- * called.
- *
- * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead
- * as this provides the @function parameter automatically.
- */
-void __ntfs_warning(const char *function, const struct super_block *sb,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
- int flen = 0;
-
-#ifndef DEBUG
- if (!printk_ratelimit())
- return;
-#endif
- if (function)
- flen = strlen(function);
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- if (sb)
- pr_warn("(device %s): %s(): %pV\n",
- sb->s_id, flen ? function : "", &vaf);
- else
- pr_warn("%s(): %pV\n", flen ? function : "", &vaf);
- va_end(args);
-}
-
-/**
- * __ntfs_error - output an error to the syslog
- * @function: name of function outputting the error
- * @sb: super block of mounted ntfs filesystem
- * @fmt: error string containing format specifications
- * @...: a variable number of arguments specified in @fmt
- *
- * Outputs an error to the syslog for the mounted ntfs filesystem described
- * by @sb.
- *
- * @fmt and the corresponding @... is printf style format string containing
- * the error string and the corresponding format arguments, respectively.
- *
- * @function is the name of the function from which __ntfs_error is being
- * called.
- *
- * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead
- * as this provides the @function parameter automatically.
- */
-void __ntfs_error(const char *function, const struct super_block *sb,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
- int flen = 0;
-
-#ifndef DEBUG
- if (!printk_ratelimit())
- return;
-#endif
- if (function)
- flen = strlen(function);
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- if (sb)
- pr_err("(device %s): %s(): %pV\n",
- sb->s_id, flen ? function : "", &vaf);
- else
- pr_err("%s(): %pV\n", flen ? function : "", &vaf);
- va_end(args);
-}
-
-#ifdef DEBUG
-
-/* If 1, output debug messages, and if 0, don't. */
-int debug_msgs = 0;
-
-void __ntfs_debug(const char *file, int line, const char *function,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
- int flen = 0;
-
- if (!debug_msgs)
- return;
- if (function)
- flen = strlen(function);
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf);
- va_end(args);
-}
-
-/* Dump a runlist. Caller has to provide synchronisation for @rl. */
-void ntfs_debug_dump_runlist(const runlist_element *rl)
-{
- int i;
- const char *lcn_str[5] = { "LCN_HOLE ", "LCN_RL_NOT_MAPPED",
- "LCN_ENOENT ", "LCN_unknown " };
-
- if (!debug_msgs)
- return;
- pr_debug("Dumping runlist (values in hex):\n");
- if (!rl) {
- pr_debug("Run list not present.\n");
- return;
- }
- pr_debug("VCN LCN Run length\n");
- for (i = 0; ; i++) {
- LCN lcn = (rl + i)->lcn;
-
- if (lcn < (LCN)0) {
- int index = -lcn - 1;
-
- if (index > -LCN_ENOENT - 1)
- index = 3;
- pr_debug("%-16Lx %s %-16Lx%s\n",
- (long long)(rl + i)->vcn, lcn_str[index],
- (long long)(rl + i)->length,
- (rl + i)->length ? "" :
- " (runlist end)");
- } else
- pr_debug("%-16Lx %-16Lx %-16Lx%s\n",
- (long long)(rl + i)->vcn,
- (long long)(rl + i)->lcn,
- (long long)(rl + i)->length,
- (rl + i)->length ? "" :
- " (runlist end)");
- if (!(rl + i)->length)
- break;
- }
-}
-
-#endif
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
deleted file mode 100644
index 6fdef388f129..000000000000
--- a/fs/ntfs/debug.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_DEBUG_H
-#define _LINUX_NTFS_DEBUG_H
-
-#include <linux/fs.h>
-
-#include "runlist.h"
-
-#ifdef DEBUG
-
-extern int debug_msgs;
-
-extern __printf(4, 5)
-void __ntfs_debug(const char *file, int line, const char *function,
- const char *format, ...);
-/**
- * ntfs_debug - write a debug level message to syslog
- * @f: a printf format string containing the message
- * @...: the variables to substitute into @f
- *
- * ntfs_debug() writes a DEBUG level message to the syslog but only if the
- * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP.
- */
-#define ntfs_debug(f, a...) \
- __ntfs_debug(__FILE__, __LINE__, __func__, f, ##a)
-
-extern void ntfs_debug_dump_runlist(const runlist_element *rl);
-
-#else /* !DEBUG */
-
-#define ntfs_debug(fmt, ...) \
-do { \
- if (0) \
- no_printk(fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define ntfs_debug_dump_runlist(rl) do {} while (0)
-
-#endif /* !DEBUG */
-
-extern __printf(3, 4)
-void __ntfs_warning(const char *function, const struct super_block *sb,
- const char *fmt, ...);
-#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a)
-
-extern __printf(3, 4)
-void __ntfs_error(const char *function, const struct super_block *sb,
- const char *fmt, ...);
-#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a)
-
-#endif /* _LINUX_NTFS_DEBUG_H */
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
deleted file mode 100644
index 629723a8d712..000000000000
--- a/fs/ntfs/dir.c
+++ /dev/null
@@ -1,1540 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2007 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/buffer_head.h>
-#include <linux/slab.h>
-#include <linux/blkdev.h>
-
-#include "dir.h"
-#include "aops.h"
-#include "attrib.h"
-#include "mft.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/*
- * The little endian Unicode string $I30 as a global constant.
- */
-ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
- cpu_to_le16('3'), cpu_to_le16('0'), 0 };
-
-/**
- * ntfs_lookup_inode_by_name - find an inode in a directory given its name
- * @dir_ni: ntfs inode of the directory in which to search for the name
- * @uname: Unicode name for which to search in the directory
- * @uname_len: length of the name @uname in Unicode characters
- * @res: return the found file name if necessary (see below)
- *
- * Look for an inode with name @uname in the directory with inode @dir_ni.
- * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
- * the Unicode name. If the name is found in the directory, the corresponding
- * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
- * is a 64-bit number containing the sequence number.
- *
- * On error, a negative value is returned corresponding to the error code. In
- * particular if the inode is not found -ENOENT is returned. Note that you
- * can't just check the return value for being negative, you have to check the
- * inode number for being negative which you can extract using MREC(return
- * value).
- *
- * Note, @uname_len does not include the (optional) terminating NULL character.
- *
- * Note, we look for a case sensitive match first but we also look for a case
- * insensitive match at the same time. If we find a case insensitive match, we
- * save that for the case that we don't find an exact match, where we return
- * the case insensitive match and setup @res (which we allocate!) with the mft
- * reference, the file name type, length and with a copy of the little endian
- * Unicode file name itself. If we match a file name which is in the DOS name
- * space, we only return the mft reference and file name type in @res.
- * ntfs_lookup() then uses this to find the long file name in the inode itself.
- * This is to avoid polluting the dcache with short file names. We want them to
- * work but we don't care for how quickly one can access them. This also fixes
- * the dcache aliasing issues.
- *
- * Locking: - Caller must hold i_mutex on the directory.
- * - Each page cache page in the index allocation mapping must be
- * locked whilst being accessed otherwise we may find a corrupt
- * page due to it being under ->writepage at the moment which
- * applies the mst protection fixups before writing out and then
- * removes them again after the write is complete after which it
- * unlocks the page.
- */
-MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
- const int uname_len, ntfs_name **res)
-{
- ntfs_volume *vol = dir_ni->vol;
- struct super_block *sb = vol->sb;
- MFT_RECORD *m;
- INDEX_ROOT *ir;
- INDEX_ENTRY *ie;
- INDEX_ALLOCATION *ia;
- u8 *index_end;
- u64 mref;
- ntfs_attr_search_ctx *ctx;
- int err, rc;
- VCN vcn, old_vcn;
- struct address_space *ia_mapping;
- struct page *page;
- u8 *kaddr;
- ntfs_name *name = NULL;
-
- BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode));
- BUG_ON(NInoAttr(dir_ni));
- /* Get hold of the mft record for the directory. */
- m = map_mft_record(dir_ni);
- if (IS_ERR(m)) {
- ntfs_error(sb, "map_mft_record() failed with error code %ld.",
- -PTR_ERR(m));
- return ERR_MREF(PTR_ERR(m));
- }
- ctx = ntfs_attr_get_search_ctx(dir_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- /* Find the index root attribute in the mft record. */
- err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
- 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT) {
- ntfs_error(sb, "Index root attribute missing in "
- "directory inode 0x%lx.",
- dir_ni->mft_no);
- err = -EIO;
- }
- goto err_out;
- }
- /* Get to the index root value (it's been verified in read_inode). */
- ir = (INDEX_ROOT*)((u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset));
- index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
- /* The first index entry. */
- ie = (INDEX_ENTRY*)((u8*)&ir->index +
- le32_to_cpu(ir->index.entries_offset));
- /*
- * Loop until we exceed valid memory (corruption case) or until we
- * reach the last entry.
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- /* Bounds checks. */
- if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->key_length) >
- index_end)
- goto dir_err_out;
- /*
- * The last entry cannot contain a name. It can however contain
- * a pointer to a child node in the B+tree so we just break out.
- */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /*
- * We perform a case sensitive comparison and if that matches
- * we are done and return the mft reference of the inode (i.e.
- * the inode number together with the sequence number for
- * consistency checking). We convert it to cpu format before
- * returning.
- */
- if (ntfs_are_names_equal(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length,
- CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
-found_it:
- /*
- * We have a perfect match, so we don't need to care
- * about having matched imperfectly before, so we can
- * free name and set *res to NULL.
- * However, if the perfect match is a short file name,
- * we need to signal this through *res, so that
- * ntfs_lookup() can fix dcache aliasing issues.
- * As an optimization we just reuse an existing
- * allocation of *res.
- */
- if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
- if (!name) {
- name = kmalloc(sizeof(ntfs_name),
- GFP_NOFS);
- if (!name) {
- err = -ENOMEM;
- goto err_out;
- }
- }
- name->mref = le64_to_cpu(
- ie->data.dir.indexed_file);
- name->type = FILE_NAME_DOS;
- name->len = 0;
- *res = name;
- } else {
- kfree(name);
- *res = NULL;
- }
- mref = le64_to_cpu(ie->data.dir.indexed_file);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(dir_ni);
- return mref;
- }
- /*
- * For a case insensitive mount, we also perform a case
- * insensitive comparison (provided the file name is not in the
- * POSIX namespace). If the comparison matches, and the name is
- * in the WIN32 namespace, we cache the filename in *res so
- * that the caller, ntfs_lookup(), can work on it. If the
- * comparison matches, and the name is in the DOS namespace, we
- * only cache the mft reference and the file name type (we set
- * the name length to zero for simplicity).
- */
- if (!NVolCaseSensitive(vol) &&
- ie->key.file_name.file_name_type &&
- ntfs_are_names_equal(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length,
- IGNORE_CASE, vol->upcase, vol->upcase_len)) {
- int name_size = sizeof(ntfs_name);
- u8 type = ie->key.file_name.file_name_type;
- u8 len = ie->key.file_name.file_name_length;
-
- /* Only one case insensitive matching name allowed. */
- if (name) {
- ntfs_error(sb, "Found already allocated name "
- "in phase 1. Please run chkdsk "
- "and if that doesn't find any "
- "errors please report you saw "
- "this message to "
- "linux-ntfs-dev@lists."
- "sourceforge.net.");
- goto dir_err_out;
- }
-
- if (type != FILE_NAME_DOS)
- name_size += len * sizeof(ntfschar);
- name = kmalloc(name_size, GFP_NOFS);
- if (!name) {
- err = -ENOMEM;
- goto err_out;
- }
- name->mref = le64_to_cpu(ie->data.dir.indexed_file);
- name->type = type;
- if (type != FILE_NAME_DOS) {
- name->len = len;
- memcpy(name->name, ie->key.file_name.file_name,
- len * sizeof(ntfschar));
- } else
- name->len = 0;
- *res = name;
- }
- /*
- * Not a perfect match, need to do full blown collation so we
- * know which way in the B+tree we have to go.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- IGNORE_CASE, vol->upcase, vol->upcase_len);
- /*
- * If uname collates before the name of the current entry, there
- * is definitely no such name in this index but we might need to
- * descend into the B+tree so we just break out of the loop.
- */
- if (rc == -1)
- break;
- /* The names are not equal, continue the search. */
- if (rc)
- continue;
- /*
- * Names match with case insensitive comparison, now try the
- * case sensitive comparison, which is required for proper
- * collation.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- CASE_SENSITIVE, vol->upcase, vol->upcase_len);
- if (rc == -1)
- break;
- if (rc)
- continue;
- /*
- * Perfect match, this will never happen as the
- * ntfs_are_names_equal() call will have gotten a match but we
- * still treat it correctly.
- */
- goto found_it;
- }
- /*
- * We have finished with this index without success. Check for the
- * presence of a child node and if not present return -ENOENT, unless
- * we have got a matching name cached in name in which case return the
- * mft reference associated with it.
- */
- if (!(ie->flags & INDEX_ENTRY_NODE)) {
- if (name) {
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(dir_ni);
- return name->mref;
- }
- ntfs_debug("Entry not found.");
- err = -ENOENT;
- goto err_out;
- } /* Child node present, descend into it. */
- /* Consistency check: Verify that an index allocation exists. */
- if (!NInoIndexAllocPresent(dir_ni)) {
- ntfs_error(sb, "No index allocation attribute but index entry "
- "requires one. Directory inode 0x%lx is "
- "corrupt or driver bug.", dir_ni->mft_no);
- goto err_out;
- }
- /* Get the starting vcn of the index_block holding the child node. */
- vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
- ia_mapping = VFS_I(dir_ni)->i_mapping;
- /*
- * We are done with the index root and the mft record. Release them,
- * otherwise we deadlock with ntfs_map_page().
- */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(dir_ni);
- m = NULL;
- ctx = NULL;
-descend_into_child_node:
- /*
- * Convert vcn to index into the index allocation attribute in units
- * of PAGE_SIZE and map the page cache page, reading it from
- * disk if necessary.
- */
- page = ntfs_map_page(ia_mapping, vcn <<
- dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
- if (IS_ERR(page)) {
- ntfs_error(sb, "Failed to map directory index page, error %ld.",
- -PTR_ERR(page));
- err = PTR_ERR(page);
- goto err_out;
- }
- lock_page(page);
- kaddr = (u8*)page_address(page);
-fast_descend_into_child_node:
- /* Get to the index allocation block. */
- ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
- dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
- /* Bounds checks. */
- if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
- ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
- "inode 0x%lx or driver bug.", dir_ni->mft_no);
- goto unm_err_out;
- }
- /* Catch multi sector transfer fixup errors. */
- if (unlikely(!ntfs_is_indx_record(ia->magic))) {
- ntfs_error(sb, "Directory index record with vcn 0x%llx is "
- "corrupt. Corrupt inode 0x%lx. Run chkdsk.",
- (unsigned long long)vcn, dir_ni->mft_no);
- goto unm_err_out;
- }
- if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
- ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
- "different from expected VCN (0x%llx). "
- "Directory inode 0x%lx is corrupt or driver "
- "bug.", (unsigned long long)
- sle64_to_cpu(ia->index_block_vcn),
- (unsigned long long)vcn, dir_ni->mft_no);
- goto unm_err_out;
- }
- if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
- dir_ni->itype.index.block_size) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
- "0x%lx has a size (%u) differing from the "
- "directory specified size (%u). Directory "
- "inode is corrupt or driver bug.",
- (unsigned long long)vcn, dir_ni->mft_no,
- le32_to_cpu(ia->index.allocated_size) + 0x18,
- dir_ni->itype.index.block_size);
- goto unm_err_out;
- }
- index_end = (u8*)ia + dir_ni->itype.index.block_size;
- if (index_end > kaddr + PAGE_SIZE) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
- "0x%lx crosses page boundary. Impossible! "
- "Cannot access! This is probably a bug in the "
- "driver.", (unsigned long long)vcn,
- dir_ni->mft_no);
- goto unm_err_out;
- }
- index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
- if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
- ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
- "inode 0x%lx exceeds maximum size.",
- (unsigned long long)vcn, dir_ni->mft_no);
- goto unm_err_out;
- }
- /* The first index entry. */
- ie = (INDEX_ENTRY*)((u8*)&ia->index +
- le32_to_cpu(ia->index.entries_offset));
- /*
- * Iterate similar to above big loop but applied to index buffer, thus
- * loop until we exceed valid memory (corruption case) or until we
- * reach the last entry.
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- /* Bounds check. */
- if ((u8*)ie < (u8*)ia || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->key_length) >
- index_end) {
- ntfs_error(sb, "Index entry out of bounds in "
- "directory inode 0x%lx.",
- dir_ni->mft_no);
- goto unm_err_out;
- }
- /*
- * The last entry cannot contain a name. It can however contain
- * a pointer to a child node in the B+tree so we just break out.
- */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /*
- * We perform a case sensitive comparison and if that matches
- * we are done and return the mft reference of the inode (i.e.
- * the inode number together with the sequence number for
- * consistency checking). We convert it to cpu format before
- * returning.
- */
- if (ntfs_are_names_equal(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length,
- CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
-found_it2:
- /*
- * We have a perfect match, so we don't need to care
- * about having matched imperfectly before, so we can
- * free name and set *res to NULL.
- * However, if the perfect match is a short file name,
- * we need to signal this through *res, so that
- * ntfs_lookup() can fix dcache aliasing issues.
- * As an optimization we just reuse an existing
- * allocation of *res.
- */
- if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
- if (!name) {
- name = kmalloc(sizeof(ntfs_name),
- GFP_NOFS);
- if (!name) {
- err = -ENOMEM;
- goto unm_err_out;
- }
- }
- name->mref = le64_to_cpu(
- ie->data.dir.indexed_file);
- name->type = FILE_NAME_DOS;
- name->len = 0;
- *res = name;
- } else {
- kfree(name);
- *res = NULL;
- }
- mref = le64_to_cpu(ie->data.dir.indexed_file);
- unlock_page(page);
- ntfs_unmap_page(page);
- return mref;
- }
- /*
- * For a case insensitive mount, we also perform a case
- * insensitive comparison (provided the file name is not in the
- * POSIX namespace). If the comparison matches, and the name is
- * in the WIN32 namespace, we cache the filename in *res so
- * that the caller, ntfs_lookup(), can work on it. If the
- * comparison matches, and the name is in the DOS namespace, we
- * only cache the mft reference and the file name type (we set
- * the name length to zero for simplicity).
- */
- if (!NVolCaseSensitive(vol) &&
- ie->key.file_name.file_name_type &&
- ntfs_are_names_equal(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length,
- IGNORE_CASE, vol->upcase, vol->upcase_len)) {
- int name_size = sizeof(ntfs_name);
- u8 type = ie->key.file_name.file_name_type;
- u8 len = ie->key.file_name.file_name_length;
-
- /* Only one case insensitive matching name allowed. */
- if (name) {
- ntfs_error(sb, "Found already allocated name "
- "in phase 2. Please run chkdsk "
- "and if that doesn't find any "
- "errors please report you saw "
- "this message to "
- "linux-ntfs-dev@lists."
- "sourceforge.net.");
- unlock_page(page);
- ntfs_unmap_page(page);
- goto dir_err_out;
- }
-
- if (type != FILE_NAME_DOS)
- name_size += len * sizeof(ntfschar);
- name = kmalloc(name_size, GFP_NOFS);
- if (!name) {
- err = -ENOMEM;
- goto unm_err_out;
- }
- name->mref = le64_to_cpu(ie->data.dir.indexed_file);
- name->type = type;
- if (type != FILE_NAME_DOS) {
- name->len = len;
- memcpy(name->name, ie->key.file_name.file_name,
- len * sizeof(ntfschar));
- } else
- name->len = 0;
- *res = name;
- }
- /*
- * Not a perfect match, need to do full blown collation so we
- * know which way in the B+tree we have to go.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- IGNORE_CASE, vol->upcase, vol->upcase_len);
- /*
- * If uname collates before the name of the current entry, there
- * is definitely no such name in this index but we might need to
- * descend into the B+tree so we just break out of the loop.
- */
- if (rc == -1)
- break;
- /* The names are not equal, continue the search. */
- if (rc)
- continue;
- /*
- * Names match with case insensitive comparison, now try the
- * case sensitive comparison, which is required for proper
- * collation.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- CASE_SENSITIVE, vol->upcase, vol->upcase_len);
- if (rc == -1)
- break;
- if (rc)
- continue;
- /*
- * Perfect match, this will never happen as the
- * ntfs_are_names_equal() call will have gotten a match but we
- * still treat it correctly.
- */
- goto found_it2;
- }
- /*
- * We have finished with this index buffer without success. Check for
- * the presence of a child node.
- */
- if (ie->flags & INDEX_ENTRY_NODE) {
- if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
- ntfs_error(sb, "Index entry with child node found in "
- "a leaf node in directory inode 0x%lx.",
- dir_ni->mft_no);
- goto unm_err_out;
- }
- /* Child node present, descend into it. */
- old_vcn = vcn;
- vcn = sle64_to_cpup((sle64*)((u8*)ie +
- le16_to_cpu(ie->length) - 8));
- if (vcn >= 0) {
- /* If vcn is in the same page cache page as old_vcn we
- * recycle the mapped page. */
- if (old_vcn << vol->cluster_size_bits >>
- PAGE_SHIFT == vcn <<
- vol->cluster_size_bits >>
- PAGE_SHIFT)
- goto fast_descend_into_child_node;
- unlock_page(page);
- ntfs_unmap_page(page);
- goto descend_into_child_node;
- }
- ntfs_error(sb, "Negative child node vcn in directory inode "
- "0x%lx.", dir_ni->mft_no);
- goto unm_err_out;
- }
- /*
- * No child node present, return -ENOENT, unless we have got a matching
- * name cached in name in which case return the mft reference
- * associated with it.
- */
- if (name) {
- unlock_page(page);
- ntfs_unmap_page(page);
- return name->mref;
- }
- ntfs_debug("Entry not found.");
- err = -ENOENT;
-unm_err_out:
- unlock_page(page);
- ntfs_unmap_page(page);
-err_out:
- if (!err)
- err = -EIO;
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(dir_ni);
- if (name) {
- kfree(name);
- *res = NULL;
- }
- return ERR_MREF(err);
-dir_err_out:
- ntfs_error(sb, "Corrupt directory. Aborting lookup.");
- goto err_out;
-}
-
-#if 0
-
-// TODO: (AIA)
-// The algorithm embedded in this code will be required for the time when we
-// want to support adding of entries to directories, where we require correct
-// collation of file names in order not to cause corruption of the filesystem.
-
-/**
- * ntfs_lookup_inode_by_name - find an inode in a directory given its name
- * @dir_ni: ntfs inode of the directory in which to search for the name
- * @uname: Unicode name for which to search in the directory
- * @uname_len: length of the name @uname in Unicode characters
- *
- * Look for an inode with name @uname in the directory with inode @dir_ni.
- * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
- * the Unicode name. If the name is found in the directory, the corresponding
- * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
- * is a 64-bit number containing the sequence number.
- *
- * On error, a negative value is returned corresponding to the error code. In
- * particular if the inode is not found -ENOENT is returned. Note that you
- * can't just check the return value for being negative, you have to check the
- * inode number for being negative which you can extract using MREC(return
- * value).
- *
- * Note, @uname_len does not include the (optional) terminating NULL character.
- */
-u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
- const int uname_len)
-{
- ntfs_volume *vol = dir_ni->vol;
- struct super_block *sb = vol->sb;
- MFT_RECORD *m;
- INDEX_ROOT *ir;
- INDEX_ENTRY *ie;
- INDEX_ALLOCATION *ia;
- u8 *index_end;
- u64 mref;
- ntfs_attr_search_ctx *ctx;
- int err, rc;
- IGNORE_CASE_BOOL ic;
- VCN vcn, old_vcn;
- struct address_space *ia_mapping;
- struct page *page;
- u8 *kaddr;
-
- /* Get hold of the mft record for the directory. */
- m = map_mft_record(dir_ni);
- if (IS_ERR(m)) {
- ntfs_error(sb, "map_mft_record() failed with error code %ld.",
- -PTR_ERR(m));
- return ERR_MREF(PTR_ERR(m));
- }
- ctx = ntfs_attr_get_search_ctx(dir_ni, m);
- if (!ctx) {
- err = -ENOMEM;
- goto err_out;
- }
- /* Find the index root attribute in the mft record. */
- err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
- 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT) {
- ntfs_error(sb, "Index root attribute missing in "
- "directory inode 0x%lx.",
- dir_ni->mft_no);
- err = -EIO;
- }
- goto err_out;
- }
- /* Get to the index root value (it's been verified in read_inode). */
- ir = (INDEX_ROOT*)((u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset));
- index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
- /* The first index entry. */
- ie = (INDEX_ENTRY*)((u8*)&ir->index +
- le32_to_cpu(ir->index.entries_offset));
- /*
- * Loop until we exceed valid memory (corruption case) or until we
- * reach the last entry.
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- /* Bounds checks. */
- if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->key_length) >
- index_end)
- goto dir_err_out;
- /*
- * The last entry cannot contain a name. It can however contain
- * a pointer to a child node in the B+tree so we just break out.
- */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /*
- * If the current entry has a name type of POSIX, the name is
- * case sensitive and not otherwise. This has the effect of us
- * not being able to access any POSIX file names which collate
- * after the non-POSIX one when they only differ in case, but
- * anyone doing screwy stuff like that deserves to burn in
- * hell... Doing that kind of stuff on NT4 actually causes
- * corruption on the partition even when using SP6a and Linux
- * is not involved at all.
- */
- ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
- CASE_SENSITIVE;
- /*
- * If the names match perfectly, we are done and return the
- * mft reference of the inode (i.e. the inode number together
- * with the sequence number for consistency checking. We
- * convert it to cpu format before returning.
- */
- if (ntfs_are_names_equal(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, ic,
- vol->upcase, vol->upcase_len)) {
-found_it:
- mref = le64_to_cpu(ie->data.dir.indexed_file);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(dir_ni);
- return mref;
- }
- /*
- * Not a perfect match, need to do full blown collation so we
- * know which way in the B+tree we have to go.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- IGNORE_CASE, vol->upcase, vol->upcase_len);
- /*
- * If uname collates before the name of the current entry, there
- * is definitely no such name in this index but we might need to
- * descend into the B+tree so we just break out of the loop.
- */
- if (rc == -1)
- break;
- /* The names are not equal, continue the search. */
- if (rc)
- continue;
- /*
- * Names match with case insensitive comparison, now try the
- * case sensitive comparison, which is required for proper
- * collation.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- CASE_SENSITIVE, vol->upcase, vol->upcase_len);
- if (rc == -1)
- break;
- if (rc)
- continue;
- /*
- * Perfect match, this will never happen as the
- * ntfs_are_names_equal() call will have gotten a match but we
- * still treat it correctly.
- */
- goto found_it;
- }
- /*
- * We have finished with this index without success. Check for the
- * presence of a child node.
- */
- if (!(ie->flags & INDEX_ENTRY_NODE)) {
- /* No child node, return -ENOENT. */
- err = -ENOENT;
- goto err_out;
- } /* Child node present, descend into it. */
- /* Consistency check: Verify that an index allocation exists. */
- if (!NInoIndexAllocPresent(dir_ni)) {
- ntfs_error(sb, "No index allocation attribute but index entry "
- "requires one. Directory inode 0x%lx is "
- "corrupt or driver bug.", dir_ni->mft_no);
- goto err_out;
- }
- /* Get the starting vcn of the index_block holding the child node. */
- vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
- ia_mapping = VFS_I(dir_ni)->i_mapping;
- /*
- * We are done with the index root and the mft record. Release them,
- * otherwise we deadlock with ntfs_map_page().
- */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(dir_ni);
- m = NULL;
- ctx = NULL;
-descend_into_child_node:
- /*
- * Convert vcn to index into the index allocation attribute in units
- * of PAGE_SIZE and map the page cache page, reading it from
- * disk if necessary.
- */
- page = ntfs_map_page(ia_mapping, vcn <<
- dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
- if (IS_ERR(page)) {
- ntfs_error(sb, "Failed to map directory index page, error %ld.",
- -PTR_ERR(page));
- err = PTR_ERR(page);
- goto err_out;
- }
- lock_page(page);
- kaddr = (u8*)page_address(page);
-fast_descend_into_child_node:
- /* Get to the index allocation block. */
- ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
- dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
- /* Bounds checks. */
- if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
- ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
- "inode 0x%lx or driver bug.", dir_ni->mft_no);
- goto unm_err_out;
- }
- /* Catch multi sector transfer fixup errors. */
- if (unlikely(!ntfs_is_indx_record(ia->magic))) {
- ntfs_error(sb, "Directory index record with vcn 0x%llx is "
- "corrupt. Corrupt inode 0x%lx. Run chkdsk.",
- (unsigned long long)vcn, dir_ni->mft_no);
- goto unm_err_out;
- }
- if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
- ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
- "different from expected VCN (0x%llx). "
- "Directory inode 0x%lx is corrupt or driver "
- "bug.", (unsigned long long)
- sle64_to_cpu(ia->index_block_vcn),
- (unsigned long long)vcn, dir_ni->mft_no);
- goto unm_err_out;
- }
- if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
- dir_ni->itype.index.block_size) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
- "0x%lx has a size (%u) differing from the "
- "directory specified size (%u). Directory "
- "inode is corrupt or driver bug.",
- (unsigned long long)vcn, dir_ni->mft_no,
- le32_to_cpu(ia->index.allocated_size) + 0x18,
- dir_ni->itype.index.block_size);
- goto unm_err_out;
- }
- index_end = (u8*)ia + dir_ni->itype.index.block_size;
- if (index_end > kaddr + PAGE_SIZE) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
- "0x%lx crosses page boundary. Impossible! "
- "Cannot access! This is probably a bug in the "
- "driver.", (unsigned long long)vcn,
- dir_ni->mft_no);
- goto unm_err_out;
- }
- index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
- if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
- ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
- "inode 0x%lx exceeds maximum size.",
- (unsigned long long)vcn, dir_ni->mft_no);
- goto unm_err_out;
- }
- /* The first index entry. */
- ie = (INDEX_ENTRY*)((u8*)&ia->index +
- le32_to_cpu(ia->index.entries_offset));
- /*
- * Iterate similar to above big loop but applied to index buffer, thus
- * loop until we exceed valid memory (corruption case) or until we
- * reach the last entry.
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- /* Bounds check. */
- if ((u8*)ie < (u8*)ia || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->key_length) >
- index_end) {
- ntfs_error(sb, "Index entry out of bounds in "
- "directory inode 0x%lx.",
- dir_ni->mft_no);
- goto unm_err_out;
- }
- /*
- * The last entry cannot contain a name. It can however contain
- * a pointer to a child node in the B+tree so we just break out.
- */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /*
- * If the current entry has a name type of POSIX, the name is
- * case sensitive and not otherwise. This has the effect of us
- * not being able to access any POSIX file names which collate
- * after the non-POSIX one when they only differ in case, but
- * anyone doing screwy stuff like that deserves to burn in
- * hell... Doing that kind of stuff on NT4 actually causes
- * corruption on the partition even when using SP6a and Linux
- * is not involved at all.
- */
- ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
- CASE_SENSITIVE;
- /*
- * If the names match perfectly, we are done and return the
- * mft reference of the inode (i.e. the inode number together
- * with the sequence number for consistency checking. We
- * convert it to cpu format before returning.
- */
- if (ntfs_are_names_equal(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, ic,
- vol->upcase, vol->upcase_len)) {
-found_it2:
- mref = le64_to_cpu(ie->data.dir.indexed_file);
- unlock_page(page);
- ntfs_unmap_page(page);
- return mref;
- }
- /*
- * Not a perfect match, need to do full blown collation so we
- * know which way in the B+tree we have to go.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- IGNORE_CASE, vol->upcase, vol->upcase_len);
- /*
- * If uname collates before the name of the current entry, there
- * is definitely no such name in this index but we might need to
- * descend into the B+tree so we just break out of the loop.
- */
- if (rc == -1)
- break;
- /* The names are not equal, continue the search. */
- if (rc)
- continue;
- /*
- * Names match with case insensitive comparison, now try the
- * case sensitive comparison, which is required for proper
- * collation.
- */
- rc = ntfs_collate_names(uname, uname_len,
- (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, 1,
- CASE_SENSITIVE, vol->upcase, vol->upcase_len);
- if (rc == -1)
- break;
- if (rc)
- continue;
- /*
- * Perfect match, this will never happen as the
- * ntfs_are_names_equal() call will have gotten a match but we
- * still treat it correctly.
- */
- goto found_it2;
- }
- /*
- * We have finished with this index buffer without success. Check for
- * the presence of a child node.
- */
- if (ie->flags & INDEX_ENTRY_NODE) {
- if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
- ntfs_error(sb, "Index entry with child node found in "
- "a leaf node in directory inode 0x%lx.",
- dir_ni->mft_no);
- goto unm_err_out;
- }
- /* Child node present, descend into it. */
- old_vcn = vcn;
- vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
- if (vcn >= 0) {
- /* If vcn is in the same page cache page as old_vcn we
- * recycle the mapped page. */
- if (old_vcn << vol->cluster_size_bits >>
- PAGE_SHIFT == vcn <<
- vol->cluster_size_bits >>
- PAGE_SHIFT)
- goto fast_descend_into_child_node;
- unlock_page(page);
- ntfs_unmap_page(page);
- goto descend_into_child_node;
- }
- ntfs_error(sb, "Negative child node vcn in directory inode "
- "0x%lx.", dir_ni->mft_no);
- goto unm_err_out;
- }
- /* No child node, return -ENOENT. */
- ntfs_debug("Entry not found.");
- err = -ENOENT;
-unm_err_out:
- unlock_page(page);
- ntfs_unmap_page(page);
-err_out:
- if (!err)
- err = -EIO;
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(dir_ni);
- return ERR_MREF(err);
-dir_err_out:
- ntfs_error(sb, "Corrupt directory. Aborting lookup.");
- goto err_out;
-}
-
-#endif
-
-/**
- * ntfs_filldir - ntfs specific filldir method
- * @vol: current ntfs volume
- * @ndir: ntfs inode of current directory
- * @ia_page: page in which the index allocation buffer @ie is in resides
- * @ie: current index entry
- * @name: buffer to use for the converted name
- * @actor: what to feed the entries to
- *
- * Convert the Unicode @name to the loaded NLS and pass it to the @filldir
- * callback.
- *
- * If @ia_page is not NULL it is the locked page containing the index
- * allocation block containing the index entry @ie.
- *
- * Note, we drop (and then reacquire) the page lock on @ia_page across the
- * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup
- * since ntfs_lookup() will lock the same page. As an optimization, we do not
- * retake the lock if we are returning a non-zero value as ntfs_readdir()
- * would need to drop the lock immediately anyway.
- */
-static inline int ntfs_filldir(ntfs_volume *vol,
- ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
- u8 *name, struct dir_context *actor)
-{
- unsigned long mref;
- int name_len;
- unsigned dt_type;
- FILE_NAME_TYPE_FLAGS name_type;
-
- name_type = ie->key.file_name.file_name_type;
- if (name_type == FILE_NAME_DOS) {
- ntfs_debug("Skipping DOS name space entry.");
- return 0;
- }
- if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) {
- ntfs_debug("Skipping root directory self reference entry.");
- return 0;
- }
- if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user &&
- !NVolShowSystemFiles(vol)) {
- ntfs_debug("Skipping system file.");
- return 0;
- }
- name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name,
- ie->key.file_name.file_name_length, &name,
- NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1);
- if (name_len <= 0) {
- ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.",
- (long long)MREF_LE(ie->data.dir.indexed_file));
- return 0;
- }
- if (ie->key.file_name.file_attributes &
- FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT)
- dt_type = DT_DIR;
- else
- dt_type = DT_REG;
- mref = MREF_LE(ie->data.dir.indexed_file);
- /*
- * Drop the page lock otherwise we deadlock with NFS when it calls
- * ->lookup since ntfs_lookup() will lock the same page.
- */
- if (ia_page)
- unlock_page(ia_page);
- ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
- "0x%lx, DT_%s.", name, name_len, actor->pos, mref,
- dt_type == DT_DIR ? "DIR" : "REG");
- if (!dir_emit(actor, name, name_len, mref, dt_type))
- return 1;
- /* Relock the page but not if we are aborting ->readdir. */
- if (ia_page)
- lock_page(ia_page);
- return 0;
-}
-
-/*
- * We use the same basic approach as the old NTFS driver, i.e. we parse the
- * index root entries and then the index allocation entries that are marked
- * as in use in the index bitmap.
- *
- * While this will return the names in random order this doesn't matter for
- * ->readdir but OTOH results in a faster ->readdir.
- *
- * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS
- * parts (e.g. ->f_pos and ->i_size, and it also protects against directory
- * modifications).
- *
- * Locking: - Caller must hold i_mutex on the directory.
- * - Each page cache page in the index allocation mapping must be
- * locked whilst being accessed otherwise we may find a corrupt
- * page due to it being under ->writepage at the moment which
- * applies the mst protection fixups before writing out and then
- * removes them again after the write is complete after which it
- * unlocks the page.
- */
-static int ntfs_readdir(struct file *file, struct dir_context *actor)
-{
- s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
- loff_t i_size;
- struct inode *bmp_vi, *vdir = file_inode(file);
- struct super_block *sb = vdir->i_sb;
- ntfs_inode *ndir = NTFS_I(vdir);
- ntfs_volume *vol = NTFS_SB(sb);
- MFT_RECORD *m;
- INDEX_ROOT *ir = NULL;
- INDEX_ENTRY *ie;
- INDEX_ALLOCATION *ia;
- u8 *name = NULL;
- int rc, err, ir_pos, cur_bmp_pos;
- struct address_space *ia_mapping, *bmp_mapping;
- struct page *bmp_page = NULL, *ia_page = NULL;
- u8 *kaddr, *bmp, *index_end;
- ntfs_attr_search_ctx *ctx;
-
- ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
- vdir->i_ino, actor->pos);
- rc = err = 0;
- /* Are we at end of dir yet? */
- i_size = i_size_read(vdir);
- if (actor->pos >= i_size + vol->mft_record_size)
- return 0;
- /* Emulate . and .. for all directories. */
- if (!dir_emit_dots(file, actor))
- return 0;
- m = NULL;
- ctx = NULL;
- /*
- * Allocate a buffer to store the current name being processed
- * converted to format determined by current NLS.
- */
- name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS);
- if (unlikely(!name)) {
- err = -ENOMEM;
- goto err_out;
- }
- /* Are we jumping straight into the index allocation attribute? */
- if (actor->pos >= vol->mft_record_size)
- goto skip_index_root;
- /* Get hold of the mft record for the directory. */
- m = map_mft_record(ndir);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(ndir, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- /* Get the offset into the index root attribute. */
- ir_pos = (s64)actor->pos;
- /* Find the index root attribute in the mft record. */
- err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
- 0, ctx);
- if (unlikely(err)) {
- ntfs_error(sb, "Index root attribute missing in directory "
- "inode 0x%lx.", vdir->i_ino);
- goto err_out;
- }
- /*
- * Copy the index root attribute value to a buffer so that we can put
- * the search context and unmap the mft record before calling the
- * filldir() callback. We need to do this because of NFSd which calls
- * ->lookup() from its filldir callback() and this causes NTFS to
- * deadlock as ntfs_lookup() maps the mft record of the directory and
- * we have got it mapped here already. The only solution is for us to
- * unmap the mft record here so that a call to ntfs_lookup() is able to
- * map the mft record without deadlocking.
- */
- rc = le32_to_cpu(ctx->attr->data.resident.value_length);
- ir = kmalloc(rc, GFP_NOFS);
- if (unlikely(!ir)) {
- err = -ENOMEM;
- goto err_out;
- }
- /* Copy the index root value (it has been verified in read_inode). */
- memcpy(ir, (u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset), rc);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ndir);
- ctx = NULL;
- m = NULL;
- index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
- /* The first index entry. */
- ie = (INDEX_ENTRY*)((u8*)&ir->index +
- le32_to_cpu(ir->index.entries_offset));
- /*
- * Loop until we exceed valid memory (corruption case) or until we
- * reach the last entry or until filldir tells us it has had enough
- * or signals an error (both covered by the rc test).
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir);
- /* Bounds checks. */
- if (unlikely((u8*)ie < (u8*)ir || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->key_length) >
- index_end))
- goto err_out;
- /* The last entry cannot contain a name. */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /* Skip index root entry if continuing previous readdir. */
- if (ir_pos > (u8*)ie - (u8*)ir)
- continue;
- /* Advance the position even if going to skip the entry. */
- actor->pos = (u8*)ie - (u8*)ir;
- /* Submit the name to the filldir callback. */
- rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor);
- if (rc) {
- kfree(ir);
- goto abort;
- }
- }
- /* We are done with the index root and can free the buffer. */
- kfree(ir);
- ir = NULL;
- /* If there is no index allocation attribute we are finished. */
- if (!NInoIndexAllocPresent(ndir))
- goto EOD;
- /* Advance fpos to the beginning of the index allocation. */
- actor->pos = vol->mft_record_size;
-skip_index_root:
- kaddr = NULL;
- prev_ia_pos = -1LL;
- /* Get the offset into the index allocation attribute. */
- ia_pos = (s64)actor->pos - vol->mft_record_size;
- ia_mapping = vdir->i_mapping;
- ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino);
- bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
- if (IS_ERR(bmp_vi)) {
- ntfs_error(sb, "Failed to get bitmap attribute.");
- err = PTR_ERR(bmp_vi);
- goto err_out;
- }
- bmp_mapping = bmp_vi->i_mapping;
- /* Get the starting bitmap bit position and sanity check it. */
- bmp_pos = ia_pos >> ndir->itype.index.block_size_bits;
- if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) {
- ntfs_error(sb, "Current index allocation position exceeds "
- "index bitmap size.");
- goto iput_err_out;
- }
- /* Get the starting bit position in the current bitmap page. */
- cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1);
- bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1);
-get_next_bmp_page:
- ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx",
- (unsigned long long)bmp_pos >> (3 + PAGE_SHIFT),
- (unsigned long long)bmp_pos &
- (unsigned long long)((PAGE_SIZE * 8) - 1));
- bmp_page = ntfs_map_page(bmp_mapping,
- bmp_pos >> (3 + PAGE_SHIFT));
- if (IS_ERR(bmp_page)) {
- ntfs_error(sb, "Reading index bitmap failed.");
- err = PTR_ERR(bmp_page);
- bmp_page = NULL;
- goto iput_err_out;
- }
- bmp = (u8*)page_address(bmp_page);
- /* Find next index block in use. */
- while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) {
-find_next_index_buffer:
- cur_bmp_pos++;
- /*
- * If we have reached the end of the bitmap page, get the next
- * page, and put away the old one.
- */
- if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) {
- ntfs_unmap_page(bmp_page);
- bmp_pos += PAGE_SIZE * 8;
- cur_bmp_pos = 0;
- goto get_next_bmp_page;
- }
- /* If we have reached the end of the bitmap, we are done. */
- if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size))
- goto unm_EOD;
- ia_pos = (bmp_pos + cur_bmp_pos) <<
- ndir->itype.index.block_size_bits;
- }
- ntfs_debug("Handling index buffer 0x%llx.",
- (unsigned long long)bmp_pos + cur_bmp_pos);
- /* If the current index buffer is in the same page we reuse the page. */
- if ((prev_ia_pos & (s64)PAGE_MASK) !=
- (ia_pos & (s64)PAGE_MASK)) {
- prev_ia_pos = ia_pos;
- if (likely(ia_page != NULL)) {
- unlock_page(ia_page);
- ntfs_unmap_page(ia_page);
- }
- /*
- * Map the page cache page containing the current ia_pos,
- * reading it from disk if necessary.
- */
- ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT);
- if (IS_ERR(ia_page)) {
- ntfs_error(sb, "Reading index allocation data failed.");
- err = PTR_ERR(ia_page);
- ia_page = NULL;
- goto err_out;
- }
- lock_page(ia_page);
- kaddr = (u8*)page_address(ia_page);
- }
- /* Get the current index buffer. */
- ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK &
- ~(s64)(ndir->itype.index.block_size - 1)));
- /* Bounds checks. */
- if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) {
- ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
- "inode 0x%lx or driver bug.", vdir->i_ino);
- goto err_out;
- }
- /* Catch multi sector transfer fixup errors. */
- if (unlikely(!ntfs_is_indx_record(ia->magic))) {
- ntfs_error(sb, "Directory index record with vcn 0x%llx is "
- "corrupt. Corrupt inode 0x%lx. Run chkdsk.",
- (unsigned long long)ia_pos >>
- ndir->itype.index.vcn_size_bits, vdir->i_ino);
- goto err_out;
- }
- if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos &
- ~(s64)(ndir->itype.index.block_size - 1)) >>
- ndir->itype.index.vcn_size_bits)) {
- ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
- "different from expected VCN (0x%llx). "
- "Directory inode 0x%lx is corrupt or driver "
- "bug. ", (unsigned long long)
- sle64_to_cpu(ia->index_block_vcn),
- (unsigned long long)ia_pos >>
- ndir->itype.index.vcn_size_bits, vdir->i_ino);
- goto err_out;
- }
- if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 !=
- ndir->itype.index.block_size)) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
- "0x%lx has a size (%u) differing from the "
- "directory specified size (%u). Directory "
- "inode is corrupt or driver bug.",
- (unsigned long long)ia_pos >>
- ndir->itype.index.vcn_size_bits, vdir->i_ino,
- le32_to_cpu(ia->index.allocated_size) + 0x18,
- ndir->itype.index.block_size);
- goto err_out;
- }
- index_end = (u8*)ia + ndir->itype.index.block_size;
- if (unlikely(index_end > kaddr + PAGE_SIZE)) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
- "0x%lx crosses page boundary. Impossible! "
- "Cannot access! This is probably a bug in the "
- "driver.", (unsigned long long)ia_pos >>
- ndir->itype.index.vcn_size_bits, vdir->i_ino);
- goto err_out;
- }
- ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1);
- index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
- if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) {
- ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
- "inode 0x%lx exceeds maximum size.",
- (unsigned long long)ia_pos >>
- ndir->itype.index.vcn_size_bits, vdir->i_ino);
- goto err_out;
- }
- /* The first index entry in this index buffer. */
- ie = (INDEX_ENTRY*)((u8*)&ia->index +
- le32_to_cpu(ia->index.entries_offset));
- /*
- * Loop until we exceed valid memory (corruption case) or until we
- * reach the last entry or until filldir tells us it has had enough
- * or signals an error (both covered by the rc test).
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- ntfs_debug("In index allocation, offset 0x%llx.",
- (unsigned long long)ia_start +
- (unsigned long long)((u8*)ie - (u8*)ia));
- /* Bounds checks. */
- if (unlikely((u8*)ie < (u8*)ia || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->key_length) >
- index_end))
- goto err_out;
- /* The last entry cannot contain a name. */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /* Skip index block entry if continuing previous readdir. */
- if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
- continue;
- /* Advance the position even if going to skip the entry. */
- actor->pos = (u8*)ie - (u8*)ia +
- (sle64_to_cpu(ia->index_block_vcn) <<
- ndir->itype.index.vcn_size_bits) +
- vol->mft_record_size;
- /*
- * Submit the name to the @filldir callback. Note,
- * ntfs_filldir() drops the lock on @ia_page but it retakes it
- * before returning, unless a non-zero value is returned in
- * which case the page is left unlocked.
- */
- rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor);
- if (rc) {
- /* @ia_page is already unlocked in this case. */
- ntfs_unmap_page(ia_page);
- ntfs_unmap_page(bmp_page);
- iput(bmp_vi);
- goto abort;
- }
- }
- goto find_next_index_buffer;
-unm_EOD:
- if (ia_page) {
- unlock_page(ia_page);
- ntfs_unmap_page(ia_page);
- }
- ntfs_unmap_page(bmp_page);
- iput(bmp_vi);
-EOD:
- /* We are finished, set fpos to EOD. */
- actor->pos = i_size + vol->mft_record_size;
-abort:
- kfree(name);
- return 0;
-err_out:
- if (bmp_page) {
- ntfs_unmap_page(bmp_page);
-iput_err_out:
- iput(bmp_vi);
- }
- if (ia_page) {
- unlock_page(ia_page);
- ntfs_unmap_page(ia_page);
- }
- kfree(ir);
- kfree(name);
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(ndir);
- if (!err)
- err = -EIO;
- ntfs_debug("Failed. Returning error code %i.", -err);
- return err;
-}
-
-/**
- * ntfs_dir_open - called when an inode is about to be opened
- * @vi: inode to be opened
- * @filp: file structure describing the inode
- *
- * Limit directory size to the page cache limit on architectures where unsigned
- * long is 32-bits. This is the most we can do for now without overflowing the
- * page cache page index. Doing it this way means we don't run into problems
- * because of existing too large directories. It would be better to allow the
- * user to read the accessible part of the directory but I doubt very much
- * anyone is going to hit this check on a 32-bit architecture, so there is no
- * point in adding the extra complexity required to support this.
- *
- * On 64-bit architectures, the check is hopefully optimized away by the
- * compiler.
- */
-static int ntfs_dir_open(struct inode *vi, struct file *filp)
-{
- if (sizeof(unsigned long) < 8) {
- if (i_size_read(vi) > MAX_LFS_FILESIZE)
- return -EFBIG;
- }
- return 0;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_dir_fsync - sync a directory to disk
- * @filp: directory to be synced
- * @start: offset in bytes of the beginning of data range to sync
- * @end: offset in bytes of the end of data range (inclusive)
- * @datasync: if non-zero only flush user data and not metadata
- *
- * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and
- * msync system calls. This function is based on file.c::ntfs_file_fsync().
- *
- * Write the mft record and all associated extent mft records as well as the
- * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device.
- *
- * If @datasync is true, we do not wait on the inode(s) to be written out
- * but we always wait on the page cache pages to be written out.
- *
- * Note: In the past @filp could be NULL so we ignore it as we don't need it
- * anyway.
- *
- * Locking: Caller must hold i_mutex on the inode.
- *
- * TODO: We should probably also write all attribute/index inodes associated
- * with this inode but since we have no simple way of getting to them we ignore
- * this problem for now. We do write the $BITMAP attribute if it is present
- * which is the important one for a directory so things are not too bad.
- */
-static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
- int datasync)
-{
- struct inode *bmp_vi, *vi = filp->f_mapping->host;
- int err, ret;
- ntfs_attr na;
-
- ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
-
- err = file_write_and_wait_range(filp, start, end);
- if (err)
- return err;
- inode_lock(vi);
-
- BUG_ON(!S_ISDIR(vi->i_mode));
- /* If the bitmap attribute inode is in memory sync it, too. */
- na.mft_no = vi->i_ino;
- na.type = AT_BITMAP;
- na.name = I30;
- na.name_len = 4;
- bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na);
- if (bmp_vi) {
- write_inode_now(bmp_vi, !datasync);
- iput(bmp_vi);
- }
- ret = __ntfs_write_inode(vi, 1);
- write_inode_now(vi, !datasync);
- err = sync_blockdev(vi->i_sb->s_bdev);
- if (unlikely(err && !ret))
- ret = err;
- if (likely(!ret))
- ntfs_debug("Done.");
- else
- ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
- "%u.", datasync ? "data" : "", vi->i_ino, -ret);
- inode_unlock(vi);
- return ret;
-}
-
-#endif /* NTFS_RW */
-
-WRAP_DIR_ITER(ntfs_readdir) // FIXME!
-const struct file_operations ntfs_dir_ops = {
- .llseek = generic_file_llseek, /* Seek inside directory. */
- .read = generic_read_dir, /* Return -EISDIR. */
- .iterate_shared = shared_ntfs_readdir, /* Read directory contents. */
-#ifdef NTFS_RW
- .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */
-#endif /* NTFS_RW */
- /*.ioctl = ,*/ /* Perform function on the
- mounted filesystem. */
- .open = ntfs_dir_open, /* Open directory. */
-};
diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h
deleted file mode 100644
index 0e326753df40..000000000000
--- a/fs/ntfs/dir.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of
- * the Linux-NTFS project.
- *
- * Copyright (c) 2002-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_DIR_H
-#define _LINUX_NTFS_DIR_H
-
-#include "layout.h"
-#include "inode.h"
-#include "types.h"
-
-/*
- * ntfs_name is used to return the file name to the caller of
- * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup())
- * to be able to deal with dcache aliasing issues.
- */
-typedef struct {
- MFT_REF mref;
- FILE_NAME_TYPE_FLAGS type;
- u8 len;
- ntfschar name[0];
-} __attribute__ ((__packed__)) ntfs_name;
-
-/* The little endian Unicode string $I30 as a global constant. */
-extern ntfschar I30[5];
-
-extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni,
- const ntfschar *uname, const int uname_len, ntfs_name **res);
-
-#endif /* _LINUX_NTFS_FS_DIR_H */
diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h
deleted file mode 100644
index f30c139bf9ae..000000000000
--- a/fs/ntfs/endian.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * endian.h - Defines for endianness handling in NTFS Linux kernel driver.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_ENDIAN_H
-#define _LINUX_NTFS_ENDIAN_H
-
-#include <asm/byteorder.h>
-#include "types.h"
-
-/*
- * Signed endianness conversion functions.
- */
-
-static inline s16 sle16_to_cpu(sle16 x)
-{
- return le16_to_cpu((__force le16)x);
-}
-
-static inline s32 sle32_to_cpu(sle32 x)
-{
- return le32_to_cpu((__force le32)x);
-}
-
-static inline s64 sle64_to_cpu(sle64 x)
-{
- return le64_to_cpu((__force le64)x);
-}
-
-static inline s16 sle16_to_cpup(sle16 *x)
-{
- return le16_to_cpu(*(__force le16*)x);
-}
-
-static inline s32 sle32_to_cpup(sle32 *x)
-{
- return le32_to_cpu(*(__force le32*)x);
-}
-
-static inline s64 sle64_to_cpup(sle64 *x)
-{
- return le64_to_cpu(*(__force le64*)x);
-}
-
-static inline sle16 cpu_to_sle16(s16 x)
-{
- return (__force sle16)cpu_to_le16(x);
-}
-
-static inline sle32 cpu_to_sle32(s32 x)
-{
- return (__force sle32)cpu_to_le32(x);
-}
-
-static inline sle64 cpu_to_sle64(s64 x)
-{
- return (__force sle64)cpu_to_le64(x);
-}
-
-static inline sle16 cpu_to_sle16p(s16 *x)
-{
- return (__force sle16)cpu_to_le16(*x);
-}
-
-static inline sle32 cpu_to_sle32p(s32 *x)
-{
- return (__force sle32)cpu_to_le32(*x);
-}
-
-static inline sle64 cpu_to_sle64p(s64 *x)
-{
- return (__force sle64)cpu_to_le64(*x);
-}
-
-#endif /* _LINUX_NTFS_ENDIAN_H */
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
deleted file mode 100644
index 297c0b9db621..000000000000
--- a/fs/ntfs/file.c
+++ /dev/null
@@ -1,1997 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
- */
-
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/pagevec.h>
-#include <linux/sched/signal.h>
-#include <linux/swap.h>
-#include <linux/uio.h>
-#include <linux/writeback.h>
-
-#include <asm/page.h>
-#include <linux/uaccess.h>
-
-#include "attrib.h"
-#include "bitmap.h"
-#include "inode.h"
-#include "debug.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "ntfs.h"
-
-/**
- * ntfs_file_open - called when an inode is about to be opened
- * @vi: inode to be opened
- * @filp: file structure describing the inode
- *
- * Limit file size to the page cache limit on architectures where unsigned long
- * is 32-bits. This is the most we can do for now without overflowing the page
- * cache page index. Doing it this way means we don't run into problems because
- * of existing too large files. It would be better to allow the user to read
- * the beginning of the file but I doubt very much anyone is going to hit this
- * check on a 32-bit architecture, so there is no point in adding the extra
- * complexity required to support this.
- *
- * On 64-bit architectures, the check is hopefully optimized away by the
- * compiler.
- *
- * After the check passes, just call generic_file_open() to do its work.
- */
-static int ntfs_file_open(struct inode *vi, struct file *filp)
-{
- if (sizeof(unsigned long) < 8) {
- if (i_size_read(vi) > MAX_LFS_FILESIZE)
- return -EOVERFLOW;
- }
- return generic_file_open(vi, filp);
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_attr_extend_initialized - extend the initialized size of an attribute
- * @ni: ntfs inode of the attribute to extend
- * @new_init_size: requested new initialized size in bytes
- *
- * Extend the initialized size of an attribute described by the ntfs inode @ni
- * to @new_init_size bytes. This involves zeroing any non-sparse space between
- * the old initialized size and @new_init_size both in the page cache and on
- * disk (if relevant complete pages are already uptodate in the page cache then
- * these are simply marked dirty).
- *
- * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
- * in the resident attribute case, it is tied to the initialized size and, in
- * the non-resident attribute case, it may not fall below the initialized size.
- *
- * Note that if the attribute is resident, we do not need to touch the page
- * cache at all. This is because if the page cache page is not uptodate we
- * bring it uptodate later, when doing the write to the mft record since we
- * then already have the page mapped. And if the page is uptodate, the
- * non-initialized region will already have been zeroed when the page was
- * brought uptodate and the region may in fact already have been overwritten
- * with new data via mmap() based writes, so we cannot just zero it. And since
- * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
- * is unspecified, we choose not to do zeroing and thus we do not need to touch
- * the page at all. For a more detailed explanation see ntfs_truncate() in
- * fs/ntfs/inode.c.
- *
- * Return 0 on success and -errno on error. In the case that an error is
- * encountered it is possible that the initialized size will already have been
- * incremented some way towards @new_init_size but it is guaranteed that if
- * this is the case, the necessary zeroing will also have happened and that all
- * metadata is self-consistent.
- *
- * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
- * held by the caller.
- */
-static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
-{
- s64 old_init_size;
- loff_t old_i_size;
- pgoff_t index, end_index;
- unsigned long flags;
- struct inode *vi = VFS_I(ni);
- ntfs_inode *base_ni;
- MFT_RECORD *m = NULL;
- ATTR_RECORD *a;
- ntfs_attr_search_ctx *ctx = NULL;
- struct address_space *mapping;
- struct page *page = NULL;
- u8 *kattr;
- int err;
- u32 attr_len;
-
- read_lock_irqsave(&ni->size_lock, flags);
- old_init_size = ni->initialized_size;
- old_i_size = i_size_read(vi);
- BUG_ON(new_init_size > ni->allocated_size);
- read_unlock_irqrestore(&ni->size_lock, flags);
- ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
- "old_initialized_size 0x%llx, "
- "new_initialized_size 0x%llx, i_size 0x%llx.",
- vi->i_ino, (unsigned)le32_to_cpu(ni->type),
- (unsigned long long)old_init_size,
- (unsigned long long)new_init_size, old_i_size);
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- /* Use goto to reduce indentation and we need the label below anyway. */
- if (NInoNonResident(ni))
- goto do_non_resident_extend;
- BUG_ON(old_init_size != old_i_size);
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- m = ctx->mrec;
- a = ctx->attr;
- BUG_ON(a->non_resident);
- /* The total length of the attribute value. */
- attr_len = le32_to_cpu(a->data.resident.value_length);
- BUG_ON(old_i_size != (loff_t)attr_len);
- /*
- * Do the zeroing in the mft record and update the attribute size in
- * the mft record.
- */
- kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
- memset(kattr + attr_len, 0, new_init_size - attr_len);
- a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
- /* Finally, update the sizes in the vfs and ntfs inodes. */
- write_lock_irqsave(&ni->size_lock, flags);
- i_size_write(vi, new_init_size);
- ni->initialized_size = new_init_size;
- write_unlock_irqrestore(&ni->size_lock, flags);
- goto done;
-do_non_resident_extend:
- /*
- * If the new initialized size @new_init_size exceeds the current file
- * size (vfs inode->i_size), we need to extend the file size to the
- * new initialized size.
- */
- if (new_init_size > old_i_size) {
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- m = ctx->mrec;
- a = ctx->attr;
- BUG_ON(!a->non_resident);
- BUG_ON(old_i_size != (loff_t)
- sle64_to_cpu(a->data.non_resident.data_size));
- a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- /* Update the file size in the vfs inode. */
- i_size_write(vi, new_init_size);
- ntfs_attr_put_search_ctx(ctx);
- ctx = NULL;
- unmap_mft_record(base_ni);
- m = NULL;
- }
- mapping = vi->i_mapping;
- index = old_init_size >> PAGE_SHIFT;
- end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- do {
- /*
- * Read the page. If the page is not present, this will zero
- * the uninitialized regions for us.
- */
- page = read_mapping_page(mapping, index, NULL);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
- goto init_err_out;
- }
- /*
- * Update the initialized size in the ntfs inode. This is
- * enough to make ntfs_writepage() work.
- */
- write_lock_irqsave(&ni->size_lock, flags);
- ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT;
- if (ni->initialized_size > new_init_size)
- ni->initialized_size = new_init_size;
- write_unlock_irqrestore(&ni->size_lock, flags);
- /* Set the page dirty so it gets written out. */
- set_page_dirty(page);
- put_page(page);
- /*
- * Play nice with the vm and the rest of the system. This is
- * very much needed as we can potentially be modifying the
- * initialised size from a very small value to a really huge
- * value, e.g.
- * f = open(somefile, O_TRUNC);
- * truncate(f, 10GiB);
- * seek(f, 10GiB);
- * write(f, 1);
- * And this would mean we would be marking dirty hundreds of
- * thousands of pages or as in the above example more than
- * two and a half million pages!
- *
- * TODO: For sparse pages could optimize this workload by using
- * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This
- * would be set in read_folio for sparse pages and here we would
- * not need to mark dirty any pages which have this bit set.
- * The only caveat is that we have to clear the bit everywhere
- * where we allocate any clusters that lie in the page or that
- * contain the page.
- *
- * TODO: An even greater optimization would be for us to only
- * call read_folio() on pages which are not in sparse regions as
- * determined from the runlist. This would greatly reduce the
- * number of pages we read and make dirty in the case of sparse
- * files.
- */
- balance_dirty_pages_ratelimited(mapping);
- cond_resched();
- } while (++index < end_index);
- read_lock_irqsave(&ni->size_lock, flags);
- BUG_ON(ni->initialized_size != new_init_size);
- read_unlock_irqrestore(&ni->size_lock, flags);
- /* Now bring in sync the initialized_size in the mft record. */
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- goto init_err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto init_err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto init_err_out;
- }
- m = ctx->mrec;
- a = ctx->attr;
- BUG_ON(!a->non_resident);
- a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
-done:
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
- (unsigned long long)new_init_size, i_size_read(vi));
- return 0;
-init_err_out:
- write_lock_irqsave(&ni->size_lock, flags);
- ni->initialized_size = old_init_size;
- write_unlock_irqrestore(&ni->size_lock, flags);
-err_out:
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- ntfs_debug("Failed. Returning error code %i.", err);
- return err;
-}
-
-static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
- struct iov_iter *from)
-{
- loff_t pos;
- s64 end, ll;
- ssize_t err;
- unsigned long flags;
- struct file *file = iocb->ki_filp;
- struct inode *vi = file_inode(file);
- ntfs_inode *ni = NTFS_I(vi);
- ntfs_volume *vol = ni->vol;
-
- ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
- "0x%llx, count 0x%zx.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type),
- (unsigned long long)iocb->ki_pos,
- iov_iter_count(from));
- err = generic_write_checks(iocb, from);
- if (unlikely(err <= 0))
- goto out;
- /*
- * All checks have passed. Before we start doing any writing we want
- * to abort any totally illegal writes.
- */
- BUG_ON(NInoMstProtected(ni));
- BUG_ON(ni->type != AT_DATA);
- /* If file is encrypted, deny access, just like NT4. */
- if (NInoEncrypted(ni)) {
- /* Only $DATA attributes can be encrypted. */
- /*
- * Reminder for later: Encrypted files are _always_
- * non-resident so that the content can always be encrypted.
- */
- ntfs_debug("Denying write access to encrypted file.");
- err = -EACCES;
- goto out;
- }
- if (NInoCompressed(ni)) {
- /* Only unnamed $DATA attribute can be compressed. */
- BUG_ON(ni->name_len);
- /*
- * Reminder for later: If resident, the data is not actually
- * compressed. Only on the switch to non-resident does
- * compression kick in. This is in contrast to encrypted files
- * (see above).
- */
- ntfs_error(vi->i_sb, "Writing to compressed files is not "
- "implemented yet. Sorry.");
- err = -EOPNOTSUPP;
- goto out;
- }
- err = file_remove_privs(file);
- if (unlikely(err))
- goto out;
- /*
- * Our ->update_time method always succeeds thus file_update_time()
- * cannot fail either so there is no need to check the return code.
- */
- file_update_time(file);
- pos = iocb->ki_pos;
- /* The first byte after the last cluster being written to. */
- end = (pos + iov_iter_count(from) + vol->cluster_size_mask) &
- ~(u64)vol->cluster_size_mask;
- /*
- * If the write goes beyond the allocated size, extend the allocation
- * to cover the whole of the write, rounded up to the nearest cluster.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (end > ll) {
- /*
- * Extend the allocation without changing the data size.
- *
- * Note we ensure the allocation is big enough to at least
- * write some data but we do not require the allocation to be
- * complete, i.e. it may be partial.
- */
- ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
- if (likely(ll >= 0)) {
- BUG_ON(pos >= ll);
- /* If the extension was partial truncate the write. */
- if (end > ll) {
- ntfs_debug("Truncating write to inode 0x%lx, "
- "attribute type 0x%x, because "
- "the allocation was only "
- "partially extended.",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type));
- iov_iter_truncate(from, ll - pos);
- }
- } else {
- err = ll;
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- /* Perform a partial write if possible or fail. */
- if (pos < ll) {
- ntfs_debug("Truncating write to inode 0x%lx "
- "attribute type 0x%x, because "
- "extending the allocation "
- "failed (error %d).",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type),
- (int)-err);
- iov_iter_truncate(from, ll - pos);
- } else {
- if (err != -ENOSPC)
- ntfs_error(vi->i_sb, "Cannot perform "
- "write to inode "
- "0x%lx, attribute "
- "type 0x%x, because "
- "extending the "
- "allocation failed "
- "(error %ld).",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type),
- (long)-err);
- else
- ntfs_debug("Cannot perform write to "
- "inode 0x%lx, "
- "attribute type 0x%x, "
- "because there is not "
- "space left.",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type));
- goto out;
- }
- }
- }
- /*
- * If the write starts beyond the initialized size, extend it up to the
- * beginning of the write and initialize all non-sparse space between
- * the old initialized size and the new one. This automatically also
- * increments the vfs inode->i_size to keep it above or equal to the
- * initialized_size.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (pos > ll) {
- /*
- * Wait for ongoing direct i/o to complete before proceeding.
- * New direct i/o cannot start as we hold i_mutex.
- */
- inode_dio_wait(vi);
- err = ntfs_attr_extend_initialized(ni, pos);
- if (unlikely(err < 0))
- ntfs_error(vi->i_sb, "Cannot perform write to inode "
- "0x%lx, attribute type 0x%x, because "
- "extending the initialized size "
- "failed (error %d).", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type),
- (int)-err);
- }
-out:
- return err;
-}
-
-/**
- * __ntfs_grab_cache_pages - obtain a number of locked pages
- * @mapping: address space mapping from which to obtain page cache pages
- * @index: starting index in @mapping at which to begin obtaining pages
- * @nr_pages: number of page cache pages to obtain
- * @pages: array of pages in which to return the obtained page cache pages
- * @cached_page: allocated but as yet unused page
- *
- * Obtain @nr_pages locked page cache pages from the mapping @mapping and
- * starting at index @index.
- *
- * If a page is newly created, add it to lru list
- *
- * Note, the page locks are obtained in ascending page index order.
- */
-static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
- pgoff_t index, const unsigned nr_pages, struct page **pages,
- struct page **cached_page)
-{
- int err, nr;
-
- BUG_ON(!nr_pages);
- err = nr = 0;
- do {
- pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
- FGP_ACCESSED);
- if (!pages[nr]) {
- if (!*cached_page) {
- *cached_page = page_cache_alloc(mapping);
- if (unlikely(!*cached_page)) {
- err = -ENOMEM;
- goto err_out;
- }
- }
- err = add_to_page_cache_lru(*cached_page, mapping,
- index,
- mapping_gfp_constraint(mapping, GFP_KERNEL));
- if (unlikely(err)) {
- if (err == -EEXIST)
- continue;
- goto err_out;
- }
- pages[nr] = *cached_page;
- *cached_page = NULL;
- }
- index++;
- nr++;
- } while (nr < nr_pages);
-out:
- return err;
-err_out:
- while (nr > 0) {
- unlock_page(pages[--nr]);
- put_page(pages[nr]);
- }
- goto out;
-}
-
-static inline void ntfs_submit_bh_for_read(struct buffer_head *bh)
-{
- lock_buffer(bh);
- get_bh(bh);
- bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, bh);
-}
-
-/**
- * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
- * @pages: array of destination pages
- * @nr_pages: number of pages in @pages
- * @pos: byte position in file at which the write begins
- * @bytes: number of bytes to be written
- *
- * This is called for non-resident attributes from ntfs_file_buffered_write()
- * with i_mutex held on the inode (@pages[0]->mapping->host). There are
- * @nr_pages pages in @pages which are locked but not kmap()ped. The source
- * data has not yet been copied into the @pages.
- *
- * Need to fill any holes with actual clusters, allocate buffers if necessary,
- * ensure all the buffers are mapped, and bring uptodate any buffers that are
- * only partially being written to.
- *
- * If @nr_pages is greater than one, we are guaranteed that the cluster size is
- * greater than PAGE_SIZE, that all pages in @pages are entirely inside
- * the same cluster and that they are the entirety of that cluster, and that
- * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
- *
- * i_size is not to be modified yet.
- *
- * Return 0 on success or -errno on error.
- */
-static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
- unsigned nr_pages, s64 pos, size_t bytes)
-{
- VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
- LCN lcn;
- s64 bh_pos, vcn_len, end, initialized_size;
- sector_t lcn_block;
- struct folio *folio;
- struct inode *vi;
- ntfs_inode *ni, *base_ni = NULL;
- ntfs_volume *vol;
- runlist_element *rl, *rl2;
- struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
- ntfs_attr_search_ctx *ctx = NULL;
- MFT_RECORD *m = NULL;
- ATTR_RECORD *a = NULL;
- unsigned long flags;
- u32 attr_rec_len = 0;
- unsigned blocksize, u;
- int err, mp_size;
- bool rl_write_locked, was_hole, is_retry;
- unsigned char blocksize_bits;
- struct {
- u8 runlist_merged:1;
- u8 mft_attr_mapped:1;
- u8 mp_rebuilt:1;
- u8 attr_switched:1;
- } status = { 0, 0, 0, 0 };
-
- BUG_ON(!nr_pages);
- BUG_ON(!pages);
- BUG_ON(!*pages);
- vi = pages[0]->mapping->host;
- ni = NTFS_I(vi);
- vol = ni->vol;
- ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
- "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
- vi->i_ino, ni->type, pages[0]->index, nr_pages,
- (long long)pos, bytes);
- blocksize = vol->sb->s_blocksize;
- blocksize_bits = vol->sb->s_blocksize_bits;
- rl_write_locked = false;
- rl = NULL;
- err = 0;
- vcn = lcn = -1;
- vcn_len = 0;
- lcn_block = -1;
- was_hole = false;
- cpos = pos >> vol->cluster_size_bits;
- end = pos + bytes;
- cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
- /*
- * Loop over each buffer in each folio. Use goto to
- * reduce indentation.
- */
- u = 0;
-do_next_folio:
- folio = page_folio(pages[u]);
- bh_pos = folio_pos(folio);
- head = folio_buffers(folio);
- if (!head)
- /*
- * create_empty_buffers() will create uptodate/dirty
- * buffers if the folio is uptodate/dirty.
- */
- head = create_empty_buffers(folio, blocksize, 0);
- bh = head;
- do {
- VCN cdelta;
- s64 bh_end;
- unsigned bh_cofs;
-
- /* Clear buffer_new on all buffers to reinitialise state. */
- if (buffer_new(bh))
- clear_buffer_new(bh);
- bh_end = bh_pos + blocksize;
- bh_cpos = bh_pos >> vol->cluster_size_bits;
- bh_cofs = bh_pos & vol->cluster_size_mask;
- if (buffer_mapped(bh)) {
- /*
- * The buffer is already mapped. If it is uptodate,
- * ignore it.
- */
- if (buffer_uptodate(bh))
- continue;
- /*
- * The buffer is not uptodate. If the folio is uptodate
- * set the buffer uptodate and otherwise ignore it.
- */
- if (folio_test_uptodate(folio)) {
- set_buffer_uptodate(bh);
- continue;
- }
- /*
- * Neither the folio nor the buffer are uptodate. If
- * the buffer is only partially being written to, we
- * need to read it in before the write, i.e. now.
- */
- if ((bh_pos < pos && bh_end > pos) ||
- (bh_pos < end && bh_end > end)) {
- /*
- * If the buffer is fully or partially within
- * the initialized size, do an actual read.
- * Otherwise, simply zero the buffer.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- initialized_size = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (bh_pos < initialized_size) {
- ntfs_submit_bh_for_read(bh);
- *wait_bh++ = bh;
- } else {
- folio_zero_range(folio, bh_offset(bh),
- blocksize);
- set_buffer_uptodate(bh);
- }
- }
- continue;
- }
- /* Unmapped buffer. Need to map it. */
- bh->b_bdev = vol->sb->s_bdev;
- /*
- * If the current buffer is in the same clusters as the map
- * cache, there is no need to check the runlist again. The
- * map cache is made up of @vcn, which is the first cached file
- * cluster, @vcn_len which is the number of cached file
- * clusters, @lcn is the device cluster corresponding to @vcn,
- * and @lcn_block is the block number corresponding to @lcn.
- */
- cdelta = bh_cpos - vcn;
- if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
-map_buffer_cached:
- BUG_ON(lcn < 0);
- bh->b_blocknr = lcn_block +
- (cdelta << (vol->cluster_size_bits -
- blocksize_bits)) +
- (bh_cofs >> blocksize_bits);
- set_buffer_mapped(bh);
- /*
- * If the folio is uptodate so is the buffer. If the
- * buffer is fully outside the write, we ignore it if
- * it was already allocated and we mark it dirty so it
- * gets written out if we allocated it. On the other
- * hand, if we allocated the buffer but we are not
- * marking it dirty we set buffer_new so we can do
- * error recovery.
- */
- if (folio_test_uptodate(folio)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
- if (unlikely(was_hole)) {
- /* We allocated the buffer. */
- clean_bdev_bh_alias(bh);
- if (bh_end <= pos || bh_pos >= end)
- mark_buffer_dirty(bh);
- else
- set_buffer_new(bh);
- }
- continue;
- }
- /* Page is _not_ uptodate. */
- if (likely(!was_hole)) {
- /*
- * Buffer was already allocated. If it is not
- * uptodate and is only partially being written
- * to, we need to read it in before the write,
- * i.e. now.
- */
- if (!buffer_uptodate(bh) && bh_pos < end &&
- bh_end > pos &&
- (bh_pos < pos ||
- bh_end > end)) {
- /*
- * If the buffer is fully or partially
- * within the initialized size, do an
- * actual read. Otherwise, simply zero
- * the buffer.
- */
- read_lock_irqsave(&ni->size_lock,
- flags);
- initialized_size = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock,
- flags);
- if (bh_pos < initialized_size) {
- ntfs_submit_bh_for_read(bh);
- *wait_bh++ = bh;
- } else {
- folio_zero_range(folio,
- bh_offset(bh),
- blocksize);
- set_buffer_uptodate(bh);
- }
- }
- continue;
- }
- /* We allocated the buffer. */
- clean_bdev_bh_alias(bh);
- /*
- * If the buffer is fully outside the write, zero it,
- * set it uptodate, and mark it dirty so it gets
- * written out. If it is partially being written to,
- * zero region surrounding the write but leave it to
- * commit write to do anything else. Finally, if the
- * buffer is fully being overwritten, do nothing.
- */
- if (bh_end <= pos || bh_pos >= end) {
- if (!buffer_uptodate(bh)) {
- folio_zero_range(folio, bh_offset(bh),
- blocksize);
- set_buffer_uptodate(bh);
- }
- mark_buffer_dirty(bh);
- continue;
- }
- set_buffer_new(bh);
- if (!buffer_uptodate(bh) &&
- (bh_pos < pos || bh_end > end)) {
- u8 *kaddr;
- unsigned pofs;
-
- kaddr = kmap_local_folio(folio, 0);
- if (bh_pos < pos) {
- pofs = bh_pos & ~PAGE_MASK;
- memset(kaddr + pofs, 0, pos - bh_pos);
- }
- if (bh_end > end) {
- pofs = end & ~PAGE_MASK;
- memset(kaddr + pofs, 0, bh_end - end);
- }
- kunmap_local(kaddr);
- flush_dcache_folio(folio);
- }
- continue;
- }
- /*
- * Slow path: this is the first buffer in the cluster. If it
- * is outside allocated size and is not uptodate, zero it and
- * set it uptodate.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- initialized_size = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (bh_pos > initialized_size) {
- if (folio_test_uptodate(folio)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
- } else if (!buffer_uptodate(bh)) {
- folio_zero_range(folio, bh_offset(bh),
- blocksize);
- set_buffer_uptodate(bh);
- }
- continue;
- }
- is_retry = false;
- if (!rl) {
- down_read(&ni->runlist.lock);
-retry_remap:
- rl = ni->runlist.rl;
- }
- if (likely(rl != NULL)) {
- /* Seek to element containing target cluster. */
- while (rl->length && rl[1].vcn <= bh_cpos)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
- if (likely(lcn >= 0)) {
- /*
- * Successful remap, setup the map cache and
- * use that to deal with the buffer.
- */
- was_hole = false;
- vcn = bh_cpos;
- vcn_len = rl[1].vcn - vcn;
- lcn_block = lcn << (vol->cluster_size_bits -
- blocksize_bits);
- cdelta = 0;
- /*
- * If the number of remaining clusters touched
- * by the write is smaller or equal to the
- * number of cached clusters, unlock the
- * runlist as the map cache will be used from
- * now on.
- */
- if (likely(vcn + vcn_len >= cend)) {
- if (rl_write_locked) {
- up_write(&ni->runlist.lock);
- rl_write_locked = false;
- } else
- up_read(&ni->runlist.lock);
- rl = NULL;
- }
- goto map_buffer_cached;
- }
- } else
- lcn = LCN_RL_NOT_MAPPED;
- /*
- * If it is not a hole and not out of bounds, the runlist is
- * probably unmapped so try to map it now.
- */
- if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
- if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
- /* Attempt to map runlist. */
- if (!rl_write_locked) {
- /*
- * We need the runlist locked for
- * writing, so if it is locked for
- * reading relock it now and retry in
- * case it changed whilst we dropped
- * the lock.
- */
- up_read(&ni->runlist.lock);
- down_write(&ni->runlist.lock);
- rl_write_locked = true;
- goto retry_remap;
- }
- err = ntfs_map_runlist_nolock(ni, bh_cpos,
- NULL);
- if (likely(!err)) {
- is_retry = true;
- goto retry_remap;
- }
- /*
- * If @vcn is out of bounds, pretend @lcn is
- * LCN_ENOENT. As long as the buffer is out
- * of bounds this will work fine.
- */
- if (err == -ENOENT) {
- lcn = LCN_ENOENT;
- err = 0;
- goto rl_not_mapped_enoent;
- }
- } else
- err = -EIO;
- /* Failed to map the buffer, even after retrying. */
- bh->b_blocknr = -1;
- ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
- "attribute type 0x%x, vcn 0x%llx, "
- "vcn offset 0x%x, because its "
- "location on disk could not be "
- "determined%s (error code %i).",
- ni->mft_no, ni->type,
- (unsigned long long)bh_cpos,
- (unsigned)bh_pos &
- vol->cluster_size_mask,
- is_retry ? " even after retrying" : "",
- err);
- break;
- }
-rl_not_mapped_enoent:
- /*
- * The buffer is in a hole or out of bounds. We need to fill
- * the hole, unless the buffer is in a cluster which is not
- * touched by the write, in which case we just leave the buffer
- * unmapped. This can only happen when the cluster size is
- * less than the page cache size.
- */
- if (unlikely(vol->cluster_size < PAGE_SIZE)) {
- bh_cend = (bh_end + vol->cluster_size - 1) >>
- vol->cluster_size_bits;
- if ((bh_cend <= cpos || bh_cpos >= cend)) {
- bh->b_blocknr = -1;
- /*
- * If the buffer is uptodate we skip it. If it
- * is not but the folio is uptodate, we can set
- * the buffer uptodate. If the folio is not
- * uptodate, we can clear the buffer and set it
- * uptodate. Whether this is worthwhile is
- * debatable and this could be removed.
- */
- if (folio_test_uptodate(folio)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
- } else if (!buffer_uptodate(bh)) {
- folio_zero_range(folio, bh_offset(bh),
- blocksize);
- set_buffer_uptodate(bh);
- }
- continue;
- }
- }
- /*
- * Out of bounds buffer is invalid if it was not really out of
- * bounds.
- */
- BUG_ON(lcn != LCN_HOLE);
- /*
- * We need the runlist locked for writing, so if it is locked
- * for reading relock it now and retry in case it changed
- * whilst we dropped the lock.
- */
- BUG_ON(!rl);
- if (!rl_write_locked) {
- up_read(&ni->runlist.lock);
- down_write(&ni->runlist.lock);
- rl_write_locked = true;
- goto retry_remap;
- }
- /* Find the previous last allocated cluster. */
- BUG_ON(rl->lcn != LCN_HOLE);
- lcn = -1;
- rl2 = rl;
- while (--rl2 >= ni->runlist.rl) {
- if (rl2->lcn >= 0) {
- lcn = rl2->lcn + rl2->length;
- break;
- }
- }
- rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
- false);
- if (IS_ERR(rl2)) {
- err = PTR_ERR(rl2);
- ntfs_debug("Failed to allocate cluster, error code %i.",
- err);
- break;
- }
- lcn = rl2->lcn;
- rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
- if (IS_ERR(rl)) {
- err = PTR_ERR(rl);
- if (err != -ENOMEM)
- err = -EIO;
- if (ntfs_cluster_free_from_rl(vol, rl2)) {
- ntfs_error(vol->sb, "Failed to release "
- "allocated cluster in error "
- "code path. Run chkdsk to "
- "recover the lost cluster.");
- NVolSetErrors(vol);
- }
- ntfs_free(rl2);
- break;
- }
- ni->runlist.rl = rl;
- status.runlist_merged = 1;
- ntfs_debug("Allocated cluster, lcn 0x%llx.",
- (unsigned long long)lcn);
- /* Map and lock the mft record and get the attribute record. */
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- break;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- unmap_mft_record(base_ni);
- break;
- }
- status.mft_attr_mapped = 1;
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- break;
- }
- m = ctx->mrec;
- a = ctx->attr;
- /*
- * Find the runlist element with which the attribute extent
- * starts. Note, we cannot use the _attr_ version because we
- * have mapped the mft record. That is ok because we know the
- * runlist fragment must be mapped already to have ever gotten
- * here, so we can just use the _rl_ version.
- */
- vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
- rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
- BUG_ON(!rl2);
- BUG_ON(!rl2->length);
- BUG_ON(rl2->lcn < LCN_HOLE);
- highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
- /*
- * If @highest_vcn is zero, calculate the real highest_vcn
- * (which can really be zero).
- */
- if (!highest_vcn)
- highest_vcn = (sle64_to_cpu(
- a->data.non_resident.allocated_size) >>
- vol->cluster_size_bits) - 1;
- /*
- * Determine the size of the mapping pairs array for the new
- * extent, i.e. the old extent with the hole filled.
- */
- mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
- highest_vcn);
- if (unlikely(mp_size <= 0)) {
- if (!(err = mp_size))
- err = -EIO;
- ntfs_debug("Failed to get size for mapping pairs "
- "array, error code %i.", err);
- break;
- }
- /*
- * Resize the attribute record to fit the new mapping pairs
- * array.
- */
- attr_rec_len = le32_to_cpu(a->length);
- err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset));
- if (unlikely(err)) {
- BUG_ON(err != -ENOSPC);
- // TODO: Deal with this by using the current attribute
- // and fill it with as much of the mapping pairs
- // array as possible. Then loop over each attribute
- // extent rewriting the mapping pairs arrays as we go
- // along and if when we reach the end we have not
- // enough space, try to resize the last attribute
- // extent and if even that fails, add a new attribute
- // extent.
- // We could also try to resize at each step in the hope
- // that we will not need to rewrite every single extent.
- // Note, we may need to decompress some extents to fill
- // the runlist as we are walking the extents...
- ntfs_error(vol->sb, "Not enough space in the mft "
- "record for the extended attribute "
- "record. This case is not "
- "implemented yet.");
- err = -EOPNOTSUPP;
- break ;
- }
- status.mp_rebuilt = 1;
- /*
- * Generate the mapping pairs array directly into the attribute
- * record.
- */
- err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset),
- mp_size, rl2, vcn, highest_vcn, NULL);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
- "attribute type 0x%x, because building "
- "the mapping pairs failed with error "
- "code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- err = -EIO;
- break;
- }
- /* Update the highest_vcn but only if it was not set. */
- if (unlikely(!a->data.non_resident.highest_vcn))
- a->data.non_resident.highest_vcn =
- cpu_to_sle64(highest_vcn);
- /*
- * If the attribute is sparse/compressed, update the compressed
- * size in the ntfs_inode structure and the attribute record.
- */
- if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
- /*
- * If we are not in the first attribute extent, switch
- * to it, but first ensure the changes will make it to
- * disk later.
- */
- if (a->data.non_resident.lowest_vcn) {
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_reinit_search_ctx(ctx);
- err = ntfs_attr_lookup(ni->type, ni->name,
- ni->name_len, CASE_SENSITIVE,
- 0, NULL, 0, ctx);
- if (unlikely(err)) {
- status.attr_switched = 1;
- break;
- }
- /* @m is not used any more so do not set it. */
- a = ctx->attr;
- }
- write_lock_irqsave(&ni->size_lock, flags);
- ni->itype.compressed.size += vol->cluster_size;
- a->data.non_resident.compressed_size =
- cpu_to_sle64(ni->itype.compressed.size);
- write_unlock_irqrestore(&ni->size_lock, flags);
- }
- /* Ensure the changes make it to disk. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- /* Successfully filled the hole. */
- status.runlist_merged = 0;
- status.mft_attr_mapped = 0;
- status.mp_rebuilt = 0;
- /* Setup the map cache and use that to deal with the buffer. */
- was_hole = true;
- vcn = bh_cpos;
- vcn_len = 1;
- lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
- cdelta = 0;
- /*
- * If the number of remaining clusters in the @pages is smaller
- * or equal to the number of cached clusters, unlock the
- * runlist as the map cache will be used from now on.
- */
- if (likely(vcn + vcn_len >= cend)) {
- up_write(&ni->runlist.lock);
- rl_write_locked = false;
- rl = NULL;
- }
- goto map_buffer_cached;
- } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
- /* If there are no errors, do the next page. */
- if (likely(!err && ++u < nr_pages))
- goto do_next_folio;
- /* If there are no errors, release the runlist lock if we took it. */
- if (likely(!err)) {
- if (unlikely(rl_write_locked)) {
- up_write(&ni->runlist.lock);
- rl_write_locked = false;
- } else if (unlikely(rl))
- up_read(&ni->runlist.lock);
- rl = NULL;
- }
- /* If we issued read requests, let them complete. */
- read_lock_irqsave(&ni->size_lock, flags);
- initialized_size = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- while (wait_bh > wait) {
- bh = *--wait_bh;
- wait_on_buffer(bh);
- if (likely(buffer_uptodate(bh))) {
- folio = bh->b_folio;
- bh_pos = folio_pos(folio) + bh_offset(bh);
- /*
- * If the buffer overflows the initialized size, need
- * to zero the overflowing region.
- */
- if (unlikely(bh_pos + blocksize > initialized_size)) {
- int ofs = 0;
-
- if (likely(bh_pos < initialized_size))
- ofs = initialized_size - bh_pos;
- folio_zero_segment(folio, bh_offset(bh) + ofs,
- blocksize);
- }
- } else /* if (unlikely(!buffer_uptodate(bh))) */
- err = -EIO;
- }
- if (likely(!err)) {
- /* Clear buffer_new on all buffers. */
- u = 0;
- do {
- bh = head = page_buffers(pages[u]);
- do {
- if (buffer_new(bh))
- clear_buffer_new(bh);
- } while ((bh = bh->b_this_page) != head);
- } while (++u < nr_pages);
- ntfs_debug("Done.");
- return err;
- }
- if (status.attr_switched) {
- /* Get back to the attribute extent we modified. */
- ntfs_attr_reinit_search_ctx(ctx);
- if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
- ntfs_error(vol->sb, "Failed to find required "
- "attribute extent of attribute in "
- "error code path. Run chkdsk to "
- "recover.");
- write_lock_irqsave(&ni->size_lock, flags);
- ni->itype.compressed.size += vol->cluster_size;
- write_unlock_irqrestore(&ni->size_lock, flags);
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- /*
- * The only thing that is now wrong is the compressed
- * size of the base attribute extent which chkdsk
- * should be able to fix.
- */
- NVolSetErrors(vol);
- } else {
- m = ctx->mrec;
- a = ctx->attr;
- status.attr_switched = 0;
- }
- }
- /*
- * If the runlist has been modified, need to restore it by punching a
- * hole into it and we then need to deallocate the on-disk cluster as
- * well. Note, we only modify the runlist if we are able to generate a
- * new mapping pairs array, i.e. only when the mapped attribute extent
- * is not switched.
- */
- if (status.runlist_merged && !status.attr_switched) {
- BUG_ON(!rl_write_locked);
- /* Make the file cluster we allocated sparse in the runlist. */
- if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
- ntfs_error(vol->sb, "Failed to punch hole into "
- "attribute runlist in error code "
- "path. Run chkdsk to recover the "
- "lost cluster.");
- NVolSetErrors(vol);
- } else /* if (success) */ {
- status.runlist_merged = 0;
- /*
- * Deallocate the on-disk cluster we allocated but only
- * if we succeeded in punching its vcn out of the
- * runlist.
- */
- down_write(&vol->lcnbmp_lock);
- if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
- ntfs_error(vol->sb, "Failed to release "
- "allocated cluster in error "
- "code path. Run chkdsk to "
- "recover the lost cluster.");
- NVolSetErrors(vol);
- }
- up_write(&vol->lcnbmp_lock);
- }
- }
- /*
- * Resize the attribute record to its old size and rebuild the mapping
- * pairs array. Note, we only can do this if the runlist has been
- * restored to its old state which also implies that the mapped
- * attribute extent is not switched.
- */
- if (status.mp_rebuilt && !status.runlist_merged) {
- if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
- ntfs_error(vol->sb, "Failed to restore attribute "
- "record in error code path. Run "
- "chkdsk to recover.");
- NVolSetErrors(vol);
- } else /* if (success) */ {
- if (ntfs_mapping_pairs_build(vol, (u8*)a +
- le16_to_cpu(a->data.non_resident.
- mapping_pairs_offset), attr_rec_len -
- le16_to_cpu(a->data.non_resident.
- mapping_pairs_offset), ni->runlist.rl,
- vcn, highest_vcn, NULL)) {
- ntfs_error(vol->sb, "Failed to restore "
- "mapping pairs array in error "
- "code path. Run chkdsk to "
- "recover.");
- NVolSetErrors(vol);
- }
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- }
- }
- /* Release the mft record and the attribute. */
- if (status.mft_attr_mapped) {
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- }
- /* Release the runlist lock. */
- if (rl_write_locked)
- up_write(&ni->runlist.lock);
- else if (rl)
- up_read(&ni->runlist.lock);
- /*
- * Zero out any newly allocated blocks to avoid exposing stale data.
- * If BH_New is set, we know that the block was newly allocated above
- * and that it has not been fully zeroed and marked dirty yet.
- */
- nr_pages = u;
- u = 0;
- end = bh_cpos << vol->cluster_size_bits;
- do {
- folio = page_folio(pages[u]);
- bh = head = folio_buffers(folio);
- do {
- if (u == nr_pages &&
- folio_pos(folio) + bh_offset(bh) >= end)
- break;
- if (!buffer_new(bh))
- continue;
- clear_buffer_new(bh);
- if (!buffer_uptodate(bh)) {
- if (folio_test_uptodate(folio))
- set_buffer_uptodate(bh);
- else {
- folio_zero_range(folio, bh_offset(bh),
- blocksize);
- set_buffer_uptodate(bh);
- }
- }
- mark_buffer_dirty(bh);
- } while ((bh = bh->b_this_page) != head);
- } while (++u <= nr_pages);
- ntfs_error(vol->sb, "Failed. Returning error code %i.", err);
- return err;
-}
-
-static inline void ntfs_flush_dcache_pages(struct page **pages,
- unsigned nr_pages)
-{
- BUG_ON(!nr_pages);
- /*
- * Warning: Do not do the decrement at the same time as the call to
- * flush_dcache_page() because it is a NULL macro on i386 and hence the
- * decrement never happens so the loop never terminates.
- */
- do {
- --nr_pages;
- flush_dcache_page(pages[nr_pages]);
- } while (nr_pages > 0);
-}
-
-/**
- * ntfs_commit_pages_after_non_resident_write - commit the received data
- * @pages: array of destination pages
- * @nr_pages: number of pages in @pages
- * @pos: byte position in file at which the write begins
- * @bytes: number of bytes to be written
- *
- * See description of ntfs_commit_pages_after_write(), below.
- */
-static inline int ntfs_commit_pages_after_non_resident_write(
- struct page **pages, const unsigned nr_pages,
- s64 pos, size_t bytes)
-{
- s64 end, initialized_size;
- struct inode *vi;
- ntfs_inode *ni, *base_ni;
- struct buffer_head *bh, *head;
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- unsigned long flags;
- unsigned blocksize, u;
- int err;
-
- vi = pages[0]->mapping->host;
- ni = NTFS_I(vi);
- blocksize = vi->i_sb->s_blocksize;
- end = pos + bytes;
- u = 0;
- do {
- s64 bh_pos;
- struct page *page;
- bool partial;
-
- page = pages[u];
- bh_pos = (s64)page->index << PAGE_SHIFT;
- bh = head = page_buffers(page);
- partial = false;
- do {
- s64 bh_end;
-
- bh_end = bh_pos + blocksize;
- if (bh_end <= pos || bh_pos >= end) {
- if (!buffer_uptodate(bh))
- partial = true;
- } else {
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- }
- } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
- /*
- * If all buffers are now uptodate but the page is not, set the
- * page uptodate.
- */
- if (!partial && !PageUptodate(page))
- SetPageUptodate(page);
- } while (++u < nr_pages);
- /*
- * Finally, if we do not need to update initialized_size or i_size we
- * are finished.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- initialized_size = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (end <= initialized_size) {
- ntfs_debug("Done.");
- return 0;
- }
- /*
- * Update initialized_size/i_size as appropriate, both in the inode and
- * the mft record.
- */
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- /* Map, pin, and lock the mft record. */
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- ctx = NULL;
- goto err_out;
- }
- BUG_ON(!NInoNonResident(ni));
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- a = ctx->attr;
- BUG_ON(!a->non_resident);
- write_lock_irqsave(&ni->size_lock, flags);
- BUG_ON(end > ni->allocated_size);
- ni->initialized_size = end;
- a->data.non_resident.initialized_size = cpu_to_sle64(end);
- if (end > i_size_read(vi)) {
- i_size_write(vi, end);
- a->data.non_resident.data_size =
- a->data.non_resident.initialized_size;
- }
- write_unlock_irqrestore(&ni->size_lock, flags);
- /* Mark the mft record dirty, so it gets written back. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- ntfs_debug("Done.");
- return 0;
-err_out:
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
- "code %i).", err);
- if (err != -ENOMEM)
- NVolSetErrors(ni->vol);
- return err;
-}
-
-/**
- * ntfs_commit_pages_after_write - commit the received data
- * @pages: array of destination pages
- * @nr_pages: number of pages in @pages
- * @pos: byte position in file at which the write begins
- * @bytes: number of bytes to be written
- *
- * This is called from ntfs_file_buffered_write() with i_mutex held on the inode
- * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are
- * locked but not kmap()ped. The source data has already been copied into the
- * @page. ntfs_prepare_pages_for_non_resident_write() has been called before
- * the data was copied (for non-resident attributes only) and it returned
- * success.
- *
- * Need to set uptodate and mark dirty all buffers within the boundary of the
- * write. If all buffers in a page are uptodate we set the page uptodate, too.
- *
- * Setting the buffers dirty ensures that they get written out later when
- * ntfs_writepage() is invoked by the VM.
- *
- * Finally, we need to update i_size and initialized_size as appropriate both
- * in the inode and the mft record.
- *
- * This is modelled after fs/buffer.c::generic_commit_write(), which marks
- * buffers uptodate and dirty, sets the page uptodate if all buffers in the
- * page are uptodate, and updates i_size if the end of io is beyond i_size. In
- * that case, it also marks the inode dirty.
- *
- * If things have gone as outlined in
- * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
- * content modifications here for non-resident attributes. For resident
- * attributes we need to do the uptodate bringing here which we combine with
- * the copying into the mft record which means we save one atomic kmap.
- *
- * Return 0 on success or -errno on error.
- */
-static int ntfs_commit_pages_after_write(struct page **pages,
- const unsigned nr_pages, s64 pos, size_t bytes)
-{
- s64 end, initialized_size;
- loff_t i_size;
- struct inode *vi;
- ntfs_inode *ni, *base_ni;
- struct page *page;
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- char *kattr, *kaddr;
- unsigned long flags;
- u32 attr_len;
- int err;
-
- BUG_ON(!nr_pages);
- BUG_ON(!pages);
- page = pages[0];
- BUG_ON(!page);
- vi = page->mapping->host;
- ni = NTFS_I(vi);
- ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
- "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
- vi->i_ino, ni->type, page->index, nr_pages,
- (long long)pos, bytes);
- if (NInoNonResident(ni))
- return ntfs_commit_pages_after_non_resident_write(pages,
- nr_pages, pos, bytes);
- BUG_ON(nr_pages > 1);
- /*
- * Attribute is resident, implying it is not compressed, encrypted, or
- * sparse.
- */
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- BUG_ON(NInoNonResident(ni));
- /* Map, pin, and lock the mft record. */
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- ctx = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- a = ctx->attr;
- BUG_ON(a->non_resident);
- /* The total length of the attribute value. */
- attr_len = le32_to_cpu(a->data.resident.value_length);
- i_size = i_size_read(vi);
- BUG_ON(attr_len != i_size);
- BUG_ON(pos > attr_len);
- end = pos + bytes;
- BUG_ON(end > le32_to_cpu(a->length) -
- le16_to_cpu(a->data.resident.value_offset));
- kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
- kaddr = kmap_atomic(page);
- /* Copy the received data from the page to the mft record. */
- memcpy(kattr + pos, kaddr + pos, bytes);
- /* Update the attribute length if necessary. */
- if (end > attr_len) {
- attr_len = end;
- a->data.resident.value_length = cpu_to_le32(attr_len);
- }
- /*
- * If the page is not uptodate, bring the out of bounds area(s)
- * uptodate by copying data from the mft record to the page.
- */
- if (!PageUptodate(page)) {
- if (pos > 0)
- memcpy(kaddr, kattr, pos);
- if (end < attr_len)
- memcpy(kaddr + end, kattr + end, attr_len - end);
- /* Zero the region outside the end of the attribute value. */
- memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len);
- flush_dcache_page(page);
- SetPageUptodate(page);
- }
- kunmap_atomic(kaddr);
- /* Update initialized_size/i_size if necessary. */
- read_lock_irqsave(&ni->size_lock, flags);
- initialized_size = ni->initialized_size;
- BUG_ON(end > ni->allocated_size);
- read_unlock_irqrestore(&ni->size_lock, flags);
- BUG_ON(initialized_size != i_size);
- if (end > initialized_size) {
- write_lock_irqsave(&ni->size_lock, flags);
- ni->initialized_size = end;
- i_size_write(vi, end);
- write_unlock_irqrestore(&ni->size_lock, flags);
- }
- /* Mark the mft record dirty, so it gets written back. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- ntfs_debug("Done.");
- return 0;
-err_out:
- if (err == -ENOMEM) {
- ntfs_warning(vi->i_sb, "Error allocating memory required to "
- "commit the write.");
- if (PageUptodate(page)) {
- ntfs_warning(vi->i_sb, "Page is uptodate, setting "
- "dirty so the write will be retried "
- "later on by the VM.");
- /*
- * Put the page on mapping->dirty_pages, but leave its
- * buffers' dirty state as-is.
- */
- __set_page_dirty_nobuffers(page);
- err = 0;
- } else
- ntfs_error(vi->i_sb, "Page is not uptodate. Written "
- "data has been lost.");
- } else {
- ntfs_error(vi->i_sb, "Resident attribute commit write failed "
- "with error %i.", err);
- NVolSetErrors(ni->vol);
- }
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- return err;
-}
-
-/*
- * Copy as much as we can into the pages and return the number of bytes which
- * were successfully copied. If a fault is encountered then clear the pages
- * out to (ofs + bytes) and return the number of bytes which were copied.
- */
-static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
- unsigned ofs, struct iov_iter *i, size_t bytes)
-{
- struct page **last_page = pages + nr_pages;
- size_t total = 0;
- unsigned len, copied;
-
- do {
- len = PAGE_SIZE - ofs;
- if (len > bytes)
- len = bytes;
- copied = copy_page_from_iter_atomic(*pages, ofs, len, i);
- total += copied;
- bytes -= copied;
- if (!bytes)
- break;
- if (copied < len)
- goto err;
- ofs = 0;
- } while (++pages < last_page);
-out:
- return total;
-err:
- /* Zero the rest of the target like __copy_from_user(). */
- len = PAGE_SIZE - copied;
- do {
- if (len > bytes)
- len = bytes;
- zero_user(*pages, copied, len);
- bytes -= len;
- copied = 0;
- len = PAGE_SIZE;
- } while (++pages < last_page);
- goto out;
-}
-
-/**
- * ntfs_perform_write - perform buffered write to a file
- * @file: file to write to
- * @i: iov_iter with data to write
- * @pos: byte offset in file at which to begin writing to
- */
-static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
- loff_t pos)
-{
- struct address_space *mapping = file->f_mapping;
- struct inode *vi = mapping->host;
- ntfs_inode *ni = NTFS_I(vi);
- ntfs_volume *vol = ni->vol;
- struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
- struct page *cached_page = NULL;
- VCN last_vcn;
- LCN lcn;
- size_t bytes;
- ssize_t status, written = 0;
- unsigned nr_pages;
-
- ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
- "0x%llx, count 0x%lx.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type),
- (unsigned long long)pos,
- (unsigned long)iov_iter_count(i));
- /*
- * If a previous ntfs_truncate() failed, repeat it and abort if it
- * fails again.
- */
- if (unlikely(NInoTruncateFailed(ni))) {
- int err;
-
- inode_dio_wait(vi);
- err = ntfs_truncate(vi);
- if (err || NInoTruncateFailed(ni)) {
- if (!err)
- err = -EIO;
- ntfs_error(vol->sb, "Cannot perform write to inode "
- "0x%lx, attribute type 0x%x, because "
- "ntfs_truncate() failed (error code "
- "%i).", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- return err;
- }
- }
- /*
- * Determine the number of pages per cluster for non-resident
- * attributes.
- */
- nr_pages = 1;
- if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni))
- nr_pages = vol->cluster_size >> PAGE_SHIFT;
- last_vcn = -1;
- do {
- VCN vcn;
- pgoff_t start_idx;
- unsigned ofs, do_pages, u;
- size_t copied;
-
- start_idx = pos >> PAGE_SHIFT;
- ofs = pos & ~PAGE_MASK;
- bytes = PAGE_SIZE - ofs;
- do_pages = 1;
- if (nr_pages > 1) {
- vcn = pos >> vol->cluster_size_bits;
- if (vcn != last_vcn) {
- last_vcn = vcn;
- /*
- * Get the lcn of the vcn the write is in. If
- * it is a hole, need to lock down all pages in
- * the cluster.
- */
- down_read(&ni->runlist.lock);
- lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
- vol->cluster_size_bits, false);
- up_read(&ni->runlist.lock);
- if (unlikely(lcn < LCN_HOLE)) {
- if (lcn == LCN_ENOMEM)
- status = -ENOMEM;
- else {
- status = -EIO;
- ntfs_error(vol->sb, "Cannot "
- "perform write to "
- "inode 0x%lx, "
- "attribute type 0x%x, "
- "because the attribute "
- "is corrupt.",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type));
- }
- break;
- }
- if (lcn == LCN_HOLE) {
- start_idx = (pos & ~(s64)
- vol->cluster_size_mask)
- >> PAGE_SHIFT;
- bytes = vol->cluster_size - (pos &
- vol->cluster_size_mask);
- do_pages = nr_pages;
- }
- }
- }
- if (bytes > iov_iter_count(i))
- bytes = iov_iter_count(i);
-again:
- /*
- * Bring in the user page(s) that we will copy from _first_.
- * Otherwise there is a nasty deadlock on copying from the same
- * page(s) as we are writing to, without it/them being marked
- * up-to-date. Note, at present there is nothing to stop the
- * pages being swapped out between us bringing them into memory
- * and doing the actual copying.
- */
- if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
- status = -EFAULT;
- break;
- }
- /* Get and lock @do_pages starting at index @start_idx. */
- status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
- pages, &cached_page);
- if (unlikely(status))
- break;
- /*
- * For non-resident attributes, we need to fill any holes with
- * actual clusters and ensure all bufferes are mapped. We also
- * need to bring uptodate any buffers that are only partially
- * being written to.
- */
- if (NInoNonResident(ni)) {
- status = ntfs_prepare_pages_for_non_resident_write(
- pages, do_pages, pos, bytes);
- if (unlikely(status)) {
- do {
- unlock_page(pages[--do_pages]);
- put_page(pages[do_pages]);
- } while (do_pages);
- break;
- }
- }
- u = (pos >> PAGE_SHIFT) - pages[0]->index;
- copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
- i, bytes);
- ntfs_flush_dcache_pages(pages + u, do_pages - u);
- status = 0;
- if (likely(copied == bytes)) {
- status = ntfs_commit_pages_after_write(pages, do_pages,
- pos, bytes);
- }
- do {
- unlock_page(pages[--do_pages]);
- put_page(pages[do_pages]);
- } while (do_pages);
- if (unlikely(status < 0)) {
- iov_iter_revert(i, copied);
- break;
- }
- cond_resched();
- if (unlikely(copied < bytes)) {
- iov_iter_revert(i, copied);
- if (copied)
- bytes = copied;
- else if (bytes > PAGE_SIZE - ofs)
- bytes = PAGE_SIZE - ofs;
- goto again;
- }
- pos += copied;
- written += copied;
- balance_dirty_pages_ratelimited(mapping);
- if (fatal_signal_pending(current)) {
- status = -EINTR;
- break;
- }
- } while (iov_iter_count(i));
- if (cached_page)
- put_page(cached_page);
- ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
- written ? "written" : "status", (unsigned long)written,
- (long)status);
- return written ? written : status;
-}
-
-/**
- * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
- * @iocb: IO state structure
- * @from: iov_iter with data to write
- *
- * Basically the same as generic_file_write_iter() except that it ends up
- * up calling ntfs_perform_write() instead of generic_perform_write() and that
- * O_DIRECT is not implemented.
- */
-static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct inode *vi = file_inode(file);
- ssize_t written = 0;
- ssize_t err;
-
- inode_lock(vi);
- /* We can write back this queue in page reclaim. */
- err = ntfs_prepare_file_for_write(iocb, from);
- if (iov_iter_count(from) && !err)
- written = ntfs_perform_write(file, from, iocb->ki_pos);
- inode_unlock(vi);
- iocb->ki_pos += written;
- if (likely(written > 0))
- written = generic_write_sync(iocb, written);
- return written ? written : err;
-}
-
-/**
- * ntfs_file_fsync - sync a file to disk
- * @filp: file to be synced
- * @datasync: if non-zero only flush user data and not metadata
- *
- * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
- * system calls. This function is inspired by fs/buffer.c::file_fsync().
- *
- * If @datasync is false, write the mft record and all associated extent mft
- * records as well as the $DATA attribute and then sync the block device.
- *
- * If @datasync is true and the attribute is non-resident, we skip the writing
- * of the mft record and all associated extent mft records (this might still
- * happen due to the write_inode_now() call).
- *
- * Also, if @datasync is true, we do not wait on the inode to be written out
- * but we always wait on the page cache pages to be written out.
- *
- * Locking: Caller must hold i_mutex on the inode.
- *
- * TODO: We should probably also write all attribute/index inodes associated
- * with this inode but since we have no simple way of getting to them we ignore
- * this problem for now.
- */
-static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
- int datasync)
-{
- struct inode *vi = filp->f_mapping->host;
- int err, ret = 0;
-
- ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
-
- err = file_write_and_wait_range(filp, start, end);
- if (err)
- return err;
- inode_lock(vi);
-
- BUG_ON(S_ISDIR(vi->i_mode));
- if (!datasync || !NInoNonResident(NTFS_I(vi)))
- ret = __ntfs_write_inode(vi, 1);
- write_inode_now(vi, !datasync);
- /*
- * NOTE: If we were to use mapping->private_list (see ext2 and
- * fs/buffer.c) for dirty blocks then we could optimize the below to be
- * sync_mapping_buffers(vi->i_mapping).
- */
- err = sync_blockdev(vi->i_sb->s_bdev);
- if (unlikely(err && !ret))
- ret = err;
- if (likely(!ret))
- ntfs_debug("Done.");
- else
- ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
- "%u.", datasync ? "data" : "", vi->i_ino, -ret);
- inode_unlock(vi);
- return ret;
-}
-
-#endif /* NTFS_RW */
-
-const struct file_operations ntfs_file_ops = {
- .llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
-#ifdef NTFS_RW
- .write_iter = ntfs_file_write_iter,
- .fsync = ntfs_file_fsync,
-#endif /* NTFS_RW */
- .mmap = generic_file_mmap,
- .open = ntfs_file_open,
- .splice_read = filemap_splice_read,
-};
-
-const struct inode_operations ntfs_file_inode_ops = {
-#ifdef NTFS_RW
- .setattr = ntfs_setattr,
-#endif /* NTFS_RW */
-};
-
-const struct file_operations ntfs_empty_file_ops = {};
-
-const struct inode_operations ntfs_empty_inode_ops = {};
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
deleted file mode 100644
index d46c2c03a032..000000000000
--- a/fs/ntfs/index.c
+++ /dev/null
@@ -1,440 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * index.c - NTFS kernel index handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#include <linux/slab.h>
-
-#include "aops.h"
-#include "collate.h"
-#include "debug.h"
-#include "index.h"
-#include "ntfs.h"
-
-/**
- * ntfs_index_ctx_get - allocate and initialize a new index context
- * @idx_ni: ntfs index inode with which to initialize the context
- *
- * Allocate a new index context, initialize it with @idx_ni and return it.
- * Return NULL if allocation failed.
- *
- * Locking: Caller must hold i_mutex on the index inode.
- */
-ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
-{
- ntfs_index_context *ictx;
-
- ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS);
- if (ictx)
- *ictx = (ntfs_index_context){ .idx_ni = idx_ni };
- return ictx;
-}
-
-/**
- * ntfs_index_ctx_put - release an index context
- * @ictx: index context to free
- *
- * Release the index context @ictx, releasing all associated resources.
- *
- * Locking: Caller must hold i_mutex on the index inode.
- */
-void ntfs_index_ctx_put(ntfs_index_context *ictx)
-{
- if (ictx->entry) {
- if (ictx->is_in_root) {
- if (ictx->actx)
- ntfs_attr_put_search_ctx(ictx->actx);
- if (ictx->base_ni)
- unmap_mft_record(ictx->base_ni);
- } else {
- struct page *page = ictx->page;
- if (page) {
- BUG_ON(!PageLocked(page));
- unlock_page(page);
- ntfs_unmap_page(page);
- }
- }
- }
- kmem_cache_free(ntfs_index_ctx_cache, ictx);
- return;
-}
-
-/**
- * ntfs_index_lookup - find a key in an index and return its index entry
- * @key: [IN] key for which to search in the index
- * @key_len: [IN] length of @key in bytes
- * @ictx: [IN/OUT] context describing the index and the returned entry
- *
- * Before calling ntfs_index_lookup(), @ictx must have been obtained from a
- * call to ntfs_index_ctx_get().
- *
- * Look for the @key in the index specified by the index lookup context @ictx.
- * ntfs_index_lookup() walks the contents of the index looking for the @key.
- *
- * If the @key is found in the index, 0 is returned and @ictx is setup to
- * describe the index entry containing the matching @key. @ictx->entry is the
- * index entry and @ictx->data and @ictx->data_len are the index entry data and
- * its length in bytes, respectively.
- *
- * If the @key is not found in the index, -ENOENT is returned and @ictx is
- * setup to describe the index entry whose key collates immediately after the
- * search @key, i.e. this is the position in the index at which an index entry
- * with a key of @key would need to be inserted.
- *
- * If an error occurs return the negative error code and @ictx is left
- * untouched.
- *
- * When finished with the entry and its data, call ntfs_index_ctx_put() to free
- * the context and other associated resources.
- *
- * If the index entry was modified, call flush_dcache_index_entry_page()
- * immediately after the modification and either ntfs_index_entry_mark_dirty()
- * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
- * ensure that the changes are written to disk.
- *
- * Locking: - Caller must hold i_mutex on the index inode.
- * - Each page cache page in the index allocation mapping must be
- * locked whilst being accessed otherwise we may find a corrupt
- * page due to it being under ->writepage at the moment which
- * applies the mst protection fixups before writing out and then
- * removes them again after the write is complete after which it
- * unlocks the page.
- */
-int ntfs_index_lookup(const void *key, const int key_len,
- ntfs_index_context *ictx)
-{
- VCN vcn, old_vcn;
- ntfs_inode *idx_ni = ictx->idx_ni;
- ntfs_volume *vol = idx_ni->vol;
- struct super_block *sb = vol->sb;
- ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino;
- MFT_RECORD *m;
- INDEX_ROOT *ir;
- INDEX_ENTRY *ie;
- INDEX_ALLOCATION *ia;
- u8 *index_end, *kaddr;
- ntfs_attr_search_ctx *actx;
- struct address_space *ia_mapping;
- struct page *page;
- int rc, err = 0;
-
- ntfs_debug("Entering.");
- BUG_ON(!NInoAttr(idx_ni));
- BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION);
- BUG_ON(idx_ni->nr_extents != -1);
- BUG_ON(!base_ni);
- BUG_ON(!key);
- BUG_ON(key_len <= 0);
- if (!ntfs_is_collation_rule_supported(
- idx_ni->itype.index.collation_rule)) {
- ntfs_error(sb, "Index uses unsupported collation rule 0x%x. "
- "Aborting lookup.", le32_to_cpu(
- idx_ni->itype.index.collation_rule));
- return -EOPNOTSUPP;
- }
- /* Get hold of the mft record for the index inode. */
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- ntfs_error(sb, "map_mft_record() failed with error code %ld.",
- -PTR_ERR(m));
- return PTR_ERR(m);
- }
- actx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!actx)) {
- err = -ENOMEM;
- goto err_out;
- }
- /* Find the index root attribute in the mft record. */
- err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, actx);
- if (unlikely(err)) {
- if (err == -ENOENT) {
- ntfs_error(sb, "Index root attribute missing in inode "
- "0x%lx.", idx_ni->mft_no);
- err = -EIO;
- }
- goto err_out;
- }
- /* Get to the index root value (it has been verified in read_inode). */
- ir = (INDEX_ROOT*)((u8*)actx->attr +
- le16_to_cpu(actx->attr->data.resident.value_offset));
- index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
- /* The first index entry. */
- ie = (INDEX_ENTRY*)((u8*)&ir->index +
- le32_to_cpu(ir->index.entries_offset));
- /*
- * Loop until we exceed valid memory (corruption case) or until we
- * reach the last entry.
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- /* Bounds checks. */
- if ((u8*)ie < (u8*)actx->mrec || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->length) > index_end)
- goto idx_err_out;
- /*
- * The last entry cannot contain a key. It can however contain
- * a pointer to a child node in the B+tree so we just break out.
- */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /* Further bounds checks. */
- if ((u32)sizeof(INDEX_ENTRY_HEADER) +
- le16_to_cpu(ie->key_length) >
- le16_to_cpu(ie->data.vi.data_offset) ||
- (u32)le16_to_cpu(ie->data.vi.data_offset) +
- le16_to_cpu(ie->data.vi.data_length) >
- le16_to_cpu(ie->length))
- goto idx_err_out;
- /* If the keys match perfectly, we setup @ictx and return 0. */
- if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
- &ie->key, key_len)) {
-ir_done:
- ictx->is_in_root = true;
- ictx->ir = ir;
- ictx->actx = actx;
- ictx->base_ni = base_ni;
- ictx->ia = NULL;
- ictx->page = NULL;
-done:
- ictx->entry = ie;
- ictx->data = (u8*)ie +
- le16_to_cpu(ie->data.vi.data_offset);
- ictx->data_len = le16_to_cpu(ie->data.vi.data_length);
- ntfs_debug("Done.");
- return err;
- }
- /*
- * Not a perfect match, need to do full blown collation so we
- * know which way in the B+tree we have to go.
- */
- rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
- key_len, &ie->key, le16_to_cpu(ie->key_length));
- /*
- * If @key collates before the key of the current entry, there
- * is definitely no such key in this index but we might need to
- * descend into the B+tree so we just break out of the loop.
- */
- if (rc == -1)
- break;
- /*
- * A match should never happen as the memcmp() call should have
- * cought it, but we still treat it correctly.
- */
- if (!rc)
- goto ir_done;
- /* The keys are not equal, continue the search. */
- }
- /*
- * We have finished with this index without success. Check for the
- * presence of a child node and if not present setup @ictx and return
- * -ENOENT.
- */
- if (!(ie->flags & INDEX_ENTRY_NODE)) {
- ntfs_debug("Entry not found.");
- err = -ENOENT;
- goto ir_done;
- } /* Child node present, descend into it. */
- /* Consistency check: Verify that an index allocation exists. */
- if (!NInoIndexAllocPresent(idx_ni)) {
- ntfs_error(sb, "No index allocation attribute but index entry "
- "requires one. Inode 0x%lx is corrupt or "
- "driver bug.", idx_ni->mft_no);
- goto err_out;
- }
- /* Get the starting vcn of the index_block holding the child node. */
- vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
- ia_mapping = VFS_I(idx_ni)->i_mapping;
- /*
- * We are done with the index root and the mft record. Release them,
- * otherwise we deadlock with ntfs_map_page().
- */
- ntfs_attr_put_search_ctx(actx);
- unmap_mft_record(base_ni);
- m = NULL;
- actx = NULL;
-descend_into_child_node:
- /*
- * Convert vcn to index into the index allocation attribute in units
- * of PAGE_SIZE and map the page cache page, reading it from
- * disk if necessary.
- */
- page = ntfs_map_page(ia_mapping, vcn <<
- idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
- if (IS_ERR(page)) {
- ntfs_error(sb, "Failed to map index page, error %ld.",
- -PTR_ERR(page));
- err = PTR_ERR(page);
- goto err_out;
- }
- lock_page(page);
- kaddr = (u8*)page_address(page);
-fast_descend_into_child_node:
- /* Get to the index allocation block. */
- ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
- idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
- /* Bounds checks. */
- if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
- ntfs_error(sb, "Out of bounds check failed. Corrupt inode "
- "0x%lx or driver bug.", idx_ni->mft_no);
- goto unm_err_out;
- }
- /* Catch multi sector transfer fixup errors. */
- if (unlikely(!ntfs_is_indx_record(ia->magic))) {
- ntfs_error(sb, "Index record with vcn 0x%llx is corrupt. "
- "Corrupt inode 0x%lx. Run chkdsk.",
- (long long)vcn, idx_ni->mft_no);
- goto unm_err_out;
- }
- if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
- ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
- "different from expected VCN (0x%llx). Inode "
- "0x%lx is corrupt or driver bug.",
- (unsigned long long)
- sle64_to_cpu(ia->index_block_vcn),
- (unsigned long long)vcn, idx_ni->mft_no);
- goto unm_err_out;
- }
- if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
- idx_ni->itype.index.block_size) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has "
- "a size (%u) differing from the index "
- "specified size (%u). Inode is corrupt or "
- "driver bug.", (unsigned long long)vcn,
- idx_ni->mft_no,
- le32_to_cpu(ia->index.allocated_size) + 0x18,
- idx_ni->itype.index.block_size);
- goto unm_err_out;
- }
- index_end = (u8*)ia + idx_ni->itype.index.block_size;
- if (index_end > kaddr + PAGE_SIZE) {
- ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx "
- "crosses page boundary. Impossible! Cannot "
- "access! This is probably a bug in the "
- "driver.", (unsigned long long)vcn,
- idx_ni->mft_no);
- goto unm_err_out;
- }
- index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
- if (index_end > (u8*)ia + idx_ni->itype.index.block_size) {
- ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode "
- "0x%lx exceeds maximum size.",
- (unsigned long long)vcn, idx_ni->mft_no);
- goto unm_err_out;
- }
- /* The first index entry. */
- ie = (INDEX_ENTRY*)((u8*)&ia->index +
- le32_to_cpu(ia->index.entries_offset));
- /*
- * Iterate similar to above big loop but applied to index buffer, thus
- * loop until we exceed valid memory (corruption case) or until we
- * reach the last entry.
- */
- for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
- /* Bounds checks. */
- if ((u8*)ie < (u8*)ia || (u8*)ie +
- sizeof(INDEX_ENTRY_HEADER) > index_end ||
- (u8*)ie + le16_to_cpu(ie->length) > index_end) {
- ntfs_error(sb, "Index entry out of bounds in inode "
- "0x%lx.", idx_ni->mft_no);
- goto unm_err_out;
- }
- /*
- * The last entry cannot contain a key. It can however contain
- * a pointer to a child node in the B+tree so we just break out.
- */
- if (ie->flags & INDEX_ENTRY_END)
- break;
- /* Further bounds checks. */
- if ((u32)sizeof(INDEX_ENTRY_HEADER) +
- le16_to_cpu(ie->key_length) >
- le16_to_cpu(ie->data.vi.data_offset) ||
- (u32)le16_to_cpu(ie->data.vi.data_offset) +
- le16_to_cpu(ie->data.vi.data_length) >
- le16_to_cpu(ie->length)) {
- ntfs_error(sb, "Index entry out of bounds in inode "
- "0x%lx.", idx_ni->mft_no);
- goto unm_err_out;
- }
- /* If the keys match perfectly, we setup @ictx and return 0. */
- if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
- &ie->key, key_len)) {
-ia_done:
- ictx->is_in_root = false;
- ictx->actx = NULL;
- ictx->base_ni = NULL;
- ictx->ia = ia;
- ictx->page = page;
- goto done;
- }
- /*
- * Not a perfect match, need to do full blown collation so we
- * know which way in the B+tree we have to go.
- */
- rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
- key_len, &ie->key, le16_to_cpu(ie->key_length));
- /*
- * If @key collates before the key of the current entry, there
- * is definitely no such key in this index but we might need to
- * descend into the B+tree so we just break out of the loop.
- */
- if (rc == -1)
- break;
- /*
- * A match should never happen as the memcmp() call should have
- * cought it, but we still treat it correctly.
- */
- if (!rc)
- goto ia_done;
- /* The keys are not equal, continue the search. */
- }
- /*
- * We have finished with this index buffer without success. Check for
- * the presence of a child node and if not present return -ENOENT.
- */
- if (!(ie->flags & INDEX_ENTRY_NODE)) {
- ntfs_debug("Entry not found.");
- err = -ENOENT;
- goto ia_done;
- }
- if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
- ntfs_error(sb, "Index entry with child node found in a leaf "
- "node in inode 0x%lx.", idx_ni->mft_no);
- goto unm_err_out;
- }
- /* Child node present, descend into it. */
- old_vcn = vcn;
- vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
- if (vcn >= 0) {
- /*
- * If vcn is in the same page cache page as old_vcn we recycle
- * the mapped page.
- */
- if (old_vcn << vol->cluster_size_bits >>
- PAGE_SHIFT == vcn <<
- vol->cluster_size_bits >>
- PAGE_SHIFT)
- goto fast_descend_into_child_node;
- unlock_page(page);
- ntfs_unmap_page(page);
- goto descend_into_child_node;
- }
- ntfs_error(sb, "Negative child node vcn in inode 0x%lx.",
- idx_ni->mft_no);
-unm_err_out:
- unlock_page(page);
- ntfs_unmap_page(page);
-err_out:
- if (!err)
- err = -EIO;
- if (actx)
- ntfs_attr_put_search_ctx(actx);
- if (m)
- unmap_mft_record(base_ni);
- return err;
-idx_err_out:
- ntfs_error(sb, "Corrupt index. Aborting lookup.");
- goto err_out;
-}
diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h
deleted file mode 100644
index bb3c3ae55138..000000000000
--- a/fs/ntfs/index.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * index.h - Defines for NTFS kernel index handling. Part of the Linux-NTFS
- * project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_INDEX_H
-#define _LINUX_NTFS_INDEX_H
-
-#include <linux/fs.h>
-
-#include "types.h"
-#include "layout.h"
-#include "inode.h"
-#include "attrib.h"
-#include "mft.h"
-#include "aops.h"
-
-/**
- * @idx_ni: index inode containing the @entry described by this context
- * @entry: index entry (points into @ir or @ia)
- * @data: index entry data (points into @entry)
- * @data_len: length in bytes of @data
- * @is_in_root: 'true' if @entry is in @ir and 'false' if it is in @ia
- * @ir: index root if @is_in_root and NULL otherwise
- * @actx: attribute search context if @is_in_root and NULL otherwise
- * @base_ni: base inode if @is_in_root and NULL otherwise
- * @ia: index block if @is_in_root is 'false' and NULL otherwise
- * @page: page if @is_in_root is 'false' and NULL otherwise
- *
- * @idx_ni is the index inode this context belongs to.
- *
- * @entry is the index entry described by this context. @data and @data_len
- * are the index entry data and its length in bytes, respectively. @data
- * simply points into @entry. This is probably what the user is interested in.
- *
- * If @is_in_root is 'true', @entry is in the index root attribute @ir described
- * by the attribute search context @actx and the base inode @base_ni. @ia and
- * @page are NULL in this case.
- *
- * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia
- * and @page point to the index allocation block and the mapped, locked page it
- * is in, respectively. @ir, @actx and @base_ni are NULL in this case.
- *
- * To obtain a context call ntfs_index_ctx_get().
- *
- * We use this context to allow ntfs_index_lookup() to return the found index
- * @entry and its @data without having to allocate a buffer and copy the @entry
- * and/or its @data into it.
- *
- * When finished with the @entry and its @data, call ntfs_index_ctx_put() to
- * free the context and other associated resources.
- *
- * If the index entry was modified, call flush_dcache_index_entry_page()
- * immediately after the modification and either ntfs_index_entry_mark_dirty()
- * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
- * ensure that the changes are written to disk.
- */
-typedef struct {
- ntfs_inode *idx_ni;
- INDEX_ENTRY *entry;
- void *data;
- u16 data_len;
- bool is_in_root;
- INDEX_ROOT *ir;
- ntfs_attr_search_ctx *actx;
- ntfs_inode *base_ni;
- INDEX_ALLOCATION *ia;
- struct page *page;
-} ntfs_index_context;
-
-extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni);
-extern void ntfs_index_ctx_put(ntfs_index_context *ictx);
-
-extern int ntfs_index_lookup(const void *key, const int key_len,
- ntfs_index_context *ictx);
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries
- * @ictx: ntfs index context describing the index entry
- *
- * Call flush_dcache_page() for the page in which an index entry resides.
- *
- * This must be called every time an index entry is modified, just after the
- * modification.
- *
- * If the index entry is in the index root attribute, simply flush the page
- * containing the mft record containing the index root attribute.
- *
- * If the index entry is in an index block belonging to the index allocation
- * attribute, simply flush the page cache page containing the index block.
- */
-static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx)
-{
- if (ictx->is_in_root)
- flush_dcache_mft_record_page(ictx->actx->ntfs_ino);
- else
- flush_dcache_page(ictx->page);
-}
-
-/**
- * ntfs_index_entry_mark_dirty - mark an index entry dirty
- * @ictx: ntfs index context describing the index entry
- *
- * Mark the index entry described by the index entry context @ictx dirty.
- *
- * If the index entry is in the index root attribute, simply mark the mft
- * record containing the index root attribute dirty. This ensures the mft
- * record, and hence the index root attribute, will be written out to disk
- * later.
- *
- * If the index entry is in an index block belonging to the index allocation
- * attribute, mark the buffers belonging to the index record as well as the
- * page cache page the index block is in dirty. This automatically marks the
- * VFS inode of the ntfs index inode to which the index entry belongs dirty,
- * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the
- * dirty index block, will be written out to disk later.
- */
-static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx)
-{
- if (ictx->is_in_root)
- mark_mft_record_dirty(ictx->actx->ntfs_ino);
- else
- mark_ntfs_record_dirty(ictx->page,
- (u8*)ictx->ia - (u8*)page_address(ictx->page));
-}
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_INDEX_H */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
deleted file mode 100644
index aba1e22db4e9..000000000000
--- a/fs/ntfs/inode.c
+++ /dev/null
@@ -1,3102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * inode.c - NTFS kernel inode handling.
- *
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
- */
-
-#include <linux/buffer_head.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/mount.h>
-#include <linux/mutex.h>
-#include <linux/pagemap.h>
-#include <linux/quotaops.h>
-#include <linux/slab.h>
-#include <linux/log2.h>
-
-#include "aops.h"
-#include "attrib.h"
-#include "bitmap.h"
-#include "dir.h"
-#include "debug.h"
-#include "inode.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "time.h"
-#include "ntfs.h"
-
-/**
- * ntfs_test_inode - compare two (possibly fake) inodes for equality
- * @vi: vfs inode which to test
- * @data: data which is being tested with
- *
- * Compare the ntfs attribute embedded in the ntfs specific part of the vfs
- * inode @vi for equality with the ntfs attribute @data.
- *
- * If searching for the normal file/directory inode, set @na->type to AT_UNUSED.
- * @na->name and @na->name_len are then ignored.
- *
- * Return 1 if the attributes match and 0 if not.
- *
- * NOTE: This function runs with the inode_hash_lock spin lock held so it is not
- * allowed to sleep.
- */
-int ntfs_test_inode(struct inode *vi, void *data)
-{
- ntfs_attr *na = (ntfs_attr *)data;
- ntfs_inode *ni;
-
- if (vi->i_ino != na->mft_no)
- return 0;
- ni = NTFS_I(vi);
- /* If !NInoAttr(ni), @vi is a normal file or directory inode. */
- if (likely(!NInoAttr(ni))) {
- /* If not looking for a normal inode this is a mismatch. */
- if (unlikely(na->type != AT_UNUSED))
- return 0;
- } else {
- /* A fake inode describing an attribute. */
- if (ni->type != na->type)
- return 0;
- if (ni->name_len != na->name_len)
- return 0;
- if (na->name_len && memcmp(ni->name, na->name,
- na->name_len * sizeof(ntfschar)))
- return 0;
- }
- /* Match! */
- return 1;
-}
-
-/**
- * ntfs_init_locked_inode - initialize an inode
- * @vi: vfs inode to initialize
- * @data: data which to initialize @vi to
- *
- * Initialize the vfs inode @vi with the values from the ntfs attribute @data in
- * order to enable ntfs_test_inode() to do its work.
- *
- * If initializing the normal file/directory inode, set @na->type to AT_UNUSED.
- * In that case, @na->name and @na->name_len should be set to NULL and 0,
- * respectively. Although that is not strictly necessary as
- * ntfs_read_locked_inode() will fill them in later.
- *
- * Return 0 on success and -errno on error.
- *
- * NOTE: This function runs with the inode->i_lock spin lock held so it is not
- * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
- */
-static int ntfs_init_locked_inode(struct inode *vi, void *data)
-{
- ntfs_attr *na = (ntfs_attr *)data;
- ntfs_inode *ni = NTFS_I(vi);
-
- vi->i_ino = na->mft_no;
-
- ni->type = na->type;
- if (na->type == AT_INDEX_ALLOCATION)
- NInoSetMstProtected(ni);
-
- ni->name = na->name;
- ni->name_len = na->name_len;
-
- /* If initializing a normal inode, we are done. */
- if (likely(na->type == AT_UNUSED)) {
- BUG_ON(na->name);
- BUG_ON(na->name_len);
- return 0;
- }
-
- /* It is a fake inode. */
- NInoSetAttr(ni);
-
- /*
- * We have I30 global constant as an optimization as it is the name
- * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC
- * allocation but that is ok. And most attributes are unnamed anyway,
- * thus the fraction of named attributes with name != I30 is actually
- * absolutely tiny.
- */
- if (na->name_len && na->name != I30) {
- unsigned int i;
-
- BUG_ON(!na->name);
- i = na->name_len * sizeof(ntfschar);
- ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC);
- if (!ni->name)
- return -ENOMEM;
- memcpy(ni->name, na->name, i);
- ni->name[na->name_len] = 0;
- }
- return 0;
-}
-
-static int ntfs_read_locked_inode(struct inode *vi);
-static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi);
-static int ntfs_read_locked_index_inode(struct inode *base_vi,
- struct inode *vi);
-
-/**
- * ntfs_iget - obtain a struct inode corresponding to a specific normal inode
- * @sb: super block of mounted volume
- * @mft_no: mft record number / inode number to obtain
- *
- * Obtain the struct inode corresponding to a specific normal inode (i.e. a
- * file or directory).
- *
- * If the inode is in the cache, it is just returned with an increased
- * reference count. Otherwise, a new struct inode is allocated and initialized,
- * and finally ntfs_read_locked_inode() is called to read in the inode and
- * fill in the remainder of the inode structure.
- *
- * Return the struct inode on success. Check the return value with IS_ERR() and
- * if true, the function failed and the error code is obtained from PTR_ERR().
- */
-struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no)
-{
- struct inode *vi;
- int err;
- ntfs_attr na;
-
- na.mft_no = mft_no;
- na.type = AT_UNUSED;
- na.name = NULL;
- na.name_len = 0;
-
- vi = iget5_locked(sb, mft_no, ntfs_test_inode,
- ntfs_init_locked_inode, &na);
- if (unlikely(!vi))
- return ERR_PTR(-ENOMEM);
-
- err = 0;
-
- /* If this is a freshly allocated inode, need to read it now. */
- if (vi->i_state & I_NEW) {
- err = ntfs_read_locked_inode(vi);
- unlock_new_inode(vi);
- }
- /*
- * There is no point in keeping bad inodes around if the failure was
- * due to ENOMEM. We want to be able to retry again later.
- */
- if (unlikely(err == -ENOMEM)) {
- iput(vi);
- vi = ERR_PTR(err);
- }
- return vi;
-}
-
-/**
- * ntfs_attr_iget - obtain a struct inode corresponding to an attribute
- * @base_vi: vfs base inode containing the attribute
- * @type: attribute type
- * @name: Unicode name of the attribute (NULL if unnamed)
- * @name_len: length of @name in Unicode characters (0 if unnamed)
- *
- * Obtain the (fake) struct inode corresponding to the attribute specified by
- * @type, @name, and @name_len, which is present in the base mft record
- * specified by the vfs inode @base_vi.
- *
- * If the attribute inode is in the cache, it is just returned with an
- * increased reference count. Otherwise, a new struct inode is allocated and
- * initialized, and finally ntfs_read_locked_attr_inode() is called to read the
- * attribute and fill in the inode structure.
- *
- * Note, for index allocation attributes, you need to use ntfs_index_iget()
- * instead of ntfs_attr_iget() as working with indices is a lot more complex.
- *
- * Return the struct inode of the attribute inode on success. Check the return
- * value with IS_ERR() and if true, the function failed and the error code is
- * obtained from PTR_ERR().
- */
-struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
- ntfschar *name, u32 name_len)
-{
- struct inode *vi;
- int err;
- ntfs_attr na;
-
- /* Make sure no one calls ntfs_attr_iget() for indices. */
- BUG_ON(type == AT_INDEX_ALLOCATION);
-
- na.mft_no = base_vi->i_ino;
- na.type = type;
- na.name = name;
- na.name_len = name_len;
-
- vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
- ntfs_init_locked_inode, &na);
- if (unlikely(!vi))
- return ERR_PTR(-ENOMEM);
-
- err = 0;
-
- /* If this is a freshly allocated inode, need to read it now. */
- if (vi->i_state & I_NEW) {
- err = ntfs_read_locked_attr_inode(base_vi, vi);
- unlock_new_inode(vi);
- }
- /*
- * There is no point in keeping bad attribute inodes around. This also
- * simplifies things in that we never need to check for bad attribute
- * inodes elsewhere.
- */
- if (unlikely(err)) {
- iput(vi);
- vi = ERR_PTR(err);
- }
- return vi;
-}
-
-/**
- * ntfs_index_iget - obtain a struct inode corresponding to an index
- * @base_vi: vfs base inode containing the index related attributes
- * @name: Unicode name of the index
- * @name_len: length of @name in Unicode characters
- *
- * Obtain the (fake) struct inode corresponding to the index specified by @name
- * and @name_len, which is present in the base mft record specified by the vfs
- * inode @base_vi.
- *
- * If the index inode is in the cache, it is just returned with an increased
- * reference count. Otherwise, a new struct inode is allocated and
- * initialized, and finally ntfs_read_locked_index_inode() is called to read
- * the index related attributes and fill in the inode structure.
- *
- * Return the struct inode of the index inode on success. Check the return
- * value with IS_ERR() and if true, the function failed and the error code is
- * obtained from PTR_ERR().
- */
-struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
- u32 name_len)
-{
- struct inode *vi;
- int err;
- ntfs_attr na;
-
- na.mft_no = base_vi->i_ino;
- na.type = AT_INDEX_ALLOCATION;
- na.name = name;
- na.name_len = name_len;
-
- vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
- ntfs_init_locked_inode, &na);
- if (unlikely(!vi))
- return ERR_PTR(-ENOMEM);
-
- err = 0;
-
- /* If this is a freshly allocated inode, need to read it now. */
- if (vi->i_state & I_NEW) {
- err = ntfs_read_locked_index_inode(base_vi, vi);
- unlock_new_inode(vi);
- }
- /*
- * There is no point in keeping bad index inodes around. This also
- * simplifies things in that we never need to check for bad index
- * inodes elsewhere.
- */
- if (unlikely(err)) {
- iput(vi);
- vi = ERR_PTR(err);
- }
- return vi;
-}
-
-struct inode *ntfs_alloc_big_inode(struct super_block *sb)
-{
- ntfs_inode *ni;
-
- ntfs_debug("Entering.");
- ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS);
- if (likely(ni != NULL)) {
- ni->state = 0;
- return VFS_I(ni);
- }
- ntfs_error(sb, "Allocation of NTFS big inode structure failed.");
- return NULL;
-}
-
-void ntfs_free_big_inode(struct inode *inode)
-{
- kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
-}
-
-static inline ntfs_inode *ntfs_alloc_extent_inode(void)
-{
- ntfs_inode *ni;
-
- ntfs_debug("Entering.");
- ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS);
- if (likely(ni != NULL)) {
- ni->state = 0;
- return ni;
- }
- ntfs_error(NULL, "Allocation of NTFS inode structure failed.");
- return NULL;
-}
-
-static void ntfs_destroy_extent_inode(ntfs_inode *ni)
-{
- ntfs_debug("Entering.");
- BUG_ON(ni->page);
- if (!atomic_dec_and_test(&ni->count))
- BUG();
- kmem_cache_free(ntfs_inode_cache, ni);
-}
-
-/*
- * The attribute runlist lock has separate locking rules from the
- * normal runlist lock, so split the two lock-classes:
- */
-static struct lock_class_key attr_list_rl_lock_class;
-
-/**
- * __ntfs_init_inode - initialize ntfs specific part of an inode
- * @sb: super block of mounted volume
- * @ni: freshly allocated ntfs inode which to initialize
- *
- * Initialize an ntfs inode to defaults.
- *
- * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left
- * untouched. Make sure to initialize them elsewhere.
- *
- * Return zero on success and -ENOMEM on error.
- */
-void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
-{
- ntfs_debug("Entering.");
- rwlock_init(&ni->size_lock);
- ni->initialized_size = ni->allocated_size = 0;
- ni->seq_no = 0;
- atomic_set(&ni->count, 1);
- ni->vol = NTFS_SB(sb);
- ntfs_init_runlist(&ni->runlist);
- mutex_init(&ni->mrec_lock);
- ni->page = NULL;
- ni->page_ofs = 0;
- ni->attr_list_size = 0;
- ni->attr_list = NULL;
- ntfs_init_runlist(&ni->attr_list_rl);
- lockdep_set_class(&ni->attr_list_rl.lock,
- &attr_list_rl_lock_class);
- ni->itype.index.block_size = 0;
- ni->itype.index.vcn_size = 0;
- ni->itype.index.collation_rule = 0;
- ni->itype.index.block_size_bits = 0;
- ni->itype.index.vcn_size_bits = 0;
- mutex_init(&ni->extent_lock);
- ni->nr_extents = 0;
- ni->ext.base_ntfs_ino = NULL;
-}
-
-/*
- * Extent inodes get MFT-mapped in a nested way, while the base inode
- * is still mapped. Teach this nesting to the lock validator by creating
- * a separate class for nested inode's mrec_lock's:
- */
-static struct lock_class_key extent_inode_mrec_lock_key;
-
-inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
- unsigned long mft_no)
-{
- ntfs_inode *ni = ntfs_alloc_extent_inode();
-
- ntfs_debug("Entering.");
- if (likely(ni != NULL)) {
- __ntfs_init_inode(sb, ni);
- lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key);
- ni->mft_no = mft_no;
- ni->type = AT_UNUSED;
- ni->name = NULL;
- ni->name_len = 0;
- }
- return ni;
-}
-
-/**
- * ntfs_is_extended_system_file - check if a file is in the $Extend directory
- * @ctx: initialized attribute search context
- *
- * Search all file name attributes in the inode described by the attribute
- * search context @ctx and check if any of the names are in the $Extend system
- * directory.
- *
- * Return values:
- * 1: file is in $Extend directory
- * 0: file is not in $Extend directory
- * -errno: failed to determine if the file is in the $Extend directory
- */
-static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx)
-{
- int nr_links, err;
-
- /* Restart search. */
- ntfs_attr_reinit_search_ctx(ctx);
-
- /* Get number of hard links. */
- nr_links = le16_to_cpu(ctx->mrec->link_count);
-
- /* Loop through all hard links. */
- while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0,
- ctx))) {
- FILE_NAME_ATTR *file_name_attr;
- ATTR_RECORD *attr = ctx->attr;
- u8 *p, *p2;
-
- nr_links--;
- /*
- * Maximum sanity checking as we are called on an inode that
- * we suspect might be corrupt.
- */
- p = (u8*)attr + le32_to_cpu(attr->length);
- if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec +
- le32_to_cpu(ctx->mrec->bytes_in_use)) {
-err_corrupt_attr:
- ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name "
- "attribute. You should run chkdsk.");
- return -EIO;
- }
- if (attr->non_resident) {
- ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file "
- "name. You should run chkdsk.");
- return -EIO;
- }
- if (attr->flags) {
- ntfs_error(ctx->ntfs_ino->vol->sb, "File name with "
- "invalid flags. You should run "
- "chkdsk.");
- return -EIO;
- }
- if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) {
- ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file "
- "name. You should run chkdsk.");
- return -EIO;
- }
- file_name_attr = (FILE_NAME_ATTR*)((u8*)attr +
- le16_to_cpu(attr->data.resident.value_offset));
- p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length);
- if (p2 < (u8*)attr || p2 > p)
- goto err_corrupt_attr;
- /* This attribute is ok, but is it in the $Extend directory? */
- if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend)
- return 1; /* YES, it's an extended system file. */
- }
- if (unlikely(err != -ENOENT))
- return err;
- if (unlikely(nr_links)) {
- ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count "
- "doesn't match number of name attributes. You "
- "should run chkdsk.");
- return -EIO;
- }
- return 0; /* NO, it is not an extended system file. */
-}
-
-/**
- * ntfs_read_locked_inode - read an inode from its device
- * @vi: inode to read
- *
- * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode
- * described by @vi into memory from the device.
- *
- * The only fields in @vi that we need to/can look at when the function is
- * called are i_sb, pointing to the mounted device's super block, and i_ino,
- * the number of the inode to load.
- *
- * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino
- * for reading and sets up the necessary @vi fields as well as initializing
- * the ntfs inode.
- *
- * Q: What locks are held when the function is called?
- * A: i_state has I_NEW set, hence the inode is locked, also
- * i_count is set to 1, so it is not going to go away
- * i_flags is set to 0 and we have no business touching it. Only an ioctl()
- * is allowed to write to them. We should of course be honouring them but
- * we need to do that using the IS_* macros defined in include/linux/fs.h.
- * In any case ntfs_read_locked_inode() has nothing to do with i_flags.
- *
- * Return 0 on success and -errno on error. In the error case, the inode will
- * have had make_bad_inode() executed on it.
- */
-static int ntfs_read_locked_inode(struct inode *vi)
-{
- ntfs_volume *vol = NTFS_SB(vi->i_sb);
- ntfs_inode *ni;
- struct inode *bvi;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- STANDARD_INFORMATION *si;
- ntfs_attr_search_ctx *ctx;
- int err = 0;
-
- ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
-
- /* Setup the generic vfs inode parts now. */
- vi->i_uid = vol->uid;
- vi->i_gid = vol->gid;
- vi->i_mode = 0;
-
- /*
- * Initialize the ntfs specific part of @vi special casing
- * FILE_MFT which we need to do at mount time.
- */
- if (vi->i_ino != FILE_MFT)
- ntfs_init_big_inode(vi);
- ni = NTFS_I(vi);
-
- m = map_mft_record(ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(ni, m);
- if (!ctx) {
- err = -ENOMEM;
- goto unm_err_out;
- }
-
- if (!(m->flags & MFT_RECORD_IN_USE)) {
- ntfs_error(vi->i_sb, "Inode is not in use!");
- goto unm_err_out;
- }
- if (m->base_mft_record) {
- ntfs_error(vi->i_sb, "Inode is an extent inode!");
- goto unm_err_out;
- }
-
- /* Transfer information from mft record into vfs and ntfs inodes. */
- vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
-
- /*
- * FIXME: Keep in mind that link_count is two for files which have both
- * a long file name and a short file name as separate entries, so if
- * we are hiding short file names this will be too high. Either we need
- * to account for the short file names by subtracting them or we need
- * to make sure we delete files even though i_nlink is not zero which
- * might be tricky due to vfs interactions. Need to think about this
- * some more when implementing the unlink command.
- */
- set_nlink(vi, le16_to_cpu(m->link_count));
- /*
- * FIXME: Reparse points can have the directory bit set even though
- * they would be S_IFLNK. Need to deal with this further below when we
- * implement reparse points / symbolic links but it will do for now.
- * Also if not a directory, it could be something else, rather than
- * a regular file. But again, will do for now.
- */
- /* Everyone gets all permissions. */
- vi->i_mode |= S_IRWXUGO;
- /* If read-only, no one gets write permissions. */
- if (IS_RDONLY(vi))
- vi->i_mode &= ~S_IWUGO;
- if (m->flags & MFT_RECORD_IS_DIRECTORY) {
- vi->i_mode |= S_IFDIR;
- /*
- * Apply the directory permissions mask set in the mount
- * options.
- */
- vi->i_mode &= ~vol->dmask;
- /* Things break without this kludge! */
- if (vi->i_nlink > 1)
- set_nlink(vi, 1);
- } else {
- vi->i_mode |= S_IFREG;
- /* Apply the file permissions mask set in the mount options. */
- vi->i_mode &= ~vol->fmask;
- }
- /*
- * Find the standard information attribute in the mft record. At this
- * stage we haven't setup the attribute list stuff yet, so this could
- * in fact fail if the standard information is in an extent record, but
- * I don't think this actually ever happens.
- */
- err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0,
- ctx);
- if (unlikely(err)) {
- if (err == -ENOENT) {
- /*
- * TODO: We should be performing a hot fix here (if the
- * recover mount option is set) by creating a new
- * attribute.
- */
- ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute "
- "is missing.");
- }
- goto unm_err_out;
- }
- a = ctx->attr;
- /* Get the standard information attribute value. */
- if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset)
- + le32_to_cpu(a->data.resident.value_length) >
- (u8 *)ctx->mrec + vol->mft_record_size) {
- ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode.");
- goto unm_err_out;
- }
- si = (STANDARD_INFORMATION*)((u8*)a +
- le16_to_cpu(a->data.resident.value_offset));
-
- /* Transfer information from the standard information into vi. */
- /*
- * Note: The i_?times do not quite map perfectly onto the NTFS times,
- * but they are close enough, and in the end it doesn't really matter
- * that much...
- */
- /*
- * mtime is the last change of the data within the file. Not changed
- * when only metadata is changed, e.g. a rename doesn't affect mtime.
- */
- inode_set_mtime_to_ts(vi, ntfs2utc(si->last_data_change_time));
- /*
- * ctime is the last change of the metadata of the file. This obviously
- * always changes, when mtime is changed. ctime can be changed on its
- * own, mtime is then not changed, e.g. when a file is renamed.
- */
- inode_set_ctime_to_ts(vi, ntfs2utc(si->last_mft_change_time));
- /*
- * Last access to the data within the file. Not changed during a rename
- * for example but changed whenever the file is written to.
- */
- inode_set_atime_to_ts(vi, ntfs2utc(si->last_access_time));
-
- /* Find the attribute list attribute if present. */
- ntfs_attr_reinit_search_ctx(ctx);
- err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
- if (err) {
- if (unlikely(err != -ENOENT)) {
- ntfs_error(vi->i_sb, "Failed to lookup attribute list "
- "attribute.");
- goto unm_err_out;
- }
- } else /* if (!err) */ {
- if (vi->i_ino == FILE_MFT)
- goto skip_attr_list_load;
- ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino);
- NInoSetAttrList(ni);
- a = ctx->attr;
- if (a->flags & ATTR_COMPRESSION_MASK) {
- ntfs_error(vi->i_sb, "Attribute list attribute is "
- "compressed.");
- goto unm_err_out;
- }
- if (a->flags & ATTR_IS_ENCRYPTED ||
- a->flags & ATTR_IS_SPARSE) {
- if (a->non_resident) {
- ntfs_error(vi->i_sb, "Non-resident attribute "
- "list attribute is encrypted/"
- "sparse.");
- goto unm_err_out;
- }
- ntfs_warning(vi->i_sb, "Resident attribute list "
- "attribute in inode 0x%lx is marked "
- "encrypted/sparse which is not true. "
- "However, Windows allows this and "
- "chkdsk does not detect or correct it "
- "so we will just ignore the invalid "
- "flags and pretend they are not set.",
- vi->i_ino);
- }
- /* Now allocate memory for the attribute list. */
- ni->attr_list_size = (u32)ntfs_attr_size(a);
- ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
- if (!ni->attr_list) {
- ntfs_error(vi->i_sb, "Not enough memory to allocate "
- "buffer for attribute list.");
- err = -ENOMEM;
- goto unm_err_out;
- }
- if (a->non_resident) {
- NInoSetAttrListNonResident(ni);
- if (a->data.non_resident.lowest_vcn) {
- ntfs_error(vi->i_sb, "Attribute list has non "
- "zero lowest_vcn.");
- goto unm_err_out;
- }
- /*
- * Setup the runlist. No need for locking as we have
- * exclusive access to the inode at this time.
- */
- ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
- a, NULL);
- if (IS_ERR(ni->attr_list_rl.rl)) {
- err = PTR_ERR(ni->attr_list_rl.rl);
- ni->attr_list_rl.rl = NULL;
- ntfs_error(vi->i_sb, "Mapping pairs "
- "decompression failed.");
- goto unm_err_out;
- }
- /* Now load the attribute list. */
- if ((err = load_attribute_list(vol, &ni->attr_list_rl,
- ni->attr_list, ni->attr_list_size,
- sle64_to_cpu(a->data.non_resident.
- initialized_size)))) {
- ntfs_error(vi->i_sb, "Failed to load "
- "attribute list attribute.");
- goto unm_err_out;
- }
- } else /* if (!a->non_resident) */ {
- if ((u8*)a + le16_to_cpu(a->data.resident.value_offset)
- + le32_to_cpu(
- a->data.resident.value_length) >
- (u8*)ctx->mrec + vol->mft_record_size) {
- ntfs_error(vi->i_sb, "Corrupt attribute list "
- "in inode.");
- goto unm_err_out;
- }
- /* Now copy the attribute list. */
- memcpy(ni->attr_list, (u8*)a + le16_to_cpu(
- a->data.resident.value_offset),
- le32_to_cpu(
- a->data.resident.value_length));
- }
- }
-skip_attr_list_load:
- /*
- * If an attribute list is present we now have the attribute list value
- * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes.
- */
- if (S_ISDIR(vi->i_mode)) {
- loff_t bvi_size;
- ntfs_inode *bni;
- INDEX_ROOT *ir;
- u8 *ir_end, *index_end;
-
- /* It is a directory, find index root attribute. */
- ntfs_attr_reinit_search_ctx(ctx);
- err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE,
- 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT) {
- // FIXME: File is corrupt! Hot-fix with empty
- // index root attribute if recovery option is
- // set.
- ntfs_error(vi->i_sb, "$INDEX_ROOT attribute "
- "is missing.");
- }
- goto unm_err_out;
- }
- a = ctx->attr;
- /* Set up the state. */
- if (unlikely(a->non_resident)) {
- ntfs_error(vol->sb, "$INDEX_ROOT attribute is not "
- "resident.");
- goto unm_err_out;
- }
- /* Ensure the attribute name is placed before the value. */
- if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
- le16_to_cpu(a->data.resident.value_offset)))) {
- ntfs_error(vol->sb, "$INDEX_ROOT attribute name is "
- "placed after the attribute value.");
- goto unm_err_out;
- }
- /*
- * Compressed/encrypted index root just means that the newly
- * created files in that directory should be created compressed/
- * encrypted. However index root cannot be both compressed and
- * encrypted.
- */
- if (a->flags & ATTR_COMPRESSION_MASK)
- NInoSetCompressed(ni);
- if (a->flags & ATTR_IS_ENCRYPTED) {
- if (a->flags & ATTR_COMPRESSION_MASK) {
- ntfs_error(vi->i_sb, "Found encrypted and "
- "compressed attribute.");
- goto unm_err_out;
- }
- NInoSetEncrypted(ni);
- }
- if (a->flags & ATTR_IS_SPARSE)
- NInoSetSparse(ni);
- ir = (INDEX_ROOT*)((u8*)a +
- le16_to_cpu(a->data.resident.value_offset));
- ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length);
- if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
- ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
- "corrupt.");
- goto unm_err_out;
- }
- index_end = (u8*)&ir->index +
- le32_to_cpu(ir->index.index_length);
- if (index_end > ir_end) {
- ntfs_error(vi->i_sb, "Directory index is corrupt.");
- goto unm_err_out;
- }
- if (ir->type != AT_FILE_NAME) {
- ntfs_error(vi->i_sb, "Indexed attribute is not "
- "$FILE_NAME.");
- goto unm_err_out;
- }
- if (ir->collation_rule != COLLATION_FILE_NAME) {
- ntfs_error(vi->i_sb, "Index collation rule is not "
- "COLLATION_FILE_NAME.");
- goto unm_err_out;
- }
- ni->itype.index.collation_rule = ir->collation_rule;
- ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
- if (ni->itype.index.block_size &
- (ni->itype.index.block_size - 1)) {
- ntfs_error(vi->i_sb, "Index block size (%u) is not a "
- "power of two.",
- ni->itype.index.block_size);
- goto unm_err_out;
- }
- if (ni->itype.index.block_size > PAGE_SIZE) {
- ntfs_error(vi->i_sb, "Index block size (%u) > "
- "PAGE_SIZE (%ld) is not "
- "supported. Sorry.",
- ni->itype.index.block_size,
- PAGE_SIZE);
- err = -EOPNOTSUPP;
- goto unm_err_out;
- }
- if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
- ntfs_error(vi->i_sb, "Index block size (%u) < "
- "NTFS_BLOCK_SIZE (%i) is not "
- "supported. Sorry.",
- ni->itype.index.block_size,
- NTFS_BLOCK_SIZE);
- err = -EOPNOTSUPP;
- goto unm_err_out;
- }
- ni->itype.index.block_size_bits =
- ffs(ni->itype.index.block_size) - 1;
- /* Determine the size of a vcn in the directory index. */
- if (vol->cluster_size <= ni->itype.index.block_size) {
- ni->itype.index.vcn_size = vol->cluster_size;
- ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
- } else {
- ni->itype.index.vcn_size = vol->sector_size;
- ni->itype.index.vcn_size_bits = vol->sector_size_bits;
- }
-
- /* Setup the index allocation attribute, even if not present. */
- NInoSetMstProtected(ni);
- ni->type = AT_INDEX_ALLOCATION;
- ni->name = I30;
- ni->name_len = 4;
-
- if (!(ir->index.flags & LARGE_INDEX)) {
- /* No index allocation. */
- vi->i_size = ni->initialized_size =
- ni->allocated_size = 0;
- /* We are done with the mft record, so we release it. */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
- m = NULL;
- ctx = NULL;
- goto skip_large_dir_stuff;
- } /* LARGE_INDEX: Index allocation present. Setup state. */
- NInoSetIndexAllocPresent(ni);
- /* Find index allocation attribute. */
- ntfs_attr_reinit_search_ctx(ctx);
- err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION "
- "attribute is not present but "
- "$INDEX_ROOT indicated it is.");
- else
- ntfs_error(vi->i_sb, "Failed to lookup "
- "$INDEX_ALLOCATION "
- "attribute.");
- goto unm_err_out;
- }
- a = ctx->attr;
- if (!a->non_resident) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
- "is resident.");
- goto unm_err_out;
- }
- /*
- * Ensure the attribute name is placed before the mapping pairs
- * array.
- */
- if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
- le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset)))) {
- ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name "
- "is placed after the mapping pairs "
- "array.");
- goto unm_err_out;
- }
- if (a->flags & ATTR_IS_ENCRYPTED) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
- "is encrypted.");
- goto unm_err_out;
- }
- if (a->flags & ATTR_IS_SPARSE) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
- "is sparse.");
- goto unm_err_out;
- }
- if (a->flags & ATTR_COMPRESSION_MASK) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
- "is compressed.");
- goto unm_err_out;
- }
- if (a->data.non_resident.lowest_vcn) {
- ntfs_error(vi->i_sb, "First extent of "
- "$INDEX_ALLOCATION attribute has non "
- "zero lowest_vcn.");
- goto unm_err_out;
- }
- vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
- ni->initialized_size = sle64_to_cpu(
- a->data.non_resident.initialized_size);
- ni->allocated_size = sle64_to_cpu(
- a->data.non_resident.allocated_size);
- /*
- * We are done with the mft record, so we release it. Otherwise
- * we would deadlock in ntfs_attr_iget().
- */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
- m = NULL;
- ctx = NULL;
- /* Get the index bitmap attribute inode. */
- bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4);
- if (IS_ERR(bvi)) {
- ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
- err = PTR_ERR(bvi);
- goto unm_err_out;
- }
- bni = NTFS_I(bvi);
- if (NInoCompressed(bni) || NInoEncrypted(bni) ||
- NInoSparse(bni)) {
- ntfs_error(vi->i_sb, "$BITMAP attribute is compressed "
- "and/or encrypted and/or sparse.");
- goto iput_unm_err_out;
- }
- /* Consistency check bitmap size vs. index allocation size. */
- bvi_size = i_size_read(bvi);
- if ((bvi_size << 3) < (vi->i_size >>
- ni->itype.index.block_size_bits)) {
- ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) "
- "for index allocation (0x%llx).",
- bvi_size << 3, vi->i_size);
- goto iput_unm_err_out;
- }
- /* No longer need the bitmap attribute inode. */
- iput(bvi);
-skip_large_dir_stuff:
- /* Setup the operations for this inode. */
- vi->i_op = &ntfs_dir_inode_ops;
- vi->i_fop = &ntfs_dir_ops;
- vi->i_mapping->a_ops = &ntfs_mst_aops;
- } else {
- /* It is a file. */
- ntfs_attr_reinit_search_ctx(ctx);
-
- /* Setup the data attribute, even if not present. */
- ni->type = AT_DATA;
- ni->name = NULL;
- ni->name_len = 0;
-
- /* Find first extent of the unnamed data attribute. */
- err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- vi->i_size = ni->initialized_size =
- ni->allocated_size = 0;
- if (err != -ENOENT) {
- ntfs_error(vi->i_sb, "Failed to lookup $DATA "
- "attribute.");
- goto unm_err_out;
- }
- /*
- * FILE_Secure does not have an unnamed $DATA
- * attribute, so we special case it here.
- */
- if (vi->i_ino == FILE_Secure)
- goto no_data_attr_special_case;
- /*
- * Most if not all the system files in the $Extend
- * system directory do not have unnamed data
- * attributes so we need to check if the parent
- * directory of the file is FILE_Extend and if it is
- * ignore this error. To do this we need to get the
- * name of this inode from the mft record as the name
- * contains the back reference to the parent directory.
- */
- if (ntfs_is_extended_system_file(ctx) > 0)
- goto no_data_attr_special_case;
- // FIXME: File is corrupt! Hot-fix with empty data
- // attribute if recovery option is set.
- ntfs_error(vi->i_sb, "$DATA attribute is missing.");
- goto unm_err_out;
- }
- a = ctx->attr;
- /* Setup the state. */
- if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
- if (a->flags & ATTR_COMPRESSION_MASK) {
- NInoSetCompressed(ni);
- if (vol->cluster_size > 4096) {
- ntfs_error(vi->i_sb, "Found "
- "compressed data but "
- "compression is "
- "disabled due to "
- "cluster size (%i) > "
- "4kiB.",
- vol->cluster_size);
- goto unm_err_out;
- }
- if ((a->flags & ATTR_COMPRESSION_MASK)
- != ATTR_IS_COMPRESSED) {
- ntfs_error(vi->i_sb, "Found unknown "
- "compression method "
- "or corrupt file.");
- goto unm_err_out;
- }
- }
- if (a->flags & ATTR_IS_SPARSE)
- NInoSetSparse(ni);
- }
- if (a->flags & ATTR_IS_ENCRYPTED) {
- if (NInoCompressed(ni)) {
- ntfs_error(vi->i_sb, "Found encrypted and "
- "compressed data.");
- goto unm_err_out;
- }
- NInoSetEncrypted(ni);
- }
- if (a->non_resident) {
- NInoSetNonResident(ni);
- if (NInoCompressed(ni) || NInoSparse(ni)) {
- if (NInoCompressed(ni) && a->data.non_resident.
- compression_unit != 4) {
- ntfs_error(vi->i_sb, "Found "
- "non-standard "
- "compression unit (%u "
- "instead of 4). "
- "Cannot handle this.",
- a->data.non_resident.
- compression_unit);
- err = -EOPNOTSUPP;
- goto unm_err_out;
- }
- if (a->data.non_resident.compression_unit) {
- ni->itype.compressed.block_size = 1U <<
- (a->data.non_resident.
- compression_unit +
- vol->cluster_size_bits);
- ni->itype.compressed.block_size_bits =
- ffs(ni->itype.
- compressed.
- block_size) - 1;
- ni->itype.compressed.block_clusters =
- 1U << a->data.
- non_resident.
- compression_unit;
- } else {
- ni->itype.compressed.block_size = 0;
- ni->itype.compressed.block_size_bits =
- 0;
- ni->itype.compressed.block_clusters =
- 0;
- }
- ni->itype.compressed.size = sle64_to_cpu(
- a->data.non_resident.
- compressed_size);
- }
- if (a->data.non_resident.lowest_vcn) {
- ntfs_error(vi->i_sb, "First extent of $DATA "
- "attribute has non zero "
- "lowest_vcn.");
- goto unm_err_out;
- }
- vi->i_size = sle64_to_cpu(
- a->data.non_resident.data_size);
- ni->initialized_size = sle64_to_cpu(
- a->data.non_resident.initialized_size);
- ni->allocated_size = sle64_to_cpu(
- a->data.non_resident.allocated_size);
- } else { /* Resident attribute. */
- vi->i_size = ni->initialized_size = le32_to_cpu(
- a->data.resident.value_length);
- ni->allocated_size = le32_to_cpu(a->length) -
- le16_to_cpu(
- a->data.resident.value_offset);
- if (vi->i_size > ni->allocated_size) {
- ntfs_error(vi->i_sb, "Resident data attribute "
- "is corrupt (size exceeds "
- "allocation).");
- goto unm_err_out;
- }
- }
-no_data_attr_special_case:
- /* We are done with the mft record, so we release it. */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
- m = NULL;
- ctx = NULL;
- /* Setup the operations for this inode. */
- vi->i_op = &ntfs_file_inode_ops;
- vi->i_fop = &ntfs_file_ops;
- vi->i_mapping->a_ops = &ntfs_normal_aops;
- if (NInoMstProtected(ni))
- vi->i_mapping->a_ops = &ntfs_mst_aops;
- else if (NInoCompressed(ni))
- vi->i_mapping->a_ops = &ntfs_compressed_aops;
- }
- /*
- * The number of 512-byte blocks used on disk (for stat). This is in so
- * far inaccurate as it doesn't account for any named streams or other
- * special non-resident attributes, but that is how Windows works, too,
- * so we are at least consistent with Windows, if not entirely
- * consistent with the Linux Way. Doing it the Linux Way would cause a
- * significant slowdown as it would involve iterating over all
- * attributes in the mft record and adding the allocated/compressed
- * sizes of all non-resident attributes present to give us the Linux
- * correct size that should go into i_blocks (after division by 512).
- */
- if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni)))
- vi->i_blocks = ni->itype.compressed.size >> 9;
- else
- vi->i_blocks = ni->allocated_size >> 9;
- ntfs_debug("Done.");
- return 0;
-iput_unm_err_out:
- iput(bvi);
-unm_err_out:
- if (!err)
- err = -EIO;
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(ni);
-err_out:
- ntfs_error(vol->sb, "Failed with error code %i. Marking corrupt "
- "inode 0x%lx as bad. Run chkdsk.", err, vi->i_ino);
- make_bad_inode(vi);
- if (err != -EOPNOTSUPP && err != -ENOMEM)
- NVolSetErrors(vol);
- return err;
-}
-
-/**
- * ntfs_read_locked_attr_inode - read an attribute inode from its base inode
- * @base_vi: base inode
- * @vi: attribute inode to read
- *
- * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the
- * attribute inode described by @vi into memory from the base mft record
- * described by @base_ni.
- *
- * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for
- * reading and looks up the attribute described by @vi before setting up the
- * necessary fields in @vi as well as initializing the ntfs inode.
- *
- * Q: What locks are held when the function is called?
- * A: i_state has I_NEW set, hence the inode is locked, also
- * i_count is set to 1, so it is not going to go away
- *
- * Return 0 on success and -errno on error. In the error case, the inode will
- * have had make_bad_inode() executed on it.
- *
- * Note this cannot be called for AT_INDEX_ALLOCATION.
- */
-static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
-{
- ntfs_volume *vol = NTFS_SB(vi->i_sb);
- ntfs_inode *ni, *base_ni;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- ntfs_attr_search_ctx *ctx;
- int err = 0;
-
- ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
-
- ntfs_init_big_inode(vi);
-
- ni = NTFS_I(vi);
- base_ni = NTFS_I(base_vi);
-
- /* Just mirror the values from the base inode. */
- vi->i_uid = base_vi->i_uid;
- vi->i_gid = base_vi->i_gid;
- set_nlink(vi, base_vi->i_nlink);
- inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
- inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
- inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
- vi->i_generation = ni->seq_no = base_ni->seq_no;
-
- /* Set inode type to zero but preserve permissions. */
- vi->i_mode = base_vi->i_mode & ~S_IFMT;
-
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (!ctx) {
- err = -ENOMEM;
- goto unm_err_out;
- }
- /* Find the attribute. */
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err))
- goto unm_err_out;
- a = ctx->attr;
- if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
- if (a->flags & ATTR_COMPRESSION_MASK) {
- NInoSetCompressed(ni);
- if ((ni->type != AT_DATA) || (ni->type == AT_DATA &&
- ni->name_len)) {
- ntfs_error(vi->i_sb, "Found compressed "
- "non-data or named data "
- "attribute. Please report "
- "you saw this message to "
- "linux-ntfs-dev@lists."
- "sourceforge.net");
- goto unm_err_out;
- }
- if (vol->cluster_size > 4096) {
- ntfs_error(vi->i_sb, "Found compressed "
- "attribute but compression is "
- "disabled due to cluster size "
- "(%i) > 4kiB.",
- vol->cluster_size);
- goto unm_err_out;
- }
- if ((a->flags & ATTR_COMPRESSION_MASK) !=
- ATTR_IS_COMPRESSED) {
- ntfs_error(vi->i_sb, "Found unknown "
- "compression method.");
- goto unm_err_out;
- }
- }
- /*
- * The compressed/sparse flag set in an index root just means
- * to compress all files.
- */
- if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
- ntfs_error(vi->i_sb, "Found mst protected attribute "
- "but the attribute is %s. Please "
- "report you saw this message to "
- "linux-ntfs-dev@lists.sourceforge.net",
- NInoCompressed(ni) ? "compressed" :
- "sparse");
- goto unm_err_out;
- }
- if (a->flags & ATTR_IS_SPARSE)
- NInoSetSparse(ni);
- }
- if (a->flags & ATTR_IS_ENCRYPTED) {
- if (NInoCompressed(ni)) {
- ntfs_error(vi->i_sb, "Found encrypted and compressed "
- "data.");
- goto unm_err_out;
- }
- /*
- * The encryption flag set in an index root just means to
- * encrypt all files.
- */
- if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
- ntfs_error(vi->i_sb, "Found mst protected attribute "
- "but the attribute is encrypted. "
- "Please report you saw this message "
- "to linux-ntfs-dev@lists.sourceforge."
- "net");
- goto unm_err_out;
- }
- if (ni->type != AT_DATA) {
- ntfs_error(vi->i_sb, "Found encrypted non-data "
- "attribute.");
- goto unm_err_out;
- }
- NInoSetEncrypted(ni);
- }
- if (!a->non_resident) {
- /* Ensure the attribute name is placed before the value. */
- if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
- le16_to_cpu(a->data.resident.value_offset)))) {
- ntfs_error(vol->sb, "Attribute name is placed after "
- "the attribute value.");
- goto unm_err_out;
- }
- if (NInoMstProtected(ni)) {
- ntfs_error(vi->i_sb, "Found mst protected attribute "
- "but the attribute is resident. "
- "Please report you saw this message to "
- "linux-ntfs-dev@lists.sourceforge.net");
- goto unm_err_out;
- }
- vi->i_size = ni->initialized_size = le32_to_cpu(
- a->data.resident.value_length);
- ni->allocated_size = le32_to_cpu(a->length) -
- le16_to_cpu(a->data.resident.value_offset);
- if (vi->i_size > ni->allocated_size) {
- ntfs_error(vi->i_sb, "Resident attribute is corrupt "
- "(size exceeds allocation).");
- goto unm_err_out;
- }
- } else {
- NInoSetNonResident(ni);
- /*
- * Ensure the attribute name is placed before the mapping pairs
- * array.
- */
- if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
- le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset)))) {
- ntfs_error(vol->sb, "Attribute name is placed after "
- "the mapping pairs array.");
- goto unm_err_out;
- }
- if (NInoCompressed(ni) || NInoSparse(ni)) {
- if (NInoCompressed(ni) && a->data.non_resident.
- compression_unit != 4) {
- ntfs_error(vi->i_sb, "Found non-standard "
- "compression unit (%u instead "
- "of 4). Cannot handle this.",
- a->data.non_resident.
- compression_unit);
- err = -EOPNOTSUPP;
- goto unm_err_out;
- }
- if (a->data.non_resident.compression_unit) {
- ni->itype.compressed.block_size = 1U <<
- (a->data.non_resident.
- compression_unit +
- vol->cluster_size_bits);
- ni->itype.compressed.block_size_bits =
- ffs(ni->itype.compressed.
- block_size) - 1;
- ni->itype.compressed.block_clusters = 1U <<
- a->data.non_resident.
- compression_unit;
- } else {
- ni->itype.compressed.block_size = 0;
- ni->itype.compressed.block_size_bits = 0;
- ni->itype.compressed.block_clusters = 0;
- }
- ni->itype.compressed.size = sle64_to_cpu(
- a->data.non_resident.compressed_size);
- }
- if (a->data.non_resident.lowest_vcn) {
- ntfs_error(vi->i_sb, "First extent of attribute has "
- "non-zero lowest_vcn.");
- goto unm_err_out;
- }
- vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
- ni->initialized_size = sle64_to_cpu(
- a->data.non_resident.initialized_size);
- ni->allocated_size = sle64_to_cpu(
- a->data.non_resident.allocated_size);
- }
- vi->i_mapping->a_ops = &ntfs_normal_aops;
- if (NInoMstProtected(ni))
- vi->i_mapping->a_ops = &ntfs_mst_aops;
- else if (NInoCompressed(ni))
- vi->i_mapping->a_ops = &ntfs_compressed_aops;
- if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT)
- vi->i_blocks = ni->itype.compressed.size >> 9;
- else
- vi->i_blocks = ni->allocated_size >> 9;
- /*
- * Make sure the base inode does not go away and attach it to the
- * attribute inode.
- */
- igrab(base_vi);
- ni->ext.base_ntfs_ino = base_ni;
- ni->nr_extents = -1;
-
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
-
- ntfs_debug("Done.");
- return 0;
-
-unm_err_out:
- if (!err)
- err = -EIO;
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
-err_out:
- ntfs_error(vol->sb, "Failed with error code %i while reading attribute "
- "inode (mft_no 0x%lx, type 0x%x, name_len %i). "
- "Marking corrupt inode and base inode 0x%lx as bad. "
- "Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len,
- base_vi->i_ino);
- make_bad_inode(vi);
- if (err != -ENOMEM)
- NVolSetErrors(vol);
- return err;
-}
-
-/**
- * ntfs_read_locked_index_inode - read an index inode from its base inode
- * @base_vi: base inode
- * @vi: index inode to read
- *
- * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the
- * index inode described by @vi into memory from the base mft record described
- * by @base_ni.
- *
- * ntfs_read_locked_index_inode() maps, pins and locks the base inode for
- * reading and looks up the attributes relating to the index described by @vi
- * before setting up the necessary fields in @vi as well as initializing the
- * ntfs inode.
- *
- * Note, index inodes are essentially attribute inodes (NInoAttr() is true)
- * with the attribute type set to AT_INDEX_ALLOCATION. Apart from that, they
- * are setup like directory inodes since directories are a special case of
- * indices ao they need to be treated in much the same way. Most importantly,
- * for small indices the index allocation attribute might not actually exist.
- * However, the index root attribute always exists but this does not need to
- * have an inode associated with it and this is why we define a new inode type
- * index. Also, like for directories, we need to have an attribute inode for
- * the bitmap attribute corresponding to the index allocation attribute and we
- * can store this in the appropriate field of the inode, just like we do for
- * normal directory inodes.
- *
- * Q: What locks are held when the function is called?
- * A: i_state has I_NEW set, hence the inode is locked, also
- * i_count is set to 1, so it is not going to go away
- *
- * Return 0 on success and -errno on error. In the error case, the inode will
- * have had make_bad_inode() executed on it.
- */
-static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
-{
- loff_t bvi_size;
- ntfs_volume *vol = NTFS_SB(vi->i_sb);
- ntfs_inode *ni, *base_ni, *bni;
- struct inode *bvi;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- ntfs_attr_search_ctx *ctx;
- INDEX_ROOT *ir;
- u8 *ir_end, *index_end;
- int err = 0;
-
- ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
- ntfs_init_big_inode(vi);
- ni = NTFS_I(vi);
- base_ni = NTFS_I(base_vi);
- /* Just mirror the values from the base inode. */
- vi->i_uid = base_vi->i_uid;
- vi->i_gid = base_vi->i_gid;
- set_nlink(vi, base_vi->i_nlink);
- inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
- inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
- inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
- vi->i_generation = ni->seq_no = base_ni->seq_no;
- /* Set inode type to zero but preserve permissions. */
- vi->i_mode = base_vi->i_mode & ~S_IFMT;
- /* Map the mft record for the base inode. */
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (!ctx) {
- err = -ENOMEM;
- goto unm_err_out;
- }
- /* Find the index root attribute. */
- err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
- "missing.");
- goto unm_err_out;
- }
- a = ctx->attr;
- /* Set up the state. */
- if (unlikely(a->non_resident)) {
- ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident.");
- goto unm_err_out;
- }
- /* Ensure the attribute name is placed before the value. */
- if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
- le16_to_cpu(a->data.resident.value_offset)))) {
- ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed "
- "after the attribute value.");
- goto unm_err_out;
- }
- /*
- * Compressed/encrypted/sparse index root is not allowed, except for
- * directories of course but those are not dealt with here.
- */
- if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED |
- ATTR_IS_SPARSE)) {
- ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index "
- "root attribute.");
- goto unm_err_out;
- }
- ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset));
- ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length);
- if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
- ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt.");
- goto unm_err_out;
- }
- index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
- if (index_end > ir_end) {
- ntfs_error(vi->i_sb, "Index is corrupt.");
- goto unm_err_out;
- }
- if (ir->type) {
- ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).",
- le32_to_cpu(ir->type));
- goto unm_err_out;
- }
- ni->itype.index.collation_rule = ir->collation_rule;
- ntfs_debug("Index collation rule is 0x%x.",
- le32_to_cpu(ir->collation_rule));
- ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
- if (!is_power_of_2(ni->itype.index.block_size)) {
- ntfs_error(vi->i_sb, "Index block size (%u) is not a power of "
- "two.", ni->itype.index.block_size);
- goto unm_err_out;
- }
- if (ni->itype.index.block_size > PAGE_SIZE) {
- ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_SIZE "
- "(%ld) is not supported. Sorry.",
- ni->itype.index.block_size, PAGE_SIZE);
- err = -EOPNOTSUPP;
- goto unm_err_out;
- }
- if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
- ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE "
- "(%i) is not supported. Sorry.",
- ni->itype.index.block_size, NTFS_BLOCK_SIZE);
- err = -EOPNOTSUPP;
- goto unm_err_out;
- }
- ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1;
- /* Determine the size of a vcn in the index. */
- if (vol->cluster_size <= ni->itype.index.block_size) {
- ni->itype.index.vcn_size = vol->cluster_size;
- ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
- } else {
- ni->itype.index.vcn_size = vol->sector_size;
- ni->itype.index.vcn_size_bits = vol->sector_size_bits;
- }
- /* Check for presence of index allocation attribute. */
- if (!(ir->index.flags & LARGE_INDEX)) {
- /* No index allocation. */
- vi->i_size = ni->initialized_size = ni->allocated_size = 0;
- /* We are done with the mft record, so we release it. */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- m = NULL;
- ctx = NULL;
- goto skip_large_index_stuff;
- } /* LARGE_INDEX: Index allocation present. Setup state. */
- NInoSetIndexAllocPresent(ni);
- /* Find index allocation attribute. */
- ntfs_attr_reinit_search_ctx(ctx);
- err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
- "not present but $INDEX_ROOT "
- "indicated it is.");
- else
- ntfs_error(vi->i_sb, "Failed to lookup "
- "$INDEX_ALLOCATION attribute.");
- goto unm_err_out;
- }
- a = ctx->attr;
- if (!a->non_resident) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
- "resident.");
- goto unm_err_out;
- }
- /*
- * Ensure the attribute name is placed before the mapping pairs array.
- */
- if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
- le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset)))) {
- ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is "
- "placed after the mapping pairs array.");
- goto unm_err_out;
- }
- if (a->flags & ATTR_IS_ENCRYPTED) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
- "encrypted.");
- goto unm_err_out;
- }
- if (a->flags & ATTR_IS_SPARSE) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse.");
- goto unm_err_out;
- }
- if (a->flags & ATTR_COMPRESSION_MASK) {
- ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
- "compressed.");
- goto unm_err_out;
- }
- if (a->data.non_resident.lowest_vcn) {
- ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION "
- "attribute has non zero lowest_vcn.");
- goto unm_err_out;
- }
- vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
- ni->initialized_size = sle64_to_cpu(
- a->data.non_resident.initialized_size);
- ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size);
- /*
- * We are done with the mft record, so we release it. Otherwise
- * we would deadlock in ntfs_attr_iget().
- */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- m = NULL;
- ctx = NULL;
- /* Get the index bitmap attribute inode. */
- bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len);
- if (IS_ERR(bvi)) {
- ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
- err = PTR_ERR(bvi);
- goto unm_err_out;
- }
- bni = NTFS_I(bvi);
- if (NInoCompressed(bni) || NInoEncrypted(bni) ||
- NInoSparse(bni)) {
- ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or "
- "encrypted and/or sparse.");
- goto iput_unm_err_out;
- }
- /* Consistency check bitmap size vs. index allocation size. */
- bvi_size = i_size_read(bvi);
- if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) {
- ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for "
- "index allocation (0x%llx).", bvi_size << 3,
- vi->i_size);
- goto iput_unm_err_out;
- }
- iput(bvi);
-skip_large_index_stuff:
- /* Setup the operations for this index inode. */
- vi->i_mapping->a_ops = &ntfs_mst_aops;
- vi->i_blocks = ni->allocated_size >> 9;
- /*
- * Make sure the base inode doesn't go away and attach it to the
- * index inode.
- */
- igrab(base_vi);
- ni->ext.base_ntfs_ino = base_ni;
- ni->nr_extents = -1;
-
- ntfs_debug("Done.");
- return 0;
-iput_unm_err_out:
- iput(bvi);
-unm_err_out:
- if (!err)
- err = -EIO;
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
-err_out:
- ntfs_error(vi->i_sb, "Failed with error code %i while reading index "
- "inode (mft_no 0x%lx, name_len %i.", err, vi->i_ino,
- ni->name_len);
- make_bad_inode(vi);
- if (err != -EOPNOTSUPP && err != -ENOMEM)
- NVolSetErrors(vol);
- return err;
-}
-
-/*
- * The MFT inode has special locking, so teach the lock validator
- * about this by splitting off the locking rules of the MFT from
- * the locking rules of other inodes. The MFT inode can never be
- * accessed from the VFS side (or even internally), only by the
- * map_mft functions.
- */
-static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key;
-
-/**
- * ntfs_read_inode_mount - special read_inode for mount time use only
- * @vi: inode to read
- *
- * Read inode FILE_MFT at mount time, only called with super_block lock
- * held from within the read_super() code path.
- *
- * This function exists because when it is called the page cache for $MFT/$DATA
- * is not initialized and hence we cannot get at the contents of mft records
- * by calling map_mft_record*().
- *
- * Further it needs to cope with the circular references problem, i.e. cannot
- * load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, because
- * we do not know where the other extent mft records are yet and again, because
- * we cannot call map_mft_record*() yet. Obviously this applies only when an
- * attribute list is actually present in $MFT inode.
- *
- * We solve these problems by starting with the $DATA attribute before anything
- * else and iterating using ntfs_attr_lookup($DATA) over all extents. As each
- * extent is found, we ntfs_mapping_pairs_decompress() including the implied
- * ntfs_runlists_merge(). Each step of the iteration necessarily provides
- * sufficient information for the next step to complete.
- *
- * This should work but there are two possible pit falls (see inline comments
- * below), but only time will tell if they are real pits or just smoke...
- */
-int ntfs_read_inode_mount(struct inode *vi)
-{
- VCN next_vcn, last_vcn, highest_vcn;
- s64 block;
- struct super_block *sb = vi->i_sb;
- ntfs_volume *vol = NTFS_SB(sb);
- struct buffer_head *bh;
- ntfs_inode *ni;
- MFT_RECORD *m = NULL;
- ATTR_RECORD *a;
- ntfs_attr_search_ctx *ctx;
- unsigned int i, nr_blocks;
- int err;
-
- ntfs_debug("Entering.");
-
- /* Initialize the ntfs specific part of @vi. */
- ntfs_init_big_inode(vi);
-
- ni = NTFS_I(vi);
-
- /* Setup the data attribute. It is special as it is mst protected. */
- NInoSetNonResident(ni);
- NInoSetMstProtected(ni);
- NInoSetSparseDisabled(ni);
- ni->type = AT_DATA;
- ni->name = NULL;
- ni->name_len = 0;
- /*
- * This sets up our little cheat allowing us to reuse the async read io
- * completion handler for directories.
- */
- ni->itype.index.block_size = vol->mft_record_size;
- ni->itype.index.block_size_bits = vol->mft_record_size_bits;
-
- /* Very important! Needed to be able to call map_mft_record*(). */
- vol->mft_ino = vi;
-
- /* Allocate enough memory to read the first mft record. */
- if (vol->mft_record_size > 64 * 1024) {
- ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).",
- vol->mft_record_size);
- goto err_out;
- }
- i = vol->mft_record_size;
- if (i < sb->s_blocksize)
- i = sb->s_blocksize;
- m = (MFT_RECORD*)ntfs_malloc_nofs(i);
- if (!m) {
- ntfs_error(sb, "Failed to allocate buffer for $MFT record 0.");
- goto err_out;
- }
-
- /* Determine the first block of the $MFT/$DATA attribute. */
- block = vol->mft_lcn << vol->cluster_size_bits >>
- sb->s_blocksize_bits;
- nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits;
- if (!nr_blocks)
- nr_blocks = 1;
-
- /* Load $MFT/$DATA's first mft record. */
- for (i = 0; i < nr_blocks; i++) {
- bh = sb_bread(sb, block++);
- if (!bh) {
- ntfs_error(sb, "Device read failed.");
- goto err_out;
- }
- memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data,
- sb->s_blocksize);
- brelse(bh);
- }
-
- if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) {
- ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.",
- le32_to_cpu(m->bytes_allocated), vol->mft_record_size);
- goto err_out;
- }
-
- /* Apply the mst fixups. */
- if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) {
- /* FIXME: Try to use the $MFTMirr now. */
- ntfs_error(sb, "MST fixup failed. $MFT is corrupt.");
- goto err_out;
- }
-
- /* Sanity check offset to the first attribute */
- if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) {
- ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.",
- le16_to_cpu(m->attrs_offset));
- goto err_out;
- }
-
- /* Need this to sanity check attribute list references to $MFT. */
- vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
-
- /* Provides read_folio() for map_mft_record(). */
- vi->i_mapping->a_ops = &ntfs_mst_aops;
-
- ctx = ntfs_attr_get_search_ctx(ni, m);
- if (!ctx) {
- err = -ENOMEM;
- goto err_out;
- }
-
- /* Find the attribute list attribute if present. */
- err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
- if (err) {
- if (unlikely(err != -ENOENT)) {
- ntfs_error(sb, "Failed to lookup attribute list "
- "attribute. You should run chkdsk.");
- goto put_err_out;
- }
- } else /* if (!err) */ {
- ATTR_LIST_ENTRY *al_entry, *next_al_entry;
- u8 *al_end;
- static const char *es = " Not allowed. $MFT is corrupt. "
- "You should run chkdsk.";
-
- ntfs_debug("Attribute list attribute found in $MFT.");
- NInoSetAttrList(ni);
- a = ctx->attr;
- if (a->flags & ATTR_COMPRESSION_MASK) {
- ntfs_error(sb, "Attribute list attribute is "
- "compressed.%s", es);
- goto put_err_out;
- }
- if (a->flags & ATTR_IS_ENCRYPTED ||
- a->flags & ATTR_IS_SPARSE) {
- if (a->non_resident) {
- ntfs_error(sb, "Non-resident attribute list "
- "attribute is encrypted/"
- "sparse.%s", es);
- goto put_err_out;
- }
- ntfs_warning(sb, "Resident attribute list attribute "
- "in $MFT system file is marked "
- "encrypted/sparse which is not true. "
- "However, Windows allows this and "
- "chkdsk does not detect or correct it "
- "so we will just ignore the invalid "
- "flags and pretend they are not set.");
- }
- /* Now allocate memory for the attribute list. */
- ni->attr_list_size = (u32)ntfs_attr_size(a);
- if (!ni->attr_list_size) {
- ntfs_error(sb, "Attr_list_size is zero");
- goto put_err_out;
- }
- ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
- if (!ni->attr_list) {
- ntfs_error(sb, "Not enough memory to allocate buffer "
- "for attribute list.");
- goto put_err_out;
- }
- if (a->non_resident) {
- NInoSetAttrListNonResident(ni);
- if (a->data.non_resident.lowest_vcn) {
- ntfs_error(sb, "Attribute list has non zero "
- "lowest_vcn. $MFT is corrupt. "
- "You should run chkdsk.");
- goto put_err_out;
- }
- /* Setup the runlist. */
- ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
- a, NULL);
- if (IS_ERR(ni->attr_list_rl.rl)) {
- err = PTR_ERR(ni->attr_list_rl.rl);
- ni->attr_list_rl.rl = NULL;
- ntfs_error(sb, "Mapping pairs decompression "
- "failed with error code %i.",
- -err);
- goto put_err_out;
- }
- /* Now load the attribute list. */
- if ((err = load_attribute_list(vol, &ni->attr_list_rl,
- ni->attr_list, ni->attr_list_size,
- sle64_to_cpu(a->data.
- non_resident.initialized_size)))) {
- ntfs_error(sb, "Failed to load attribute list "
- "attribute with error code %i.",
- -err);
- goto put_err_out;
- }
- } else /* if (!ctx.attr->non_resident) */ {
- if ((u8*)a + le16_to_cpu(
- a->data.resident.value_offset) +
- le32_to_cpu(
- a->data.resident.value_length) >
- (u8*)ctx->mrec + vol->mft_record_size) {
- ntfs_error(sb, "Corrupt attribute list "
- "attribute.");
- goto put_err_out;
- }
- /* Now copy the attribute list. */
- memcpy(ni->attr_list, (u8*)a + le16_to_cpu(
- a->data.resident.value_offset),
- le32_to_cpu(
- a->data.resident.value_length));
- }
- /* The attribute list is now setup in memory. */
- /*
- * FIXME: I don't know if this case is actually possible.
- * According to logic it is not possible but I have seen too
- * many weird things in MS software to rely on logic... Thus we
- * perform a manual search and make sure the first $MFT/$DATA
- * extent is in the base inode. If it is not we abort with an
- * error and if we ever see a report of this error we will need
- * to do some magic in order to have the necessary mft record
- * loaded and in the right place in the page cache. But
- * hopefully logic will prevail and this never happens...
- */
- al_entry = (ATTR_LIST_ENTRY*)ni->attr_list;
- al_end = (u8*)al_entry + ni->attr_list_size;
- for (;; al_entry = next_al_entry) {
- /* Out of bounds check. */
- if ((u8*)al_entry < ni->attr_list ||
- (u8*)al_entry > al_end)
- goto em_put_err_out;
- /* Catch the end of the attribute list. */
- if ((u8*)al_entry == al_end)
- goto em_put_err_out;
- if (!al_entry->length)
- goto em_put_err_out;
- if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
- le16_to_cpu(al_entry->length) > al_end)
- goto em_put_err_out;
- next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
- le16_to_cpu(al_entry->length));
- if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
- goto em_put_err_out;
- if (AT_DATA != al_entry->type)
- continue;
- /* We want an unnamed attribute. */
- if (al_entry->name_length)
- goto em_put_err_out;
- /* Want the first entry, i.e. lowest_vcn == 0. */
- if (al_entry->lowest_vcn)
- goto em_put_err_out;
- /* First entry has to be in the base mft record. */
- if (MREF_LE(al_entry->mft_reference) != vi->i_ino) {
- /* MFT references do not match, logic fails. */
- ntfs_error(sb, "BUG: The first $DATA extent "
- "of $MFT is not in the base "
- "mft record. Please report "
- "you saw this message to "
- "linux-ntfs-dev@lists."
- "sourceforge.net");
- goto put_err_out;
- } else {
- /* Sequence numbers must match. */
- if (MSEQNO_LE(al_entry->mft_reference) !=
- ni->seq_no)
- goto em_put_err_out;
- /* Got it. All is ok. We can stop now. */
- break;
- }
- }
- }
-
- ntfs_attr_reinit_search_ctx(ctx);
-
- /* Now load all attribute extents. */
- a = NULL;
- next_vcn = last_vcn = highest_vcn = 0;
- while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0,
- ctx))) {
- runlist_element *nrl;
-
- /* Cache the current attribute. */
- a = ctx->attr;
- /* $MFT must be non-resident. */
- if (!a->non_resident) {
- ntfs_error(sb, "$MFT must be non-resident but a "
- "resident extent was found. $MFT is "
- "corrupt. Run chkdsk.");
- goto put_err_out;
- }
- /* $MFT must be uncompressed and unencrypted. */
- if (a->flags & ATTR_COMPRESSION_MASK ||
- a->flags & ATTR_IS_ENCRYPTED ||
- a->flags & ATTR_IS_SPARSE) {
- ntfs_error(sb, "$MFT must be uncompressed, "
- "non-sparse, and unencrypted but a "
- "compressed/sparse/encrypted extent "
- "was found. $MFT is corrupt. Run "
- "chkdsk.");
- goto put_err_out;
- }
- /*
- * Decompress the mapping pairs array of this extent and merge
- * the result into the existing runlist. No need for locking
- * as we have exclusive access to the inode at this time and we
- * are a mount in progress task, too.
- */
- nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
- if (IS_ERR(nrl)) {
- ntfs_error(sb, "ntfs_mapping_pairs_decompress() "
- "failed with error code %ld. $MFT is "
- "corrupt.", PTR_ERR(nrl));
- goto put_err_out;
- }
- ni->runlist.rl = nrl;
-
- /* Are we in the first extent? */
- if (!next_vcn) {
- if (a->data.non_resident.lowest_vcn) {
- ntfs_error(sb, "First extent of $DATA "
- "attribute has non zero "
- "lowest_vcn. $MFT is corrupt. "
- "You should run chkdsk.");
- goto put_err_out;
- }
- /* Get the last vcn in the $DATA attribute. */
- last_vcn = sle64_to_cpu(
- a->data.non_resident.allocated_size)
- >> vol->cluster_size_bits;
- /* Fill in the inode size. */
- vi->i_size = sle64_to_cpu(
- a->data.non_resident.data_size);
- ni->initialized_size = sle64_to_cpu(
- a->data.non_resident.initialized_size);
- ni->allocated_size = sle64_to_cpu(
- a->data.non_resident.allocated_size);
- /*
- * Verify the number of mft records does not exceed
- * 2^32 - 1.
- */
- if ((vi->i_size >> vol->mft_record_size_bits) >=
- (1ULL << 32)) {
- ntfs_error(sb, "$MFT is too big! Aborting.");
- goto put_err_out;
- }
- /*
- * We have got the first extent of the runlist for
- * $MFT which means it is now relatively safe to call
- * the normal ntfs_read_inode() function.
- * Complete reading the inode, this will actually
- * re-read the mft record for $MFT, this time entering
- * it into the page cache with which we complete the
- * kick start of the volume. It should be safe to do
- * this now as the first extent of $MFT/$DATA is
- * already known and we would hope that we don't need
- * further extents in order to find the other
- * attributes belonging to $MFT. Only time will tell if
- * this is really the case. If not we will have to play
- * magic at this point, possibly duplicating a lot of
- * ntfs_read_inode() at this point. We will need to
- * ensure we do enough of its work to be able to call
- * ntfs_read_inode() on extents of $MFT/$DATA. But lets
- * hope this never happens...
- */
- ntfs_read_locked_inode(vi);
- if (is_bad_inode(vi)) {
- ntfs_error(sb, "ntfs_read_inode() of $MFT "
- "failed. BUG or corrupt $MFT. "
- "Run chkdsk and if no errors "
- "are found, please report you "
- "saw this message to "
- "linux-ntfs-dev@lists."
- "sourceforge.net");
- ntfs_attr_put_search_ctx(ctx);
- /* Revert to the safe super operations. */
- ntfs_free(m);
- return -1;
- }
- /*
- * Re-initialize some specifics about $MFT's inode as
- * ntfs_read_inode() will have set up the default ones.
- */
- /* Set uid and gid to root. */
- vi->i_uid = GLOBAL_ROOT_UID;
- vi->i_gid = GLOBAL_ROOT_GID;
- /* Regular file. No access for anyone. */
- vi->i_mode = S_IFREG;
- /* No VFS initiated operations allowed for $MFT. */
- vi->i_op = &ntfs_empty_inode_ops;
- vi->i_fop = &ntfs_empty_file_ops;
- }
-
- /* Get the lowest vcn for the next extent. */
- highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
- next_vcn = highest_vcn + 1;
-
- /* Only one extent or error, which we catch below. */
- if (next_vcn <= 0)
- break;
-
- /* Avoid endless loops due to corruption. */
- if (next_vcn < sle64_to_cpu(
- a->data.non_resident.lowest_vcn)) {
- ntfs_error(sb, "$MFT has corrupt attribute list "
- "attribute. Run chkdsk.");
- goto put_err_out;
- }
- }
- if (err != -ENOENT) {
- ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. "
- "$MFT is corrupt. Run chkdsk.");
- goto put_err_out;
- }
- if (!a) {
- ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is "
- "corrupt. Run chkdsk.");
- goto put_err_out;
- }
- if (highest_vcn && highest_vcn != last_vcn - 1) {
- ntfs_error(sb, "Failed to load the complete runlist for "
- "$MFT/$DATA. Driver bug or corrupt $MFT. "
- "Run chkdsk.");
- ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx",
- (unsigned long long)highest_vcn,
- (unsigned long long)last_vcn - 1);
- goto put_err_out;
- }
- ntfs_attr_put_search_ctx(ctx);
- ntfs_debug("Done.");
- ntfs_free(m);
-
- /*
- * Split the locking rules of the MFT inode from the
- * locking rules of other inodes:
- */
- lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key);
- lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key);
-
- return 0;
-
-em_put_err_out:
- ntfs_error(sb, "Couldn't find first extent of $DATA attribute in "
- "attribute list. $MFT is corrupt. Run chkdsk.");
-put_err_out:
- ntfs_attr_put_search_ctx(ctx);
-err_out:
- ntfs_error(sb, "Failed. Marking inode as bad.");
- make_bad_inode(vi);
- ntfs_free(m);
- return -1;
-}
-
-static void __ntfs_clear_inode(ntfs_inode *ni)
-{
- /*
- * Free all allocated memory hanging off @ni: the runlist, the in-memory
- * attribute list, the attribute list runlist and the attribute name.
- * Each of the two runlists is freed and reset to NULL while holding its
- * own lock for writing.
- */
- down_write(&ni->runlist.lock);
- if (ni->runlist.rl) {
- ntfs_free(ni->runlist.rl);
- ni->runlist.rl = NULL;
- }
- up_write(&ni->runlist.lock);
-
- if (ni->attr_list) {
- ntfs_free(ni->attr_list);
- ni->attr_list = NULL;
- }
-
- down_write(&ni->attr_list_rl.lock);
- if (ni->attr_list_rl.rl) {
- ntfs_free(ni->attr_list_rl.rl);
- ni->attr_list_rl.rl = NULL;
- }
- up_write(&ni->attr_list_rl.lock);
-
- if (ni->name_len && ni->name != I30) {
- /*
- * Catch bugs... A non-zero name_len with a NULL name must never
- * happen. The name is only freed when it does not point at I30.
- * Note ni->name is not reset to NULL after the kfree().
- */
- BUG_ON(!ni->name);
- kfree(ni->name);
- }
-}
-
-void ntfs_clear_extent_inode(ntfs_inode *ni)
-{
- ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
-
- BUG_ON(NInoAttr(ni)); /* NInoAttr() must be clear for @ni. */
- BUG_ON(ni->nr_extents != -1); /* Only nr_extents == -1 is accepted here. */
-
-#ifdef NTFS_RW
- if (NInoDirty(ni)) {
- if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino)))
- ntfs_error(ni->vol->sb, "Clearing dirty extent inode! "
- "Losing data! This is a BUG!!!");
- // FIXME: Do something!!! A dirty extent inode is only reported here (and only if its base inode is not bad); nothing is written back.
- }
-#endif /* NTFS_RW */
-
- __ntfs_clear_inode(ni);
-
- /* Bye, bye... The memory attached to @ni was freed above; now hand @ni itself to ntfs_destroy_extent_inode(). */
- ntfs_destroy_extent_inode(ni);
-}
-
-/**
- * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
- * @vi: vfs inode pending annihilation
- *
- * When the VFS is going to remove an inode from memory, ntfs_evict_big_inode()
- * is called, which deallocates all memory belonging to the NTFS specific part
- * of the inode and returns.
- *
- * The page cache of @vi is truncated and the VFS inode is cleared first. Then,
- * if NTFS_RW is defined and the ntfs inode is dirty, it is committed before any
- * of the NTFS specific memory is released. A failed commit is only reported.
- */
-void ntfs_evict_big_inode(struct inode *vi)
-{
- ntfs_inode *ni = NTFS_I(vi);
-
- truncate_inode_pages_final(&vi->i_data);
- clear_inode(vi);
-
-#ifdef NTFS_RW
- if (NInoDirty(ni)) {
- bool was_bad = (is_bad_inode(vi));
-
- /* Committing the inode also commits all extent inodes. */
- ntfs_commit_inode(vi);
-
- if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) {
- ntfs_error(vi->i_sb, "Failed to commit dirty inode "
- "0x%lx. Losing data!", vi->i_ino);
- // FIXME: Do something!!!
- }
- }
-#endif /* NTFS_RW */
-
- /*
- * No need to lock at this stage as no one else has a reference.
- * A positive nr_extents means @ni owns an array of extent inodes: clear
- * each one and then free the array itself.
- */
- if (ni->nr_extents > 0) {
- int i;
-
- for (i = 0; i < ni->nr_extents; i++)
- ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]);
- kfree(ni->ext.extent_ntfs_inos);
- }
-
- __ntfs_clear_inode(ni);
-
- if (NInoAttr(ni)) {
- /* Release the base inode if we are holding it (nr_extents == -1). */
- if (ni->nr_extents == -1) {
- iput(VFS_I(ni->ext.base_ntfs_ino));
- ni->nr_extents = 0;
- ni->ext.base_ntfs_ino = NULL;
- }
- }
- BUG_ON(ni->page);
- if (!atomic_dec_and_test(&ni->count))
- BUG(); /* ni->count is required to drop to zero here. */
- return;
-}
-
-/**
- * ntfs_show_options - show mount options in /proc/mounts
- * @sf: seq_file in which to write our mount options
- * @root: root of the mounted tree whose mount options to display
- *
- * Called by the VFS once for each mounted ntfs volume when someone reads
- * /proc/mounts in order to display the NTFS specific mount options of each
- * mount. The mount options of fs specified by @root are written to the seq file
- * @sf and success is returned.
- *
- * Every option is written with a leading comma. uid, gid, nls and
- * mft_zone_multiplier are always written. A single umask= is written when
- * fmask and dmask are equal, otherwise fmask= and dmask= are written
- * separately. case_sensitive, show_sys_files and disable_sparse are only
- * written when the corresponding volume state applies, and an errors= entry is
- * written for each on_errors_arr[] entry whose val has a bit in common with
- * vol->on_errors (the table is walked until an entry with a zero val).
- *
- * This function always returns 0.
- */
-int ntfs_show_options(struct seq_file *sf, struct dentry *root)
-{
- ntfs_volume *vol = NTFS_SB(root->d_sb);
- int i;
-
- seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid));
- seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid));
- if (vol->fmask == vol->dmask)
- seq_printf(sf, ",umask=0%o", vol->fmask);
- else {
- seq_printf(sf, ",fmask=0%o", vol->fmask);
- seq_printf(sf, ",dmask=0%o", vol->dmask);
- }
- seq_printf(sf, ",nls=%s", vol->nls_map->charset);
- if (NVolCaseSensitive(vol))
- seq_printf(sf, ",case_sensitive");
- if (NVolShowSystemFiles(vol))
- seq_printf(sf, ",show_sys_files");
- if (!NVolSparseEnabled(vol))
- seq_printf(sf, ",disable_sparse");
- for (i = 0; on_errors_arr[i].val; i++) {
- if (on_errors_arr[i].val & vol->on_errors)
- seq_printf(sf, ",errors=%s", on_errors_arr[i].str);
- }
- seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier);
- return 0;
-}
-
-#ifdef NTFS_RW
-
-static const char *es = " Leaving inconsistent metadata. Unmount and run "
- "chkdsk."; /* Appended (via %s) to error messages in ntfs_truncate() below. */
-
-/**
- * ntfs_truncate - called when the i_size of an ntfs inode is changed
- * @vi: inode for which the i_size was changed
- *
- * We only support i_size changes for normal files at present, i.e. not
- * compressed and not encrypted. This is enforced in ntfs_setattr(), see
- * below.
- *
- * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and
- * that the change is allowed.
- *
- * This implies for us that @vi is a file inode rather than a directory, index,
- * or attribute inode as well as that @vi is a base inode.
- *
- * Returns 0 on success or -errno on error.
- *
- * On failure the handling of i_size depends on the error: when the attribute
- * size bounds check fails, and on -EOPNOTSUPP failures for which the old
- * attribute size is known, i_size is written back to the old attribute size.
- * For errors other than -EOPNOTSUPP that go through the bad_out or
- * conv_err_out labels, NInoSetTruncateFailed() is called instead and i_size
- * is left as is.
- *
- * Called with ->i_mutex held.
- */
-int ntfs_truncate(struct inode *vi)
-{
- s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size;
- VCN highest_vcn;
- unsigned long flags;
- ntfs_inode *base_ni, *ni = NTFS_I(vi);
- ntfs_volume *vol = ni->vol;
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- const char *te = " Leaving file length out of sync with i_size.";
- int err, mp_size, size_change, alloc_change;
-
- ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
- BUG_ON(NInoAttr(ni));
- BUG_ON(S_ISDIR(vi->i_mode));
- BUG_ON(NInoMstProtected(ni));
- BUG_ON(ni->nr_extents < 0);
-retry_truncate:
- /*
- * Lock the runlist for writing and map the mft record to ensure it is
- * safe to mess with the attribute runlist and sizes.
- *
- * Note the BUG_ON(NInoAttr(ni)) above already rules out NInoAttr()
- * inodes, so base_ni is expected to always end up being ni itself.
- */
- down_write(&ni->runlist.lock);
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx "
- "(error code %d).%s", vi->i_ino, err, te);
- ctx = NULL; /* err_out only releases ctx and m when they are non-NULL. */
- m = NULL;
- goto old_bad_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- ntfs_error(vi->i_sb, "Failed to allocate a search context for "
- "inode 0x%lx (not enough memory).%s",
- vi->i_ino, te);
- err = -ENOMEM;
- goto old_bad_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT) {
- ntfs_error(vi->i_sb, "Open attribute is missing from "
- "mft record. Inode 0x%lx is corrupt. "
- "Run chkdsk.%s", vi->i_ino, te);
- err = -EIO;
- } else
- ntfs_error(vi->i_sb, "Failed to lookup attribute in "
- "inode 0x%lx (error code %d).%s",
- vi->i_ino, err, te);
- goto old_bad_out;
- }
- m = ctx->mrec;
- a = ctx->attr;
- /*
- * The i_size of the vfs inode is the new size for the attribute value.
- */
- new_size = i_size_read(vi);
- /* The current size of the attribute value is the old size. */
- old_size = ntfs_attr_size(a);
- /*
- * Calculate the new allocated size: rounded up to a whole number of
- * clusters for a non-resident attribute and to a multiple of 8 bytes
- * for a resident one.
- */
- if (NInoNonResident(ni))
- new_alloc_size = (new_size + vol->cluster_size - 1) &
- ~(s64)vol->cluster_size_mask;
- else
- new_alloc_size = (new_size + 7) & ~7;
- /* The current allocated size is the old allocated size. */
- read_lock_irqsave(&ni->size_lock, flags);
- old_alloc_size = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- /*
- * The change in the file size. This will be 0 if no change, >0 if the
- * size is growing, and <0 if the size is shrinking.
- */
- size_change = -1;
- if (new_size - old_size >= 0) {
- size_change = 1;
- if (new_size == old_size)
- size_change = 0;
- }
- /* As above for the allocated size. */
- alloc_change = -1;
- if (new_alloc_size - old_alloc_size >= 0) {
- alloc_change = 1;
- if (new_alloc_size == old_alloc_size)
- alloc_change = 0;
- }
- /*
- * If neither the size nor the allocation are being changed there is
- * nothing to do.
- */
- if (!size_change && !alloc_change)
- goto unm_done;
- /* If the size is changing, check if new size is allowed in $AttrDef. */
- if (size_change) {
- err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
- if (unlikely(err)) {
- if (err == -ERANGE) {
- ntfs_error(vol->sb, "Truncate would cause the "
- "inode 0x%lx to %simum size "
- "for its attribute type "
- "(0x%x). Aborting truncate.",
- vi->i_ino,
- new_size > old_size ? "exceed "
- "the max" : "go under the min",
- le32_to_cpu(ni->type));
- err = -EFBIG;
- } else {
- ntfs_error(vol->sb, "Inode 0x%lx has unknown "
- "attribute type 0x%x. "
- "Aborting truncate.",
- vi->i_ino,
- le32_to_cpu(ni->type));
- err = -EIO;
- }
- /* Reset the vfs inode size to the old size. */
- i_size_write(vi, old_size);
- goto err_out;
- }
- }
- if (NInoCompressed(ni) || NInoEncrypted(ni)) {
- ntfs_warning(vi->i_sb, "Changes in inode size are not "
- "supported yet for %s files, ignoring.",
- NInoCompressed(ni) ? "compressed" :
- "encrypted");
- err = -EOPNOTSUPP;
- goto bad_out;
- }
- if (a->non_resident)
- goto do_non_resident_truncate;
- BUG_ON(NInoNonResident(ni));
- /* Resize the attribute record to best fit the new attribute size. */
- if (new_size < vol->mft_record_size &&
- !ntfs_resident_attr_value_resize(m, a, new_size)) {
- /* The resize succeeded! */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- write_lock_irqsave(&ni->size_lock, flags);
- /* Update the sizes in the ntfs inode and all is done. */
- ni->allocated_size = le32_to_cpu(a->length) -
- le16_to_cpu(a->data.resident.value_offset);
- /*
- * Note ntfs_resident_attr_value_resize() has already done any
- * necessary data clearing in the attribute record. When the
- * file is being shrunk vmtruncate() will already have cleared
- * the top part of the last partial page, i.e. since this is
- * the resident case this is the page with index 0. However,
- * when the file is being expanded, the page cache page data
- * between the old data_size, i.e. old_size, and the new_size
- * has not been zeroed. Fortunately, we do not need to zero it
- * either since on one hand it will either already be zero due
- * to both read_folio and writepage clearing partial page data
- * beyond i_size in which case there is nothing to do or in the
- * case of the file being mmap()ped at the same time, POSIX
- * specifies that the behaviour is unspecified thus we do not
- * have to do anything. This means that in our implementation
- * in the rare case that the file is mmap()ped and a write
- * occurred into the mmap()ped region just beyond the file size
- * and writepage has not yet been called to write out the page
- * (which would clear the area beyond the file size) and we now
- * extend the file size to incorporate this dirty region
- * outside the file size, a write of the page would result in
- * this data being written to disk instead of being cleared.
- * Given both POSIX and the Linux mmap(2) man page specify that
- * this corner case is undefined, we choose to leave it like
- * that as this is much simpler for us as we cannot lock the
- * relevant page now since we are holding too many ntfs locks
- * which would result in a lock reversal deadlock.
- */
- ni->initialized_size = new_size;
- write_unlock_irqrestore(&ni->size_lock, flags);
- goto unm_done;
- }
- /* If the above resize failed, this must be an attribute extension. */
- BUG_ON(size_change < 0);
- /*
- * We have to drop all the locks so we can call
- * ntfs_attr_make_non_resident(). This could be optimised by try-
- * locking the first page cache page and only if that fails dropping
- * the locks, locking the page, and redoing all the locking and
- * lookups. While this would be a huge optimisation, it is not worth
- * it as this is definitely a slow code path as it only ever can happen
- * once for any given file.
- */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
- /*
- * Not enough space in the mft record, try to make the attribute
- * non-resident and if successful restart the truncation process.
- */
- err = ntfs_attr_make_non_resident(ni, old_size);
- if (likely(!err))
- goto retry_truncate;
- /*
- * Could not make non-resident. If this is due to this not being
- * permitted for this attribute type or there not being enough space,
- * try to make other attributes non-resident. Otherwise fail.
- */
- if (unlikely(err != -EPERM && err != -ENOSPC)) {
- ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute "
- "type 0x%x, because the conversion from "
- "resident to non-resident attribute failed "
- "with error code %i.", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- if (err != -ENOMEM)
- err = -EIO;
- goto conv_err_out;
- }
- /* TODO: Not implemented from here, abort. */
- if (err == -ENOSPC)
- ntfs_error(vol->sb, "Not enough space in the mft record/on "
- "disk for the non-resident attribute value. "
- "This case is not implemented yet.");
- else /* if (err == -EPERM) */
- ntfs_error(vol->sb, "This attribute type may not be "
- "non-resident. This case is not implemented "
- "yet.");
- err = -EOPNOTSUPP;
- goto conv_err_out;
-#if 0
- // TODO: Attempt to make other attributes non-resident.
- if (!err)
- goto do_resident_extend;
- /*
- * Both the attribute list attribute and the standard information
- * attribute must remain in the base inode. Thus, if this is one of
- * these attributes, we have to try to move other attributes out into
- * extent mft records instead.
- */
- if (ni->type == AT_ATTRIBUTE_LIST ||
- ni->type == AT_STANDARD_INFORMATION) {
- // TODO: Attempt to move other attributes into extent mft
- // records.
- err = -EOPNOTSUPP;
- if (!err)
- goto do_resident_extend;
- goto err_out;
- }
- // TODO: Attempt to move this attribute to an extent mft record, but
- // only if it is not already the only attribute in an mft record in
- // which case there would be nothing to gain.
- err = -EOPNOTSUPP;
- if (!err)
- goto do_resident_extend;
- /* There is nothing we can do to make enough space. )-: */
- goto err_out;
-#endif
-do_non_resident_truncate:
- BUG_ON(!NInoNonResident(ni));
- if (alloc_change < 0) {
- highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
- if (highest_vcn > 0 &&
- old_alloc_size >> vol->cluster_size_bits >
- highest_vcn + 1) {
- /*
- * This attribute has multiple extents. Not yet
- * supported.
- */
- ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, "
- "attribute type 0x%x, because the "
- "attribute is highly fragmented (it "
- "consists of multiple extents) and "
- "this case is not implemented yet.",
- vi->i_ino,
- (unsigned)le32_to_cpu(ni->type));
- err = -EOPNOTSUPP;
- goto bad_out;
- }
- }
- /*
- * If the size is shrinking, need to reduce the initialized_size and
- * the data_size before reducing the allocation.
- */
- if (size_change < 0) {
- /*
- * Make the valid size smaller (i_size is already up-to-date).
- */
- write_lock_irqsave(&ni->size_lock, flags);
- if (new_size < ni->initialized_size) {
- ni->initialized_size = new_size;
- a->data.non_resident.initialized_size =
- cpu_to_sle64(new_size);
- }
- a->data.non_resident.data_size = cpu_to_sle64(new_size);
- write_unlock_irqrestore(&ni->size_lock, flags);
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- /* If the allocated size is not changing, we are done. */
- if (!alloc_change)
- goto unm_done;
- /*
- * If the size is shrinking it makes no sense for the
- * allocation to be growing.
- */
- BUG_ON(alloc_change > 0);
- } else /* if (size_change >= 0) */ {
- /*
- * The file size is growing or staying the same but the
- * allocation can be shrinking, growing or staying the same.
- */
- if (alloc_change > 0) {
- /*
- * We need to extend the allocation and possibly update
- * the data size. If we are updating the data size,
- * since we are not touching the initialized_size we do
- * not need to worry about the actual data on disk.
- * And as far as the page cache is concerned, there
- * will be no pages beyond the old data size and any
- * partial region in the last page between the old and
- * new data size (or the end of the page if the new
- * data size is outside the page) does not need to be
- * modified as explained above for the resident
- * attribute truncate case. To do this, we simply drop
- * the locks we hold and leave all the work to our
- * friendly helper ntfs_attr_extend_allocation().
- */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
- err = ntfs_attr_extend_allocation(ni, new_size,
- size_change > 0 ? new_size : -1, -1);
- /*
- * ntfs_attr_extend_allocation() will have done error
- * output already.
- */
- goto done;
- }
- if (!alloc_change)
- goto alloc_done;
- }
- /* alloc_change < 0 */
- /* Free the clusters. */
- nr_freed = ntfs_cluster_free(ni, new_alloc_size >>
- vol->cluster_size_bits, -1, ctx);
- m = ctx->mrec;
- a = ctx->attr;
- if (unlikely(nr_freed < 0)) {
- ntfs_error(vol->sb, "Failed to release cluster(s) (error code "
- "%lli). Unmount and run chkdsk to recover "
- "the lost cluster(s).", (long long)nr_freed);
- NVolSetErrors(vol);
- nr_freed = 0;
- }
- /* Truncate the runlist. */
- err = ntfs_rl_truncate_nolock(vol, &ni->runlist,
- new_alloc_size >> vol->cluster_size_bits);
- /*
- * If the runlist truncation failed and/or the search context is no
- * longer valid, we cannot resize the attribute record or build the
- * mapping pairs array thus we mark the inode bad so that no access to
- * the freed clusters can happen.
- */
- if (unlikely(err || IS_ERR(m))) {
- ntfs_error(vol->sb, "Failed to %s (error code %li).%s",
- IS_ERR(m) ?
- "restore attribute search context" :
- "truncate attribute runlist",
- IS_ERR(m) ? PTR_ERR(m) : err, es);
- err = -EIO;
- goto bad_out;
- }
- /* Get the size for the shrunk mapping pairs array for the runlist. */
- mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1);
- if (unlikely(mp_size <= 0)) {
- ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
- "attribute type 0x%x, because determining the "
- "size for the mapping pairs failed with error "
- "code %i.%s", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), mp_size, es);
- err = -EIO;
- goto bad_out;
- }
- /*
- * Shrink the attribute record for the new mapping pairs array. Note,
- * this cannot fail since we are making the attribute smaller thus by
- * definition there is enough space to do so.
- */
- err = ntfs_attr_record_resize(m, a, mp_size +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
- BUG_ON(err);
- /*
- * Generate the mapping pairs array directly into the attribute record.
- */
- err = ntfs_mapping_pairs_build(vol, (u8*)a +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
- mp_size, ni->runlist.rl, 0, -1, NULL);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
- "attribute type 0x%x, because building the "
- "mapping pairs failed with error code %i.%s",
- vi->i_ino, (unsigned)le32_to_cpu(ni->type),
- err, es);
- err = -EIO;
- goto bad_out;
- }
- /* Update the allocated/compressed size as well as the highest vcn. */
- a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
- vol->cluster_size_bits) - 1);
- write_lock_irqsave(&ni->size_lock, flags);
- ni->allocated_size = new_alloc_size;
- a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
- if (NInoSparse(ni) || NInoCompressed(ni)) {
- if (nr_freed) {
- ni->itype.compressed.size -= nr_freed <<
- vol->cluster_size_bits;
- BUG_ON(ni->itype.compressed.size < 0);
- a->data.non_resident.compressed_size = cpu_to_sle64(
- ni->itype.compressed.size);
- vi->i_blocks = ni->itype.compressed.size >> 9;
- }
- } else
- vi->i_blocks = new_alloc_size >> 9;
- write_unlock_irqrestore(&ni->size_lock, flags);
- /*
- * We have shrunk the allocation. If this is a shrinking truncate we
- * have already dealt with the initialized_size and the data_size above
- * and we are done. If the truncate is only changing the allocation
- * and not the data_size, we are also done. If this is an extending
- * truncate, need to extend the data_size now which is ensured by the
- * fact that @size_change is positive.
- */
-alloc_done:
- /*
- * If the size is growing, need to update it now. If it is shrinking,
- * we have already updated it above (before the allocation change).
- */
- if (size_change > 0)
- a->data.non_resident.data_size = cpu_to_sle64(new_size);
- /* Ensure the modified mft record is written out. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
-unm_done:
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
-done:
- /* Update the mtime and ctime on the base inode. */
- /* normally ->truncate shouldn't update ctime or mtime,
- * but ntfs did before so it got a copy & paste version
- * of file_update_time. one day someone should fix this
- * for real.
- *
- * Note this point is also reached with err set when
- * ntfs_attr_extend_allocation() failed above, so the timestamps
- * are updated on that failure path as well.
- */
- if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
- struct timespec64 now = current_time(VFS_I(base_ni));
- struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni));
- struct timespec64 mtime = inode_get_mtime(VFS_I(base_ni));
- int sync_it = 0;
-
- if (!timespec64_equal(&mtime, &now) ||
- !timespec64_equal(&ctime, &now))
- sync_it = 1;
- inode_set_ctime_to_ts(VFS_I(base_ni), now);
- inode_set_mtime_to_ts(VFS_I(base_ni), now);
-
- if (sync_it)
- mark_inode_dirty_sync(VFS_I(base_ni));
- }
-
- if (likely(!err)) {
- NInoClearTruncateFailed(ni);
- ntfs_debug("Done.");
- }
- return err;
-old_bad_out: /* Failed before old_size was read; -1 stops bad_out from writing it to i_size. */
- old_size = -1;
-bad_out: /* Flag volume errors (unless -ENOMEM/-EOPNOTSUPP) and either mark the truncate failed or restore i_size. */
- if (err != -ENOMEM && err != -EOPNOTSUPP)
- NVolSetErrors(vol);
- if (err != -EOPNOTSUPP)
- NInoSetTruncateFailed(ni);
- else if (old_size >= 0)
- i_size_write(vi, old_size);
-err_out: /* Release whatever is still held: search context, mft record mapping, runlist lock. */
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- up_write(&ni->runlist.lock);
-out:
- ntfs_debug("Failed. Returning error code %i.", err);
- return err;
-conv_err_out: /* As bad_out, but the search context, mft record and runlist lock were already released. */
- if (err != -ENOMEM && err != -EOPNOTSUPP)
- NVolSetErrors(vol);
- if (err != -EOPNOTSUPP)
- NInoSetTruncateFailed(ni);
- else
- i_size_write(vi, old_size);
- goto out;
-}
-
-/**
- * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value
- * @vi: inode for which the i_size was changed
- *
- * Wrapper for ntfs_truncate() that has no return value. The error code
- * returned by ntfs_truncate() is discarded, so a caller of this wrapper cannot
- * tell whether the truncate succeeded.
- *
- * See ntfs_truncate() description above for details.
- *
- * NOTE(review): this definition appears to sit inside an already open
- * #ifdef NTFS_RW region, which would make the guard below redundant - confirm
- * against the complete file.
- */
-#ifdef NTFS_RW
-void ntfs_truncate_vfs(struct inode *vi) {
- ntfs_truncate(vi);
-}
-#endif
-
-/**
- * ntfs_setattr - called from notify_change() when an attribute is being changed
- * @idmap: idmap of the mount the inode was found from
- * @dentry: dentry whose attributes to change
- * @attr: structure describing the attributes and the changes
- *
- * We have to trap VFS attempts to truncate the file described by @dentry as
- * soon as possible, because we do not implement changes in i_size yet. So we
- * abort all i_size changes here.
- *
- * We also abort all changes of user, group, and mode as we do not implement
- * the NTFS ACLs yet.
- *
- * Called with ->i_mutex held.
- */
-int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
- struct iattr *attr)
-{
- struct inode *vi = d_inode(dentry);
- int err;
- unsigned int ia_valid = attr->ia_valid;
-
- err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
- if (err)
- goto out;
- /* We do not support NTFS ACLs yet. */
- if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) {
- ntfs_warning(vi->i_sb, "Changes in user/group/mode are not "
- "supported yet, ignoring.");
- err = -EOPNOTSUPP;
- goto out;
- }
- if (ia_valid & ATTR_SIZE) {
- if (attr->ia_size != i_size_read(vi)) {
- ntfs_inode *ni = NTFS_I(vi);
- /*
- * FIXME: For now we do not support resizing of
- * compressed or encrypted files yet.
- */
- if (NInoCompressed(ni) || NInoEncrypted(ni)) {
- ntfs_warning(vi->i_sb, "Changes in inode size "
- "are not supported yet for "
- "%s files, ignoring.",
- NInoCompressed(ni) ?
- "compressed" : "encrypted");
- err = -EOPNOTSUPP;
- } else {
- truncate_setsize(vi, attr->ia_size);
- ntfs_truncate_vfs(vi);
- }
- if (err || ia_valid == ATTR_SIZE)
- goto out;
- } else {
- /*
- * We skipped the truncate but must still update
- * timestamps.
- */
- ia_valid |= ATTR_MTIME | ATTR_CTIME;
- }
- }
- if (ia_valid & ATTR_ATIME)
- inode_set_atime_to_ts(vi, attr->ia_atime);
- if (ia_valid & ATTR_MTIME)
- inode_set_mtime_to_ts(vi, attr->ia_mtime);
- if (ia_valid & ATTR_CTIME)
- inode_set_ctime_to_ts(vi, attr->ia_ctime);
- mark_inode_dirty(vi);
-out:
- return err;
-}
-
-/**
- * __ntfs_write_inode - write out a dirty inode
- * @vi: inode to write out
- * @sync: if true, write out synchronously
- *
- * Write out a dirty inode to disk including any extent inodes if present.
- *
- * If @sync is true, commit the inode to disk and wait for io completion. This
- * is done using write_mft_record().
- *
- * If @sync is false, just schedule the write to happen but do not wait for i/o
- * completion. In 2.6 kernels, scheduling usually happens just by virtue of
- * marking the page (and in this case mft record) dirty but we do not implement
- * this yet as write_mft_record() largely ignores the @sync parameter and
- * always performs synchronous writes.
- *
- * Return 0 on success and -errno on error.
- */
-int __ntfs_write_inode(struct inode *vi, int sync)
-{
- sle64 nt;
- ntfs_inode *ni = NTFS_I(vi);
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *m;
- STANDARD_INFORMATION *si;
- int err = 0;
- bool modified = false;
-
- ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "",
- vi->i_ino);
- /*
- * Dirty attribute inodes are written via their real inodes so just
- * clean them here. Access time updates are taken care off when the
- * real inode is written.
- */
- if (NInoAttr(ni)) {
- NInoClearDirty(ni);
- ntfs_debug("Done.");
- return 0;
- }
- /* Map, pin, and lock the mft record belonging to the inode. */
- m = map_mft_record(ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- goto err_out;
- }
- /* Update the access times in the standard information attribute. */
- ctx = ntfs_attr_get_search_ctx(ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto unm_err_out;
- }
- err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- ntfs_attr_put_search_ctx(ctx);
- goto unm_err_out;
- }
- si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset));
- /* Update the access times if they have changed. */
- nt = utc2ntfs(inode_get_mtime(vi));
- if (si->last_data_change_time != nt) {
- ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
- "new = 0x%llx", vi->i_ino, (long long)
- sle64_to_cpu(si->last_data_change_time),
- (long long)sle64_to_cpu(nt));
- si->last_data_change_time = nt;
- modified = true;
- }
- nt = utc2ntfs(inode_get_ctime(vi));
- if (si->last_mft_change_time != nt) {
- ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
- "new = 0x%llx", vi->i_ino, (long long)
- sle64_to_cpu(si->last_mft_change_time),
- (long long)sle64_to_cpu(nt));
- si->last_mft_change_time = nt;
- modified = true;
- }
- nt = utc2ntfs(inode_get_atime(vi));
- if (si->last_access_time != nt) {
- ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
- "new = 0x%llx", vi->i_ino,
- (long long)sle64_to_cpu(si->last_access_time),
- (long long)sle64_to_cpu(nt));
- si->last_access_time = nt;
- modified = true;
- }
- /*
- * If we just modified the standard information attribute we need to
- * mark the mft record it is in dirty. We do this manually so that
- * mark_inode_dirty() is not called which would redirty the inode and
- * hence result in an infinite loop of trying to write the inode.
- * There is no need to mark the base inode nor the base mft record
- * dirty, since we are going to write this mft record below in any case
- * and the base mft record may actually not have been modified so it
- * might not need to be written out.
- * NOTE: It is not a problem when the inode for $MFT itself is being
- * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES
- * on the $MFT inode and hence __ntfs_write_inode() will not be
- * re-invoked because of it which in turn is ok since the dirtied mft
- * record will be cleaned and written out to disk below, i.e. before
- * this function returns.
- */
- if (modified) {
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- if (!NInoTestSetDirty(ctx->ntfs_ino))
- mark_ntfs_record_dirty(ctx->ntfs_ino->page,
- ctx->ntfs_ino->page_ofs);
- }
- ntfs_attr_put_search_ctx(ctx);
- /* Now the access times are updated, write the base mft record. */
- if (NInoDirty(ni))
- err = write_mft_record(ni, m, sync);
- /* Write all attached extent mft records. */
- mutex_lock(&ni->extent_lock);
- if (ni->nr_extents > 0) {
- ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos;
- int i;
-
- ntfs_debug("Writing %i extent inodes.", ni->nr_extents);
- for (i = 0; i < ni->nr_extents; i++) {
- ntfs_inode *tni = extent_nis[i];
-
- if (NInoDirty(tni)) {
- MFT_RECORD *tm = map_mft_record(tni);
- int ret;
-
- if (IS_ERR(tm)) {
- if (!err || err == -ENOMEM)
- err = PTR_ERR(tm);
- continue;
- }
- ret = write_mft_record(tni, tm, sync);
- unmap_mft_record(tni);
- if (unlikely(ret)) {
- if (!err || err == -ENOMEM)
- err = ret;
- }
- }
- }
- }
- mutex_unlock(&ni->extent_lock);
- unmap_mft_record(ni);
- if (unlikely(err))
- goto err_out;
- ntfs_debug("Done.");
- return 0;
-unm_err_out:
- unmap_mft_record(ni);
-err_out:
- if (err == -ENOMEM) {
- ntfs_warning(vi->i_sb, "Not enough memory to write inode. "
- "Marking the inode dirty again, so the VFS "
- "retries later.");
- mark_inode_dirty(vi);
- } else {
- ntfs_error(vi->i_sb, "Failed (error %i): Run chkdsk.", -err);
- NVolSetErrors(ni->vol);
- }
- return err;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
deleted file mode 100644
index 147ef4ddb691..000000000000
--- a/fs/ntfs/inode.h
+++ /dev/null
@@ -1,310 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of
- * the Linux-NTFS project.
- *
- * Copyright (c) 2001-2007 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_INODE_H
-#define _LINUX_NTFS_INODE_H
-
-#include <linux/atomic.h>
-
-#include <linux/fs.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/mutex.h>
-#include <linux/seq_file.h>
-
-#include "layout.h"
-#include "volume.h"
-#include "types.h"
-#include "runlist.h"
-#include "debug.h"
-
-typedef struct _ntfs_inode ntfs_inode;
-
-/*
- * The NTFS in-memory inode structure. It is just used as an extension to the
- * fields already provided in the VFS inode.
- */
-struct _ntfs_inode {
- rwlock_t size_lock; /* Lock serializing access to inode sizes. */
- s64 initialized_size; /* Copy from the attribute record. */
- s64 allocated_size; /* Copy from the attribute record. */
- unsigned long state; /* NTFS specific flags describing this inode.
- See ntfs_inode_state_bits below. */
- unsigned long mft_no; /* Number of the mft record / inode. */
- u16 seq_no; /* Sequence number of the mft record. */
- atomic_t count; /* Inode reference count for book keeping. */
- ntfs_volume *vol; /* Pointer to the ntfs volume of this inode. */
- /*
- * If NInoAttr() is true, the below fields describe the attribute which
- * this fake inode belongs to. The actual inode of this attribute is
- * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see
- * below). For real inodes, we also set the type (AT_DATA for files and
- * AT_INDEX_ALLOCATION for directories), with the name = NULL and
- * name_len = 0 for files and name = I30 (global constant) and
- * name_len = 4 for directories.
- */
- ATTR_TYPE type; /* Attribute type of this fake inode. */
- ntfschar *name; /* Attribute name of this fake inode. */
- u32 name_len; /* Attribute name length of this fake inode. */
- runlist runlist; /* If state has the NI_NonResident bit set,
- the runlist of the unnamed data attribute
- (if a file) or of the index allocation
- attribute (directory) or of the attribute
- described by the fake inode (if NInoAttr()).
- If runlist.rl is NULL, the runlist has not
- been read in yet or has been unmapped. If
- NI_NonResident is clear, the attribute is
- resident (file and fake inode) or there is
- no $I30 index allocation attribute
- (small directory). In the latter case
- runlist.rl is always NULL.*/
- /*
- * The following fields are only valid for real inodes and extent
- * inodes.
- */
- struct mutex mrec_lock; /* Lock for serializing access to the
- mft record belonging to this inode. */
- struct page *page; /* The page containing the mft record of the
- inode. This should only be touched by the
- (un)map_mft_record*() functions. */
- int page_ofs; /* Offset into the page at which the mft record
- begins. This should only be touched by the
- (un)map_mft_record*() functions. */
- /*
- * Attribute list support (only for use by the attribute lookup
- * functions). Setup during read_inode for all inodes with attribute
- * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is
- * further only valid if NI_AttrListNonResident is set.
- */
- u32 attr_list_size; /* Length of attribute list value in bytes. */
- u8 *attr_list; /* Attribute list value itself. */
- runlist attr_list_rl; /* Run list for the attribute list value. */
- union {
- struct { /* It is a directory, $MFT, or an index inode. */
- u32 block_size; /* Size of an index block. */
- u32 vcn_size; /* Size of a vcn in this
- index. */
- COLLATION_RULE collation_rule; /* The collation rule
- for the index. */
- u8 block_size_bits; /* Log2 of the above. */
- u8 vcn_size_bits; /* Log2 of the above. */
- } index;
- struct { /* It is a compressed/sparse file/attribute inode. */
- s64 size; /* Copy of compressed_size from
- $DATA. */
- u32 block_size; /* Size of a compression block
- (cb). */
- u8 block_size_bits; /* Log2 of the size of a cb. */
- u8 block_clusters; /* Number of clusters per cb. */
- } compressed;
- } itype;
- struct mutex extent_lock; /* Lock for accessing/modifying the
- below . */
- s32 nr_extents; /* For a base mft record, the number of attached extent
- inodes (0 if none), for extent records and for fake
- inodes describing an attribute this is -1. */
- union { /* This union is only used if nr_extents != 0. */
- ntfs_inode **extent_ntfs_inos; /* For nr_extents > 0, array of
- the ntfs inodes of the extent
- mft records belonging to
- this base inode which have
- been loaded. */
- ntfs_inode *base_ntfs_ino; /* For nr_extents == -1, the
- ntfs inode of the base mft
- record. For fake inodes, the
- real (base) inode to which
- the attribute belongs. */
- } ext;
-};
-
-/*
- * Defined bits for the state field in the ntfs_inode structure.
- * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only
- */
-typedef enum {
- NI_Dirty, /* 1: Mft record needs to be written to disk. */
- NI_AttrList, /* 1: Mft record contains an attribute list. */
- NI_AttrListNonResident, /* 1: Attribute list is non-resident. Implies
- NI_AttrList is set. */
-
- NI_Attr, /* 1: Fake inode for attribute i/o.
- 0: Real inode or extent inode. */
-
- NI_MstProtected, /* 1: Attribute is protected by MST fixups.
- 0: Attribute is not protected by fixups. */
- NI_NonResident, /* 1: Unnamed data attr is non-resident (f).
- 1: Attribute is non-resident (a). */
- NI_IndexAllocPresent = NI_NonResident, /* 1: $I30 index alloc attr is
- present (d). */
- NI_Compressed, /* 1: Unnamed data attr is compressed (f).
- 1: Create compressed files by default (d).
- 1: Attribute is compressed (a). */
- NI_Encrypted, /* 1: Unnamed data attr is encrypted (f).
- 1: Create encrypted files by default (d).
- 1: Attribute is encrypted (a). */
- NI_Sparse, /* 1: Unnamed data attr is sparse (f).
- 1: Create sparse files by default (d).
- 1: Attribute is sparse (a). */
- NI_SparseDisabled, /* 1: May not create sparse regions. */
- NI_TruncateFailed, /* 1: Last ntfs_truncate() call failed. */
-} ntfs_inode_state_bits;
-
-/*
- * NOTE: We should be adding dirty mft records to a list somewhere and they
- * should be independent of the (ntfs/vfs) inode structure so that an inode can
- * be removed but the record can be left dirty for syncing later.
- */
-
-/*
- * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo()
- * functions.
- */
-#define NINO_FNS(flag) \
-static inline int NIno##flag(ntfs_inode *ni) \
-{ \
- return test_bit(NI_##flag, &(ni)->state); \
-} \
-static inline void NInoSet##flag(ntfs_inode *ni) \
-{ \
- set_bit(NI_##flag, &(ni)->state); \
-} \
-static inline void NInoClear##flag(ntfs_inode *ni) \
-{ \
- clear_bit(NI_##flag, &(ni)->state); \
-}
-
-/*
- * As above for NInoTestSetFoo() and NInoTestClearFoo().
- */
-#define TAS_NINO_FNS(flag) \
-static inline int NInoTestSet##flag(ntfs_inode *ni) \
-{ \
- return test_and_set_bit(NI_##flag, &(ni)->state); \
-} \
-static inline int NInoTestClear##flag(ntfs_inode *ni) \
-{ \
- return test_and_clear_bit(NI_##flag, &(ni)->state); \
-}
-
-/* Emit the ntfs inode bitops functions. */
-NINO_FNS(Dirty)
-TAS_NINO_FNS(Dirty)
-NINO_FNS(AttrList)
-NINO_FNS(AttrListNonResident)
-NINO_FNS(Attr)
-NINO_FNS(MstProtected)
-NINO_FNS(NonResident)
-NINO_FNS(IndexAllocPresent)
-NINO_FNS(Compressed)
-NINO_FNS(Encrypted)
-NINO_FNS(Sparse)
-NINO_FNS(SparseDisabled)
-NINO_FNS(TruncateFailed)
-
-/*
- * The full structure containing a ntfs_inode and a vfs struct inode. Used for
- * all real and fake inodes but not for extent inodes which lack the vfs struct
- * inode.
- */
-typedef struct {
- ntfs_inode ntfs_inode;
- struct inode vfs_inode; /* The vfs inode structure. */
-} big_ntfs_inode;
-
-/**
- * NTFS_I - return the ntfs inode given a vfs inode
- * @inode: VFS inode
- *
- * NTFS_I() returns the ntfs inode associated with the VFS @inode.
- */
-static inline ntfs_inode *NTFS_I(struct inode *inode)
-{
- return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode);
-}
-
-static inline struct inode *VFS_I(ntfs_inode *ni)
-{
- return &((big_ntfs_inode *)ni)->vfs_inode;
-}
-
-/**
- * ntfs_attr - ntfs in memory attribute structure
- * @mft_no: mft record number of the base mft record of this attribute
- * @name: Unicode name of the attribute (NULL if unnamed)
- * @name_len: length of @name in Unicode characters (0 if unnamed)
- * @type: attribute type (see layout.h)
- *
- * This structure exists only to provide a small structure for the
- * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism.
- *
- * NOTE: Elements are ordered by size to make the structure as compact as
- * possible on all architectures.
- */
-typedef struct {
- unsigned long mft_no;
- ntfschar *name;
- u32 name_len;
- ATTR_TYPE type;
-} ntfs_attr;
-
-extern int ntfs_test_inode(struct inode *vi, void *data);
-
-extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no);
-extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
- ntfschar *name, u32 name_len);
-extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
- u32 name_len);
-
-extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
-extern void ntfs_free_big_inode(struct inode *inode);
-extern void ntfs_evict_big_inode(struct inode *vi);
-
-extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
-
-static inline void ntfs_init_big_inode(struct inode *vi)
-{
- ntfs_inode *ni = NTFS_I(vi);
-
- ntfs_debug("Entering.");
- __ntfs_init_inode(vi->i_sb, ni);
- ni->mft_no = vi->i_ino;
-}
-
-extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
- unsigned long mft_no);
-extern void ntfs_clear_extent_inode(ntfs_inode *ni);
-
-extern int ntfs_read_inode_mount(struct inode *vi);
-
-extern int ntfs_show_options(struct seq_file *sf, struct dentry *root);
-
-#ifdef NTFS_RW
-
-extern int ntfs_truncate(struct inode *vi);
-extern void ntfs_truncate_vfs(struct inode *vi);
-
-extern int ntfs_setattr(struct mnt_idmap *idmap,
- struct dentry *dentry, struct iattr *attr);
-
-extern int __ntfs_write_inode(struct inode *vi, int sync);
-
-static inline void ntfs_commit_inode(struct inode *vi)
-{
- if (!is_bad_inode(vi))
- __ntfs_write_inode(vi, 1);
- return;
-}
-
-#else
-
-static inline void ntfs_truncate_vfs(struct inode *vi) {}
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_INODE_H */
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
deleted file mode 100644
index 5d4bf7a3259f..000000000000
--- a/fs/ntfs/layout.h
+++ /dev/null
@@ -1,2421 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS
- * project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_LAYOUT_H
-#define _LINUX_NTFS_LAYOUT_H
-
-#include <linux/types.h>
-#include <linux/bitops.h>
-#include <linux/list.h>
-#include <asm/byteorder.h>
-
-#include "types.h"
-
-/* The NTFS oem_id "NTFS " */
-#define magicNTFS cpu_to_le64(0x202020205346544eULL)
-
-/*
- * Location of bootsector on partition:
- * The standard NTFS_BOOT_SECTOR is on sector 0 of the partition.
- * On NT4 and above there is one backup copy of the boot sector to
- * be found on the last sector of the partition (not normally accessible
- * from within Windows as the bootsector contained number of sectors
- * value is one less than the actual value!).
- * On versions of NT 3.51 and earlier, the backup copy was located at
- * number of sectors/2 (integer divide), i.e. in the middle of the volume.
- */
-
-/*
- * BIOS parameter block (bpb) structure.
- */
-typedef struct {
- le16 bytes_per_sector; /* Size of a sector in bytes. */
- u8 sectors_per_cluster; /* Size of a cluster in sectors. */
- le16 reserved_sectors; /* zero */
- u8 fats; /* zero */
- le16 root_entries; /* zero */
- le16 sectors; /* zero */
- u8 media_type; /* 0xf8 = hard disk */
- le16 sectors_per_fat; /* zero */
- le16 sectors_per_track; /* irrelevant */
- le16 heads; /* irrelevant */
- le32 hidden_sectors; /* zero */
- le32 large_sectors; /* zero */
-} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK;
-
-/*
- * NTFS boot sector structure.
- */
-typedef struct {
- u8 jump[3]; /* Irrelevant (jump to boot up code).*/
- le64 oem_id; /* Magic "NTFS ". */
- BIOS_PARAMETER_BLOCK bpb; /* See BIOS_PARAMETER_BLOCK. */
- u8 unused[4]; /* zero, NTFS diskedit.exe states that
- this is actually:
- __u8 physical_drive; // 0x80
- __u8 current_head; // zero
- __u8 extended_boot_signature;
- // 0x80
- __u8 unused; // zero
- */
-/*0x28*/sle64 number_of_sectors; /* Number of sectors in volume. Gives
- maximum volume size of 2^63 sectors.
- Assuming standard sector size of 512
- bytes, the maximum byte size is
- approx. 4.7x10^21 bytes. (-; */
- sle64 mft_lcn; /* Cluster location of mft data. */
- sle64 mftmirr_lcn; /* Cluster location of copy of mft. */
- s8 clusters_per_mft_record; /* Mft record size in clusters. */
- u8 reserved0[3]; /* zero */
- s8 clusters_per_index_record; /* Index block size in clusters. */
- u8 reserved1[3]; /* zero */
- le64 volume_serial_number; /* Irrelevant (serial number). */
- le32 checksum; /* Boot sector checksum. */
-/*0x54*/u8 bootstrap[426]; /* Irrelevant (boot up code). */
- le16 end_of_sector_marker; /* End of bootsector magic. Always is
- 0xaa55 in little endian. */
-/* sizeof() = 512 (0x200) bytes */
-} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR;
-
-/*
- * Magic identifiers present at the beginning of all ntfs record containing
- * records (like mft records for example).
- */
-enum {
- /* Found in $MFT/$DATA. */
- magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
- magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
- magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
-
- /* Found in $LogFile/$DATA. */
- magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
- magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
-
- /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */
- magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
-
- /* Found in all ntfs record containing records. */
- magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
- transfer was detected. */
- /*
- * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
- * thus not initialized. Page must be initialized before using it.
- */
- magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
-};
-
-typedef le32 NTFS_RECORD_TYPE;
-
-/*
- * Generic magic comparison macros. Finally found a use for the ## preprocessor
- * operator! (-8
- */
-
-static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r)
-{
- return (x == r);
-}
-#define ntfs_is_magic(x, m) __ntfs_is_magic(x, magic_##m)
-
-static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r)
-{
- return (*p == r);
-}
-#define ntfs_is_magicp(p, m) __ntfs_is_magicp(p, magic_##m)
-
-/*
- * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above.
- */
-#define ntfs_is_file_record(x) ( ntfs_is_magic (x, FILE) )
-#define ntfs_is_file_recordp(p) ( ntfs_is_magicp(p, FILE) )
-#define ntfs_is_mft_record(x) ( ntfs_is_file_record (x) )
-#define ntfs_is_mft_recordp(p) ( ntfs_is_file_recordp(p) )
-#define ntfs_is_indx_record(x) ( ntfs_is_magic (x, INDX) )
-#define ntfs_is_indx_recordp(p) ( ntfs_is_magicp(p, INDX) )
-#define ntfs_is_hole_record(x) ( ntfs_is_magic (x, HOLE) )
-#define ntfs_is_hole_recordp(p) ( ntfs_is_magicp(p, HOLE) )
-
-#define ntfs_is_rstr_record(x) ( ntfs_is_magic (x, RSTR) )
-#define ntfs_is_rstr_recordp(p) ( ntfs_is_magicp(p, RSTR) )
-#define ntfs_is_rcrd_record(x) ( ntfs_is_magic (x, RCRD) )
-#define ntfs_is_rcrd_recordp(p) ( ntfs_is_magicp(p, RCRD) )
-
-#define ntfs_is_chkd_record(x) ( ntfs_is_magic (x, CHKD) )
-#define ntfs_is_chkd_recordp(p) ( ntfs_is_magicp(p, CHKD) )
-
-#define ntfs_is_baad_record(x) ( ntfs_is_magic (x, BAAD) )
-#define ntfs_is_baad_recordp(p) ( ntfs_is_magicp(p, BAAD) )
-
-#define ntfs_is_empty_record(x) ( ntfs_is_magic (x, empty) )
-#define ntfs_is_empty_recordp(p) ( ntfs_is_magicp(p, empty) )
-
-/*
- * The Update Sequence Array (usa) is an array of the le16 values which belong
- * to the end of each sector protected by the update sequence record in which
- * this array is contained. Note that the first entry is the Update Sequence
- * Number (usn), a cyclic counter of how many times the protected record has
- * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All
- * last le16's of each sector have to be equal to the usn (during reading) or
- * are set to it (during writing). If they are not, an incomplete multi sector
- * transfer has occurred when the data was written.
- * The maximum size for the update sequence array is fixed to:
- * maximum size = usa_ofs + (usa_count * 2) = 510 bytes
- * The 510 bytes comes from the fact that the last le16 in the array has to
- * (obviously) finish before the last le16 of the first 512-byte sector.
- * This formula can be used as a consistency check in that usa_ofs +
- * (usa_count * 2) has to be less than or equal to 510.
- */
-typedef struct {
- NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record
- type and/or status. */
- le16 usa_ofs; /* Offset to the Update Sequence Array (usa)
- from the start of the ntfs record. */
- le16 usa_count; /* Number of le16 sized entries in the usa
- including the Update Sequence Number (usn),
- thus the number of fixups is the usa_count
- minus 1. */
-} __attribute__ ((__packed__)) NTFS_RECORD;
-
-/*
- * System files mft record numbers. All these files are always marked as used
- * in the bitmap attribute of the mft; presumably in order to avoid accidental
- * allocation for random other mft records. Also, the sequence number for each
- * of the system files is always equal to their mft record number and it is
- * never modified.
- */
-typedef enum {
- FILE_MFT = 0, /* Master file table (mft). Data attribute
- contains the entries and bitmap attribute
- records which ones are in use (bit==1). */
- FILE_MFTMirr = 1, /* Mft mirror: copy of first four mft records
- in data attribute. If cluster size > 4kiB,
- copy of first N mft records, with
- N = cluster_size / mft_record_size. */
- FILE_LogFile = 2, /* Journalling log in data attribute. */
- FILE_Volume = 3, /* Volume name attribute and volume information
- attribute (flags and ntfs version). Windows
- refers to this file as volume DASD (Direct
- Access Storage Device). */
- FILE_AttrDef = 4, /* Array of attribute definitions in data
- attribute. */
- FILE_root = 5, /* Root directory. */
- FILE_Bitmap = 6, /* Allocation bitmap of all clusters (lcns) in
- data attribute. */
- FILE_Boot = 7, /* Boot sector (always at cluster 0) in data
- attribute. */
- FILE_BadClus = 8, /* Contains all bad clusters in the non-resident
- data attribute. */
- FILE_Secure = 9, /* Shared security descriptors in data attribute
- and two indexes into the descriptors.
- Appeared in Windows 2000. Before that, this
- file was named $Quota but was unused. */
- FILE_UpCase = 10, /* Uppercase equivalents of all 65536 Unicode
- characters in data attribute. */
- FILE_Extend = 11, /* Directory containing other system files (eg.
- $ObjId, $Quota, $Reparse and $UsnJrnl). This
- is new to NTFS3.0. */
- FILE_reserved12 = 12, /* Reserved for future use (records 12-15). */
- FILE_reserved13 = 13,
- FILE_reserved14 = 14,
- FILE_reserved15 = 15,
- FILE_first_user = 16, /* First user file, used as test limit for
- whether to allow opening a file or not. */
-} NTFS_SYSTEM_FILES;
-
-/*
- * These are the so far known MFT_RECORD_* flags (16-bit) which contain
- * information about the mft record in which they are present.
- */
-enum {
- MFT_RECORD_IN_USE = cpu_to_le16(0x0001),
- MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
-} __attribute__ ((__packed__));
-
-typedef le16 MFT_RECORD_FLAGS;
-
-/*
- * mft references (aka file references or file record segment references) are
- * used whenever a structure needs to refer to a record in the mft.
- *
- * A reference consists of a 48-bit index into the mft and a 16-bit sequence
- * number used to detect stale references.
- *
- * For error reporting purposes we treat the 48-bit index as a signed quantity.
- *
- * The sequence number is a circular counter (skipping 0) describing how many
- * times the referenced mft record has been (re)used. This has to match the
- * sequence number of the mft record being referenced, otherwise the reference
- * is considered stale and removed (FIXME: only ntfsck or the driver itself?).
- *
- * If the sequence number is zero it is assumed that no sequence number
- * consistency checking should be performed.
- *
- * FIXME: Since inodes are 32-bit as of now, the driver needs to always check
- * for high_part being 0 and if not either BUG(), cause a panic() or handle
- * the situation in some other way. This shouldn't be a problem as a volume has
- * to become HUGE in order to need more than 32-bits worth of mft records.
- * Assuming the standard mft record size of 1kb only the records (never mind
- * the non-resident attributes, etc.) would require 4Tb of space on their own
- * for the first 32 bits worth of records. This is only if some strange person
- * doesn't decide to foul play and make the mft sparse which would be a really
- * horrible thing to do as it would trash our current driver implementation. )-:
- * Do I hear screams "we want 64-bit inodes!" ?!? (-;
- *
- * FIXME: The mft zone is defined as the first 12% of the volume. This space is
- * reserved so that the mft can grow contiguously and hence doesn't become
- * fragmented. Volume free space includes the empty part of the mft zone and
- * when the volume's free 88% are used up, the mft zone is shrunk by a factor
- * of 2, thus making more space available for more files/data. This process is
- * repeated every time there is no more free space except for the mft zone until
- * there really is no more free space.
- */
-
-/*
- * Typedef the MFT_REF as a 64-bit value for easier handling.
- * Also define two unpacking macros to get to the reference (MREF) and
- * sequence number (MSEQNO) respectively.
- * The _LE versions are to be applied on little endian MFT_REFs.
- * Note: The _LE versions will return a CPU endian formatted value!
- */
-#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
-#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
-
-typedef u64 MFT_REF;
-typedef le64 leMFT_REF;
-
-#define MK_MREF(m, s) ((MFT_REF)(((MFT_REF)(s) << 48) | \
- ((MFT_REF)(m) & MFT_REF_MASK_CPU)))
-#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s))
-
-#define MREF(x) ((unsigned long)((x) & MFT_REF_MASK_CPU))
-#define MSEQNO(x) ((u16)(((x) >> 48) & 0xffff))
-#define MREF_LE(x) ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU))
-#define MSEQNO_LE(x) ((u16)((le64_to_cpu(x) >> 48) & 0xffff))
-
-#define IS_ERR_MREF(x) (((x) & 0x0000800000000000ULL) ? true : false)
-#define ERR_MREF(x) ((u64)((s64)(x)))
-#define MREF_ERR(x) ((int)((s64)(x)))
-
-/*
- * The mft record header present at the beginning of every record in the mft.
- * This is followed by a sequence of variable length attribute records which
- * is terminated by an attribute of type AT_END which is a truncated attribute
- * in that it only consists of the attribute type code AT_END and none of the
- * other members of the attribute structure are present.
- */
-typedef struct {
-/*Ofs*/
-/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
- NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */
- le16 usa_ofs; /* See NTFS_RECORD definition above. */
- le16 usa_count; /* See NTFS_RECORD definition above. */
-
-/* 8*/ le64 lsn; /* $LogFile sequence number for this record.
- Changed every time the record is modified. */
-/* 16*/ le16 sequence_number; /* Number of times this mft record has been
- reused. (See description for MFT_REF
- above.) NOTE: The increment (skipping zero)
- is done when the file is deleted. NOTE: If
- this is zero it is left zero. */
-/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of
- directory entries referencing this record.
- NOTE: Only used in mft base records.
- NOTE: When deleting a directory entry we
- check the link_count and if it is 1 we
- delete the file. Otherwise we delete the
- FILE_NAME_ATTR being referenced by the
- directory entry from the mft record and
- decrement the link_count.
- FIXME: Careful with Win32 + DOS names! */
-/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this
- mft record from the start of the mft record.
- NOTE: Must be aligned to 8-byte boundary. */
-/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file
- is deleted, the MFT_RECORD_IN_USE flag is
- set to zero. */
-/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record.
- NOTE: Must be aligned to 8-byte boundary. */
-/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft
- record. This should be equal to the mft
- record size. */
-/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records.
- When it is not zero it is a mft reference
- pointing to the base mft record to which
- this record belongs (this is then used to
- locate the attribute list attribute present
- in the base record which describes this
- extension record and hence might need
- modification when the extension record
- itself is modified, also locating the
- attribute list also means finding the other
- potential extents, belonging to the non-base
- mft record). */
-/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to
- the next attribute added to this mft record.
- NOTE: Incremented each time after it is used.
- NOTE: Every time the mft record is reused
- this number is set to zero. NOTE: The first
- instance number is always 0. */
-/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */
-/* 42*/ le16 reserved; /* Reserved/alignment. */
-/* 44*/ le32 mft_record_number; /* Number of this mft record. */
-/* sizeof() = 48 bytes */
-/*
- * When (re)using the mft record, we place the update sequence array at this
- * offset, i.e. before we start with the attributes. This also makes sense,
- * otherwise we could run into problems with the update sequence array
- * containing in itself the last two bytes of a sector which would mean that
- * multi sector transfer protection wouldn't work. As you can't protect data
- * by overwriting it since you then can't get it back...
- * When reading we obviously use the data from the ntfs record header.
- */
-} __attribute__ ((__packed__)) MFT_RECORD;
-
-/* This is the version without the NTFS 3.1+ specific fields. */
-typedef struct {
-/*Ofs*/
-/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
- NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */
- le16 usa_ofs; /* See NTFS_RECORD definition above. */
- le16 usa_count; /* See NTFS_RECORD definition above. */
-
-/* 8*/ le64 lsn; /* $LogFile sequence number for this record.
- Changed every time the record is modified. */
-/* 16*/ le16 sequence_number; /* Number of times this mft record has been
- reused. (See description for MFT_REF
- above.) NOTE: The increment (skipping zero)
- is done when the file is deleted. NOTE: If
- this is zero it is left zero. */
-/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of
- directory entries referencing this record.
- NOTE: Only used in mft base records.
- NOTE: When deleting a directory entry we
- check the link_count and if it is 1 we
- delete the file. Otherwise we delete the
- FILE_NAME_ATTR being referenced by the
- directory entry from the mft record and
- decrement the link_count.
- FIXME: Careful with Win32 + DOS names! */
-/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this
- mft record from the start of the mft record.
- NOTE: Must be aligned to 8-byte boundary. */
-/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file
- is deleted, the MFT_RECORD_IN_USE flag is
- set to zero. */
-/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record.
- NOTE: Must be aligned to 8-byte boundary. */
-/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft
- record. This should be equal to the mft
- record size. */
-/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records.
- When it is not zero it is a mft reference
- pointing to the base mft record to which
- this record belongs (this is then used to
- locate the attribute list attribute present
- in the base record which describes this
- extension record and hence might need
- modification when the extension record
- itself is modified, also locating the
- attribute list also means finding the other
- potential extents, belonging to the non-base
- mft record). */
-/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to
- the next attribute added to this mft record.
- NOTE: Incremented each time after it is used.
- NOTE: Every time the mft record is reused
- this number is set to zero. NOTE: The first
- instance number is always 0. */
-/* sizeof() = 42 bytes */
-/*
- * When (re)using the mft record, we place the update sequence array at this
- * offset, i.e. before we start with the attributes. This also makes sense,
- * otherwise we could run into problems with the update sequence array
- * containing in itself the last two bytes of a sector which would mean that
- * multi sector transfer protection wouldn't work. As you can't protect data
- * by overwriting it since you then can't get it back...
- * When reading we obviously use the data from the ntfs record header.
- */
-} __attribute__ ((__packed__)) MFT_RECORD_OLD;
-
-/*
- * System defined attributes (32-bit). Each attribute type has a corresponding
- * attribute name (Unicode string of maximum 64 character length) as described
- * by the attribute definitions present in the data attribute of the $AttrDef
- * system file. On NTFS 3.0 volumes the names are just as the types are named
- * in the below defines exchanging AT_ for the dollar sign ($). If that is not
- * a revealing choice of symbol I do not know what is... (-;
- */
-enum {
- AT_UNUSED = cpu_to_le32( 0),
- AT_STANDARD_INFORMATION = cpu_to_le32( 0x10),
- AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20),
- AT_FILE_NAME = cpu_to_le32( 0x30),
- AT_OBJECT_ID = cpu_to_le32( 0x40),
- AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50),
- AT_VOLUME_NAME = cpu_to_le32( 0x60),
- AT_VOLUME_INFORMATION = cpu_to_le32( 0x70),
- AT_DATA = cpu_to_le32( 0x80),
- AT_INDEX_ROOT = cpu_to_le32( 0x90),
- AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0),
- AT_BITMAP = cpu_to_le32( 0xb0),
- AT_REPARSE_POINT = cpu_to_le32( 0xc0),
- AT_EA_INFORMATION = cpu_to_le32( 0xd0),
- AT_EA = cpu_to_le32( 0xe0),
- AT_PROPERTY_SET = cpu_to_le32( 0xf0),
- AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100),
- AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000),
- AT_END = cpu_to_le32(0xffffffff)
-};
-
-typedef le32 ATTR_TYPE;
-
-/*
- * The collation rules for sorting views/indexes/etc (32-bit).
- *
- * COLLATION_BINARY - Collate by binary compare where the first byte is most
- * significant.
- * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary
- * Unicode values, except that when a character can be uppercased, the
- * upper case value collates before the lower case one.
- * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation
- * is done very much like COLLATION_UNICODE_STRING. In fact I have no idea
- * what the difference is. Perhaps the difference is that file names
- * would treat some special characters in an odd way (see
- * unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[]
- * for what I mean but COLLATION_UNICODE_STRING would not give any special
- * treatment to any characters at all, but this is speculation.
- * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key
- * values. E.g. used for $SII index in FILE_Secure, which sorts by
- * security_id (le32).
- * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values.
- * E.g. used for $O index in FILE_Extend/$Quota.
- * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash
- * values and second by ascending security_id values. E.g. used for $SDH
- * index in FILE_Secure.
- * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending
- * le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which
- * sorts by object_id (16-byte), by splitting up the object_id in four
- * le32 values and using them as individual keys. E.g. take the following
- * two security_ids, stored as follows on disk:
- * 1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59
- * 2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45
- * To compare them, they are split into four le32 values each, like so:
- * 1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081
- * 2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179
- * Now, it is apparent why the 2nd object_id collates after the 1st: the
- * first le32 value of the 1st object_id is less than the first le32 of
- * the 2nd object_id. If the first le32 values of both object_ids were
- * equal then the second le32 values would be compared, etc.
- */
-enum {
- COLLATION_BINARY = cpu_to_le32(0x00),
- COLLATION_FILE_NAME = cpu_to_le32(0x01),
- COLLATION_UNICODE_STRING = cpu_to_le32(0x02),
- COLLATION_NTOFS_ULONG = cpu_to_le32(0x10),
- COLLATION_NTOFS_SID = cpu_to_le32(0x11),
- COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12),
- COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13),
-};
-
-typedef le32 COLLATION_RULE;
-
-/*
- * The flags (32-bit) describing attribute properties in the attribute
- * definition structure. FIXME: This information is based on Regis's
- * information and, according to him, it is not certain and probably
- * incomplete. The INDEXABLE flag is fairly certainly correct as only the file
- * name attribute has this flag set and this is the only attribute indexed in
- * NT4.
- */
-enum {
- ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be
- indexed. */
- ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type
- can be present multiple times in the
- mft records of an inode. */
- ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value
- must contain at least one non-zero
- byte. */
- ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be
- indexed and the attribute value must be
- unique for the attribute type in all of
- the mft records of an inode. */
- ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be
- named and the name must be unique for
- the attribute type in all of the mft
- records of an inode. */
- ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be
- resident. */
- ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log
- modifications to this attribute,
- regardless of whether it is resident or
- non-resident. Without this, only log
- modifications if the attribute is
- resident. */
-};
-
-typedef le32 ATTR_DEF_FLAGS;
-
-/*
- * The data attribute of FILE_AttrDef contains a sequence of attribute
- * definitions for the NTFS volume. With this, it is supposed to be safe for an
- * older NTFS driver to mount a volume containing a newer NTFS version without
- * damaging it (that's the theory. In practice it's: not damaging it too much).
- * Entries are sorted by attribute type. The flags describe whether the
- * attribute can be resident/non-resident and possibly other things, but the
- * actual bits are unknown.
- */
-typedef struct {
-/*hex ofs*/
-/* 0*/ ntfschar name[0x40]; /* Unicode name of the attribute. Zero
- terminated. */
-/* 80*/ ATTR_TYPE type; /* Type of the attribute. */
-/* 84*/ le32 display_rule; /* Default display rule.
- FIXME: What does it mean? (AIA) */
-/* 88*/ COLLATION_RULE collation_rule; /* Default collation rule. */
-/* 8c*/ ATTR_DEF_FLAGS flags; /* Flags describing the attribute. */
-/* 90*/ sle64 min_size; /* Optional minimum attribute size. */
-/* 98*/ sle64 max_size; /* Maximum size of attribute. */
-/* sizeof() = 0xa0 or 160 bytes */
-} __attribute__ ((__packed__)) ATTR_DEF;
-
-/*
- * Attribute flags (16-bit).
- */
-enum {
- ATTR_IS_COMPRESSED = cpu_to_le16(0x0001),
- ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
- mask. Also, first
- illegal value. */
- ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000),
- ATTR_IS_SPARSE = cpu_to_le16(0x8000),
-} __attribute__ ((__packed__));
-
-typedef le16 ATTR_FLAGS;
-
-/*
- * Attribute compression.
- *
- * Only the data attribute is ever compressed in the current ntfs driver in
- * Windows. Further, compression is only applied when the data attribute is
- * non-resident. Finally, to use compression, the maximum allowed cluster size
- * on a volume is 4kib.
- *
- * The compression method is based on independently compressing blocks of X
- * clusters, where X is determined from the compression_unit value found in the
- * non-resident attribute record header (more precisely: X = 2^compression_unit
- * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4).
- *
- * There are three different cases of how a compression block of X clusters
- * can be stored:
- *
- * 1) The data in the block is all zero (a sparse block):
- * This is stored as a sparse block in the runlist, i.e. the runlist
- * entry has length = X and lcn = -1. The mapping pairs array actually
- * uses a delta_lcn value length of 0, i.e. delta_lcn is not present at
- * all, which is then interpreted by the driver as lcn = -1.
- * NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then
- * the same principles apply as above, except that the length is not
- * restricted to being any particular value.
- *
- * 2) The data in the block is not compressed:
- * This happens when compression doesn't reduce the size of the block
- * in clusters. I.e. if compression has a small effect so that the
- * compressed data still occupies X clusters, then the uncompressed data
- * is stored in the block.
- * This case is recognised by the fact that the runlist entry has
- * length = X and lcn >= 0. The mapping pairs array stores this as
- * normal with a run length of X and some specific delta_lcn, i.e.
- * delta_lcn has to be present.
- *
- * 3) The data in the block is compressed:
- * The common case. This case is recognised by the fact that the run
- * list entry has length L < X and lcn >= 0. The mapping pairs array
- * stores this as normal with a run length of X and some specific
- * delta_lcn, i.e. delta_lcn has to be present. This runlist entry is
- * immediately followed by a sparse entry with length = X - L and
- * lcn = -1. The latter entry is to make up the vcn counting to the
- * full compression block size X.
- *
- * In fact, life is more complicated because adjacent entries of the same type
- * can be coalesced. This means that one has to keep track of the number of
- * clusters handled and work on a basis of X clusters at a time being one
- * block. An example: if length L > X this means that this particular runlist
- * entry contains a block of length X and part of one or more blocks of length
- * L - X. Another example: if length L < X, this does not necessarily mean that
- * the block is compressed as it might be that the lcn changes inside the block
- * and hence the following runlist entry describes the continuation of the
- * potentially compressed block. The block would be compressed if the
- * following runlist entry describes at least X - L sparse clusters, thus
- * making up the compression block length as described in point 3 above. (Of
- * course, there can be several runlist entries with small lengths so that the
- * sparse entry does not follow the first data containing entry with
- * length < X.)
- *
- * NOTE: At the end of the compressed attribute value, there most likely is not
- * just the right amount of data to make up a compression block, thus this data
- * is not even attempted to be compressed. It is just stored as is, unless
- * the number of clusters it occupies is reduced when compressed in which case
- * it is stored as a compressed compression block, complete with sparse
- * clusters at the end.
- */
-
-/*
- * Flags of resident attributes (8-bit).
- */
-enum {
- RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index
- (has implications for deleting and
- modifying the attribute). */
-} __attribute__ ((__packed__));
-
-typedef u8 RESIDENT_ATTR_FLAGS;
-
-/*
- * Attribute record header. Always aligned to 8-byte boundary.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/ ATTR_TYPE type; /* The (32-bit) type of the attribute. */
-/* 4*/ le32 length; /* Byte size of the resident part of the
- attribute (aligned to 8-byte boundary).
- Used to get to the next attribute. */
-/* 8*/ u8 non_resident; /* If 0, attribute is resident.
- If 1, attribute is non-resident. */
-/* 9*/ u8 name_length; /* Unicode character size of name of attribute.
- 0 if unnamed. */
-/* 10*/ le16 name_offset; /* If name_length != 0, the byte offset to the
- beginning of the name from the attribute
- record. Note that the name is stored as a
- Unicode string. When creating, place offset
- just at the end of the record header. Then,
- follow with attribute value or mapping pairs
- array, resident and non-resident attributes
- respectively, aligning to an 8-byte
- boundary. */
-/* 12*/ ATTR_FLAGS flags; /* Flags describing the attribute. */
-/* 14*/ le16 instance; /* The instance of this attribute record. This
- number is unique within this mft record (see
- MFT_RECORD/next_attribute_instance notes in
- mft.h for more details). */
-/* 16*/ union {
- /* Resident attributes. */
- struct {
-/* 16 */ le32 value_length;/* Byte size of attribute value. */
-/* 20 */ le16 value_offset;/* Byte offset of the attribute
- value from the start of the
- attribute record. When creating,
- align to 8-byte boundary if we
- have a name present as this might
- not have a length of a multiple
- of 8-bytes. */
-/* 22 */ RESIDENT_ATTR_FLAGS flags; /* See above. */
-/* 23 */ s8 reserved; /* Reserved/alignment to 8-byte
- boundary. */
- } __attribute__ ((__packed__)) resident;
- /* Non-resident attributes. */
- struct {
-/* 16*/ leVCN lowest_vcn;/* Lowest valid virtual cluster number
- for this portion of the attribute value or
- 0 if this is the only extent (usually the
- case). - Only when an attribute list is used
- does lowest_vcn != 0 ever occur. */
-/* 24*/ leVCN highest_vcn;/* Highest valid vcn of this extent of
- the attribute value. - Usually there is only one
- portion, so this usually equals the attribute
- value size in clusters minus 1. Can be -1 for
- zero length files. Can be 0 for "single extent"
- attributes. */
-/* 32*/ le16 mapping_pairs_offset; /* Byte offset from the
- beginning of the structure to the mapping pairs
- array which contains the mappings between the
- vcns and the logical cluster numbers (lcns).
- When creating, place this at the end of this
- record header aligned to 8-byte boundary. */
-/* 34*/ u8 compression_unit; /* The compression unit expressed
- as the log to the base 2 of the number of
- clusters in a compression unit. 0 means not
- compressed. (This effectively limits the
- compression unit size to be a power of two
- clusters.) WinNT4 only uses a value of 4.
- Sparse files have this set to 0 on XPSP2. */
-/* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */
-/* The sizes below are only used when lowest_vcn is zero, as otherwise it would
- be difficult to keep them up-to-date.*/
-/* 40*/ sle64 allocated_size; /* Byte size of disk space
- allocated to hold the attribute value. Always
- is a multiple of the cluster size. When a file
- is compressed, this field is a multiple of the
- compression block size (2^compression_unit) and
- it represents the logically allocated space
- rather than the actual on disk usage. For this
- use the compressed_size (see below). */
-/* 48*/ sle64 data_size; /* Byte size of the attribute
- value. Can be larger than allocated_size if
- attribute value is compressed or sparse. */
-/* 56*/ sle64 initialized_size; /* Byte size of initialized
- portion of the attribute value. Usually equals
- data_size. */
-/* sizeof(uncompressed attr) = 64*/
-/* 64*/ sle64 compressed_size; /* Byte size of the attribute
- value after compression. Only present when
- compressed or sparse. Always is a multiple of
- the cluster size. Represents the actual amount
- of disk space being used on the disk. */
-/* sizeof(compressed attr) = 72*/
- } __attribute__ ((__packed__)) non_resident;
- } __attribute__ ((__packed__)) data;
-} __attribute__ ((__packed__)) ATTR_RECORD;
-
-typedef ATTR_RECORD ATTR_REC;
-
-/*
- * File attribute flags (32-bit) appearing in the file_attributes fields of the
- * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR
- * attributes of MFT_RECORDs and directory index entries.
- *
- * All of the below flags appear in the directory index entries but only some
- * appear in the STANDARD_INFORMATION attribute whilst only some others appear
- * in the FILENAME_ATTR attribute of MFT_RECORDs. Unless otherwise stated the
- * flags appear in all of the above.
- */
-enum {
- FILE_ATTR_READONLY = cpu_to_le32(0x00000001),
- FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002),
- FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004),
- /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */
-
- FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010),
- /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is
- reserved for the DOS SUBDIRECTORY flag. */
- FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020),
- FILE_ATTR_DEVICE = cpu_to_le32(0x00000040),
- FILE_ATTR_NORMAL = cpu_to_le32(0x00000080),
-
- FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100),
- FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200),
- FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400),
- FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800),
-
- FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000),
- FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000),
- FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000),
-
- FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7),
- /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
- FILE_ATTR_DEVICE and preserves everything else. This mask is used
- to obtain all flags that are valid for reading. */
- FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7),
- /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
- F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
- F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
- is used to obtain all flags that are valid for setting. */
- /*
- * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all
- * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
- * attribute of an mft record.
- */
- FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000),
- /* Note, this is a copy of the corresponding bit from the mft record,
- telling us whether this is a directory or not, i.e. whether it has
- an index root attribute or not. */
- FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000),
- /* Note, this is a copy of the corresponding bit from the mft record,
- telling us whether this file has a view index present (eg. object id
- index, quota index, one of the security indexes or the encrypting
- filesystem related indexes). */
-};
-
-typedef le32 FILE_ATTR_FLAGS;
-
-/*
- * NOTE on times in NTFS: All times are in MS standard time format, i.e. they
- * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00
- * universal coordinated time (UTC). (In Linux time starts 1st January 1970,
- * 00:00:00 UTC and is stored as the number of 1-second intervals since then.)
- */
-
-/*
- * Attribute: Standard information (0x10).
- *
- * NOTE: Always resident.
- * NOTE: Present in all base file records on a volume.
- * NOTE: There is conflicting information about the meaning of each of the time
- * fields but the meaning as defined below has been verified to be
- * correct by practical experimentation on Windows NT4 SP6a and is hence
- * assumed to be the one and only correct interpretation.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/ sle64 creation_time; /* Time file was created. Updated when
- a filename is changed(?). */
-/* 8*/ sle64 last_data_change_time; /* Time the data attribute was last
- modified. */
-/* 16*/ sle64 last_mft_change_time; /* Time this mft record was last
- modified. */
-/* 24*/ sle64 last_access_time; /* Approximate time when the file was
- last accessed (obviously this is not
- updated on read-only volumes). In
- Windows this is only updated when
- accessed if some time delta has
- passed since the last update. Also,
- last access time updates can be
- disabled altogether for speed. */
-/* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */
-/* 36*/ union {
- /* NTFS 1.2 */
- struct {
- /* 36*/ u8 reserved12[12]; /* Reserved/alignment to 8-byte
- boundary. */
- } __attribute__ ((__packed__)) v1;
- /* sizeof() = 48 bytes */
- /* NTFS 3.x */
- struct {
-/*
- * If a volume has been upgraded from a previous NTFS version, then these
- * fields are present only if the file has been accessed since the upgrade.
- * Recognize the difference by comparing the length of the resident attribute
- * value. If it is 48, then the following fields are missing. If it is 72 then
- * the fields are present. Maybe just check like this:
- * if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) {
- * Assume NTFS 1.2- format.
- * If (volume version is 3.x)
- * Upgrade attribute to NTFS 3.x format.
- * else
- * Use NTFS 1.2- format for access.
- * } else
- * Use NTFS 3.x format for access.
- * Only problem is that it might be legal to set the length of the value to
- * arbitrarily large values thus spoiling this check. - But chkdsk probably
- * views that as a corruption, assuming that it behaves like this for all
- * attributes.
- */
- /* 36*/ le32 maximum_versions; /* Maximum allowed versions for
- file. Zero if version numbering is disabled. */
- /* 40*/ le32 version_number; /* This file's version (if any).
- Set to zero if maximum_versions is zero. */
- /* 44*/ le32 class_id; /* Class id from bidirectional
- class id index (?). */
- /* 48*/ le32 owner_id; /* Owner_id of the user owning
- the file. Translate via $Q index in FILE_Extend
- /$Quota to the quota control entry for the user
- owning the file. Zero if quotas are disabled. */
- /* 52*/ le32 security_id; /* Security_id for the file.
- Translate via $SII index and $SDS data stream
- in FILE_Secure to the security descriptor. */
- /* 56*/ le64 quota_charged; /* Byte size of the charge to
- the quota for all streams of the file. Note: Is
- zero if quotas are disabled. */
- /* 64*/ leUSN usn; /* Last update sequence number
- of the file. This is a direct index into the
- transaction log file ($UsnJrnl). It is zero if
- the usn journal is disabled or this file has
- not been subject to logging yet. See usnjrnl.h
- for details. */
- } __attribute__ ((__packed__)) v3;
- /* sizeof() = 72 bytes (NTFS 3.x) */
- } __attribute__ ((__packed__)) ver;
-} __attribute__ ((__packed__)) STANDARD_INFORMATION;
-
-/*
- * Attribute: Attribute list (0x20).
- *
- * - Can be either resident or non-resident.
- * - Value consists of a sequence of variable length, 8-byte aligned,
- * ATTR_LIST_ENTRY records.
- * - The list is not terminated by anything at all! The only way to know when
- * the end is reached is to keep track of the current offset and compare it to
- * the attribute value size.
- * - The attribute list attribute contains one entry for each attribute of
- * the file in which the list is located, except for the list attribute
- * itself. The list is sorted: first by attribute type, second by attribute
- * name (if present), third by instance number. The extents of one
- * non-resident attribute (if present) immediately follow after the initial
- * extent. They are ordered by lowest_vcn and have their instace set to zero.
- * It is not allowed to have two attributes with all sorting keys equal.
- * - Further restrictions:
- * - If not resident, the vcn to lcn mapping array has to fit inside the
- * base mft record.
- * - The attribute list attribute value has a maximum size of 256kb. This
- * is imposed by the Windows cache manager.
- * - Attribute lists are only used when the attributes of mft record do not
- * fit inside the mft record despite all attributes (that can be made
- * non-resident) having been made non-resident. This can happen e.g. when:
- * - File has a large number of hard links (lots of file name
- * attributes present).
- * - The mapping pairs array of some non-resident attribute becomes so
- * large due to fragmentation that it overflows the mft record.
- * - The security descriptor is very complex (not applicable to
- * NTFS 3.0 volumes).
- * - There are many named streams.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/ ATTR_TYPE type; /* Type of referenced attribute. */
-/* 4*/ le16 length; /* Byte size of this entry (8-byte aligned). */
-/* 6*/ u8 name_length; /* Size in Unicode chars of the name of the
- attribute or 0 if unnamed. */
-/* 7*/ u8 name_offset; /* Byte offset to beginning of attribute name
- (always set this to where the name would
- start even if unnamed). */
-/* 8*/ leVCN lowest_vcn; /* Lowest virtual cluster number of this portion
- of the attribute value. This is usually 0. It
- is non-zero for the case where one attribute
- does not fit into one mft record and thus
- several mft records are allocated to hold
- this attribute. In the latter case, each mft
- record holds one extent of the attribute and
- there is one attribute list entry for each
- extent. NOTE: This is DEFINITELY a signed
- value! The windows driver uses cmp, followed
- by jg when comparing this, thus it treats it
- as signed. */
-/* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding
- the ATTR_RECORD for this portion of the
- attribute value. */
-/* 24*/ le16 instance; /* If lowest_vcn = 0, the instance of the
- attribute being referenced; otherwise 0. */
-/* 26*/ ntfschar name[0]; /* Use when creating only. When reading use
- name_offset to determine the location of the
- name. */
-/* sizeof() = 26 + (attribute_name_length * 2) bytes */
-} __attribute__ ((__packed__)) ATTR_LIST_ENTRY;
-
-/*
- * The maximum allowed length for a file name.
- */
-#define MAXIMUM_FILE_NAME_LENGTH 255
-
-/*
- * Possible namespaces for filenames in ntfs (8-bit).
- */
-enum {
- FILE_NAME_POSIX = 0x00,
- /* This is the largest namespace. It is case sensitive and allows all
- Unicode characters except for: '\0' and '/'. Beware that in
- WinNT/2k/2003 by default files which eg have the same name except
- for their case will not be distinguished by the standard utilities
- and thus a "del filename" will delete both "filename" and "fileName"
- without warning. However if for example Services For Unix (SFU) are
- installed and the case sensitive option was enabled at installation
- time, then you can create/access/delete such files.
- Note that even SFU places restrictions on the filenames beyond the
- '\0' and '/' and in particular the following set of characters is
- not allowed: '"', '/', '<', '>', '\'. All other characters,
- including the ones no allowed in WIN32 namespace are allowed.
- Tested with SFU 3.5 (this is now free) running on Windows XP. */
- FILE_NAME_WIN32 = 0x01,
- /* The standard WinNT/2k NTFS long filenames. Case insensitive. All
- Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\',
- and '|'. Further, names cannot end with a '.' or a space. */
- FILE_NAME_DOS = 0x02,
- /* The standard DOS filenames (8.3 format). Uppercase only. All 8-bit
- characters greater space, except: '"', '*', '+', ',', '/', ':', ';',
- '<', '=', '>', '?', and '\'. */
- FILE_NAME_WIN32_AND_DOS = 0x03,
- /* 3 means that both the Win32 and the DOS filenames are identical and
- hence have been saved in this single filename record. */
-} __attribute__ ((__packed__));
-
-typedef u8 FILE_NAME_TYPE_FLAGS;
-
-/*
- * Attribute: Filename (0x30).
- *
- * NOTE: Always resident.
- * NOTE: All fields, except the parent_directory, are only updated when the
- * filename is changed. Until then, they just become out of sync with
- * reality and the more up to date values are present in the standard
- * information attribute.
- * NOTE: There is conflicting information about the meaning of each of the time
- * fields but the meaning as defined below has been verified to be
- * correct by practical experimentation on Windows NT4 SP6a and is hence
- * assumed to be the one and only correct interpretation.
- */
-typedef struct {
-/*hex ofs*/
-/* 0*/ leMFT_REF parent_directory; /* Directory this filename is
- referenced from. */
-/* 8*/ sle64 creation_time; /* Time file was created. */
-/* 10*/ sle64 last_data_change_time; /* Time the data attribute was last
- modified. */
-/* 18*/ sle64 last_mft_change_time; /* Time this mft record was last
- modified. */
-/* 20*/ sle64 last_access_time; /* Time this mft record was last
- accessed. */
-/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space
- for the unnamed data attribute. So
- for normal $DATA, this is the
- allocated_size from the unnamed
- $DATA attribute and for compressed
- and/or sparse $DATA, this is the
- compressed_size from the unnamed
- $DATA attribute. For a directory or
- other inode without an unnamed $DATA
- attribute, this is always 0. NOTE:
- This is a multiple of the cluster
- size. */
-/* 30*/ sle64 data_size; /* Byte size of actual data in unnamed
- data attribute. For a directory or
- other inode without an unnamed $DATA
- attribute, this is always 0. */
-/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */
-/* 3c*/ union {
- /* 3c*/ struct {
- /* 3c*/ le16 packed_ea_size; /* Size of the buffer needed to
- pack the extended attributes
- (EAs), if such are present.*/
- /* 3e*/ le16 reserved; /* Reserved for alignment. */
- } __attribute__ ((__packed__)) ea;
- /* 3c*/ struct {
- /* 3c*/ le32 reparse_point_tag; /* Type of reparse point,
- present only in reparse
- points and only if there are
- no EAs. */
- } __attribute__ ((__packed__)) rp;
- } __attribute__ ((__packed__)) type;
-/* 40*/ u8 file_name_length; /* Length of file name in
- (Unicode) characters. */
-/* 41*/ FILE_NAME_TYPE_FLAGS file_name_type; /* Namespace of the file name.*/
-/* 42*/ ntfschar file_name[0]; /* File name in Unicode. */
-} __attribute__ ((__packed__)) FILE_NAME_ATTR;
-
-/*
- * GUID structures store globally unique identifiers (GUID). A GUID is a
- * 128-bit value consisting of one group of eight hexadecimal digits, followed
- * by three groups of four hexadecimal digits each, followed by one group of
- * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the
- * distributed computing environment (DCE) universally unique identifier (UUID).
- * Example of a GUID:
- * 1F010768-5A73-BC91-0010-A52216A7227B
- */
-typedef struct {
- le32 data1; /* The first eight hexadecimal digits of the GUID. */
- le16 data2; /* The first group of four hexadecimal digits. */
- le16 data3; /* The second group of four hexadecimal digits. */
- u8 data4[8]; /* The first two bytes are the third group of four
- hexadecimal digits. The remaining six bytes are the
- final 12 hexadecimal digits. */
-} __attribute__ ((__packed__)) GUID;
-
-/*
- * FILE_Extend/$ObjId contains an index named $O. This index contains all
- * object_ids present on the volume as the index keys and the corresponding
- * mft_record numbers as the index entry data parts. The data part (defined
- * below) also contains three other object_ids:
- * birth_volume_id - object_id of FILE_Volume on which the file was first
- * created. Optional (i.e. can be zero).
- * birth_object_id - object_id of file when it was first created. Usually
- * equals the object_id. Optional (i.e. can be zero).
- * domain_id - Reserved (always zero).
- */
-typedef struct {
- leMFT_REF mft_reference;/* Mft record containing the object_id in
- the index entry key. */
- union {
- struct {
- GUID birth_volume_id;
- GUID birth_object_id;
- GUID domain_id;
- } __attribute__ ((__packed__)) origin;
- u8 extended_info[48];
- } __attribute__ ((__packed__)) opt;
-} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA;
-
-/*
- * Attribute: Object id (NTFS 3.0+) (0x40).
- *
- * NOTE: Always resident.
- */
-typedef struct {
- GUID object_id; /* Unique id assigned to the
- file.*/
- /* The following fields are optional. The attribute value size is 16
- bytes, i.e. sizeof(GUID), if these are not present at all. Note,
- the entries can be present but one or more (or all) can be zero
- meaning that that particular value(s) is(are) not defined. */
- union {
- struct {
- GUID birth_volume_id; /* Unique id of volume on which
- the file was first created.*/
- GUID birth_object_id; /* Unique id of file when it was
- first created. */
- GUID domain_id; /* Reserved, zero. */
- } __attribute__ ((__packed__)) origin;
- u8 extended_info[48];
- } __attribute__ ((__packed__)) opt;
-} __attribute__ ((__packed__)) OBJECT_ID_ATTR;
-
-/*
- * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in
- * the SID structure (see below).
- */
-//typedef enum { /* SID string prefix. */
-// SECURITY_NULL_SID_AUTHORITY = {0, 0, 0, 0, 0, 0}, /* S-1-0 */
-// SECURITY_WORLD_SID_AUTHORITY = {0, 0, 0, 0, 0, 1}, /* S-1-1 */
-// SECURITY_LOCAL_SID_AUTHORITY = {0, 0, 0, 0, 0, 2}, /* S-1-2 */
-// SECURITY_CREATOR_SID_AUTHORITY = {0, 0, 0, 0, 0, 3}, /* S-1-3 */
-// SECURITY_NON_UNIQUE_AUTHORITY = {0, 0, 0, 0, 0, 4}, /* S-1-4 */
-// SECURITY_NT_SID_AUTHORITY = {0, 0, 0, 0, 0, 5}, /* S-1-5 */
-//} IDENTIFIER_AUTHORITIES;
-
-/*
- * These relative identifiers (RIDs) are used with the above identifier
- * authorities to make up universal well-known SIDs.
- *
- * Note: The relative identifier (RID) refers to the portion of a SID, which
- * identifies a user or group in relation to the authority that issued the SID.
- * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is
- * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and
- * the relative identifier SECURITY_CREATOR_OWNER_RID (0).
- */
-typedef enum { /* Identifier authority. */
- SECURITY_NULL_RID = 0, /* S-1-0 */
- SECURITY_WORLD_RID = 0, /* S-1-1 */
- SECURITY_LOCAL_RID = 0, /* S-1-2 */
-
- SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */
- SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */
-
- SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */
- SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */
-
- SECURITY_DIALUP_RID = 1,
- SECURITY_NETWORK_RID = 2,
- SECURITY_BATCH_RID = 3,
- SECURITY_INTERACTIVE_RID = 4,
- SECURITY_SERVICE_RID = 6,
- SECURITY_ANONYMOUS_LOGON_RID = 7,
- SECURITY_PROXY_RID = 8,
- SECURITY_ENTERPRISE_CONTROLLERS_RID=9,
- SECURITY_SERVER_LOGON_RID = 9,
- SECURITY_PRINCIPAL_SELF_RID = 0xa,
- SECURITY_AUTHENTICATED_USER_RID = 0xb,
- SECURITY_RESTRICTED_CODE_RID = 0xc,
- SECURITY_TERMINAL_SERVER_RID = 0xd,
-
- SECURITY_LOGON_IDS_RID = 5,
- SECURITY_LOGON_IDS_RID_COUNT = 3,
-
- SECURITY_LOCAL_SYSTEM_RID = 0x12,
-
- SECURITY_NT_NON_UNIQUE = 0x15,
-
- SECURITY_BUILTIN_DOMAIN_RID = 0x20,
-
- /*
- * Well-known domain relative sub-authority values (RIDs).
- */
-
- /* Users. */
- DOMAIN_USER_RID_ADMIN = 0x1f4,
- DOMAIN_USER_RID_GUEST = 0x1f5,
- DOMAIN_USER_RID_KRBTGT = 0x1f6,
-
- /* Groups. */
- DOMAIN_GROUP_RID_ADMINS = 0x200,
- DOMAIN_GROUP_RID_USERS = 0x201,
- DOMAIN_GROUP_RID_GUESTS = 0x202,
- DOMAIN_GROUP_RID_COMPUTERS = 0x203,
- DOMAIN_GROUP_RID_CONTROLLERS = 0x204,
- DOMAIN_GROUP_RID_CERT_ADMINS = 0x205,
- DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206,
- DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207,
- DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208,
-
- /* Aliases. */
- DOMAIN_ALIAS_RID_ADMINS = 0x220,
- DOMAIN_ALIAS_RID_USERS = 0x221,
- DOMAIN_ALIAS_RID_GUESTS = 0x222,
- DOMAIN_ALIAS_RID_POWER_USERS = 0x223,
-
- DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224,
- DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225,
- DOMAIN_ALIAS_RID_PRINT_OPS = 0x226,
- DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227,
-
- DOMAIN_ALIAS_RID_REPLICATOR = 0x228,
- DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229,
- DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a,
-} RELATIVE_IDENTIFIERS;
-
-/*
- * The universal well-known SIDs:
- *
- * NULL_SID S-1-0-0
- * WORLD_SID S-1-1-0
- * LOCAL_SID S-1-2-0
- * CREATOR_OWNER_SID S-1-3-0
- * CREATOR_GROUP_SID S-1-3-1
- * CREATOR_OWNER_SERVER_SID S-1-3-2
- * CREATOR_GROUP_SERVER_SID S-1-3-3
- *
- * (Non-unique IDs) S-1-4
- *
- * NT well-known SIDs:
- *
- * NT_AUTHORITY_SID S-1-5
- * DIALUP_SID S-1-5-1
- *
- * NETWORK_SID S-1-5-2
- * BATCH_SID S-1-5-3
- * INTERACTIVE_SID S-1-5-4
- * SERVICE_SID S-1-5-6
- * ANONYMOUS_LOGON_SID S-1-5-7 (aka null logon session)
- * PROXY_SID S-1-5-8
- * SERVER_LOGON_SID S-1-5-9 (aka domain controller account)
- * SELF_SID S-1-5-10 (self RID)
- * AUTHENTICATED_USER_SID S-1-5-11
- * RESTRICTED_CODE_SID S-1-5-12 (running restricted code)
- * TERMINAL_SERVER_SID S-1-5-13 (running on terminal server)
- *
- * (Logon IDs) S-1-5-5-X-Y
- *
- * (NT non-unique IDs) S-1-5-0x15-...
- *
- * (Built-in domain) S-1-5-0x20
- */
-
-/*
- * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure.
- *
- * NOTE: This is stored as a big endian number, hence the high_part comes
- * before the low_part.
- */
-typedef union {
- struct {
- u16 high_part; /* High 16-bits. */
- u32 low_part; /* Low 32-bits. */
- } __attribute__ ((__packed__)) parts;
- u8 value[6]; /* Value as individual bytes. */
-} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY;
-
-/*
- * The SID structure is a variable-length structure used to uniquely identify
- * users or groups. SID stands for security identifier.
- *
- * The standard textual representation of the SID is of the form:
- * S-R-I-S-S...
- * Where:
- * - The first "S" is the literal character 'S' identifying the following
- * digits as a SID.
- * - R is the revision level of the SID expressed as a sequence of digits
- * either in decimal or hexadecimal (if the latter, prefixed by "0x").
- * - I is the 48-bit identifier_authority, expressed as digits as R above.
- * - S... is one or more sub_authority values, expressed as digits as above.
- *
- * Example SID; the domain-relative SID of the local Administrators group on
- * Windows NT/2k:
- * S-1-5-32-544
- * This translates to a SID with:
- * revision = 1,
- * sub_authority_count = 2,
- * identifier_authority = {0,0,0,0,0,5}, // SECURITY_NT_AUTHORITY
- * sub_authority[0] = 32, // SECURITY_BUILTIN_DOMAIN_RID
- * sub_authority[1] = 544 // DOMAIN_ALIAS_RID_ADMINS
- */
-typedef struct {
- u8 revision;
- u8 sub_authority_count;
- SID_IDENTIFIER_AUTHORITY identifier_authority;
- le32 sub_authority[1]; /* At least one sub_authority. */
-} __attribute__ ((__packed__)) SID;
-
-/*
- * Current constants for SIDs.
- */
-typedef enum {
- SID_REVISION = 1, /* Current revision level. */
- SID_MAX_SUB_AUTHORITIES = 15, /* Maximum number of those. */
- SID_RECOMMENDED_SUB_AUTHORITIES = 1, /* Will change to around 6 in
- a future revision. */
-} SID_CONSTANTS;
-
-/*
- * The predefined ACE types (8-bit, see below).
- */
-enum {
- ACCESS_MIN_MS_ACE_TYPE = 0,
- ACCESS_ALLOWED_ACE_TYPE = 0,
- ACCESS_DENIED_ACE_TYPE = 1,
- SYSTEM_AUDIT_ACE_TYPE = 2,
- SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */
- ACCESS_MAX_MS_V2_ACE_TYPE = 3,
-
- ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4,
- ACCESS_MAX_MS_V3_ACE_TYPE = 4,
-
- /* The following are Win2k only. */
- ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5,
- ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5,
- ACCESS_DENIED_OBJECT_ACE_TYPE = 6,
- SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7,
- SYSTEM_ALARM_OBJECT_ACE_TYPE = 8,
- ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8,
-
- ACCESS_MAX_MS_V4_ACE_TYPE = 8,
-
- /* This one is for WinNT/2k. */
- ACCESS_MAX_MS_ACE_TYPE = 8,
-} __attribute__ ((__packed__));
-
-typedef u8 ACE_TYPES;
-
-/*
- * The ACE flags (8-bit) for audit and inheritance (see below).
- *
- * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE
- * types to indicate that a message is generated (in Windows!) for successful
- * accesses.
- *
- * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types
- * to indicate that a message is generated (in Windows!) for failed accesses.
- */
-enum {
- /* The inheritance flags. */
- OBJECT_INHERIT_ACE = 0x01,
- CONTAINER_INHERIT_ACE = 0x02,
- NO_PROPAGATE_INHERIT_ACE = 0x04,
- INHERIT_ONLY_ACE = 0x08,
- INHERITED_ACE = 0x10, /* Win2k only. */
- VALID_INHERIT_FLAGS = 0x1f,
-
- /* The audit flags. */
- SUCCESSFUL_ACCESS_ACE_FLAG = 0x40,
- FAILED_ACCESS_ACE_FLAG = 0x80,
-} __attribute__ ((__packed__));
-
-typedef u8 ACE_FLAGS;
-
-/*
- * An ACE is an access-control entry in an access-control list (ACL).
- * An ACE defines access to an object for a specific user or group or defines
- * the types of access that generate system-administration messages or alarms
- * for a specific user or group. The user or group is identified by a security
- * identifier (SID).
- *
- * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary),
- * which specifies the type and size of the ACE. The format of the subsequent
- * data depends on the ACE type.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/ ACE_TYPES type; /* Type of the ACE. */
-/* 1*/ ACE_FLAGS flags; /* Flags describing the ACE. */
-/* 2*/ le16 size; /* Size in bytes of the ACE. */
-} __attribute__ ((__packed__)) ACE_HEADER;
-
-/*
- * The access mask (32-bit). Defines the access rights.
- *
- * The specific rights (bits 0 to 15). These depend on the type of the object
- * being secured by the ACE.
- */
-enum {
- /* Specific rights for files and directories are as follows: */
-
- /* Right to read data from the file. (FILE) */
- FILE_READ_DATA = cpu_to_le32(0x00000001),
- /* Right to list contents of a directory. (DIRECTORY) */
- FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001),
-
- /* Right to write data to the file. (FILE) */
- FILE_WRITE_DATA = cpu_to_le32(0x00000002),
- /* Right to create a file in the directory. (DIRECTORY) */
- FILE_ADD_FILE = cpu_to_le32(0x00000002),
-
- /* Right to append data to the file. (FILE) */
- FILE_APPEND_DATA = cpu_to_le32(0x00000004),
- /* Right to create a subdirectory. (DIRECTORY) */
- FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004),
-
- /* Right to read extended attributes. (FILE/DIRECTORY) */
- FILE_READ_EA = cpu_to_le32(0x00000008),
-
- /* Right to write extended attributes. (FILE/DIRECTORY) */
- FILE_WRITE_EA = cpu_to_le32(0x00000010),
-
- /* Right to execute a file. (FILE) */
- FILE_EXECUTE = cpu_to_le32(0x00000020),
- /* Right to traverse the directory. (DIRECTORY) */
- FILE_TRAVERSE = cpu_to_le32(0x00000020),
-
- /*
- * Right to delete a directory and all the files it contains (its
- * children), even if the files are read-only. (DIRECTORY)
- */
- FILE_DELETE_CHILD = cpu_to_le32(0x00000040),
-
- /* Right to read file attributes. (FILE/DIRECTORY) */
- FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080),
-
- /* Right to change file attributes. (FILE/DIRECTORY) */
- FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100),
-
- /*
- * The standard rights (bits 16 to 23). These are independent of the
- * type of object being secured.
- */
-
- /* Right to delete the object. */
- DELETE = cpu_to_le32(0x00010000),
-
- /*
- * Right to read the information in the object's security descriptor,
- * not including the information in the SACL, i.e. right to read the
- * security descriptor and owner.
- */
- READ_CONTROL = cpu_to_le32(0x00020000),
-
- /* Right to modify the DACL in the object's security descriptor. */
- WRITE_DAC = cpu_to_le32(0x00040000),
-
- /* Right to change the owner in the object's security descriptor. */
- WRITE_OWNER = cpu_to_le32(0x00080000),
-
- /*
- * Right to use the object for synchronization. Enables a process to
- * wait until the object is in the signalled state. Some object types
- * do not support this access right.
- */
- SYNCHRONIZE = cpu_to_le32(0x00100000),
-
- /*
- * The following STANDARD_RIGHTS_* are combinations of the above for
- * convenience and are defined by the Win32 API.
- */
-
- /* These are currently defined to READ_CONTROL. */
- STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000),
- STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000),
- STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000),
-
- /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
- STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000),
-
- /*
- * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
- * SYNCHRONIZE access.
- */
- STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000),
-
- /*
- * The access system ACL and maximum allowed access types (bits 24 to
- * 25, bits 26 to 27 are reserved).
- */
- ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000),
- MAXIMUM_ALLOWED = cpu_to_le32(0x02000000),
-
- /*
- * The generic rights (bits 28 to 31). These map onto the standard and
- * specific rights.
- */
-
- /* Read, write, and execute access. */
- GENERIC_ALL = cpu_to_le32(0x10000000),
-
- /* Execute access. */
- GENERIC_EXECUTE = cpu_to_le32(0x20000000),
-
- /*
- * Write access. For files, this maps onto:
- * FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA |
- * FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE
- * For directories, the mapping has the same numerical value. See
- * above for the descriptions of the rights granted.
- */
- GENERIC_WRITE = cpu_to_le32(0x40000000),
-
- /*
- * Read access. For files, this maps onto:
- * FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA |
- * STANDARD_RIGHTS_READ | SYNCHRONIZE
- * For directories, the mapping has the same numerical value. See
- * above for the descriptions of the rights granted.
- */
- GENERIC_READ = cpu_to_le32(0x80000000),
-};
-
-typedef le32 ACCESS_MASK;
-
-/*
- * The generic mapping array. Used to denote the mapping of each generic
- * access right to a specific access mask.
- *
- * FIXME: What exactly is this and what is it for? (AIA)
- */
-typedef struct {
- ACCESS_MASK generic_read;
- ACCESS_MASK generic_write;
- ACCESS_MASK generic_execute;
- ACCESS_MASK generic_all;
-} __attribute__ ((__packed__)) GENERIC_MAPPING;
-
-/*
- * The predefined ACE type structures are as defined below.
- */
-
-/*
- * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE
- */
-typedef struct {
-/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
- ACE_TYPES type; /* Type of the ACE. */
- ACE_FLAGS flags; /* Flags describing the ACE. */
- le16 size; /* Size in bytes of the ACE. */
-/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */
-
-/* 8*/ SID sid; /* The SID associated with the ACE. */
-} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE,
- SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE;
-
-/*
- * The object ACE flags (32-bit).
- */
-enum {
- ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1),
- ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2),
-};
-
-typedef le32 OBJECT_ACE_FLAGS;
-
-typedef struct {
-/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
- ACE_TYPES type; /* Type of the ACE. */
- ACE_FLAGS flags; /* Flags describing the ACE. */
- le16 size; /* Size in bytes of the ACE. */
-/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */
-
-/* 8*/ OBJECT_ACE_FLAGS object_flags; /* Flags describing the object ACE. */
-/* 12*/ GUID object_type;
-/* 28*/ GUID inherited_object_type;
-
-/* 44*/ SID sid; /* The SID associated with the ACE. */
-} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE,
- ACCESS_DENIED_OBJECT_ACE,
- SYSTEM_AUDIT_OBJECT_ACE,
- SYSTEM_ALARM_OBJECT_ACE;
-
-/*
- * An ACL is an access-control list (ACL).
- * An ACL starts with an ACL header structure, which specifies the size of
- * the ACL and the number of ACEs it contains. The ACL header is followed by
- * zero or more access control entries (ACEs). The ACL as well as each ACE
- * are aligned on 4-byte boundaries.
- */
-typedef struct {
- u8 revision; /* Revision of this ACL. */
- u8 alignment1;
- le16 size; /* Allocated space in bytes for ACL. Includes this
- header, the ACEs and the remaining free space. */
- le16 ace_count; /* Number of ACEs in the ACL. */
- le16 alignment2;
-/* sizeof() = 8 bytes */
-} __attribute__ ((__packed__)) ACL;
-
-/*
- * Current constants for ACLs.
- */
-typedef enum {
- /* Current revision. */
- ACL_REVISION = 2,
- ACL_REVISION_DS = 4,
-
- /* History of revisions. */
- ACL_REVISION1 = 1,
- MIN_ACL_REVISION = 2,
- ACL_REVISION2 = 2,
- ACL_REVISION3 = 3,
- ACL_REVISION4 = 4,
- MAX_ACL_REVISION = 4,
-} ACL_CONSTANTS;
-
-/*
- * The security descriptor control flags (16-bit).
- *
- * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID
- * pointed to by the Owner field was provided by a defaulting mechanism
- * rather than explicitly provided by the original provider of the
- * security descriptor. This may affect the treatment of the SID with
- * respect to inheritance of an owner.
- *
- * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in
- * the Group field was provided by a defaulting mechanism rather than
- * explicitly provided by the original provider of the security
- * descriptor. This may affect the treatment of the SID with respect to
- * inheritance of a primary group.
- *
- * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security
- * descriptor contains a discretionary ACL. If this flag is set and the
- * Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is
- * explicitly being specified.
- *
- * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
- * pointed to by the Dacl field was provided by a defaulting mechanism
- * rather than explicitly provided by the original provider of the
- * security descriptor. This may affect the treatment of the ACL with
- * respect to inheritance of an ACL. This flag is ignored if the
- * DaclPresent flag is not set.
- *
- * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security
- * descriptor contains a system ACL pointed to by the Sacl field. If this
- * flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then
- * an empty (but present) ACL is being specified.
- *
- * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
- * pointed to by the Sacl field was provided by a defaulting mechanism
- * rather than explicitly provided by the original provider of the
- * security descriptor. This may affect the treatment of the ACL with
- * respect to inheritance of an ACL. This flag is ignored if the
- * SaclPresent flag is not set.
- *
- * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security
- * descriptor is in self-relative form. In this form, all fields of the
- * security descriptor are contiguous in memory and all pointer fields are
- * expressed as offsets from the beginning of the security descriptor.
- */
-enum {
- SE_OWNER_DEFAULTED = cpu_to_le16(0x0001),
- SE_GROUP_DEFAULTED = cpu_to_le16(0x0002),
- SE_DACL_PRESENT = cpu_to_le16(0x0004),
- SE_DACL_DEFAULTED = cpu_to_le16(0x0008),
-
- SE_SACL_PRESENT = cpu_to_le16(0x0010),
- SE_SACL_DEFAULTED = cpu_to_le16(0x0020),
-
- SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100),
- SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200),
- SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400),
- SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800),
-
- SE_DACL_PROTECTED = cpu_to_le16(0x1000),
- SE_SACL_PROTECTED = cpu_to_le16(0x2000),
- SE_RM_CONTROL_VALID = cpu_to_le16(0x4000),
- SE_SELF_RELATIVE = cpu_to_le16(0x8000)
-} __attribute__ ((__packed__));
-
-typedef le16 SECURITY_DESCRIPTOR_CONTROL;
-
-/*
- * Self-relative security descriptor. Contains the owner and group SIDs as well
- * as the sacl and dacl ACLs inside the security descriptor itself.
- */
-typedef struct {
- u8 revision; /* Revision level of the security descriptor. */
- u8 alignment;
- SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of
- the descriptor as well as the following fields. */
- le32 owner; /* Byte offset to a SID representing an object's
- owner. If this is NULL, no owner SID is present in
- the descriptor. */
- le32 group; /* Byte offset to a SID representing an object's
- primary group. If this is NULL, no primary group
- SID is present in the descriptor. */
- le32 sacl; /* Byte offset to a system ACL. Only valid, if
- SE_SACL_PRESENT is set in the control field. If
- SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
- is specified. */
- le32 dacl; /* Byte offset to a discretionary ACL. Only valid, if
- SE_DACL_PRESENT is set in the control field. If
- SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
- (unconditionally granting access) is specified. */
-/* sizeof() = 0x14 bytes */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE;
-
-/*
- * Absolute security descriptor. Does not contain the owner and group SIDs, nor
- * the sacl and dacl ACLs inside the security descriptor. Instead, it contains
- * pointers to these structures in memory. Obviously, absolute security
- * descriptors are only useful for in memory representations of security
- * descriptors. On disk, a self-relative security descriptor is used.
- */
-typedef struct {
- u8 revision; /* Revision level of the security descriptor. */
- u8 alignment;
- SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of
- the descriptor as well as the following fields. */
- SID *owner; /* Points to a SID representing an object's owner. If
- this is NULL, no owner SID is present in the
- descriptor. */
- SID *group; /* Points to a SID representing an object's primary
- group. If this is NULL, no primary group SID is
- present in the descriptor. */
- ACL *sacl; /* Points to a system ACL. Only valid, if
- SE_SACL_PRESENT is set in the control field. If
- SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
- is specified. */
- ACL *dacl; /* Points to a discretionary ACL. Only valid, if
- SE_DACL_PRESENT is set in the control field. If
- SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
- (unconditionally granting access) is specified. */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR;
-
-/*
- * Current constants for security descriptors.
- */
-typedef enum {
- /* Current revision. */
- SECURITY_DESCRIPTOR_REVISION = 1,
- SECURITY_DESCRIPTOR_REVISION1 = 1,
-
- /* The sizes of both the absolute and relative security descriptors are
- the same, as pointers, at least on the ia32 architecture, are 32-bit. */
- SECURITY_DESCRIPTOR_MIN_LENGTH = sizeof(SECURITY_DESCRIPTOR),
-} SECURITY_DESCRIPTOR_CONSTANTS;
-
-/*
- * Attribute: Security descriptor (0x50). A standard self-relative security
- * descriptor.
- *
- * NOTE: Can be resident or non-resident.
- * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally
- * in FILE_Secure and the correct descriptor is found using the security_id
- * from the standard information attribute.
- */
-typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR;
-
-/*
- * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one
- * referenced instance of each unique security descriptor is stored.
- *
- * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It
- * does, however, contain two indexes ($SDH and $SII) as well as a named data
- * stream ($SDS).
- *
- * Every unique security descriptor is assigned a unique security identifier
- * (security_id, not to be confused with a SID). The security_id is unique for
- * the NTFS volume and is used as an index into the $SII index, which maps
- * security_ids to the security descriptor's storage location within the $SDS
- * data attribute. The $SII index is sorted by ascending security_id.
- *
- * A simple hash is computed from each security descriptor. This hash is used
- * as an index into the $SDH index, which maps security descriptor hashes to
- * the security descriptor's storage location within the $SDS data attribute.
- * The $SDH index is sorted by security descriptor hash and is stored in a B+
- * tree. When searching $SDH (with the intent of determining whether or not a
- * new security descriptor is already present in the $SDS data stream), if a
- * matching hash is found, but the security descriptors do not match, the
- * search in the $SDH index is continued, searching for a next matching hash.
- *
- * When a precise match is found, the security_id corresponding to the security
- * descriptor in the $SDS attribute is read from the found $SDH index entry and
- * is stored in the $STANDARD_INFORMATION attribute of the file/directory to
- * which the security descriptor is being applied. The $STANDARD_INFORMATION
- * attribute is present in all base mft records (i.e. in all files and
- * directories).
- *
- * If a match is not found, the security descriptor is assigned a new unique
- * security_id and is added to the $SDS data attribute. Then, entries
- * referencing this security descriptor in the $SDS data attribute are
- * added to the $SDH and $SII indexes.
- *
- * Note: Entries are never deleted from FILE_Secure, even if nothing
- * references an entry any more.
- */
-
-/*
- * This header precedes each security descriptor in the $SDS data stream.
- * This is also the index entry data part of both the $SII and $SDH indexes.
- */
-typedef struct {
- le32 hash; /* Hash of the security descriptor. */
- le32 security_id; /* The security_id assigned to the descriptor. */
- le64 offset; /* Byte offset of this entry in the $SDS stream. */
- le32 length; /* Size in bytes of this entry in $SDS stream. */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER;
-
-/*
- * The $SDS data stream contains the security descriptors, aligned on 16-byte
- * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot
- * cross 256kib boundaries (this restriction is imposed by the Windows cache
- * manager). Each security descriptor is contained in a SDS_ENTRY structure.
- * Also, each security descriptor is stored twice in the $SDS stream with a
- * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size)
- * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the
- * first copy of the security descriptor will be at offset 0x51d0 in the
- * $SDS data stream and the second copy will be at offset 0x451d0.
- */
-typedef struct {
-/*Ofs*/
-/* 0 SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like
- unnamed structs. */
- le32 hash; /* Hash of the security descriptor. */
- le32 security_id; /* The security_id assigned to the descriptor. */
- le64 offset; /* Byte offset of this entry in the $SDS stream. */
- le32 length; /* Size in bytes of this entry in $SDS stream. */
-/* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security
- descriptor. */
-} __attribute__ ((__packed__)) SDS_ENTRY;
-
-/*
- * The index entry key used in the $SII index. The collation type is
- * COLLATION_NTOFS_ULONG.
- */
-typedef struct {
- le32 security_id; /* The security_id assigned to the descriptor. */
-} __attribute__ ((__packed__)) SII_INDEX_KEY;
-
-/*
- * The index entry key used in the $SDH index. The keys are sorted first by
- * hash and then by security_id. The collation rule is
- * COLLATION_NTOFS_SECURITY_HASH.
- */
-typedef struct {
- le32 hash; /* Hash of the security descriptor. */
- le32 security_id; /* The security_id assigned to the descriptor. */
-} __attribute__ ((__packed__)) SDH_INDEX_KEY;
-
-/*
- * Attribute: Volume name (0x60).
- *
- * NOTE: Always resident.
- * NOTE: Present only in FILE_Volume.
- */
-typedef struct {
- ntfschar name[0]; /* The name of the volume in Unicode. */
-} __attribute__ ((__packed__)) VOLUME_NAME;
-
-/*
- * Possible flags for the volume (16-bit).
- */
-enum {
- VOLUME_IS_DIRTY = cpu_to_le16(0x0001),
- VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002),
- VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004),
- VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008),
-
- VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010),
- VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020),
-
- VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000),
- VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000),
-
- VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f),
-
- /* To make our life easier when checking if we must mount read-only. */
- VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027),
-} __attribute__ ((__packed__));
-
-typedef le16 VOLUME_FLAGS;
-
-/*
- * Attribute: Volume information (0x70).
- *
- * NOTE: Always resident.
- * NOTE: Present only in FILE_Volume.
- * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses
- * NTFS 1.2. I haven't personally seen other values yet.
- */
-typedef struct {
- le64 reserved; /* Not used (yet?). */
- u8 major_ver; /* Major version of the ntfs format. */
- u8 minor_ver; /* Minor version of the ntfs format. */
- VOLUME_FLAGS flags; /* Bit array of VOLUME_* flags. */
-} __attribute__ ((__packed__)) VOLUME_INFORMATION;
-
-/*
- * Attribute: Data attribute (0x80).
- *
- * NOTE: Can be resident or non-resident.
- *
- * Data contents of a file (i.e. the unnamed stream) or of a named stream.
- */
-typedef struct {
- u8 data[0]; /* The file's data contents. */
-} __attribute__ ((__packed__)) DATA_ATTR;
-
-/*
- * Index header flags (8-bit).
- */
-enum {
- /*
- * When index header is in an index root attribute:
- */
- SMALL_INDEX = 0, /* The index is small enough to fit inside the index
- root attribute and there is no index allocation
- attribute present. */
- LARGE_INDEX = 1, /* The index is too large to fit in the index root
- attribute and/or an index allocation attribute is
- present. */
- /*
- * When index header is in an index block, i.e. is part of index
- * allocation attribute:
- */
- LEAF_NODE = 0, /* This is a leaf node, i.e. there are no more nodes
- branching off it. */
- INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf
- node. */
- NODE_MASK = 1, /* Mask for accessing the *_NODE bits. */
-} __attribute__ ((__packed__));
-
-typedef u8 INDEX_HEADER_FLAGS;
-
-/*
- * This is the header for indexes, describing the INDEX_ENTRY records, which
- * follow the INDEX_HEADER. Together the index header and the index entries
- * make up a complete index.
- *
- * IMPORTANT NOTE: The offset, length and size structure members are counted
- * relative to the start of the index header structure and not relative to the
- * start of the index root or index allocation structures themselves.
- */
-typedef struct {
- le32 entries_offset; /* Byte offset to first INDEX_ENTRY
- aligned to 8-byte boundary. */
- le32 index_length; /* Data size of the index in bytes,
- i.e. bytes used from allocated
- size, aligned to 8-byte boundary. */
- le32 allocated_size; /* Byte size of this index (block),
- multiple of 8 bytes. */
- /* NOTE: For the index root attribute, the above two numbers are always
- equal, as the attribute is resident and it is resized as needed. In
- the case of the index allocation attribute the attribute is not
- resident and hence the allocated_size is a fixed value and must
- equal the index_block_size specified by the INDEX_ROOT attribute
- corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK
- belongs to. */
- INDEX_HEADER_FLAGS flags; /* Bit field of INDEX_HEADER_FLAGS. */
- u8 reserved[3]; /* Reserved/align to 8-byte boundary. */
-} __attribute__ ((__packed__)) INDEX_HEADER;
-
-/*
- * Attribute: Index root (0x90).
- *
- * NOTE: Always resident.
- *
- * This is followed by a sequence of index entries (INDEX_ENTRY structures)
- * as described by the index header.
- *
- * When a directory is small enough to fit inside the index root then this
- * is the only attribute describing the directory. When the directory is too
- * large to fit in the index root, on the other hand, two additional attributes
- * are present: an index allocation attribute, containing sub-nodes of the B+
- * directory tree (see below), and a bitmap attribute, describing which virtual
- * cluster numbers (vcns) in the index allocation attribute are in use by an
- * index block.
- *
- * NOTE: The root directory (FILE_root) contains an entry for itself. Other
- * directories do not contain entries for themselves, though.
- */
-typedef struct {
- ATTR_TYPE type; /* Type of the indexed attribute. Is
- $FILE_NAME for directories, zero
- for view indexes. No other values
- allowed. */
- COLLATION_RULE collation_rule; /* Collation rule used to sort the
- index entries. If type is $FILE_NAME,
- this must be COLLATION_FILE_NAME. */
- le32 index_block_size; /* Size of each index block in bytes (in
- the index allocation attribute). */
- u8 clusters_per_index_block; /* Cluster size of each index block (in
- the index allocation attribute), when
- an index block is >= than a cluster,
- otherwise this will be the log of
- the size (like how the encoding of
- the mft record size and the index
- record size found in the boot sector
- work). Has to be a power of 2. */
- u8 reserved[3]; /* Reserved/align to 8-byte boundary. */
- INDEX_HEADER index; /* Index header describing the
- following index entries. */
-} __attribute__ ((__packed__)) INDEX_ROOT;
-
-/*
- * Attribute: Index allocation (0xa0).
- *
- * NOTE: Always non-resident (doesn't make sense to be resident anyway!).
- *
- * This is an array of index blocks. Each index block starts with an
- * INDEX_BLOCK structure containing an index header, followed by a sequence of
- * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER.
- */
-typedef struct {
-/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
- NTFS_RECORD_TYPE magic; /* Magic is "INDX". */
- le16 usa_ofs; /* See NTFS_RECORD definition. */
- le16 usa_count; /* See NTFS_RECORD definition. */
-
-/* 8*/ sle64 lsn; /* $LogFile sequence number of the last
- modification of this index block. */
-/* 16*/ leVCN index_block_vcn; /* Virtual cluster number of the index block.
- If the cluster_size on the volume is <= the
- index_block_size of the directory,
- index_block_vcn counts in units of clusters,
- and in units of sectors otherwise. */
-/* 24*/ INDEX_HEADER index; /* Describes the following index entries. */
-/* sizeof()= 40 (0x28) bytes */
-/*
- * When creating the index block, we place the update sequence array at this
- * offset, i.e. before we start with the index entries. This also makes sense,
- * otherwise we could run into problems with the update sequence array
- * containing in itself the last two bytes of a sector which would mean that
- * multi sector transfer protection wouldn't work. As you can't protect data
- * by overwriting it since you then can't get it back...
- * When reading use the data from the ntfs record header.
- */
-} __attribute__ ((__packed__)) INDEX_BLOCK;
-
-typedef INDEX_BLOCK INDEX_ALLOCATION;
-
-/*
- * The system file FILE_Extend/$Reparse contains an index named $R listing
- * all reparse points on the volume. The index entry keys are as defined
- * below. Note, that there is no index data associated with the index entries.
- *
- * The index entries are sorted by the index key file_id. The collation rule is
- * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the
- * primary key / is not a key at all. (AIA)
- */
-typedef struct {
- le32 reparse_tag; /* Reparse point type (inc. flags). */
- leMFT_REF file_id; /* Mft record of the file containing the
- reparse point attribute. */
-} __attribute__ ((__packed__)) REPARSE_INDEX_KEY;
-
-/*
- * Quota flags (32-bit).
- *
- * The user quota flags. Names explain meaning.
- */
-enum {
- QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001),
- QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002),
- QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004),
-
- QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007),
- /* This is a bit mask for the user quota flags. */
-
- /*
- * These flags are only present in the quota defaults index entry, i.e.
- * in the entry where owner_id = QUOTA_DEFAULTS_ID.
- */
- QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010),
- QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020),
- QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040),
- QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080),
-
- QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100),
- QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200),
- QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400),
- QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800),
-};
-
-typedef le32 QUOTA_FLAGS;
-
-/*
- * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas
- * are on a per volume and per user basis.
- *
- * The $Q index contains one entry for each existing user_id on the volume. The
- * index key is the user_id of the user/group owning this quota control entry,
- * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the
- * owner_id, is found in the standard information attribute. The collation rule
- * for $Q is COLLATION_NTOFS_ULONG.
- *
- * The $O index contains one entry for each user/group who has been assigned
- * a quota on that volume. The index key holds the SID of the user_id the
- * entry belongs to, i.e. the owner_id. The collation rule for $O is
- * COLLATION_NTOFS_SID.
- *
- * The $O index entry data is the user_id of the user corresponding to the SID.
- * This user_id is used as an index into $Q to find the quota control entry
- * associated with the SID.
- *
- * The $Q index entry data is the quota control entry and is defined below.
- */
-typedef struct {
- le32 version; /* Currently equals 2. */
- QUOTA_FLAGS flags; /* Flags describing this quota entry. */
- le64 bytes_used; /* How many bytes of the quota are in use. */
- sle64 change_time; /* Last time this quota entry was changed. */
- sle64 threshold; /* Soft quota (-1 if not limited). */
- sle64 limit; /* Hard quota (-1 if not limited). */
- sle64 exceeded_time; /* How long the soft quota has been exceeded. */
- SID sid; /* The SID of the user/object associated with
- this quota entry. Equals zero for the quota
- defaults entry (and in fact on a WinXP
- volume, it is not present at all). */
-} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY;
-
-/*
- * Predefined owner_id values (32-bit).
- */
-enum {
- QUOTA_INVALID_ID = cpu_to_le32(0x00000000),
- QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001),
- QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100),
-};
-
-/*
- * Current constants for quota control entries.
- */
-typedef enum {
- /* Current version. */
- QUOTA_VERSION = 2,
-} QUOTA_CONTROL_ENTRY_CONSTANTS;
-
-/*
- * Index entry flags (16-bit).
- */
-enum {
- INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
- sub-node, i.e. a reference to an index block in form of
- a virtual cluster number (see below). */
- INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last
- entry in an index block. The index entry does not
- represent a file but it can point to a sub-node. */
-
- INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
- enum bit width to 16-bit. */
-} __attribute__ ((__packed__));
-
-typedef le16 INDEX_ENTRY_FLAGS;
-
-/*
- * This the index entry header (see below).
- */
-typedef struct {
-/* 0*/ union {
- struct { /* Only valid when INDEX_ENTRY_END is not set. */
- leMFT_REF indexed_file; /* The mft reference of the file
- described by this index
- entry. Used for directory
- indexes. */
- } __attribute__ ((__packed__)) dir;
- struct { /* Used for views/indexes to find the entry's data. */
- le16 data_offset; /* Data byte offset from this
- INDEX_ENTRY. Follows the
- index key. */
- le16 data_length; /* Data length in bytes. */
- le32 reservedV; /* Reserved (zero). */
- } __attribute__ ((__packed__)) vi;
- } __attribute__ ((__packed__)) data;
-/* 8*/ le16 length; /* Byte size of this index entry, multiple of
- 8-bytes. */
-/* 10*/ le16 key_length; /* Byte size of the key value, which is in the
- index entry. It follows field reserved. Not
- multiple of 8-bytes. */
-/* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
-/* 14*/ le16 reserved; /* Reserved/align to 8-byte boundary. */
-/* sizeof() = 16 bytes */
-} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER;
-
-/*
- * This is an index entry. A sequence of such entries follows each INDEX_HEADER
- * structure. Together they make up a complete index. The index follows either
- * an index root attribute or an index allocation attribute.
- *
- * NOTE: Before NTFS 3.0 only filename attributes were indexed.
- */
-typedef struct {
-/*Ofs*/
-/* 0 INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */
- union {
- struct { /* Only valid when INDEX_ENTRY_END is not set. */
- leMFT_REF indexed_file; /* The mft reference of the file
- described by this index
- entry. Used for directory
- indexes. */
- } __attribute__ ((__packed__)) dir;
- struct { /* Used for views/indexes to find the entry's data. */
- le16 data_offset; /* Data byte offset from this
- INDEX_ENTRY. Follows the
- index key. */
- le16 data_length; /* Data length in bytes. */
- le32 reservedV; /* Reserved (zero). */
- } __attribute__ ((__packed__)) vi;
- } __attribute__ ((__packed__)) data;
- le16 length; /* Byte size of this index entry, multiple of
- 8-bytes. */
- le16 key_length; /* Byte size of the key value, which is in the
- index entry. It follows field reserved. Not
- multiple of 8-bytes. */
- INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
- le16 reserved; /* Reserved/align to 8-byte boundary. */
-
-/* 16*/ union { /* The key of the indexed attribute. NOTE: Only present
- if INDEX_ENTRY_END bit in flags is not set. NOTE: On
- NTFS versions before 3.0 the only valid key is the
- FILE_NAME_ATTR. On NTFS 3.0+ the following
- additional index keys are defined: */
- FILE_NAME_ATTR file_name;/* $I30 index in directories. */
- SII_INDEX_KEY sii; /* $SII index in $Secure. */
- SDH_INDEX_KEY sdh; /* $SDH index in $Secure. */
- GUID object_id; /* $O index in FILE_Extend/$ObjId: The
- object_id of the mft record found in
- the data part of the index. */
- REPARSE_INDEX_KEY reparse; /* $R index in
- FILE_Extend/$Reparse. */
- SID sid; /* $O index in FILE_Extend/$Quota:
- SID of the owner of the user_id. */
- le32 owner_id; /* $Q index in FILE_Extend/$Quota:
- user_id of the owner of the quota
- control entry in the data part of
- the index. */
- } __attribute__ ((__packed__)) key;
- /* The (optional) index data is inserted here when creating. */
- // leVCN vcn; /* If INDEX_ENTRY_NODE bit in flags is set, the last
- // eight bytes of this index entry contain the virtual
- // cluster number of the index block that holds the
- // entries immediately preceding the current entry (the
- // vcn references the corresponding cluster in the data
- // of the non-resident index allocation attribute). If
- // the key_length is zero, then the vcn immediately
- // follows the INDEX_ENTRY_HEADER. Regardless of
- // key_length, the address of the 8-byte boundary
- // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by
- // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN),
- // where sizeof(VCN) can be hardcoded as 8 if wanted. */
-} __attribute__ ((__packed__)) INDEX_ENTRY;
-
-/*
- * Attribute: Bitmap (0xb0).
- *
- * Contains an array of bits (aka a bitfield).
- *
- * When used in conjunction with the index allocation attribute, each bit
- * corresponds to one index block within the index allocation attribute. Thus
- * the number of bits in the bitmap * index block size / cluster size is the
- * number of clusters in the index allocation attribute.
- */
-typedef struct {
- u8 bitmap[0]; /* Array of bits. */
-} __attribute__ ((__packed__)) BITMAP_ATTR;
-
-/*
- * The reparse point tag defines the type of the reparse point. It also
- * includes several flags, which further describe the reparse point.
- *
- * The reparse point tag is an unsigned 32-bit value divided in three parts:
- *
- * 1. The least significant 16 bits (i.e. bits 0 to 15) specifiy the type of
- * the reparse point.
- * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use.
- * 3. The most significant three bits are flags describing the reparse point.
- * They are defined as follows:
- * bit 29: Name surrogate bit. If set, the filename is an alias for
- * another object in the system.
- * bit 30: High-latency bit. If set, accessing the first byte of data will
- * be slow. (E.g. the data is stored on a tape drive.)
- * bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User
- * defined tags have to use zero here.
- *
- * These are the predefined reparse point tags:
- */
-enum {
- IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000),
- IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000),
- IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000),
-
- IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000),
- IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001),
- IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001),
-
- IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005),
- IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006),
- IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007),
- IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008),
-
- IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003),
-
- IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004),
-
- IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000),
-
- IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff),
-};
-
-/*
- * Attribute: Reparse point (0xc0).
- *
- * NOTE: Can be resident or non-resident.
- */
-typedef struct {
- le32 reparse_tag; /* Reparse point type (inc. flags). */
- le16 reparse_data_length; /* Byte size of reparse data. */
- le16 reserved; /* Align to 8-byte boundary. */
- u8 reparse_data[0]; /* Meaning depends on reparse_tag. */
-} __attribute__ ((__packed__)) REPARSE_POINT;
-
-/*
- * Attribute: Extended attribute (EA) information (0xd0).
- *
- * NOTE: Always resident. (Is this true???)
- */
-typedef struct {
- le16 ea_length; /* Byte size of the packed extended
- attributes. */
- le16 need_ea_count; /* The number of extended attributes which have
- the NEED_EA bit set. */
- le32 ea_query_length; /* Byte size of the buffer required to query
- the extended attributes when calling
- ZwQueryEaFile() in Windows NT/2k. I.e. the
- byte size of the unpacked extended
- attributes. */
-} __attribute__ ((__packed__)) EA_INFORMATION;
-
-/*
- * Extended attribute flags (8-bit).
- */
-enum {
- NEED_EA = 0x80 /* If set the file to which the EA belongs
- cannot be interpreted without understanding
- the associates extended attributes. */
-} __attribute__ ((__packed__));
-
-typedef u8 EA_FLAGS;
-
-/*
- * Attribute: Extended attribute (EA) (0xe0).
- *
- * NOTE: Can be resident or non-resident.
- *
- * Like the attribute list and the index buffer list, the EA attribute value is
- * a sequence of EA_ATTR variable length records.
- */
-typedef struct {
- le32 next_entry_offset; /* Offset to the next EA_ATTR. */
- EA_FLAGS flags; /* Flags describing the EA. */
- u8 ea_name_length; /* Length of the name of the EA in bytes
- excluding the '\0' byte terminator. */
- le16 ea_value_length; /* Byte size of the EA's value. */
- u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not
- Unicode and it is zero terminated. */
- u8 ea_value[0]; /* The value of the EA. Immediately follows
- the name. */
-} __attribute__ ((__packed__)) EA_ATTR;
-
-/*
- * Attribute: Property set (0xf0).
- *
- * Intended to support Native Structure Storage (NSS) - a feature removed from
- * NTFS 3.0 during beta testing.
- */
-typedef struct {
- /* Irrelevant as feature unused. */
-} __attribute__ ((__packed__)) PROPERTY_SET;
-
-/*
- * Attribute: Logged utility stream (0x100).
- *
- * NOTE: Can be resident or non-resident.
- *
- * Operations on this attribute are logged to the journal ($LogFile) like
- * normal metadata changes.
- *
- * Used by the Encrypting File System (EFS). All encrypted files have this
- * attribute with the name $EFS.
- */
-typedef struct {
- /* Can be anything the creator chooses. */
- /* EFS uses it as follows: */
- // FIXME: Type this info, verifying it along the way. (AIA)
-} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR;
-
-#endif /* _LINUX_NTFS_LAYOUT_H */
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
deleted file mode 100644
index eda9972e6159..000000000000
--- a/fs/ntfs/lcnalloc.c
+++ /dev/null
@@ -1,1000 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * lcnalloc.c - Cluster (de)allocation code. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/pagemap.h>
-
-#include "lcnalloc.h"
-#include "debug.h"
-#include "bitmap.h"
-#include "inode.h"
-#include "volume.h"
-#include "attrib.h"
-#include "malloc.h"
-#include "aops.h"
-#include "ntfs.h"
-
-/**
- * ntfs_cluster_free_from_rl_nolock - free clusters from runlist
- * @vol: mounted ntfs volume on which to free the clusters
- * @rl: runlist describing the clusters to free
- *
- * Free all the clusters described by the runlist @rl on the volume @vol. In
- * the case of an error being returned, at least some of the clusters were not
- * freed.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - The volume lcn bitmap must be locked for writing on entry and is
- * left locked on return.
- */
-int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
- const runlist_element *rl)
-{
- struct inode *lcnbmp_vi = vol->lcnbmp_ino;
- int ret = 0;
-
- ntfs_debug("Entering.");
- if (!rl)
- return 0;
- for (; rl->length; rl++) {
- int err;
-
- if (rl->lcn < 0)
- continue;
- err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length);
- if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err))
- ret = err;
- }
- ntfs_debug("Done.");
- return ret;
-}
-
-/**
- * ntfs_cluster_alloc - allocate clusters on an ntfs volume
- * @vol: mounted ntfs volume on which to allocate the clusters
- * @start_vcn: vcn to use for the first allocated cluster
- * @count: number of clusters to allocate
- * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none)
- * @zone: zone from which to allocate the clusters
- * @is_extension: if 'true', this is an attribute extension
- *
- * Allocate @count clusters preferably starting at cluster @start_lcn or at the
- * current allocator position if @start_lcn is -1, on the mounted ntfs volume
- * @vol. @zone is either DATA_ZONE for allocation of normal clusters or
- * MFT_ZONE for allocation of clusters for the master file table, i.e. the
- * $MFT/$DATA attribute.
- *
- * @start_vcn specifies the vcn of the first allocated cluster. This makes
- * merging the resulting runlist with the old runlist easier.
- *
- * If @is_extension is 'true', the caller is allocating clusters to extend an
- * attribute and if it is 'false', the caller is allocating clusters to fill a
- * hole in an attribute. Practically the difference is that if @is_extension
- * is 'true' the returned runlist will be terminated with LCN_ENOENT and if
- * @is_extension is 'false' the runlist will be terminated with
- * LCN_RL_NOT_MAPPED.
- *
- * You need to check the return value with IS_ERR(). If this is false, the
- * function was successful and the return value is a runlist describing the
- * allocated cluster(s). If IS_ERR() is true, the function failed and
- * PTR_ERR() gives you the error code.
- *
- * Notes on the allocation algorithm
- * =================================
- *
- * There are two data zones. First is the area between the end of the mft zone
- * and the end of the volume, and second is the area between the start of the
- * volume and the start of the mft zone. On unmodified/standard NTFS 1.x
- * volumes, the second data zone does not exist due to the mft zone being
- * expanded to cover the start of the volume in order to reserve space for the
- * mft bitmap attribute.
- *
- * This is not the prettiest function but the complexity stems from the need of
- * implementing the mft vs data zoned approach and from the fact that we have
- * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we
- * need to cope with crossing over boundaries of two buffers. Further, the
- * fact that the allocator allows for caller supplied hints as to the location
- * of where allocation should begin and the fact that the allocator keeps track
- * of where in the data zones the next natural allocation should occur,
- * contribute to the complexity of the function. But it should all be
- * worthwhile, because this allocator should: 1) be a full implementation of
- * the MFT zone approach used by Windows NT, 2) cause reduction in
- * fragmentation, and 3) be speedy in allocations (the code is not optimized
- * for speed, but the algorithm is, so further speed improvements are probably
- * possible).
- *
- * FIXME: We should be monitoring cluster allocation and increment the MFT zone
- * size dynamically but this is something for the future. We will just cause
- * heavier fragmentation by not doing it and I am not even sure Windows would
- * grow the MFT zone dynamically, so it might even be correct not to do this.
- * The overhead in doing dynamic MFT zone expansion would be very large and
- * unlikely worth the effort. (AIA)
- *
- * TODO: I have added in double the required zone position pointer wrap around
- * logic which can be optimized to having only one of the two logic sets.
- * However, having the double logic will work fine, but if we have only one of
- * the sets and we get it wrong somewhere, then we get into trouble, so
- * removing the duplicate logic requires _very_ careful consideration of _all_
- * possible code paths. So at least for now, I am leaving the double logic -
- * better safe than sorry... (AIA)
- *
- * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked
- * on return.
- * - This function takes the volume lcn bitmap lock for writing and
- * modifies the bitmap contents.
- */
-runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
- const s64 count, const LCN start_lcn,
- const NTFS_CLUSTER_ALLOCATION_ZONES zone,
- const bool is_extension)
-{
- LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
- LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size;
- s64 clusters;
- loff_t i_size;
- struct inode *lcnbmp_vi;
- runlist_element *rl = NULL;
- struct address_space *mapping;
- struct page *page = NULL;
- u8 *buf, *byte;
- int err = 0, rlpos, rlsize, buf_size;
- u8 pass, done_zones, search_zone, need_writeback = 0, bit;
-
- ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn "
- "0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn,
- (unsigned long long)count,
- (unsigned long long)start_lcn,
- zone == MFT_ZONE ? "MFT" : "DATA");
- BUG_ON(!vol);
- lcnbmp_vi = vol->lcnbmp_ino;
- BUG_ON(!lcnbmp_vi);
- BUG_ON(start_vcn < 0);
- BUG_ON(count < 0);
- BUG_ON(start_lcn < -1);
- BUG_ON(zone < FIRST_ZONE);
- BUG_ON(zone > LAST_ZONE);
-
- /* Return NULL if @count is zero. */
- if (!count)
- return NULL;
- /* Take the lcnbmp lock for writing. */
- down_write(&vol->lcnbmp_lock);
- /*
- * If no specific @start_lcn was requested, use the current data zone
- * position, otherwise use the requested @start_lcn but make sure it
- * lies outside the mft zone. Also set done_zones to 0 (no zones done)
- * and pass depending on whether we are starting inside a zone (1) or
- * at the beginning of a zone (2). If requesting from the MFT_ZONE,
- * we either start at the current position within the mft zone or at
- * the specified position. If the latter is out of bounds then we start
- * at the beginning of the MFT_ZONE.
- */
- done_zones = 0;
- pass = 1;
- /*
- * zone_start and zone_end are the current search range. search_zone
- * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of
- * volume) and 4 for data zone 2 (start of volume till start of mft
- * zone).
- */
- zone_start = start_lcn;
- if (zone_start < 0) {
- if (zone == DATA_ZONE)
- zone_start = vol->data1_zone_pos;
- else
- zone_start = vol->mft_zone_pos;
- if (!zone_start) {
- /*
- * Zone starts at beginning of volume which means a
- * single pass is sufficient.
- */
- pass = 2;
- }
- } else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start &&
- zone_start < vol->mft_zone_end) {
- zone_start = vol->mft_zone_end;
- /*
- * Starting at beginning of data1_zone which means a single
- * pass in this zone is sufficient.
- */
- pass = 2;
- } else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start ||
- zone_start >= vol->mft_zone_end)) {
- zone_start = vol->mft_lcn;
- if (!vol->mft_zone_end)
- zone_start = 0;
- /*
- * Starting at beginning of volume which means a single pass
- * is sufficient.
- */
- pass = 2;
- }
- if (zone == MFT_ZONE) {
- zone_end = vol->mft_zone_end;
- search_zone = 1;
- } else /* if (zone == DATA_ZONE) */ {
- /* Skip searching the mft zone. */
- done_zones |= 1;
- if (zone_start >= vol->mft_zone_end) {
- zone_end = vol->nr_clusters;
- search_zone = 2;
- } else {
- zone_end = vol->mft_zone_start;
- search_zone = 4;
- }
- }
- /*
- * bmp_pos is the current bit position inside the bitmap. We use
- * bmp_initial_pos to determine whether or not to do a zone switch.
- */
- bmp_pos = bmp_initial_pos = zone_start;
-
- /* Loop until all clusters are allocated, i.e. clusters == 0. */
- clusters = count;
- rlpos = rlsize = 0;
- mapping = lcnbmp_vi->i_mapping;
- i_size = i_size_read(lcnbmp_vi);
- while (1) {
- ntfs_debug("Start of outer while loop: done_zones 0x%x, "
- "search_zone %i, pass %i, zone_start 0x%llx, "
- "zone_end 0x%llx, bmp_initial_pos 0x%llx, "
- "bmp_pos 0x%llx, rlpos %i, rlsize %i.",
- done_zones, search_zone, pass,
- (unsigned long long)zone_start,
- (unsigned long long)zone_end,
- (unsigned long long)bmp_initial_pos,
- (unsigned long long)bmp_pos, rlpos, rlsize);
- /* Loop until we run out of free clusters. */
- last_read_pos = bmp_pos >> 3;
- ntfs_debug("last_read_pos 0x%llx.",
- (unsigned long long)last_read_pos);
- if (last_read_pos > i_size) {
- ntfs_debug("End of attribute reached. "
- "Skipping to zone_pass_done.");
- goto zone_pass_done;
- }
- if (likely(page)) {
- if (need_writeback) {
- ntfs_debug("Marking page dirty.");
- flush_dcache_page(page);
- set_page_dirty(page);
- need_writeback = 0;
- }
- ntfs_unmap_page(page);
- }
- page = ntfs_map_page(mapping, last_read_pos >>
- PAGE_SHIFT);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
- ntfs_error(vol->sb, "Failed to map page.");
- goto out;
- }
- buf_size = last_read_pos & ~PAGE_MASK;
- buf = page_address(page) + buf_size;
- buf_size = PAGE_SIZE - buf_size;
- if (unlikely(last_read_pos + buf_size > i_size))
- buf_size = i_size - last_read_pos;
- buf_size <<= 3;
- lcn = bmp_pos & 7;
- bmp_pos &= ~(LCN)7;
- ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, "
- "bmp_pos 0x%llx, need_writeback %i.", buf_size,
- (unsigned long long)lcn,
- (unsigned long long)bmp_pos, need_writeback);
- while (lcn < buf_size && lcn + bmp_pos < zone_end) {
- byte = buf + (lcn >> 3);
- ntfs_debug("In inner while loop: buf_size %i, "
- "lcn 0x%llx, bmp_pos 0x%llx, "
- "need_writeback %i, byte ofs 0x%x, "
- "*byte 0x%x.", buf_size,
- (unsigned long long)lcn,
- (unsigned long long)bmp_pos,
- need_writeback,
- (unsigned int)(lcn >> 3),
- (unsigned int)*byte);
- /* Skip full bytes. */
- if (*byte == 0xff) {
- lcn = (lcn + 8) & ~(LCN)7;
- ntfs_debug("Continuing while loop 1.");
- continue;
- }
- bit = 1 << (lcn & 7);
- ntfs_debug("bit 0x%x.", bit);
- /* If the bit is already set, go onto the next one. */
- if (*byte & bit) {
- lcn++;
- ntfs_debug("Continuing while loop 2.");
- continue;
- }
- /*
- * Allocate more memory if needed, including space for
- * the terminator element.
- * ntfs_malloc_nofs() operates on whole pages only.
- */
- if ((rlpos + 2) * sizeof(*rl) > rlsize) {
- runlist_element *rl2;
-
- ntfs_debug("Reallocating memory.");
- if (!rl)
- ntfs_debug("First free bit is at LCN "
- "0x%llx.",
- (unsigned long long)
- (lcn + bmp_pos));
- rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
- if (unlikely(!rl2)) {
- err = -ENOMEM;
- ntfs_error(vol->sb, "Failed to "
- "allocate memory.");
- goto out;
- }
- memcpy(rl2, rl, rlsize);
- ntfs_free(rl);
- rl = rl2;
- rlsize += PAGE_SIZE;
- ntfs_debug("Reallocated memory, rlsize 0x%x.",
- rlsize);
- }
- /* Allocate the bitmap bit. */
- *byte |= bit;
- /* We need to write this bitmap page to disk. */
- need_writeback = 1;
- ntfs_debug("*byte 0x%x, need_writeback is set.",
- (unsigned int)*byte);
- /*
- * Coalesce with previous run if adjacent LCNs.
- * Otherwise, append a new run.
- */
- ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), "
- "prev_lcn 0x%llx, lcn 0x%llx, "
- "bmp_pos 0x%llx, prev_run_len 0x%llx, "
- "rlpos %i.",
- (unsigned long long)(lcn + bmp_pos),
- 1ULL, (unsigned long long)prev_lcn,
- (unsigned long long)lcn,
- (unsigned long long)bmp_pos,
- (unsigned long long)prev_run_len,
- rlpos);
- if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) {
- ntfs_debug("Coalescing to run (lcn 0x%llx, "
- "len 0x%llx).",
- (unsigned long long)
- rl[rlpos - 1].lcn,
- (unsigned long long)
- rl[rlpos - 1].length);
- rl[rlpos - 1].length = ++prev_run_len;
- ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), "
- "prev_run_len 0x%llx.",
- (unsigned long long)
- rl[rlpos - 1].lcn,
- (unsigned long long)
- rl[rlpos - 1].length,
- (unsigned long long)
- prev_run_len);
- } else {
- if (likely(rlpos)) {
- ntfs_debug("Adding new run, (previous "
- "run lcn 0x%llx, "
- "len 0x%llx).",
- (unsigned long long)
- rl[rlpos - 1].lcn,
- (unsigned long long)
- rl[rlpos - 1].length);
- rl[rlpos].vcn = rl[rlpos - 1].vcn +
- prev_run_len;
- } else {
- ntfs_debug("Adding new run, is first "
- "run.");
- rl[rlpos].vcn = start_vcn;
- }
- rl[rlpos].lcn = prev_lcn = lcn + bmp_pos;
- rl[rlpos].length = prev_run_len = 1;
- rlpos++;
- }
- /* Done? */
- if (!--clusters) {
- LCN tc;
- /*
- * Update the current zone position. Positions
- * of already scanned zones have been updated
- * during the respective zone switches.
- */
- tc = lcn + bmp_pos + 1;
- ntfs_debug("Done. Updating current zone "
- "position, tc 0x%llx, "
- "search_zone %i.",
- (unsigned long long)tc,
- search_zone);
- switch (search_zone) {
- case 1:
- ntfs_debug("Before checks, "
- "vol->mft_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->mft_zone_pos);
- if (tc >= vol->mft_zone_end) {
- vol->mft_zone_pos =
- vol->mft_lcn;
- if (!vol->mft_zone_end)
- vol->mft_zone_pos = 0;
- } else if ((bmp_initial_pos >=
- vol->mft_zone_pos ||
- tc > vol->mft_zone_pos)
- && tc >= vol->mft_lcn)
- vol->mft_zone_pos = tc;
- ntfs_debug("After checks, "
- "vol->mft_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->mft_zone_pos);
- break;
- case 2:
- ntfs_debug("Before checks, "
- "vol->data1_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data1_zone_pos);
- if (tc >= vol->nr_clusters)
- vol->data1_zone_pos =
- vol->mft_zone_end;
- else if ((bmp_initial_pos >=
- vol->data1_zone_pos ||
- tc > vol->data1_zone_pos)
- && tc >= vol->mft_zone_end)
- vol->data1_zone_pos = tc;
- ntfs_debug("After checks, "
- "vol->data1_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data1_zone_pos);
- break;
- case 4:
- ntfs_debug("Before checks, "
- "vol->data2_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data2_zone_pos);
- if (tc >= vol->mft_zone_start)
- vol->data2_zone_pos = 0;
- else if (bmp_initial_pos >=
- vol->data2_zone_pos ||
- tc > vol->data2_zone_pos)
- vol->data2_zone_pos = tc;
- ntfs_debug("After checks, "
- "vol->data2_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data2_zone_pos);
- break;
- default:
- BUG();
- }
- ntfs_debug("Finished. Going to out.");
- goto out;
- }
- lcn++;
- }
- bmp_pos += buf_size;
- ntfs_debug("After inner while loop: buf_size 0x%x, lcn "
- "0x%llx, bmp_pos 0x%llx, need_writeback %i.",
- buf_size, (unsigned long long)lcn,
- (unsigned long long)bmp_pos, need_writeback);
- if (bmp_pos < zone_end) {
- ntfs_debug("Continuing outer while loop, "
- "bmp_pos 0x%llx, zone_end 0x%llx.",
- (unsigned long long)bmp_pos,
- (unsigned long long)zone_end);
- continue;
- }
-zone_pass_done: /* Finished with the current zone pass. */
- ntfs_debug("At zone_pass_done, pass %i.", pass);
- if (pass == 1) {
- /*
- * Now do pass 2, scanning the first part of the zone
- * we omitted in pass 1.
- */
- pass = 2;
- zone_end = zone_start;
- switch (search_zone) {
- case 1: /* mft_zone */
- zone_start = vol->mft_zone_start;
- break;
- case 2: /* data1_zone */
- zone_start = vol->mft_zone_end;
- break;
- case 4: /* data2_zone */
- zone_start = 0;
- break;
- default:
- BUG();
- }
- /* Sanity check. */
- if (zone_end < zone_start)
- zone_end = zone_start;
- bmp_pos = zone_start;
- ntfs_debug("Continuing outer while loop, pass 2, "
- "zone_start 0x%llx, zone_end 0x%llx, "
- "bmp_pos 0x%llx.",
- (unsigned long long)zone_start,
- (unsigned long long)zone_end,
- (unsigned long long)bmp_pos);
- continue;
- } /* pass == 2 */
-done_zones_check:
- ntfs_debug("At done_zones_check, search_zone %i, done_zones "
- "before 0x%x, done_zones after 0x%x.",
- search_zone, done_zones,
- done_zones | search_zone);
- done_zones |= search_zone;
- if (done_zones < 7) {
- ntfs_debug("Switching zone.");
- /* Now switch to the next zone we haven't done yet. */
- pass = 1;
- switch (search_zone) {
- case 1:
- ntfs_debug("Switching from mft zone to data1 "
- "zone.");
- /* Update mft zone position. */
- if (rlpos) {
- LCN tc;
-
- ntfs_debug("Before checks, "
- "vol->mft_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->mft_zone_pos);
- tc = rl[rlpos - 1].lcn +
- rl[rlpos - 1].length;
- if (tc >= vol->mft_zone_end) {
- vol->mft_zone_pos =
- vol->mft_lcn;
- if (!vol->mft_zone_end)
- vol->mft_zone_pos = 0;
- } else if ((bmp_initial_pos >=
- vol->mft_zone_pos ||
- tc > vol->mft_zone_pos)
- && tc >= vol->mft_lcn)
- vol->mft_zone_pos = tc;
- ntfs_debug("After checks, "
- "vol->mft_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->mft_zone_pos);
- }
- /* Switch from mft zone to data1 zone. */
-switch_to_data1_zone: search_zone = 2;
- zone_start = bmp_initial_pos =
- vol->data1_zone_pos;
- zone_end = vol->nr_clusters;
- if (zone_start == vol->mft_zone_end)
- pass = 2;
- if (zone_start >= zone_end) {
- vol->data1_zone_pos = zone_start =
- vol->mft_zone_end;
- pass = 2;
- }
- break;
- case 2:
- ntfs_debug("Switching from data1 zone to "
- "data2 zone.");
- /* Update data1 zone position. */
- if (rlpos) {
- LCN tc;
-
- ntfs_debug("Before checks, "
- "vol->data1_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data1_zone_pos);
- tc = rl[rlpos - 1].lcn +
- rl[rlpos - 1].length;
- if (tc >= vol->nr_clusters)
- vol->data1_zone_pos =
- vol->mft_zone_end;
- else if ((bmp_initial_pos >=
- vol->data1_zone_pos ||
- tc > vol->data1_zone_pos)
- && tc >= vol->mft_zone_end)
- vol->data1_zone_pos = tc;
- ntfs_debug("After checks, "
- "vol->data1_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data1_zone_pos);
- }
- /* Switch from data1 zone to data2 zone. */
- search_zone = 4;
- zone_start = bmp_initial_pos =
- vol->data2_zone_pos;
- zone_end = vol->mft_zone_start;
- if (!zone_start)
- pass = 2;
- if (zone_start >= zone_end) {
- vol->data2_zone_pos = zone_start =
- bmp_initial_pos = 0;
- pass = 2;
- }
- break;
- case 4:
- ntfs_debug("Switching from data2 zone to "
- "data1 zone.");
- /* Update data2 zone position. */
- if (rlpos) {
- LCN tc;
-
- ntfs_debug("Before checks, "
- "vol->data2_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data2_zone_pos);
- tc = rl[rlpos - 1].lcn +
- rl[rlpos - 1].length;
- if (tc >= vol->mft_zone_start)
- vol->data2_zone_pos = 0;
- else if (bmp_initial_pos >=
- vol->data2_zone_pos ||
- tc > vol->data2_zone_pos)
- vol->data2_zone_pos = tc;
- ntfs_debug("After checks, "
- "vol->data2_zone_pos "
- "0x%llx.",
- (unsigned long long)
- vol->data2_zone_pos);
- }
- /* Switch from data2 zone to data1 zone. */
- goto switch_to_data1_zone;
- default:
- BUG();
- }
- ntfs_debug("After zone switch, search_zone %i, "
- "pass %i, bmp_initial_pos 0x%llx, "
- "zone_start 0x%llx, zone_end 0x%llx.",
- search_zone, pass,
- (unsigned long long)bmp_initial_pos,
- (unsigned long long)zone_start,
- (unsigned long long)zone_end);
- bmp_pos = zone_start;
- if (zone_start == zone_end) {
- ntfs_debug("Empty zone, going to "
- "done_zones_check.");
- /* Empty zone. Don't bother searching it. */
- goto done_zones_check;
- }
- ntfs_debug("Continuing outer while loop.");
- continue;
- } /* done_zones == 7 */
- ntfs_debug("All zones are finished.");
- /*
- * All zones are finished! If DATA_ZONE, shrink mft zone. If
- * MFT_ZONE, we have really run out of space.
- */
- mft_zone_size = vol->mft_zone_end - vol->mft_zone_start;
- ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end "
- "0x%llx, mft_zone_size 0x%llx.",
- (unsigned long long)vol->mft_zone_start,
- (unsigned long long)vol->mft_zone_end,
- (unsigned long long)mft_zone_size);
- if (zone == MFT_ZONE || mft_zone_size <= 0) {
- ntfs_debug("No free clusters left, going to out.");
- /* Really no more space left on device. */
- err = -ENOSPC;
- goto out;
- } /* zone == DATA_ZONE && mft_zone_size > 0 */
- ntfs_debug("Shrinking mft zone.");
- zone_end = vol->mft_zone_end;
- mft_zone_size >>= 1;
- if (mft_zone_size > 0)
- vol->mft_zone_end = vol->mft_zone_start + mft_zone_size;
- else /* mft zone and data2 zone no longer exist. */
- vol->data2_zone_pos = vol->mft_zone_start =
- vol->mft_zone_end = 0;
- if (vol->mft_zone_pos >= vol->mft_zone_end) {
- vol->mft_zone_pos = vol->mft_lcn;
- if (!vol->mft_zone_end)
- vol->mft_zone_pos = 0;
- }
- bmp_pos = zone_start = bmp_initial_pos =
- vol->data1_zone_pos = vol->mft_zone_end;
- search_zone = 2;
- pass = 2;
- done_zones &= ~2;
- ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, "
- "vol->mft_zone_start 0x%llx, "
- "vol->mft_zone_end 0x%llx, "
- "vol->mft_zone_pos 0x%llx, search_zone 2, "
- "pass 2, dones_zones 0x%x, zone_start 0x%llx, "
- "zone_end 0x%llx, vol->data1_zone_pos 0x%llx, "
- "continuing outer while loop.",
- (unsigned long long)mft_zone_size,
- (unsigned long long)vol->mft_zone_start,
- (unsigned long long)vol->mft_zone_end,
- (unsigned long long)vol->mft_zone_pos,
- done_zones, (unsigned long long)zone_start,
- (unsigned long long)zone_end,
- (unsigned long long)vol->data1_zone_pos);
- }
- ntfs_debug("After outer while loop.");
-out:
- ntfs_debug("At out.");
- /* Add runlist terminator element. */
- if (likely(rl)) {
- rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
- rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED;
- rl[rlpos].length = 0;
- }
- if (likely(page && !IS_ERR(page))) {
- if (need_writeback) {
- ntfs_debug("Marking page dirty.");
- flush_dcache_page(page);
- set_page_dirty(page);
- need_writeback = 0;
- }
- ntfs_unmap_page(page);
- }
- if (likely(!err)) {
- up_write(&vol->lcnbmp_lock);
- ntfs_debug("Done.");
- return rl;
- }
- ntfs_error(vol->sb, "Failed to allocate clusters, aborting "
- "(error %i).", err);
- if (rl) {
- int err2;
-
- if (err == -ENOSPC)
- ntfs_debug("Not enough space to complete allocation, "
- "err -ENOSPC, first free lcn 0x%llx, "
- "could allocate up to 0x%llx "
- "clusters.",
- (unsigned long long)rl[0].lcn,
- (unsigned long long)(count - clusters));
- /* Deallocate all allocated clusters. */
- ntfs_debug("Attempting rollback...");
- err2 = ntfs_cluster_free_from_rl_nolock(vol, rl);
- if (err2) {
- ntfs_error(vol->sb, "Failed to rollback (error %i). "
- "Leaving inconsistent metadata! "
- "Unmount and run chkdsk.", err2);
- NVolSetErrors(vol);
- }
- /* Free the runlist. */
- ntfs_free(rl);
- } else if (err == -ENOSPC)
- ntfs_debug("No space left at all, err = -ENOSPC, first free "
- "lcn = 0x%llx.",
- (long long)vol->data1_zone_pos);
- up_write(&vol->lcnbmp_lock);
- return ERR_PTR(err);
-}
-
-/**
- * __ntfs_cluster_free - free clusters on an ntfs volume
- * @ni: ntfs inode whose runlist describes the clusters to free
- * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters
- * @count: number of clusters to free or -1 for all clusters
- * @ctx: active attribute search context if present or NULL if not
- * @is_rollback: true if this is a rollback operation
- *
- * Free @count clusters starting at the cluster @start_vcn in the runlist
- * described by the vfs inode @ni.
- *
- * If @count is -1, all clusters from @start_vcn to the end of the runlist are
- * deallocated. Thus, to completely free all clusters in a runlist, use
- * @start_vcn = 0 and @count = -1.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record. This is needed when __ntfs_cluster_free() encounters unmapped
- * runlist fragments and allows their mapping. If you do not have the mft
- * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will
- * perform the necessary mapping and unmapping.
- *
- * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it
- * before returning. Thus, @ctx will be left pointing to the same attribute on
- * return as on entry. However, the actual pointers in @ctx may point to
- * different memory locations on return, so you must remember to reset any
- * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(),
- * you will probably want to do:
- * m = ctx->mrec;
- * a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- *
- * @is_rollback should always be 'false', it is for internal use to rollback
- * errors. You probably want to use ntfs_cluster_free() instead.
- *
- * Note, __ntfs_cluster_free() does not modify the runlist, so you have to
- * remove from the runlist or mark sparse the freed runs later.
- *
- * Return the number of deallocated clusters (not counting sparse ones) on
- * success and -errno on error.
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- * is no longer valid, i.e. you need to either call
- * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- * In that case PTR_ERR(@ctx->mrec) will give you the error code for
- * why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- * and is locked on return. Note the runlist may be modified when
- * needed runlist fragments need to be mapped.
- * - The volume lcn bitmap must be unlocked on entry and is unlocked
- * on return.
- * - This function takes the volume lcn bitmap lock for writing and
- * modifies the bitmap contents.
- * - If @ctx is NULL, the base mft record of @ni must not be mapped on
- * entry and it will be left unmapped on return.
- * - If @ctx is not NULL, the base mft record must be mapped on entry
- * and it will be left mapped on return.
- */
-s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
- ntfs_attr_search_ctx *ctx, const bool is_rollback)
-{
- s64 delta, to_free, total_freed, real_freed;
- ntfs_volume *vol;
- struct inode *lcnbmp_vi;
- runlist_element *rl;
- int err;
-
- BUG_ON(!ni);
- ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count "
- "0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn,
- (unsigned long long)count,
- is_rollback ? " (rollback)" : "");
- vol = ni->vol;
- lcnbmp_vi = vol->lcnbmp_ino;
- BUG_ON(!lcnbmp_vi);
- BUG_ON(start_vcn < 0);
- BUG_ON(count < -1);
- /*
- * Lock the lcn bitmap for writing but only if not rolling back. We
- * must hold the lock all the way including through rollback otherwise
- * rollback is not possible because once we have cleared a bit and
- * dropped the lock, anyone could have set the bit again, thus
- * allocating the cluster for another use.
- */
- if (likely(!is_rollback))
- down_write(&vol->lcnbmp_lock);
-
- total_freed = real_freed = 0;
-
- rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx);
- if (IS_ERR(rl)) {
- if (!is_rollback)
- ntfs_error(vol->sb, "Failed to find first runlist "
- "element (error %li), aborting.",
- PTR_ERR(rl));
- err = PTR_ERR(rl);
- goto err_out;
- }
- if (unlikely(rl->lcn < LCN_HOLE)) {
- if (!is_rollback)
- ntfs_error(vol->sb, "First runlist element has "
- "invalid lcn, aborting.");
- err = -EIO;
- goto err_out;
- }
- /* Find the starting cluster inside the run that needs freeing. */
- delta = start_vcn - rl->vcn;
-
- /* The number of clusters in this run that need freeing. */
- to_free = rl->length - delta;
- if (count >= 0 && to_free > count)
- to_free = count;
-
- if (likely(rl->lcn >= 0)) {
- /* Do the actual freeing of the clusters in this run. */
- err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta,
- to_free, likely(!is_rollback) ? 0 : 1);
- if (unlikely(err)) {
- if (!is_rollback)
- ntfs_error(vol->sb, "Failed to clear first run "
- "(error %i), aborting.", err);
- goto err_out;
- }
- /* We have freed @to_free real clusters. */
- real_freed = to_free;
- };
- /* Go to the next run and adjust the number of clusters left to free. */
- ++rl;
- if (count >= 0)
- count -= to_free;
-
- /* Keep track of the total "freed" clusters, including sparse ones. */
- total_freed = to_free;
- /*
- * Loop over the remaining runs, using @count as a capping value, and
- * free them.
- */
- for (; rl->length && count != 0; ++rl) {
- if (unlikely(rl->lcn < LCN_HOLE)) {
- VCN vcn;
-
- /* Attempt to map runlist. */
- vcn = rl->vcn;
- rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx);
- if (IS_ERR(rl)) {
- err = PTR_ERR(rl);
- if (!is_rollback)
- ntfs_error(vol->sb, "Failed to map "
- "runlist fragment or "
- "failed to find "
- "subsequent runlist "
- "element.");
- goto err_out;
- }
- if (unlikely(rl->lcn < LCN_HOLE)) {
- if (!is_rollback)
- ntfs_error(vol->sb, "Runlist element "
- "has invalid lcn "
- "(0x%llx).",
- (unsigned long long)
- rl->lcn);
- err = -EIO;
- goto err_out;
- }
- }
- /* The number of clusters in this run that need freeing. */
- to_free = rl->length;
- if (count >= 0 && to_free > count)
- to_free = count;
-
- if (likely(rl->lcn >= 0)) {
- /* Do the actual freeing of the clusters in the run. */
- err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn,
- to_free, likely(!is_rollback) ? 0 : 1);
- if (unlikely(err)) {
- if (!is_rollback)
- ntfs_error(vol->sb, "Failed to clear "
- "subsequent run.");
- goto err_out;
- }
- /* We have freed @to_free real clusters. */
- real_freed += to_free;
- }
- /* Adjust the number of clusters left to free. */
- if (count >= 0)
- count -= to_free;
-
- /* Update the total done clusters. */
- total_freed += to_free;
- }
- if (likely(!is_rollback))
- up_write(&vol->lcnbmp_lock);
-
- BUG_ON(count > 0);
-
- /* We are done. Return the number of actually freed clusters. */
- ntfs_debug("Done.");
- return real_freed;
-err_out:
- if (is_rollback)
- return err;
- /* If no real clusters were freed, no need to rollback. */
- if (!real_freed) {
- up_write(&vol->lcnbmp_lock);
- return err;
- }
- /*
- * Attempt to rollback and if that succeeds just return the error code.
- * If rollback fails, set the volume errors flag, emit an error
- * message, and return the error code.
- */
- delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true);
- if (delta < 0) {
- ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving "
- "inconsistent metadata! Unmount and run "
- "chkdsk.", (int)delta);
- NVolSetErrors(vol);
- }
- up_write(&vol->lcnbmp_lock);
- ntfs_error(vol->sb, "Aborting (error %i).", err);
- return err;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
deleted file mode 100644
index 1589a6d8434b..000000000000
--- a/fs/ntfs/lcnalloc.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation. Part of the
- * Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_LCNALLOC_H
-#define _LINUX_NTFS_LCNALLOC_H
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-
-#include "attrib.h"
-#include "types.h"
-#include "inode.h"
-#include "runlist.h"
-#include "volume.h"
-
-typedef enum {
- FIRST_ZONE = 0, /* For sanity checking. */
- MFT_ZONE = 0, /* Allocate from $MFT zone. */
- DATA_ZONE = 1, /* Allocate from $DATA zone. */
- LAST_ZONE = 1, /* For sanity checking. */
-} NTFS_CLUSTER_ALLOCATION_ZONES;
-
-extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
- const VCN start_vcn, const s64 count, const LCN start_lcn,
- const NTFS_CLUSTER_ALLOCATION_ZONES zone,
- const bool is_extension);
-
-extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
- s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback);
-
-/**
- * ntfs_cluster_free - free clusters on an ntfs volume
- * @ni: ntfs inode whose runlist describes the clusters to free
- * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters
- * @count: number of clusters to free or -1 for all clusters
- * @ctx: active attribute search context if present or NULL if not
- *
- * Free @count clusters starting at the cluster @start_vcn in the runlist
- * described by the ntfs inode @ni.
- *
- * If @count is -1, all clusters from @start_vcn to the end of the runlist are
- * deallocated. Thus, to completely free all clusters in a runlist, use
- * @start_vcn = 0 and @count = -1.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record. This is needed when ntfs_cluster_free() encounters unmapped runlist
- * fragments and allows their mapping. If you do not have the mft record
- * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform
- * the necessary mapping and unmapping.
- *
- * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it
- * before returning. Thus, @ctx will be left pointing to the same attribute on
- * return as on entry. However, the actual pointers in @ctx may point to
- * different memory locations on return, so you must remember to reset any
- * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(),
- * you will probably want to do:
- * m = ctx->mrec;
- * a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- *
- * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove
- * from the runlist or mark sparse the freed runs later.
- *
- * Return the number of deallocated clusters (not counting sparse ones) on
- * success and -errno on error.
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- * is no longer valid, i.e. you need to either call
- * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- * In that case PTR_ERR(@ctx->mrec) will give you the error code for
- * why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- * and is locked on return. Note the runlist may be modified when
- * needed runlist fragments need to be mapped.
- * - The volume lcn bitmap must be unlocked on entry and is unlocked
- * on return.
- * - This function takes the volume lcn bitmap lock for writing and
- * modifies the bitmap contents.
- * - If @ctx is NULL, the base mft record of @ni must not be mapped on
- * entry and it will be left unmapped on return.
- * - If @ctx is not NULL, the base mft record must be mapped on entry
- * and it will be left mapped on return.
- */
-static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
- s64 count, ntfs_attr_search_ctx *ctx)
-{
- return __ntfs_cluster_free(ni, start_vcn, count, ctx, false);
-}
-
-extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
- const runlist_element *rl);
-
-/**
- * ntfs_cluster_free_from_rl - free clusters from runlist
- * @vol: mounted ntfs volume on which to free the clusters
- * @rl: runlist describing the clusters to free
- *
- * Free all the clusters described by the runlist @rl on the volume @vol. In
- * the case of an error being returned, at least some of the clusters were not
- * freed.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - This function takes the volume lcn bitmap lock for writing and
- * modifies the bitmap contents.
- * - The caller must have locked the runlist @rl for reading or
- * writing.
- */
-static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol,
- const runlist_element *rl)
-{
- int ret;
-
- down_write(&vol->lcnbmp_lock);
- ret = ntfs_cluster_free_from_rl_nolock(vol, rl);
- up_write(&vol->lcnbmp_lock);
- return ret;
-}
-
-#endif /* NTFS_RW */
-
-#endif /* defined _LINUX_NTFS_LCNALLOC_H */
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
deleted file mode 100644
index 6ce60ffc6ac0..000000000000
--- a/fs/ntfs/logfile.c
+++ /dev/null
@@ -1,849 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2002-2007 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/highmem.h>
-#include <linux/buffer_head.h>
-#include <linux/bitops.h>
-#include <linux/log2.h>
-#include <linux/bio.h>
-
-#include "attrib.h"
-#include "aops.h"
-#include "debug.h"
-#include "logfile.h"
-#include "malloc.h"
-#include "volume.h"
-#include "ntfs.h"
-
-/**
- * ntfs_check_restart_page_header - check the page header for consistency
- * @vi: $LogFile inode to which the restart page header belongs
- * @rp: restart page header to check
- * @pos: position in @vi at which the restart page header resides
- *
- * Check the restart page header @rp for consistency and return 'true' if it is
- * consistent and 'false' otherwise.
- *
- * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
- * require the full restart page.
- */
-static bool ntfs_check_restart_page_header(struct inode *vi,
- RESTART_PAGE_HEADER *rp, s64 pos)
-{
- u32 logfile_system_page_size, logfile_log_page_size;
- u16 ra_ofs, usa_count, usa_ofs, usa_end = 0;
- bool have_usa = true;
-
- ntfs_debug("Entering.");
- /*
- * If the system or log page sizes are smaller than the ntfs block size
- * or either is not a power of 2 we cannot handle this log file.
- */
- logfile_system_page_size = le32_to_cpu(rp->system_page_size);
- logfile_log_page_size = le32_to_cpu(rp->log_page_size);
- if (logfile_system_page_size < NTFS_BLOCK_SIZE ||
- logfile_log_page_size < NTFS_BLOCK_SIZE ||
- logfile_system_page_size &
- (logfile_system_page_size - 1) ||
- !is_power_of_2(logfile_log_page_size)) {
- ntfs_error(vi->i_sb, "$LogFile uses unsupported page size.");
- return false;
- }
- /*
- * We must be either at !pos (1st restart page) or at pos = system page
- * size (2nd restart page).
- */
- if (pos && pos != logfile_system_page_size) {
- ntfs_error(vi->i_sb, "Found restart area in incorrect "
- "position in $LogFile.");
- return false;
- }
- /* We only know how to handle version 1.1. */
- if (sle16_to_cpu(rp->major_ver) != 1 ||
- sle16_to_cpu(rp->minor_ver) != 1) {
- ntfs_error(vi->i_sb, "$LogFile version %i.%i is not "
- "supported. (This driver supports version "
- "1.1 only.)", (int)sle16_to_cpu(rp->major_ver),
- (int)sle16_to_cpu(rp->minor_ver));
- return false;
- }
- /*
- * If chkdsk has been run the restart page may not be protected by an
- * update sequence array.
- */
- if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) {
- have_usa = false;
- goto skip_usa_checks;
- }
- /* Verify the size of the update sequence array. */
- usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS);
- if (usa_count != le16_to_cpu(rp->usa_count)) {
- ntfs_error(vi->i_sb, "$LogFile restart page specifies "
- "inconsistent update sequence array count.");
- return false;
- }
- /* Verify the position of the update sequence array. */
- usa_ofs = le16_to_cpu(rp->usa_ofs);
- usa_end = usa_ofs + usa_count * sizeof(u16);
- if (usa_ofs < sizeof(RESTART_PAGE_HEADER) ||
- usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) {
- ntfs_error(vi->i_sb, "$LogFile restart page specifies "
- "inconsistent update sequence array offset.");
- return false;
- }
-skip_usa_checks:
- /*
- * Verify the position of the restart area. It must be:
- * - aligned to 8-byte boundary,
- * - after the update sequence array, and
- * - within the system page size.
- */
- ra_ofs = le16_to_cpu(rp->restart_area_offset);
- if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end :
- ra_ofs < sizeof(RESTART_PAGE_HEADER)) ||
- ra_ofs > logfile_system_page_size) {
- ntfs_error(vi->i_sb, "$LogFile restart page specifies "
- "inconsistent restart area offset.");
- return false;
- }
- /*
- * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn
- * set.
- */
- if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) {
- ntfs_error(vi->i_sb, "$LogFile restart page is not modified "
- "by chkdsk but a chkdsk LSN is specified.");
- return false;
- }
- ntfs_debug("Done.");
- return true;
-}
-
-/**
- * ntfs_check_restart_area - check the restart area for consistency
- * @vi: $LogFile inode to which the restart page belongs
- * @rp: restart page whose restart area to check
- *
- * Check the restart area of the restart page @rp for consistency and return
- * 'true' if it is consistent and 'false' otherwise.
- *
- * This function assumes that the restart page header has already been
- * consistency checked.
- *
- * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
- * require the full restart page.
- */
-static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
-{
- u64 file_size;
- RESTART_AREA *ra;
- u16 ra_ofs, ra_len, ca_ofs;
- u8 fs_bits;
-
- ntfs_debug("Entering.");
- ra_ofs = le16_to_cpu(rp->restart_area_offset);
- ra = (RESTART_AREA*)((u8*)rp + ra_ofs);
- /*
- * Everything before ra->file_size must be before the first word
- * protected by an update sequence number. This ensures that it is
- * safe to access ra->client_array_offset.
- */
- if (ra_ofs + offsetof(RESTART_AREA, file_size) >
- NTFS_BLOCK_SIZE - sizeof(u16)) {
- ntfs_error(vi->i_sb, "$LogFile restart area specifies "
- "inconsistent file offset.");
- return false;
- }
- /*
- * Now that we can access ra->client_array_offset, make sure everything
- * up to the log client array is before the first word protected by an
- * update sequence number. This ensures we can access all of the
- * restart area elements safely. Also, the client array offset must be
- * aligned to an 8-byte boundary.
- */
- ca_ofs = le16_to_cpu(ra->client_array_offset);
- if (((ca_ofs + 7) & ~7) != ca_ofs ||
- ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) {
- ntfs_error(vi->i_sb, "$LogFile restart area specifies "
- "inconsistent client array offset.");
- return false;
- }
- /*
- * The restart area must end within the system page size both when
- * calculated manually and as specified by ra->restart_area_length.
- * Also, the calculated length must not exceed the specified length.
- */
- ra_len = ca_ofs + le16_to_cpu(ra->log_clients) *
- sizeof(LOG_CLIENT_RECORD);
- if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) ||
- ra_ofs + le16_to_cpu(ra->restart_area_length) >
- le32_to_cpu(rp->system_page_size) ||
- ra_len > le16_to_cpu(ra->restart_area_length)) {
- ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds "
- "of the system page size specified by the "
- "restart page header and/or the specified "
- "restart area length is inconsistent.");
- return false;
- }
- /*
- * The ra->client_free_list and ra->client_in_use_list must be either
- * LOGFILE_NO_CLIENT or less than ra->log_clients or they are
- * overflowing the client array.
- */
- if ((ra->client_free_list != LOGFILE_NO_CLIENT &&
- le16_to_cpu(ra->client_free_list) >=
- le16_to_cpu(ra->log_clients)) ||
- (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
- le16_to_cpu(ra->client_in_use_list) >=
- le16_to_cpu(ra->log_clients))) {
- ntfs_error(vi->i_sb, "$LogFile restart area specifies "
- "overflowing client free and/or in use lists.");
- return false;
- }
- /*
- * Check ra->seq_number_bits against ra->file_size for consistency.
- * We cannot just use ffs() because the file size is not a power of 2.
- */
- file_size = (u64)sle64_to_cpu(ra->file_size);
- fs_bits = 0;
- while (file_size) {
- file_size >>= 1;
- fs_bits++;
- }
- if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) {
- ntfs_error(vi->i_sb, "$LogFile restart area specifies "
- "inconsistent sequence number bits.");
- return false;
- }
- /* The log record header length must be a multiple of 8. */
- if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) !=
- le16_to_cpu(ra->log_record_header_length)) {
- ntfs_error(vi->i_sb, "$LogFile restart area specifies "
- "inconsistent log record header length.");
- return false;
- }
- /* Dito for the log page data offset. */
- if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) !=
- le16_to_cpu(ra->log_page_data_offset)) {
- ntfs_error(vi->i_sb, "$LogFile restart area specifies "
- "inconsistent log page data offset.");
- return false;
- }
- ntfs_debug("Done.");
- return true;
-}
-
-/**
- * ntfs_check_log_client_array - check the log client array for consistency
- * @vi: $LogFile inode to which the restart page belongs
- * @rp: restart page whose log client array to check
- *
- * Check the log client array of the restart page @rp for consistency and
- * return 'true' if it is consistent and 'false' otherwise.
- *
- * This function assumes that the restart page header and the restart area have
- * already been consistency checked.
- *
- * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this
- * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full
- * restart page and the page must be multi sector transfer deprotected.
- */
-static bool ntfs_check_log_client_array(struct inode *vi,
- RESTART_PAGE_HEADER *rp)
-{
- RESTART_AREA *ra;
- LOG_CLIENT_RECORD *ca, *cr;
- u16 nr_clients, idx;
- bool in_free_list, idx_is_first;
-
- ntfs_debug("Entering.");
- ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
- ca = (LOG_CLIENT_RECORD*)((u8*)ra +
- le16_to_cpu(ra->client_array_offset));
- /*
- * Check the ra->client_free_list first and then check the
- * ra->client_in_use_list. Check each of the log client records in
- * each of the lists and check that the array does not overflow the
- * ra->log_clients value. Also keep track of the number of records
- * visited as there cannot be more than ra->log_clients records and
- * that way we detect eventual loops in within a list.
- */
- nr_clients = le16_to_cpu(ra->log_clients);
- idx = le16_to_cpu(ra->client_free_list);
- in_free_list = true;
-check_list:
- for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--,
- idx = le16_to_cpu(cr->next_client)) {
- if (!nr_clients || idx >= le16_to_cpu(ra->log_clients))
- goto err_out;
- /* Set @cr to the current log client record. */
- cr = ca + idx;
- /* The first log client record must not have a prev_client. */
- if (idx_is_first) {
- if (cr->prev_client != LOGFILE_NO_CLIENT)
- goto err_out;
- idx_is_first = false;
- }
- }
- /* Switch to and check the in use list if we just did the free list. */
- if (in_free_list) {
- in_free_list = false;
- idx = le16_to_cpu(ra->client_in_use_list);
- goto check_list;
- }
- ntfs_debug("Done.");
- return true;
-err_out:
- ntfs_error(vi->i_sb, "$LogFile log client array is corrupt.");
- return false;
-}
-
-/**
- * ntfs_check_and_load_restart_page - check the restart page for consistency
- * @vi: $LogFile inode to which the restart page belongs
- * @rp: restart page to check
- * @pos: position in @vi at which the restart page resides
- * @wrp: [OUT] copy of the multi sector transfer deprotected restart page
- * @lsn: [OUT] set to the current logfile lsn on success
- *
- * Check the restart page @rp for consistency and return 0 if it is consistent
- * and -errno otherwise. The restart page may have been modified by chkdsk in
- * which case its magic is CHKD instead of RSTR.
- *
- * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
- * require the full restart page.
- *
- * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a
- * copy of the complete multi sector transfer deprotected page. On failure,
- * *@wrp is undefined.
- *
- * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current
- * logfile lsn according to this restart page. On failure, *@lsn is undefined.
- *
- * The following error codes are defined:
- * -EINVAL - The restart page is inconsistent.
- * -ENOMEM - Not enough memory to load the restart page.
- * -EIO - Failed to reading from $LogFile.
- */
-static int ntfs_check_and_load_restart_page(struct inode *vi,
- RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp,
- LSN *lsn)
-{
- RESTART_AREA *ra;
- RESTART_PAGE_HEADER *trp;
- int size, err;
-
- ntfs_debug("Entering.");
- /* Check the restart page header for consistency. */
- if (!ntfs_check_restart_page_header(vi, rp, pos)) {
- /* Error output already done inside the function. */
- return -EINVAL;
- }
- /* Check the restart area for consistency. */
- if (!ntfs_check_restart_area(vi, rp)) {
- /* Error output already done inside the function. */
- return -EINVAL;
- }
- ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
- /*
- * Allocate a buffer to store the whole restart page so we can multi
- * sector transfer deprotect it.
- */
- trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size));
- if (!trp) {
- ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile "
- "restart page buffer.");
- return -ENOMEM;
- }
- /*
- * Read the whole of the restart page into the buffer. If it fits
- * completely inside @rp, just copy it from there. Otherwise map all
- * the required pages and copy the data from them.
- */
- size = PAGE_SIZE - (pos & ~PAGE_MASK);
- if (size >= le32_to_cpu(rp->system_page_size)) {
- memcpy(trp, rp, le32_to_cpu(rp->system_page_size));
- } else {
- pgoff_t idx;
- struct page *page;
- int have_read, to_read;
-
- /* First copy what we already have in @rp. */
- memcpy(trp, rp, size);
- /* Copy the remaining data one page at a time. */
- have_read = size;
- to_read = le32_to_cpu(rp->system_page_size) - size;
- idx = (pos + size) >> PAGE_SHIFT;
- BUG_ON((pos + size) & ~PAGE_MASK);
- do {
- page = ntfs_map_page(vi->i_mapping, idx);
- if (IS_ERR(page)) {
- ntfs_error(vi->i_sb, "Error mapping $LogFile "
- "page (index %lu).", idx);
- err = PTR_ERR(page);
- if (err != -EIO && err != -ENOMEM)
- err = -EIO;
- goto err_out;
- }
- size = min_t(int, to_read, PAGE_SIZE);
- memcpy((u8*)trp + have_read, page_address(page), size);
- ntfs_unmap_page(page);
- have_read += size;
- to_read -= size;
- idx++;
- } while (to_read > 0);
- }
- /*
- * Perform the multi sector transfer deprotection on the buffer if the
- * restart page is protected.
- */
- if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count))
- && post_read_mst_fixup((NTFS_RECORD*)trp,
- le32_to_cpu(rp->system_page_size))) {
- /*
- * A multi sector tranfer error was detected. We only need to
- * abort if the restart page contents exceed the multi sector
- * transfer fixup of the first sector.
- */
- if (le16_to_cpu(rp->restart_area_offset) +
- le16_to_cpu(ra->restart_area_length) >
- NTFS_BLOCK_SIZE - sizeof(u16)) {
- ntfs_error(vi->i_sb, "Multi sector transfer error "
- "detected in $LogFile restart page.");
- err = -EINVAL;
- goto err_out;
- }
- }
- /*
- * If the restart page is modified by chkdsk or there are no active
- * logfile clients, the logfile is consistent. Otherwise, need to
- * check the log client records for consistency, too.
- */
- err = 0;
- if (ntfs_is_rstr_record(rp->magic) &&
- ra->client_in_use_list != LOGFILE_NO_CLIENT) {
- if (!ntfs_check_log_client_array(vi, trp)) {
- err = -EINVAL;
- goto err_out;
- }
- }
- if (lsn) {
- if (ntfs_is_rstr_record(rp->magic))
- *lsn = sle64_to_cpu(ra->current_lsn);
- else /* if (ntfs_is_chkd_record(rp->magic)) */
- *lsn = sle64_to_cpu(rp->chkdsk_lsn);
- }
- ntfs_debug("Done.");
- if (wrp)
- *wrp = trp;
- else {
-err_out:
- ntfs_free(trp);
- }
- return err;
-}
-
-/**
- * ntfs_check_logfile - check the journal for consistency
- * @log_vi: struct inode of loaded journal $LogFile to check
- * @rp: [OUT] on success this is a copy of the current restart page
- *
- * Check the $LogFile journal for consistency and return 'true' if it is
- * consistent and 'false' if not. On success, the current restart page is
- * returned in *@rp. Caller must call ntfs_free(*@rp) when finished with it.
- *
- * At present we only check the two restart pages and ignore the log record
- * pages.
- *
- * Note that the MstProtected flag is not set on the $LogFile inode and hence
- * when reading pages they are not deprotected. This is because we do not know
- * if the $LogFile was created on a system with a different page size to ours
- * yet and mst deprotection would fail if our page size is smaller.
- */
-bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
-{
- s64 size, pos;
- LSN rstr1_lsn, rstr2_lsn;
- ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
- struct address_space *mapping = log_vi->i_mapping;
- struct page *page = NULL;
- u8 *kaddr = NULL;
- RESTART_PAGE_HEADER *rstr1_ph = NULL;
- RESTART_PAGE_HEADER *rstr2_ph = NULL;
- int log_page_size, err;
- bool logfile_is_empty = true;
- u8 log_page_bits;
-
- ntfs_debug("Entering.");
- /* An empty $LogFile must have been clean before it got emptied. */
- if (NVolLogFileEmpty(vol))
- goto is_empty;
- size = i_size_read(log_vi);
- /* Make sure the file doesn't exceed the maximum allowed size. */
- if (size > MaxLogFileSize)
- size = MaxLogFileSize;
- /*
- * Truncate size to a multiple of the page cache size or the default
- * log page size if the page cache size is between the default log page
- * log page size if the page cache size is between the default log page
- * size and twice that.
- */
- if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <=
- DefaultLogPageSize * 2)
- log_page_size = DefaultLogPageSize;
- else
- log_page_size = PAGE_SIZE;
- /*
- * Use ntfs_ffs() instead of ffs() to enable the compiler to
- * optimize log_page_size and log_page_bits into constants.
- */
- log_page_bits = ntfs_ffs(log_page_size) - 1;
- size &= ~(s64)(log_page_size - 1);
- /*
- * Ensure the log file is big enough to store at least the two restart
- * pages and the minimum number of log record pages.
- */
- if (size < log_page_size * 2 || (size - log_page_size * 2) >>
- log_page_bits < MinLogRecordPages) {
- ntfs_error(vol->sb, "$LogFile is too small.");
- return false;
- }
- /*
- * Read through the file looking for a restart page. Since the restart
- * page header is at the beginning of a page we only need to search at
- * what could be the beginning of a page (for each page size) rather
- * than scanning the whole file byte by byte. If all potential places
- * contain empty and uninitialzed records, the log file can be assumed
- * to be empty.
- */
- for (pos = 0; pos < size; pos <<= 1) {
- pgoff_t idx = pos >> PAGE_SHIFT;
- if (!page || page->index != idx) {
- if (page)
- ntfs_unmap_page(page);
- page = ntfs_map_page(mapping, idx);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Error mapping $LogFile "
- "page (index %lu).", idx);
- goto err_out;
- }
- }
- kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK);
- /*
- * A non-empty block means the logfile is not empty while an
- * empty block after a non-empty block has been encountered
- * means we are done.
- */
- if (!ntfs_is_empty_recordp((le32*)kaddr))
- logfile_is_empty = false;
- else if (!logfile_is_empty)
- break;
- /*
- * A log record page means there cannot be a restart page after
- * this so no need to continue searching.
- */
- if (ntfs_is_rcrd_recordp((le32*)kaddr))
- break;
- /* If not a (modified by chkdsk) restart page, continue. */
- if (!ntfs_is_rstr_recordp((le32*)kaddr) &&
- !ntfs_is_chkd_recordp((le32*)kaddr)) {
- if (!pos)
- pos = NTFS_BLOCK_SIZE >> 1;
- continue;
- }
- /*
- * Check the (modified by chkdsk) restart page for consistency
- * and get a copy of the complete multi sector transfer
- * deprotected restart page.
- */
- err = ntfs_check_and_load_restart_page(log_vi,
- (RESTART_PAGE_HEADER*)kaddr, pos,
- !rstr1_ph ? &rstr1_ph : &rstr2_ph,
- !rstr1_ph ? &rstr1_lsn : &rstr2_lsn);
- if (!err) {
- /*
- * If we have now found the first (modified by chkdsk)
- * restart page, continue looking for the second one.
- */
- if (!pos) {
- pos = NTFS_BLOCK_SIZE >> 1;
- continue;
- }
- /*
- * We have now found the second (modified by chkdsk)
- * restart page, so we can stop looking.
- */
- break;
- }
- /*
- * Error output already done inside the function. Note, we do
- * not abort if the restart page was invalid as we might still
- * find a valid one further in the file.
- */
- if (err != -EINVAL) {
- ntfs_unmap_page(page);
- goto err_out;
- }
- /* Continue looking. */
- if (!pos)
- pos = NTFS_BLOCK_SIZE >> 1;
- }
- if (page)
- ntfs_unmap_page(page);
- if (logfile_is_empty) {
- NVolSetLogFileEmpty(vol);
-is_empty:
- ntfs_debug("Done. ($LogFile is empty.)");
- return true;
- }
- if (!rstr1_ph) {
- BUG_ON(rstr2_ph);
- ntfs_error(vol->sb, "Did not find any restart pages in "
- "$LogFile and it was not empty.");
- return false;
- }
- /* If both restart pages were found, use the more recent one. */
- if (rstr2_ph) {
- /*
- * If the second restart area is more recent, switch to it.
- * Otherwise just throw it away.
- */
- if (rstr2_lsn > rstr1_lsn) {
- ntfs_debug("Using second restart page as it is more "
- "recent.");
- ntfs_free(rstr1_ph);
- rstr1_ph = rstr2_ph;
- /* rstr1_lsn = rstr2_lsn; */
- } else {
- ntfs_debug("Using first restart page as it is more "
- "recent.");
- ntfs_free(rstr2_ph);
- }
- rstr2_ph = NULL;
- }
- /* All consistency checks passed. */
- if (rp)
- *rp = rstr1_ph;
- else
- ntfs_free(rstr1_ph);
- ntfs_debug("Done.");
- return true;
-err_out:
- if (rstr1_ph)
- ntfs_free(rstr1_ph);
- return false;
-}
-
-/**
- * ntfs_is_logfile_clean - check in the journal if the volume is clean
- * @log_vi: struct inode of loaded journal $LogFile to check
- * @rp: copy of the current restart page
- *
- * Analyze the $LogFile journal and return 'true' if it indicates the volume was
- * shutdown cleanly and 'false' if not.
- *
- * At present we only look at the two restart pages and ignore the log record
- * pages. This is a little bit crude in that there will be a very small number
- * of cases where we think that a volume is dirty when in fact it is clean.
- * This should only affect volumes that have not been shutdown cleanly but did
- * not have any pending, non-check-pointed i/o, i.e. they were completely idle
- * at least for the five seconds preceding the unclean shutdown.
- *
- * This function assumes that the $LogFile journal has already been consistency
- * checked by a call to ntfs_check_logfile() and in particular if the $LogFile
- * is empty this function requires that NVolLogFileEmpty() is true otherwise an
- * empty volume will be reported as dirty.
- */
-bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
-{
- ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
- RESTART_AREA *ra;
-
- ntfs_debug("Entering.");
- /* An empty $LogFile must have been clean before it got emptied. */
- if (NVolLogFileEmpty(vol)) {
- ntfs_debug("Done. ($LogFile is empty.)");
- return true;
- }
- BUG_ON(!rp);
- if (!ntfs_is_rstr_record(rp->magic) &&
- !ntfs_is_chkd_record(rp->magic)) {
- ntfs_error(vol->sb, "Restart page buffer is invalid. This is "
- "probably a bug in that the $LogFile should "
- "have been consistency checked before calling "
- "this function.");
- return false;
- }
- ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
- /*
- * If the $LogFile has active clients, i.e. it is open, and we do not
- * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags,
- * we assume there was an unclean shutdown.
- */
- if (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
- !(ra->flags & RESTART_VOLUME_IS_CLEAN)) {
- ntfs_debug("Done. $LogFile indicates a dirty shutdown.");
- return false;
- }
- /* $LogFile indicates a clean shutdown. */
- ntfs_debug("Done. $LogFile indicates a clean shutdown.");
- return true;
-}
-
-/**
- * ntfs_empty_logfile - empty the contents of the $LogFile journal
- * @log_vi: struct inode of loaded journal $LogFile to empty
- *
- * Empty the contents of the $LogFile journal @log_vi and return 'true' on
- * success and 'false' on error.
- *
- * This function assumes that the $LogFile journal has already been consistency
- * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean()
- * has been used to ensure that the $LogFile is clean.
- */
-bool ntfs_empty_logfile(struct inode *log_vi)
-{
- VCN vcn, end_vcn;
- ntfs_inode *log_ni = NTFS_I(log_vi);
- ntfs_volume *vol = log_ni->vol;
- struct super_block *sb = vol->sb;
- runlist_element *rl;
- unsigned long flags;
- unsigned block_size, block_size_bits;
- int err;
- bool should_wait = true;
-
- ntfs_debug("Entering.");
- if (NVolLogFileEmpty(vol)) {
- ntfs_debug("Done.");
- return true;
- }
- /*
- * We cannot use ntfs_attr_set() because we may be still in the middle
- * of a mount operation. Thus we do the emptying by hand by first
- * zapping the page cache pages for the $LogFile/$DATA attribute and
- * then emptying each of the buffers in each of the clusters specified
- * by the runlist by hand.
- */
- block_size = sb->s_blocksize;
- block_size_bits = sb->s_blocksize_bits;
- vcn = 0;
- read_lock_irqsave(&log_ni->size_lock, flags);
- end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >>
- vol->cluster_size_bits;
- read_unlock_irqrestore(&log_ni->size_lock, flags);
- truncate_inode_pages(log_vi->i_mapping, 0);
- down_write(&log_ni->runlist.lock);
- rl = log_ni->runlist.rl;
- if (unlikely(!rl || vcn < rl->vcn || !rl->length)) {
-map_vcn:
- err = ntfs_map_runlist_nolock(log_ni, vcn, NULL);
- if (err) {
- ntfs_error(sb, "Failed to map runlist fragment (error "
- "%d).", -err);
- goto err;
- }
- rl = log_ni->runlist.rl;
- BUG_ON(!rl || vcn < rl->vcn || !rl->length);
- }
- /* Seek to the runlist element containing @vcn. */
- while (rl->length && vcn >= rl[1].vcn)
- rl++;
- do {
- LCN lcn;
- sector_t block, end_block;
- s64 len;
-
- /*
- * If this run is not mapped map it now and start again as the
- * runlist will have been updated.
- */
- lcn = rl->lcn;
- if (unlikely(lcn == LCN_RL_NOT_MAPPED)) {
- vcn = rl->vcn;
- goto map_vcn;
- }
- /* If this run is not valid abort with an error. */
- if (unlikely(!rl->length || lcn < LCN_HOLE))
- goto rl_err;
- /* Skip holes. */
- if (lcn == LCN_HOLE)
- continue;
- block = lcn << vol->cluster_size_bits >> block_size_bits;
- len = rl->length;
- if (rl[1].vcn > end_vcn)
- len = end_vcn - rl->vcn;
- end_block = (lcn + len) << vol->cluster_size_bits >>
- block_size_bits;
- /* Iterate over the blocks in the run and empty them. */
- do {
- struct buffer_head *bh;
-
- /* Obtain the buffer, possibly not uptodate. */
- bh = sb_getblk(sb, block);
- BUG_ON(!bh);
- /* Setup buffer i/o submission. */
- lock_buffer(bh);
- bh->b_end_io = end_buffer_write_sync;
- get_bh(bh);
- /* Set the entire contents of the buffer to 0xff. */
- memset(bh->b_data, -1, block_size);
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
- if (buffer_dirty(bh))
- clear_buffer_dirty(bh);
- /*
- * Submit the buffer and wait for i/o to complete but
- * only for the first buffer so we do not miss really
- * serious i/o errors. Once the first buffer has
- * completed ignore errors afterwards as we can assume
- * that if one buffer worked all of them will work.
- */
- submit_bh(REQ_OP_WRITE, bh);
- if (should_wait) {
- should_wait = false;
- wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- goto io_err;
- }
- brelse(bh);
- } while (++block < end_block);
- } while ((++rl)->vcn < end_vcn);
- up_write(&log_ni->runlist.lock);
- /*
- * Zap the pages again just in case any got instantiated whilst we were
- * emptying the blocks by hand. FIXME: We may not have completed
- * writing to all the buffer heads yet so this may happen too early.
- * We really should use a kernel thread to do the emptying
- * asynchronously and then we can also set the volume dirty and output
- * an error message if emptying should fail.
- */
- truncate_inode_pages(log_vi->i_mapping, 0);
- /* Set the flag so we do not have to do it again on remount. */
- NVolSetLogFileEmpty(vol);
- ntfs_debug("Done.");
- return true;
-io_err:
- ntfs_error(sb, "Failed to write buffer. Unmount and run chkdsk.");
- goto dirty_err;
-rl_err:
- ntfs_error(sb, "Runlist is corrupt. Unmount and run chkdsk.");
-dirty_err:
- NVolSetErrors(vol);
- err = -EIO;
-err:
- up_write(&log_ni->runlist.lock);
- ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).",
- -err);
- return false;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
deleted file mode 100644
index 429d4909cc72..000000000000
--- a/fs/ntfs/logfile.h
+++ /dev/null
@@ -1,295 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * logfile.h - Defines for NTFS kernel journal ($LogFile) handling. Part of
- * the Linux-NTFS project.
- *
- * Copyright (c) 2000-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_LOGFILE_H
-#define _LINUX_NTFS_LOGFILE_H
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-
-#include "types.h"
-#include "endian.h"
-#include "layout.h"
-
-/*
- * Journal ($LogFile) organization:
- *
- * Two restart areas present in the first two pages (restart pages, one restart
- * area in each page). When the volume is dismounted they should be identical,
- * except for the update sequence array which usually has a different update
- * sequence number.
- *
- * These are followed by log records organized in pages headed by a log record
- * header going up to log file size. Not all pages contain log records when a
- * volume is first formatted, but as the volume ages, all records will be used.
- * When the log file fills up, the records at the beginning are purged (by
- * modifying the oldest_lsn to a higher value presumably) and writing begins
- * at the beginning of the file. Effectively, the log file is viewed as a
- * circular entity.
- *
- * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept
- * versions <= 1.x, including 0.-1. (Yes, that is a minus one in there!) We
- * probably only want to support 1.1 as this seems to be the current version
- * and we don't know how that differs from the older versions. The only
- * exception is if the journal is clean as marked by the two restart pages
- * then it doesn't matter whether we are on an earlier version. We can just
- * reinitialize the logfile and start again with version 1.1.
- */
-
-/* Some $LogFile related constants. */
-#define MaxLogFileSize 0x100000000ULL
-#define DefaultLogPageSize 4096
-#define MinLogRecordPages 48
-
-/*
- * Log file restart page header (begins the restart area).
- */
-typedef struct {
-/*Ofs*/
-/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
-/* 0*/ NTFS_RECORD_TYPE magic; /* The magic is "RSTR". */
-/* 4*/ le16 usa_ofs; /* See NTFS_RECORD definition in layout.h.
- When creating, set this to be immediately
- after this header structure (without any
- alignment). */
-/* 6*/ le16 usa_count; /* See NTFS_RECORD definition in layout.h. */
-
-/* 8*/ leLSN chkdsk_lsn; /* The last log file sequence number found by
- chkdsk. Only used when the magic is changed
- to "CHKD". Otherwise this is zero. */
-/* 16*/ le32 system_page_size; /* Byte size of system pages when the log file
- was created, has to be >= 512 and a power of
- 2. Use this to calculate the required size
- of the usa (usa_count) and add it to usa_ofs.
- Then verify that the result is less than the
- value of the restart_area_offset. */
-/* 20*/ le32 log_page_size; /* Byte size of log file pages, has to be >=
- 512 and a power of 2. The default is 4096
- and is used when the system page size is
- between 4096 and 8192. Otherwise this is
- set to the system page size instead. */
-/* 24*/ le16 restart_area_offset;/* Byte offset from the start of this header to
- the RESTART_AREA. Value has to be aligned
- to 8-byte boundary. When creating, set this
- to be after the usa. */
-/* 26*/ sle16 minor_ver; /* Log file minor version. Only check if major
- version is 1. */
-/* 28*/ sle16 major_ver; /* Log file major version. We only support
- version 1.1. */
-/* sizeof() = 30 (0x1e) bytes */
-} __attribute__ ((__packed__)) RESTART_PAGE_HEADER;
-
-/*
- * Constant for the log client indices meaning that there are no client records
- * in this particular client array. Also inside the client records themselves,
- * this means that there are no client records preceding or following this one.
- */
-#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff)
-#define LOGFILE_NO_CLIENT_CPU 0xffff
-
-/*
- * These are the so far known RESTART_AREA_* flags (16-bit) which contain
- * information about the log file in which they are present.
- */
-enum {
- RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002),
- RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
-} __attribute__ ((__packed__));
-
-typedef le16 RESTART_AREA_FLAGS;
-
-/*
- * Log file restart area record. The offset of this record is found by adding
- * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found
- * in it. See notes at restart_area_offset above.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/ leLSN current_lsn; /* The current, i.e. last LSN inside the log
- when the restart area was last written.
- This happens often but what is the interval?
- Is it just fixed time or is it every time a
- check point is written or somethine else?
- On create set to 0. */
-/* 8*/ le16 log_clients; /* Number of log client records in the array of
- log client records which follows this
- restart area. Must be 1. */
-/* 10*/ le16 client_free_list; /* The index of the first free log client record
- in the array of log client records.
- LOGFILE_NO_CLIENT means that there are no
- free log client records in the array.
- If != LOGFILE_NO_CLIENT, check that
- log_clients > client_free_list. On Win2k
- and presumably earlier, on a clean volume
- this is != LOGFILE_NO_CLIENT, and it should
- be 0, i.e. the first (and only) client
- record is free and thus the logfile is
- closed and hence clean. A dirty volume
- would have left the logfile open and hence
- this would be LOGFILE_NO_CLIENT. On WinXP
- and presumably later, the logfile is always
- open, even on clean shutdown so this should
- always be LOGFILE_NO_CLIENT. */
-/* 12*/ le16 client_in_use_list;/* The index of the first in-use log client
- record in the array of log client records.
- LOGFILE_NO_CLIENT means that there are no
- in-use log client records in the array. If
- != LOGFILE_NO_CLIENT check that log_clients
- > client_in_use_list. On Win2k and
- presumably earlier, on a clean volume this
- is LOGFILE_NO_CLIENT, i.e. there are no
- client records in use and thus the logfile
- is closed and hence clean. A dirty volume
- would have left the logfile open and hence
- this would be != LOGFILE_NO_CLIENT, and it
- should be 0, i.e. the first (and only)
- client record is in use. On WinXP and
- presumably later, the logfile is always
- open, even on clean shutdown so this should
- always be 0. */
-/* 14*/ RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour. On Win2k
- and presumably earlier this is always 0. On
- WinXP and presumably later, if the logfile
- was shutdown cleanly, the second bit,
- RESTART_VOLUME_IS_CLEAN, is set. This bit
- is cleared when the volume is mounted by
- WinXP and set when the volume is dismounted,
- thus if the logfile is dirty, this bit is
- clear. Thus we don't need to check the
- Windows version to determine if the logfile
- is clean. Instead if the logfile is closed,
- we know it must be clean. If it is open and
- this bit is set, we also know it must be
- clean. If on the other hand the logfile is
- open and this bit is clear, we can be almost
- certain that the logfile is dirty. */
-/* 16*/ le32 seq_number_bits; /* How many bits to use for the sequence
- number. This is calculated as 67 - the
- number of bits required to store the logfile
- size in bytes and this can be used in with
- the specified file_size as a consistency
- check. */
-/* 20*/ le16 restart_area_length;/* Length of the restart area including the
- client array. Following checks required if
- version matches. Otherwise, skip them.
- restart_area_offset + restart_area_length
- has to be <= system_page_size. Also,
- restart_area_length has to be >=
- client_array_offset + (log_clients *
- sizeof(log client record)). */
-/* 22*/ le16 client_array_offset;/* Offset from the start of this record to
- the first log client record if versions are
- matched. When creating, set this to be
- after this restart area structure, aligned
- to 8-bytes boundary. If the versions do not
- match, this is ignored and the offset is
- assumed to be (sizeof(RESTART_AREA) + 7) &
- ~7, i.e. rounded up to first 8-byte
- boundary. Either way, client_array_offset
- has to be aligned to an 8-byte boundary.
- Also, restart_area_offset +
- client_array_offset has to be <= 510.
- Finally, client_array_offset + (log_clients
- * sizeof(log client record)) has to be <=
- system_page_size. On Win2k and presumably
- earlier, this is 0x30, i.e. immediately
- following this record. On WinXP and
- presumably later, this is 0x40, i.e. there
- are 16 extra bytes between this record and
- the client array. This probably means that
- the RESTART_AREA record is actually bigger
- in WinXP and later. */
-/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the
- restart_area_offset + the offset of the
- file_size are > 510 then corruption has
- occurred. This is the very first check when
- starting with the restart_area as if it
- fails it means that some of the above values
- will be corrupted by the multi sector
- transfer protection. The file_size has to
- be rounded down to be a multiple of the
- log_page_size in the RESTART_PAGE_HEADER and
- then it has to be at least big enough to
- store the two restart pages and 48 (0x30)
- log record pages. */
-/* 32*/ le32 last_lsn_data_length;/* Length of data of last LSN, not including
- the log record header. On create set to
- 0. */
-/* 36*/ le16 log_record_header_length;/* Byte size of the log record header.
- If the version matches then check that the
- value of log_record_header_length is a
- multiple of 8, i.e.
- (log_record_header_length + 7) & ~7 ==
- log_record_header_length. When creating set
- it to sizeof(LOG_RECORD_HEADER), aligned to
- 8 bytes. */
-/* 38*/ le16 log_page_data_offset;/* Offset to the start of data in a log record
- page. Must be a multiple of 8. On create
- set it to immediately after the update
- sequence array of the log record page. */
-/* 40*/ le32 restart_log_open_count;/* A counter that gets incremented every
- time the logfile is restarted which happens
- at mount time when the logfile is opened.
- When creating set to a random value. Win2k
- sets it to the low 32 bits of the current
- system time in NTFS format (see time.h). */
-/* 44*/ le32 reserved; /* Reserved/alignment to 8-byte boundary. */
-/* sizeof() = 48 (0x30) bytes */
-} __attribute__ ((__packed__)) RESTART_AREA;
-
-/*
- * Log client record. The offset of this record is found by adding the offset
- * of the RESTART_AREA to the client_array_offset value found in it.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/ leLSN oldest_lsn; /* Oldest LSN needed by this client. On create
- set to 0. */
-/* 8*/ leLSN client_restart_lsn;/* LSN at which this client needs to restart
- the volume, i.e. the current position within
- the log file. At present, if clean this
- should = current_lsn in restart area but it
- probably also = current_lsn when dirty most
- of the time. At create set to 0. */
-/* 16*/ le16 prev_client; /* The offset to the previous log client record
- in the array of log client records.
- LOGFILE_NO_CLIENT means there is no previous
- client record, i.e. this is the first one.
- This is always LOGFILE_NO_CLIENT. */
-/* 18*/ le16 next_client; /* The offset to the next log client record in
- the array of log client records.
- LOGFILE_NO_CLIENT means there are no next
- client records, i.e. this is the last one.
- This is always LOGFILE_NO_CLIENT. */
-/* 20*/ le16 seq_number; /* On Win2k and presumably earlier, this is set
- to zero every time the logfile is restarted
- and it is incremented when the logfile is
- closed at dismount time. Thus it is 0 when
- dirty and 1 when clean. On WinXP and
- presumably later, this is always 0. */
-/* 22*/ u8 reserved[6]; /* Reserved/alignment. */
-/* 28*/ le32 client_name_length;/* Length of client name in bytes. Should
- always be 8. */
-/* 32*/ ntfschar client_name[64];/* Name of the client in Unicode. Should
- always be "NTFS" with the remaining bytes
- set to 0. */
-/* sizeof() = 160 (0xa0) bytes */
-} __attribute__ ((__packed__)) LOG_CLIENT_RECORD;
-
-extern bool ntfs_check_logfile(struct inode *log_vi,
- RESTART_PAGE_HEADER **rp);
-
-extern bool ntfs_is_logfile_clean(struct inode *log_vi,
- const RESTART_PAGE_HEADER *rp);
-
-extern bool ntfs_empty_logfile(struct inode *log_vi);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_LOGFILE_H */
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
deleted file mode 100644
index 7068425735f1..000000000000
--- a/fs/ntfs/malloc.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_MALLOC_H
-#define _LINUX_NTFS_MALLOC_H
-
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-
-/**
- * __ntfs_malloc - allocate memory in multiples of pages
- * @size: number of bytes to allocate
- * @gfp_mask: extra flags for the allocator
- *
- * Internal function. You probably want ntfs_malloc_nofs()...
- *
- * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
- * returns a pointer to the allocated memory.
- *
- * If there was insufficient memory to complete the request, return NULL.
- * Depending on @gfp_mask the allocation may be guaranteed to succeed.
- */
-static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
-{
- if (likely(size <= PAGE_SIZE)) {
- BUG_ON(!size);
- /* kmalloc() has per-CPU caches so is faster for now. */
- return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
- /* return (void *)__get_free_page(gfp_mask); */
- }
- if (likely((size >> PAGE_SHIFT) < totalram_pages()))
- return __vmalloc(size, gfp_mask);
- return NULL;
-}
-
-/**
- * ntfs_malloc_nofs - allocate memory in multiples of pages
- * @size: number of bytes to allocate
- *
- * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
- * returns a pointer to the allocated memory.
- *
- * If there was insufficient memory to complete the request, return NULL.
- */
-static inline void *ntfs_malloc_nofs(unsigned long size)
-{
- return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM);
-}
-
-/**
- * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages
- * @size: number of bytes to allocate
- *
- * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
- * returns a pointer to the allocated memory.
- *
- * This function guarantees that the allocation will succeed. It will sleep
- * for as long as it takes to complete the allocation.
- *
- * If there was insufficient memory to complete the request, return NULL.
- */
-static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
-{
- return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL);
-}
-
-static inline void ntfs_free(void *addr)
-{
- kvfree(addr);
-}
-
-#endif /* _LINUX_NTFS_MALLOC_H */
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
deleted file mode 100644
index 6fd1dc4b08c8..000000000000
--- a/fs/ntfs/mft.c
+++ /dev/null
@@ -1,2907 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/buffer_head.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/bio.h>
-
-#include "attrib.h"
-#include "aops.h"
-#include "bitmap.h"
-#include "debug.h"
-#include "dir.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "ntfs.h"
-
-#define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE)
-
-/**
- * map_mft_record_page - map the page in which a specific mft record resides
- * @ni: ntfs inode whose mft record page to map
- *
- * This maps the page in which the mft record of the ntfs inode @ni is situated
- * and returns a pointer to the mft record within the mapped page.
- *
- * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
- * contains the negative error code returned.
- */
-static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
-{
- loff_t i_size;
- ntfs_volume *vol = ni->vol;
- struct inode *mft_vi = vol->mft_ino;
- struct page *page;
- unsigned long index, end_index;
- unsigned ofs;
-
- BUG_ON(ni->page);
- /*
- * The index into the page cache and the offset within the page cache
- * page of the wanted mft record. FIXME: We need to check for
- * overflowing the unsigned long, but I don't think we would ever get
- * here if the volume was that big...
- */
- index = (u64)ni->mft_no << vol->mft_record_size_bits >>
- PAGE_SHIFT;
- ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
-
- i_size = i_size_read(mft_vi);
- /* The maximum valid index into the page cache for $MFT's data. */
- end_index = i_size >> PAGE_SHIFT;
-
- /* If the wanted index is out of bounds the mft record doesn't exist. */
- if (unlikely(index >= end_index)) {
- if (index > end_index || (i_size & ~PAGE_MASK) < ofs +
- vol->mft_record_size) {
- page = ERR_PTR(-ENOENT);
- ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
- "which is beyond the end of the mft. "
- "This is probably a bug in the ntfs "
- "driver.", ni->mft_no);
- goto err_out;
- }
- }
- /* Read, map, and pin the page. */
- page = ntfs_map_page(mft_vi->i_mapping, index);
- if (!IS_ERR(page)) {
- /* Catch multi sector transfer fixup errors. */
- if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
- ofs)))) {
- ni->page = page;
- ni->page_ofs = ofs;
- return page_address(page) + ofs;
- }
- ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. "
- "Run chkdsk.", ni->mft_no);
- ntfs_unmap_page(page);
- page = ERR_PTR(-EIO);
- NVolSetErrors(vol);
- }
-err_out:
- ni->page = NULL;
- ni->page_ofs = 0;
- return (void*)page;
-}
-
-/**
- * map_mft_record - map, pin and lock an mft record
- * @ni: ntfs inode whose MFT record to map
- *
- * First, take the mrec_lock mutex. We might now be sleeping, while waiting
- * for the mutex if it was already locked by someone else.
- *
- * The page of the record is mapped using map_mft_record_page() before being
- * returned to the caller.
- *
- * This in turn uses ntfs_map_page() to get the page containing the wanted mft
- * record (it in turn calls read_cache_page() which reads it in from disk if
- * necessary, increments the use count on the page so that it cannot disappear
- * under us and returns a reference to the page cache page).
- *
- * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
- * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
- * and the post-read mst fixups on each mft record in the page have been
- * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
- * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
- * ntfs_map_page() waits for PG_locked to become clear and checks if
- * PG_uptodate is set and returns an error code if not. This provides
- * sufficient protection against races when reading/using the page.
- *
- * However there is the write mapping to think about. Doing the above described
- * checking here will be fine, because when initiating the write we will set
- * PG_locked and clear PG_uptodate making sure nobody is touching the page
- * contents. Doing the locking this way means that the commit to disk code in
- * the page cache code paths is automatically sufficiently locked with us as
- * we will not touch a page that has been locked or is not uptodate. The only
- * locking problem then is them locking the page while we are accessing it.
- *
- * So that code will end up having to own the mrec_lock of all mft
- * records/inodes present in the page before I/O can proceed. In that case we
- * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
- * accessing anything without owning the mrec_lock mutex. But we do need to
- * use them because of the read_cache_page() invocation and the code becomes so
- * much simpler this way that it is well worth it.
- *
- * The mft record is now ours and we return a pointer to it. You need to check
- * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
- * the error code.
- *
- * NOTE: Caller is responsible for setting the mft record dirty before calling
- * unmap_mft_record(). This is obviously only necessary if the caller really
- * modified the mft record...
- * Q: Do we want to recycle one of the VFS inode state bits instead?
- * A: No, the inode ones mean we want to change the mft record, not we want to
- * write it out.
- */
-MFT_RECORD *map_mft_record(ntfs_inode *ni)
-{
- MFT_RECORD *m;
-
- ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
-
- /* Make sure the ntfs inode doesn't go away. */
- atomic_inc(&ni->count);
-
- /* Serialize access to this mft record. */
- mutex_lock(&ni->mrec_lock);
-
- m = map_mft_record_page(ni);
- if (!IS_ERR(m))
- return m;
-
- mutex_unlock(&ni->mrec_lock);
- atomic_dec(&ni->count);
- ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
- return m;
-}
-
-/**
- * unmap_mft_record_page - unmap the page in which a specific mft record resides
- * @ni: ntfs inode whose mft record page to unmap
- *
- * This unmaps the page in which the mft record of the ntfs inode @ni is
- * situated and returns. This is a NOOP if highmem is not configured.
- *
- * The unmap happens via ntfs_unmap_page() which in turn decrements the use
- * count on the page thus releasing it from the pinned state.
- *
- * We do not actually unmap the page from memory of course, as that will be
- * done by the page cache code itself when memory pressure increases or
- * whatever.
- */
-static inline void unmap_mft_record_page(ntfs_inode *ni)
-{
- BUG_ON(!ni->page);
-
- // TODO: If dirty, blah...
- ntfs_unmap_page(ni->page);
- ni->page = NULL;
- ni->page_ofs = 0;
- return;
-}
-
-/**
- * unmap_mft_record - release a mapped mft record
- * @ni: ntfs inode whose MFT record to unmap
- *
- * We release the page mapping and the mrec_lock mutex which unmaps the mft
- * record and releases it for others to get hold of. We also release the ntfs
- * inode by decrementing the ntfs inode reference count.
- *
- * NOTE: If caller has modified the mft record, it is imperative to set the mft
- * record dirty BEFORE calling unmap_mft_record().
- */
-void unmap_mft_record(ntfs_inode *ni)
-{
- struct page *page = ni->page;
-
- BUG_ON(!page);
-
- ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
-
- unmap_mft_record_page(ni);
- mutex_unlock(&ni->mrec_lock);
- atomic_dec(&ni->count);
- /*
- * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
- * ntfs_clear_extent_inode() in the extent inode case, and to the
- * caller in the non-extent, yet pure ntfs inode case, to do the actual
- * tear down of all structures and freeing of all allocated memory.
- */
- return;
-}
-
-/**
- * map_extent_mft_record - load an extent inode and attach it to its base
- * @base_ni: base ntfs inode
- * @mref: mft reference of the extent inode to load
- * @ntfs_ino: on successful return, pointer to the ntfs_inode structure
- *
- * Load the extent mft record @mref and attach it to its base inode @base_ni.
- * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise
- * PTR_ERR(result) gives the negative error code.
- *
- * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
- * structure of the mapped extent inode.
- */
-MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
- ntfs_inode **ntfs_ino)
-{
- MFT_RECORD *m;
- ntfs_inode *ni = NULL;
- ntfs_inode **extent_nis = NULL;
- int i;
- unsigned long mft_no = MREF(mref);
- u16 seq_no = MSEQNO(mref);
- bool destroy_ni = false;
-
- ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
- mft_no, base_ni->mft_no);
- /* Make sure the base ntfs inode doesn't go away. */
- atomic_inc(&base_ni->count);
- /*
- * Check if this extent inode has already been added to the base inode,
- * in which case just return it. If not found, add it to the base
- * inode before returning it.
- */
- mutex_lock(&base_ni->extent_lock);
- if (base_ni->nr_extents > 0) {
- extent_nis = base_ni->ext.extent_ntfs_inos;
- for (i = 0; i < base_ni->nr_extents; i++) {
- if (mft_no != extent_nis[i]->mft_no)
- continue;
- ni = extent_nis[i];
- /* Make sure the ntfs inode doesn't go away. */
- atomic_inc(&ni->count);
- break;
- }
- }
- if (likely(ni != NULL)) {
- mutex_unlock(&base_ni->extent_lock);
- atomic_dec(&base_ni->count);
- /* We found the record; just have to map and return it. */
- m = map_mft_record(ni);
- /* map_mft_record() has incremented this on success. */
- atomic_dec(&ni->count);
- if (!IS_ERR(m)) {
- /* Verify the sequence number. */
- if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
- ntfs_debug("Done 1.");
- *ntfs_ino = ni;
- return m;
- }
- unmap_mft_record(ni);
- ntfs_error(base_ni->vol->sb, "Found stale extent mft "
- "reference! Corrupt filesystem. "
- "Run chkdsk.");
- return ERR_PTR(-EIO);
- }
-map_err_out:
- ntfs_error(base_ni->vol->sb, "Failed to map extent "
- "mft record, error code %ld.", -PTR_ERR(m));
- return m;
- }
- /* Record wasn't there. Get a new ntfs inode and initialize it. */
- ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
- if (unlikely(!ni)) {
- mutex_unlock(&base_ni->extent_lock);
- atomic_dec(&base_ni->count);
- return ERR_PTR(-ENOMEM);
- }
- ni->vol = base_ni->vol;
- ni->seq_no = seq_no;
- ni->nr_extents = -1;
- ni->ext.base_ntfs_ino = base_ni;
- /* Now map the record. */
- m = map_mft_record(ni);
- if (IS_ERR(m)) {
- mutex_unlock(&base_ni->extent_lock);
- atomic_dec(&base_ni->count);
- ntfs_clear_extent_inode(ni);
- goto map_err_out;
- }
- /* Verify the sequence number if it is present. */
- if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
- ntfs_error(base_ni->vol->sb, "Found stale extent mft "
- "reference! Corrupt filesystem. Run chkdsk.");
- destroy_ni = true;
- m = ERR_PTR(-EIO);
- goto unm_err_out;
- }
- /* Attach extent inode to base inode, reallocating memory if needed. */
- if (!(base_ni->nr_extents & 3)) {
- ntfs_inode **tmp;
- int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
-
- tmp = kmalloc(new_size, GFP_NOFS);
- if (unlikely(!tmp)) {
- ntfs_error(base_ni->vol->sb, "Failed to allocate "
- "internal buffer.");
- destroy_ni = true;
- m = ERR_PTR(-ENOMEM);
- goto unm_err_out;
- }
- if (base_ni->nr_extents) {
- BUG_ON(!base_ni->ext.extent_ntfs_inos);
- memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
- 4 * sizeof(ntfs_inode *));
- kfree(base_ni->ext.extent_ntfs_inos);
- }
- base_ni->ext.extent_ntfs_inos = tmp;
- }
- base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
- mutex_unlock(&base_ni->extent_lock);
- atomic_dec(&base_ni->count);
- ntfs_debug("Done 2.");
- *ntfs_ino = ni;
- return m;
-unm_err_out:
- unmap_mft_record(ni);
- mutex_unlock(&base_ni->extent_lock);
- atomic_dec(&base_ni->count);
- /*
- * If the extent inode was not attached to the base inode we need to
- * release it or we will leak memory.
- */
- if (destroy_ni)
- ntfs_clear_extent_inode(ni);
- return m;
-}
-
-#ifdef NTFS_RW
-
-/**
- * __mark_mft_record_dirty - set the mft record and the page containing it dirty
- * @ni: ntfs inode describing the mapped mft record
- *
- * Internal function. Users should call mark_mft_record_dirty() instead.
- *
- * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
- * as well as the page containing the mft record, dirty. Also, mark the base
- * vfs inode dirty. This ensures that any changes to the mft record are
- * written out to disk.
- *
- * NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
- * on the base vfs inode, because even though file data may have been modified,
- * it is dirty in the inode meta data rather than the data page cache of the
- * inode, and thus there are no data pages that need writing out. Therefore, a
- * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
- * other hand, is not sufficient, because ->write_inode needs to be called even
- * in case of fdatasync. This needs to happen or the file data would not
- * necessarily hit the device synchronously, even though the vfs inode has the
- * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just
- * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
- * which is not what I_DIRTY_SYNC on its own would suggest.
- */
-void __mark_mft_record_dirty(ntfs_inode *ni)
-{
- ntfs_inode *base_ni;
-
- ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
- BUG_ON(NInoAttr(ni));
- mark_ntfs_record_dirty(ni->page, ni->page_ofs);
- /* Determine the base vfs inode and mark it dirty, too. */
- mutex_lock(&ni->extent_lock);
- if (likely(ni->nr_extents >= 0))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- mutex_unlock(&ni->extent_lock);
- __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC);
-}
-
-static const char *ntfs_please_email = "Please email "
- "linux-ntfs-dev@lists.sourceforge.net and say that you saw "
- "this message. Thank you.";
-
-/**
- * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror
- * @vol: ntfs volume on which the mft record to synchronize resides
- * @mft_no: mft record number of mft record to synchronize
- * @m: mapped, mst protected (extent) mft record to synchronize
- *
- * Write the mapped, mst protected (extent) mft record @m with mft record
- * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol,
- * bypassing the page cache and the $MFTMirr inode itself.
- *
- * This function is only for use at umount time when the mft mirror inode has
- * already been disposed off. We BUG() if we are called while the mft mirror
- * inode is still attached to the volume.
- *
- * On success return 0. On error return -errno.
- *
- * NOTE: This function is not implemented yet as I am not convinced it can
- * actually be triggered considering the sequence of commits we do in super.c::
- * ntfs_put_super(). But just in case we provide this place holder as the
- * alternative would be either to BUG() or to get a NULL pointer dereference
- * and Oops.
- */
-static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol,
- const unsigned long mft_no, MFT_RECORD *m)
-{
- BUG_ON(vol->mftmirr_ino);
- ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
- "implemented yet. %s", ntfs_please_email);
- return -EOPNOTSUPP;
-}
-
-/**
- * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
- * @vol: ntfs volume on which the mft record to synchronize resides
- * @mft_no: mft record number of mft record to synchronize
- * @m: mapped, mst protected (extent) mft record to synchronize
- * @sync: if true, wait for i/o completion
- *
- * Write the mapped, mst protected (extent) mft record @m with mft record
- * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
- *
- * On success return 0. On error return -errno and set the volume errors flag
- * in the ntfs volume @vol.
- *
- * NOTE: We always perform synchronous i/o and ignore the @sync parameter.
- *
- * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
- * schedule i/o via ->writepage or do it via kntfsd or whatever.
- */
-int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
- MFT_RECORD *m, int sync)
-{
- struct page *page;
- unsigned int blocksize = vol->sb->s_blocksize;
- int max_bhs = vol->mft_record_size / blocksize;
- struct buffer_head *bhs[MAX_BHS];
- struct buffer_head *bh, *head;
- u8 *kmirr;
- runlist_element *rl;
- unsigned int block_start, block_end, m_start, m_end, page_ofs;
- int i_bhs, nr_bhs, err = 0;
- unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
-
- ntfs_debug("Entering for inode 0x%lx.", mft_no);
- BUG_ON(!max_bhs);
- if (WARN_ON(max_bhs > MAX_BHS))
- return -EINVAL;
- if (unlikely(!vol->mftmirr_ino)) {
- /* This could happen during umount... */
- err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
- if (likely(!err))
- return err;
- goto err_out;
- }
- /* Get the page containing the mirror copy of the mft record @m. */
- page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
- (PAGE_SHIFT - vol->mft_record_size_bits));
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to map mft mirror page.");
- err = PTR_ERR(page);
- goto err_out;
- }
- lock_page(page);
- BUG_ON(!PageUptodate(page));
- ClearPageUptodate(page);
- /* Offset of the mft mirror record inside the page. */
- page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
- /* The address in the page of the mirror copy of the mft record @m. */
- kmirr = page_address(page) + page_ofs;
- /* Copy the mst protected mft record to the mirror. */
- memcpy(kmirr, m, vol->mft_record_size);
- /* Create uptodate buffers if not present. */
- if (unlikely(!page_has_buffers(page))) {
- struct buffer_head *tail;
-
- bh = head = alloc_page_buffers(page, blocksize, true);
- do {
- set_buffer_uptodate(bh);
- tail = bh;
- bh = bh->b_this_page;
- } while (bh);
- tail->b_this_page = head;
- attach_page_private(page, head);
- }
- bh = head = page_buffers(page);
- BUG_ON(!bh);
- rl = NULL;
- nr_bhs = 0;
- block_start = 0;
- m_start = kmirr - (u8*)page_address(page);
- m_end = m_start + vol->mft_record_size;
- do {
- block_end = block_start + blocksize;
- /* If the buffer is outside the mft record, skip it. */
- if (block_end <= m_start)
- continue;
- if (unlikely(block_start >= m_end))
- break;
- /* Need to map the buffer if it is not mapped already. */
- if (unlikely(!buffer_mapped(bh))) {
- VCN vcn;
- LCN lcn;
- unsigned int vcn_ofs;
-
- bh->b_bdev = vol->sb->s_bdev;
- /* Obtain the vcn and offset of the current block. */
- vcn = ((VCN)mft_no << vol->mft_record_size_bits) +
- (block_start - m_start);
- vcn_ofs = vcn & vol->cluster_size_mask;
- vcn >>= vol->cluster_size_bits;
- if (!rl) {
- down_read(&NTFS_I(vol->mftmirr_ino)->
- runlist.lock);
- rl = NTFS_I(vol->mftmirr_ino)->runlist.rl;
- /*
- * $MFTMirr always has the whole of its runlist
- * in memory.
- */
- BUG_ON(!rl);
- }
- /* Seek to element containing target vcn. */
- while (rl->length && rl[1].vcn <= vcn)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
- /* For $MFTMirr, only lcn >= 0 is a successful remap. */
- if (likely(lcn >= 0)) {
- /* Setup buffer head to correct block. */
- bh->b_blocknr = ((lcn <<
- vol->cluster_size_bits) +
- vcn_ofs) >> blocksize_bits;
- set_buffer_mapped(bh);
- } else {
- bh->b_blocknr = -1;
- ntfs_error(vol->sb, "Cannot write mft mirror "
- "record 0x%lx because its "
- "location on disk could not "
- "be determined (error code "
- "%lli).", mft_no,
- (long long)lcn);
- err = -EIO;
- }
- }
- BUG_ON(!buffer_uptodate(bh));
- BUG_ON(!nr_bhs && (m_start != block_start));
- BUG_ON(nr_bhs >= max_bhs);
- bhs[nr_bhs++] = bh;
- BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
- } while (block_start = block_end, (bh = bh->b_this_page) != head);
- if (unlikely(rl))
- up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock);
- if (likely(!err)) {
- /* Lock buffers and start synchronous write i/o on them. */
- for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
- struct buffer_head *tbh = bhs[i_bhs];
-
- if (!trylock_buffer(tbh))
- BUG();
- BUG_ON(!buffer_uptodate(tbh));
- clear_buffer_dirty(tbh);
- get_bh(tbh);
- tbh->b_end_io = end_buffer_write_sync;
- submit_bh(REQ_OP_WRITE, tbh);
- }
- /* Wait on i/o completion of buffers. */
- for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
- struct buffer_head *tbh = bhs[i_bhs];
-
- wait_on_buffer(tbh);
- if (unlikely(!buffer_uptodate(tbh))) {
- err = -EIO;
- /*
- * Set the buffer uptodate so the page and
- * buffer states do not become out of sync.
- */
- set_buffer_uptodate(tbh);
- }
- }
- } else /* if (unlikely(err)) */ {
- /* Clean the buffers. */
- for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
- clear_buffer_dirty(bhs[i_bhs]);
- }
- /* Current state: all buffers are clean, unlocked, and uptodate. */
- /* Remove the mst protection fixups again. */
- post_write_mst_fixup((NTFS_RECORD*)kmirr);
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- ntfs_unmap_page(page);
- if (likely(!err)) {
- ntfs_debug("Done.");
- } else {
- ntfs_error(vol->sb, "I/O error while writing mft mirror "
- "record 0x%lx!", mft_no);
-err_out:
- ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error "
- "code %i). Volume will be left marked dirty "
- "on umount. Run ntfsfix on the partition "
- "after umounting to correct this.", -err);
- NVolSetErrors(vol);
- }
- return err;
-}
-
-/**
- * write_mft_record_nolock - write out a mapped (extent) mft record
- * @ni: ntfs inode describing the mapped (extent) mft record
- * @m: mapped (extent) mft record to write
- * @sync: if true, wait for i/o completion
- *
- * Write the mapped (extent) mft record @m described by the (regular or extent)
- * ntfs inode @ni to backing store. If the mft record @m has a counterpart in
- * the mft mirror, that is also updated.
- *
- * We only write the mft record if the ntfs inode @ni is dirty and the first
- * buffer belonging to its mft record is dirty, too. We ignore the dirty state
- * of subsequent buffers because we could have raced with
- * fs/ntfs/aops.c::mark_ntfs_record_dirty().
- *
- * On success, clean the mft record and return 0. On error, leave the mft
- * record dirty and return -errno.
- *
- * NOTE: We always perform synchronous i/o and ignore the @sync parameter.
- * However, if the mft record has a counterpart in the mft mirror and @sync is
- * true, we write the mft record, wait for i/o completion, and only then write
- * the mft mirror copy. This ensures that if the system crashes either the mft
- * or the mft mirror will contain a self-consistent mft record @m. If @sync is
- * false on the other hand, we start i/o on both and then wait for completion
- * on them. This provides a speedup but no longer guarantees that you will end
- * up with a self-consistent mft record in the case of a crash but if you asked
- * for asynchronous writing you probably do not care about that anyway.
- *
- * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
- * schedule i/o via ->writepage or do it via kntfsd or whatever.
- */
-int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
-{
- ntfs_volume *vol = ni->vol;
- struct page *page = ni->page;
- unsigned int blocksize = vol->sb->s_blocksize;
- unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
- int max_bhs = vol->mft_record_size / blocksize;
- struct buffer_head *bhs[MAX_BHS];
- struct buffer_head *bh, *head;
- runlist_element *rl;
- unsigned int block_start, block_end, m_start, m_end;
- int i_bhs, nr_bhs, err = 0;
-
- ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
- BUG_ON(NInoAttr(ni));
- BUG_ON(!max_bhs);
- BUG_ON(!PageLocked(page));
- if (WARN_ON(max_bhs > MAX_BHS)) {
- err = -EINVAL;
- goto err_out;
- }
- /*
- * If the ntfs_inode is clean no need to do anything. If it is dirty,
- * mark it as clean now so that it can be redirtied later on if needed.
- * There is no danger of races since the caller is holding the locks
- * for the mft record @m and the page it is in.
- */
- if (!NInoTestClearDirty(ni))
- goto done;
- bh = head = page_buffers(page);
- BUG_ON(!bh);
- rl = NULL;
- nr_bhs = 0;
- block_start = 0;
- m_start = ni->page_ofs;
- m_end = m_start + vol->mft_record_size;
- do {
- block_end = block_start + blocksize;
- /* If the buffer is outside the mft record, skip it. */
- if (block_end <= m_start)
- continue;
- if (unlikely(block_start >= m_end))
- break;
- /*
- * If this block is not the first one in the record, we ignore
- * the buffer's dirty state because we could have raced with a
- * parallel mark_ntfs_record_dirty().
- */
- if (block_start == m_start) {
- /* This block is the first one in the record. */
- if (!buffer_dirty(bh)) {
- BUG_ON(nr_bhs);
- /* Clean records are not written out. */
- break;
- }
- }
- /* Need to map the buffer if it is not mapped already. */
- if (unlikely(!buffer_mapped(bh))) {
- VCN vcn;
- LCN lcn;
- unsigned int vcn_ofs;
-
- bh->b_bdev = vol->sb->s_bdev;
- /* Obtain the vcn and offset of the current block. */
- vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) +
- (block_start - m_start);
- vcn_ofs = vcn & vol->cluster_size_mask;
- vcn >>= vol->cluster_size_bits;
- if (!rl) {
- down_read(&NTFS_I(vol->mft_ino)->runlist.lock);
- rl = NTFS_I(vol->mft_ino)->runlist.rl;
- BUG_ON(!rl);
- }
- /* Seek to element containing target vcn. */
- while (rl->length && rl[1].vcn <= vcn)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
- /* For $MFT, only lcn >= 0 is a successful remap. */
- if (likely(lcn >= 0)) {
- /* Setup buffer head to correct block. */
- bh->b_blocknr = ((lcn <<
- vol->cluster_size_bits) +
- vcn_ofs) >> blocksize_bits;
- set_buffer_mapped(bh);
- } else {
- bh->b_blocknr = -1;
- ntfs_error(vol->sb, "Cannot write mft record "
- "0x%lx because its location "
- "on disk could not be "
- "determined (error code %lli).",
- ni->mft_no, (long long)lcn);
- err = -EIO;
- }
- }
- BUG_ON(!buffer_uptodate(bh));
- BUG_ON(!nr_bhs && (m_start != block_start));
- BUG_ON(nr_bhs >= max_bhs);
- bhs[nr_bhs++] = bh;
- BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
- } while (block_start = block_end, (bh = bh->b_this_page) != head);
- if (unlikely(rl))
- up_read(&NTFS_I(vol->mft_ino)->runlist.lock);
- if (!nr_bhs)
- goto done;
- if (unlikely(err))
- goto cleanup_out;
- /* Apply the mst protection fixups. */
- err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
- if (err) {
- ntfs_error(vol->sb, "Failed to apply mst fixups!");
- goto cleanup_out;
- }
- flush_dcache_mft_record_page(ni);
- /* Lock buffers and start synchronous write i/o on them. */
- for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
- struct buffer_head *tbh = bhs[i_bhs];
-
- if (!trylock_buffer(tbh))
- BUG();
- BUG_ON(!buffer_uptodate(tbh));
- clear_buffer_dirty(tbh);
- get_bh(tbh);
- tbh->b_end_io = end_buffer_write_sync;
- submit_bh(REQ_OP_WRITE, tbh);
- }
- /* Synchronize the mft mirror now if not @sync. */
- if (!sync && ni->mft_no < vol->mftmirr_size)
- ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
- /* Wait on i/o completion of buffers. */
- for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
- struct buffer_head *tbh = bhs[i_bhs];
-
- wait_on_buffer(tbh);
- if (unlikely(!buffer_uptodate(tbh))) {
- err = -EIO;
- /*
- * Set the buffer uptodate so the page and buffer
- * states do not become out of sync.
- */
- if (PageUptodate(page))
- set_buffer_uptodate(tbh);
- }
- }
- /* If @sync, now synchronize the mft mirror. */
- if (sync && ni->mft_no < vol->mftmirr_size)
- ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
- /* Remove the mst protection fixups again. */
- post_write_mst_fixup((NTFS_RECORD*)m);
- flush_dcache_mft_record_page(ni);
- if (unlikely(err)) {
- /* I/O error during writing. This is really bad! */
- ntfs_error(vol->sb, "I/O error while writing mft record "
- "0x%lx! Marking base inode as bad. You "
- "should unmount the volume and run chkdsk.",
- ni->mft_no);
- goto err_out;
- }
-done:
- ntfs_debug("Done.");
- return 0;
-cleanup_out:
- /* Clean the buffers. */
- for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
- clear_buffer_dirty(bhs[i_bhs]);
-err_out:
- /*
- * Current state: all buffers are clean, unlocked, and uptodate.
- * The caller should mark the base inode as bad so that no more i/o
- * happens. ->clear_inode() will still be invoked so all extent inodes
- * and other allocated memory will be freed.
- */
- if (err == -ENOMEM) {
- ntfs_error(vol->sb, "Not enough memory to write mft record. "
- "Redirtying so the write is retried later.");
- mark_mft_record_dirty(ni);
- err = 0;
- } else
- NVolSetErrors(vol);
- return err;
-}
-
-/**
- * ntfs_may_write_mft_record - check if an mft record may be written out
- * @vol: [IN] ntfs volume on which the mft record to check resides
- * @mft_no: [IN] mft record number of the mft record to check
- * @m: [IN] mapped mft record to check
- * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned
- *
- * Check if the mapped (base or extent) mft record @m with mft record number
- * @mft_no belonging to the ntfs volume @vol may be written out. If necessary
- * and possible the ntfs inode of the mft record is locked and the base vfs
- * inode is pinned. The locked ntfs inode is then returned in @locked_ni. The
- * caller is responsible for unlocking the ntfs inode and unpinning the base
- * vfs inode.
- *
- * Return 'true' if the mft record may be written out and 'false' if not.
- *
- * The caller has locked the page and cleared the uptodate flag on it which
- * means that we can safely write out any dirty mft records that do not have
- * their inodes in icache as determined by ilookup5() as anyone
- * opening/creating such an inode would block when attempting to map the mft
- * record in read_cache_page() until we are finished with the write out.
- *
- * Here is a description of the tests we perform:
- *
- * If the inode is found in icache we know the mft record must be a base mft
- * record. If it is dirty, we do not write it and return 'false' as the vfs
- * inode write paths will result in the access times being updated which would
- * cause the base mft record to be redirtied and written out again. (We know
- * the access time update will modify the base mft record because Windows
- * chkdsk complains if the standard information attribute is not in the base
- * mft record.)
- *
- * If the inode is in icache and not dirty, we attempt to lock the mft record
- * and if we find the lock was already taken, it is not safe to write the mft
- * record and we return 'false'.
- *
- * If we manage to obtain the lock we have exclusive access to the mft record,
- * which also allows us safe writeout of the mft record. We then set
- * @locked_ni to the locked ntfs inode and return 'true'.
- *
- * Note we cannot just lock the mft record and sleep while waiting for the lock
- * because this would deadlock due to lock reversal (normally the mft record is
- * locked before the page is locked but we already have the page locked here
- * when we try to lock the mft record).
- *
- * If the inode is not in icache we need to perform further checks.
- *
- * If the mft record is not a FILE record or it is a base mft record, we can
- * safely write it and return 'true'.
- *
- * We now know the mft record is an extent mft record. We check if the inode
- * corresponding to its base mft record is in icache and obtain a reference to
- * it if it is. If it is not, we can safely write it and return 'true'.
- *
- * We now have the base inode for the extent mft record. We check if it has an
- * ntfs inode for the extent mft record attached and if not it is safe to write
- * the extent mft record and we return 'true'.
- *
- * The ntfs inode for the extent mft record is attached to the base inode so we
- * attempt to lock the extent mft record and if we find the lock was already
- * taken, it is not safe to write the extent mft record and we return 'false'.
- *
- * If we manage to obtain the lock we have exclusive access to the extent mft
- * record, which also allows us safe writeout of the extent mft record. We
- * set the ntfs inode of the extent mft record clean and then set @locked_ni to
- * the now locked ntfs inode and return 'true'.
- *
- * Note, the reason for actually writing dirty mft records here and not just
- * relying on the vfs inode dirty code paths is that we can have mft records
- * modified without them ever having actual inodes in memory. Also we can have
- * dirty mft records with clean ntfs inodes in memory. None of the described
- * cases would result in the dirty mft records being written out if we only
- * relied on the vfs inode dirty code paths. And these cases can really occur
- * during allocation of new mft records and in particular when the
- * initialized_size of the $MFT/$DATA attribute is extended and the new space
- * is initialized using ntfs_mft_record_format(). The clean inode can then
- * appear if the mft record is reused for a new inode before it got written
- * out.
- */
-bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
- const MFT_RECORD *m, ntfs_inode **locked_ni)
-{
- struct super_block *sb = vol->sb;
- struct inode *mft_vi = vol->mft_ino;
- struct inode *vi;
- ntfs_inode *ni, *eni, **extent_nis;
- int i;
- ntfs_attr na; /* Lookup key handed to ntfs_test_inode() below. */
-
- ntfs_debug("Entering for inode 0x%lx.", mft_no);
- /*
- * Normally we do not return a locked inode so set @locked_ni to NULL.
- * It is only overwritten on the two paths below that return 'true'
- * with a mft record lock held.
- */
- BUG_ON(!locked_ni);
- *locked_ni = NULL;
- /*
- * Check if the inode corresponding to this mft record is in the VFS
- * inode cache and obtain a reference to it if it is.
- */
- ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
- na.mft_no = mft_no;
- na.name = NULL;
- na.name_len = 0;
- na.type = AT_UNUSED;
- /*
- * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
- * we get here for it rather often.
- */
- if (!mft_no) {
- /* Balance the below iput(). */
- vi = igrab(mft_vi);
- BUG_ON(vi != mft_vi);
- } else {
- /*
- * Have to use ilookup5_nowait() since ilookup5() waits for the
- * inode lock which causes ntfs to deadlock when a concurrent
- * inode write via the inode dirty code paths and the page
- * dirty code path of the inode dirty code path when writing
- * $MFT occurs.
- */
- vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na);
- }
- if (vi) {
- ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
- /* The inode is in icache. */
- ni = NTFS_I(vi);
- /* Take a reference to the ntfs inode. */
- atomic_inc(&ni->count);
- /* If the inode is dirty, do not write this record. */
- if (NInoDirty(ni)) {
- ntfs_debug("Inode 0x%lx is dirty, do not write it.",
- mft_no);
- atomic_dec(&ni->count);
- iput(vi);
- return false;
- }
- ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
- /* The inode is not dirty, try to take the mft record lock. */
- if (unlikely(!mutex_trylock(&ni->mrec_lock))) {
- ntfs_debug("Mft record 0x%lx is already locked, do "
- "not write it.", mft_no);
- atomic_dec(&ni->count);
- iput(vi);
- return false;
- }
- ntfs_debug("Managed to lock mft record 0x%lx, write it.",
- mft_no);
- /*
- * The write has to occur while we hold the mft record lock so
- * return the locked ntfs inode.
- *
- * Neither the ni->count reference nor the reference on @vi
- * taken above is dropped on this path; they are still held
- * when we return, together with ni->mrec_lock.
- */
- *locked_ni = ni;
- return true;
- }
- ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
- /* The inode is not in icache. */
- /* Write the record if it is not a mft record (type "FILE"). */
- if (!ntfs_is_mft_record(m->magic)) {
- ntfs_debug("Mft record 0x%lx is not a FILE record, write it.",
- mft_no);
- return true;
- }
- /* Write the mft record if it is a base inode. */
- if (!m->base_mft_record) {
- ntfs_debug("Mft record 0x%lx is a base record, write it.",
- mft_no);
- return true;
- }
- /*
- * This is an extent mft record. Check if the inode corresponding to
- * its base mft record is in icache and obtain a reference to it if it
- * is.
- */
- na.mft_no = MREF_LE(m->base_mft_record); /* Re-key @na on the base record. */
- ntfs_debug("Mft record 0x%lx is an extent record. Looking for base "
- "inode 0x%lx in icache.", mft_no, na.mft_no);
- if (!na.mft_no) {
- /* Balance the below iput(). */
- vi = igrab(mft_vi);
- BUG_ON(vi != mft_vi);
- } else
- vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode,
- &na);
- if (!vi) {
- /*
- * The base inode is not in icache, write this extent mft
- * record.
- */
- ntfs_debug("Base inode 0x%lx is not in icache, write the "
- "extent record.", na.mft_no);
- return true;
- }
- ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
- /*
- * The base inode is in icache. Check if it has the extent inode
- * corresponding to this extent mft record attached.
- */
- ni = NTFS_I(vi);
- mutex_lock(&ni->extent_lock); /* Held while the extent array is examined. */
- if (ni->nr_extents <= 0) {
- /*
- * The base inode has no attached extent inodes, write this
- * extent mft record.
- */
- mutex_unlock(&ni->extent_lock);
- iput(vi);
- ntfs_debug("Base inode 0x%lx has no attached extent inodes, "
- "write the extent record.", na.mft_no);
- return true;
- }
- /* Iterate over the attached extent inodes. */
- extent_nis = ni->ext.extent_ntfs_inos;
- for (eni = NULL, i = 0; i < ni->nr_extents; ++i) {
- if (mft_no == extent_nis[i]->mft_no) {
- /*
- * Found the extent inode corresponding to this extent
- * mft record.
- */
- eni = extent_nis[i];
- break;
- }
- }
- /*
- * If the extent inode was not attached to the base inode, write this
- * extent mft record.
- */
- if (!eni) {
- mutex_unlock(&ni->extent_lock);
- iput(vi);
- ntfs_debug("Extent inode 0x%lx is not attached to its base "
- "inode 0x%lx, write the extent record.",
- mft_no, na.mft_no);
- return true;
- }
- ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.",
- mft_no, na.mft_no);
- /* Take a reference to the extent ntfs inode. */
- atomic_inc(&eni->count);
- mutex_unlock(&ni->extent_lock);
- /*
- * Found the extent inode corresponding to this extent mft record.
- * Try to take the mft record lock.
- */
- if (unlikely(!mutex_trylock(&eni->mrec_lock))) {
- atomic_dec(&eni->count);
- iput(vi);
- ntfs_debug("Extent mft record 0x%lx is already locked, do "
- "not write it.", mft_no);
- return false;
- }
- ntfs_debug("Managed to lock extent mft record 0x%lx, write it.",
- mft_no);
- if (NInoTestClearDirty(eni))
- ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.",
- mft_no);
- /*
- * The write has to occur while we hold the mft record lock so return
- * the locked extent ntfs inode.
- *
- * The eni->count reference and the reference on the base vfs inode
- * @vi are not dropped on this path; they are still held on return,
- * together with eni->mrec_lock.
- */
- *locked_ni = eni;
- return true;
-}
-
-static const char *es = " Leaving inconsistent metadata. Unmount and run "
- "chkdsk."; /* Suffix appended through "%s" to ntfs_error() messages below. */
-
-/**
- * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name
- * @vol: volume on which to search for a free mft record
- * @base_ni: open base inode if allocating an extent mft record or NULL
- *
- * Search for a free mft record in the mft bitmap attribute on the ntfs volume
- * @vol.
- *
- * If @base_ni is NULL start the search at the default allocator position.
- *
- * If @base_ni is not NULL start the search at the mft record after the base
- * mft record @base_ni.
- *
- * Return the free mft record on success and -errno on error. An error code of
- * -ENOSPC means that there are no free mft records in the currently
- * initialized mft bitmap.
- *
- * Locking: Caller must hold vol->mftbmp_lock for writing.
- */
-static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
- ntfs_inode *base_ni)
-{
- s64 pass_end, ll, data_pos, pass_start, ofs, bit;
- unsigned long flags;
- struct address_space *mftbmp_mapping;
- u8 *buf, *byte;
- struct page *page;
- unsigned int page_ofs, size;
- u8 pass, b;
-
- ntfs_debug("Searching for free mft record in the currently "
- "initialized mft bitmap.");
- mftbmp_mapping = vol->mftbmp_ino->i_mapping;
- /*
- * Set the end of the pass making sure we do not overflow the mft
- * bitmap.
- *
- * pass_end ends up as the smaller of the allocated size of the mft
- * inode counted in mft records and the initialized size of the mft
- * bitmap inode counted in bits.
- */
- read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags);
- pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
- vol->mft_record_size_bits;
- read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags);
- read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
- ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3; /* Bytes -> bits. */
- read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
- if (pass_end > ll)
- pass_end = ll;
- pass = 1;
- if (!base_ni)
- data_pos = vol->mft_data_pos;
- else
- data_pos = base_ni->mft_no + 1;
- if (data_pos < 24) /* Never search below record 24 (presumably reserved records -- confirm). */
- data_pos = 24;
- if (data_pos >= pass_end) {
- data_pos = 24;
- pass = 2;
- /* This happens on a freshly formatted volume. */
- if (data_pos >= pass_end)
- return -ENOSPC;
- }
- pass_start = data_pos;
- ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, "
- "pass_end 0x%llx, data_pos 0x%llx.", pass,
- (long long)pass_start, (long long)pass_end,
- (long long)data_pos);
- /* Loop until a free mft record is found. */
- for (; pass <= 2;) {
- /* Cap size to pass_end. */
- ofs = data_pos >> 3; /* Byte offset into the bitmap. */
- page_ofs = ofs & ~PAGE_MASK; /* Byte offset within the page. */
- size = PAGE_SIZE - page_ofs;
- ll = ((pass_end + 7) >> 3) - ofs; /* Bytes left until pass_end, rounded up. */
- if (size > ll)
- size = ll;
- size <<= 3; /* Bytes -> bits. */
- /*
- * If we are still within the active pass, search the next page
- * for a zero bit.
- */
- if (size) {
- page = ntfs_map_page(mftbmp_mapping,
- ofs >> PAGE_SHIFT);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to read mft "
- "bitmap, aborting.");
- return PTR_ERR(page);
- }
- buf = (u8*)page_address(page) + page_ofs;
- bit = data_pos & 7; /* Starting bit inside the first byte. */
- data_pos &= ~7ull; /* Byte-align; @bit now carries the remainder. */
- ntfs_debug("Before inner for loop: size 0x%x, "
- "data_pos 0x%llx, bit 0x%llx", size,
- (long long)data_pos, (long long)bit);
- for (; bit < size && data_pos + bit < pass_end;
- bit &= ~7ull, bit += 8) {
- byte = buf + (bit >> 3);
- if (*byte == 0xff) /* All eight records in use. */
- continue;
- b = ffz((unsigned long)*byte); /* Index of the first zero bit. */
- if (b < 8 && b >= (bit & 7)) {
- ll = data_pos + (bit & ~7ull) + b; /* Candidate mft record number. */
- if (unlikely(ll > (1ll << 32))) { /* NOTE(review): '>' here, while the mft data extension check in this file uses '>= (1ll << 32)' -- confirm. */
- ntfs_unmap_page(page);
- return -ENOSPC;
- }
- *byte |= 1 << b; /* Mark the record as allocated. */
- flush_dcache_page(page);
- set_page_dirty(page);
- ntfs_unmap_page(page);
- ntfs_debug("Done. (Found and "
- "allocated mft record "
- "0x%llx.)",
- (long long)ll);
- return ll; /* NOTE(review): s64 value returned through the int return type -- confirm intended. */
- }
- }
- ntfs_debug("After inner for loop: size 0x%x, "
- "data_pos 0x%llx, bit 0x%llx", size,
- (long long)data_pos, (long long)bit);
- data_pos += size;
- ntfs_unmap_page(page);
- /*
- * If the end of the pass has not been reached yet,
- * continue searching the mft bitmap for a zero bit.
- */
- if (data_pos < pass_end)
- continue;
- }
- /* Do the next pass. */
- if (++pass == 2) {
- /*
- * Starting the second pass, in which we scan the first
- * part of the zone which we omitted earlier.
- */
- pass_end = pass_start;
- data_pos = pass_start = 24;
- ntfs_debug("pass %i, pass_start 0x%llx, pass_end "
- "0x%llx.", pass, (long long)pass_start,
- (long long)pass_end);
- if (data_pos >= pass_end)
- break;
- }
- }
- /* No free mft records in currently initialized mft bitmap. */
- ntfs_debug("Done. (No free mft records left in currently initialized "
- "mft bitmap.)");
- return -ENOSPC;
-}
-
-/**
- * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
- * @vol: volume on which to extend the mft bitmap attribute
- *
- * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster.
- *
- * Note: Only changes allocated_size, i.e. does not touch initialized_size or
- * data_size.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - Caller must hold vol->mftbmp_lock for writing.
- * - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for
- * writing and releases it before returning.
- * - This function takes vol->lcnbmp_lock for writing and releases it
- * before returning.
- */
-static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
-{
- LCN lcn;
- s64 ll; /* Scratch value; reused for several unrelated quantities below. */
- unsigned long flags;
- struct page *page;
- ntfs_inode *mft_ni, *mftbmp_ni;
- runlist_element *rl, *rl2 = NULL;
- ntfs_attr_search_ctx *ctx = NULL;
- MFT_RECORD *mrec;
- ATTR_RECORD *a = NULL;
- int ret, mp_size;
- u32 old_alen = 0;
- u8 *b, tb;
- struct {
- u8 added_cluster:1; /* The last run was grown by one cluster. */
- u8 added_run:1; /* A newly allocated run was merged into the runlist. */
- u8 mp_rebuilt:1; /* The attribute record was resized for new mapping pairs. */
- } status = { 0, 0, 0 };
-
- ntfs_debug("Extending mft bitmap allocation.");
- mft_ni = NTFS_I(vol->mft_ino);
- mftbmp_ni = NTFS_I(vol->mftbmp_ino);
- /*
- * Determine the last lcn of the mft bitmap. The allocated size of the
- * mft bitmap cannot be zero so we are ok to do this.
- */
- down_write(&mftbmp_ni->runlist.lock);
- read_lock_irqsave(&mftbmp_ni->size_lock, flags);
- ll = mftbmp_ni->allocated_size;
- read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
- rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
- (ll - 1) >> vol->cluster_size_bits, NULL);
- if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
- up_write(&mftbmp_ni->runlist.lock);
- ntfs_error(vol->sb, "Failed to determine last allocated "
- "cluster of mft bitmap attribute.");
- if (!IS_ERR(rl))
- ret = -EIO;
- else
- ret = PTR_ERR(rl);
- return ret;
- }
- lcn = rl->lcn + rl->length; /* First cluster after the last run. */
- ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
- (long long)lcn);
- /*
- * Attempt to get the cluster following the last allocated cluster by
- * hand as it may be in the MFT zone so the allocator would not give it
- * to us.
- */
- ll = lcn >> 3; /* Byte offset of @lcn's bit in the lcn bitmap. */
- page = ntfs_map_page(vol->lcnbmp_ino->i_mapping,
- ll >> PAGE_SHIFT);
- if (IS_ERR(page)) {
- up_write(&mftbmp_ni->runlist.lock);
- ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
- return PTR_ERR(page);
- }
- b = (u8*)page_address(page) + (ll & ~PAGE_MASK);
- tb = 1 << (lcn & 7ull); /* Mask of @lcn's bit within byte *b. */
- down_write(&vol->lcnbmp_lock);
- if (*b != 0xff && !(*b & tb)) {
- /* Next cluster is free, allocate it. */
- *b |= tb;
- flush_dcache_page(page);
- set_page_dirty(page);
- up_write(&vol->lcnbmp_lock);
- ntfs_unmap_page(page);
- /* Update the mft bitmap runlist. */
- rl->length++;
- rl[1].vcn++;
- status.added_cluster = 1;
- ntfs_debug("Appending one cluster to mft bitmap.");
- } else {
- up_write(&vol->lcnbmp_lock);
- ntfs_unmap_page(page);
- /* Allocate a cluster from the DATA_ZONE. */
- rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
- true);
- if (IS_ERR(rl2)) {
- up_write(&mftbmp_ni->runlist.lock);
- ntfs_error(vol->sb, "Failed to allocate a cluster for "
- "the mft bitmap.");
- return PTR_ERR(rl2);
- }
- rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2);
- if (IS_ERR(rl)) {
- up_write(&mftbmp_ni->runlist.lock);
- ntfs_error(vol->sb, "Failed to merge runlists for mft "
- "bitmap.");
- if (ntfs_cluster_free_from_rl(vol, rl2)) {
- ntfs_error(vol->sb, "Failed to deallocate "
- "allocated cluster.%s", es);
- NVolSetErrors(vol);
- }
- ntfs_free(rl2);
- return PTR_ERR(rl);
- }
- mftbmp_ni->runlist.rl = rl;
- status.added_run = 1;
- ntfs_debug("Adding one run to mft bitmap.");
- /* Find the last run in the new runlist. */
- for (; rl[1].length; rl++)
- ;
- }
- /*
- * Update the attribute record as well. Note: @rl is the last
- * (non-terminator) runlist element of mft bitmap.
- *
- * From here on every failure goes through undo_alloc, which uses
- * @status to decide what has to be rolled back.
- */
- mrec = map_mft_record(mft_ni);
- if (IS_ERR(mrec)) {
- ntfs_error(vol->sb, "Failed to map mft record.");
- ret = PTR_ERR(mrec);
- goto undo_alloc;
- }
- ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
- if (unlikely(!ctx)) {
- ntfs_error(vol->sb, "Failed to get search context.");
- ret = -ENOMEM;
- goto undo_alloc;
- }
- ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
- mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
- 0, ctx);
- if (unlikely(ret)) {
- ntfs_error(vol->sb, "Failed to find last attribute extent of "
- "mft bitmap attribute.");
- if (ret == -ENOENT)
- ret = -EIO;
- goto undo_alloc;
- }
- a = ctx->attr;
- ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); /* @ll is now the extent's lowest vcn. */
- /* Search back for the previous last allocated cluster of mft bitmap. */
- for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) {
- if (ll >= rl2->vcn)
- break;
- }
- BUG_ON(ll < rl2->vcn);
- BUG_ON(ll >= rl2->vcn + rl2->length);
- /* Get the size for the new mapping pairs array for this extent. */
- mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
- if (unlikely(mp_size <= 0)) {
- ntfs_error(vol->sb, "Get size for mapping pairs failed for "
- "mft bitmap attribute extent.");
- ret = mp_size;
- if (!ret)
- ret = -EIO;
- goto undo_alloc;
- }
- /* Expand the attribute record if necessary. */
- old_alen = le32_to_cpu(a->length); /* Remembered so undo_alloc can shrink back. */
- ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
- if (unlikely(ret)) {
- if (ret != -ENOSPC) {
- ntfs_error(vol->sb, "Failed to resize attribute "
- "record for mft bitmap attribute.");
- goto undo_alloc;
- }
- // TODO: Deal with this by moving this extent to a new mft
- // record or by starting a new extent in a new mft record or by
- // moving other attributes out of this mft record.
- // Note: It will need to be a special mft record and if none of
- // those are available it gets rather complicated...
- ntfs_error(vol->sb, "Not enough space in this mft record to "
- "accommodate extended mft bitmap attribute "
- "extent. Cannot handle this yet.");
- ret = -EOPNOTSUPP;
- goto undo_alloc;
- }
- status.mp_rebuilt = 1;
- /* Generate the mapping pairs array directly into the attr record. */
- ret = ntfs_mapping_pairs_build(vol, (u8*)a +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
- mp_size, rl2, ll, -1, NULL);
- if (unlikely(ret)) {
- ntfs_error(vol->sb, "Failed to build mapping pairs array for "
- "mft bitmap attribute.");
- goto undo_alloc;
- }
- /* Update the highest_vcn. */
- a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
- /*
- * We now have extended the mft bitmap allocated_size by one cluster.
- * Reflect this in the ntfs_inode structure and the attribute record.
- */
- if (a->data.non_resident.lowest_vcn) {
- /*
- * We are not in the first attribute extent, switch to it, but
- * first ensure the changes will make it to disk later.
- */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_reinit_search_ctx(ctx);
- ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
- mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL,
- 0, ctx);
- if (unlikely(ret)) {
- ntfs_error(vol->sb, "Failed to find first attribute "
- "extent of mft bitmap attribute.");
- goto restore_undo_alloc;
- }
- a = ctx->attr;
- }
- write_lock_irqsave(&mftbmp_ni->size_lock, flags);
- mftbmp_ni->allocated_size += vol->cluster_size;
- a->data.non_resident.allocated_size =
- cpu_to_sle64(mftbmp_ni->allocated_size);
- write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
- /* Ensure the changes make it to disk. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
- up_write(&mftbmp_ni->runlist.lock);
- ntfs_debug("Done.");
- return 0;
-restore_undo_alloc: /* Reached only when the switch to the first extent failed. */
- ntfs_attr_reinit_search_ctx(ctx);
- if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
- mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
- 0, ctx)) {
- ntfs_error(vol->sb, "Failed to find last attribute extent of "
- "mft bitmap attribute.%s", es);
- write_lock_irqsave(&mftbmp_ni->size_lock, flags);
- mftbmp_ni->allocated_size += vol->cluster_size;
- write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
- up_write(&mftbmp_ni->runlist.lock);
- /*
- * The only thing that is now wrong is ->allocated_size of the
- * base attribute extent which chkdsk should be able to fix.
- *
- * Note the allocation itself is kept on this path (nothing is
- * undone) and @ret still holds the error from the failed
- * lookup of the first attribute extent.
- */
- NVolSetErrors(vol);
- return ret;
- }
- a = ctx->attr;
- a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2); /* rl[1].vcn - 1 was stored above; step back by the one added cluster. */
-undo_alloc:
- if (status.added_cluster) {
- /* Truncate the last run in the runlist by one cluster. */
- rl->length--;
- rl[1].vcn--;
- } else if (status.added_run) {
- lcn = rl->lcn; /* The cluster to free is the start of the added run. */
- /* Remove the last run from the runlist. */
- rl->lcn = rl[1].lcn;
- rl->length = 0;
- }
- /* Deallocate the cluster. */
- down_write(&vol->lcnbmp_lock);
- if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
- ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
- NVolSetErrors(vol);
- }
- up_write(&vol->lcnbmp_lock);
- if (status.mp_rebuilt) {
- if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset),
- old_alen - le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset),
- rl2, ll, -1, NULL)) {
- ntfs_error(vol->sb, "Failed to restore mapping pairs "
- "array.%s", es);
- NVolSetErrors(vol);
- }
- if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
- ntfs_error(vol->sb, "Failed to restore attribute "
- "record.%s", es);
- NVolSetErrors(vol);
- }
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- }
- if (ctx) /* NULL if we failed before or while obtaining the search context. */
- ntfs_attr_put_search_ctx(ctx);
- if (!IS_ERR(mrec)) /* Only unmap if map_mft_record() succeeded. */
- unmap_mft_record(mft_ni);
- up_write(&mftbmp_ni->runlist.lock);
- return ret;
-}
-
-/**
- * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
- * @vol: volume on which to extend the mft bitmap attribute
- *
- * Extend the initialized portion of the mft bitmap attribute on the ntfs
- * volume @vol by 8 bytes.
- *
- * Note: Only changes initialized_size and data_size, i.e. requires that
- * allocated_size is big enough to fit the new initialized_size.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: Caller must hold vol->mftbmp_lock for writing.
- */
-static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
-{
- s64 old_data_size, old_initialized_size; /* Saved for the rollback below. */
- unsigned long flags;
- struct inode *mftbmp_vi;
- ntfs_inode *mft_ni, *mftbmp_ni;
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *mrec;
- ATTR_RECORD *a;
- int ret;
-
- ntfs_debug("Extending mft bitmap initiailized (and data) size.");
- mft_ni = NTFS_I(vol->mft_ino);
- mftbmp_vi = vol->mftbmp_ino;
- mftbmp_ni = NTFS_I(mftbmp_vi);
- /* Get the attribute record. */
- mrec = map_mft_record(mft_ni);
- if (IS_ERR(mrec)) {
- ntfs_error(vol->sb, "Failed to map mft record.");
- return PTR_ERR(mrec);
- }
- ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
- if (unlikely(!ctx)) {
- ntfs_error(vol->sb, "Failed to get search context.");
- ret = -ENOMEM;
- goto unm_err_out;
- }
- ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
- mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(ret)) {
- ntfs_error(vol->sb, "Failed to find first attribute extent of "
- "mft bitmap attribute.");
- if (ret == -ENOENT)
- ret = -EIO;
- goto put_err_out;
- }
- a = ctx->attr;
- write_lock_irqsave(&mftbmp_ni->size_lock, flags);
- old_data_size = i_size_read(mftbmp_vi);
- old_initialized_size = mftbmp_ni->initialized_size;
- /*
- * We can simply update the initialized_size before filling the space
- * with zeroes because the caller is holding the mft bitmap lock for
- * writing which ensures that no one else is trying to access the data.
- */
- mftbmp_ni->initialized_size += 8;
- a->data.non_resident.initialized_size =
- cpu_to_sle64(mftbmp_ni->initialized_size);
- if (mftbmp_ni->initialized_size > old_data_size) { /* Grow data_size only if needed. */
- i_size_write(mftbmp_vi, mftbmp_ni->initialized_size);
- a->data.non_resident.data_size =
- cpu_to_sle64(mftbmp_ni->initialized_size);
- }
- write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
- /* Ensure the changes make it to disk. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
- /* Initialize the mft bitmap attribute value with zeroes. */
- ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
- if (likely(!ret)) {
- ntfs_debug("Done. (Wrote eight initialized bytes to mft "
- "bitmap.");
- return 0;
- }
- ntfs_error(vol->sb, "Failed to write to mft bitmap.");
- /* Try to recover from the error. @ret keeps the ntfs_attr_set() error. */
- mrec = map_mft_record(mft_ni);
- if (IS_ERR(mrec)) {
- ntfs_error(vol->sb, "Failed to map mft record.%s", es);
- NVolSetErrors(vol);
- return ret;
- }
- ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
- if (unlikely(!ctx)) {
- ntfs_error(vol->sb, "Failed to get search context.%s", es);
- NVolSetErrors(vol);
- goto unm_err_out;
- }
- if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
- mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) {
- ntfs_error(vol->sb, "Failed to find first attribute extent of "
- "mft bitmap attribute.%s", es);
- NVolSetErrors(vol);
-put_err_out: /* Also a goto target of the first lookup failure above. */
- ntfs_attr_put_search_ctx(ctx);
-unm_err_out: /* Also a goto target of both search context failures above. */
- unmap_mft_record(mft_ni);
- goto err_out;
- }
- a = ctx->attr;
- write_lock_irqsave(&mftbmp_ni->size_lock, flags);
- mftbmp_ni->initialized_size = old_initialized_size; /* Roll the sizes back. */
- a->data.non_resident.initialized_size =
- cpu_to_sle64(old_initialized_size);
- if (i_size_read(mftbmp_vi) != old_data_size) {
- i_size_write(mftbmp_vi, old_data_size);
- a->data.non_resident.data_size = cpu_to_sle64(old_data_size);
- }
- write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
-#ifdef DEBUG
- read_lock_irqsave(&mftbmp_ni->size_lock, flags);
- ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
- "data_size 0x%llx, initialized_size 0x%llx.",
- (long long)mftbmp_ni->allocated_size,
- (long long)i_size_read(mftbmp_vi),
- (long long)mftbmp_ni->initialized_size);
- read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-#endif /* DEBUG */
-err_out:
- return ret;
-}
-
-/**
- * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
- * @vol: volume on which to extend the mft data attribute
- *
- * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
- * worth of clusters or if not enough space for this by one mft record worth
- * of clusters.
- *
- * Note: Only changes allocated_size, i.e. does not touch initialized_size or
- * data_size.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - Caller must hold vol->mftbmp_lock for writing.
- * - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
- * writing and releases it before returning.
- * - This function calls functions which take vol->lcnbmp_lock for
- * writing and release it before returning.
- */
-static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
-{
- LCN lcn;
- VCN old_last_vcn;
- s64 min_nr, nr, ll;
- unsigned long flags;
- ntfs_inode *mft_ni;
- runlist_element *rl, *rl2;
- ntfs_attr_search_ctx *ctx = NULL;
- MFT_RECORD *mrec;
- ATTR_RECORD *a = NULL;
- int ret, mp_size;
- u32 old_alen = 0;
- bool mp_rebuilt = false;
-
- ntfs_debug("Extending mft data allocation.");
- mft_ni = NTFS_I(vol->mft_ino);
- /*
- * Determine the preferred allocation location, i.e. the last lcn of
- * the mft data attribute. The allocated size of the mft data
- * attribute cannot be zero so we are ok to do this.
- */
- down_write(&mft_ni->runlist.lock);
- read_lock_irqsave(&mft_ni->size_lock, flags);
- ll = mft_ni->allocated_size;
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- rl = ntfs_attr_find_vcn_nolock(mft_ni,
- (ll - 1) >> vol->cluster_size_bits, NULL);
- if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
- up_write(&mft_ni->runlist.lock);
- ntfs_error(vol->sb, "Failed to determine last allocated "
- "cluster of mft data attribute.");
- if (!IS_ERR(rl))
- ret = -EIO;
- else
- ret = PTR_ERR(rl);
- return ret;
- }
- lcn = rl->lcn + rl->length;
- ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn);
- /* Minimum allocation is one mft record worth of clusters. */
- min_nr = vol->mft_record_size >> vol->cluster_size_bits;
- if (!min_nr)
- min_nr = 1;
- /* Want to allocate 16 mft records worth of clusters. */
- nr = vol->mft_record_size << 4 >> vol->cluster_size_bits;
- if (!nr)
- nr = min_nr;
- /* Ensure we do not go above 2^32-1 mft records. */
- read_lock_irqsave(&mft_ni->size_lock, flags);
- ll = mft_ni->allocated_size;
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
- vol->mft_record_size_bits >= (1ll << 32))) {
- nr = min_nr;
- if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
- vol->mft_record_size_bits >= (1ll << 32))) {
- ntfs_warning(vol->sb, "Cannot allocate mft record "
- "because the maximum number of inodes "
- "(2^32) has already been reached.");
- up_write(&mft_ni->runlist.lock);
- return -ENOSPC;
- }
- }
- ntfs_debug("Trying mft data allocation with %s cluster count %lli.",
- nr > min_nr ? "default" : "minimal", (long long)nr);
- old_last_vcn = rl[1].vcn;
- do {
- rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
- true);
- if (!IS_ERR(rl2))
- break;
- if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
- ntfs_error(vol->sb, "Failed to allocate the minimal "
- "number of clusters (%lli) for the "
- "mft data attribute.", (long long)nr);
- up_write(&mft_ni->runlist.lock);
- return PTR_ERR(rl2);
- }
- /*
- * There is not enough space to do the allocation, but there
- * might be enough space to do a minimal allocation so try that
- * before failing.
- */
- nr = min_nr;
- ntfs_debug("Retrying mft data allocation with minimal cluster "
- "count %lli.", (long long)nr);
- } while (1);
- rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2);
- if (IS_ERR(rl)) {
- up_write(&mft_ni->runlist.lock);
- ntfs_error(vol->sb, "Failed to merge runlists for mft data "
- "attribute.");
- if (ntfs_cluster_free_from_rl(vol, rl2)) {
- ntfs_error(vol->sb, "Failed to deallocate clusters "
- "from the mft data attribute.%s", es);
- NVolSetErrors(vol);
- }
- ntfs_free(rl2);
- return PTR_ERR(rl);
- }
- mft_ni->runlist.rl = rl;
- ntfs_debug("Allocated %lli clusters.", (long long)nr);
- /* Find the last run in the new runlist. */
- for (; rl[1].length; rl++)
- ;
- /* Update the attribute record as well. */
- mrec = map_mft_record(mft_ni);
- if (IS_ERR(mrec)) {
- ntfs_error(vol->sb, "Failed to map mft record.");
- ret = PTR_ERR(mrec);
- goto undo_alloc;
- }
- ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
- if (unlikely(!ctx)) {
- ntfs_error(vol->sb, "Failed to get search context.");
- ret = -ENOMEM;
- goto undo_alloc;
- }
- ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
- CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx);
- if (unlikely(ret)) {
- ntfs_error(vol->sb, "Failed to find last attribute extent of "
- "mft data attribute.");
- if (ret == -ENOENT)
- ret = -EIO;
- goto undo_alloc;
- }
- a = ctx->attr;
- ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
- /* Search back for the runlist element containing the lowest vcn of this mft data attribute extent. */
- for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
- if (ll >= rl2->vcn)
- break;
- }
- BUG_ON(ll < rl2->vcn);
- BUG_ON(ll >= rl2->vcn + rl2->length);
- /* Get the size for the new mapping pairs array for this extent. */
- mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
- if (unlikely(mp_size <= 0)) {
- ntfs_error(vol->sb, "Get size for mapping pairs failed for "
- "mft data attribute extent.");
- ret = mp_size;
- if (!ret)
- ret = -EIO;
- goto undo_alloc;
- }
- /* Expand the attribute record if necessary. */
- old_alen = le32_to_cpu(a->length);
- ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
- if (unlikely(ret)) {
- if (ret != -ENOSPC) {
- ntfs_error(vol->sb, "Failed to resize attribute "
- "record for mft data attribute.");
- goto undo_alloc;
- }
- // TODO: Deal with this by moving this extent to a new mft
- // record or by starting a new extent in a new mft record or by
- // moving other attributes out of this mft record.
- // Note: Use the special reserved mft records and ensure that
- // this extent is not required to find the mft record in
- // question. If no free special records left we would need to
- // move an existing record away, insert ours in its place, and
- // then place the moved record into the newly allocated space
- // and we would then need to update all references to this mft
- // record appropriately. This is rather complicated...
- ntfs_error(vol->sb, "Not enough space in this mft record to "
- "accommodate extended mft data attribute "
- "extent. Cannot handle this yet.");
- ret = -EOPNOTSUPP;
- goto undo_alloc;
- }
- mp_rebuilt = true;
- /* Generate the mapping pairs array directly into the attr record. */
- ret = ntfs_mapping_pairs_build(vol, (u8*)a +
- le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
- mp_size, rl2, ll, -1, NULL);
- if (unlikely(ret)) {
- ntfs_error(vol->sb, "Failed to build mapping pairs array of "
- "mft data attribute.");
- goto undo_alloc;
- }
- /* Update the highest_vcn. */
- a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
- /*
- * We now have extended the mft data allocated_size by nr clusters.
- * Reflect this in the ntfs_inode structure and the attribute record.
- * @rl is the last (non-terminator) runlist element of mft data
- * attribute.
- */
- if (a->data.non_resident.lowest_vcn) {
- /*
- * We are not in the first attribute extent, switch to it, but
- * first ensure the changes will make it to disk later.
- */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_reinit_search_ctx(ctx);
- ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name,
- mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0,
- ctx);
- if (unlikely(ret)) {
- ntfs_error(vol->sb, "Failed to find first attribute "
- "extent of mft data attribute.");
- goto restore_undo_alloc;
- }
- a = ctx->attr;
- }
- write_lock_irqsave(&mft_ni->size_lock, flags);
- mft_ni->allocated_size += nr << vol->cluster_size_bits;
- a->data.non_resident.allocated_size =
- cpu_to_sle64(mft_ni->allocated_size);
- write_unlock_irqrestore(&mft_ni->size_lock, flags);
- /* Ensure the changes make it to disk. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
- up_write(&mft_ni->runlist.lock);
- ntfs_debug("Done.");
- return 0;
-restore_undo_alloc:
- ntfs_attr_reinit_search_ctx(ctx);
- if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
- CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
- ntfs_error(vol->sb, "Failed to find last attribute extent of "
- "mft data attribute.%s", es);
- write_lock_irqsave(&mft_ni->size_lock, flags);
- mft_ni->allocated_size += nr << vol->cluster_size_bits;
- write_unlock_irqrestore(&mft_ni->size_lock, flags);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
- up_write(&mft_ni->runlist.lock);
- /*
- * The only thing that is now wrong is ->allocated_size of the
- * base attribute extent which chkdsk should be able to fix.
- */
- NVolSetErrors(vol);
- return ret;
- }
- ctx->attr->data.non_resident.highest_vcn =
- cpu_to_sle64(old_last_vcn - 1);
-undo_alloc:
- if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) {
- ntfs_error(vol->sb, "Failed to free clusters from mft data "
- "attribute.%s", es);
- NVolSetErrors(vol);
- }
-
- if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
- ntfs_error(vol->sb, "Failed to truncate mft data attribute "
- "runlist.%s", es);
- NVolSetErrors(vol);
- }
- if (ctx) {
- a = ctx->attr;
- if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
- if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset),
- old_alen - le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset),
- rl2, ll, -1, NULL)) {
- ntfs_error(vol->sb, "Failed to restore mapping pairs "
- "array.%s", es);
- NVolSetErrors(vol);
- }
- if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
- ntfs_error(vol->sb, "Failed to restore attribute "
- "record.%s", es);
- NVolSetErrors(vol);
- }
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- } else if (IS_ERR(ctx->mrec)) {
- ntfs_error(vol->sb, "Failed to restore attribute search "
- "context.%s", es);
- NVolSetErrors(vol);
- }
- ntfs_attr_put_search_ctx(ctx);
- }
- if (!IS_ERR(mrec))
- unmap_mft_record(mft_ni);
- up_write(&mft_ni->runlist.lock);
- return ret;
-}
-
-/**
- * ntfs_mft_record_layout - layout an mft record into a memory buffer
- * @vol: volume to which the mft record will belong
- * @mft_no: mft reference specifying the mft record number
- * @m: destination buffer of size >= @vol->mft_record_size bytes
- *
- * Layout an empty, unused mft record with the mft record number @mft_no into
- * the buffer @m. The volume @vol is needed because the mft record structure
- * was modified in NTFS 3.1 so we need to know which volume version this mft
- * record will be used on.
- *
- * On success the first @vol->mft_record_size bytes of @m are zeroed before
- * the header fields are set. The resulting record has the FILE magic,
- * sequence number 1, update sequence number 1, link count 0, flags 0 (i.e. it
- * is not marked in use) and contains only the AT_END termination attribute.
- *
- * Return 0 on success and -errno on error. The only failure detected here is
- * -ERANGE, returned when @mft_no is 2^32 or larger; @m is left untouched in
- * that case because the check precedes any write to the buffer.
- */
-static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
- MFT_RECORD *m)
-{
- ATTR_RECORD *a;
-
- ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
- if (mft_no >= (1ll << 32)) {
- ntfs_error(vol->sb, "Mft record number 0x%llx exceeds "
- "maximum of 2^32.", (long long)mft_no);
- return -ERANGE;
- }
- /* Start by clearing the whole mft record to give us a clean slate. */
- memset(m, 0, vol->mft_record_size);
- /*
- * The update sequence array offset is the size of the fixed record
- * header, aligned to a 2-byte boundary. Volumes older than NTFS 3.1
- * use the MFT_RECORD_OLD header, all others the MFT_RECORD header.
- */
- if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver))
- m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1);
- else {
- m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
- /*
- * Set the NTFS 3.1+ specific fields while we know that the
- * volume version is 3.1+.
- */
- m->reserved = 0;
- m->mft_record_number = cpu_to_le32((u32)mft_no);
- }
- m->magic = magic_FILE;
- if (vol->mft_record_size >= NTFS_BLOCK_SIZE)
- m->usa_count = cpu_to_le16(vol->mft_record_size /
- NTFS_BLOCK_SIZE + 1);
- else {
- m->usa_count = cpu_to_le16(1);
- ntfs_warning(vol->sb, "Sector size is bigger than mft record "
- "size. Setting usa_count to 1. If chkdsk "
- "reports this as corruption, please email "
- "linux-ntfs-dev@lists.sourceforge.net stating "
- "that you saw this message and that the "
- "modified filesystem created was corrupt. "
- "Thank you.");
- }
- /*
- * Set the update sequence number, i.e. the 16-bit value stored at
- * offset usa_ofs from the start of the record, to 1.
- */
- *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1);
- m->lsn = 0;
- m->sequence_number = cpu_to_le16(1);
- m->link_count = 0;
- /*
- * Place the attributes straight after the update sequence array,
- * aligned to 8-byte boundary.
- */
- m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
- (le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
- m->flags = 0;
- /*
- * Using attrs_offset plus eight bytes (for the termination attribute).
- * attrs_offset is already aligned to 8-byte boundary, so no need to
- * align again.
- */
- m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8);
- m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
- m->base_mft_record = 0;
- m->next_attr_instance = 0;
- /* Add the termination attribute (type AT_END, length 0) at attrs_offset. */
- a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
- a->type = AT_END;
- a->length = 0;
- ntfs_debug("Done.");
- return 0;
-}
-
-/**
- * ntfs_mft_record_format - format an mft record on an ntfs volume
- * @vol: volume on which to format the mft record
- * @mft_no: mft record number to format
- *
- * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
- * mft record into the appropriate place of the mft data attribute. This is
- * used when extending the mft data attribute.
- *
- * The record position is checked against the current data size (i_size) of
- * @vol->mft_ino and -ENOENT is returned if the record is found to lie outside
- * of it. The page holding the record is obtained with ntfs_map_page() and is
- * locked while ntfs_mft_record_layout() writes the record. On success the
- * record is marked dirty with mark_ntfs_record_dirty() before the page is
- * unmapped again.
- *
- * Return 0 on success and -errno on error (-ENOENT as described above, the
- * error returned by ntfs_map_page(), or the error returned by
- * ntfs_mft_record_layout()).
- */
-static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
-{
- loff_t i_size;
- struct inode *mft_vi = vol->mft_ino;
- struct page *page;
- MFT_RECORD *m;
- pgoff_t index, end_index;
- unsigned int ofs;
- int err;
-
- ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
- /*
- * The index into the page cache and the offset within the page cache
- * page of the wanted mft record.
- */
- index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT;
- ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
- /*
- * The maximum valid index into the page cache for $MFT's data.
- *
- * NOTE(review): when index == end_index the check below uses ">=", so
- * a record whose end coincides exactly with i_size inside the last,
- * partial page is rejected with -ENOENT. Confirm that this is the
- * intended boundary rather than ">".
- */
- i_size = i_size_read(mft_vi);
- end_index = i_size >> PAGE_SHIFT;
- if (unlikely(index >= end_index)) {
- if (unlikely(index > end_index || ofs + vol->mft_record_size >=
- (i_size & ~PAGE_MASK))) {
- ntfs_error(vol->sb, "Tried to format non-existing mft "
- "record 0x%llx.", (long long)mft_no);
- return -ENOENT;
- }
- }
- /* Read, map, and pin the page containing the mft record. */
- page = ntfs_map_page(mft_vi->i_mapping, index);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to map page containing mft record "
- "to format 0x%llx.", (long long)mft_no);
- return PTR_ERR(page);
- }
- lock_page(page);
- BUG_ON(!PageUptodate(page));
- ClearPageUptodate(page); /* set again below, on both the error and the success path */
- m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
- err = ntfs_mft_record_layout(vol, mft_no, m);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
- (long long)mft_no);
- SetPageUptodate(page);
- unlock_page(page);
- ntfs_unmap_page(page);
- return err;
- }
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- /*
- * Make sure the mft record is written out to disk. We could use
- * ilookup5() to check if an inode is in icache and so on but this is
- * unnecessary as ntfs_writepage() will write the dirty record anyway.
- */
- mark_ntfs_record_dirty(page, ofs);
- ntfs_unmap_page(page);
- ntfs_debug("Done.");
- return 0;
-}
-
-/**
- * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
- * @vol: [IN] volume on which to allocate the mft record
- * @mode: [IN] mode if want a file or directory, i.e. base inode or 0
- * @base_ni: [IN] open base inode if allocating an extent mft record or NULL
- * @mrec: [OUT] on successful return this is the mapped mft record
- *
- * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
- *
- * If @base_ni is NULL make the mft record a base mft record, i.e. a file or
- * directory inode, and allocate it at the default allocator position. In
- * this case @mode is the file mode as given to us by the caller. We in
- * particular use @mode to distinguish whether a file or a directory is being
- * created (S_ISREG(mode) and S_ISDIR(mode), respectively).
- *
- * If @base_ni is not NULL make the allocated mft record an extent record,
- * allocate it starting at the mft record after the base mft record and attach
- * the allocated and opened ntfs inode to the base inode @base_ni. In this
- * case @mode must be 0 as it is meaningless for extent inodes.
- *
- * You need to check the return value with IS_ERR(). If false, the function
- * was successful and the return value is the now opened ntfs inode of the
- * allocated mft record. *@mrec is then set to the allocated, mapped, pinned,
- * and locked mft record. If IS_ERR() is true, the function failed and the
- * error code is obtained from PTR_ERR(return value). *@mrec is undefined in
- * this case.
- *
- * Allocation strategy:
- *
- * To find a free mft record, we scan the mft bitmap for a zero bit. To
- * optimize this we start scanning at the place specified by @base_ni or if
- * @base_ni is NULL we start where we last stopped and we perform wrap around
- * when we reach the end. Note, we do not try to allocate mft records below
- * number 24 because numbers 0 to 15 are the defined system files anyway and 16
- * to 23 are special in that they are used for storing extension mft records
- * for the $DATA attribute of $MFT. This is required to avoid the possibility
- * of creating a runlist with a circular dependency which once written to disk
- * can never be read in again. Windows will only use records 16 to 23 for
- * normal files if the volume is completely out of space. We never use them
- * which means that when the volume is really out of space we cannot create any
- * more files while Windows can still create up to 8 small files. We can start
- * doing this at some later time, it does not matter much for now.
- *
- * When scanning the mft bitmap, we only search up to the last allocated mft
- * record. If there are no free records left in the range 24 to number of
- * allocated mft records, then we extend the $MFT/$DATA attribute in order to
- * create free mft records. We extend the allocated size of $MFT/$DATA by 16
- * records at a time or one cluster, if cluster size is above 16kiB. If there
- * is not sufficient space to do this, we try to extend by a single mft record
- * or one cluster, if cluster size is above the mft record size.
- *
- * No matter how many mft records we allocate, we initialize only the first
- * allocated mft record, incrementing mft data size and initialized size
- * accordingly, open an ntfs_inode for it and return it to the caller, unless
- * there are less than 24 mft records, in which case we allocate and initialize
- * mft records until we reach record 24 which we consider as the first free mft
- * record for use by normal files.
- *
- * If during any stage we overflow the initialized data in the mft bitmap, we
- * extend the initialized size (and data size) by 8 bytes, allocating another
- * cluster if required. The bitmap data size has to be at least equal to the
- * number of mft records in the mft, but it can be bigger, in which case the
- * superfluous bits are padded with zeroes.
- *
- * Thus, when we return successfully (IS_ERR() is false), we will have:
- * - initialized / extended the mft bitmap if necessary,
- * - initialized / extended the mft data if necessary,
- * - set the bit corresponding to the mft record being allocated in the
- * mft bitmap,
- * - opened an ntfs_inode for the allocated mft record, and we will have
- * - returned the ntfs_inode as well as the allocated mapped, pinned, and
- * locked mft record.
- *
- * On error, the volume will be left in a consistent state and no record will
- * be allocated. If rolling back a partial operation fails, we may leave some
- * inconsistent metadata in which case we set NVolErrors() so the volume is
- * left dirty when unmounted.
- *
- * Note, this function cannot make use of most of the normal functions, like
- * for example for attribute resizing, etc, because when the run list overflows
- * the base mft record and an attribute list is used, it is very important that
- * the extension mft records used to store the $DATA attribute of $MFT can be
- * reached without having to read the information contained inside them, as
- * this would make it impossible to find them in the first place after the
- * volume is unmounted. $MFT/$BITMAP probably does not need to follow this
- * rule because the bitmap is not essential for finding the mft records, but on
- * the other hand, handling the bitmap in this special way would make life
- * easier because otherwise there might be circular invocations of functions
- * when reading the bitmap.
- */
-ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
- ntfs_inode *base_ni, MFT_RECORD **mrec)
-{
- s64 ll, bit, old_data_initialized, old_data_size;
- unsigned long flags;
- struct inode *vi;
- struct page *page;
- ntfs_inode *mft_ni, *mftbmp_ni, *ni;
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- pgoff_t index;
- unsigned int ofs;
- int err;
- le16 seq_no, usn;
- bool record_formatted = false;
-
- if (base_ni) {
- ntfs_debug("Entering (allocating an extent mft record for "
- "base mft record 0x%llx).",
- (long long)base_ni->mft_no);
- /* @mode and @base_ni are mutually exclusive. */
- BUG_ON(mode);
- } else
- ntfs_debug("Entering (allocating a base mft record).");
- if (mode) {
- /* @mode and @base_ni are mutually exclusive. */
- BUG_ON(base_ni);
- /* We only support creation of normal files and directories. */
- if (!S_ISREG(mode) && !S_ISDIR(mode))
- return ERR_PTR(-EOPNOTSUPP);
- }
- BUG_ON(!mrec);
- mft_ni = NTFS_I(vol->mft_ino);
- mftbmp_ni = NTFS_I(vol->mftbmp_ino);
- down_write(&vol->mftbmp_lock);
- bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni);
- if (bit >= 0) {
- ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
- (long long)bit);
- goto have_alloc_rec;
- }
- if (bit != -ENOSPC) {
- up_write(&vol->mftbmp_lock);
- return ERR_PTR(bit);
- }
- /*
- * No free mft records left. If the mft bitmap already covers more
- * than the currently used mft records, the next records are all free,
- * so we can simply allocate the first unused mft record.
- * Note: We also have to make sure that the mft bitmap at least covers
- * the first 24 mft records as they are special and whilst they may not
- * be in use, we do not allocate from them.
- */
- read_lock_irqsave(&mft_ni->size_lock, flags);
- ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- read_lock_irqsave(&mftbmp_ni->size_lock, flags);
- old_data_initialized = mftbmp_ni->initialized_size;
- read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
- if (old_data_initialized << 3 > ll && old_data_initialized > 3) {
- bit = ll;
- if (bit < 24)
- bit = 24;
- if (unlikely(bit >= (1ll << 32)))
- goto max_err_out;
- ntfs_debug("Found free record (#2), bit 0x%llx.",
- (long long)bit);
- goto found_free_rec;
- }
- /*
- * The mft bitmap needs to be expanded until it covers the first unused
- * mft record that we can allocate.
- * Note: The smallest mft record we allocate is mft record 24.
- */
- bit = old_data_initialized << 3;
- if (unlikely(bit >= (1ll << 32)))
- goto max_err_out;
- read_lock_irqsave(&mftbmp_ni->size_lock, flags);
- old_data_size = mftbmp_ni->allocated_size;
- ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
- "data_size 0x%llx, initialized_size 0x%llx.",
- (long long)old_data_size,
- (long long)i_size_read(vol->mftbmp_ino),
- (long long)old_data_initialized);
- read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
- if (old_data_initialized + 8 > old_data_size) {
- /* Need to extend bitmap by one more cluster. */
- ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
- err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
- if (unlikely(err)) {
- up_write(&vol->mftbmp_lock);
- goto err_out;
- }
-#ifdef DEBUG
- read_lock_irqsave(&mftbmp_ni->size_lock, flags);
- ntfs_debug("Status of mftbmp after allocation extension: "
- "allocated_size 0x%llx, data_size 0x%llx, "
- "initialized_size 0x%llx.",
- (long long)mftbmp_ni->allocated_size,
- (long long)i_size_read(vol->mftbmp_ino),
- (long long)mftbmp_ni->initialized_size);
- read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-#endif /* DEBUG */
- }
- /*
- * We now have sufficient allocated space, extend the initialized_size
- * as well as the data_size if necessary and fill the new space with
- * zeroes.
- */
- err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
- if (unlikely(err)) {
- up_write(&vol->mftbmp_lock);
- goto err_out;
- }
-#ifdef DEBUG
- read_lock_irqsave(&mftbmp_ni->size_lock, flags);
- ntfs_debug("Status of mftbmp after initialized extension: "
- "allocated_size 0x%llx, data_size 0x%llx, "
- "initialized_size 0x%llx.",
- (long long)mftbmp_ni->allocated_size,
- (long long)i_size_read(vol->mftbmp_ino),
- (long long)mftbmp_ni->initialized_size);
- read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-#endif /* DEBUG */
- ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
-found_free_rec:
- /* @bit is the found free mft record, allocate it in the mft bitmap. */
- ntfs_debug("At found_free_rec.");
- err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap.");
- up_write(&vol->mftbmp_lock);
- goto err_out;
- }
- ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit);
-have_alloc_rec:
- /*
- * The mft bitmap is now uptodate. Deal with mft data attribute now.
- * Note, we keep hold of the mft bitmap lock for writing until all
- * modifications to the mft data attribute are complete, too, as they
- * will impact decisions for mft bitmap and mft record allocation done
- * by a parallel allocation and if the lock is not maintained a
- * parallel allocation could allocate the same mft record as this one.
- */
- ll = (bit + 1) << vol->mft_record_size_bits;
- read_lock_irqsave(&mft_ni->size_lock, flags);
- old_data_initialized = mft_ni->initialized_size;
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- if (ll <= old_data_initialized) {
- ntfs_debug("Allocated mft record already initialized.");
- goto mft_rec_already_initialized;
- }
- ntfs_debug("Initializing allocated mft record.");
- /*
- * The mft record is outside the initialized data. Extend the mft data
- * attribute until it covers the allocated record. The loop is only
- * actually traversed more than once when a freshly formatted volume is
- * first written to so it optimizes away nicely in the common case.
- */
- read_lock_irqsave(&mft_ni->size_lock, flags);
- ntfs_debug("Status of mft data before extension: "
- "allocated_size 0x%llx, data_size 0x%llx, "
- "initialized_size 0x%llx.",
- (long long)mft_ni->allocated_size,
- (long long)i_size_read(vol->mft_ino),
- (long long)mft_ni->initialized_size);
- while (ll > mft_ni->allocated_size) {
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- err = ntfs_mft_data_extend_allocation_nolock(vol);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Failed to extend mft data "
- "allocation.");
- goto undo_mftbmp_alloc_nolock;
- }
- read_lock_irqsave(&mft_ni->size_lock, flags);
- ntfs_debug("Status of mft data after allocation extension: "
- "allocated_size 0x%llx, data_size 0x%llx, "
- "initialized_size 0x%llx.",
- (long long)mft_ni->allocated_size,
- (long long)i_size_read(vol->mft_ino),
- (long long)mft_ni->initialized_size);
- }
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- /*
- * Extend mft data initialized size (and data size of course) to reach
- * the allocated mft record, formatting the mft records along the way.
- * Note: We only modify the ntfs_inode structure as that is all that is
- * needed by ntfs_mft_record_format(). We will update the attribute
- * record itself in one fell swoop later on.
- */
- write_lock_irqsave(&mft_ni->size_lock, flags);
- old_data_initialized = mft_ni->initialized_size;
- old_data_size = vol->mft_ino->i_size;
- while (ll > mft_ni->initialized_size) {
- s64 new_initialized_size, mft_no;
-
- new_initialized_size = mft_ni->initialized_size +
- vol->mft_record_size;
- mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
- if (new_initialized_size > i_size_read(vol->mft_ino))
- i_size_write(vol->mft_ino, new_initialized_size);
- write_unlock_irqrestore(&mft_ni->size_lock, flags);
- ntfs_debug("Initializing mft record 0x%llx.",
- (long long)mft_no);
- err = ntfs_mft_record_format(vol, mft_no);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Failed to format mft record.");
- goto undo_data_init;
- }
- write_lock_irqsave(&mft_ni->size_lock, flags);
- mft_ni->initialized_size = new_initialized_size;
- }
- write_unlock_irqrestore(&mft_ni->size_lock, flags);
- record_formatted = true;
- /* Update the mft data attribute record to reflect the new sizes. */
- m = map_mft_record(mft_ni);
- if (IS_ERR(m)) {
- ntfs_error(vol->sb, "Failed to map mft record.");
- err = PTR_ERR(m);
- goto undo_data_init;
- }
- ctx = ntfs_attr_get_search_ctx(mft_ni, m);
- if (unlikely(!ctx)) {
- ntfs_error(vol->sb, "Failed to get search context.");
- err = -ENOMEM;
- unmap_mft_record(mft_ni);
- goto undo_data_init;
- }
- err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Failed to find first attribute extent of "
- "mft data attribute.");
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
- goto undo_data_init;
- }
- a = ctx->attr;
- read_lock_irqsave(&mft_ni->size_lock, flags);
- a->data.non_resident.initialized_size =
- cpu_to_sle64(mft_ni->initialized_size);
- a->data.non_resident.data_size =
- cpu_to_sle64(i_size_read(vol->mft_ino));
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- /* Ensure the changes make it to disk. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(mft_ni);
- read_lock_irqsave(&mft_ni->size_lock, flags);
- ntfs_debug("Status of mft data after mft record initialization: "
- "allocated_size 0x%llx, data_size 0x%llx, "
- "initialized_size 0x%llx.",
- (long long)mft_ni->allocated_size,
- (long long)i_size_read(vol->mft_ino),
- (long long)mft_ni->initialized_size);
- BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size);
- BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino));
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
-mft_rec_already_initialized:
- /*
- * We can finally drop the mft bitmap lock as the mft data attribute
- * has been fully updated. The only disparity left is that the
- * allocated mft record still needs to be marked as in use to match the
- * set bit in the mft bitmap but this is actually not a problem since
- * this mft record is not referenced from anywhere yet and the fact
- * that it is allocated in the mft bitmap means that no-one will try to
- * allocate it either.
- */
- up_write(&vol->mftbmp_lock);
- /*
- * We now have allocated and initialized the mft record. Calculate the
- * index of and the offset within the page cache page the record is in.
- */
- index = bit << vol->mft_record_size_bits >> PAGE_SHIFT;
- ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK;
- /* Read, map, and pin the page containing the mft record. */
- page = ntfs_map_page(vol->mft_ino->i_mapping, index);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to map page containing allocated "
- "mft record 0x%llx.", (long long)bit);
- err = PTR_ERR(page);
- goto undo_mftbmp_alloc;
- }
- lock_page(page);
- BUG_ON(!PageUptodate(page));
- ClearPageUptodate(page);
- m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
- /* If we just formatted the mft record no need to do it again. */
- if (!record_formatted) {
- /* Sanity check that the mft record is really not in use. */
- if (ntfs_is_file_record(m->magic) &&
- (m->flags & MFT_RECORD_IN_USE)) {
- ntfs_error(vol->sb, "Mft record 0x%llx was marked "
- "free in mft bitmap but is marked "
- "used itself. Corrupt filesystem. "
- "Unmount and run chkdsk.",
- (long long)bit);
- err = -EIO;
- SetPageUptodate(page);
- unlock_page(page);
- ntfs_unmap_page(page);
- NVolSetErrors(vol);
- goto undo_mftbmp_alloc;
- }
- /*
- * We need to (re-)format the mft record, preserving the
- * sequence number if it is not zero as well as the update
- * sequence number if it is not zero or -1 (0xffff). This
- * means we do not need to care whether or not something went
- * wrong with the previous mft record.
- */
- seq_no = m->sequence_number;
- usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs));
- err = ntfs_mft_record_layout(vol, bit, m);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Failed to layout allocated mft "
- "record 0x%llx.", (long long)bit);
- SetPageUptodate(page);
- unlock_page(page);
- ntfs_unmap_page(page);
- goto undo_mftbmp_alloc;
- }
- if (seq_no)
- m->sequence_number = seq_no;
- if (usn && le16_to_cpu(usn) != 0xffff)
- *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn;
- }
- /* Set the mft record itself in use. */
- m->flags |= MFT_RECORD_IN_USE;
- if (S_ISDIR(mode))
- m->flags |= MFT_RECORD_IS_DIRECTORY;
- flush_dcache_page(page);
- SetPageUptodate(page);
- if (base_ni) {
- MFT_RECORD *m_tmp;
-
- /*
- * Setup the base mft record in the extent mft record. This
- * completes initialization of the allocated extent mft record
- * and we can simply use it with map_extent_mft_record().
- */
- m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
- base_ni->seq_no);
- /*
- * Allocate an extent inode structure for the new mft record,
- * attach it to the base inode @base_ni and map, pin, and lock
- * its, i.e. the allocated, mft record.
- */
- m_tmp = map_extent_mft_record(base_ni, bit, &ni);
- if (IS_ERR(m_tmp)) {
- ntfs_error(vol->sb, "Failed to map allocated extent "
- "mft record 0x%llx.", (long long)bit);
- err = PTR_ERR(m_tmp);
- /* Set the mft record itself not in use. */
- m->flags &= cpu_to_le16(
- ~le16_to_cpu(MFT_RECORD_IN_USE));
- flush_dcache_page(page);
- /* Make sure the mft record is written out to disk. */
- mark_ntfs_record_dirty(page, ofs);
- unlock_page(page);
- ntfs_unmap_page(page);
- goto undo_mftbmp_alloc;
- }
- BUG_ON(m != m_tmp);
- /*
- * Make sure the allocated mft record is written out to disk.
- * No need to set the inode dirty because the caller is going
- * to do that anyway after finishing with the new extent mft
- * record (e.g. at a minimum a new attribute will be added to
- * the mft record.
- */
- mark_ntfs_record_dirty(page, ofs);
- unlock_page(page);
- /*
- * Need to unmap the page since map_extent_mft_record() mapped
- * it as well so we have it mapped twice at the moment.
- */
- ntfs_unmap_page(page);
- } else {
- /*
- * Allocate a new VFS inode and set it up. NOTE: @vi->i_nlink
- * is set to 1 but the mft record->link_count is 0. The caller
- * needs to bear this in mind.
- */
- vi = new_inode(vol->sb);
- if (unlikely(!vi)) {
- err = -ENOMEM;
- /* Set the mft record itself not in use. */
- m->flags &= cpu_to_le16(
- ~le16_to_cpu(MFT_RECORD_IN_USE));
- flush_dcache_page(page);
- /* Make sure the mft record is written out to disk. */
- mark_ntfs_record_dirty(page, ofs);
- unlock_page(page);
- ntfs_unmap_page(page);
- goto undo_mftbmp_alloc;
- }
- vi->i_ino = bit;
-
- /* The owner and group come from the ntfs volume. */
- vi->i_uid = vol->uid;
- vi->i_gid = vol->gid;
-
- /* Initialize the ntfs specific part of @vi. */
- ntfs_init_big_inode(vi);
- ni = NTFS_I(vi);
- /*
- * Set the appropriate mode, attribute type, and name. For
- * directories, also setup the index values to the defaults.
- */
- if (S_ISDIR(mode)) {
- vi->i_mode = S_IFDIR | S_IRWXUGO;
- vi->i_mode &= ~vol->dmask;
-
- NInoSetMstProtected(ni);
- ni->type = AT_INDEX_ALLOCATION;
- ni->name = I30;
- ni->name_len = 4;
-
- ni->itype.index.block_size = 4096;
- ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1;
- ni->itype.index.collation_rule = COLLATION_FILE_NAME;
- if (vol->cluster_size <= ni->itype.index.block_size) {
- ni->itype.index.vcn_size = vol->cluster_size;
- ni->itype.index.vcn_size_bits =
- vol->cluster_size_bits;
- } else {
- ni->itype.index.vcn_size = vol->sector_size;
- ni->itype.index.vcn_size_bits =
- vol->sector_size_bits;
- }
- } else {
- vi->i_mode = S_IFREG | S_IRWXUGO;
- vi->i_mode &= ~vol->fmask;
-
- ni->type = AT_DATA;
- ni->name = NULL;
- ni->name_len = 0;
- }
- if (IS_RDONLY(vi))
- vi->i_mode &= ~S_IWUGO;
-
- /* Set the inode times to the current time. */
- simple_inode_init_ts(vi);
- /*
- * Set the file size to 0, the ntfs inode sizes are set to 0 by
- * the call to ntfs_init_big_inode() below.
- */
- vi->i_size = 0;
- vi->i_blocks = 0;
-
- /* Set the sequence number. */
- vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
- /*
- * Manually map, pin, and lock the mft record as we already
- * have its page mapped and it is very easy to do.
- */
- atomic_inc(&ni->count);
- mutex_lock(&ni->mrec_lock);
- ni->page = page;
- ni->page_ofs = ofs;
- /*
- * Make sure the allocated mft record is written out to disk.
- * NOTE: We do not set the ntfs inode dirty because this would
- * fail in ntfs_write_inode() because the inode does not have a
- * standard information attribute yet. Also, there is no need
- * to set the inode dirty because the caller is going to do
- * that anyway after finishing with the new mft record (e.g. at
- * a minimum some new attributes will be added to the mft
- * record.
- */
- mark_ntfs_record_dirty(page, ofs);
- unlock_page(page);
-
- /* Add the inode to the inode hash for the superblock. */
- insert_inode_hash(vi);
-
- /* Update the default mft allocation position. */
- vol->mft_data_pos = bit + 1;
- }
- /*
- * Return the opened, allocated inode of the allocated mft record as
- * well as the mapped, pinned, and locked mft record.
- */
- ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
- base_ni ? "extent " : "", (long long)bit);
- *mrec = m;
- return ni;
-undo_data_init:
- write_lock_irqsave(&mft_ni->size_lock, flags);
- mft_ni->initialized_size = old_data_initialized;
- i_size_write(vol->mft_ino, old_data_size);
- write_unlock_irqrestore(&mft_ni->size_lock, flags);
- goto undo_mftbmp_alloc_nolock;
-undo_mftbmp_alloc:
- down_write(&vol->mftbmp_lock);
-undo_mftbmp_alloc_nolock:
- if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) {
- ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
- NVolSetErrors(vol);
- }
- up_write(&vol->mftbmp_lock);
-err_out:
- return ERR_PTR(err);
-max_err_out:
- ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum "
- "number of inodes (2^32) has already been reached.");
- up_write(&vol->mftbmp_lock);
- return ERR_PTR(-ENOSPC);
-}
-
-/**
- * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume
- * @ni: ntfs inode of the mapped extent mft record to free
- * @m: mapped extent mft record of the ntfs inode @ni
- *
- * Free the mapped extent mft record @m of the extent ntfs inode @ni.
- *
- * Note that this function unmaps the mft record and closes and destroys @ni
- * internally and hence you cannot use either @ni nor @m any more after this
- * function returns success.
- *
- * On success return 0 and on error return -errno. @ni and @m are still valid
- * in this case and have not been freed.
- *
- * For some errors an error message is displayed and the success code 0 is
- * returned and the volume is then left dirty on umount. This makes sense in
- * case we could not rollback the changes that were already done since the
- * caller no longer wants to reference this mft record so it does not matter to
- * the caller if something is wrong with it as long as it is properly detached
- * from the base inode.
- */
-int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
-{
- unsigned long mft_no = ni->mft_no;
- ntfs_volume *vol = ni->vol;
- ntfs_inode *base_ni;
- ntfs_inode **extent_nis;
- int i, err;
- le16 old_seq_no;
- u16 seq_no;
-
- BUG_ON(NInoAttr(ni));
- BUG_ON(ni->nr_extents != -1);
-
- mutex_lock(&ni->extent_lock);
- base_ni = ni->ext.base_ntfs_ino;
- mutex_unlock(&ni->extent_lock);
-
- BUG_ON(base_ni->nr_extents <= 0);
-
- ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n",
- mft_no, base_ni->mft_no);
-
- mutex_lock(&base_ni->extent_lock);
-
- /* Make sure we are holding the only reference to the extent inode. */
- if (atomic_read(&ni->count) > 2) {
- ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, "
- "not freeing.", base_ni->mft_no);
- mutex_unlock(&base_ni->extent_lock);
- return -EBUSY;
- }
-
- /* Dissociate the ntfs inode from the base inode. */
- extent_nis = base_ni->ext.extent_ntfs_inos;
- err = -ENOENT;
- for (i = 0; i < base_ni->nr_extents; i++) {
- if (ni != extent_nis[i])
- continue;
- extent_nis += i;
- base_ni->nr_extents--;
- memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) *
- sizeof(ntfs_inode*));
- err = 0;
- break;
- }
-
- mutex_unlock(&base_ni->extent_lock);
-
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to "
- "its base inode 0x%lx.", mft_no,
- base_ni->mft_no);
- BUG();
- }
-
- /*
- * The extent inode is no longer attached to the base inode so no one
- * can get a reference to it any more.
- */
-
- /* Mark the mft record as not in use. */
- m->flags &= ~MFT_RECORD_IN_USE;
-
- /* Increment the sequence number, skipping zero, if it is not zero. */
- old_seq_no = m->sequence_number;
- seq_no = le16_to_cpu(old_seq_no);
- if (seq_no == 0xffff)
- seq_no = 1;
- else if (seq_no)
- seq_no++;
- m->sequence_number = cpu_to_le16(seq_no);
-
- /*
- * Set the ntfs inode dirty and write it out. We do not need to worry
- * about the base inode here since whatever caused the extent mft
- * record to be freed is guaranteed to do it already.
- */
- NInoSetDirty(ni);
- err = write_mft_record(ni, m, 0);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not "
- "freeing.", mft_no);
- goto rollback;
- }
-rollback_error:
- /* Unmap and throw away the now freed extent inode. */
- unmap_extent_mft_record(ni);
- ntfs_clear_extent_inode(ni);
-
- /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */
- down_write(&vol->mftbmp_lock);
- err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
- up_write(&vol->mftbmp_lock);
- if (unlikely(err)) {
- /*
- * The extent inode is gone but we failed to deallocate it in
- * the mft bitmap. Just emit a warning and leave the volume
- * dirty on umount.
- */
- ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
- NVolSetErrors(vol);
- }
- return 0;
-rollback:
- /* Rollback what we did... */
- mutex_lock(&base_ni->extent_lock);
- extent_nis = base_ni->ext.extent_ntfs_inos;
- if (!(base_ni->nr_extents & 3)) {
- int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*);
-
- extent_nis = kmalloc(new_size, GFP_NOFS);
- if (unlikely(!extent_nis)) {
- ntfs_error(vol->sb, "Failed to allocate internal "
- "buffer during rollback.%s", es);
- mutex_unlock(&base_ni->extent_lock);
- NVolSetErrors(vol);
- goto rollback_error;
- }
- if (base_ni->nr_extents) {
- BUG_ON(!base_ni->ext.extent_ntfs_inos);
- memcpy(extent_nis, base_ni->ext.extent_ntfs_inos,
- new_size - 4 * sizeof(ntfs_inode*));
- kfree(base_ni->ext.extent_ntfs_inos);
- }
- base_ni->ext.extent_ntfs_inos = extent_nis;
- }
- m->flags |= MFT_RECORD_IN_USE;
- m->sequence_number = old_seq_no;
- extent_nis[base_ni->nr_extents++] = ni;
- mutex_unlock(&base_ni->extent_lock);
- mark_mft_record_dirty(ni);
- return err;
-}
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h
deleted file mode 100644
index 49c001af16ed..000000000000
--- a/fs/ntfs/mft.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * mft.h - Defines for mft record handling in NTFS Linux kernel driver.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_MFT_H
-#define _LINUX_NTFS_MFT_H
-
-#include <linux/fs.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-
-#include "inode.h"
-
-extern MFT_RECORD *map_mft_record(ntfs_inode *ni);
-extern void unmap_mft_record(ntfs_inode *ni);
-
-extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
- ntfs_inode **ntfs_ino);
-
-static inline void unmap_extent_mft_record(ntfs_inode *ni)
-{
- unmap_mft_record(ni);
- return;
-}
-
-#ifdef NTFS_RW
-
-/**
- * flush_dcache_mft_record_page - flush_dcache_page() for mft records
- * @ni: ntfs inode structure of mft record
- *
- * Call flush_dcache_page() for the page in which an mft record resides.
- *
- * This must be called every time an mft record is modified, just after the
- * modification.
- */
-static inline void flush_dcache_mft_record_page(ntfs_inode *ni)
-{
- flush_dcache_page(ni->page);
-}
-
-extern void __mark_mft_record_dirty(ntfs_inode *ni);
-
-/**
- * mark_mft_record_dirty - set the mft record and the page containing it dirty
- * @ni: ntfs inode describing the mapped mft record
- *
- * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
- * as well as the page containing the mft record, dirty. Also, mark the base
- * vfs inode dirty. This ensures that any changes to the mft record are
- * written out to disk.
- *
- * NOTE: Do not do anything if the mft record is already marked dirty.
- */
-static inline void mark_mft_record_dirty(ntfs_inode *ni)
-{
- if (!NInoTestSetDirty(ni))
- __mark_mft_record_dirty(ni);
-}
-
-extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
- MFT_RECORD *m, int sync);
-
-extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync);
-
-/**
- * write_mft_record - write out a mapped (extent) mft record
- * @ni: ntfs inode describing the mapped (extent) mft record
- * @m: mapped (extent) mft record to write
- * @sync: if true, wait for i/o completion
- *
- * This is just a wrapper for write_mft_record_nolock() (see mft.c), which
- * locks the page for the duration of the write. This ensures that there are
- * no race conditions between writing the mft record via the dirty inode code
- * paths and via the page cache write back code paths or between writing
- * neighbouring mft records residing in the same page.
- *
- * Locking the page also serializes us against ->read_folio() if the page is not
- * uptodate.
- *
- * On success, clean the mft record and return 0. On error, leave the mft
- * record dirty and return -errno.
- */
-static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync)
-{
- struct page *page = ni->page;
- int err;
-
- BUG_ON(!page);
- lock_page(page);
- err = write_mft_record_nolock(ni, m, sync);
- unlock_page(page);
- return err;
-}
-
-extern bool ntfs_may_write_mft_record(ntfs_volume *vol,
- const unsigned long mft_no, const MFT_RECORD *m,
- ntfs_inode **locked_ni);
-
-extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
- ntfs_inode *base_ni, MFT_RECORD **mrec);
-extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_MFT_H */
diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c
deleted file mode 100644
index 16b3c884abfc..000000000000
--- a/fs/ntfs/mst.c
+++ /dev/null
@@ -1,189 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * mst.c - NTFS multi sector transfer protection handling code. Part of the
- * Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#include "ntfs.h"
-
-/**
- * post_read_mst_fixup - deprotect multi sector transfer protected data
- * @b: pointer to the data to deprotect
- * @size: size in bytes of @b
- *
- * Perform the necessary post read multi sector transfer fixup and detect the
- * presence of incomplete multi sector transfers. - In that case, overwrite the
- * magic of the ntfs record header being processed with "BAAD" (in memory only!)
- * and abort processing.
- *
- * Return 0 on success and -EINVAL on error ("BAAD" magic will be present).
- *
- * NOTE: We consider the absence / invalidity of an update sequence array to
- * mean that the structure is not protected at all and hence doesn't need to
- * be fixed up. Thus, we return success and not failure in this case. This is
- * in contrast to pre_write_mst_fixup(), see below.
- */
-int post_read_mst_fixup(NTFS_RECORD *b, const u32 size)
-{
- u16 usa_ofs, usa_count, usn;
- u16 *usa_pos, *data_pos;
-
- /* Setup the variables. */
- usa_ofs = le16_to_cpu(b->usa_ofs);
- /* Decrement usa_count to get number of fixups. */
- usa_count = le16_to_cpu(b->usa_count) - 1;
- /* Size and alignment checks. */
- if ( size & (NTFS_BLOCK_SIZE - 1) ||
- usa_ofs & 1 ||
- usa_ofs + (usa_count * 2) > size ||
- (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
- return 0;
- /* Position of usn in update sequence array. */
- usa_pos = (u16*)b + usa_ofs/sizeof(u16);
- /*
- * The update sequence number which has to be equal to each of the
- * u16 values before they are fixed up. Note no need to care for
- * endianness since we are comparing and moving data for on disk
- * structures which means the data is consistent. - If it is
- * consistenty the wrong endianness it doesn't make any difference.
- */
- usn = *usa_pos;
- /*
- * Position in protected data of first u16 that needs fixing up.
- */
- data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
- /*
- * Check for incomplete multi sector transfer(s).
- */
- while (usa_count--) {
- if (*data_pos != usn) {
- /*
- * Incomplete multi sector transfer detected! )-:
- * Set the magic to "BAAD" and return failure.
- * Note that magic_BAAD is already converted to le32.
- */
- b->magic = magic_BAAD;
- return -EINVAL;
- }
- data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
- }
- /* Re-setup the variables. */
- usa_count = le16_to_cpu(b->usa_count) - 1;
- data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
- /* Fixup all sectors. */
- while (usa_count--) {
- /*
- * Increment position in usa and restore original data from
- * the usa into the data buffer.
- */
- *data_pos = *(++usa_pos);
- /* Increment position in data as well. */
- data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
- }
- return 0;
-}
-
-/**
- * pre_write_mst_fixup - apply multi sector transfer protection
- * @b: pointer to the data to protect
- * @size: size in bytes of @b
- *
- * Perform the necessary pre write multi sector transfer fixup on the data
- * pointer to by @b of @size.
- *
- * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed
- * (assumed not needed). This is in contrast to post_read_mst_fixup() above.
- *
- * NOTE: We consider the absence / invalidity of an update sequence array to
- * mean that the structure is not subject to protection and hence doesn't need
- * to be fixed up. This means that you have to create a valid update sequence
- * array header in the ntfs record before calling this function, otherwise it
- * will fail (the header needs to contain the position of the update sequence
- * array together with the number of elements in the array). You also need to
- * initialise the update sequence number before calling this function
- * otherwise a random word will be used (whatever was in the record at that
- * position at that time).
- */
-int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size)
-{
- le16 *usa_pos, *data_pos;
- u16 usa_ofs, usa_count, usn;
- le16 le_usn;
-
- /* Sanity check + only fixup if it makes sense. */
- if (!b || ntfs_is_baad_record(b->magic) ||
- ntfs_is_hole_record(b->magic))
- return -EINVAL;
- /* Setup the variables. */
- usa_ofs = le16_to_cpu(b->usa_ofs);
- /* Decrement usa_count to get number of fixups. */
- usa_count = le16_to_cpu(b->usa_count) - 1;
- /* Size and alignment checks. */
- if ( size & (NTFS_BLOCK_SIZE - 1) ||
- usa_ofs & 1 ||
- usa_ofs + (usa_count * 2) > size ||
- (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
- return -EINVAL;
- /* Position of usn in update sequence array. */
- usa_pos = (le16*)((u8*)b + usa_ofs);
- /*
- * Cyclically increment the update sequence number
- * (skipping 0 and -1, i.e. 0xffff).
- */
- usn = le16_to_cpup(usa_pos) + 1;
- if (usn == 0xffff || !usn)
- usn = 1;
- le_usn = cpu_to_le16(usn);
- *usa_pos = le_usn;
- /* Position in data of first u16 that needs fixing up. */
- data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
- /* Fixup all sectors. */
- while (usa_count--) {
- /*
- * Increment the position in the usa and save the
- * original data from the data buffer into the usa.
- */
- *(++usa_pos) = *data_pos;
- /* Apply fixup to data. */
- *data_pos = le_usn;
- /* Increment position in data as well. */
- data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
- }
- return 0;
-}
-
-/**
- * post_write_mst_fixup - fast deprotect multi sector transfer protected data
- * @b: pointer to the data to deprotect
- *
- * Perform the necessary post write multi sector transfer fixup, not checking
- * for any errors, because we assume we have just used pre_write_mst_fixup(),
- * thus the data will be fine or we would never have gotten here.
- */
-void post_write_mst_fixup(NTFS_RECORD *b)
-{
- le16 *usa_pos, *data_pos;
-
- u16 usa_ofs = le16_to_cpu(b->usa_ofs);
- u16 usa_count = le16_to_cpu(b->usa_count) - 1;
-
- /* Position of usn in update sequence array. */
- usa_pos = (le16*)b + usa_ofs/sizeof(le16);
-
- /* Position in protected data of first u16 that needs fixing up. */
- data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
-
- /* Fixup all sectors. */
- while (usa_count--) {
- /*
- * Increment position in usa and restore original data from
- * the usa into the data buffer.
- */
- *data_pos = *(++usa_pos);
-
- /* Increment position in data as well. */
- data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
- }
-}
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
deleted file mode 100644
index d7498ddc4a72..000000000000
--- a/fs/ntfs/namei.c
+++ /dev/null
@@ -1,392 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS
- * project.
- *
- * Copyright (c) 2001-2006 Anton Altaparmakov
- */
-
-#include <linux/dcache.h>
-#include <linux/exportfs.h>
-#include <linux/security.h>
-#include <linux/slab.h>
-
-#include "attrib.h"
-#include "debug.h"
-#include "dir.h"
-#include "mft.h"
-#include "ntfs.h"
-
-/**
- * ntfs_lookup - find the inode represented by a dentry in a directory inode
- * @dir_ino: directory inode in which to look for the inode
- * @dent: dentry representing the inode to look for
- * @flags: lookup flags
- *
- * In short, ntfs_lookup() looks for the inode represented by the dentry @dent
- * in the directory inode @dir_ino and if found attaches the inode to the
- * dentry @dent.
- *
- * In more detail, the dentry @dent specifies which inode to look for by
- * supplying the name of the inode in @dent->d_name.name. ntfs_lookup()
- * converts the name to Unicode and walks the contents of the directory inode
- * @dir_ino looking for the converted Unicode name. If the name is found in the
- * directory, the corresponding inode is loaded by calling ntfs_iget() on its
- * inode number and the inode is associated with the dentry @dent via a call to
- * d_splice_alias().
- *
- * If the name is not found in the directory, a NULL inode is inserted into the
- * dentry @dent via a call to d_add(). The dentry is then termed a negative
- * dentry.
- *
- * Only if an actual error occurs, do we return an error via ERR_PTR().
- *
- * In order to handle the case insensitivity issues of NTFS with regards to the
- * dcache and the dcache requiring only one dentry per directory, we deal with
- * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining
- * a case sensitive dcache. This means that we get the full benefit of dcache
- * speed when the file/directory is looked up with the same case as returned by
- * ->ntfs_readdir() but that a lookup for any other case (or for the short file
- * name) will not find anything in dcache and will enter ->ntfs_lookup()
- * instead, where we search the directory for a fully matching file name
- * (including case) and if that is not found, we search for a file name that
- * matches with different case and if that has non-POSIX semantics we return
- * that. We actually do only one search (case sensitive) and keep tabs on
- * whether we have found a case insensitive match in the process.
- *
- * To simplify matters for us, we do not treat the short vs long filenames as
- * two hard links but instead if the lookup matches a short filename, we
- * return the dentry for the corresponding long filename instead.
- *
- * There are three cases we need to distinguish here:
- *
- * 1) @dent perfectly matches (i.e. including case) a directory entry with a
- * file name in the WIN32 or POSIX namespaces. In this case
- * ntfs_lookup_inode_by_name() will return with name set to NULL and we
- * just d_splice_alias() @dent.
- * 2) @dent matches (not including case) a directory entry with a file name in
- * the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return
- * with name set to point to a kmalloc()ed ntfs_name structure containing
- * the properly cased little endian Unicode name. We convert the name to the
- * current NLS code page, search if a dentry with this name already exists
- * and if so return that instead of @dent. At this point things are
- * complicated by the possibility of 'disconnected' dentries due to NFS
- * which we deal with appropriately (see the code comments). The VFS will
- * then destroy the old @dent and use the one we returned. If a dentry is
- * not found, we allocate a new one, d_splice_alias() it, and return it as
- * above.
- * 3) @dent matches either perfectly or not (i.e. we don't care about case) a
- * directory entry with a file name in the DOS namespace. In this case
- * ntfs_lookup_inode_by_name() will return with name set to point to a
- * kmalloc()ed ntfs_name structure containing the mft reference (cpu endian)
- * of the inode. We use the mft reference to read the inode and to find the
- * file name in the WIN32 namespace corresponding to the matched short file
- * name. We then convert the name to the current NLS code page, and proceed
- * searching for a dentry with this name, etc, as in case 2), above.
- *
- * Locking: Caller must hold i_mutex on the directory.
- */
-static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
- unsigned int flags)
-{
- ntfs_volume *vol = NTFS_SB(dir_ino->i_sb);
- struct inode *dent_inode;
- ntfschar *uname;
- ntfs_name *name = NULL;
- MFT_REF mref;
- unsigned long dent_ino;
- int uname_len;
-
- ntfs_debug("Looking up %pd in directory inode 0x%lx.",
- dent, dir_ino->i_ino);
- /* Convert the name of the dentry to Unicode. */
- uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len,
- &uname);
- if (uname_len < 0) {
- if (uname_len != -ENAMETOOLONG)
- ntfs_error(vol->sb, "Failed to convert name to "
- "Unicode.");
- return ERR_PTR(uname_len);
- }
- mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len,
- &name);
- kmem_cache_free(ntfs_name_cache, uname);
- if (!IS_ERR_MREF(mref)) {
- dent_ino = MREF(mref);
- ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino);
- dent_inode = ntfs_iget(vol->sb, dent_ino);
- if (!IS_ERR(dent_inode)) {
- /* Consistency check. */
- if (is_bad_inode(dent_inode) || MSEQNO(mref) ==
- NTFS_I(dent_inode)->seq_no ||
- dent_ino == FILE_MFT) {
- /* Perfect WIN32/POSIX match. -- Case 1. */
- if (!name) {
- ntfs_debug("Done. (Case 1.)");
- return d_splice_alias(dent_inode, dent);
- }
- /*
- * We are too indented. Handle imperfect
- * matches and short file names further below.
- */
- goto handle_name;
- }
- ntfs_error(vol->sb, "Found stale reference to inode "
- "0x%lx (reference sequence number = "
- "0x%x, inode sequence number = 0x%x), "
- "returning -EIO. Run chkdsk.",
- dent_ino, MSEQNO(mref),
- NTFS_I(dent_inode)->seq_no);
- iput(dent_inode);
- dent_inode = ERR_PTR(-EIO);
- } else
- ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with "
- "error code %li.", dent_ino,
- PTR_ERR(dent_inode));
- kfree(name);
- /* Return the error code. */
- return ERR_CAST(dent_inode);
- }
- /* It is guaranteed that @name is no longer allocated at this point. */
- if (MREF_ERR(mref) == -ENOENT) {
- ntfs_debug("Entry was not found, adding negative dentry.");
- /* The dcache will handle negative entries. */
- d_add(dent, NULL);
- ntfs_debug("Done.");
- return NULL;
- }
- ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error "
- "code %i.", -MREF_ERR(mref));
- return ERR_PTR(MREF_ERR(mref));
- // TODO: Consider moving this lot to a separate function! (AIA)
-handle_name:
- {
- MFT_RECORD *m;
- ntfs_attr_search_ctx *ctx;
- ntfs_inode *ni = NTFS_I(dent_inode);
- int err;
- struct qstr nls_name;
-
- nls_name.name = NULL;
- if (name->type != FILE_NAME_DOS) { /* Case 2. */
- ntfs_debug("Case 2.");
- nls_name.len = (unsigned)ntfs_ucstonls(vol,
- (ntfschar*)&name->name, name->len,
- (unsigned char**)&nls_name.name, 0);
- kfree(name);
- } else /* if (name->type == FILE_NAME_DOS) */ { /* Case 3. */
- FILE_NAME_ATTR *fn;
-
- ntfs_debug("Case 3.");
- kfree(name);
-
- /* Find the WIN32 name corresponding to the matched DOS name. */
- ni = NTFS_I(dent_inode);
- m = map_mft_record(ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- ctx = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- do {
- ATTR_RECORD *a;
- u32 val_len;
-
- err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0,
- NULL, 0, ctx);
- if (unlikely(err)) {
- ntfs_error(vol->sb, "Inode corrupt: No WIN32 "
- "namespace counterpart to DOS "
- "file name. Run chkdsk.");
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- /* Consistency checks. */
- a = ctx->attr;
- if (a->non_resident || a->flags)
- goto eio_err_out;
- val_len = le32_to_cpu(a->data.resident.value_length);
- if (le16_to_cpu(a->data.resident.value_offset) +
- val_len > le32_to_cpu(a->length))
- goto eio_err_out;
- fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu(
- ctx->attr->data.resident.value_offset));
- if ((u32)(fn->file_name_length * sizeof(ntfschar) +
- sizeof(FILE_NAME_ATTR)) > val_len)
- goto eio_err_out;
- } while (fn->file_name_type != FILE_NAME_WIN32);
-
- /* Convert the found WIN32 name to current NLS code page. */
- nls_name.len = (unsigned)ntfs_ucstonls(vol,
- (ntfschar*)&fn->file_name, fn->file_name_length,
- (unsigned char**)&nls_name.name, 0);
-
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
- }
- m = NULL;
- ctx = NULL;
-
- /* Check if a conversion error occurred. */
- if ((signed)nls_name.len < 0) {
- err = (signed)nls_name.len;
- goto err_out;
- }
- nls_name.hash = full_name_hash(dent, nls_name.name, nls_name.len);
-
- dent = d_add_ci(dent, dent_inode, &nls_name);
- kfree(nls_name.name);
- return dent;
-
-eio_err_out:
- ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk.");
- err = -EIO;
-err_out:
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(ni);
- iput(dent_inode);
- ntfs_error(vol->sb, "Failed, returning error code %i.", err);
- return ERR_PTR(err);
- }
-}
-
-/*
- * Inode operations for directories.
- */
-const struct inode_operations ntfs_dir_inode_ops = {
- .lookup = ntfs_lookup, /* VFS: Lookup directory. */
-};
-
-/**
- * ntfs_get_parent - find the dentry of the parent of a given directory dentry
- * @child_dent: dentry of the directory whose parent directory to find
- *
- * Find the dentry for the parent directory of the directory specified by the
- * dentry @child_dent. This function is called from
- * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the
- * default ->decode_fh() which is export_decode_fh() in the same file.
- *
- * The code is based on the ext3 ->get_parent() implementation found in
- * fs/ext3/namei.c::ext3_get_parent().
- *
- * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down.
- *
- * Return the dentry of the parent directory on success or the error code on
- * error (IS_ERR() is true).
- */
-static struct dentry *ntfs_get_parent(struct dentry *child_dent)
-{
- struct inode *vi = d_inode(child_dent);
- ntfs_inode *ni = NTFS_I(vi);
- MFT_RECORD *mrec;
- ntfs_attr_search_ctx *ctx;
- ATTR_RECORD *attr;
- FILE_NAME_ATTR *fn;
- unsigned long parent_ino;
- int err;
-
- ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
- /* Get the mft record of the inode belonging to the child dentry. */
- mrec = map_mft_record(ni);
- if (IS_ERR(mrec))
- return ERR_CAST(mrec);
- /* Find the first file name attribute in the mft record. */
- ctx = ntfs_attr_get_search_ctx(ni, mrec);
- if (unlikely(!ctx)) {
- unmap_mft_record(ni);
- return ERR_PTR(-ENOMEM);
- }
-try_next:
- err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL,
- 0, ctx);
- if (unlikely(err)) {
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
- if (err == -ENOENT)
- ntfs_error(vi->i_sb, "Inode 0x%lx does not have a "
- "file name attribute. Run chkdsk.",
- vi->i_ino);
- return ERR_PTR(err);
- }
- attr = ctx->attr;
- if (unlikely(attr->non_resident))
- goto try_next;
- fn = (FILE_NAME_ATTR *)((u8 *)attr +
- le16_to_cpu(attr->data.resident.value_offset));
- if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) >
- (u8*)attr + le32_to_cpu(attr->length)))
- goto try_next;
- /* Get the inode number of the parent directory. */
- parent_ino = MREF_LE(fn->parent_directory);
- /* Release the search context and the mft record of the child. */
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
-
- return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino));
-}
-
-static struct inode *ntfs_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
-{
- struct inode *inode;
-
- inode = ntfs_iget(sb, ino);
- if (!IS_ERR(inode)) {
- if (is_bad_inode(inode) || inode->i_generation != generation) {
- iput(inode);
- inode = ERR_PTR(-ESTALE);
- }
- }
-
- return inode;
-}
-
-static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
-{
- return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
- ntfs_nfs_get_inode);
-}
-
-static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
-{
- return generic_fh_to_parent(sb, fid, fh_len, fh_type,
- ntfs_nfs_get_inode);
-}
-
-/*
- * Export operations allowing NFS exporting of mounted NTFS partitions.
- *
- * We use the default ->encode_fh() for now. Note that they
- * use 32 bits to store the inode number which is an unsigned long so on 64-bit
- * architectures is usually 64 bits so it would all fail horribly on huge
- * volumes. I guess we need to define our own encode and decode fh functions
- * that store 64-bit inode numbers at some point but for now we will ignore the
- * problem...
- *
- * We also use the default ->get_name() helper (used by ->decode_fh() via
- * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs
- * independent.
- *
- * The default ->get_parent() just returns -EACCES so we have to provide our
- * own and the default ->get_dentry() is incompatible with NTFS due to not
- * allowing the inode number 0 which is used in NTFS for the system file $MFT
- * and due to using iget() whereas NTFS needs ntfs_iget().
- */
-const struct export_operations ntfs_export_ops = {
- .encode_fh = generic_encode_ino32_fh,
- .get_parent = ntfs_get_parent, /* Find the parent of a given
- directory. */
- .fh_to_dentry = ntfs_fh_to_dentry,
- .fh_to_parent = ntfs_fh_to_parent,
-};
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
deleted file mode 100644
index e81376ea9152..000000000000
--- a/fs/ntfs/ntfs.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * ntfs.h - Defines for NTFS Linux kernel driver.
- *
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
- * Copyright (C) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_H
-#define _LINUX_NTFS_H
-
-#include <linux/stddef.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
-#include <linux/nls.h>
-#include <linux/smp.h>
-#include <linux/pagemap.h>
-
-#include "types.h"
-#include "volume.h"
-#include "layout.h"
-
-typedef enum {
- NTFS_BLOCK_SIZE = 512,
- NTFS_BLOCK_SIZE_BITS = 9,
- NTFS_SB_MAGIC = 0x5346544e, /* 'NTFS' */
- NTFS_MAX_NAME_LEN = 255,
- NTFS_MAX_ATTR_NAME_LEN = 255,
- NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */
- NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE,
-} NTFS_CONSTANTS;
-
-/* Global variables. */
-
-/* Slab caches (from super.c). */
-extern struct kmem_cache *ntfs_name_cache;
-extern struct kmem_cache *ntfs_inode_cache;
-extern struct kmem_cache *ntfs_big_inode_cache;
-extern struct kmem_cache *ntfs_attr_ctx_cache;
-extern struct kmem_cache *ntfs_index_ctx_cache;
-
-/* The various operations structs defined throughout the driver files. */
-extern const struct address_space_operations ntfs_normal_aops;
-extern const struct address_space_operations ntfs_compressed_aops;
-extern const struct address_space_operations ntfs_mst_aops;
-
-extern const struct file_operations ntfs_file_ops;
-extern const struct inode_operations ntfs_file_inode_ops;
-
-extern const struct file_operations ntfs_dir_ops;
-extern const struct inode_operations ntfs_dir_inode_ops;
-
-extern const struct file_operations ntfs_empty_file_ops;
-extern const struct inode_operations ntfs_empty_inode_ops;
-
-extern const struct export_operations ntfs_export_ops;
-
-/**
- * NTFS_SB - return the ntfs volume given a vfs super block
- * @sb: VFS super block
- *
- * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb.
- */
-static inline ntfs_volume *NTFS_SB(struct super_block *sb)
-{
- return sb->s_fs_info;
-}
-
-/* Declarations of functions and global variables. */
-
-/* From fs/ntfs/compress.c */
-extern int ntfs_read_compressed_block(struct page *page);
-extern int allocate_compression_buffers(void);
-extern void free_compression_buffers(void);
-
-/* From fs/ntfs/super.c */
-#define default_upcase_len 0x10000
-extern struct mutex ntfs_lock;
-
-typedef struct {
- int val;
- char *str;
-} option_t;
-extern const option_t on_errors_arr[];
-
-/* From fs/ntfs/mst.c */
-extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size);
-extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size);
-extern void post_write_mst_fixup(NTFS_RECORD *b);
-
-/* From fs/ntfs/unistr.c */
-extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
- const ntfschar *s2, size_t s2_len,
- const IGNORE_CASE_BOOL ic,
- const ntfschar *upcase, const u32 upcase_size);
-extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
- const ntfschar *name2, const u32 name2_len,
- const int err_val, const IGNORE_CASE_BOOL ic,
- const ntfschar *upcase, const u32 upcase_len);
-extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n);
-extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
- const ntfschar *upcase, const u32 upcase_size);
-extern void ntfs_upcase_name(ntfschar *name, u32 name_len,
- const ntfschar *upcase, const u32 upcase_len);
-extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
- const ntfschar *upcase, const u32 upcase_len);
-extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
- FILE_NAME_ATTR *file_name_attr2,
- const int err_val, const IGNORE_CASE_BOOL ic,
- const ntfschar *upcase, const u32 upcase_len);
-extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
- const int ins_len, ntfschar **outs);
-extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
- const int ins_len, unsigned char **outs, int outs_len);
-
-/* From fs/ntfs/upcase.c */
-extern ntfschar *generate_default_upcase(void);
-
-static inline int ntfs_ffs(int x)
-{
- int r = 1;
-
- if (!x)
- return 0;
- if (!(x & 0xffff)) {
- x >>= 16;
- r += 16;
- }
- if (!(x & 0xff)) {
- x >>= 8;
- r += 8;
- }
- if (!(x & 0xf)) {
- x >>= 4;
- r += 4;
- }
- if (!(x & 3)) {
- x >>= 2;
- r += 2;
- }
- if (!(x & 1)) {
- x >>= 1;
- r += 1;
- }
- return r;
-}
-
-#endif /* _LINUX_NTFS_H */
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
deleted file mode 100644
index 9160480222fd..000000000000
--- a/fs/ntfs/quota.c
+++ /dev/null
@@ -1,103 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * quota.c - NTFS kernel quota ($Quota) handling. Part of the Linux-NTFS
- * project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include "index.h"
-#include "quota.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/**
- * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume
- * @vol: ntfs volume on which to mark the quotas out of date
- *
- * Mark the quotas out of date on the ntfs volume @vol and return 'true' on
- * success and 'false' on error.
- */
-bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
-{
- ntfs_index_context *ictx;
- QUOTA_CONTROL_ENTRY *qce;
- const le32 qid = QUOTA_DEFAULTS_ID;
- int err;
-
- ntfs_debug("Entering.");
- if (NVolQuotaOutOfDate(vol))
- goto done;
- if (!vol->quota_ino || !vol->quota_q_ino) {
- ntfs_error(vol->sb, "Quota inodes are not open.");
- return false;
- }
- inode_lock(vol->quota_q_ino);
- ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
- if (!ictx) {
- ntfs_error(vol->sb, "Failed to get index context.");
- goto err_out;
- }
- err = ntfs_index_lookup(&qid, sizeof(qid), ictx);
- if (err) {
- if (err == -ENOENT)
- ntfs_error(vol->sb, "Quota defaults entry is not "
- "present.");
- else
- ntfs_error(vol->sb, "Lookup of quota defaults entry "
- "failed.");
- goto err_out;
- }
- if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) {
- ntfs_error(vol->sb, "Quota defaults entry size is invalid. "
- "Run chkdsk.");
- goto err_out;
- }
- qce = (QUOTA_CONTROL_ENTRY*)ictx->data;
- if (le32_to_cpu(qce->version) != QUOTA_VERSION) {
- ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not "
- "supported.", le32_to_cpu(qce->version));
- goto err_out;
- }
- ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags));
- /* If quotas are already marked out of date, no need to do anything. */
- if (qce->flags & QUOTA_FLAG_OUT_OF_DATE)
- goto set_done;
- /*
- * If quota tracking is neither requested, nor enabled and there are no
- * pending deletes, no need to mark the quotas out of date.
- */
- if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED |
- QUOTA_FLAG_TRACKING_REQUESTED |
- QUOTA_FLAG_PENDING_DELETES)))
- goto set_done;
- /*
- * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date.
- * This is verified on WinXP to be sufficient to cause windows to
- * rescan the volume on boot and update all quota entries.
- */
- qce->flags |= QUOTA_FLAG_OUT_OF_DATE;
- /* Ensure the modified flags are written to disk. */
- ntfs_index_entry_flush_dcache_page(ictx);
- ntfs_index_entry_mark_dirty(ictx);
-set_done:
- ntfs_index_ctx_put(ictx);
- inode_unlock(vol->quota_q_ino);
- /*
- * We set the flag so we do not try to mark the quotas out of date
- * again on remount.
- */
- NVolSetQuotaOutOfDate(vol);
-done:
- ntfs_debug("Done.");
- return true;
-err_out:
- if (ictx)
- ntfs_index_ctx_put(ictx);
- inode_unlock(vol->quota_q_ino);
- return false;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h
deleted file mode 100644
index fe3132a3d6d2..000000000000
--- a/fs/ntfs/quota.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * quota.h - Defines for NTFS kernel quota ($Quota) handling. Part of the
- * Linux-NTFS project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_QUOTA_H
-#define _LINUX_NTFS_QUOTA_H
-
-#ifdef NTFS_RW
-
-#include "types.h"
-#include "volume.h"
-
-extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_QUOTA_H */
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
deleted file mode 100644
index 0d448e9881f7..000000000000
--- a/fs/ntfs/runlist.c
+++ /dev/null
@@ -1,1893 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2007 Anton Altaparmakov
- * Copyright (c) 2002-2005 Richard Russon
- */
-
-#include "debug.h"
-#include "dir.h"
-#include "endian.h"
-#include "malloc.h"
-#include "ntfs.h"
-
-/**
- * ntfs_rl_mm - runlist memmove
- *
- * It is up to the caller to serialize access to the runlist @base.
- */
-static inline void ntfs_rl_mm(runlist_element *base, int dst, int src,
- int size)
-{
- if (likely((dst != src) && (size > 0)))
- memmove(base + dst, base + src, size * sizeof(*base));
-}
-
-/**
- * ntfs_rl_mc - runlist memory copy
- *
- * It is up to the caller to serialize access to the runlists @dstbase and
- * @srcbase.
- */
-static inline void ntfs_rl_mc(runlist_element *dstbase, int dst,
- runlist_element *srcbase, int src, int size)
-{
- if (likely(size > 0))
- memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase));
-}
-
-/**
- * ntfs_rl_realloc - Reallocate memory for runlists
- * @rl: original runlist
- * @old_size: number of runlist elements in the original runlist @rl
- * @new_size: number of runlist elements we need space for
- *
- * As the runlists grow, more memory will be required. To prevent the
- * kernel having to allocate and reallocate large numbers of small bits of
- * memory, this function returns an entire page of memory.
- *
- * It is up to the caller to serialize access to the runlist @rl.
- *
- * N.B. If the new allocation doesn't require a different number of pages in
- * memory, the function will return the original pointer.
- *
- * On success, return a pointer to the newly allocated, or recycled, memory.
- * On error, return -errno. The following error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
- int old_size, int new_size)
-{
- runlist_element *new_rl;
-
- old_size = PAGE_ALIGN(old_size * sizeof(*rl));
- new_size = PAGE_ALIGN(new_size * sizeof(*rl));
- if (old_size == new_size)
- return rl;
-
- new_rl = ntfs_malloc_nofs(new_size);
- if (unlikely(!new_rl))
- return ERR_PTR(-ENOMEM);
-
- if (likely(rl != NULL)) {
- if (unlikely(old_size > new_size))
- old_size = new_size;
- memcpy(new_rl, rl, old_size);
- ntfs_free(rl);
- }
- return new_rl;
-}
-
-/**
- * ntfs_rl_realloc_nofail - Reallocate memory for runlists
- * @rl: original runlist
- * @old_size: number of runlist elements in the original runlist @rl
- * @new_size: number of runlist elements we need space for
- *
- * As the runlists grow, more memory will be required. To prevent the
- * kernel having to allocate and reallocate large numbers of small bits of
- * memory, this function returns an entire page of memory.
- *
- * This function guarantees that the allocation will succeed. It will sleep
- * for as long as it takes to complete the allocation.
- *
- * It is up to the caller to serialize access to the runlist @rl.
- *
- * N.B. If the new allocation doesn't require a different number of pages in
- * memory, the function will return the original pointer.
- *
- * On success, return a pointer to the newly allocated, or recycled, memory.
- * On error, return -errno. The following error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl,
- int old_size, int new_size)
-{
- runlist_element *new_rl;
-
- old_size = PAGE_ALIGN(old_size * sizeof(*rl));
- new_size = PAGE_ALIGN(new_size * sizeof(*rl));
- if (old_size == new_size)
- return rl;
-
- new_rl = ntfs_malloc_nofs_nofail(new_size);
- BUG_ON(!new_rl);
-
- if (likely(rl != NULL)) {
- if (unlikely(old_size > new_size))
- old_size = new_size;
- memcpy(new_rl, rl, old_size);
- ntfs_free(rl);
- }
- return new_rl;
-}
-
-/**
- * ntfs_are_rl_mergeable - test if two runlists can be joined together
- * @dst: original runlist
- * @src: new runlist to test for mergeability with @dst
- *
- * Test if two runlists can be joined together. For this, their VCNs and LCNs
- * must be adjacent.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * Return: true Success, the runlists can be merged.
- * false Failure, the runlists cannot be merged.
- */
-static inline bool ntfs_are_rl_mergeable(runlist_element *dst,
- runlist_element *src)
-{
- BUG_ON(!dst);
- BUG_ON(!src);
-
- /* We can merge unmapped regions even if they are misaligned. */
- if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED))
- return true;
- /* If the runs are misaligned, we cannot merge them. */
- if ((dst->vcn + dst->length) != src->vcn)
- return false;
- /* If both runs are non-sparse and contiguous, we can merge them. */
- if ((dst->lcn >= 0) && (src->lcn >= 0) &&
- ((dst->lcn + dst->length) == src->lcn))
- return true;
- /* If we are merging two holes, we can merge them. */
- if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE))
- return true;
- /* Cannot merge. */
- return false;
-}
-
-/**
- * __ntfs_rl_merge - merge two runlists without testing if they can be merged
- * @dst: original, destination runlist
- * @src: new runlist to merge with @dst
- *
- * Merge the two runlists, writing into the destination runlist @dst. The
- * caller must make sure the runlists can be merged or this will corrupt the
- * destination runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- */
-static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src)
-{
- dst->length += src->length;
-}
-
-/**
- * ntfs_rl_append - append a runlist after a given element
- * @dst: original runlist to be worked on
- * @dsize: number of elements in @dst (including end marker)
- * @src: runlist to be inserted into @dst
- * @ssize: number of elements in @src (excluding end marker)
- * @loc: append the new runlist @src after this element in @dst
- *
- * Append the runlist @src after element @loc in @dst. Merge the right end of
- * the new runlist, if necessary. Adjust the size of the hole before the
- * appended runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_append(runlist_element *dst,
- int dsize, runlist_element *src, int ssize, int loc)
-{
- bool right = false; /* Right end of @src needs merging. */
- int marker; /* End of the inserted runs. */
-
- BUG_ON(!dst);
- BUG_ON(!src);
-
- /* First, check if the right hand end needs merging. */
- if ((loc + 1) < dsize)
- right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
-
- /* Space required: @dst size + @src size, less one if we merged. */
- dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right);
- if (IS_ERR(dst))
- return dst;
- /*
- * We are guaranteed to succeed from here so can start modifying the
- * original runlists.
- */
-
- /* First, merge the right hand end, if necessary. */
- if (right)
- __ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
-
- /* First run after the @src runs that have been inserted. */
- marker = loc + ssize + 1;
-
- /* Move the tail of @dst out of the way, then copy in @src. */
- ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right));
- ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
-
- /* Adjust the size of the preceding hole. */
- dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
-
- /* We may have changed the length of the file, so fix the end marker */
- if (dst[marker].lcn == LCN_ENOENT)
- dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
-
- return dst;
-}
-
-/**
- * ntfs_rl_insert - insert a runlist into another
- * @dst: original runlist to be worked on
- * @dsize: number of elements in @dst (including end marker)
- * @src: new runlist to be inserted
- * @ssize: number of elements in @src (excluding end marker)
- * @loc: insert the new runlist @src before this element in @dst
- *
- * Insert the runlist @src before element @loc in the runlist @dst. Merge the
- * left end of the new runlist, if necessary. Adjust the size of the hole
- * after the inserted runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
- int dsize, runlist_element *src, int ssize, int loc)
-{
- bool left = false; /* Left end of @src needs merging. */
- bool disc = false; /* Discontinuity between @dst and @src. */
- int marker; /* End of the inserted runs. */
-
- BUG_ON(!dst);
- BUG_ON(!src);
-
- /*
- * disc => Discontinuity between the end of @dst and the start of @src.
- * This means we might need to insert a "not mapped" run.
- */
- if (loc == 0)
- disc = (src[0].vcn > 0);
- else {
- s64 merged_length;
-
- left = ntfs_are_rl_mergeable(dst + loc - 1, src);
-
- merged_length = dst[loc - 1].length;
- if (left)
- merged_length += src->length;
-
- disc = (src[0].vcn > dst[loc - 1].vcn + merged_length);
- }
- /*
- * Space required: @dst size + @src size, less one if we merged, plus
- * one if there was a discontinuity.
- */
- dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc);
- if (IS_ERR(dst))
- return dst;
- /*
- * We are guaranteed to succeed from here so can start modifying the
- * original runlist.
- */
- if (left)
- __ntfs_rl_merge(dst + loc - 1, src);
- /*
- * First run after the @src runs that have been inserted.
- * Nominally, @marker equals @loc + @ssize, i.e. location + number of
- * runs in @src. However, if @left, then the first run in @src has
- * been merged with one in @dst. And if @disc, then @dst and @src do
- * not meet and we need an extra run to fill the gap.
- */
- marker = loc + ssize - left + disc;
-
- /* Move the tail of @dst out of the way, then copy in @src. */
- ntfs_rl_mm(dst, marker, loc, dsize - loc);
- ntfs_rl_mc(dst, loc + disc, src, left, ssize - left);
-
- /* Adjust the VCN of the first run after the insertion... */
- dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
- /* ... and the length. */
- if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED)
- dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn;
-
- /* Writing beyond the end of the file and there is a discontinuity. */
- if (disc) {
- if (loc > 0) {
- dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length;
- dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
- } else {
- dst[loc].vcn = 0;
- dst[loc].length = dst[loc + 1].vcn;
- }
- dst[loc].lcn = LCN_RL_NOT_MAPPED;
- }
- return dst;
-}
-
-/**
- * ntfs_rl_replace - overwrite a runlist element with another runlist
- * @dst: original runlist to be worked on
- * @dsize: number of elements in @dst (including end marker)
- * @src: new runlist to be inserted
- * @ssize: number of elements in @src (excluding end marker)
- * @loc: index in runlist @dst to overwrite with @src
- *
- * Replace the runlist element @dst at @loc with @src. Merge the left and
- * right ends of the inserted runlist, if necessary.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_replace(runlist_element *dst,
- int dsize, runlist_element *src, int ssize, int loc)
-{
- signed delta;
- bool left = false; /* Left end of @src needs merging. */
- bool right = false; /* Right end of @src needs merging. */
- int tail; /* Start of tail of @dst. */
- int marker; /* End of the inserted runs. */
-
- BUG_ON(!dst);
- BUG_ON(!src);
-
- /* First, see if the left and right ends need merging. */
- if ((loc + 1) < dsize)
- right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
- if (loc > 0)
- left = ntfs_are_rl_mergeable(dst + loc - 1, src);
- /*
- * Allocate some space. We will need less if the left, right, or both
- * ends get merged. The -1 accounts for the run being replaced.
- */
- delta = ssize - 1 - left - right;
- if (delta > 0) {
- dst = ntfs_rl_realloc(dst, dsize, dsize + delta);
- if (IS_ERR(dst))
- return dst;
- }
- /*
- * We are guaranteed to succeed from here so can start modifying the
- * original runlists.
- */
-
- /* First, merge the left and right ends, if necessary. */
- if (right)
- __ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
- if (left)
- __ntfs_rl_merge(dst + loc - 1, src);
- /*
- * Offset of the tail of @dst. This needs to be moved out of the way
- * to make space for the runs to be copied from @src, i.e. the first
- * run of the tail of @dst.
- * Nominally, @tail equals @loc + 1, i.e. location, skipping the
- * replaced run. However, if @right, then one of @dst's runs is
- * already merged into @src.
- */
- tail = loc + right + 1;
- /*
- * First run after the @src runs that have been inserted, i.e. where
- * the tail of @dst needs to be moved to.
- * Nominally, @marker equals @loc + @ssize, i.e. location + number of
- * runs in @src. However, if @left, then the first run in @src has
- * been merged with one in @dst.
- */
- marker = loc + ssize - left;
-
- /* Move the tail of @dst out of the way, then copy in @src. */
- ntfs_rl_mm(dst, marker, tail, dsize - tail);
- ntfs_rl_mc(dst, loc, src, left, ssize - left);
-
- /* We may have changed the length of the file, so fix the end marker. */
- if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT)
- dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
- return dst;
-}
-
-/**
- * ntfs_rl_split - insert a runlist into the centre of a hole
- * @dst: original runlist to be worked on
- * @dsize: number of elements in @dst (including end marker)
- * @src: new runlist to be inserted
- * @ssize: number of elements in @src (excluding end marker)
- * @loc: index in runlist @dst at which to split and insert @src
- *
- * Split the runlist @dst at @loc into two and insert @new in between the two
- * fragments. No merging of runlists is necessary. Adjust the size of the
- * holes either side.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize,
- runlist_element *src, int ssize, int loc)
-{
- BUG_ON(!dst);
- BUG_ON(!src);
-
- /* Space required: @dst size + @src size + one new hole. */
- dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1);
- if (IS_ERR(dst))
- return dst;
- /*
- * We are guaranteed to succeed from here so can start modifying the
- * original runlists.
- */
-
- /* Move the tail of @dst out of the way, then copy in @src. */
- ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc);
- ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
-
- /* Adjust the size of the holes either size of @src. */
- dst[loc].length = dst[loc+1].vcn - dst[loc].vcn;
- dst[loc+ssize+1].vcn = dst[loc+ssize].vcn + dst[loc+ssize].length;
- dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn;
-
- return dst;
-}
-
-/**
- * ntfs_runlists_merge - merge two runlists into one
- * @drl: original runlist to be worked on
- * @srl: new runlist to be merged into @drl
- *
- * First we sanity check the two runlists @srl and @drl to make sure that they
- * are sensible and can be merged. The runlist @srl must be either after the
- * runlist @drl or completely within a hole (or unmapped region) in @drl.
- *
- * It is up to the caller to serialize access to the runlists @drl and @srl.
- *
- * Merging of runlists is necessary in two cases:
- * 1. When attribute lists are used and a further extent is being mapped.
- * 2. When new clusters are allocated to fill a hole or extend a file.
- *
- * There are four possible ways @srl can be merged. It can:
- * - be inserted at the beginning of a hole,
- * - split the hole in two and be inserted between the two fragments,
- * - be appended at the end of a hole, or it can
- * - replace the whole hole.
- * It can also be appended to the end of the runlist, which is just a variant
- * of the insert case.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @drl and @srl are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- * -ERANGE - The runlists overlap and cannot be merged.
- */
-runlist_element *ntfs_runlists_merge(runlist_element *drl,
- runlist_element *srl)
-{
- int di, si; /* Current index into @[ds]rl. */
- int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */
- int dins; /* Index into @drl at which to insert @srl. */
- int dend, send; /* Last index into @[ds]rl. */
- int dfinal, sfinal; /* The last index into @[ds]rl with
- lcn >= LCN_HOLE. */
- int marker = 0;
- VCN marker_vcn = 0;
-
-#ifdef DEBUG
- ntfs_debug("dst:");
- ntfs_debug_dump_runlist(drl);
- ntfs_debug("src:");
- ntfs_debug_dump_runlist(srl);
-#endif
-
- /* Check for silly calling... */
- if (unlikely(!srl))
- return drl;
- if (IS_ERR(srl) || IS_ERR(drl))
- return ERR_PTR(-EINVAL);
-
- /* Check for the case where the first mapping is being done now. */
- if (unlikely(!drl)) {
- drl = srl;
- /* Complete the source runlist if necessary. */
- if (unlikely(drl[0].vcn)) {
- /* Scan to the end of the source runlist. */
- for (dend = 0; likely(drl[dend].length); dend++)
- ;
- dend++;
- drl = ntfs_rl_realloc(drl, dend, dend + 1);
- if (IS_ERR(drl))
- return drl;
- /* Insert start element at the front of the runlist. */
- ntfs_rl_mm(drl, 1, 0, dend);
- drl[0].vcn = 0;
- drl[0].lcn = LCN_RL_NOT_MAPPED;
- drl[0].length = drl[1].vcn;
- }
- goto finished;
- }
-
- si = di = 0;
-
- /* Skip any unmapped start element(s) in the source runlist. */
- while (srl[si].length && srl[si].lcn < LCN_HOLE)
- si++;
-
- /* Can't have an entirely unmapped source runlist. */
- BUG_ON(!srl[si].length);
-
- /* Record the starting points. */
- sstart = si;
-
- /*
- * Skip forward in @drl until we reach the position where @srl needs to
- * be inserted. If we reach the end of @drl, @srl just needs to be
- * appended to @drl.
- */
- for (; drl[di].length; di++) {
- if (drl[di].vcn + drl[di].length > srl[sstart].vcn)
- break;
- }
- dins = di;
-
- /* Sanity check for illegal overlaps. */
- if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) &&
- (srl[si].lcn >= 0)) {
- ntfs_error(NULL, "Run lists overlap. Cannot merge!");
- return ERR_PTR(-ERANGE);
- }
-
- /* Scan to the end of both runlists in order to know their sizes. */
- for (send = si; srl[send].length; send++)
- ;
- for (dend = di; drl[dend].length; dend++)
- ;
-
- if (srl[send].lcn == LCN_ENOENT)
- marker_vcn = srl[marker = send].vcn;
-
- /* Scan to the last element with lcn >= LCN_HOLE. */
- for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--)
- ;
- for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--)
- ;
-
- {
- bool start;
- bool finish;
- int ds = dend + 1; /* Number of elements in drl & srl */
- int ss = sfinal - sstart + 1;
-
- start = ((drl[dins].lcn < LCN_RL_NOT_MAPPED) || /* End of file */
- (drl[dins].vcn == srl[sstart].vcn)); /* Start of hole */
- finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) && /* End of file */
- ((drl[dins].vcn + drl[dins].length) <= /* End of hole */
- (srl[send - 1].vcn + srl[send - 1].length)));
-
- /* Or we will lose an end marker. */
- if (finish && !drl[dins].length)
- ss++;
- if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
- finish = false;
-#if 0
- ntfs_debug("dfinal = %i, dend = %i", dfinal, dend);
- ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send);
- ntfs_debug("start = %i, finish = %i", start, finish);
- ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins);
-#endif
- if (start) {
- if (finish)
- drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins);
- else
- drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins);
- } else {
- if (finish)
- drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins);
- else
- drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins);
- }
- if (IS_ERR(drl)) {
- ntfs_error(NULL, "Merge failed.");
- return drl;
- }
- ntfs_free(srl);
- if (marker) {
- ntfs_debug("Triggering marker code.");
- for (ds = dend; drl[ds].length; ds++)
- ;
- /* We only need to care if @srl ended after @drl. */
- if (drl[ds].vcn <= marker_vcn) {
- int slots = 0;
-
- if (drl[ds].vcn == marker_vcn) {
- ntfs_debug("Old marker = 0x%llx, replacing "
- "with LCN_ENOENT.",
- (unsigned long long)
- drl[ds].lcn);
- drl[ds].lcn = LCN_ENOENT;
- goto finished;
- }
- /*
- * We need to create an unmapped runlist element in
- * @drl or extend an existing one before adding the
- * ENOENT terminator.
- */
- if (drl[ds].lcn == LCN_ENOENT) {
- ds--;
- slots = 1;
- }
- if (drl[ds].lcn != LCN_RL_NOT_MAPPED) {
- /* Add an unmapped runlist element. */
- if (!slots) {
- drl = ntfs_rl_realloc_nofail(drl, ds,
- ds + 2);
- slots = 2;
- }
- ds++;
- /* Need to set vcn if it isn't set already. */
- if (slots != 1)
- drl[ds].vcn = drl[ds - 1].vcn +
- drl[ds - 1].length;
- drl[ds].lcn = LCN_RL_NOT_MAPPED;
- /* We now used up a slot. */
- slots--;
- }
- drl[ds].length = marker_vcn - drl[ds].vcn;
- /* Finally add the ENOENT terminator. */
- ds++;
- if (!slots)
- drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1);
- drl[ds].vcn = marker_vcn;
- drl[ds].lcn = LCN_ENOENT;
- drl[ds].length = (s64)0;
- }
- }
- }
-
-finished:
- /* The merge was completed successfully. */
- ntfs_debug("Merged runlist:");
- ntfs_debug_dump_runlist(drl);
- return drl;
-}
-
-/**
- * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist
- * @vol: ntfs volume on which the attribute resides
- * @attr: attribute record whose mapping pairs array to decompress
- * @old_rl: optional runlist in which to insert @attr's runlist
- *
- * It is up to the caller to serialize access to the runlist @old_rl.
- *
- * Decompress the attribute @attr's mapping pairs array into a runlist. On
- * success, return the decompressed runlist.
- *
- * If @old_rl is not NULL, decompressed runlist is inserted into the
- * appropriate place in @old_rl and the resultant, combined runlist is
- * returned. The original @old_rl is deallocated.
- *
- * If the attribute starts at vcn 0 and its mapping pairs array consists of
- * the terminator byte only, @old_rl is returned as is (which may be NULL).
- *
- * Every byte of the mapping pairs array that is read, including the
- * terminator byte, must lie strictly before the end of the attribute record,
- * i.e. before @attr + @attr->length. Anything else is treated as corruption
- * and results in -EIO.
- *
- * On error, return -errno. @old_rl is left unmodified in that case.
- *
- * The following error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EIO - Corrupt runlist.
- * -EINVAL - Invalid parameters were passed in.
- * -ERANGE - The two runlists overlap.
- *
- * FIXME: For now we take the conceptually simplest approach of creating the
- * new runlist disregarding the already existing one and then splicing the
- * two into one, if that is possible (we check for overlap and discard the new
- * runlist if overlap present before returning ERR_PTR(-ERANGE)).
- */
-runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
- const ATTR_RECORD *attr, runlist_element *old_rl)
-{
- VCN vcn; /* Current vcn. */
- LCN lcn; /* Current lcn. */
- s64 deltaxcn; /* Change in [vl]cn. */
- runlist_element *rl; /* The output runlist. */
- u8 *buf; /* Current position in mapping pairs array. */
- u8 *attr_end; /* End of attribute (exclusive). */
- int rlsize; /* Size of runlist buffer. */
- u16 rlpos; /* Current runlist position in units of
- runlist_elements. */
- u8 b; /* Current byte offset in buf. */
-
-#ifdef DEBUG
- /* Make sure attr exists and is non-resident. */
- if (!attr || !attr->non_resident || sle64_to_cpu(
- attr->data.non_resident.lowest_vcn) < (VCN)0) {
- ntfs_error(vol->sb, "Invalid arguments.");
- return ERR_PTR(-EINVAL);
- }
-#endif
- /* Start at vcn = lowest_vcn and lcn 0. */
- vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn);
- lcn = 0;
- /* Get start of the mapping pairs array. */
- buf = (u8*)attr + le16_to_cpu(
- attr->data.non_resident.mapping_pairs_offset);
- attr_end = (u8*)attr + le32_to_cpu(attr->length);
- /*
- * @attr_end points one byte past the attribute, so @buf must be strictly
- * below it for the first byte of the array to be readable at all.
- */
- if (unlikely(buf < (u8*)attr || buf >= attr_end)) {
- ntfs_error(vol->sb, "Corrupt attribute.");
- return ERR_PTR(-EIO);
- }
- /* If the mapping pairs array is valid but empty, nothing to do. */
- if (!vcn && !*buf)
- return old_rl;
- /* Current position in runlist array. */
- rlpos = 0;
- /* Allocate first page and set current runlist size to one page. */
- rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE);
- if (unlikely(!rl))
- return ERR_PTR(-ENOMEM);
- /* Insert unmapped starting element if necessary. */
- if (vcn) {
- rl->vcn = 0;
- rl->lcn = LCN_RL_NOT_MAPPED;
- rl->length = vcn;
- rlpos++;
- }
- while (buf < attr_end && *buf) {
- /*
- * Allocate more memory if needed, including space for the
- * not-mapped and terminator elements. ntfs_malloc_nofs()
- * operates on whole pages only.
- *
- * Note sizeof(*old_rl) does not evaluate @old_rl, so this is
- * fine when @old_rl is NULL.
- */
- if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) {
- runlist_element *rl2;
-
- rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
- if (unlikely(!rl2)) {
- ntfs_free(rl);
- return ERR_PTR(-ENOMEM);
- }
- memcpy(rl2, rl, rlsize);
- ntfs_free(rl);
- rl = rl2;
- rlsize += PAGE_SIZE;
- }
- /* Enter the current vcn into the current runlist element. */
- rl[rlpos].vcn = vcn;
- /*
- * Get the change in vcn, i.e. the run length in clusters.
- * Doing it this way ensures that we signextend negative values.
- * A negative run length doesn't make any sense, but hey, I
- * didn't make up the NTFS specs and Windows NT4 treats the run
- * length as a signed value so that's how it is...
- */
- b = *buf & 0xf;
- if (b) {
- /*
- * The length bytes are buf[1] to buf[b], so buf[b] must be
- * below @attr_end.
- */
- if (unlikely(buf + b >= attr_end))
- goto io_error;
- for (deltaxcn = (s8)buf[b--]; b; b--)
- deltaxcn = (deltaxcn << 8) + buf[b];
- } else { /* The length entry is compulsory. */
- ntfs_error(vol->sb, "Missing length entry in mapping "
- "pairs array.");
- deltaxcn = (s64)-1;
- }
- /*
- * Assume a negative length to indicate data corruption and
- * hence clean-up and return ERR_PTR(-EIO).
- */
- if (unlikely(deltaxcn < 0)) {
- ntfs_error(vol->sb, "Invalid length in mapping pairs "
- "array.");
- goto err_out;
- }
- /*
- * Enter the current run length into the current runlist
- * element.
- */
- rl[rlpos].length = deltaxcn;
- /* Increment the current vcn by the current run length. */
- vcn += deltaxcn;
- /*
- * There might be no lcn change at all, as is the case for
- * sparse clusters on NTFS 3.0+, in which case we set the lcn
- * to LCN_HOLE.
- */
- if (!(*buf & 0xf0))
- rl[rlpos].lcn = LCN_HOLE;
- else {
- /* Get the lcn change which really can be negative. */
- u8 b2 = *buf & 0xf;
- b = b2 + ((*buf >> 4) & 0xf);
- /*
- * The lcn bytes are buf[b2 + 1] to buf[b], so buf[b] must
- * be below @attr_end.
- */
- if (buf + b >= attr_end)
- goto io_error;
- for (deltaxcn = (s8)buf[b--]; b > b2; b--)
- deltaxcn = (deltaxcn << 8) + buf[b];
- /* Change the current lcn to its new value. */
- lcn += deltaxcn;
-#ifdef DEBUG
- /*
- * On NTFS 1.2-, apparently can have lcn == -1 to
- * indicate a hole. But we haven't verified ourselves
- * whether it is really the lcn or the deltaxcn that is
- * -1. So if either is found give us a message so we
- * can investigate it further!
- */
- if (vol->major_ver < 3) {
- if (unlikely(deltaxcn == (LCN)-1))
- ntfs_error(vol->sb, "lcn delta == -1");
- if (unlikely(lcn == (LCN)-1))
- ntfs_error(vol->sb, "lcn == -1");
- }
-#endif
- /* Check lcn is not below -1. */
- if (unlikely(lcn < (LCN)-1)) {
- ntfs_error(vol->sb, "Invalid LCN < -1 in "
- "mapping pairs array.");
- goto err_out;
- }
- /* Enter the current lcn into the runlist element. */
- rl[rlpos].lcn = lcn;
- }
- /* Get to the next runlist element. */
- rlpos++;
- /* Increment the buffer position to the next mapping pair. */
- buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1;
- }
- /* The terminator byte must be inside the attribute, too. */
- if (unlikely(buf >= attr_end))
- goto io_error;
- /*
- * If there is a highest_vcn specified, it must be equal to the final
- * vcn in the runlist - 1, or something has gone badly wrong.
- */
- deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn);
- if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) {
-mpa_err:
- ntfs_error(vol->sb, "Corrupt mapping pairs array in "
- "non-resident attribute.");
- goto err_out;
- }
- /* Setup not mapped runlist element if this is the base extent. */
- if (!attr->data.non_resident.lowest_vcn) {
- VCN max_cluster;
-
- max_cluster = ((sle64_to_cpu(
- attr->data.non_resident.allocated_size) +
- vol->cluster_size - 1) >>
- vol->cluster_size_bits) - 1;
- /*
- * A highest_vcn of zero means this is a single extent
- * attribute so simply terminate the runlist with LCN_ENOENT).
- */
- if (deltaxcn) {
- /*
- * If there is a difference between the highest_vcn and
- * the highest cluster, the runlist is either corrupt
- * or, more likely, there are more extents following
- * this one.
- */
- if (deltaxcn < max_cluster) {
- ntfs_debug("More extents to follow; deltaxcn "
- "= 0x%llx, max_cluster = "
- "0x%llx",
- (unsigned long long)deltaxcn,
- (unsigned long long)
- max_cluster);
- rl[rlpos].vcn = vcn;
- vcn += rl[rlpos].length = max_cluster -
- deltaxcn;
- rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
- rlpos++;
- } else if (unlikely(deltaxcn > max_cluster)) {
- ntfs_error(vol->sb, "Corrupt attribute. "
- "deltaxcn = 0x%llx, "
- "max_cluster = 0x%llx",
- (unsigned long long)deltaxcn,
- (unsigned long long)
- max_cluster);
- goto mpa_err;
- }
- }
- rl[rlpos].lcn = LCN_ENOENT;
- } else /* Not the base extent. There may be more extents to follow. */
- rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
-
- /* Setup terminating runlist element. */
- rl[rlpos].vcn = vcn;
- rl[rlpos].length = (s64)0;
- /* If no existing runlist was specified, we are done. */
- if (!old_rl) {
- ntfs_debug("Mapping pairs array successfully decompressed:");
- ntfs_debug_dump_runlist(rl);
- return rl;
- }
- /* Now combine the new and old runlists checking for overlaps. */
- old_rl = ntfs_runlists_merge(old_rl, rl);
- if (!IS_ERR(old_rl))
- return old_rl;
- /* The merge failed: discard the new runlist and pass the error on. */
- ntfs_free(rl);
- ntfs_error(vol->sb, "Failed to merge runlists.");
- return old_rl;
-io_error:
- ntfs_error(vol->sb, "Corrupt attribute.");
-err_out:
- ntfs_free(rl);
- return ERR_PTR(-EIO);
-}
-
-/**
- * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist
- * @rl: runlist to use for conversion
- * @vcn: vcn to convert
- *
- * Convert the virtual cluster number @vcn of an attribute into a logical
- * cluster number (lcn) of a device using the runlist @rl to map vcns to their
- * corresponding lcns.
- *
- * It is up to the caller to serialize access to the runlist @rl.
- *
- * Since lcns must be >= 0, we use negative return codes with special meaning:
- *
- * Return code Meaning / Description
- * ==================================================
- * LCN_HOLE Hole / not allocated on disk.
- * LCN_RL_NOT_MAPPED This is part of the runlist which has not been
- * inserted into the runlist yet.
- * LCN_ENOENT There is no such vcn in the attribute.
- *
- * A NULL @rl yields LCN_RL_NOT_MAPPED and a @vcn below the vcn of the first
- * runlist element yields LCN_ENOENT. A negative @vcn triggers BUG_ON().
- *
- * Locking: - The caller must have locked the runlist (for reading or writing).
- * - This function does not touch the lock, nor does it modify the
- * runlist.
- */
-LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn)
-{
- int i;
-
- BUG_ON(vcn < 0);
- /*
- * If rl is NULL, assume that we have found an unmapped runlist. The
- * caller can then attempt to map it and fail appropriately if
- * necessary.
- */
- if (unlikely(!rl))
- return LCN_RL_NOT_MAPPED;
-
- /* Catch out of lower bounds vcn. */
- if (unlikely(vcn < rl[0].vcn))
- return LCN_ENOENT;
-
- /*
- * Walk the runs up to the zero length terminator. The first run whose
- * successor starts beyond @vcn is the one containing @vcn.
- */
- for (i = 0; likely(rl[i].length); i++) {
- if (unlikely(vcn < rl[i+1].vcn)) {
- /* Real lcn: add the offset of @vcn into the run. */
- if (likely(rl[i].lcn >= (LCN)0))
- return rl[i].lcn + (vcn - rl[i].vcn);
- /* Negative lcn: return the special code unchanged. */
- return rl[i].lcn;
- }
- }
- /*
- * The terminator element is setup to the correct value, i.e. one of
- * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT.
- */
- if (likely(rl[i].lcn < (LCN)0))
- return rl[i].lcn;
- /* Just in case... We could replace this with BUG() some day. */
- return LCN_ENOENT;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_rl_find_vcn_nolock - find a vcn in a runlist
- * @rl: runlist to search
- * @vcn: vcn to find
- *
- * Find the virtual cluster number @vcn in the runlist @rl and return the
- * address of the runlist element containing the @vcn on success.
- *
- * If @vcn lies at or beyond the vcn of the terminator element and the
- * terminator has an lcn of LCN_ENOENT, the address of the terminator element
- * is returned.
- *
- * Return NULL if @rl is NULL, if @vcn is below the vcn of the first runlist
- * element, if the run containing @vcn has an lcn below LCN_HOLE, or if the
- * terminator is reached and its lcn is not LCN_ENOENT.
- *
- * A negative @vcn triggers BUG_ON().
- *
- * Locking: The runlist must be locked on entry.
- */
-runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn)
-{
- BUG_ON(vcn < 0);
- if (unlikely(!rl || vcn < rl[0].vcn))
- return NULL;
- while (likely(rl->length)) { /* Stop at the zero length terminator. */
- if (unlikely(vcn < rl[1].vcn)) { /* @vcn is inside this run. */
- if (likely(rl->lcn >= LCN_HOLE)) /* Real lcn or hole. */
- return rl;
- return NULL;
- }
- rl++;
- }
- if (likely(rl->lcn == LCN_ENOENT)) /* @rl is the terminator here. */
- return rl;
- return NULL;
-}
-
-/**
- * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number
- * @n: number for which to get the number of bytes for
- *
- * Return the number of bytes required to store @n unambiguously as
- * a signed number.
- *
- * This is used in the context of the mapping pairs array to determine how
- * many bytes will be needed in the array to store a given logical cluster
- * number (lcn) or a specific run length.
- *
- * Nothing is written anywhere; the bytes are only counted. The result is at
- * least 1, also for @n == 0.
- *
- * Return the number of bytes required. This function cannot fail.
- */
-static inline int ntfs_get_nr_significant_bytes(const s64 n)
-{
- s64 l = n;
- int i;
- s8 j;
-
- i = 0;
- /* Count bytes until only sign extension (all 0 or all 1 bits) is left. */
- do {
- l >>= 8;
- i++;
- } while (l != 0 && l != -1);
- j = (n >> 8 * (i - 1)) & 0xff; /* Most significant byte counted. */
- /* If the sign bit is wrong, we need an extra byte. */
- if ((n < 0 && j >= 0) || (n > 0 && j < 0))
- i++;
- return i;
-}
-
-/**
- * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array
- * @vol: ntfs volume (needed for the ntfs version)
- * @rl: locked runlist to determine the size of the mapping pairs of
- * @first_vcn: first vcn which to include in the mapping pairs array
- * @last_vcn: last vcn which to include in the mapping pairs array
- *
- * Walk the locked runlist @rl and calculate the size in bytes of the mapping
- * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and
- * finishing with vcn @last_vcn.
- *
- * A @last_vcn of -1 means end of runlist and in that case the size of the
- * mapping pairs array corresponding to the runlist starting at vcn @first_vcn
- * and finishing at the end of the runlist is determined.
- *
- * This for example allows us to allocate a buffer of the right size when
- * building the mapping pairs array.
- *
- * If @rl is NULL, just return 1 (for the single terminator byte).
- *
- * Return the calculated size in bytes on success. On error, return -errno.
- * The following error codes are defined:
- * -EINVAL - Run list contains unmapped elements. Make sure to only pass
- * fully mapped runlists to this function. Also returned if
- * @first_vcn is not covered by the runlist @rl.
- * -EIO - The runlist is corrupt.
- *
- * Locking: @rl must be locked on entry (either for reading or writing), it
- * remains locked throughout, and is left locked upon return.
- */
-int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
- const runlist_element *rl, const VCN first_vcn,
- const VCN last_vcn)
-{
- LCN prev_lcn;
- int rls;
- bool the_end = false;
-
- BUG_ON(first_vcn < 0);
- BUG_ON(last_vcn < -1);
- BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
- if (!rl) {
- BUG_ON(first_vcn);
- BUG_ON(last_vcn > 0);
- return 1;
- }
- /* Skip to runlist element containing @first_vcn. */
- while (rl->length && first_vcn >= rl[1].vcn)
- rl++;
- if (unlikely((!rl->length && first_vcn > rl->vcn) ||
- first_vcn < rl->vcn))
- return -EINVAL;
- prev_lcn = 0;
- /* Always need the terminating zero byte. */
- rls = 1;
- /* Do the first partial run if present. */
- if (first_vcn > rl->vcn) {
- s64 delta, length = rl->length;
-
- /* We know rl->length != 0 already. */
- if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
- goto err_out;
- /*
- * If @last_vcn is given and finishes inside this run, cap the
- * run length.
- */
- if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
- s64 s1 = last_vcn + 1;
- if (unlikely(rl[1].vcn > s1))
- length = s1 - rl->vcn;
- the_end = true;
- }
- delta = first_vcn - rl->vcn;
- /* Header byte + length. */
- rls += 1 + ntfs_get_nr_significant_bytes(length - delta);
- /*
- * If the logical cluster number (lcn) denotes a hole and we
- * are on NTFS 3.0+, we don't store it at all, i.e. we need
- * zero space. On earlier NTFS versions we just store the lcn.
- * Note: this assumes that on NTFS 1.2-, holes are stored with
- * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
- */
- if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
- prev_lcn = rl->lcn;
- if (likely(rl->lcn >= 0))
- prev_lcn += delta;
- /* Change in lcn. */
- rls += ntfs_get_nr_significant_bytes(prev_lcn);
- }
- /* Go to next runlist element. */
- rl++;
- }
- /* Do the full runs. */
- for (; rl->length && !the_end; rl++) {
- s64 length = rl->length;
-
- if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
- goto err_out;
- /*
- * If @last_vcn is given and finishes inside this run, cap the
- * run length.
- */
- if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
- s64 s1 = last_vcn + 1;
- if (unlikely(rl[1].vcn > s1))
- length = s1 - rl->vcn;
- the_end = true;
- }
- /* Header byte + length. */
- rls += 1 + ntfs_get_nr_significant_bytes(length);
- /*
- * If the logical cluster number (lcn) denotes a hole and we
- * are on NTFS 3.0+, we don't store it at all, i.e. we need
- * zero space. On earlier NTFS versions we just store the lcn.
- * Note: this assumes that on NTFS 1.2-, holes are stored with
- * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
- */
- if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
- /* Change in lcn. */
- rls += ntfs_get_nr_significant_bytes(rl->lcn -
- prev_lcn);
- prev_lcn = rl->lcn;
- }
- }
- return rls;
-err_out:
- /* Unmapped element means a caller error, anything else is corruption. */
- if (rl->lcn == LCN_RL_NOT_MAPPED)
- rls = -EINVAL;
- else
- rls = -EIO;
- return rls;
-}
-
-/**
- * ntfs_write_significant_bytes - write the significant bytes of a number
- * @dst: destination buffer to write to
- * @dst_max: pointer to last byte of destination buffer for bounds checking
- * @n: number whose significant bytes to write
- *
- * Store in @dst, the minimum bytes of the number @n which are required to
- * identify @n unambiguously as a signed number, taking care not to exceed
- * @dst_max, the maximum position within @dst to which we are allowed to
- * write.
- *
- * The bytes are stored least significant byte first.
- *
- * This is used when building the mapping pairs array of a runlist to compress
- * a given logical cluster number (lcn) or a specific run length to the minimum
- * size possible.
- *
- * Return the number of bytes written on success. On error, i.e. the
- * destination buffer @dst is too small, return -ENOSPC. Note that in the
- * error case some bytes may already have been written to @dst.
- */
-static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max,
- const s64 n)
-{
- s64 l = n;
- int i;
- s8 j;
-
- i = 0;
- /* Emit bytes until only sign extension (all 0 or all 1 bits) is left. */
- do {
- if (unlikely(dst > dst_max))
- goto err_out;
- *dst++ = l & 0xffll;
- l >>= 8;
- i++;
- } while (l != 0 && l != -1);
- j = (n >> 8 * (i - 1)) & 0xff; /* Most significant byte written. */
- /* If the sign bit is wrong, we need an extra byte. */
- if (n < 0 && j >= 0) {
- if (unlikely(dst > dst_max))
- goto err_out;
- i++;
- *dst = (s8)-1;
- } else if (n > 0 && j < 0) {
- if (unlikely(dst > dst_max))
- goto err_out;
- i++;
- *dst = (s8)0;
- }
- return i;
-err_out:
- return -ENOSPC;
-}
-
-/**
- * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist
- * @vol: ntfs volume (needed for the ntfs version)
- * @dst: destination buffer to which to write the mapping pairs array
- * @dst_len: size of destination buffer @dst in bytes
- * @rl: locked runlist for which to build the mapping pairs array
- * @first_vcn: first vcn which to include in the mapping pairs array
- * @last_vcn: last vcn which to include in the mapping pairs array
- * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC
- *
- * Create the mapping pairs array from the locked runlist @rl, starting at vcn
- * @first_vcn and finishing with vcn @last_vcn and save the array in @dst.
- * @dst_len is the size of @dst in bytes and it should be at least equal to the
- * value obtained by calling ntfs_get_size_for_mapping_pairs().
- *
- * A @last_vcn of -1 means end of runlist and in that case the mapping pairs
- * array corresponding to the runlist starting at vcn @first_vcn and finishing
- * at the end of the runlist is created.
- *
- * If @rl is NULL, just write a single terminator byte to @dst.
- *
- * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to
- * the first vcn outside the destination buffer. Note that on error, @dst has
- * been filled with all the mapping pairs that will fit, thus it can be treated
- * as partial success, in that a new attribute extent needs to be created or
- * the next extent has to be used and the mapping pairs build has to be
- * continued with @first_vcn set to *@stop_vcn.
- *
- * On -EINVAL and -EIO errors, neither *@stop_vcn nor the terminator byte are
- * written.
- *
- * Return 0 on success and -errno on error. The following error codes are
- * defined:
- * -EINVAL - Run list contains unmapped elements. Make sure to only pass
- * fully mapped runlists to this function. Also returned if
- * @first_vcn is not covered by the runlist @rl.
- * -EIO - The runlist is corrupt.
- * -ENOSPC - The destination buffer is too small.
- *
- * Locking: @rl must be locked on entry (either for reading or writing), it
- * remains locked throughout, and is left locked upon return.
- */
-int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
- const int dst_len, const runlist_element *rl,
- const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn)
-{
- LCN prev_lcn;
- s8 *dst_max, *dst_next;
- int err = -ENOSPC;
- bool the_end = false;
- s8 len_len, lcn_len;
-
- BUG_ON(first_vcn < 0);
- BUG_ON(last_vcn < -1);
- BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
- BUG_ON(dst_len < 1);
- if (!rl) {
- BUG_ON(first_vcn);
- BUG_ON(last_vcn > 0);
- if (stop_vcn)
- *stop_vcn = 0;
- /* Terminator byte. */
- *dst = 0;
- return 0;
- }
- /* Skip to runlist element containing @first_vcn. */
- while (rl->length && first_vcn >= rl[1].vcn)
- rl++;
- if (unlikely((!rl->length && first_vcn > rl->vcn) ||
- first_vcn < rl->vcn))
- return -EINVAL;
- /*
- * @dst_max is used for bounds checking in
- * ntfs_write_significant_bytes().
- */
- dst_max = dst + dst_len - 1;
- prev_lcn = 0;
- /* Do the first partial run if present. */
- if (first_vcn > rl->vcn) {
- s64 delta, length = rl->length;
-
- /* We know rl->length != 0 already. */
- if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
- goto err_out;
- /*
- * If @last_vcn is given and finishes inside this run, cap the
- * run length.
- */
- if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
- s64 s1 = last_vcn + 1;
- if (unlikely(rl[1].vcn > s1))
- length = s1 - rl->vcn;
- the_end = true;
- }
- delta = first_vcn - rl->vcn;
- /* Write length. */
- len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
- length - delta);
- if (unlikely(len_len < 0))
- goto size_err;
- /*
- * If the logical cluster number (lcn) denotes a hole and we
- * are on NTFS 3.0+, we don't store it at all, i.e. we need
- * zero space. On earlier NTFS versions we just write the lcn
- * change. FIXME: Do we need to write the lcn change or just
- * the lcn in that case? Not sure as I have never seen this
- * case on NT4. - We assume that we just need to write the lcn
- * change until someone tells us otherwise... (AIA)
- */
- if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
- prev_lcn = rl->lcn;
- if (likely(rl->lcn >= 0))
- prev_lcn += delta;
- /* Write change in lcn. */
- lcn_len = ntfs_write_significant_bytes(dst + 1 +
- len_len, dst_max, prev_lcn);
- if (unlikely(lcn_len < 0))
- goto size_err;
- } else
- lcn_len = 0;
- dst_next = dst + len_len + lcn_len + 1;
- /* Must leave room for the terminator byte at @dst_next. */
- if (unlikely(dst_next > dst_max))
- goto size_err;
- /* Update header byte. */
- *dst = lcn_len << 4 | len_len;
- /* Position at next mapping pairs array element. */
- dst = dst_next;
- /* Go to next runlist element. */
- rl++;
- }
- /* Do the full runs. */
- for (; rl->length && !the_end; rl++) {
- s64 length = rl->length;
-
- if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
- goto err_out;
- /*
- * If @last_vcn is given and finishes inside this run, cap the
- * run length.
- */
- if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
- s64 s1 = last_vcn + 1;
- if (unlikely(rl[1].vcn > s1))
- length = s1 - rl->vcn;
- the_end = true;
- }
- /* Write length. */
- len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
- length);
- if (unlikely(len_len < 0))
- goto size_err;
- /*
- * If the logical cluster number (lcn) denotes a hole and we
- * are on NTFS 3.0+, we don't store it at all, i.e. we need
- * zero space. On earlier NTFS versions we just write the lcn
- * change. FIXME: Do we need to write the lcn change or just
- * the lcn in that case? Not sure as I have never seen this
- * case on NT4. - We assume that we just need to write the lcn
- * change until someone tells us otherwise... (AIA)
- */
- if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
- /* Write change in lcn. */
- lcn_len = ntfs_write_significant_bytes(dst + 1 +
- len_len, dst_max, rl->lcn - prev_lcn);
- if (unlikely(lcn_len < 0))
- goto size_err;
- prev_lcn = rl->lcn;
- } else
- lcn_len = 0;
- dst_next = dst + len_len + lcn_len + 1;
- /* Must leave room for the terminator byte at @dst_next. */
- if (unlikely(dst_next > dst_max))
- goto size_err;
- /* Update header byte. */
- *dst = lcn_len << 4 | len_len;
- /* Position at next mapping pairs array element. */
- dst = dst_next;
- }
- /* Success. */
- err = 0;
-size_err:
- /* Set stop vcn. */
- if (stop_vcn)
- *stop_vcn = rl->vcn;
- /* Add terminator byte. */
- *dst = 0;
- return err;
-err_out:
- /* Unmapped element means a caller error, anything else is corruption. */
- if (rl->lcn == LCN_RL_NOT_MAPPED)
- err = -EINVAL;
- else
- err = -EIO;
- return err;
-}
-
-/**
- * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn
- * @vol: ntfs volume (needed for error output)
- * @runlist: runlist to truncate
- * @new_length: the new length of the runlist in VCNs
- *
- * Truncate the runlist described by @runlist as well as the memory buffer
- * holding the runlist elements to a length of @new_length VCNs.
- *
- * If @new_length lies within the runlist, the runlist elements with VCNs of
- * @new_length and above are discarded. As a special case if @new_length is
- * zero, the runlist is discarded and set to NULL.
- *
- * If @new_length lies beyond the runlist, a sparse runlist element is added to
- * the end of the runlist @runlist or if the last runlist element is a sparse
- * one already, this is extended.
- *
- * If @runlist->rl is NULL and @new_length is not zero, a new runlist made up
- * of a single sparse element of length @new_length and a terminator element
- * is allocated.
- *
- * Note, no checking is done for unmapped runlist elements. It is assumed that
- * the caller has mapped any elements that need to be mapped already.
- *
- * Return 0 on success and -errno on error. A failure to shrink the memory
- * buffer is only warned about and still counts as success, whereas a failure
- * to allocate or expand the buffer is returned as an error.
- *
- * Locking: The caller must hold @runlist->lock for writing.
- */
-int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
- const s64 new_length)
-{
- runlist_element *rl;
- int old_size;
-
- ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length);
- BUG_ON(!runlist);
- BUG_ON(new_length < 0);
- rl = runlist->rl;
- if (!new_length) {
- ntfs_debug("Freeing runlist.");
- runlist->rl = NULL;
- if (rl)
- ntfs_free(rl);
- return 0;
- }
- if (unlikely(!rl)) {
- /*
- * Create a runlist consisting of a sparse runlist element of
- * length @new_length followed by a terminator runlist element.
- */
- rl = ntfs_malloc_nofs(PAGE_SIZE);
- if (unlikely(!rl)) {
- ntfs_error(vol->sb, "Not enough memory to allocate "
- "runlist element buffer.");
- return -ENOMEM;
- }
- runlist->rl = rl;
- rl[1].length = rl->vcn = 0;
- rl->lcn = LCN_HOLE;
- rl[1].vcn = rl->length = new_length;
- rl[1].lcn = LCN_ENOENT;
- return 0;
- }
- BUG_ON(new_length < rl->vcn);
- /* Find @new_length in the runlist. */
- while (likely(rl->length && new_length >= rl[1].vcn))
- rl++;
- /*
- * If not at the end of the runlist we need to shrink it.
- * If at the end of the runlist we need to expand it.
- */
- if (rl->length) {
- runlist_element *trl;
- bool is_end;
-
- ntfs_debug("Shrinking runlist.");
- /* Determine the runlist size. */
- trl = rl + 1;
- while (likely(trl->length))
- trl++;
- old_size = trl - runlist->rl + 1; /* Includes the terminator. */
- /* Truncate the run. */
- rl->length = new_length - rl->vcn;
- /*
- * If a run was partially truncated, make the following runlist
- * element a terminator.
- */
- is_end = false;
- if (rl->length) {
- rl++;
- /* The following element already is the terminator. */
- if (!rl->length)
- is_end = true;
- rl->vcn = new_length;
- rl->length = 0;
- }
- rl->lcn = LCN_ENOENT;
- /* Reallocate memory if necessary. */
- if (!is_end) {
- int new_size = rl - runlist->rl + 1;
- rl = ntfs_rl_realloc(runlist->rl, old_size, new_size);
- if (IS_ERR(rl))
- ntfs_warning(vol->sb, "Failed to shrink "
- "runlist buffer. This just "
- "wastes a bit of memory "
- "temporarily so we ignore it "
- "and return success.");
- else
- runlist->rl = rl;
- }
- } else if (likely(/* !rl->length && */ new_length > rl->vcn)) {
- ntfs_debug("Expanding runlist.");
- /*
- * If there is a previous runlist element and it is a sparse
- * one, extend it. Otherwise need to add a new, sparse runlist
- * element.
- */
- if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE))
- (rl - 1)->length = new_length - (rl - 1)->vcn;
- else {
- /* Determine the runlist size. */
- old_size = rl - runlist->rl + 1;
- /* Reallocate memory if necessary. */
- rl = ntfs_rl_realloc(runlist->rl, old_size,
- old_size + 1);
- if (IS_ERR(rl)) {
- ntfs_error(vol->sb, "Failed to expand runlist "
- "buffer, aborting.");
- return PTR_ERR(rl);
- }
- runlist->rl = rl;
- /*
- * Set @rl to the same runlist element in the new
- * runlist as before in the old runlist.
- */
- rl += old_size - 1;
- /* Add a new, sparse runlist element. */
- rl->lcn = LCN_HOLE;
- rl->length = new_length - rl->vcn;
- /* Add a new terminator runlist element. */
- rl++;
- rl->length = 0;
- }
- /* In both cases @rl is the terminator element at this point. */
- rl->vcn = new_length;
- rl->lcn = LCN_ENOENT;
- } else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ {
- /* Runlist already has same size as requested. */
- rl->lcn = LCN_ENOENT;
- }
- ntfs_debug("Done.");
- return 0;
-}
-
-/**
- * ntfs_rl_punch_nolock - punch a hole into a runlist
- * @vol: ntfs volume (needed for error output)
- * @runlist: runlist to punch a hole into
- * @start: starting VCN of the hole to be created
- * @length: size of the hole to be created in units of clusters
- *
- * Punch a hole into the runlist @runlist starting at VCN @start and of size
- * @length clusters.
- *
- * Return 0 on success and -errno on error, in which case @runlist has not been
- * modified.
- *
- * If @start and/or @start + @length are outside the runlist return error code
- * -ENOENT.
- *
- * If the runlist contains unmapped or error elements between @start and @start
- * + @length return error code -EINVAL.
- *
- * Locking: The caller must hold @runlist->lock for writing.
- */
-int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
- const VCN start, const s64 length)
-{
- const VCN end = start + length;
- s64 delta;
- runlist_element *rl, *rl_end, *rl_real_end, *trl;
- int old_size;
- bool lcn_fixup = false;
-
- ntfs_debug("Entering for start 0x%llx, length 0x%llx.",
- (long long)start, (long long)length);
- BUG_ON(!runlist);
- BUG_ON(start < 0);
- BUG_ON(length < 0);
- BUG_ON(end < 0);
- rl = runlist->rl;
- if (unlikely(!rl)) {
- if (likely(!start && !length))
- return 0;
- return -ENOENT;
- }
- /* Find @start in the runlist. */
- while (likely(rl->length && start >= rl[1].vcn))
- rl++;
- rl_end = rl;
- /* Find @end in the runlist. */
- while (likely(rl_end->length && end >= rl_end[1].vcn)) {
- /* Verify there are no unmapped or error elements. */
- if (unlikely(rl_end->lcn < LCN_HOLE))
- return -EINVAL;
- rl_end++;
- }
- /* Check the last element. */
- if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE))
- return -EINVAL;
- /* This covers @start being out of bounds, too. */
- if (!rl_end->length && end > rl_end->vcn)
- return -ENOENT;
- if (!length)
- return 0;
- if (!rl->length)
- return -ENOENT;
- rl_real_end = rl_end;
- /* Determine the runlist size. */
- while (likely(rl_real_end->length))
- rl_real_end++;
- old_size = rl_real_end - runlist->rl + 1;
- /* If @start is in a hole simply extend the hole. */
- if (rl->lcn == LCN_HOLE) {
- /*
- * If both @start and @end are in the same sparse run, we are
- * done.
- */
- if (end <= rl[1].vcn) {
- ntfs_debug("Done (requested hole is already sparse).");
- return 0;
- }
-extend_hole:
- /* Extend the hole. */
- rl->length = end - rl->vcn;
- /* If @end is in a hole, merge it with the current one. */
- if (rl_end->lcn == LCN_HOLE) {
- rl_end++;
- rl->length = rl_end->vcn - rl->vcn;
- }
- /* We have done the hole. Now deal with the remaining tail. */
- rl++;
- /* Cut out all runlist elements up to @end. */
- if (rl < rl_end)
- memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
- sizeof(*rl));
- /* Adjust the beginning of the tail if necessary. */
- if (end > rl->vcn) {
- delta = end - rl->vcn;
- rl->vcn = end;
- rl->length -= delta;
- /* Only adjust the lcn if it is real. */
- if (rl->lcn >= 0)
- rl->lcn += delta;
- }
-shrink_allocation:
- /* Reallocate memory if the allocation changed. */
- if (rl < rl_end) {
- rl = ntfs_rl_realloc(runlist->rl, old_size,
- old_size - (rl_end - rl));
- if (IS_ERR(rl))
- ntfs_warning(vol->sb, "Failed to shrink "
- "runlist buffer. This just "
- "wastes a bit of memory "
- "temporarily so we ignore it "
- "and return success.");
- else
- runlist->rl = rl;
- }
- ntfs_debug("Done (extend hole).");
- return 0;
- }
- /*
- * If @start is at the beginning of a run things are easier as there is
- * no need to split the first run.
- */
- if (start == rl->vcn) {
- /*
- * @start is at the beginning of a run.
- *
- * If the previous run is sparse, extend its hole.
- *
- * If @end is not in the same run, switch the run to be sparse
- * and extend the newly created hole.
- *
- * Thus both of these cases reduce the problem to the above
- * case of "@start is in a hole".
- */
- if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) {
- rl--;
- goto extend_hole;
- }
- if (end >= rl[1].vcn) {
- rl->lcn = LCN_HOLE;
- goto extend_hole;
- }
- /*
- * The final case is when @end is in the same run as @start.
- * For this need to split the run into two. One run for the
- * sparse region between the beginning of the old run, i.e.
- * @start, and @end and one for the remaining non-sparse
- * region, i.e. between @end and the end of the old run.
- */
- trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
- if (IS_ERR(trl))
- goto enomem_out;
- old_size++;
- if (runlist->rl != trl) {
- rl = trl + (rl - runlist->rl);
- rl_end = trl + (rl_end - runlist->rl);
- rl_real_end = trl + (rl_real_end - runlist->rl);
- runlist->rl = trl;
- }
-split_end:
- /* Shift all the runs up by one. */
- memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl));
- /* Finally, setup the two split runs. */
- rl->lcn = LCN_HOLE;
- rl->length = length;
- rl++;
- rl->vcn += length;
- /* Only adjust the lcn if it is real. */
- if (rl->lcn >= 0 || lcn_fixup)
- rl->lcn += length;
- rl->length -= length;
- ntfs_debug("Done (split one).");
- return 0;
- }
- /*
- * @start is neither in a hole nor at the beginning of a run.
- *
- * If @end is in a hole, things are easier as simply truncating the run
- * @start is in to end at @start - 1, deleting all runs after that up
- * to @end, and finally extending the beginning of the run @end is in
- * to be @start is all that is needed.
- */
- if (rl_end->lcn == LCN_HOLE) {
- /* Truncate the run containing @start. */
- rl->length = start - rl->vcn;
- rl++;
- /* Cut out all runlist elements up to @end. */
- if (rl < rl_end)
- memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
- sizeof(*rl));
- /* Extend the beginning of the run @end is in to be @start. */
- rl->vcn = start;
- rl->length = rl[1].vcn - start;
- goto shrink_allocation;
- }
- /*
- * If @end is not in a hole there are still two cases to distinguish.
- * Either @end is or is not in the same run as @start.
- *
- * The second case is easier as it can be reduced to an already solved
- * problem by truncating the run @start is in to end at @start - 1.
- * Then, if @end is in the next run need to split the run into a sparse
- * run followed by a non-sparse run (already covered above) and if @end
- * is not in the next run switching it to be sparse, again reduces the
- * problem to the already covered case of "@start is in a hole".
- */
- if (end >= rl[1].vcn) {
- /*
- * If @end is not in the next run, reduce the problem to the
- * case of "@start is in a hole".
- */
- if (rl[1].length && end >= rl[2].vcn) {
- /* Truncate the run containing @start. */
- rl->length = start - rl->vcn;
- rl++;
- rl->vcn = start;
- rl->lcn = LCN_HOLE;
- goto extend_hole;
- }
- trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
- if (IS_ERR(trl))
- goto enomem_out;
- old_size++;
- if (runlist->rl != trl) {
- rl = trl + (rl - runlist->rl);
- rl_end = trl + (rl_end - runlist->rl);
- rl_real_end = trl + (rl_real_end - runlist->rl);
- runlist->rl = trl;
- }
- /* Truncate the run containing @start. */
- rl->length = start - rl->vcn;
- rl++;
- /*
- * @end is in the next run, reduce the problem to the case
- * where "@start is at the beginning of a run and @end is in
- * the same run as @start".
- */
- delta = rl->vcn - start;
- rl->vcn = start;
- if (rl->lcn >= 0) {
- rl->lcn -= delta;
- /* Need this in case the lcn just became negative. */
- lcn_fixup = true;
- }
- rl->length += delta;
- goto split_end;
- }
- /*
- * The first case from above, i.e. @end is in the same run as @start.
- * We need to split the run into three. One run for the non-sparse
- * region between the beginning of the old run and @start, one for the
- * sparse region between @start and @end, and one for the remaining
- * non-sparse region, i.e. between @end and the end of the old run.
- */
- trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2);
- if (IS_ERR(trl))
- goto enomem_out;
- old_size += 2;
- if (runlist->rl != trl) {
- rl = trl + (rl - runlist->rl);
- rl_end = trl + (rl_end - runlist->rl);
- rl_real_end = trl + (rl_real_end - runlist->rl);
- runlist->rl = trl;
- }
- /* Shift all the runs up by two. */
- memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl));
- /* Finally, setup the three split runs. */
- rl->length = start - rl->vcn;
- rl++;
- rl->vcn = start;
- rl->lcn = LCN_HOLE;
- rl->length = length;
- rl++;
- delta = end - rl->vcn;
- rl->vcn = end;
- rl->lcn += delta;
- rl->length -= delta;
- ntfs_debug("Done (split both).");
- return 0;
-enomem_out:
- ntfs_error(vol->sb, "Not enough memory to extend runlist buffer.");
- return -ENOMEM;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
deleted file mode 100644
index 38de0a375f59..000000000000
--- a/fs/ntfs/runlist.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * runlist.h - Defines for runlist handling in NTFS Linux kernel driver.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_RUNLIST_H
-#define _LINUX_NTFS_RUNLIST_H
-
-#include "types.h"
-#include "layout.h"
-#include "volume.h"
-
-/**
- * runlist_element - in memory vcn to lcn mapping array element
- * @vcn: starting vcn of the current array element
- * @lcn: starting lcn of the current array element
- * @length: length in clusters of the current array element
- *
- * The last vcn (in fact the last vcn + 1) is reached when length == 0.
- *
- * When lcn == -1 this means that the count vcns starting at vcn are not
- * physically allocated (i.e. this is a hole / data is sparse).
- */
-typedef struct { /* In memory vcn to lcn mapping structure element. */
- VCN vcn; /* vcn = Starting virtual cluster number. */
- LCN lcn; /* lcn = Starting logical cluster number. */
- s64 length; /* Run length in clusters. */
-} runlist_element;
-
-/**
- * runlist - in memory vcn to lcn mapping array including a read/write lock
- * @rl: pointer to an array of runlist elements
- * @lock: read/write spinlock for serializing access to @rl
- *
- */
-typedef struct {
- runlist_element *rl;
- struct rw_semaphore lock;
-} runlist;
-
-static inline void ntfs_init_runlist(runlist *rl)
-{
- rl->rl = NULL;
- init_rwsem(&rl->lock);
-}
-
-typedef enum {
- LCN_HOLE = -1, /* Keep this as highest value or die! */
- LCN_RL_NOT_MAPPED = -2,
- LCN_ENOENT = -3,
- LCN_ENOMEM = -4,
- LCN_EIO = -5,
-} LCN_SPECIAL_VALUES;
-
-extern runlist_element *ntfs_runlists_merge(runlist_element *drl,
- runlist_element *srl);
-
-extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
- const ATTR_RECORD *attr, runlist_element *old_rl);
-
-extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn);
-
-#ifdef NTFS_RW
-
-extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl,
- const VCN vcn);
-
-extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
- const runlist_element *rl, const VCN first_vcn,
- const VCN last_vcn);
-
-extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
- const int dst_len, const runlist_element *rl,
- const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn);
-
-extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol,
- runlist *const runlist, const s64 new_length);
-
-int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
- const VCN start, const s64 length);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_RUNLIST_H */
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
deleted file mode 100644
index 56a7d5bd33e4..000000000000
--- a/fs/ntfs/super.c
+++ /dev/null
@@ -1,3202 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2001,2002 Richard Russon
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/stddef.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/spinlock.h>
-#include <linux/blkdev.h> /* For bdev_logical_block_size(). */
-#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
-#include <linux/vfs.h>
-#include <linux/moduleparam.h>
-#include <linux/bitmap.h>
-
-#include "sysctl.h"
-#include "logfile.h"
-#include "quota.h"
-#include "usnjrnl.h"
-#include "dir.h"
-#include "debug.h"
-#include "index.h"
-#include "inode.h"
-#include "aops.h"
-#include "layout.h"
-#include "malloc.h"
-#include "ntfs.h"
-
-/* Number of mounted filesystems which have compression enabled. */
-static unsigned long ntfs_nr_compression_users;
-
-/* A global default upcase table and a corresponding reference count. */
-static ntfschar *default_upcase;
-static unsigned long ntfs_nr_upcase_users;
-
-/* Error constants/strings used in inode.c::ntfs_show_options(). */
-typedef enum {
- /* One of these must be present, default is ON_ERRORS_CONTINUE. */
- ON_ERRORS_PANIC = 0x01,
- ON_ERRORS_REMOUNT_RO = 0x02,
- ON_ERRORS_CONTINUE = 0x04,
- /* Optional, can be combined with any of the above. */
- ON_ERRORS_RECOVER = 0x10,
-} ON_ERRORS_ACTIONS;
-
-const option_t on_errors_arr[] = {
- { ON_ERRORS_PANIC, "panic" },
- { ON_ERRORS_REMOUNT_RO, "remount-ro", },
- { ON_ERRORS_CONTINUE, "continue", },
- { ON_ERRORS_RECOVER, "recover" },
- { 0, NULL }
-};
-
-/**
- * simple_getbool - convert input string to a boolean value
- * @s: input string to convert
- * @setval: where to store the output boolean value
- *
- * Copied from old ntfs driver (which copied from vfat driver).
- *
- * "1", "yes", "true", or an empty string are converted to %true.
- * "0", "no", and "false" are converted to %false.
- *
- * Return: %1 if the string is converted or was empty and *setval contains it;
- * %0 if the string was not valid.
- */
-static int simple_getbool(char *s, bool *setval)
-{
- if (s) {
- if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true"))
- *setval = true;
- else if (!strcmp(s, "0") || !strcmp(s, "no") ||
- !strcmp(s, "false"))
- *setval = false;
- else
- return 0;
- } else
- *setval = true;
- return 1;
-}
-
-/**
- * parse_options - parse the (re)mount options
- * @vol: ntfs volume
- * @opt: string containing the (re)mount options
- *
- * Parse the recognized options in @opt for the ntfs volume described by @vol.
- */
-static bool parse_options(ntfs_volume *vol, char *opt)
-{
- char *p, *v, *ov;
- static char *utf8 = "utf8";
- int errors = 0, sloppy = 0;
- kuid_t uid = INVALID_UID;
- kgid_t gid = INVALID_GID;
- umode_t fmask = (umode_t)-1, dmask = (umode_t)-1;
- int mft_zone_multiplier = -1, on_errors = -1;
- int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1;
- struct nls_table *nls_map = NULL, *old_nls;
-
- /* I am lazy... (-8 */
-#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value) \
- if (!strcmp(p, option)) { \
- if (!v || !*v) \
- variable = default_value; \
- else { \
- variable = simple_strtoul(ov = v, &v, 0); \
- if (*v) \
- goto needs_val; \
- } \
- }
-#define NTFS_GETOPT(option, variable) \
- if (!strcmp(p, option)) { \
- if (!v || !*v) \
- goto needs_arg; \
- variable = simple_strtoul(ov = v, &v, 0); \
- if (*v) \
- goto needs_val; \
- }
-#define NTFS_GETOPT_UID(option, variable) \
- if (!strcmp(p, option)) { \
- uid_t uid_value; \
- if (!v || !*v) \
- goto needs_arg; \
- uid_value = simple_strtoul(ov = v, &v, 0); \
- if (*v) \
- goto needs_val; \
- variable = make_kuid(current_user_ns(), uid_value); \
- if (!uid_valid(variable)) \
- goto needs_val; \
- }
-#define NTFS_GETOPT_GID(option, variable) \
- if (!strcmp(p, option)) { \
- gid_t gid_value; \
- if (!v || !*v) \
- goto needs_arg; \
- gid_value = simple_strtoul(ov = v, &v, 0); \
- if (*v) \
- goto needs_val; \
- variable = make_kgid(current_user_ns(), gid_value); \
- if (!gid_valid(variable)) \
- goto needs_val; \
- }
-#define NTFS_GETOPT_OCTAL(option, variable) \
- if (!strcmp(p, option)) { \
- if (!v || !*v) \
- goto needs_arg; \
- variable = simple_strtoul(ov = v, &v, 8); \
- if (*v) \
- goto needs_val; \
- }
-#define NTFS_GETOPT_BOOL(option, variable) \
- if (!strcmp(p, option)) { \
- bool val; \
- if (!simple_getbool(v, &val)) \
- goto needs_bool; \
- variable = val; \
- }
-#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array) \
- if (!strcmp(p, option)) { \
- int _i; \
- if (!v || !*v) \
- goto needs_arg; \
- ov = v; \
- if (variable == -1) \
- variable = 0; \
- for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \
- if (!strcmp(opt_array[_i].str, v)) { \
- variable |= opt_array[_i].val; \
- break; \
- } \
- if (!opt_array[_i].str || !*opt_array[_i].str) \
- goto needs_val; \
- }
- if (!opt || !*opt)
- goto no_mount_options;
- ntfs_debug("Entering with mount options string: %s", opt);
- while ((p = strsep(&opt, ","))) {
- if ((v = strchr(p, '=')))
- *v++ = 0;
- NTFS_GETOPT_UID("uid", uid)
- else NTFS_GETOPT_GID("gid", gid)
- else NTFS_GETOPT_OCTAL("umask", fmask = dmask)
- else NTFS_GETOPT_OCTAL("fmask", fmask)
- else NTFS_GETOPT_OCTAL("dmask", dmask)
- else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier)
- else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true)
- else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files)
- else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive)
- else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse)
- else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors,
- on_errors_arr)
- else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes"))
- ntfs_warning(vol->sb, "Ignoring obsolete option %s.",
- p);
- else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) {
- if (!strcmp(p, "iocharset"))
- ntfs_warning(vol->sb, "Option iocharset is "
- "deprecated. Please use "
- "option nls=<charsetname> in "
- "the future.");
- if (!v || !*v)
- goto needs_arg;
-use_utf8:
- old_nls = nls_map;
- nls_map = load_nls(v);
- if (!nls_map) {
- if (!old_nls) {
- ntfs_error(vol->sb, "NLS character set "
- "%s not found.", v);
- return false;
- }
- ntfs_error(vol->sb, "NLS character set %s not "
- "found. Using previous one %s.",
- v, old_nls->charset);
- nls_map = old_nls;
- } else /* nls_map */ {
- unload_nls(old_nls);
- }
- } else if (!strcmp(p, "utf8")) {
- bool val = false;
- ntfs_warning(vol->sb, "Option utf8 is no longer "
- "supported, using option nls=utf8. Please "
- "use option nls=utf8 in the future and "
- "make sure utf8 is compiled either as a "
- "module or into the kernel.");
- if (!v || !*v)
- val = true;
- else if (!simple_getbool(v, &val))
- goto needs_bool;
- if (val) {
- v = utf8;
- goto use_utf8;
- }
- } else {
- ntfs_error(vol->sb, "Unrecognized mount option %s.", p);
- if (errors < INT_MAX)
- errors++;
- }
-#undef NTFS_GETOPT_OPTIONS_ARRAY
-#undef NTFS_GETOPT_BOOL
-#undef NTFS_GETOPT
-#undef NTFS_GETOPT_WITH_DEFAULT
- }
-no_mount_options:
- if (errors && !sloppy)
- return false;
- if (sloppy)
- ntfs_warning(vol->sb, "Sloppy option given. Ignoring "
- "unrecognized mount option(s) and continuing.");
- /* Keep this first! */
- if (on_errors != -1) {
- if (!on_errors) {
- ntfs_error(vol->sb, "Invalid errors option argument "
- "or bug in options parser.");
- return false;
- }
- }
- if (nls_map) {
- if (vol->nls_map && vol->nls_map != nls_map) {
- ntfs_error(vol->sb, "Cannot change NLS character set "
- "on remount.");
- return false;
- } /* else (!vol->nls_map) */
- ntfs_debug("Using NLS character set %s.", nls_map->charset);
- vol->nls_map = nls_map;
- } else /* (!nls_map) */ {
- if (!vol->nls_map) {
- vol->nls_map = load_nls_default();
- if (!vol->nls_map) {
- ntfs_error(vol->sb, "Failed to load default "
- "NLS character set.");
- return false;
- }
- ntfs_debug("Using default NLS character set (%s).",
- vol->nls_map->charset);
- }
- }
- if (mft_zone_multiplier != -1) {
- if (vol->mft_zone_multiplier && vol->mft_zone_multiplier !=
- mft_zone_multiplier) {
- ntfs_error(vol->sb, "Cannot change mft_zone_multiplier "
- "on remount.");
- return false;
- }
- if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) {
- ntfs_error(vol->sb, "Invalid mft_zone_multiplier. "
- "Using default value, i.e. 1.");
- mft_zone_multiplier = 1;
- }
- vol->mft_zone_multiplier = mft_zone_multiplier;
- }
- if (!vol->mft_zone_multiplier)
- vol->mft_zone_multiplier = 1;
- if (on_errors != -1)
- vol->on_errors = on_errors;
- if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER)
- vol->on_errors |= ON_ERRORS_CONTINUE;
- if (uid_valid(uid))
- vol->uid = uid;
- if (gid_valid(gid))
- vol->gid = gid;
- if (fmask != (umode_t)-1)
- vol->fmask = fmask;
- if (dmask != (umode_t)-1)
- vol->dmask = dmask;
- if (show_sys_files != -1) {
- if (show_sys_files)
- NVolSetShowSystemFiles(vol);
- else
- NVolClearShowSystemFiles(vol);
- }
- if (case_sensitive != -1) {
- if (case_sensitive)
- NVolSetCaseSensitive(vol);
- else
- NVolClearCaseSensitive(vol);
- }
- if (disable_sparse != -1) {
- if (disable_sparse)
- NVolClearSparseEnabled(vol);
- else {
- if (!NVolSparseEnabled(vol) &&
- vol->major_ver && vol->major_ver < 3)
- ntfs_warning(vol->sb, "Not enabling sparse "
- "support due to NTFS volume "
- "version %i.%i (need at least "
- "version 3.0).", vol->major_ver,
- vol->minor_ver);
- else
- NVolSetSparseEnabled(vol);
- }
- }
- return true;
-needs_arg:
- ntfs_error(vol->sb, "The %s option requires an argument.", p);
- return false;
-needs_bool:
- ntfs_error(vol->sb, "The %s option requires a boolean argument.", p);
- return false;
-needs_val:
- ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov);
- return false;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_write_volume_flags - write new flags to the volume information flags
- * @vol: ntfs volume on which to modify the flags
- * @flags: new flags value for the volume information flags
- *
- * Internal function. You probably want to use ntfs_{set,clear}_volume_flags()
- * instead (see below).
- *
- * Replace the volume information flags on the volume @vol with the value
- * supplied in @flags. Note, this overwrites the volume information flags, so
- * make sure to combine the flags you want to modify with the old flags and use
- * the result when calling ntfs_write_volume_flags().
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags)
-{
- ntfs_inode *ni = NTFS_I(vol->vol_ino);
- MFT_RECORD *m;
- VOLUME_INFORMATION *vi;
- ntfs_attr_search_ctx *ctx;
- int err;
-
- ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.",
- le16_to_cpu(vol->vol_flags), le16_to_cpu(flags));
- if (vol->vol_flags == flags)
- goto done;
- BUG_ON(!ni);
- m = map_mft_record(ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(ni, m);
- if (!ctx) {
- err = -ENOMEM;
- goto put_unm_err_out;
- }
- err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
- ctx);
- if (err)
- goto put_unm_err_out;
- vi = (VOLUME_INFORMATION*)((u8*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset));
- vol->vol_flags = vi->flags = flags;
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
-done:
- ntfs_debug("Done.");
- return 0;
-put_unm_err_out:
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
-err_out:
- ntfs_error(vol->sb, "Failed with error code %i.", -err);
- return err;
-}
-
-/**
- * ntfs_set_volume_flags - set bits in the volume information flags
- * @vol: ntfs volume on which to modify the flags
- * @flags: flags to set on the volume
- *
- * Set the bits in @flags in the volume information flags on the volume @vol.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
-{
- flags &= VOLUME_FLAGS_MASK;
- return ntfs_write_volume_flags(vol, vol->vol_flags | flags);
-}
-
-/**
- * ntfs_clear_volume_flags - clear bits in the volume information flags
- * @vol: ntfs volume on which to modify the flags
- * @flags: flags to clear on the volume
- *
- * Clear the bits in @flags in the volume information flags on the volume @vol.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
-{
- flags &= VOLUME_FLAGS_MASK;
- flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags));
- return ntfs_write_volume_flags(vol, flags);
-}
-
-#endif /* NTFS_RW */
-
-/**
- * ntfs_remount - change the mount options of a mounted ntfs filesystem
- * @sb: superblock of mounted ntfs filesystem
- * @flags: remount flags
- * @opt: remount options string
- *
- * Change the mount options of an already mounted ntfs filesystem.
- *
- * NOTE: The VFS sets the @sb->s_flags remount flags to @flags after
- * ntfs_remount() returns successfully (i.e. returns 0). Otherwise,
- * @sb->s_flags are not changed.
- */
-static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
-{
- ntfs_volume *vol = NTFS_SB(sb);
-
- ntfs_debug("Entering with remount options string: %s", opt);
-
- sync_filesystem(sb);
-
-#ifndef NTFS_RW
- /* For read-only compiled driver, enforce read-only flag. */
- *flags |= SB_RDONLY;
-#else /* NTFS_RW */
- /*
- * For the read-write compiled driver, if we are remounting read-write,
- * make sure there are no volume errors and that no unsupported volume
- * flags are set. Also, empty the logfile journal as it would become
- * stale as soon as something is written to the volume and mark the
- * volume dirty so that chkdsk is run if the volume is not umounted
- * cleanly. Finally, mark the quotas out of date so Windows rescans
- * the volume on boot and updates them.
- *
- * When remounting read-only, mark the volume clean if no volume errors
- * have occurred.
- */
- if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) {
- static const char *es = ". Cannot remount read-write.";
-
- /* Remounting read-write. */
- if (NVolErrors(vol)) {
- ntfs_error(sb, "Volume has errors and is read-only%s",
- es);
- return -EROFS;
- }
- if (vol->vol_flags & VOLUME_IS_DIRTY) {
- ntfs_error(sb, "Volume is dirty and read-only%s", es);
- return -EROFS;
- }
- if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
- ntfs_error(sb, "Volume has been modified by chkdsk "
- "and is read-only%s", es);
- return -EROFS;
- }
- if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
- ntfs_error(sb, "Volume has unsupported flags set "
- "(0x%x) and is read-only%s",
- (unsigned)le16_to_cpu(vol->vol_flags),
- es);
- return -EROFS;
- }
- if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
- ntfs_error(sb, "Failed to set dirty bit in volume "
- "information flags%s", es);
- return -EROFS;
- }
-#if 0
- // TODO: Enable this code once we start modifying anything that
- // is different between NTFS 1.2 and 3.x...
- /* Set NT4 compatibility flag on newer NTFS version volumes. */
- if ((vol->major_ver > 1)) {
- if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
- ntfs_error(sb, "Failed to set NT4 "
- "compatibility flag%s", es);
- NVolSetErrors(vol);
- return -EROFS;
- }
- }
-#endif
- if (!ntfs_empty_logfile(vol->logfile_ino)) {
- ntfs_error(sb, "Failed to empty journal $LogFile%s",
- es);
- NVolSetErrors(vol);
- return -EROFS;
- }
- if (!ntfs_mark_quotas_out_of_date(vol)) {
- ntfs_error(sb, "Failed to mark quotas out of date%s",
- es);
- NVolSetErrors(vol);
- return -EROFS;
- }
- if (!ntfs_stamp_usnjrnl(vol)) {
- ntfs_error(sb, "Failed to stamp transaction log "
- "($UsnJrnl)%s", es);
- NVolSetErrors(vol);
- return -EROFS;
- }
- } else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) {
- /* Remounting read-only. */
- if (!NVolErrors(vol)) {
- if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
- ntfs_warning(sb, "Failed to clear dirty bit "
- "in volume information "
- "flags. Run chkdsk.");
- }
- }
-#endif /* NTFS_RW */
-
- // TODO: Deal with *flags.
-
- if (!parse_options(vol, opt))
- return -EINVAL;
-
- ntfs_debug("Done.");
- return 0;
-}
-
-/**
- * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector
- * @sb: Super block of the device to which @b belongs.
- * @b: Boot sector of device @sb to check.
- * @silent: If 'true', all output will be silenced.
- *
- * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot
- * sector. Returns 'true' if it is valid and 'false' if not.
- *
- * @sb is only needed for warning/error output, i.e. it can be NULL when silent
- * is 'true'.
- */
-static bool is_boot_sector_ntfs(const struct super_block *sb,
- const NTFS_BOOT_SECTOR *b, const bool silent)
-{
- /*
- * Check that checksum == sum of u32 values from b to the checksum
- * field. If checksum is zero, no checking is done. We will work when
- * the checksum test fails, since some utilities update the boot sector
- * ignoring the checksum which leaves the checksum out-of-date. We
- * report a warning if this is the case.
- */
- if ((void*)b < (void*)&b->checksum && b->checksum && !silent) {
- le32 *u;
- u32 i;
-
- for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u)
- i += le32_to_cpup(u);
- if (le32_to_cpu(b->checksum) != i)
- ntfs_warning(sb, "Invalid boot sector checksum.");
- }
- /* Check OEMidentifier is "NTFS " */
- if (b->oem_id != magicNTFS)
- goto not_ntfs;
- /* Check bytes per sector value is between 256 and 4096. */
- if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 ||
- le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000)
- goto not_ntfs;
- /* Check sectors per cluster value is valid. */
- switch (b->bpb.sectors_per_cluster) {
- case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128:
- break;
- default:
- goto not_ntfs;
- }
- /* Check the cluster size is not above the maximum (64kiB). */
- if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) *
- b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE)
- goto not_ntfs;
- /* Check reserved/unused fields are really zero. */
- if (le16_to_cpu(b->bpb.reserved_sectors) ||
- le16_to_cpu(b->bpb.root_entries) ||
- le16_to_cpu(b->bpb.sectors) ||
- le16_to_cpu(b->bpb.sectors_per_fat) ||
- le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats)
- goto not_ntfs;
- /* Check clusters per file mft record value is valid. */
- if ((u8)b->clusters_per_mft_record < 0xe1 ||
- (u8)b->clusters_per_mft_record > 0xf7)
- switch (b->clusters_per_mft_record) {
- case 1: case 2: case 4: case 8: case 16: case 32: case 64:
- break;
- default:
- goto not_ntfs;
- }
- /* Check clusters per index block value is valid. */
- if ((u8)b->clusters_per_index_record < 0xe1 ||
- (u8)b->clusters_per_index_record > 0xf7)
- switch (b->clusters_per_index_record) {
- case 1: case 2: case 4: case 8: case 16: case 32: case 64:
- break;
- default:
- goto not_ntfs;
- }
- /*
- * Check for valid end of sector marker. We will work without it, but
- * many BIOSes will refuse to boot from a bootsector if the magic is
- * incorrect, so we emit a warning.
- */
- if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
- ntfs_warning(sb, "Invalid end of sector marker.");
- return true;
-not_ntfs:
- return false;
-}
-
-/**
- * read_ntfs_boot_sector - read the NTFS boot sector of a device
- * @sb: super block of device to read the boot sector from
- * @silent: if true, suppress all output
- *
- * Reads the boot sector from the device and validates it. If that fails, tries
- * to read the backup boot sector, first from the end of the device a-la NT4 and
- * later and then from the middle of the device a-la NT3.51 and before.
- *
- * If a valid boot sector is found but it is not the primary boot sector, we
- * repair the primary boot sector silently (unless the device is read-only or
- * the primary boot sector is not accessible).
- *
- * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super
- * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized
- * to their respective values.
- *
- * Return the unlocked buffer head containing the boot sector or NULL on error.
- */
-static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb,
- const int silent)
-{
- const char *read_err_str = "Unable to read %s boot sector.";
- struct buffer_head *bh_primary, *bh_backup;
- sector_t nr_blocks = NTFS_SB(sb)->nr_blocks;
-
- /* Try to read primary boot sector. */
- if ((bh_primary = sb_bread(sb, 0))) {
- if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
- bh_primary->b_data, silent))
- return bh_primary;
- if (!silent)
- ntfs_error(sb, "Primary boot sector is invalid.");
- } else if (!silent)
- ntfs_error(sb, read_err_str, "primary");
- if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) {
- if (bh_primary)
- brelse(bh_primary);
- if (!silent)
- ntfs_error(sb, "Mount option errors=recover not used. "
- "Aborting without trying to recover.");
- return NULL;
- }
- /* Try to read NT4+ backup boot sector. */
- if ((bh_backup = sb_bread(sb, nr_blocks - 1))) {
- if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
- bh_backup->b_data, silent))
- goto hotfix_primary_boot_sector;
- brelse(bh_backup);
- } else if (!silent)
- ntfs_error(sb, read_err_str, "backup");
- /* Try to read NT3.51- backup boot sector. */
- if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) {
- if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
- bh_backup->b_data, silent))
- goto hotfix_primary_boot_sector;
- if (!silent)
- ntfs_error(sb, "Could not find a valid backup boot "
- "sector.");
- brelse(bh_backup);
- } else if (!silent)
- ntfs_error(sb, read_err_str, "backup");
- /* We failed. Cleanup and return. */
- if (bh_primary)
- brelse(bh_primary);
- return NULL;
-hotfix_primary_boot_sector:
- if (bh_primary) {
- /*
- * If we managed to read sector zero and the volume is not
- * read-only, copy the found, valid backup boot sector to the
- * primary boot sector. Note we only copy the actual boot
- * sector structure, not the actual whole device sector as that
- * may be bigger and would potentially damage the $Boot system
- * file (FIXME: Would be nice to know if the backup boot sector
- * on a large sector device contains the whole boot loader or
- * just the first 512 bytes).
- */
- if (!sb_rdonly(sb)) {
- ntfs_warning(sb, "Hot-fix: Recovering invalid primary "
- "boot sector from backup copy.");
- memcpy(bh_primary->b_data, bh_backup->b_data,
- NTFS_BLOCK_SIZE);
- mark_buffer_dirty(bh_primary);
- sync_dirty_buffer(bh_primary);
- if (buffer_uptodate(bh_primary)) {
- brelse(bh_backup);
- return bh_primary;
- }
- ntfs_error(sb, "Hot-fix: Device write error while "
- "recovering primary boot sector.");
- } else {
- ntfs_warning(sb, "Hot-fix: Recovery of primary boot "
- "sector failed: Read-only mount.");
- }
- brelse(bh_primary);
- }
- ntfs_warning(sb, "Using backup boot sector.");
- return bh_backup;
-}
-
-/**
- * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol
- * @vol: volume structure to initialise with data from boot sector
- * @b: boot sector to parse
- *
- * Parse the ntfs boot sector @b and store all imporant information therein in
- * the ntfs super block @vol. Return 'true' on success and 'false' on error.
- */
-static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
-{
- unsigned int sectors_per_cluster_bits, nr_hidden_sects;
- int clusters_per_mft_record, clusters_per_index_record;
- s64 ll;
-
- vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector);
- vol->sector_size_bits = ffs(vol->sector_size) - 1;
- ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size,
- vol->sector_size);
- ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits,
- vol->sector_size_bits);
- if (vol->sector_size < vol->sb->s_blocksize) {
- ntfs_error(vol->sb, "Sector size (%i) is smaller than the "
- "device block size (%lu). This is not "
- "supported. Sorry.", vol->sector_size,
- vol->sb->s_blocksize);
- return false;
- }
- ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster);
- sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1;
- ntfs_debug("sectors_per_cluster_bits = 0x%x",
- sectors_per_cluster_bits);
- nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors);
- ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects);
- vol->cluster_size = vol->sector_size << sectors_per_cluster_bits;
- vol->cluster_size_mask = vol->cluster_size - 1;
- vol->cluster_size_bits = ffs(vol->cluster_size) - 1;
- ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size,
- vol->cluster_size);
- ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask);
- ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits);
- if (vol->cluster_size < vol->sector_size) {
- ntfs_error(vol->sb, "Cluster size (%i) is smaller than the "
- "sector size (%i). This is not supported. "
- "Sorry.", vol->cluster_size, vol->sector_size);
- return false;
- }
- clusters_per_mft_record = b->clusters_per_mft_record;
- ntfs_debug("clusters_per_mft_record = %i (0x%x)",
- clusters_per_mft_record, clusters_per_mft_record);
- if (clusters_per_mft_record > 0)
- vol->mft_record_size = vol->cluster_size <<
- (ffs(clusters_per_mft_record) - 1);
- else
- /*
- * When mft_record_size < cluster_size, clusters_per_mft_record
- * = -log2(mft_record_size) bytes. mft_record_size normaly is
- * 1024 bytes, which is encoded as 0xF6 (-10 in decimal).
- */
- vol->mft_record_size = 1 << -clusters_per_mft_record;
- vol->mft_record_size_mask = vol->mft_record_size - 1;
- vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1;
- ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size,
- vol->mft_record_size);
- ntfs_debug("vol->mft_record_size_mask = 0x%x",
- vol->mft_record_size_mask);
- ntfs_debug("vol->mft_record_size_bits = %i (0x%x)",
- vol->mft_record_size_bits, vol->mft_record_size_bits);
- /*
- * We cannot support mft record sizes above the PAGE_SIZE since
- * we store $MFT/$DATA, the table of mft records in the page cache.
- */
- if (vol->mft_record_size > PAGE_SIZE) {
- ntfs_error(vol->sb, "Mft record size (%i) exceeds the "
- "PAGE_SIZE on your system (%lu). "
- "This is not supported. Sorry.",
- vol->mft_record_size, PAGE_SIZE);
- return false;
- }
- /* We cannot support mft record sizes below the sector size. */
- if (vol->mft_record_size < vol->sector_size) {
- ntfs_error(vol->sb, "Mft record size (%i) is smaller than the "
- "sector size (%i). This is not supported. "
- "Sorry.", vol->mft_record_size,
- vol->sector_size);
- return false;
- }
- clusters_per_index_record = b->clusters_per_index_record;
- ntfs_debug("clusters_per_index_record = %i (0x%x)",
- clusters_per_index_record, clusters_per_index_record);
- if (clusters_per_index_record > 0)
- vol->index_record_size = vol->cluster_size <<
- (ffs(clusters_per_index_record) - 1);
- else
- /*
- * When index_record_size < cluster_size,
- * clusters_per_index_record = -log2(index_record_size) bytes.
- * index_record_size normaly equals 4096 bytes, which is
- * encoded as 0xF4 (-12 in decimal).
- */
- vol->index_record_size = 1 << -clusters_per_index_record;
- vol->index_record_size_mask = vol->index_record_size - 1;
- vol->index_record_size_bits = ffs(vol->index_record_size) - 1;
- ntfs_debug("vol->index_record_size = %i (0x%x)",
- vol->index_record_size, vol->index_record_size);
- ntfs_debug("vol->index_record_size_mask = 0x%x",
- vol->index_record_size_mask);
- ntfs_debug("vol->index_record_size_bits = %i (0x%x)",
- vol->index_record_size_bits,
- vol->index_record_size_bits);
- /* We cannot support index record sizes below the sector size. */
- if (vol->index_record_size < vol->sector_size) {
- ntfs_error(vol->sb, "Index record size (%i) is smaller than "
- "the sector size (%i). This is not "
- "supported. Sorry.", vol->index_record_size,
- vol->sector_size);
- return false;
- }
- /*
- * Get the size of the volume in clusters and check for 64-bit-ness.
- * Windows currently only uses 32 bits to save the clusters so we do
- * the same as it is much faster on 32-bit CPUs.
- */
- ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits;
- if ((u64)ll >= 1ULL << 32) {
- ntfs_error(vol->sb, "Cannot handle 64-bit clusters. Sorry.");
- return false;
- }
- vol->nr_clusters = ll;
- ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters);
- /*
- * On an architecture where unsigned long is 32-bits, we restrict the
- * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler
- * will hopefully optimize the whole check away.
- */
- if (sizeof(unsigned long) < 8) {
- if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) {
- ntfs_error(vol->sb, "Volume size (%lluTiB) is too "
- "large for this architecture. "
- "Maximum supported is 2TiB. Sorry.",
- (unsigned long long)ll >> (40 -
- vol->cluster_size_bits));
- return false;
- }
- }
- ll = sle64_to_cpu(b->mft_lcn);
- if (ll >= vol->nr_clusters) {
- ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of "
- "volume. Weird.", (unsigned long long)ll,
- (unsigned long long)ll);
- return false;
- }
- vol->mft_lcn = ll;
- ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn);
- ll = sle64_to_cpu(b->mftmirr_lcn);
- if (ll >= vol->nr_clusters) {
- ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end "
- "of volume. Weird.", (unsigned long long)ll,
- (unsigned long long)ll);
- return false;
- }
- vol->mftmirr_lcn = ll;
- ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn);
-#ifdef NTFS_RW
- /*
- * Work out the size of the mft mirror in number of mft records. If the
- * cluster size is less than or equal to the size taken by four mft
- * records, the mft mirror stores the first four mft records. If the
- * cluster size is bigger than the size taken by four mft records, the
- * mft mirror contains as many mft records as will fit into one
- * cluster.
- */
- if (vol->cluster_size <= (4 << vol->mft_record_size_bits))
- vol->mftmirr_size = 4;
- else
- vol->mftmirr_size = vol->cluster_size >>
- vol->mft_record_size_bits;
- ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size);
-#endif /* NTFS_RW */
- vol->serial_no = le64_to_cpu(b->volume_serial_number);
- ntfs_debug("vol->serial_no = 0x%llx",
- (unsigned long long)vol->serial_no);
- return true;
-}
-
-/**
- * ntfs_setup_allocators - initialize the cluster and mft allocators
- * @vol: volume structure for which to setup the allocators
- *
- * Setup the cluster (lcn) and mft allocators to the starting values.
- */
-static void ntfs_setup_allocators(ntfs_volume *vol)
-{
-#ifdef NTFS_RW
- LCN mft_zone_size, mft_lcn;
-#endif /* NTFS_RW */
-
- ntfs_debug("vol->mft_zone_multiplier = 0x%x",
- vol->mft_zone_multiplier);
-#ifdef NTFS_RW
- /* Determine the size of the MFT zone. */
- mft_zone_size = vol->nr_clusters;
- switch (vol->mft_zone_multiplier) { /* % of volume size in clusters */
- case 4:
- mft_zone_size >>= 1; /* 50% */
- break;
- case 3:
- mft_zone_size = (mft_zone_size +
- (mft_zone_size >> 1)) >> 2; /* 37.5% */
- break;
- case 2:
- mft_zone_size >>= 2; /* 25% */
- break;
- /* case 1: */
- default:
- mft_zone_size >>= 3; /* 12.5% */
- break;
- }
- /* Setup the mft zone. */
- vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn;
- ntfs_debug("vol->mft_zone_pos = 0x%llx",
- (unsigned long long)vol->mft_zone_pos);
- /*
- * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs
- * source) and if the actual mft_lcn is in the expected place or even
- * further to the front of the volume, extend the mft_zone to cover the
- * beginning of the volume as well. This is in order to protect the
- * area reserved for the mft bitmap as well within the mft_zone itself.
- * On non-standard volumes we do not protect it as the overhead would
- * be higher than the speed increase we would get by doing it.
- */
- mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size;
- if (mft_lcn * vol->cluster_size < 16 * 1024)
- mft_lcn = (16 * 1024 + vol->cluster_size - 1) /
- vol->cluster_size;
- if (vol->mft_zone_start <= mft_lcn)
- vol->mft_zone_start = 0;
- ntfs_debug("vol->mft_zone_start = 0x%llx",
- (unsigned long long)vol->mft_zone_start);
- /*
- * Need to cap the mft zone on non-standard volumes so that it does
- * not point outside the boundaries of the volume. We do this by
- * halving the zone size until we are inside the volume.
- */
- vol->mft_zone_end = vol->mft_lcn + mft_zone_size;
- while (vol->mft_zone_end >= vol->nr_clusters) {
- mft_zone_size >>= 1;
- vol->mft_zone_end = vol->mft_lcn + mft_zone_size;
- }
- ntfs_debug("vol->mft_zone_end = 0x%llx",
- (unsigned long long)vol->mft_zone_end);
- /*
- * Set the current position within each data zone to the start of the
- * respective zone.
- */
- vol->data1_zone_pos = vol->mft_zone_end;
- ntfs_debug("vol->data1_zone_pos = 0x%llx",
- (unsigned long long)vol->data1_zone_pos);
- vol->data2_zone_pos = 0;
- ntfs_debug("vol->data2_zone_pos = 0x%llx",
- (unsigned long long)vol->data2_zone_pos);
-
- /* Set the mft data allocation position to mft record 24. */
- vol->mft_data_pos = 24;
- ntfs_debug("vol->mft_data_pos = 0x%llx",
- (unsigned long long)vol->mft_data_pos);
-#endif /* NTFS_RW */
-}
-
-#ifdef NTFS_RW
-
-/**
- * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume
- * @vol: ntfs super block describing device whose mft mirror to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_init_mft_mirror(ntfs_volume *vol)
-{
- struct inode *tmp_ino;
- ntfs_inode *tmp_ni;
-
- ntfs_debug("Entering.");
- /* Get mft mirror inode. */
- tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr);
- if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
- if (!IS_ERR(tmp_ino))
- iput(tmp_ino);
- /* Caller will display error message. */
- return false;
- }
- /*
- * Re-initialize some specifics about $MFTMirr's inode as
- * ntfs_read_inode() will have set up the default ones.
- */
- /* Set uid and gid to root. */
- tmp_ino->i_uid = GLOBAL_ROOT_UID;
- tmp_ino->i_gid = GLOBAL_ROOT_GID;
- /* Regular file. No access for anyone. */
- tmp_ino->i_mode = S_IFREG;
- /* No VFS initiated operations allowed for $MFTMirr. */
- tmp_ino->i_op = &ntfs_empty_inode_ops;
- tmp_ino->i_fop = &ntfs_empty_file_ops;
- /* Put in our special address space operations. */
- tmp_ino->i_mapping->a_ops = &ntfs_mst_aops;
- tmp_ni = NTFS_I(tmp_ino);
- /* The $MFTMirr, like the $MFT is multi sector transfer protected. */
- NInoSetMstProtected(tmp_ni);
- NInoSetSparseDisabled(tmp_ni);
- /*
- * Set up our little cheat allowing us to reuse the async read io
- * completion handler for directories.
- */
- tmp_ni->itype.index.block_size = vol->mft_record_size;
- tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits;
- vol->mftmirr_ino = tmp_ino;
- ntfs_debug("Done.");
- return true;
-}
-
-/**
- * check_mft_mirror - compare contents of the mft mirror with the mft
- * @vol: ntfs super block describing device whose mft mirror to check
- *
- * Return 'true' on success or 'false' on error.
- *
- * Note, this function also results in the mft mirror runlist being completely
- * mapped into memory. The mft mirror write code requires this and will BUG()
- * should it find an unmapped runlist element.
- */
-static bool check_mft_mirror(ntfs_volume *vol)
-{
- struct super_block *sb = vol->sb;
- ntfs_inode *mirr_ni;
- struct page *mft_page, *mirr_page;
- u8 *kmft, *kmirr;
- runlist_element *rl, rl2[2];
- pgoff_t index;
- int mrecs_per_page, i;
-
- ntfs_debug("Entering.");
- /* Compare contents of $MFT and $MFTMirr. */
- mrecs_per_page = PAGE_SIZE / vol->mft_record_size;
- BUG_ON(!mrecs_per_page);
- BUG_ON(!vol->mftmirr_size);
- mft_page = mirr_page = NULL;
- kmft = kmirr = NULL;
- index = i = 0;
- do {
- u32 bytes;
-
- /* Switch pages if necessary. */
- if (!(i % mrecs_per_page)) {
- if (index) {
- ntfs_unmap_page(mft_page);
- ntfs_unmap_page(mirr_page);
- }
- /* Get the $MFT page. */
- mft_page = ntfs_map_page(vol->mft_ino->i_mapping,
- index);
- if (IS_ERR(mft_page)) {
- ntfs_error(sb, "Failed to read $MFT.");
- return false;
- }
- kmft = page_address(mft_page);
- /* Get the $MFTMirr page. */
- mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping,
- index);
- if (IS_ERR(mirr_page)) {
- ntfs_error(sb, "Failed to read $MFTMirr.");
- goto mft_unmap_out;
- }
- kmirr = page_address(mirr_page);
- ++index;
- }
- /* Do not check the record if it is not in use. */
- if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) {
- /* Make sure the record is ok. */
- if (ntfs_is_baad_recordp((le32*)kmft)) {
- ntfs_error(sb, "Incomplete multi sector "
- "transfer detected in mft "
- "record %i.", i);
-mm_unmap_out:
- ntfs_unmap_page(mirr_page);
-mft_unmap_out:
- ntfs_unmap_page(mft_page);
- return false;
- }
- }
- /* Do not check the mirror record if it is not in use. */
- if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) {
- if (ntfs_is_baad_recordp((le32*)kmirr)) {
- ntfs_error(sb, "Incomplete multi sector "
- "transfer detected in mft "
- "mirror record %i.", i);
- goto mm_unmap_out;
- }
- }
- /* Get the amount of data in the current record. */
- bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use);
- if (bytes < sizeof(MFT_RECORD_OLD) ||
- bytes > vol->mft_record_size ||
- ntfs_is_baad_recordp((le32*)kmft)) {
- bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use);
- if (bytes < sizeof(MFT_RECORD_OLD) ||
- bytes > vol->mft_record_size ||
- ntfs_is_baad_recordp((le32*)kmirr))
- bytes = vol->mft_record_size;
- }
- /* Compare the two records. */
- if (memcmp(kmft, kmirr, bytes)) {
- ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not "
- "match. Run ntfsfix or chkdsk.", i);
- goto mm_unmap_out;
- }
- kmft += vol->mft_record_size;
- kmirr += vol->mft_record_size;
- } while (++i < vol->mftmirr_size);
- /* Release the last pages. */
- ntfs_unmap_page(mft_page);
- ntfs_unmap_page(mirr_page);
-
- /* Construct the mft mirror runlist by hand. */
- rl2[0].vcn = 0;
- rl2[0].lcn = vol->mftmirr_lcn;
- rl2[0].length = (vol->mftmirr_size * vol->mft_record_size +
- vol->cluster_size - 1) / vol->cluster_size;
- rl2[1].vcn = rl2[0].length;
- rl2[1].lcn = LCN_ENOENT;
- rl2[1].length = 0;
- /*
- * Because we have just read all of the mft mirror, we know we have
- * mapped the full runlist for it.
- */
- mirr_ni = NTFS_I(vol->mftmirr_ino);
- down_read(&mirr_ni->runlist.lock);
- rl = mirr_ni->runlist.rl;
- /* Compare the two runlists. They must be identical. */
- i = 0;
- do {
- if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn ||
- rl2[i].length != rl[i].length) {
- ntfs_error(sb, "$MFTMirr location mismatch. "
- "Run chkdsk.");
- up_read(&mirr_ni->runlist.lock);
- return false;
- }
- } while (rl2[i++].length);
- up_read(&mirr_ni->runlist.lock);
- ntfs_debug("Done.");
- return true;
-}
-
-/**
- * load_and_check_logfile - load and check the logfile inode for a volume
- * @vol: ntfs super block describing device whose logfile to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_check_logfile(ntfs_volume *vol,
- RESTART_PAGE_HEADER **rp)
-{
- struct inode *tmp_ino;
-
- ntfs_debug("Entering.");
- tmp_ino = ntfs_iget(vol->sb, FILE_LogFile);
- if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
- if (!IS_ERR(tmp_ino))
- iput(tmp_ino);
- /* Caller will display error message. */
- return false;
- }
- if (!ntfs_check_logfile(tmp_ino, rp)) {
- iput(tmp_ino);
- /* ntfs_check_logfile() will have displayed error output. */
- return false;
- }
- NInoSetSparseDisabled(NTFS_I(tmp_ino));
- vol->logfile_ino = tmp_ino;
- ntfs_debug("Done.");
- return true;
-}
-
-#define NTFS_HIBERFIL_HEADER_SIZE 4096
-
-/**
- * check_windows_hibernation_status - check if Windows is suspended on a volume
- * @vol: ntfs super block of device to check
- *
- * Check if Windows is hibernated on the ntfs volume @vol. This is done by
- * looking for the file hiberfil.sys in the root directory of the volume. If
- * the file is not present Windows is definitely not suspended.
- *
- * If hiberfil.sys exists and is less than 4kiB in size it means Windows is
- * definitely suspended (this volume is not the system volume). Caveat: on a
- * system with many volumes it is possible that the < 4kiB check is bogus but
- * for now this should do fine.
- *
- * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the
- * hiberfil header (which is the first 4kiB). If this begins with "hibr",
- * Windows is definitely suspended. If it is completely full of zeroes,
- * Windows is definitely not hibernated. Any other case is treated as if
- * Windows is suspended. This caters for the above mentioned caveat of a
- * system with many volumes where no "hibr" magic would be present and there is
- * no zero header.
- *
- * Return 0 if Windows is not hibernated on the volume, >0 if Windows is
- * hibernated on the volume, and -errno on error.
- */
-static int check_windows_hibernation_status(ntfs_volume *vol)
-{
- MFT_REF mref;
- struct inode *vi;
- struct page *page;
- u32 *kaddr, *kend;
- ntfs_name *name = NULL;
- int ret = 1;
- static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
- cpu_to_le16('i'), cpu_to_le16('b'),
- cpu_to_le16('e'), cpu_to_le16('r'),
- cpu_to_le16('f'), cpu_to_le16('i'),
- cpu_to_le16('l'), cpu_to_le16('.'),
- cpu_to_le16('s'), cpu_to_le16('y'),
- cpu_to_le16('s'), 0 };
-
- ntfs_debug("Entering.");
- /*
- * Find the inode number for the hibernation file by looking up the
- * filename hiberfil.sys in the root directory.
- */
- inode_lock(vol->root_ino);
- mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12,
- &name);
- inode_unlock(vol->root_ino);
- if (IS_ERR_MREF(mref)) {
- ret = MREF_ERR(mref);
- /* If the file does not exist, Windows is not hibernated. */
- if (ret == -ENOENT) {
- ntfs_debug("hiberfil.sys not present. Windows is not "
- "hibernated on the volume.");
- return 0;
- }
- /* A real error occurred. */
- ntfs_error(vol->sb, "Failed to find inode number for "
- "hiberfil.sys.");
- return ret;
- }
- /* We do not care for the type of match that was found. */
- kfree(name);
- /* Get the inode. */
- vi = ntfs_iget(vol->sb, MREF(mref));
- if (IS_ERR(vi) || is_bad_inode(vi)) {
- if (!IS_ERR(vi))
- iput(vi);
- ntfs_error(vol->sb, "Failed to load hiberfil.sys.");
- return IS_ERR(vi) ? PTR_ERR(vi) : -EIO;
- }
- if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) {
- ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx). "
- "Windows is hibernated on the volume. This "
- "is not the system volume.", i_size_read(vi));
- goto iput_out;
- }
- page = ntfs_map_page(vi->i_mapping, 0);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to read from hiberfil.sys.");
- ret = PTR_ERR(page);
- goto iput_out;
- }
- kaddr = (u32*)page_address(page);
- if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
- ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is "
- "hibernated on the volume. This is the "
- "system volume.");
- goto unm_iput_out;
- }
- kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr);
- do {
- if (unlikely(*kaddr)) {
- ntfs_debug("hiberfil.sys is larger than 4kiB "
- "(0x%llx), does not contain the "
- "\"hibr\" magic, and does not have a "
- "zero header. Windows is hibernated "
- "on the volume. This is not the "
- "system volume.", i_size_read(vi));
- goto unm_iput_out;
- }
- } while (++kaddr < kend);
- ntfs_debug("hiberfil.sys contains a zero header. Windows is not "
- "hibernated on the volume. This is the system "
- "volume.");
- ret = 0;
-unm_iput_out:
- ntfs_unmap_page(page);
-iput_out:
- iput(vi);
- return ret;
-}
-
-/**
- * load_and_init_quota - load and setup the quota file for a volume if present
- * @vol: ntfs super block describing device whose quota file to load
- *
- * Return 'true' on success or 'false' on error. If $Quota is not present, we
- * leave vol->quota_ino as NULL and return success.
- */
-static bool load_and_init_quota(ntfs_volume *vol)
-{
- MFT_REF mref;
- struct inode *tmp_ino;
- ntfs_name *name = NULL;
- static const ntfschar Quota[7] = { cpu_to_le16('$'),
- cpu_to_le16('Q'), cpu_to_le16('u'),
- cpu_to_le16('o'), cpu_to_le16('t'),
- cpu_to_le16('a'), 0 };
- static ntfschar Q[3] = { cpu_to_le16('$'),
- cpu_to_le16('Q'), 0 };
-
- ntfs_debug("Entering.");
- /*
- * Find the inode number for the quota file by looking up the filename
- * $Quota in the extended system files directory $Extend.
- */
- inode_lock(vol->extend_ino);
- mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
- &name);
- inode_unlock(vol->extend_ino);
- if (IS_ERR_MREF(mref)) {
- /*
- * If the file does not exist, quotas are disabled and have
- * never been enabled on this volume, just return success.
- */
- if (MREF_ERR(mref) == -ENOENT) {
- ntfs_debug("$Quota not present. Volume does not have "
- "quotas enabled.");
- /*
- * No need to try to set quotas out of date if they are
- * not enabled.
- */
- NVolSetQuotaOutOfDate(vol);
- return true;
- }
- /* A real error occurred. */
- ntfs_error(vol->sb, "Failed to find inode number for $Quota.");
- return false;
- }
- /* We do not care for the type of match that was found. */
- kfree(name);
- /* Get the inode. */
- tmp_ino = ntfs_iget(vol->sb, MREF(mref));
- if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
- if (!IS_ERR(tmp_ino))
- iput(tmp_ino);
- ntfs_error(vol->sb, "Failed to load $Quota.");
- return false;
- }
- vol->quota_ino = tmp_ino;
- /* Get the $Q index allocation attribute. */
- tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2);
- if (IS_ERR(tmp_ino)) {
- ntfs_error(vol->sb, "Failed to load $Quota/$Q index.");
- return false;
- }
- vol->quota_q_ino = tmp_ino;
- ntfs_debug("Done.");
- return true;
-}
-
-/**
- * load_and_init_usnjrnl - load and setup the transaction log if present
- * @vol: ntfs super block describing device whose usnjrnl file to load
- *
- * Return 'true' on success or 'false' on error.
- *
- * If $UsnJrnl is not present or in the process of being disabled, we set
- * NVolUsnJrnlStamped() and return success.
- *
- * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn,
- * i.e. transaction logging has only just been enabled or the journal has been
- * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped()
- * and return success.
- */
-static bool load_and_init_usnjrnl(ntfs_volume *vol)
-{
- MFT_REF mref;
- struct inode *tmp_ino;
- ntfs_inode *tmp_ni;
- struct page *page;
- ntfs_name *name = NULL;
- USN_HEADER *uh;
- static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
- cpu_to_le16('U'), cpu_to_le16('s'),
- cpu_to_le16('n'), cpu_to_le16('J'),
- cpu_to_le16('r'), cpu_to_le16('n'),
- cpu_to_le16('l'), 0 };
- static ntfschar Max[5] = { cpu_to_le16('$'),
- cpu_to_le16('M'), cpu_to_le16('a'),
- cpu_to_le16('x'), 0 };
- static ntfschar J[3] = { cpu_to_le16('$'),
- cpu_to_le16('J'), 0 };
-
- ntfs_debug("Entering.");
- /*
- * Find the inode number for the transaction log file by looking up the
- * filename $UsnJrnl in the extended system files directory $Extend.
- */
- inode_lock(vol->extend_ino);
- mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8,
- &name);
- inode_unlock(vol->extend_ino);
- if (IS_ERR_MREF(mref)) {
- /*
- * If the file does not exist, transaction logging is disabled,
- * just return success.
- */
- if (MREF_ERR(mref) == -ENOENT) {
- ntfs_debug("$UsnJrnl not present. Volume does not "
- "have transaction logging enabled.");
-not_enabled:
- /*
- * No need to try to stamp the transaction log if
- * transaction logging is not enabled.
- */
- NVolSetUsnJrnlStamped(vol);
- return true;
- }
- /* A real error occurred. */
- ntfs_error(vol->sb, "Failed to find inode number for "
- "$UsnJrnl.");
- return false;
- }
- /* We do not care for the type of match that was found. */
- kfree(name);
- /* Get the inode. */
- tmp_ino = ntfs_iget(vol->sb, MREF(mref));
- if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) {
- if (!IS_ERR(tmp_ino))
- iput(tmp_ino);
- ntfs_error(vol->sb, "Failed to load $UsnJrnl.");
- return false;
- }
- vol->usnjrnl_ino = tmp_ino;
- /*
- * If the transaction log is in the process of being deleted, we can
- * ignore it.
- */
- if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) {
- ntfs_debug("$UsnJrnl in the process of being disabled. "
- "Volume does not have transaction logging "
- "enabled.");
- goto not_enabled;
- }
- /* Get the $DATA/$Max attribute. */
- tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4);
- if (IS_ERR(tmp_ino)) {
- ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max "
- "attribute.");
- return false;
- }
- vol->usnjrnl_max_ino = tmp_ino;
- if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) {
- ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max "
- "attribute (size is 0x%llx but should be at "
- "least 0x%zx bytes).", i_size_read(tmp_ino),
- sizeof(USN_HEADER));
- return false;
- }
- /* Get the $DATA/$J attribute. */
- tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2);
- if (IS_ERR(tmp_ino)) {
- ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J "
- "attribute.");
- return false;
- }
- vol->usnjrnl_j_ino = tmp_ino;
- /* Verify $J is non-resident and sparse. */
- tmp_ni = NTFS_I(vol->usnjrnl_j_ino);
- if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) {
- ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident "
- "and/or not sparse.");
- return false;
- }
- /* Read the USN_HEADER from $DATA/$Max. */
- page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max "
- "attribute.");
- return false;
- }
- uh = (USN_HEADER*)page_address(page);
- /* Sanity check the $Max. */
- if (unlikely(sle64_to_cpu(uh->allocation_delta) >
- sle64_to_cpu(uh->maximum_size))) {
- ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds "
- "maximum size (0x%llx). $UsnJrnl is corrupt.",
- (long long)sle64_to_cpu(uh->allocation_delta),
- (long long)sle64_to_cpu(uh->maximum_size));
- ntfs_unmap_page(page);
- return false;
- }
- /*
- * If the transaction log has been stamped and nothing has been written
- * to it since, we do not need to stamp it.
- */
- if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >=
- i_size_read(vol->usnjrnl_j_ino))) {
- if (likely(sle64_to_cpu(uh->lowest_valid_usn) ==
- i_size_read(vol->usnjrnl_j_ino))) {
- ntfs_unmap_page(page);
- ntfs_debug("$UsnJrnl is enabled but nothing has been "
- "logged since it was last stamped. "
- "Treating this as if the volume does "
- "not have transaction logging "
- "enabled.");
- goto not_enabled;
- }
- ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) "
- "which is out of bounds (0x%llx). $UsnJrnl "
- "is corrupt.",
- (long long)sle64_to_cpu(uh->lowest_valid_usn),
- i_size_read(vol->usnjrnl_j_ino));
- ntfs_unmap_page(page);
- return false;
- }
- ntfs_unmap_page(page);
- ntfs_debug("Done.");
- return true;
-}
-
-/**
- * load_and_init_attrdef - load the attribute definitions table for a volume
- * @vol: ntfs super block describing device whose attrdef to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_init_attrdef(ntfs_volume *vol)
-{
- loff_t i_size;
- struct super_block *sb = vol->sb;
- struct inode *ino;
- struct page *page;
- pgoff_t index, max_index;
- unsigned int size;
-
- ntfs_debug("Entering.");
- /* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */
- ino = ntfs_iget(sb, FILE_AttrDef);
- if (IS_ERR(ino) || is_bad_inode(ino)) {
- if (!IS_ERR(ino))
- iput(ino);
- goto failed;
- }
- NInoSetSparseDisabled(NTFS_I(ino));
- /* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */
- i_size = i_size_read(ino);
- if (i_size <= 0 || i_size > 0x7fffffff)
- goto iput_failed;
- vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size);
- if (!vol->attrdef)
- goto iput_failed;
- index = 0;
- max_index = i_size >> PAGE_SHIFT;
- size = PAGE_SIZE;
- while (index < max_index) {
- /* Read the attrdef table and copy it into the linear buffer. */
-read_partial_attrdef_page:
- page = ntfs_map_page(ino->i_mapping, index);
- if (IS_ERR(page))
- goto free_iput_failed;
- memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT),
- page_address(page), size);
- ntfs_unmap_page(page);
- }
- if (size == PAGE_SIZE) {
- size = i_size & ~PAGE_MASK;
- if (size)
- goto read_partial_attrdef_page;
- }
- vol->attrdef_size = i_size;
- ntfs_debug("Read %llu bytes from $AttrDef.", i_size);
- iput(ino);
- return true;
-free_iput_failed:
- ntfs_free(vol->attrdef);
- vol->attrdef = NULL;
-iput_failed:
- iput(ino);
-failed:
- ntfs_error(sb, "Failed to initialize attribute definition table.");
- return false;
-}
-
-#endif /* NTFS_RW */
-
-/**
- * load_and_init_upcase - load the upcase table for an ntfs volume
- * @vol: ntfs super block describing device whose upcase to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_init_upcase(ntfs_volume *vol)
-{
- loff_t i_size;
- struct super_block *sb = vol->sb;
- struct inode *ino;
- struct page *page;
- pgoff_t index, max_index;
- unsigned int size;
- int i, max;
-
- ntfs_debug("Entering.");
- /* Read upcase table and setup vol->upcase and vol->upcase_len. */
- ino = ntfs_iget(sb, FILE_UpCase);
- if (IS_ERR(ino) || is_bad_inode(ino)) {
- if (!IS_ERR(ino))
- iput(ino);
- goto upcase_failed;
- }
- /*
- * The upcase size must not be above 64k Unicode characters, must not
- * be zero and must be a multiple of sizeof(ntfschar).
- */
- i_size = i_size_read(ino);
- if (!i_size || i_size & (sizeof(ntfschar) - 1) ||
- i_size > 64ULL * 1024 * sizeof(ntfschar))
- goto iput_upcase_failed;
- vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size);
- if (!vol->upcase)
- goto iput_upcase_failed;
- index = 0;
- max_index = i_size >> PAGE_SHIFT;
- size = PAGE_SIZE;
- while (index < max_index) {
- /* Read the upcase table and copy it into the linear buffer. */
-read_partial_upcase_page:
- page = ntfs_map_page(ino->i_mapping, index);
- if (IS_ERR(page))
- goto iput_upcase_failed;
- memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT),
- page_address(page), size);
- ntfs_unmap_page(page);
- }
- if (size == PAGE_SIZE) {
- size = i_size & ~PAGE_MASK;
- if (size)
- goto read_partial_upcase_page;
- }
- vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS;
- ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).",
- i_size, 64 * 1024 * sizeof(ntfschar));
- iput(ino);
- mutex_lock(&ntfs_lock);
- if (!default_upcase) {
- ntfs_debug("Using volume specified $UpCase since default is "
- "not present.");
- mutex_unlock(&ntfs_lock);
- return true;
- }
- max = default_upcase_len;
- if (max > vol->upcase_len)
- max = vol->upcase_len;
- for (i = 0; i < max; i++)
- if (vol->upcase[i] != default_upcase[i])
- break;
- if (i == max) {
- ntfs_free(vol->upcase);
- vol->upcase = default_upcase;
- vol->upcase_len = max;
- ntfs_nr_upcase_users++;
- mutex_unlock(&ntfs_lock);
- ntfs_debug("Volume specified $UpCase matches default. Using "
- "default.");
- return true;
- }
- mutex_unlock(&ntfs_lock);
- ntfs_debug("Using volume specified $UpCase since it does not match "
- "the default.");
- return true;
-iput_upcase_failed:
- iput(ino);
- ntfs_free(vol->upcase);
- vol->upcase = NULL;
-upcase_failed:
- mutex_lock(&ntfs_lock);
- if (default_upcase) {
- vol->upcase = default_upcase;
- vol->upcase_len = default_upcase_len;
- ntfs_nr_upcase_users++;
- mutex_unlock(&ntfs_lock);
- ntfs_error(sb, "Failed to load $UpCase from the volume. Using "
- "default.");
- return true;
- }
- mutex_unlock(&ntfs_lock);
- ntfs_error(sb, "Failed to initialize upcase table.");
- return false;
-}
-
-/*
- * The lcn and mft bitmap inodes are NTFS-internal inodes with
- * their own special locking rules:
- */
-static struct lock_class_key
- lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key,
- mftbmp_runlist_lock_key, mftbmp_mrec_lock_key;
-
-/**
- * load_system_files - open the system files using normal functions
- * @vol: ntfs super block describing device whose system files to load
- *
- * Open the system files with normal access functions and complete setting up
- * the ntfs super block @vol.
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_system_files(ntfs_volume *vol)
-{
- struct super_block *sb = vol->sb;
- MFT_RECORD *m;
- VOLUME_INFORMATION *vi;
- ntfs_attr_search_ctx *ctx;
-#ifdef NTFS_RW
- RESTART_PAGE_HEADER *rp;
- int err;
-#endif /* NTFS_RW */
-
- ntfs_debug("Entering.");
-#ifdef NTFS_RW
- /* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */
- if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) {
- static const char *es1 = "Failed to load $MFTMirr";
- static const char *es2 = "$MFTMirr does not match $MFT";
- static const char *es3 = ". Run ntfsfix and/or chkdsk.";
-
- /* If a read-write mount, convert it to a read-only mount. */
- if (!sb_rdonly(sb)) {
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors="
- "continue nor on_errors="
- "remount-ro was specified%s",
- !vol->mftmirr_ino ? es1 : es2,
- es3);
- goto iput_mirr_err_out;
- }
- sb->s_flags |= SB_RDONLY;
- ntfs_error(sb, "%s. Mounting read-only%s",
- !vol->mftmirr_ino ? es1 : es2, es3);
- } else
- ntfs_warning(sb, "%s. Will not be able to remount "
- "read-write%s",
- !vol->mftmirr_ino ? es1 : es2, es3);
- /* This will prevent a read-write remount. */
- NVolSetErrors(vol);
- }
-#endif /* NTFS_RW */
- /* Get mft bitmap attribute inode. */
- vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0);
- if (IS_ERR(vol->mftbmp_ino)) {
- ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute.");
- goto iput_mirr_err_out;
- }
- lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock,
- &mftbmp_runlist_lock_key);
- lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock,
- &mftbmp_mrec_lock_key);
- /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */
- if (!load_and_init_upcase(vol))
- goto iput_mftbmp_err_out;
-#ifdef NTFS_RW
- /*
- * Read attribute definitions table and setup @vol->attrdef and
- * @vol->attrdef_size.
- */
- if (!load_and_init_attrdef(vol))
- goto iput_upcase_err_out;
-#endif /* NTFS_RW */
- /*
- * Get the cluster allocation bitmap inode and verify the size, no
- * need for any locking at this stage as we are already running
- * exclusively as we are mount in progress task.
- */
- vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap);
- if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) {
- if (!IS_ERR(vol->lcnbmp_ino))
- iput(vol->lcnbmp_ino);
- goto bitmap_failed;
- }
- lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock,
- &lcnbmp_runlist_lock_key);
- lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock,
- &lcnbmp_mrec_lock_key);
-
- NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino));
- if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) {
- iput(vol->lcnbmp_ino);
-bitmap_failed:
- ntfs_error(sb, "Failed to load $Bitmap.");
- goto iput_attrdef_err_out;
- }
- /*
- * Get the volume inode and setup our cache of the volume flags and
- * version.
- */
- vol->vol_ino = ntfs_iget(sb, FILE_Volume);
- if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) {
- if (!IS_ERR(vol->vol_ino))
- iput(vol->vol_ino);
-volume_failed:
- ntfs_error(sb, "Failed to load $Volume.");
- goto iput_lcnbmp_err_out;
- }
- m = map_mft_record(NTFS_I(vol->vol_ino));
- if (IS_ERR(m)) {
-iput_volume_failed:
- iput(vol->vol_ino);
- goto volume_failed;
- }
- if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) {
- ntfs_error(sb, "Failed to get attribute search context.");
- goto get_ctx_vol_failed;
- }
- if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
- ctx) || ctx->attr->non_resident || ctx->attr->flags) {
-err_put_vol:
- ntfs_attr_put_search_ctx(ctx);
-get_ctx_vol_failed:
- unmap_mft_record(NTFS_I(vol->vol_ino));
- goto iput_volume_failed;
- }
- vi = (VOLUME_INFORMATION*)((char*)ctx->attr +
- le16_to_cpu(ctx->attr->data.resident.value_offset));
- /* Some bounds checks. */
- if ((u8*)vi < (u8*)ctx->attr || (u8*)vi +
- le32_to_cpu(ctx->attr->data.resident.value_length) >
- (u8*)ctx->attr + le32_to_cpu(ctx->attr->length))
- goto err_put_vol;
- /* Copy the volume flags and version to the ntfs_volume structure. */
- vol->vol_flags = vi->flags;
- vol->major_ver = vi->major_ver;
- vol->minor_ver = vi->minor_ver;
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(NTFS_I(vol->vol_ino));
- pr_info("volume version %i.%i.\n", vol->major_ver,
- vol->minor_ver);
- if (vol->major_ver < 3 && NVolSparseEnabled(vol)) {
- ntfs_warning(vol->sb, "Disabling sparse support due to NTFS "
- "volume version %i.%i (need at least version "
- "3.0).", vol->major_ver, vol->minor_ver);
- NVolClearSparseEnabled(vol);
- }
-#ifdef NTFS_RW
- /* Make sure that no unsupported volume flags are set. */
- if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
- static const char *es1a = "Volume is dirty";
- static const char *es1b = "Volume has been modified by chkdsk";
- static const char *es1c = "Volume has unsupported flags set";
- static const char *es2a = ". Run chkdsk and mount in Windows.";
- static const char *es2b = ". Mount in Windows.";
- const char *es1, *es2;
-
- es2 = es2a;
- if (vol->vol_flags & VOLUME_IS_DIRTY)
- es1 = es1a;
- else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
- es1 = es1b;
- es2 = es2b;
- } else {
- es1 = es1c;
- ntfs_warning(sb, "Unsupported volume flags 0x%x "
- "encountered.",
- (unsigned)le16_to_cpu(vol->vol_flags));
- }
- /* If a read-write mount, convert it to a read-only mount. */
- if (!sb_rdonly(sb)) {
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors="
- "continue nor on_errors="
- "remount-ro was specified%s",
- es1, es2);
- goto iput_vol_err_out;
- }
- sb->s_flags |= SB_RDONLY;
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- } else
- ntfs_warning(sb, "%s. Will not be able to remount "
- "read-write%s", es1, es2);
- /*
- * Do not set NVolErrors() because ntfs_remount() re-checks the
- * flags which we need to do in case any flags have changed.
- */
- }
- /*
- * Get the inode for the logfile, check it and determine if the volume
- * was shutdown cleanly.
- */
- rp = NULL;
- if (!load_and_check_logfile(vol, &rp) ||
- !ntfs_is_logfile_clean(vol->logfile_ino, rp)) {
- static const char *es1a = "Failed to load $LogFile";
- static const char *es1b = "$LogFile is not clean";
- static const char *es2 = ". Mount in Windows.";
- const char *es1;
-
- es1 = !vol->logfile_ino ? es1a : es1b;
- /* If a read-write mount, convert it to a read-only mount. */
- if (!sb_rdonly(sb)) {
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors="
- "continue nor on_errors="
- "remount-ro was specified%s",
- es1, es2);
- if (vol->logfile_ino) {
- BUG_ON(!rp);
- ntfs_free(rp);
- }
- goto iput_logfile_err_out;
- }
- sb->s_flags |= SB_RDONLY;
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- } else
- ntfs_warning(sb, "%s. Will not be able to remount "
- "read-write%s", es1, es2);
- /* This will prevent a read-write remount. */
- NVolSetErrors(vol);
- }
- ntfs_free(rp);
-#endif /* NTFS_RW */
- /* Get the root directory inode so we can do path lookups. */
- vol->root_ino = ntfs_iget(sb, FILE_root);
- if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) {
- if (!IS_ERR(vol->root_ino))
- iput(vol->root_ino);
- ntfs_error(sb, "Failed to load root directory.");
- goto iput_logfile_err_out;
- }
-#ifdef NTFS_RW
- /*
- * Check if Windows is suspended to disk on the target volume. If it
- * is hibernated, we must not write *anything* to the disk so set
- * NVolErrors() without setting the dirty volume flag and mount
- * read-only. This will prevent read-write remounting and it will also
- * prevent all writes.
- */
- err = check_windows_hibernation_status(vol);
- if (unlikely(err)) {
- static const char *es1a = "Failed to determine if Windows is "
- "hibernated";
- static const char *es1b = "Windows is hibernated";
- static const char *es2 = ". Run chkdsk.";
- const char *es1;
-
- es1 = err < 0 ? es1a : es1b;
- /* If a read-write mount, convert it to a read-only mount. */
- if (!sb_rdonly(sb)) {
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors="
- "continue nor on_errors="
- "remount-ro was specified%s",
- es1, es2);
- goto iput_root_err_out;
- }
- sb->s_flags |= SB_RDONLY;
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- } else
- ntfs_warning(sb, "%s. Will not be able to remount "
- "read-write%s", es1, es2);
- /* This will prevent a read-write remount. */
- NVolSetErrors(vol);
- }
- /* If (still) a read-write mount, mark the volume dirty. */
- if (!sb_rdonly(sb) && ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
- static const char *es1 = "Failed to set dirty bit in volume "
- "information flags";
- static const char *es2 = ". Run chkdsk.";
-
- /* Convert to a read-only mount. */
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors=continue nor "
- "on_errors=remount-ro was specified%s",
- es1, es2);
- goto iput_root_err_out;
- }
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= SB_RDONLY;
- /*
- * Do not set NVolErrors() because ntfs_remount() might manage
- * to set the dirty flag in which case all would be well.
- */
- }
-#if 0
- // TODO: Enable this code once we start modifying anything that is
- // different between NTFS 1.2 and 3.x...
- /*
- * If (still) a read-write mount, set the NT4 compatibility flag on
- * newer NTFS version volumes.
- */
- if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) &&
- ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
- static const char *es1 = "Failed to set NT4 compatibility flag";
- static const char *es2 = ". Run chkdsk.";
-
- /* Convert to a read-only mount. */
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors=continue nor "
- "on_errors=remount-ro was specified%s",
- es1, es2);
- goto iput_root_err_out;
- }
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= SB_RDONLY;
- NVolSetErrors(vol);
- }
-#endif
- /* If (still) a read-write mount, empty the logfile. */
- if (!sb_rdonly(sb) && !ntfs_empty_logfile(vol->logfile_ino)) {
- static const char *es1 = "Failed to empty $LogFile";
- static const char *es2 = ". Mount in Windows.";
-
- /* Convert to a read-only mount. */
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors=continue nor "
- "on_errors=remount-ro was specified%s",
- es1, es2);
- goto iput_root_err_out;
- }
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= SB_RDONLY;
- NVolSetErrors(vol);
- }
-#endif /* NTFS_RW */
- /* If on NTFS versions before 3.0, we are done. */
- if (unlikely(vol->major_ver < 3))
- return true;
- /* NTFS 3.0+ specific initialization. */
- /* Get the security descriptors inode. */
- vol->secure_ino = ntfs_iget(sb, FILE_Secure);
- if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) {
- if (!IS_ERR(vol->secure_ino))
- iput(vol->secure_ino);
- ntfs_error(sb, "Failed to load $Secure.");
- goto iput_root_err_out;
- }
- // TODO: Initialize security.
- /* Get the extended system files' directory inode. */
- vol->extend_ino = ntfs_iget(sb, FILE_Extend);
- if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) ||
- !S_ISDIR(vol->extend_ino->i_mode)) {
- if (!IS_ERR(vol->extend_ino))
- iput(vol->extend_ino);
- ntfs_error(sb, "Failed to load $Extend.");
- goto iput_sec_err_out;
- }
-#ifdef NTFS_RW
- /* Find the quota file, load it if present, and set it up. */
- if (!load_and_init_quota(vol)) {
- static const char *es1 = "Failed to load $Quota";
- static const char *es2 = ". Run chkdsk.";
-
- /* If a read-write mount, convert it to a read-only mount. */
- if (!sb_rdonly(sb)) {
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors="
- "continue nor on_errors="
- "remount-ro was specified%s",
- es1, es2);
- goto iput_quota_err_out;
- }
- sb->s_flags |= SB_RDONLY;
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- } else
- ntfs_warning(sb, "%s. Will not be able to remount "
- "read-write%s", es1, es2);
- /* This will prevent a read-write remount. */
- NVolSetErrors(vol);
- }
- /* If (still) a read-write mount, mark the quotas out of date. */
- if (!sb_rdonly(sb) && !ntfs_mark_quotas_out_of_date(vol)) {
- static const char *es1 = "Failed to mark quotas out of date";
- static const char *es2 = ". Run chkdsk.";
-
- /* Convert to a read-only mount. */
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors=continue nor "
- "on_errors=remount-ro was specified%s",
- es1, es2);
- goto iput_quota_err_out;
- }
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= SB_RDONLY;
- NVolSetErrors(vol);
- }
- /*
- * Find the transaction log file ($UsnJrnl), load it if present, check
- * it, and set it up.
- */
- if (!load_and_init_usnjrnl(vol)) {
- static const char *es1 = "Failed to load $UsnJrnl";
- static const char *es2 = ". Run chkdsk.";
-
- /* If a read-write mount, convert it to a read-only mount. */
- if (!sb_rdonly(sb)) {
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors="
- "continue nor on_errors="
- "remount-ro was specified%s",
- es1, es2);
- goto iput_usnjrnl_err_out;
- }
- sb->s_flags |= SB_RDONLY;
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- } else
- ntfs_warning(sb, "%s. Will not be able to remount "
- "read-write%s", es1, es2);
- /* This will prevent a read-write remount. */
- NVolSetErrors(vol);
- }
- /* If (still) a read-write mount, stamp the transaction log. */
- if (!sb_rdonly(sb) && !ntfs_stamp_usnjrnl(vol)) {
- static const char *es1 = "Failed to stamp transaction log "
- "($UsnJrnl)";
- static const char *es2 = ". Run chkdsk.";
-
- /* Convert to a read-only mount. */
- if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
- ON_ERRORS_CONTINUE))) {
- ntfs_error(sb, "%s and neither on_errors=continue nor "
- "on_errors=remount-ro was specified%s",
- es1, es2);
- goto iput_usnjrnl_err_out;
- }
- ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
- sb->s_flags |= SB_RDONLY;
- NVolSetErrors(vol);
- }
-#endif /* NTFS_RW */
- return true;
-#ifdef NTFS_RW
-iput_usnjrnl_err_out:
- iput(vol->usnjrnl_j_ino);
- iput(vol->usnjrnl_max_ino);
- iput(vol->usnjrnl_ino);
-iput_quota_err_out:
- iput(vol->quota_q_ino);
- iput(vol->quota_ino);
- iput(vol->extend_ino);
-#endif /* NTFS_RW */
-iput_sec_err_out:
- iput(vol->secure_ino);
-iput_root_err_out:
- iput(vol->root_ino);
-iput_logfile_err_out:
-#ifdef NTFS_RW
- iput(vol->logfile_ino);
-iput_vol_err_out:
-#endif /* NTFS_RW */
- iput(vol->vol_ino);
-iput_lcnbmp_err_out:
- iput(vol->lcnbmp_ino);
-iput_attrdef_err_out:
- vol->attrdef_size = 0;
- if (vol->attrdef) {
- ntfs_free(vol->attrdef);
- vol->attrdef = NULL;
- }
-#ifdef NTFS_RW
-iput_upcase_err_out:
-#endif /* NTFS_RW */
- vol->upcase_len = 0;
- mutex_lock(&ntfs_lock);
- if (vol->upcase == default_upcase) {
- ntfs_nr_upcase_users--;
- vol->upcase = NULL;
- }
- mutex_unlock(&ntfs_lock);
- if (vol->upcase) {
- ntfs_free(vol->upcase);
- vol->upcase = NULL;
- }
-iput_mftbmp_err_out:
- iput(vol->mftbmp_ino);
-iput_mirr_err_out:
-#ifdef NTFS_RW
- iput(vol->mftmirr_ino);
-#endif /* NTFS_RW */
- return false;
-}
-
-/**
- * ntfs_put_super - called by the vfs to unmount a volume
- * @sb: vfs superblock of volume to unmount
- *
- * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when
- * the volume is being unmounted (umount system call has been invoked) and it
- * releases all inodes and memory belonging to the NTFS specific part of the
- * super block.
- */
-static void ntfs_put_super(struct super_block *sb)
-{
- ntfs_volume *vol = NTFS_SB(sb);
-
- ntfs_debug("Entering.");
-
-#ifdef NTFS_RW
- /*
- * Commit all inodes while they are still open in case some of them
- * cause others to be dirtied.
- */
- ntfs_commit_inode(vol->vol_ino);
-
- /* NTFS 3.0+ specific. */
- if (vol->major_ver >= 3) {
- if (vol->usnjrnl_j_ino)
- ntfs_commit_inode(vol->usnjrnl_j_ino);
- if (vol->usnjrnl_max_ino)
- ntfs_commit_inode(vol->usnjrnl_max_ino);
- if (vol->usnjrnl_ino)
- ntfs_commit_inode(vol->usnjrnl_ino);
- if (vol->quota_q_ino)
- ntfs_commit_inode(vol->quota_q_ino);
- if (vol->quota_ino)
- ntfs_commit_inode(vol->quota_ino);
- if (vol->extend_ino)
- ntfs_commit_inode(vol->extend_ino);
- if (vol->secure_ino)
- ntfs_commit_inode(vol->secure_ino);
- }
-
- ntfs_commit_inode(vol->root_ino);
-
- down_write(&vol->lcnbmp_lock);
- ntfs_commit_inode(vol->lcnbmp_ino);
- up_write(&vol->lcnbmp_lock);
-
- down_write(&vol->mftbmp_lock);
- ntfs_commit_inode(vol->mftbmp_ino);
- up_write(&vol->mftbmp_lock);
-
- if (vol->logfile_ino)
- ntfs_commit_inode(vol->logfile_ino);
-
- if (vol->mftmirr_ino)
- ntfs_commit_inode(vol->mftmirr_ino);
- ntfs_commit_inode(vol->mft_ino);
-
- /*
- * If a read-write mount and no volume errors have occurred, mark the
- * volume clean. Also, re-commit all affected inodes.
- */
- if (!sb_rdonly(sb)) {
- if (!NVolErrors(vol)) {
- if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
- ntfs_warning(sb, "Failed to clear dirty bit "
- "in volume information "
- "flags. Run chkdsk.");
- ntfs_commit_inode(vol->vol_ino);
- ntfs_commit_inode(vol->root_ino);
- if (vol->mftmirr_ino)
- ntfs_commit_inode(vol->mftmirr_ino);
- ntfs_commit_inode(vol->mft_ino);
- } else {
- ntfs_warning(sb, "Volume has errors. Leaving volume "
- "marked dirty. Run chkdsk.");
- }
- }
-#endif /* NTFS_RW */
-
- iput(vol->vol_ino);
- vol->vol_ino = NULL;
-
- /* NTFS 3.0+ specific clean up. */
- if (vol->major_ver >= 3) {
-#ifdef NTFS_RW
- if (vol->usnjrnl_j_ino) {
- iput(vol->usnjrnl_j_ino);
- vol->usnjrnl_j_ino = NULL;
- }
- if (vol->usnjrnl_max_ino) {
- iput(vol->usnjrnl_max_ino);
- vol->usnjrnl_max_ino = NULL;
- }
- if (vol->usnjrnl_ino) {
- iput(vol->usnjrnl_ino);
- vol->usnjrnl_ino = NULL;
- }
- if (vol->quota_q_ino) {
- iput(vol->quota_q_ino);
- vol->quota_q_ino = NULL;
- }
- if (vol->quota_ino) {
- iput(vol->quota_ino);
- vol->quota_ino = NULL;
- }
-#endif /* NTFS_RW */
- if (vol->extend_ino) {
- iput(vol->extend_ino);
- vol->extend_ino = NULL;
- }
- if (vol->secure_ino) {
- iput(vol->secure_ino);
- vol->secure_ino = NULL;
- }
- }
-
- iput(vol->root_ino);
- vol->root_ino = NULL;
-
- down_write(&vol->lcnbmp_lock);
- iput(vol->lcnbmp_ino);
- vol->lcnbmp_ino = NULL;
- up_write(&vol->lcnbmp_lock);
-
- down_write(&vol->mftbmp_lock);
- iput(vol->mftbmp_ino);
- vol->mftbmp_ino = NULL;
- up_write(&vol->mftbmp_lock);
-
-#ifdef NTFS_RW
- if (vol->logfile_ino) {
- iput(vol->logfile_ino);
- vol->logfile_ino = NULL;
- }
- if (vol->mftmirr_ino) {
- /* Re-commit the mft mirror and mft just in case. */
- ntfs_commit_inode(vol->mftmirr_ino);
- ntfs_commit_inode(vol->mft_ino);
- iput(vol->mftmirr_ino);
- vol->mftmirr_ino = NULL;
- }
- /*
- * We should have no dirty inodes left, due to
- * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
- * the underlying mft records are written out and cleaned.
- */
- ntfs_commit_inode(vol->mft_ino);
- write_inode_now(vol->mft_ino, 1);
-#endif /* NTFS_RW */
-
- iput(vol->mft_ino);
- vol->mft_ino = NULL;
-
- /* Throw away the table of attribute definitions. */
- vol->attrdef_size = 0;
- if (vol->attrdef) {
- ntfs_free(vol->attrdef);
- vol->attrdef = NULL;
- }
- vol->upcase_len = 0;
- /*
- * Destroy the global default upcase table if necessary. Also decrease
- * the number of upcase users if we are a user.
- */
- mutex_lock(&ntfs_lock);
- if (vol->upcase == default_upcase) {
- ntfs_nr_upcase_users--;
- vol->upcase = NULL;
- }
- if (!ntfs_nr_upcase_users && default_upcase) {
- ntfs_free(default_upcase);
- default_upcase = NULL;
- }
- if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
- free_compression_buffers();
- mutex_unlock(&ntfs_lock);
- if (vol->upcase) {
- ntfs_free(vol->upcase);
- vol->upcase = NULL;
- }
-
- unload_nls(vol->nls_map);
-
- sb->s_fs_info = NULL;
- kfree(vol);
-}
-
-/**
- * get_nr_free_clusters - return the number of free clusters on a volume
- * @vol: ntfs volume for which to obtain free cluster count
- *
- * Calculate the number of free clusters on the mounted NTFS volume @vol. We
- * actually calculate the number of clusters in use instead because this
- * allows us to not care about partial pages as these will be just zero filled
- * and hence not be counted as allocated clusters.
- *
- * The only particularity is that clusters beyond the end of the logical ntfs
- * volume will be marked as allocated to prevent errors which means we have to
- * discount those at the end. This is important as the cluster bitmap always
- * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside
- * the logical volume and marked in use when they are not as they do not exist.
- *
- * If any pages cannot be read we assume all clusters in the erroring pages are
- * in use. This means we return an underestimate on errors which is better than
- * an overestimate.
- */
-static s64 get_nr_free_clusters(ntfs_volume *vol)
-{
- s64 nr_free = vol->nr_clusters;
- struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
- struct page *page;
- pgoff_t index, max_index;
-
- ntfs_debug("Entering.");
- /* Serialize accesses to the cluster bitmap. */
- down_read(&vol->lcnbmp_lock);
- /*
- * Convert the number of bits into bytes rounded up, then convert into
- * multiples of PAGE_SIZE, rounding up so that if we have one
- * full and one partial page max_index = 2.
- */
- max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */
- ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
- max_index, PAGE_SIZE / 4);
- for (index = 0; index < max_index; index++) {
- unsigned long *kaddr;
-
- /*
- * Read the page from page cache, getting it from backing store
- * if necessary, and increment the use count.
- */
- page = read_mapping_page(mapping, index, NULL);
- /* Ignore pages which errored synchronously. */
- if (IS_ERR(page)) {
- ntfs_debug("read_mapping_page() error. Skipping "
- "page (index 0x%lx).", index);
- nr_free -= PAGE_SIZE * 8;
- continue;
- }
- kaddr = kmap_atomic(page);
- /*
- * Subtract the number of set bits. If this
- * is the last page and it is partial we don't really care as
- * it just means we do a little extra work but it won't affect
- * the result as all out of range bytes are set to zero by
- * ntfs_readpage().
- */
- nr_free -= bitmap_weight(kaddr,
- PAGE_SIZE * BITS_PER_BYTE);
- kunmap_atomic(kaddr);
- put_page(page);
- }
- ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1);
- /*
- * Fixup for eventual bits outside logical ntfs volume (see function
- * description above).
- */
- if (vol->nr_clusters & 63)
- nr_free += 64 - (vol->nr_clusters & 63);
- up_read(&vol->lcnbmp_lock);
- /* If errors occurred we may well have gone below zero, fix this. */
- if (nr_free < 0)
- nr_free = 0;
- ntfs_debug("Exiting.");
- return nr_free;
-}
-
-/**
- * __get_nr_free_mft_records - return the number of free inodes on a volume
- * @vol: ntfs volume for which to obtain free inode count
- * @nr_free: number of mft records in filesystem
- * @max_index: maximum number of pages containing set bits
- *
- * Calculate the number of free mft records (inodes) on the mounted NTFS
- * volume @vol. We actually calculate the number of mft records in use instead
- * because this allows us to not care about partial pages as these will be just
- * zero filled and hence not be counted as allocated mft record.
- *
- * If any pages cannot be read we assume all mft records in the erroring pages
- * are in use. This means we return an underestimate on errors which is better
- * than an overestimate.
- *
- * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing.
- */
-static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
- s64 nr_free, const pgoff_t max_index)
-{
- struct address_space *mapping = vol->mftbmp_ino->i_mapping;
- struct page *page;
- pgoff_t index;
-
- ntfs_debug("Entering.");
- /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */
- ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
- "0x%lx.", max_index, PAGE_SIZE / 4);
- for (index = 0; index < max_index; index++) {
- unsigned long *kaddr;
-
- /*
- * Read the page from page cache, getting it from backing store
- * if necessary, and increment the use count.
- */
- page = read_mapping_page(mapping, index, NULL);
- /* Ignore pages which errored synchronously. */
- if (IS_ERR(page)) {
- ntfs_debug("read_mapping_page() error. Skipping "
- "page (index 0x%lx).", index);
- nr_free -= PAGE_SIZE * 8;
- continue;
- }
- kaddr = kmap_atomic(page);
- /*
- * Subtract the number of set bits. If this
- * is the last page and it is partial we don't really care as
- * it just means we do a little extra work but it won't affect
- * the result as all out of range bytes are set to zero by
- * ntfs_readpage().
- */
- nr_free -= bitmap_weight(kaddr,
- PAGE_SIZE * BITS_PER_BYTE);
- kunmap_atomic(kaddr);
- put_page(page);
- }
- ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
- index - 1);
- /* If errors occurred we may well have gone below zero, fix this. */
- if (nr_free < 0)
- nr_free = 0;
- ntfs_debug("Exiting.");
- return nr_free;
-}
-
-/**
- * ntfs_statfs - return information about mounted NTFS volume
- * @dentry: dentry from mounted volume
- * @sfs: statfs structure in which to return the information
- *
- * Return information about the mounted NTFS volume @dentry in the statfs structure
- * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is
- * called). We interpret the values to be correct of the moment in time at
- * which we are called. Most values are variable otherwise and this isn't just
- * the free values but the totals as well. For example we can increase the
- * total number of file nodes if we run out and we can keep doing this until
- * there is no more space on the volume left at all.
- *
- * Called from vfs_statfs which is used to handle the statfs, fstatfs, and
- * ustat system calls.
- *
- * Return 0 on success or -errno on error.
- */
-static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
-{
- struct super_block *sb = dentry->d_sb;
- s64 size;
- ntfs_volume *vol = NTFS_SB(sb);
- ntfs_inode *mft_ni = NTFS_I(vol->mft_ino);
- pgoff_t max_index;
- unsigned long flags;
-
- ntfs_debug("Entering.");
- /* Type of filesystem. */
- sfs->f_type = NTFS_SB_MAGIC;
- /* Optimal transfer block size. */
- sfs->f_bsize = PAGE_SIZE;
- /*
- * Total data blocks in filesystem in units of f_bsize and since
- * inodes are also stored in data blocs ($MFT is a file) this is just
- * the total clusters.
- */
- sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >>
- PAGE_SHIFT;
- /* Free data blocks in filesystem in units of f_bsize. */
- size = get_nr_free_clusters(vol) << vol->cluster_size_bits >>
- PAGE_SHIFT;
- if (size < 0LL)
- size = 0LL;
- /* Free blocks avail to non-superuser, same as above on NTFS. */
- sfs->f_bavail = sfs->f_bfree = size;
- /* Serialize accesses to the inode bitmap. */
- down_read(&vol->mftbmp_lock);
- read_lock_irqsave(&mft_ni->size_lock, flags);
- size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits;
- /*
- * Convert the maximum number of set bits into bytes rounded up, then
- * convert into multiples of PAGE_SIZE, rounding up so that if we
- * have one full and one partial page max_index = 2.
- */
- max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits)
- + 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- read_unlock_irqrestore(&mft_ni->size_lock, flags);
- /* Number of inodes in filesystem (at this point in time). */
- sfs->f_files = size;
- /* Free inodes in fs (based on current total count). */
- sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index);
- up_read(&vol->mftbmp_lock);
- /*
- * File system id. This is extremely *nix flavour dependent and even
- * within Linux itself all fs do their own thing. I interpret this to
- * mean a unique id associated with the mounted fs and not the id
- * associated with the filesystem driver, the latter is already given
- * by the filesystem type in sfs->f_type. Thus we use the 64-bit
- * volume serial number splitting it into two 32-bit parts. We enter
- * the least significant 32-bits in f_fsid[0] and the most significant
- * 32-bits in f_fsid[1].
- */
- sfs->f_fsid = u64_to_fsid(vol->serial_no);
- /* Maximum length of filenames. */
- sfs->f_namelen = NTFS_MAX_NAME_LEN;
- return 0;
-}
-
-#ifdef NTFS_RW
-static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
-{
- return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
-}
-#endif
-
-/*
- * The complete super operations.
- */
-static const struct super_operations ntfs_sops = {
- .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */
- .free_inode = ntfs_free_big_inode, /* VFS: Deallocate inode. */
-#ifdef NTFS_RW
- .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to
- disk. */
-#endif /* NTFS_RW */
- .put_super = ntfs_put_super, /* Syscall: umount. */
- .statfs = ntfs_statfs, /* Syscall: statfs */
- .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */
- .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is
- removed from memory. */
- .show_options = ntfs_show_options, /* Show mount options in
- proc. */
-};
-
-/**
- * ntfs_fill_super - mount an ntfs filesystem
- * @sb: super block of ntfs filesystem to mount
- * @opt: string containing the mount options
- * @silent: silence error output
- *
- * ntfs_fill_super() is called by the VFS to mount the device described by @sb
- * with the mount otions in @data with the NTFS filesystem.
- *
- * If @silent is true, remain silent even if errors are detected. This is used
- * during bootup, when the kernel tries to mount the root filesystem with all
- * registered filesystems one after the other until one succeeds. This implies
- * that all filesystems except the correct one will quite correctly and
- * expectedly return an error, but nobody wants to see error messages when in
- * fact this is what is supposed to happen.
- *
- * NOTE: @sb->s_flags contains the mount options flags.
- */
-static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
-{
- ntfs_volume *vol;
- struct buffer_head *bh;
- struct inode *tmp_ino;
- int blocksize, result;
-
- /*
- * We do a pretty difficult piece of bootstrap by reading the
- * MFT (and other metadata) from disk into memory. We'll only
- * release this metadata during umount, so the locking patterns
- * observed during bootstrap do not count. So turn off the
- * observation of locking patterns (strictly for this context
- * only) while mounting NTFS. [The validator is still active
- * otherwise, even for this context: it will for example record
- * lock class registrations.]
- */
- lockdep_off();
- ntfs_debug("Entering.");
-#ifndef NTFS_RW
- sb->s_flags |= SB_RDONLY;
-#endif /* ! NTFS_RW */
- /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */
- sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS);
- vol = NTFS_SB(sb);
- if (!vol) {
- if (!silent)
- ntfs_error(sb, "Allocation of NTFS volume structure "
- "failed. Aborting mount...");
- lockdep_on();
- return -ENOMEM;
- }
- /* Initialize ntfs_volume structure. */
- *vol = (ntfs_volume) {
- .sb = sb,
- /*
- * Default is group and other don't have any access to files or
- * directories while owner has full access. Further, files by
- * default are not executable but directories are of course
- * browseable.
- */
- .fmask = 0177,
- .dmask = 0077,
- };
- init_rwsem(&vol->mftbmp_lock);
- init_rwsem(&vol->lcnbmp_lock);
-
- /* By default, enable sparse support. */
- NVolSetSparseEnabled(vol);
-
- /* Important to get the mount options dealt with now. */
- if (!parse_options(vol, (char*)opt))
- goto err_out_now;
-
- /* We support sector sizes up to the PAGE_SIZE. */
- if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) {
- if (!silent)
- ntfs_error(sb, "Device has unsupported sector size "
- "(%i). The maximum supported sector "
- "size on this architecture is %lu "
- "bytes.",
- bdev_logical_block_size(sb->s_bdev),
- PAGE_SIZE);
- goto err_out_now;
- }
- /*
- * Setup the device access block size to NTFS_BLOCK_SIZE or the hard
- * sector size, whichever is bigger.
- */
- blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE);
- if (blocksize < NTFS_BLOCK_SIZE) {
- if (!silent)
- ntfs_error(sb, "Unable to set device block size.");
- goto err_out_now;
- }
- BUG_ON(blocksize != sb->s_blocksize);
- ntfs_debug("Set device block size to %i bytes (block size bits %i).",
- blocksize, sb->s_blocksize_bits);
- /* Determine the size of the device in units of block_size bytes. */
- vol->nr_blocks = sb_bdev_nr_blocks(sb);
- if (!vol->nr_blocks) {
- if (!silent)
- ntfs_error(sb, "Unable to determine device size.");
- goto err_out_now;
- }
- /* Read the boot sector and return unlocked buffer head to it. */
- if (!(bh = read_ntfs_boot_sector(sb, silent))) {
- if (!silent)
- ntfs_error(sb, "Not an NTFS volume.");
- goto err_out_now;
- }
- /*
- * Extract the data from the boot sector and setup the ntfs volume
- * using it.
- */
- result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data);
- brelse(bh);
- if (!result) {
- if (!silent)
- ntfs_error(sb, "Unsupported NTFS filesystem.");
- goto err_out_now;
- }
- /*
- * If the boot sector indicates a sector size bigger than the current
- * device block size, switch the device block size to the sector size.
- * TODO: It may be possible to support this case even when the set
- * below fails, we would just be breaking up the i/o for each sector
- * into multiple blocks for i/o purposes but otherwise it should just
- * work. However it is safer to leave disabled until someone hits this
- * error message and then we can get them to try it without the setting
- * so we know for sure that it works.
- */
- if (vol->sector_size > blocksize) {
- blocksize = sb_set_blocksize(sb, vol->sector_size);
- if (blocksize != vol->sector_size) {
- if (!silent)
- ntfs_error(sb, "Unable to set device block "
- "size to sector size (%i).",
- vol->sector_size);
- goto err_out_now;
- }
- BUG_ON(blocksize != sb->s_blocksize);
- vol->nr_blocks = sb_bdev_nr_blocks(sb);
- ntfs_debug("Changed device block size to %i bytes (block size "
- "bits %i) to match volume sector size.",
- blocksize, sb->s_blocksize_bits);
- }
- /* Initialize the cluster and mft allocators. */
- ntfs_setup_allocators(vol);
- /* Setup remaining fields in the super block. */
- sb->s_magic = NTFS_SB_MAGIC;
- /*
- * Ntfs allows 63 bits for the file size, i.e. correct would be:
- * sb->s_maxbytes = ~0ULL >> 1;
- * But the kernel uses a long as the page cache page index which on
- * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel
- * defined to the maximum the page cache page index can cope with
- * without overflowing the index or to 2^63 - 1, whichever is smaller.
- */
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- /* Ntfs measures time in 100ns intervals. */
- sb->s_time_gran = 100;
- /*
- * Now load the metadata required for the page cache and our address
- * space operations to function. We do this by setting up a specialised
- * read_inode method and then just calling the normal iget() to obtain
- * the inode for $MFT which is sufficient to allow our normal inode
- * operations and associated address space operations to function.
- */
- sb->s_op = &ntfs_sops;
- tmp_ino = new_inode(sb);
- if (!tmp_ino) {
- if (!silent)
- ntfs_error(sb, "Failed to load essential metadata.");
- goto err_out_now;
- }
- tmp_ino->i_ino = FILE_MFT;
- insert_inode_hash(tmp_ino);
- if (ntfs_read_inode_mount(tmp_ino) < 0) {
- if (!silent)
- ntfs_error(sb, "Failed to load essential metadata.");
- goto iput_tmp_ino_err_out_now;
- }
- mutex_lock(&ntfs_lock);
- /*
- * The current mount is a compression user if the cluster size is
- * less than or equal 4kiB.
- */
- if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) {
- result = allocate_compression_buffers();
- if (result) {
- ntfs_error(NULL, "Failed to allocate buffers "
- "for compression engine.");
- ntfs_nr_compression_users--;
- mutex_unlock(&ntfs_lock);
- goto iput_tmp_ino_err_out_now;
- }
- }
- /*
- * Generate the global default upcase table if necessary. Also
- * temporarily increment the number of upcase users to avoid race
- * conditions with concurrent (u)mounts.
- */
- if (!default_upcase)
- default_upcase = generate_default_upcase();
- ntfs_nr_upcase_users++;
- mutex_unlock(&ntfs_lock);
- /*
- * From now on, ignore @silent parameter. If we fail below this line,
- * it will be due to a corrupt fs or a system error, so we report it.
- */
- /*
- * Open the system files with normal access functions and complete
- * setting up the ntfs super block.
- */
- if (!load_system_files(vol)) {
- ntfs_error(sb, "Failed to load system files.");
- goto unl_upcase_iput_tmp_ino_err_out_now;
- }
-
- /* We grab a reference, simulating an ntfs_iget(). */
- ihold(vol->root_ino);
- if ((sb->s_root = d_make_root(vol->root_ino))) {
- ntfs_debug("Exiting, status successful.");
- /* Release the default upcase if it has no users. */
- mutex_lock(&ntfs_lock);
- if (!--ntfs_nr_upcase_users && default_upcase) {
- ntfs_free(default_upcase);
- default_upcase = NULL;
- }
- mutex_unlock(&ntfs_lock);
- sb->s_export_op = &ntfs_export_ops;
- lockdep_on();
- return 0;
- }
- ntfs_error(sb, "Failed to allocate root directory.");
- /* Clean up after the successful load_system_files() call from above. */
- // TODO: Use ntfs_put_super() instead of repeating all this code...
- // FIXME: Should mark the volume clean as the error is most likely
- // -ENOMEM.
- iput(vol->vol_ino);
- vol->vol_ino = NULL;
- /* NTFS 3.0+ specific clean up. */
- if (vol->major_ver >= 3) {
-#ifdef NTFS_RW
- if (vol->usnjrnl_j_ino) {
- iput(vol->usnjrnl_j_ino);
- vol->usnjrnl_j_ino = NULL;
- }
- if (vol->usnjrnl_max_ino) {
- iput(vol->usnjrnl_max_ino);
- vol->usnjrnl_max_ino = NULL;
- }
- if (vol->usnjrnl_ino) {
- iput(vol->usnjrnl_ino);
- vol->usnjrnl_ino = NULL;
- }
- if (vol->quota_q_ino) {
- iput(vol->quota_q_ino);
- vol->quota_q_ino = NULL;
- }
- if (vol->quota_ino) {
- iput(vol->quota_ino);
- vol->quota_ino = NULL;
- }
-#endif /* NTFS_RW */
- if (vol->extend_ino) {
- iput(vol->extend_ino);
- vol->extend_ino = NULL;
- }
- if (vol->secure_ino) {
- iput(vol->secure_ino);
- vol->secure_ino = NULL;
- }
- }
- iput(vol->root_ino);
- vol->root_ino = NULL;
- iput(vol->lcnbmp_ino);
- vol->lcnbmp_ino = NULL;
- iput(vol->mftbmp_ino);
- vol->mftbmp_ino = NULL;
-#ifdef NTFS_RW
- if (vol->logfile_ino) {
- iput(vol->logfile_ino);
- vol->logfile_ino = NULL;
- }
- if (vol->mftmirr_ino) {
- iput(vol->mftmirr_ino);
- vol->mftmirr_ino = NULL;
- }
-#endif /* NTFS_RW */
- /* Throw away the table of attribute definitions. */
- vol->attrdef_size = 0;
- if (vol->attrdef) {
- ntfs_free(vol->attrdef);
- vol->attrdef = NULL;
- }
- vol->upcase_len = 0;
- mutex_lock(&ntfs_lock);
- if (vol->upcase == default_upcase) {
- ntfs_nr_upcase_users--;
- vol->upcase = NULL;
- }
- mutex_unlock(&ntfs_lock);
- if (vol->upcase) {
- ntfs_free(vol->upcase);
- vol->upcase = NULL;
- }
- if (vol->nls_map) {
- unload_nls(vol->nls_map);
- vol->nls_map = NULL;
- }
- /* Error exit code path. */
-unl_upcase_iput_tmp_ino_err_out_now:
- /*
- * Decrease the number of upcase users and destroy the global default
- * upcase table if necessary.
- */
- mutex_lock(&ntfs_lock);
- if (!--ntfs_nr_upcase_users && default_upcase) {
- ntfs_free(default_upcase);
- default_upcase = NULL;
- }
- if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
- free_compression_buffers();
- mutex_unlock(&ntfs_lock);
-iput_tmp_ino_err_out_now:
- iput(tmp_ino);
- if (vol->mft_ino && vol->mft_ino != tmp_ino)
- iput(vol->mft_ino);
- vol->mft_ino = NULL;
- /* Errors at this stage are irrelevant. */
-err_out_now:
- sb->s_fs_info = NULL;
- kfree(vol);
- ntfs_debug("Failed, returning -EINVAL.");
- lockdep_on();
- return -EINVAL;
-}
-
-/*
- * This is a slab cache to optimize allocations and deallocations of Unicode
- * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN
- * (255) Unicode characters + a terminating NULL Unicode character.
- */
-struct kmem_cache *ntfs_name_cache;
-
-/* Slab caches for efficient allocation/deallocation of inodes. */
-struct kmem_cache *ntfs_inode_cache;
-struct kmem_cache *ntfs_big_inode_cache;
-
-/* Init once constructor for the inode slab cache. */
-static void ntfs_big_inode_init_once(void *foo)
-{
- ntfs_inode *ni = (ntfs_inode *)foo;
-
- inode_init_once(VFS_I(ni));
-}
-
-/*
- * Slab caches to optimize allocations and deallocations of attribute search
- * contexts and index contexts, respectively.
- */
-struct kmem_cache *ntfs_attr_ctx_cache;
-struct kmem_cache *ntfs_index_ctx_cache;
-
-/* Driver wide mutex. */
-DEFINE_MUTEX(ntfs_lock);
-
-static struct dentry *ntfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
-}
-
-static struct file_system_type ntfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "ntfs",
- .mount = ntfs_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("ntfs");
-
-/* Stable names for the slab caches. */
-static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache";
-static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache";
-static const char ntfs_name_cache_name[] = "ntfs_name_cache";
-static const char ntfs_inode_cache_name[] = "ntfs_inode_cache";
-static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache";
-
-static int __init init_ntfs_fs(void)
-{
- int err = 0;
-
- /* This may be ugly but it results in pretty output so who cares. (-8 */
- pr_info("driver " NTFS_VERSION " [Flags: R/"
-#ifdef NTFS_RW
- "W"
-#else
- "O"
-#endif
-#ifdef DEBUG
- " DEBUG"
-#endif
-#ifdef MODULE
- " MODULE"
-#endif
- "].\n");
-
- ntfs_debug("Debug messages are enabled.");
-
- ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name,
- sizeof(ntfs_index_context), 0 /* offset */,
- SLAB_HWCACHE_ALIGN, NULL /* ctor */);
- if (!ntfs_index_ctx_cache) {
- pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name);
- goto ictx_err_out;
- }
- ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
- sizeof(ntfs_attr_search_ctx), 0 /* offset */,
- SLAB_HWCACHE_ALIGN, NULL /* ctor */);
- if (!ntfs_attr_ctx_cache) {
- pr_crit("NTFS: Failed to create %s!\n",
- ntfs_attr_ctx_cache_name);
- goto actx_err_out;
- }
-
- ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name,
- (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
- SLAB_HWCACHE_ALIGN, NULL);
- if (!ntfs_name_cache) {
- pr_crit("Failed to create %s!\n", ntfs_name_cache_name);
- goto name_err_out;
- }
-
- ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name,
- sizeof(ntfs_inode), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
- if (!ntfs_inode_cache) {
- pr_crit("Failed to create %s!\n", ntfs_inode_cache_name);
- goto inode_err_out;
- }
-
- ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
- sizeof(big_ntfs_inode), 0,
- SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
- SLAB_ACCOUNT, ntfs_big_inode_init_once);
- if (!ntfs_big_inode_cache) {
- pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
- goto big_inode_err_out;
- }
-
- /* Register the ntfs sysctls. */
- err = ntfs_sysctl(1);
- if (err) {
- pr_crit("Failed to register NTFS sysctls!\n");
- goto sysctl_err_out;
- }
-
- err = register_filesystem(&ntfs_fs_type);
- if (!err) {
- ntfs_debug("NTFS driver registered successfully.");
- return 0; /* Success! */
- }
- pr_crit("Failed to register NTFS filesystem driver!\n");
-
- /* Unregister the ntfs sysctls. */
- ntfs_sysctl(0);
-sysctl_err_out:
- kmem_cache_destroy(ntfs_big_inode_cache);
-big_inode_err_out:
- kmem_cache_destroy(ntfs_inode_cache);
-inode_err_out:
- kmem_cache_destroy(ntfs_name_cache);
-name_err_out:
- kmem_cache_destroy(ntfs_attr_ctx_cache);
-actx_err_out:
- kmem_cache_destroy(ntfs_index_ctx_cache);
-ictx_err_out:
- if (!err) {
- pr_crit("Aborting NTFS filesystem driver registration...\n");
- err = -ENOMEM;
- }
- return err;
-}
-
-static void __exit exit_ntfs_fs(void)
-{
- ntfs_debug("Unregistering NTFS driver.");
-
- unregister_filesystem(&ntfs_fs_type);
-
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(ntfs_big_inode_cache);
- kmem_cache_destroy(ntfs_inode_cache);
- kmem_cache_destroy(ntfs_name_cache);
- kmem_cache_destroy(ntfs_attr_ctx_cache);
- kmem_cache_destroy(ntfs_index_ctx_cache);
- /* Unregister the ntfs sysctls. */
- ntfs_sysctl(0);
-}
-
-MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
-MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.");
-MODULE_VERSION(NTFS_VERSION);
-MODULE_LICENSE("GPL");
-#ifdef DEBUG
-module_param(debug_msgs, bint, 0);
-MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
-#endif
-
-module_init(init_ntfs_fs)
-module_exit(exit_ntfs_fs)
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
deleted file mode 100644
index 4e980170d86a..000000000000
--- a/fs/ntfs/sysctl.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of
- * the Linux-NTFS project. Adapted from the old NTFS driver,
- * Copyright (C) 1997 Martin von Löwis, Régis Duchesne
- *
- * Copyright (c) 2002-2005 Anton Altaparmakov
- */
-
-#ifdef DEBUG
-
-#include <linux/module.h>
-
-#ifdef CONFIG_SYSCTL
-
-#include <linux/proc_fs.h>
-#include <linux/sysctl.h>
-
-#include "sysctl.h"
-#include "debug.h"
-
-/* Definition of the ntfs sysctl. */
-static struct ctl_table ntfs_sysctls[] = {
- {
- .procname = "ntfs-debug",
- .data = &debug_msgs, /* Data pointer and size. */
- .maxlen = sizeof(debug_msgs),
- .mode = 0644, /* Mode, proc handler. */
- .proc_handler = proc_dointvec
- },
-};
-
-/* Storage for the sysctls header. */
-static struct ctl_table_header *sysctls_root_table;
-
-/**
- * ntfs_sysctl - add or remove the debug sysctl
- * @add: add (1) or remove (0) the sysctl
- *
- * Add or remove the debug sysctl. Return 0 on success or -errno on error.
- */
-int ntfs_sysctl(int add)
-{
- if (add) {
- BUG_ON(sysctls_root_table);
- sysctls_root_table = register_sysctl("fs", ntfs_sysctls);
- if (!sysctls_root_table)
- return -ENOMEM;
- } else {
- BUG_ON(!sysctls_root_table);
- unregister_sysctl_table(sysctls_root_table);
- sysctls_root_table = NULL;
- }
- return 0;
-}
-
-#endif /* CONFIG_SYSCTL */
-#endif /* DEBUG */
diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h
deleted file mode 100644
index 96bb2299d2d5..000000000000
--- a/fs/ntfs/sysctl.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of
- * the Linux-NTFS project. Adapted from the old NTFS driver,
- * Copyright (C) 1997 Martin von Löwis, Régis Duchesne
- *
- * Copyright (c) 2002-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_SYSCTL_H
-#define _LINUX_NTFS_SYSCTL_H
-
-
-#if defined(DEBUG) && defined(CONFIG_SYSCTL)
-
-extern int ntfs_sysctl(int add);
-
-#else
-
-/* Just return success. */
-static inline int ntfs_sysctl(int add)
-{
- return 0;
-}
-
-#endif /* DEBUG && CONFIG_SYSCTL */
-#endif /* _LINUX_NTFS_SYSCTL_H */
diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h
deleted file mode 100644
index 6b63261300cc..000000000000
--- a/fs/ntfs/time.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * time.h - NTFS time conversion functions. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_TIME_H
-#define _LINUX_NTFS_TIME_H
-
-#include <linux/time.h> /* For current_kernel_time(). */
-#include <asm/div64.h> /* For do_div(). */
-
-#include "endian.h"
-
-#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000)
-
-/**
- * utc2ntfs - convert Linux UTC time to NTFS time
- * @ts: Linux UTC time to convert to NTFS time
- *
- * Convert the Linux UTC time @ts to its corresponding NTFS time and return
- * that in little endian format.
- *
- * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
- * and a long tv_nsec where tv_sec is the number of 1-second intervals since
- * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
- * intervals since the value of tv_sec.
- *
- * NTFS uses Microsoft's standard time format which is stored in a s64 and is
- * measured as the number of 100-nano-second intervals since 1st January 1601,
- * 00:00:00 UTC.
- */
-static inline sle64 utc2ntfs(const struct timespec64 ts)
-{
- /*
- * Convert the seconds to 100ns intervals, add the nano-seconds
- * converted to 100ns intervals, and then add the NTFS time offset.
- */
- return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 +
- NTFS_TIME_OFFSET);
-}
-
-/**
- * get_current_ntfs_time - get the current time in little endian NTFS format
- *
- * Get the current time from the Linux kernel, convert it to its corresponding
- * NTFS time and return that in little endian format.
- */
-static inline sle64 get_current_ntfs_time(void)
-{
- struct timespec64 ts;
-
- ktime_get_coarse_real_ts64(&ts);
- return utc2ntfs(ts);
-}
-
-/**
- * ntfs2utc - convert NTFS time to Linux time
- * @time: NTFS time (little endian) to convert to Linux UTC
- *
- * Convert the little endian NTFS time @time to its corresponding Linux UTC
- * time and return that in cpu format.
- *
- * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
- * and a long tv_nsec where tv_sec is the number of 1-second intervals since
- * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
- * intervals since the value of tv_sec.
- *
- * NTFS uses Microsoft's standard time format which is stored in a s64 and is
- * measured as the number of 100 nano-second intervals since 1st January 1601,
- * 00:00:00 UTC.
- */
-static inline struct timespec64 ntfs2utc(const sle64 time)
-{
- struct timespec64 ts;
-
- /* Subtract the NTFS time offset. */
- u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET);
- /*
- * Convert the time to 1-second intervals and the remainder to
- * 1-nano-second intervals.
- */
- ts.tv_nsec = do_div(t, 10000000) * 100;
- ts.tv_sec = t;
- return ts;
-}
-
-#endif /* _LINUX_NTFS_TIME_H */
diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h
deleted file mode 100644
index 9a47859e7a06..000000000000
--- a/fs/ntfs/types.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * types.h - Defines for NTFS Linux kernel driver specific types.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_TYPES_H
-#define _LINUX_NTFS_TYPES_H
-
-#include <linux/types.h>
-
-typedef __le16 le16;
-typedef __le32 le32;
-typedef __le64 le64;
-typedef __u16 __bitwise sle16;
-typedef __u32 __bitwise sle32;
-typedef __u64 __bitwise sle64;
-
-/* 2-byte Unicode character type. */
-typedef le16 ntfschar;
-#define UCHAR_T_SIZE_BITS 1
-
-/*
- * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN
- * and VCN, to allow for type checking and better code readability.
- */
-typedef s64 VCN;
-typedef sle64 leVCN;
-typedef s64 LCN;
-typedef sle64 leLCN;
-
-/*
- * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit
- * values. We define our own type LSN, to allow for type checking and better
- * code readability.
- */
-typedef s64 LSN;
-typedef sle64 leLSN;
-
-/*
- * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values.
- * We define our own type USN, to allow for type checking and better code
- * readability.
- */
-typedef s64 USN;
-typedef sle64 leUSN;
-
-typedef enum {
- CASE_SENSITIVE = 0,
- IGNORE_CASE = 1,
-} IGNORE_CASE_BOOL;
-
-#endif /* _LINUX_NTFS_TYPES_H */
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
deleted file mode 100644
index a6b6c64f14a9..000000000000
--- a/fs/ntfs/unistr.c
+++ /dev/null
@@ -1,384 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2006 Anton Altaparmakov
- */
-
-#include <linux/slab.h>
-
-#include "types.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/*
- * IMPORTANT
- * =========
- *
- * All these routines assume that the Unicode characters are in little endian
- * encoding inside the strings!!!
- */
-
-/*
- * This is used by the name collation functions to quickly determine what
- * characters are (in)valid.
- */
-static const u8 legal_ansi_char_array[0x40] = {
- 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
- 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
-
- 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
- 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
-
- 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
- 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
-
- 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
- 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
-};
-
-/**
- * ntfs_are_names_equal - compare two Unicode names for equality
- * @s1: name to compare to @s2
- * @s1_len: length in Unicode characters of @s1
- * @s2: name to compare to @s1
- * @s2_len: length in Unicode characters of @s2
- * @ic: ignore case bool
- * @upcase: upcase table (only if @ic == IGNORE_CASE)
- * @upcase_size: length in Unicode characters of @upcase (if present)
- *
- * Compare the names @s1 and @s2 and return 'true' (1) if the names are
- * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE,
- * the @upcase table is used to performa a case insensitive comparison.
- */
-bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
- const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic,
- const ntfschar *upcase, const u32 upcase_size)
-{
- if (s1_len != s2_len)
- return false;
- if (ic == CASE_SENSITIVE)
- return !ntfs_ucsncmp(s1, s2, s1_len);
- return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size);
-}
-
-/**
- * ntfs_collate_names - collate two Unicode names
- * @name1: first Unicode name to compare
- * @name2: second Unicode name to compare
- * @err_val: if @name1 contains an invalid character return this value
- * @ic: either CASE_SENSITIVE or IGNORE_CASE
- * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE)
- * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE)
- *
- * ntfs_collate_names collates two Unicode names and returns:
- *
- * -1 if the first name collates before the second one,
- * 0 if the names match,
- * 1 if the second name collates before the first one, or
- * @err_val if an invalid character is found in @name1 during the comparison.
- *
- * The following characters are considered invalid: '"', '*', '<', '>' and '?'.
- */
-int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
- const ntfschar *name2, const u32 name2_len,
- const int err_val, const IGNORE_CASE_BOOL ic,
- const ntfschar *upcase, const u32 upcase_len)
-{
- u32 cnt, min_len;
- u16 c1, c2;
-
- min_len = name1_len;
- if (name1_len > name2_len)
- min_len = name2_len;
- for (cnt = 0; cnt < min_len; ++cnt) {
- c1 = le16_to_cpu(*name1++);
- c2 = le16_to_cpu(*name2++);
- if (ic) {
- if (c1 < upcase_len)
- c1 = le16_to_cpu(upcase[c1]);
- if (c2 < upcase_len)
- c2 = le16_to_cpu(upcase[c2]);
- }
- if (c1 < 64 && legal_ansi_char_array[c1] & 8)
- return err_val;
- if (c1 < c2)
- return -1;
- if (c1 > c2)
- return 1;
- }
- if (name1_len < name2_len)
- return -1;
- if (name1_len == name2_len)
- return 0;
- /* name1_len > name2_len */
- c1 = le16_to_cpu(*name1);
- if (c1 < 64 && legal_ansi_char_array[c1] & 8)
- return err_val;
- return 1;
-}
-
-/**
- * ntfs_ucsncmp - compare two little endian Unicode strings
- * @s1: first string
- * @s2: second string
- * @n: maximum unicode characters to compare
- *
- * Compare the first @n characters of the Unicode strings @s1 and @s2,
- * The strings in little endian format and appropriate le16_to_cpu()
- * conversion is performed on non-little endian machines.
- *
- * The function returns an integer less than, equal to, or greater than zero
- * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
- * to be less than, to match, or be greater than @s2.
- */
-int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
-{
- u16 c1, c2;
- size_t i;
-
- for (i = 0; i < n; ++i) {
- c1 = le16_to_cpu(s1[i]);
- c2 = le16_to_cpu(s2[i]);
- if (c1 < c2)
- return -1;
- if (c1 > c2)
- return 1;
- if (!c1)
- break;
- }
- return 0;
-}
-
-/**
- * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
- * @s1: first string
- * @s2: second string
- * @n: maximum unicode characters to compare
- * @upcase: upcase table
- * @upcase_size: upcase table size in Unicode characters
- *
- * Compare the first @n characters of the Unicode strings @s1 and @s2,
- * ignoring case. The strings in little endian format and appropriate
- * le16_to_cpu() conversion is performed on non-little endian machines.
- *
- * Each character is uppercased using the @upcase table before the comparison.
- *
- * The function returns an integer less than, equal to, or greater than zero
- * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
- * to be less than, to match, or be greater than @s2.
- */
-int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
- const ntfschar *upcase, const u32 upcase_size)
-{
- size_t i;
- u16 c1, c2;
-
- for (i = 0; i < n; ++i) {
- if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
- c1 = le16_to_cpu(upcase[c1]);
- if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
- c2 = le16_to_cpu(upcase[c2]);
- if (c1 < c2)
- return -1;
- if (c1 > c2)
- return 1;
- if (!c1)
- break;
- }
- return 0;
-}
-
-void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase,
- const u32 upcase_len)
-{
- u32 i;
- u16 u;
-
- for (i = 0; i < name_len; i++)
- if ((u = le16_to_cpu(name[i])) < upcase_len)
- name[i] = upcase[u];
-}
-
-void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
- const ntfschar *upcase, const u32 upcase_len)
-{
- ntfs_upcase_name((ntfschar*)&file_name_attr->file_name,
- file_name_attr->file_name_length, upcase, upcase_len);
-}
-
-int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
- FILE_NAME_ATTR *file_name_attr2,
- const int err_val, const IGNORE_CASE_BOOL ic,
- const ntfschar *upcase, const u32 upcase_len)
-{
- return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name,
- file_name_attr1->file_name_length,
- (ntfschar*)&file_name_attr2->file_name,
- file_name_attr2->file_name_length,
- err_val, ic, upcase, upcase_len);
-}
-
-/**
- * ntfs_nlstoucs - convert NLS string to little endian Unicode string
- * @vol: ntfs volume which we are working with
- * @ins: input NLS string buffer
- * @ins_len: length of input string in bytes
- * @outs: on return contains the allocated output Unicode string buffer
- *
- * Convert the input string @ins, which is in whatever format the loaded NLS
- * map dictates, into a little endian, 2-byte Unicode string.
- *
- * This function allocates the string and the caller is responsible for
- * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it.
- *
- * On success the function returns the number of Unicode characters written to
- * the output string *@outs (>= 0), not counting the terminating Unicode NULL
- * character. *@outs is set to the allocated output string buffer.
- *
- * On error, a negative number corresponding to the error code is returned. In
- * that case the output string is not allocated. Both *@outs and *@outs_len
- * are then undefined.
- *
- * This might look a bit odd due to fast path optimization...
- */
-int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
- const int ins_len, ntfschar **outs)
-{
- struct nls_table *nls = vol->nls_map;
- ntfschar *ucs;
- wchar_t wc;
- int i, o, wc_len;
-
- /* We do not trust outside sources. */
- if (likely(ins)) {
- ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS);
- if (likely(ucs)) {
- for (i = o = 0; i < ins_len; i += wc_len) {
- wc_len = nls->char2uni(ins + i, ins_len - i,
- &wc);
- if (likely(wc_len >= 0 &&
- o < NTFS_MAX_NAME_LEN)) {
- if (likely(wc)) {
- ucs[o++] = cpu_to_le16(wc);
- continue;
- } /* else if (!wc) */
- break;
- } /* else if (wc_len < 0 ||
- o >= NTFS_MAX_NAME_LEN) */
- goto name_err;
- }
- ucs[o] = 0;
- *outs = ucs;
- return o;
- } /* else if (!ucs) */
- ntfs_error(vol->sb, "Failed to allocate buffer for converted "
- "name from ntfs_name_cache.");
- return -ENOMEM;
- } /* else if (!ins) */
- ntfs_error(vol->sb, "Received NULL pointer.");
- return -EINVAL;
-name_err:
- kmem_cache_free(ntfs_name_cache, ucs);
- if (wc_len < 0) {
- ntfs_error(vol->sb, "Name using character set %s contains "
- "characters that cannot be converted to "
- "Unicode.", nls->charset);
- i = -EILSEQ;
- } else /* if (o >= NTFS_MAX_NAME_LEN) */ {
- ntfs_error(vol->sb, "Name is too long (maximum length for a "
- "name on NTFS is %d Unicode characters.",
- NTFS_MAX_NAME_LEN);
- i = -ENAMETOOLONG;
- }
- return i;
-}
-
-/**
- * ntfs_ucstonls - convert little endian Unicode string to NLS string
- * @vol: ntfs volume which we are working with
- * @ins: input Unicode string buffer
- * @ins_len: length of input string in Unicode characters
- * @outs: on return contains the (allocated) output NLS string buffer
- * @outs_len: length of output string buffer in bytes
- *
- * Convert the input little endian, 2-byte Unicode string @ins, of length
- * @ins_len into the string format dictated by the loaded NLS.
- *
- * If *@outs is NULL, this function allocates the string and the caller is
- * responsible for calling kfree(*@outs); when finished with it. In this case
- * @outs_len is ignored and can be 0.
- *
- * On success the function returns the number of bytes written to the output
- * string *@outs (>= 0), not counting the terminating NULL byte. If the output
- * string buffer was allocated, *@outs is set to it.
- *
- * On error, a negative number corresponding to the error code is returned. In
- * that case the output string is not allocated. The contents of *@outs are
- * then undefined.
- *
- * This might look a bit odd due to fast path optimization...
- */
-int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
- const int ins_len, unsigned char **outs, int outs_len)
-{
- struct nls_table *nls = vol->nls_map;
- unsigned char *ns;
- int i, o, ns_len, wc;
-
- /* We don't trust outside sources. */
- if (ins) {
- ns = *outs;
- ns_len = outs_len;
- if (ns && !ns_len) {
- wc = -ENAMETOOLONG;
- goto conversion_err;
- }
- if (!ns) {
- ns_len = ins_len * NLS_MAX_CHARSET_SIZE;
- ns = kmalloc(ns_len + 1, GFP_NOFS);
- if (!ns)
- goto mem_err_out;
- }
- for (i = o = 0; i < ins_len; i++) {
-retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
- ns_len - o);
- if (wc > 0) {
- o += wc;
- continue;
- } else if (!wc)
- break;
- else if (wc == -ENAMETOOLONG && ns != *outs) {
- unsigned char *tc;
- /* Grow in multiples of 64 bytes. */
- tc = kmalloc((ns_len + 64) &
- ~63, GFP_NOFS);
- if (tc) {
- memcpy(tc, ns, ns_len);
- ns_len = ((ns_len + 64) & ~63) - 1;
- kfree(ns);
- ns = tc;
- goto retry;
- } /* No memory so goto conversion_error; */
- } /* wc < 0, real error. */
- goto conversion_err;
- }
- ns[o] = 0;
- *outs = ns;
- return o;
- } /* else (!ins) */
- ntfs_error(vol->sb, "Received NULL pointer.");
- return -EINVAL;
-conversion_err:
- ntfs_error(vol->sb, "Unicode name contains characters that cannot be "
- "converted to character set %s. You might want to "
- "try to use the mount option nls=utf8.", nls->charset);
- if (ns != *outs)
- kfree(ns);
- if (wc != -ENAMETOOLONG)
- wc = -EILSEQ;
- return wc;
-mem_err_out:
- ntfs_error(vol->sb, "Failed to allocate name!");
- return -ENOMEM;
-}
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
deleted file mode 100644
index 4ebe84a78dea..000000000000
--- a/fs/ntfs/upcase.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * upcase.c - Generate the full NTFS Unicode upcase table in little endian.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org>
- * Copyright (c) 2001-2006 Anton Altaparmakov
- */
-
-#include "malloc.h"
-#include "ntfs.h"
-
-ntfschar *generate_default_upcase(void)
-{
- static const int uc_run_table[][3] = { /* Start, End, Add */
- {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74},
- {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86},
- {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
- {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128},
- {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112},
- {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126},
- {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8},
- {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8},
- {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8},
- {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7},
- {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16},
- {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26},
- {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32},
- {0}
- };
-
- static const int uc_dup_table[][2] = { /* Start, End */
- {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
- {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
- {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
- {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
- {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
- {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
- {0}
- };
-
- static const int uc_word_table[][2] = { /* Offset, Value */
- {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
- {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
- {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
- {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
- {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
- {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
- {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
- {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
- {0}
- };
-
- int i, r;
- ntfschar *uc;
-
- uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar));
- if (!uc)
- return uc;
- memset(uc, 0, default_upcase_len * sizeof(ntfschar));
- /* Generate the little endian Unicode upcase table used by ntfs. */
- for (i = 0; i < default_upcase_len; i++)
- uc[i] = cpu_to_le16(i);
- for (r = 0; uc_run_table[r][0]; r++)
- for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
- le16_add_cpu(&uc[i], uc_run_table[r][2]);
- for (r = 0; uc_dup_table[r][0]; r++)
- for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
- le16_add_cpu(&uc[i + 1], -1);
- for (r = 0; uc_word_table[r][0]; r++)
- uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]);
- return uc;
-}
diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c
deleted file mode 100644
index 9097a0b4ef25..000000000000
--- a/fs/ntfs/usnjrnl.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * usnjrnl.h - NTFS kernel transaction log ($UsnJrnl) handling. Part of the
- * Linux-NTFS project.
- *
- * Copyright (c) 2005 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
-
-#include "aops.h"
-#include "debug.h"
-#include "endian.h"
-#include "time.h"
-#include "types.h"
-#include "usnjrnl.h"
-#include "volume.h"
-
-/**
- * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume
- * @vol: ntfs volume on which to stamp the transaction log
- *
- * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return
- * 'true' on success and 'false' on error.
- *
- * This function assumes that the transaction log has already been loaded and
- * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl().
- */
-bool ntfs_stamp_usnjrnl(ntfs_volume *vol)
-{
- ntfs_debug("Entering.");
- if (likely(!NVolUsnJrnlStamped(vol))) {
- sle64 stamp;
- struct page *page;
- USN_HEADER *uh;
-
- page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
- if (IS_ERR(page)) {
- ntfs_error(vol->sb, "Failed to read from "
- "$UsnJrnl/$DATA/$Max attribute.");
- return false;
- }
- uh = (USN_HEADER*)page_address(page);
- stamp = get_current_ntfs_time();
- ntfs_debug("Stamping transaction log ($UsnJrnl): old "
- "journal_id 0x%llx, old lowest_valid_usn "
- "0x%llx, new journal_id 0x%llx, new "
- "lowest_valid_usn 0x%llx.",
- (long long)sle64_to_cpu(uh->journal_id),
- (long long)sle64_to_cpu(uh->lowest_valid_usn),
- (long long)sle64_to_cpu(stamp),
- i_size_read(vol->usnjrnl_j_ino));
- uh->lowest_valid_usn =
- cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino));
- uh->journal_id = stamp;
- flush_dcache_page(page);
- set_page_dirty(page);
- ntfs_unmap_page(page);
- /* Set the flag so we do not have to do it again on remount. */
- NVolSetUsnJrnlStamped(vol);
- }
- ntfs_debug("Done.");
- return true;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
deleted file mode 100644
index 85f531b59395..000000000000
--- a/fs/ntfs/usnjrnl.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling.
- * Part of the Linux-NTFS project.
- *
- * Copyright (c) 2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_USNJRNL_H
-#define _LINUX_NTFS_USNJRNL_H
-
-#ifdef NTFS_RW
-
-#include "types.h"
-#include "endian.h"
-#include "layout.h"
-#include "volume.h"
-
-/*
- * Transaction log ($UsnJrnl) organization:
- *
- * The transaction log records whenever a file is modified in any way. So for
- * example it will record that file "blah" was written to at a particular time
- * but not what was written. If will record that a file was deleted or
- * created, that a file was truncated, etc. See below for all the reason
- * codes used.
- *
- * The transaction log is in the $Extend directory which is in the root
- * directory of each volume. If it is not present it means transaction
- * logging is disabled. If it is present it means transaction logging is
- * either enabled or in the process of being disabled in which case we can
- * ignore it as it will go away as soon as Windows gets its hands on it.
- *
- * To determine whether the transaction logging is enabled or in the process
- * of being disabled, need to check the volume flags in the
- * $VOLUME_INFORMATION attribute in the $Volume system file (which is present
- * in the root directory and has a fixed mft record number, see layout.h).
- * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log
- * is in the process of being disabled and if this flag is clear it means the
- * transaction log is enabled.
- *
- * The transaction log consists of two parts; the $DATA/$Max attribute as well
- * as the $DATA/$J attribute. $Max is a header describing the transaction
- * log whilst $J is the transaction log data itself as a sequence of variable
- * sized USN_RECORDs (see below for all the structures).
- *
- * We do not care about transaction logging at this point in time but we still
- * need to let windows know that the transaction log is out of date. To do
- * this we need to stamp the transaction log. This involves setting the
- * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used
- * for the next added USN_RECORD to the $DATA/$J attribute as well as
- * generating a new journal_id in $DATA/$Max.
- *
- * The journal_id is as of the current version (2.0) of the transaction log
- * simply the 64-bit timestamp of when the journal was either created or last
- * stamped.
- *
- * To determine the next usn there are two ways. The first is to parse
- * $DATA/$J and to find the last USN_RECORD in it and to add its record_length
- * to its usn (which is the byte offset in the $DATA/$J attribute). The
- * second is simply to take the data size of the attribute. Since the usns
- * are simply byte offsets into $DATA/$J, this is exactly the next usn. For
- * obvious reasons we use the second method as it is much simpler and faster.
- *
- * As an aside, note that to actually disable the transaction log, one would
- * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go
- * through all the mft records on the volume and set the usn field in their
- * $STANDARD_INFORMATION attribute to zero. Once that is done, one would need
- * to delete the transaction log file, i.e. \$Extent\$UsnJrnl, and finally,
- * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag.
- *
- * Note that if a volume is unmounted whilst the transaction log is being
- * disabled, the process will continue the next time the volume is mounted.
- * This is why we can safely mount read-write when we see a transaction log
- * in the process of being deleted.
- */
-
-/* Some $UsnJrnl related constants. */
-#define UsnJrnlMajorVer 2
-#define UsnJrnlMinorVer 0
-
-/*
- * $DATA/$Max attribute. This is (always?) resident and has a fixed size of
- * 32 bytes. It contains the header describing the transaction log.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/sle64 maximum_size; /* The maximum on-disk size of the $DATA/$J
- attribute. */
-/* 8*/sle64 allocation_delta; /* Number of bytes by which to increase the
- size of the $DATA/$J attribute. */
-/*0x10*/sle64 journal_id; /* Current id of the transaction log. */
-/*0x18*/leUSN lowest_valid_usn; /* Lowest valid usn in $DATA/$J for the
- current journal_id. */
-/* sizeof() = 32 (0x20) bytes */
-} __attribute__ ((__packed__)) USN_HEADER;
-
-/*
- * Reason flags (32-bit). Cumulative flags describing the change(s) to the
- * file since it was last opened. I think the names speak for themselves but
- * if you disagree check out the descriptions in the Linux NTFS project NTFS
- * documentation: http://www.linux-ntfs.org/
- */
-enum {
- USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001),
- USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002),
- USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004),
- USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010),
- USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020),
- USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
- USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100),
- USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200),
- USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400),
- USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800),
- USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000),
- USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000),
- USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000),
- USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000),
- USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000),
- USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000),
- USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000),
- USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000),
- USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000),
- USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000),
- USN_REASON_CLOSE = cpu_to_le32(0x80000000),
-};
-
-typedef le32 USN_REASON_FLAGS;
-
-/*
- * Source info flags (32-bit). Information about the source of the change(s)
- * to the file. For detailed descriptions of what these mean, see the Linux
- * NTFS project NTFS documentation:
- * http://www.linux-ntfs.org/
- */
-enum {
- USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001),
- USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002),
- USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
-};
-
-typedef le32 USN_SOURCE_INFO_FLAGS;
-
-/*
- * $DATA/$J attribute. This is always non-resident, is marked as sparse, and
- * is of variabled size. It consists of a sequence of variable size
- * USN_RECORDS. The minimum allocated_size is allocation_delta as
- * specified in $DATA/$Max. When the maximum_size specified in $DATA/$Max is
- * exceeded by more than allocation_delta bytes, allocation_delta bytes are
- * allocated and appended to the $DATA/$J attribute and an equal number of
- * bytes at the beginning of the attribute are freed and made sparse. Note the
- * making sparse only happens at volume checkpoints and hence the actual
- * $DATA/$J size can exceed maximum_size + allocation_delta temporarily.
- */
-typedef struct {
-/*Ofs*/
-/* 0*/le32 length; /* Byte size of this record (8-byte
- aligned). */
-/* 4*/le16 major_ver; /* Major version of the transaction log used
- for this record. */
-/* 6*/le16 minor_ver; /* Minor version of the transaction log used
- for this record. */
-/* 8*/leMFT_REF mft_reference;/* The mft reference of the file (or
- directory) described by this record. */
-/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent
- directory of the file described by this
- record. */
-/*0x18*/leUSN usn; /* The usn of this record. Equals the offset
- within the $DATA/$J attribute. */
-/*0x20*/sle64 time; /* Time when this record was created. */
-/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */
-/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */
-/*0x30*/le32 security_id; /* File security_id copied from
- $STANDARD_INFORMATION. */
-/*0x34*/FILE_ATTR_FLAGS file_attributes; /* File attributes copied from
- $STANDARD_INFORMATION or $FILE_NAME (not
- sure which). */
-/*0x38*/le16 file_name_size; /* Size of the file name in bytes. */
-/*0x3a*/le16 file_name_offset; /* Offset to the file name in bytes from the
- start of this record. */
-/*0x3c*/ntfschar file_name[0]; /* Use when creating only. When reading use
- file_name_offset to determine the location
- of the name. */
-/* sizeof() = 60 (0x3c) bytes */
-} __attribute__ ((__packed__)) USN_RECORD;
-
-extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_USNJRNL_H */
diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h
deleted file mode 100644
index 930a9ae8a053..000000000000
--- a/fs/ntfs/volume.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part
- * of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2006 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_VOLUME_H
-#define _LINUX_NTFS_VOLUME_H
-
-#include <linux/rwsem.h>
-#include <linux/uidgid.h>
-
-#include "types.h"
-#include "layout.h"
-
-/*
- * The NTFS in memory super block structure.
- */
-typedef struct {
- /*
- * FIXME: Reorder to have commonly used together element within the
- * same cache line, aiming at a cache line size of 32 bytes. Aim for
- * 64 bytes for less commonly used together elements. Put most commonly
- * used elements to front of structure. Obviously do this only when the
- * structure has stabilized... (AIA)
- */
- /* Device specifics. */
- struct super_block *sb; /* Pointer back to the super_block. */
- LCN nr_blocks; /* Number of sb->s_blocksize bytes
- sized blocks on the device. */
- /* Configuration provided by user at mount time. */
- unsigned long flags; /* Miscellaneous flags, see below. */
- kuid_t uid; /* uid that files will be mounted as. */
- kgid_t gid; /* gid that files will be mounted as. */
- umode_t fmask; /* The mask for file permissions. */
- umode_t dmask; /* The mask for directory
- permissions. */
- u8 mft_zone_multiplier; /* Initial mft zone multiplier. */
- u8 on_errors; /* What to do on filesystem errors. */
- /* NTFS bootsector provided information. */
- u16 sector_size; /* in bytes */
- u8 sector_size_bits; /* log2(sector_size) */
- u32 cluster_size; /* in bytes */
- u32 cluster_size_mask; /* cluster_size - 1 */
- u8 cluster_size_bits; /* log2(cluster_size) */
- u32 mft_record_size; /* in bytes */
- u32 mft_record_size_mask; /* mft_record_size - 1 */
- u8 mft_record_size_bits; /* log2(mft_record_size) */
- u32 index_record_size; /* in bytes */
- u32 index_record_size_mask; /* index_record_size - 1 */
- u8 index_record_size_bits; /* log2(index_record_size) */
- LCN nr_clusters; /* Volume size in clusters == number of
- bits in lcn bitmap. */
- LCN mft_lcn; /* Cluster location of mft data. */
- LCN mftmirr_lcn; /* Cluster location of copy of mft. */
- u64 serial_no; /* The volume serial number. */
- /* Mount specific NTFS information. */
- u32 upcase_len; /* Number of entries in upcase[]. */
- ntfschar *upcase; /* The upcase table. */
-
- s32 attrdef_size; /* Size of the attribute definition
- table in bytes. */
- ATTR_DEF *attrdef; /* Table of attribute definitions.
- Obtained from FILE_AttrDef. */
-
-#ifdef NTFS_RW
- /* Variables used by the cluster and mft allocators. */
- s64 mft_data_pos; /* Mft record number at which to
- allocate the next mft record. */
- LCN mft_zone_start; /* First cluster of the mft zone. */
- LCN mft_zone_end; /* First cluster beyond the mft zone. */
- LCN mft_zone_pos; /* Current position in the mft zone. */
- LCN data1_zone_pos; /* Current position in the first data
- zone. */
- LCN data2_zone_pos; /* Current position in the second data
- zone. */
-#endif /* NTFS_RW */
-
- struct inode *mft_ino; /* The VFS inode of $MFT. */
-
- struct inode *mftbmp_ino; /* Attribute inode for $MFT/$BITMAP. */
- struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the
- mft record bitmap ($MFT/$BITMAP). */
-#ifdef NTFS_RW
- struct inode *mftmirr_ino; /* The VFS inode of $MFTMirr. */
- int mftmirr_size; /* Size of mft mirror in mft records. */
-
- struct inode *logfile_ino; /* The VFS inode of $LogFile. */
-#endif /* NTFS_RW */
-
- struct inode *lcnbmp_ino; /* The VFS inode of $Bitmap. */
- struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the
- cluster bitmap ($Bitmap/$DATA). */
-
- struct inode *vol_ino; /* The VFS inode of $Volume. */
- VOLUME_FLAGS vol_flags; /* Volume flags. */
- u8 major_ver; /* Ntfs major version of volume. */
- u8 minor_ver; /* Ntfs minor version of volume. */
-
- struct inode *root_ino; /* The VFS inode of the root
- directory. */
- struct inode *secure_ino; /* The VFS inode of $Secure (NTFS3.0+
- only, otherwise NULL). */
- struct inode *extend_ino; /* The VFS inode of $Extend (NTFS3.0+
- only, otherwise NULL). */
-#ifdef NTFS_RW
- /* $Quota stuff is NTFS3.0+ specific. Unused/NULL otherwise. */
- struct inode *quota_ino; /* The VFS inode of $Quota. */
- struct inode *quota_q_ino; /* Attribute inode for $Quota/$Q. */
- /* $UsnJrnl stuff is NTFS3.0+ specific. Unused/NULL otherwise. */
- struct inode *usnjrnl_ino; /* The VFS inode of $UsnJrnl. */
- struct inode *usnjrnl_max_ino; /* Attribute inode for $UsnJrnl/$Max. */
- struct inode *usnjrnl_j_ino; /* Attribute inode for $UsnJrnl/$J. */
-#endif /* NTFS_RW */
- struct nls_table *nls_map;
-} ntfs_volume;
-
-/*
- * Defined bits for the flags field in the ntfs_volume structure.
- */
-typedef enum {
- NV_Errors, /* 1: Volume has errors, prevent remount rw. */
- NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). */
- NV_CaseSensitive, /* 1: Treat file names as case sensitive and
- create filenames in the POSIX namespace.
- Otherwise be case insensitive but still
- create file names in POSIX namespace. */
- NV_LogFileEmpty, /* 1: $LogFile journal is empty. */
- NV_QuotaOutOfDate, /* 1: $Quota is out of date. */
- NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */
- NV_SparseEnabled, /* 1: May create sparse files. */
-} ntfs_volume_flags;
-
-/*
- * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo()
- * functions.
- */
-#define DEFINE_NVOL_BIT_OPS(flag) \
-static inline int NVol##flag(ntfs_volume *vol) \
-{ \
- return test_bit(NV_##flag, &(vol)->flags); \
-} \
-static inline void NVolSet##flag(ntfs_volume *vol) \
-{ \
- set_bit(NV_##flag, &(vol)->flags); \
-} \
-static inline void NVolClear##flag(ntfs_volume *vol) \
-{ \
- clear_bit(NV_##flag, &(vol)->flags); \
-}
-
-/* Emit the ntfs volume bitops functions. */
-DEFINE_NVOL_BIT_OPS(Errors)
-DEFINE_NVOL_BIT_OPS(ShowSystemFiles)
-DEFINE_NVOL_BIT_OPS(CaseSensitive)
-DEFINE_NVOL_BIT_OPS(LogFileEmpty)
-DEFINE_NVOL_BIT_OPS(QuotaOutOfDate)
-DEFINE_NVOL_BIT_OPS(UsnJrnlStamped)
-DEFINE_NVOL_BIT_OPS(SparseEnabled)
-
-#endif /* _LINUX_NTFS_VOLUME_H */
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index cae41db0aaa7..084d19d78397 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -431,7 +431,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry,
* fnd contains tree's path to insert to.
* If fnd is not NULL then dir is locked.
*/
- inode = ntfs_create_inode(mnt_idmap(file->f_path.mnt), dir, dentry, uni,
+ inode = ntfs_create_inode(file_mnt_idmap(file), dir, dentry, uni,
mode, 0, NULL, 0, fnd);
err = IS_ERR(inode) ? PTR_ERR(inode) :
finish_open(file, dentry, ntfs_file_open);
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index cef5467fd928..9df7c20d066f 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -1825,7 +1825,7 @@ static int __init init_ntfs_fs(void)
ntfs_inode_cachep = kmem_cache_create(
"ntfs_inode_cache", sizeof(struct ntfs_inode), 0,
- (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+ (SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT),
init_once);
if (!ntfs_inode_cachep) {
err = -ENOMEM;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4d7efefa98c5..1bde1281d514 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -213,7 +213,7 @@ struct o2hb_region {
unsigned int hr_num_pages;
struct page **hr_slot_data;
- struct bdev_handle *hr_bdev_handle;
+ struct file *hr_bdev_file;
struct o2hb_disk_slot *hr_slots;
/* live node map of this region */
@@ -263,7 +263,7 @@ struct o2hb_region {
static inline struct block_device *reg_bdev(struct o2hb_region *reg)
{
- return reg->hr_bdev_handle ? reg->hr_bdev_handle->bdev : NULL;
+ return reg->hr_bdev_file ? file_bdev(reg->hr_bdev_file) : NULL;
}
struct o2hb_bio_wait_ctxt {
@@ -1509,8 +1509,8 @@ static void o2hb_region_release(struct config_item *item)
kfree(reg->hr_slot_data);
}
- if (reg->hr_bdev_handle)
- bdev_release(reg->hr_bdev_handle);
+ if (reg->hr_bdev_file)
+ fput(reg->hr_bdev_file);
kfree(reg->hr_slots);
@@ -1569,7 +1569,7 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
unsigned long block_bytes;
unsigned int block_bits;
- if (reg->hr_bdev_handle)
+ if (reg->hr_bdev_file)
return -EINVAL;
status = o2hb_read_block_input(reg, page, &block_bytes,
@@ -1598,7 +1598,7 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item,
char *p = (char *)page;
ssize_t ret;
- if (reg->hr_bdev_handle)
+ if (reg->hr_bdev_file)
return -EINVAL;
ret = kstrtoull(p, 0, &tmp);
@@ -1623,7 +1623,7 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item,
unsigned long tmp;
char *p = (char *)page;
- if (reg->hr_bdev_handle)
+ if (reg->hr_bdev_file)
return -EINVAL;
tmp = simple_strtoul(p, &p, 0);
@@ -1642,7 +1642,7 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
{
unsigned int ret = 0;
- if (to_o2hb_region(item)->hr_bdev_handle)
+ if (to_o2hb_region(item)->hr_bdev_file)
ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item)));
return ret;
@@ -1753,7 +1753,7 @@ out:
}
/*
- * this is acting as commit; we set up all of hr_bdev_handle and hr_task or
+ * this is acting as commit; we set up all of hr_bdev_file and hr_task or
* nothing
*/
static ssize_t o2hb_region_dev_store(struct config_item *item,
@@ -1769,7 +1769,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
ssize_t ret = -EINVAL;
int live_threshold;
- if (reg->hr_bdev_handle)
+ if (reg->hr_bdev_file)
goto out;
/* We can't heartbeat without having had our node number
@@ -1795,11 +1795,11 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
if (!S_ISBLK(f.file->f_mapping->host->i_mode))
goto out2;
- reg->hr_bdev_handle = bdev_open_by_dev(f.file->f_mapping->host->i_rdev,
+ reg->hr_bdev_file = bdev_file_open_by_dev(f.file->f_mapping->host->i_rdev,
BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL);
- if (IS_ERR(reg->hr_bdev_handle)) {
- ret = PTR_ERR(reg->hr_bdev_handle);
- reg->hr_bdev_handle = NULL;
+ if (IS_ERR(reg->hr_bdev_file)) {
+ ret = PTR_ERR(reg->hr_bdev_file);
+ reg->hr_bdev_file = NULL;
goto out2;
}
@@ -1903,8 +1903,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
out3:
if (ret < 0) {
- bdev_release(reg->hr_bdev_handle);
- reg->hr_bdev_handle = NULL;
+ fput(reg->hr_bdev_file);
+ reg->hr_bdev_file = NULL;
}
out2:
fdput(f);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 85215162c9dd..7fc0e920eda7 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -578,7 +578,7 @@ static int __init init_dlmfs_fs(void)
dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
sizeof(struct dlmfs_inode_private),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
dlmfs_init_once);
if (!dlmfs_inode_cache) {
status = -ENOMEM;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 64a6ef638495..cb40cafbc062 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1615,7 +1615,7 @@ update_holders:
unlock:
lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
- /* ocfs2_unblock_lock reques on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
+ /* ocfs2_unblock_lock request on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED);
spin_unlock_irqrestore(&lockres->l_lock, flags);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8b6d15010703..0da8e7bd3261 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2763,6 +2763,7 @@ const struct inode_operations ocfs2_file_iops = {
const struct inode_operations ocfs2_special_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
+ .listxattr = ocfs2_listxattr,
.permission = ocfs2_permission,
.get_inode_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 82b28fdacc7e..accf03d4765e 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -65,7 +65,7 @@ struct ocfs2_inode_info
tid_t i_sync_tid;
tid_t i_datasync_tid;
- struct dquot *i_dquot[MAXQUOTAS];
+ struct dquot __rcu *i_dquot[MAXQUOTAS];
};
/*
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index f37174e79fad..6de944818c56 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -27,7 +27,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
struct ocfs2_file_private *fp = file->private_data;
struct ocfs2_lock_res *lockres = &fp->fp_flock;
- if (fl->fl_type == F_WRLCK)
+ if (lock_is_write(fl))
level = 1;
if (!IS_SETLKW(cmd))
trylock = 1;
@@ -53,8 +53,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
*/
locks_init_lock(&request);
- request.fl_type = F_UNLCK;
- request.fl_flags = FL_FLOCK;
+ request.c.flc_type = F_UNLCK;
+ request.c.flc_flags = FL_FLOCK;
locks_lock_file_wait(file, &request);
ocfs2_file_unlock(file);
@@ -100,14 +100,14 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if (!(fl->fl_flags & FL_FLOCK))
+ if (!(fl->c.flc_flags & FL_FLOCK))
return -ENOLCK;
if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
ocfs2_mount_local(osb))
return locks_lock_file_wait(file, fl);
- if (fl->fl_type == F_UNLCK)
+ if (lock_is_unlock(fl))
return ocfs2_do_funlock(file, cmd, fl);
else
return ocfs2_do_flock(file, inode, cmd, fl);
@@ -118,7 +118,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if (!(fl->fl_flags & FL_POSIX))
+ if (!(fl->c.flc_flags & FL_POSIX))
return -ENOLCK;
return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index dc9f76ab7e13..0575c2d060eb 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -447,14 +447,17 @@ int ocfs2_global_write_info(struct super_block *sb, int type)
int err;
struct quota_info *dqopt = sb_dqopt(sb);
struct ocfs2_mem_dqinfo *info = dqopt->info[type].dqi_priv;
+ unsigned int memalloc;
down_write(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
err = ocfs2_qinfo_lock(info, 1);
if (err < 0)
goto out_sem;
err = __ocfs2_global_write_info(sb, type);
ocfs2_qinfo_unlock(info, 1);
out_sem:
+ memalloc_nofs_restore(memalloc);
up_write(&dqopt->dqio_sem);
return err;
}
@@ -601,6 +604,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
struct ocfs2_super *osb = OCFS2_SB(sb);
int status = 0;
+ unsigned int memalloc;
trace_ocfs2_sync_dquot_helper(from_kqid(&init_user_ns, dquot->dq_id),
dquot->dq_id.type,
@@ -618,6 +622,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
goto out_ilock;
}
down_write(&sb_dqopt(sb)->dqio_sem);
+ memalloc = memalloc_nofs_save();
status = ocfs2_sync_dquot(dquot);
if (status < 0)
mlog_errno(status);
@@ -625,6 +630,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
status = ocfs2_local_write_dquot(dquot);
if (status < 0)
mlog_errno(status);
+ memalloc_nofs_restore(memalloc);
up_write(&sb_dqopt(sb)->dqio_sem);
ocfs2_commit_trans(osb, handle);
out_ilock:
@@ -662,6 +668,7 @@ static int ocfs2_write_dquot(struct dquot *dquot)
handle_t *handle;
struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
int status = 0;
+ unsigned int memalloc;
trace_ocfs2_write_dquot(from_kqid(&init_user_ns, dquot->dq_id),
dquot->dq_id.type);
@@ -673,7 +680,9 @@ static int ocfs2_write_dquot(struct dquot *dquot)
goto out;
}
down_write(&sb_dqopt(dquot->dq_sb)->dqio_sem);
+ memalloc = memalloc_nofs_save();
status = ocfs2_local_write_dquot(dquot);
+ memalloc_nofs_restore(memalloc);
up_write(&sb_dqopt(dquot->dq_sb)->dqio_sem);
ocfs2_commit_trans(osb, handle);
out:
@@ -920,6 +929,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
handle_t *handle;
struct ocfs2_super *osb = OCFS2_SB(sb);
+ unsigned int memalloc;
trace_ocfs2_mark_dquot_dirty(from_kqid(&init_user_ns, dquot->dq_id),
type);
@@ -946,6 +956,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
goto out_ilock;
}
down_write(&sb_dqopt(sb)->dqio_sem);
+ memalloc = memalloc_nofs_save();
status = ocfs2_sync_dquot(dquot);
if (status < 0) {
mlog_errno(status);
@@ -954,6 +965,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
/* Now write updated local dquot structure */
status = ocfs2_local_write_dquot(dquot);
out_dlock:
+ memalloc_nofs_restore(memalloc);
up_write(&sb_dqopt(sb)->dqio_sem);
ocfs2_commit_trans(osb, handle);
out_ilock:
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index e09842fc9d4d..8ce462c64c51 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -470,6 +470,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
int bit, chunk;
struct ocfs2_recovery_chunk *rchunk, *next;
qsize_t spacechange, inodechange;
+ unsigned int memalloc;
trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type);
@@ -521,6 +522,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
goto out_drop_lock;
}
down_write(&sb_dqopt(sb)->dqio_sem);
+ memalloc = memalloc_nofs_save();
spin_lock(&dquot->dq_dqb_lock);
/* Add usage from quota entry into quota changes
* of our node. Auxiliary variables are important
@@ -553,6 +555,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
unlock_buffer(qbh);
ocfs2_journal_dirty(handle, qbh);
out_commit:
+ memalloc_nofs_restore(memalloc);
up_write(&sb_dqopt(sb)->dqio_sem);
ocfs2_commit_trans(OCFS2_SB(sb), handle);
out_drop_lock:
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 9b76ee66aeb2..c11406cd87a8 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -744,7 +744,7 @@ static int user_plock(struct ocfs2_cluster_connection *conn,
return dlm_posix_cancel(conn->cc_lockspace, ino, file, fl);
else if (IS_GETLK(cmd))
return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
- else if (fl->fl_type == F_UNLCK)
+ else if (lock_is_unlock(fl))
return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
else
return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 6b906424902b..8aabaed2c1cb 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -122,7 +122,7 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
static int ocfs2_enable_quotas(struct ocfs2_super *osb);
static void ocfs2_disable_quotas(struct ocfs2_super *osb);
-static struct dquot **ocfs2_get_dquots(struct inode *inode)
+static struct dquot __rcu **ocfs2_get_dquots(struct inode *inode)
{
return OCFS2_I(inode)->i_dquot;
}
@@ -1706,18 +1706,17 @@ static int ocfs2_initialize_mem_caches(void)
sizeof(struct ocfs2_inode_info),
0,
(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
ocfs2_inode_init_once);
ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
sizeof(struct ocfs2_dquot),
0,
- (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
NULL);
ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
sizeof(struct ocfs2_quota_chunk),
0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ SLAB_RECLAIM_ACCOUNT,
NULL);
if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
!ocfs2_qf_chunk_cachep) {
@@ -2027,8 +2026,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
- memcpy(&sb->s_uuid, di->id2.i_super.s_uuid,
- sizeof(di->id2.i_super.s_uuid));
+ super_set_uuid(sb, di->id2.i_super.s_uuid,
+ sizeof(di->id2.i_super.s_uuid));
osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
diff --git a/fs/open.c b/fs/open.c
index a84d21e55c39..ee8460c83c77 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,7 +29,6 @@
#include <linux/audit.h>
#include <linux/falloc.h>
#include <linux/fs_struct.h>
-#include <linux/ima.h>
#include <linux/dnotify.h>
#include <linux/compat.h>
#include <linux/mnt_idmapping.h>
@@ -154,49 +153,52 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length
}
#endif
-long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+long do_ftruncate(struct file *file, loff_t length, int small)
{
struct inode *inode;
struct dentry *dentry;
- struct fd f;
int error;
- error = -EINVAL;
- if (length < 0)
- goto out;
- error = -EBADF;
- f = fdget(fd);
- if (!f.file)
- goto out;
-
/* explicitly opened as large or we are on 64-bit box */
- if (f.file->f_flags & O_LARGEFILE)
+ if (file->f_flags & O_LARGEFILE)
small = 0;
- dentry = f.file->f_path.dentry;
+ dentry = file->f_path.dentry;
inode = dentry->d_inode;
- error = -EINVAL;
- if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
- goto out_putf;
+ if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
+ return -EINVAL;
- error = -EINVAL;
/* Cannot ftruncate over 2^31 bytes without large file support */
if (small && length > MAX_NON_LFS)
- goto out_putf;
+ return -EINVAL;
- error = -EPERM;
/* Check IS_APPEND on real upper inode */
- if (IS_APPEND(file_inode(f.file)))
- goto out_putf;
+ if (IS_APPEND(file_inode(file)))
+ return -EPERM;
sb_start_write(inode->i_sb);
- error = security_file_truncate(f.file);
+ error = security_file_truncate(file);
if (!error)
- error = do_truncate(file_mnt_idmap(f.file), dentry, length,
- ATTR_MTIME | ATTR_CTIME, f.file);
+ error = do_truncate(file_mnt_idmap(file), dentry, length,
+ ATTR_MTIME | ATTR_CTIME, file);
sb_end_write(inode->i_sb);
-out_putf:
+
+ return error;
+}
+
+long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+{
+ struct fd f;
+ int error;
+
+ if (length < 0)
+ return -EINVAL;
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ error = do_ftruncate(f.file, length, small);
+
fdput(f);
-out:
return error;
}
@@ -1364,7 +1366,7 @@ struct file *filp_open(const char *filename, int flags, umode_t mode)
{
struct filename *name = getname_kernel(filename);
struct file *file = ERR_CAST(name);
-
+
if (!IS_ERR(name)) {
file = file_open_name(name, flags, mode);
putname(name);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index c4b65a6d41cc..4a0779e3ef79 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -446,7 +446,7 @@ static int __init init_openprom_fs(void)
sizeof(struct op_inode_info),
0,
(SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
op_inode_init_once);
if (!op_inode_cachep)
return -ENOMEM;
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
index 3b6982bf6bcf..e75e173a9186 100644
--- a/fs/orangefs/orangefs-cache.c
+++ b/fs/orangefs/orangefs-cache.c
@@ -22,7 +22,7 @@ int op_cache_initialize(void)
op_cache = kmem_cache_create("orangefs_op_cache",
sizeof(struct orangefs_kernel_op_s),
0,
- ORANGEFS_CACHE_CREATE_FLAGS,
+ 0,
NULL);
if (!op_cache) {
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 926d9c0a428a..e2df7eeadc7a 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -93,16 +93,6 @@ enum orangefs_vfs_op_states {
OP_VFS_STATE_GIVEN_UP = 16,
};
-/*
- * orangefs kernel memory related flags
- */
-
-#if (defined CONFIG_DEBUG_SLAB)
-#define ORANGEFS_CACHE_CREATE_FLAGS SLAB_RED_ZONE
-#else
-#define ORANGEFS_CACHE_CREATE_FLAGS 0
-#endif
-
extern const struct xattr_handler * const orangefs_xattr_handlers[];
extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu);
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 5254256a224d..34849b4a3243 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -527,7 +527,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
if (!ORANGEFS_SB(sb)) {
d = ERR_PTR(-ENOMEM);
- goto free_sb_and_op;
+ goto free_op;
}
ret = orangefs_fill_sb(sb,
@@ -644,7 +644,7 @@ int orangefs_inode_cache_initialize(void)
"orangefs_inode_cache",
sizeof(struct orangefs_inode_s),
0,
- ORANGEFS_CACHE_CREATE_FLAGS,
+ 0,
offsetof(struct orangefs_inode_s,
link_target),
sizeof_field(struct orangefs_inode_s,
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 8586e2f5d243..0762575a1e70 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -234,11 +234,11 @@ static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen)
{
loff_t tmp;
- if (WARN_ON_ONCE(pos != pos2))
+ if (pos != pos2)
return -EIO;
- if (WARN_ON_ONCE(pos < 0 || len < 0 || totlen < 0))
+ if (pos < 0 || len < 0 || totlen < 0)
return -EIO;
- if (WARN_ON_ONCE(check_add_overflow(pos, len, &tmp)))
+ if (check_add_overflow(pos, len, &tmp))
return -EIO;
return 0;
}
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 112b4b12f825..36dcc530ac28 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -280,12 +280,20 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path,
{
struct ovl_fs_context *ctx = fc->fs_private;
- if (ovl_dentry_weird(path->dentry))
- return invalfc(fc, "filesystem on %s not supported", name);
-
if (!d_is_dir(path->dentry))
return invalfc(fc, "%s is not a directory", name);
+ /*
+ * Root dentries of case-insensitive capable filesystems might
+ * not have the dentry operations set, but still be incompatible
+ * with overlayfs. Check explicitly to prevent post-mount
+ * failures.
+ */
+ if (sb_has_encoding(path->mnt->mnt_sb))
+ return invalfc(fc, "case-insensitive capable filesystem on %s not supported", name);
+
+ if (ovl_dentry_weird(path->dentry))
+ return invalfc(fc, "filesystem on %s not supported", name);
/*
* Check whether upper path is read-only here to report failures
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 2eef6c70b2ae..a40fc7e05525 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -28,41 +28,38 @@ MODULE_LICENSE("GPL");
struct ovl_dir_cache;
-static struct dentry *ovl_d_real(struct dentry *dentry,
- const struct inode *inode)
+static struct dentry *ovl_d_real(struct dentry *dentry, enum d_real_type type)
{
- struct dentry *real = NULL, *lower;
+ struct dentry *upper, *lower;
int err;
- /*
- * vfs is only expected to call d_real() with NULL from d_real_inode()
- * and with overlay inode from file_dentry() on an overlay file.
- *
- * TODO: remove @inode argument from d_real() API, remove code in this
- * function that deals with non-NULL @inode and remove d_real() call
- * from file_dentry().
- */
- if (inode && d_inode(dentry) == inode)
- return dentry;
- else if (inode)
+ switch (type) {
+ case D_REAL_DATA:
+ case D_REAL_METADATA:
+ break;
+ default:
goto bug;
+ }
if (!d_is_reg(dentry)) {
/* d_real_inode() is only relevant for regular files */
return dentry;
}
- real = ovl_dentry_upper(dentry);
- if (real && (inode == d_inode(real)))
- return real;
+ upper = ovl_dentry_upper(dentry);
+ if (upper && (type == D_REAL_METADATA ||
+ ovl_has_upperdata(d_inode(dentry))))
+ return upper;
- if (real && !inode && ovl_has_upperdata(d_inode(dentry)))
- return real;
+ if (type == D_REAL_METADATA) {
+ lower = ovl_dentry_lower(dentry);
+ goto real_lower;
+ }
/*
- * Best effort lazy lookup of lowerdata for !inode case to return
+ * Best effort lazy lookup of lowerdata for D_REAL_DATA case to return
* the real lowerdata dentry. The only current caller of d_real() with
- * NULL inode is d_real_inode() from trace_uprobe and this caller is
+ * D_REAL_DATA is d_real_inode() from trace_uprobe and this caller is
* likely going to be followed reading from the file, before placing
* uprobes on offset within the file, so lowerdata should be available
* when setting the uprobe.
@@ -73,18 +70,13 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
lower = ovl_dentry_lowerdata(dentry);
if (!lower)
goto bug;
- real = lower;
- /* Handle recursion */
- real = d_real(real, inode);
+real_lower:
+ /* Handle recursion into stacked lower fs */
+ return d_real(lower, type);
- if (!inode || inode == d_inode(real))
- return real;
bug:
- WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n",
- __func__, dentry, inode ? inode->i_sb->s_id : "NULL",
- inode ? inode->i_ino : 0, real,
- real && d_inode(real) ? d_inode(real)->i_ino : 0);
+ WARN(1, "%s(%pd4, %d): real dentry not found\n", __func__, dentry, type);
return dentry;
}
@@ -1511,7 +1503,7 @@ static int __init ovl_init(void)
ovl_inode_cachep = kmem_cache_create("ovl_inode",
sizeof(struct ovl_inode), 0,
(SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
ovl_inode_init_once);
if (ovl_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index a8e17f14d7a2..d285d1d7baad 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -774,13 +774,14 @@ bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
const struct path *upperpath)
{
bool set = false;
+ uuid_t uuid;
int res;
/* Try to load existing persistent uuid */
- res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, sb->s_uuid.b,
+ res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, uuid.b,
UUID_SIZE);
if (res == UUID_SIZE)
- return true;
+ goto set_uuid;
if (res != -ENODATA)
goto fail;
@@ -808,17 +809,20 @@ bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
}
/* Generate overlay instance uuid */
- uuid_gen(&sb->s_uuid);
+ uuid_gen(&uuid);
/* Try to store persistent uuid */
set = true;
- res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, sb->s_uuid.b,
+ res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, uuid.b,
UUID_SIZE);
- if (res == 0)
- return true;
+ if (res)
+ goto fail;
+
+set_uuid:
+ super_set_uuid(sb, uuid.b, sizeof(uuid));
+ return true;
fail:
- memset(sb->s_uuid.b, 0, UUID_SIZE);
ofs->config.uuid = OVL_UUID_NULL;
pr_warn("failed to %s uuid (%pd2, err=%i); falling back to uuid=null.\n",
set ? "set" : "get", upperpath->dentry, res);
diff --git a/fs/pidfs.c b/fs/pidfs.c
new file mode 100644
index 000000000000..a63d5d24aa02
--- /dev/null
+++ b/fs/pidfs.c
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <linux/mount.h>
+#include <linux/pid.h>
+#include <linux/pidfs.h>
+#include <linux/pid_namespace.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
+#include <linux/pseudo_fs.h>
+#include <linux/seq_file.h>
+#include <uapi/linux/pidfd.h>
+
+#include "internal.h"
+
+#ifdef CONFIG_PROC_FS
+/**
+ * pidfd_show_fdinfo - print information about a pidfd
+ * @m: proc fdinfo file
+ * @f: file referencing a pidfd
+ *
+ * Pid:
+ * This function will print the pid that a given pidfd refers to in the
+ * pid namespace of the procfs instance.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its pid. This is
+ * similar to calling getppid() on a process whose parent is outside of
+ * its pid namespace.
+ *
+ * NSpid:
+ * If pid namespaces are supported then this function will also print
+ * the pid a given pidfd refers to for all descendant pid namespaces
+ * starting from the current pid namespace of the instance, i.e. the
+ * Pid field and the first entry in the NSpid field will be identical.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its first NSpid
+ * entry and no others will be shown.
+ * Note that this differs from the Pid and NSpid fields in
+ * /proc/<pid>/status where Pid and NSpid are always shown relative to
+ * the pid namespace of the procfs instance. The difference becomes
+ * obvious when sending around a pidfd between pid namespaces from a
+ * different branch of the tree, i.e. where no ancestral relation is
+ * present between the pid namespaces:
+ * - create two new pid namespaces ns1 and ns2 in the initial pid
+ * namespace (also take care to create new mount namespaces in the
+ * new pid namespace and mount procfs)
+ * - create a process with a pidfd in ns1
+ * - send pidfd from ns1 to ns2
+ * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
+ * have exactly one entry, which is 0
+ */
+static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+ struct pid *pid = pidfd_pid(f);
+ struct pid_namespace *ns;
+ pid_t nr = -1;
+
+ if (likely(pid_has_task(pid, PIDTYPE_PID))) {
+ ns = proc_pid_ns(file_inode(m->file)->i_sb);
+ nr = pid_nr_ns(pid, ns);
+ }
+
+ seq_put_decimal_ll(m, "Pid:\t", nr);
+
+#ifdef CONFIG_PID_NS
+ seq_put_decimal_ll(m, "\nNSpid:\t", nr);
+ if (nr > 0) {
+ int i;
+
+ /* If nr is positive it means that 'pid' is valid and that
+ * ns, i.e. the pid namespace associated with the procfs
+ * instance, is in the pid namespace hierarchy of pid.
+ * Start at one below the already printed level.
+ */
+ for (i = ns->level + 1; i <= pid->level; i++)
+ seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
+ }
+#endif
+ seq_putc(m, '\n');
+}
+#endif
+
+/*
+ * Poll support for process exit notification.
+ */
+static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct pid *pid = pidfd_pid(file);
+ bool thread = file->f_flags & PIDFD_THREAD;
+ struct task_struct *task;
+ __poll_t poll_flags = 0;
+
+ poll_wait(file, &pid->wait_pidfd, pts);
+ /*
+ * Depending on PIDFD_THREAD, inform pollers when the thread
+ * or the whole thread-group exits.
+ */
+ guard(rcu)();
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
+ else if (task->exit_state && (thread || thread_group_empty(task)))
+ poll_flags = EPOLLIN | EPOLLRDNORM;
+
+ return poll_flags;
+}
+
+static const struct file_operations pidfs_file_operations = {
+ .poll = pidfd_poll,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = pidfd_show_fdinfo,
+#endif
+};
+
+struct pid *pidfd_pid(const struct file *file)
+{
+ if (file->f_op != &pidfs_file_operations)
+ return ERR_PTR(-EBADF);
+ return file_inode(file)->i_private;
+}
+
+static struct vfsmount *pidfs_mnt __ro_after_init;
+
+#if BITS_PER_LONG == 32
+/*
+ * Provide a fallback mechanism for 32-bit systems so processes remain
+ * reliably comparable by inode number even on those systems.
+ */
+static DEFINE_IDA(pidfd_inum_ida);
+
+static int pidfs_inum(struct pid *pid, unsigned long *ino)
+{
+ int ret;
+
+ ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1,
+ UINT_MAX, GFP_ATOMIC);
+ if (ret < 0)
+ return -ENOSPC;
+
+ *ino = ret;
+ return 0;
+}
+
+static inline void pidfs_free_inum(unsigned long ino)
+{
+ if (ino > 0)
+ ida_free(&pidfd_inum_ida, ino);
+}
+#else
+static inline int pidfs_inum(struct pid *pid, unsigned long *ino)
+{
+ *ino = pid->ino;
+ return 0;
+}
+#define pidfs_free_inum(ino) ((void)(ino))
+#endif
+
+/*
+ * The vfs falls back to simple_setattr() if i_op->setattr() isn't
+ * implemented. Let's reject it completely until we have a clean
+ * permission concept for pidfds.
+ */
+static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
+static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ return 0;
+}
+
+static const struct inode_operations pidfs_inode_operations = {
+ .getattr = pidfs_getattr,
+ .setattr = pidfs_setattr,
+};
+
+static void pidfs_evict_inode(struct inode *inode)
+{
+ struct pid *pid = inode->i_private;
+
+ clear_inode(inode);
+ put_pid(pid);
+ pidfs_free_inum(inode->i_ino);
+}
+
+static const struct super_operations pidfs_sops = {
+ .drop_inode = generic_delete_inode,
+ .evict_inode = pidfs_evict_inode,
+ .statfs = simple_statfs,
+};
+
+static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+ struct inode *inode = d_inode(dentry);
+ struct pid *pid = inode->i_private;
+
+ return dynamic_dname(buffer, buflen, "pidfd:[%llu]", pid->ino);
+}
+
+static const struct dentry_operations pidfs_dentry_operations = {
+ .d_delete = always_delete_dentry,
+ .d_dname = pidfs_dname,
+ .d_prune = stashed_dentry_prune,
+};
+
+static int pidfs_init_inode(struct inode *inode, void *data)
+{
+ inode->i_private = data;
+ inode->i_flags |= S_PRIVATE;
+ inode->i_mode |= S_IRWXU;
+ inode->i_op = &pidfs_inode_operations;
+ inode->i_fop = &pidfs_file_operations;
+ /*
+ * Inode numbering for pidfs starts at RESERVED_PIDS + 1. This
+ * avoids collisions with the root inode which is 1 for pseudo
+ * filesystems.
+ */
+ return pidfs_inum(data, &inode->i_ino);
+}
+
+static void pidfs_put_data(void *data)
+{
+ struct pid *pid = data;
+ put_pid(pid);
+}
+
+static const struct stashed_operations pidfs_stashed_ops = {
+ .init_inode = pidfs_init_inode,
+ .put_data = pidfs_put_data,
+};
+
+static int pidfs_init_fs_context(struct fs_context *fc)
+{
+ struct pseudo_fs_context *ctx;
+
+ ctx = init_pseudo(fc, PID_FS_MAGIC);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ops = &pidfs_sops;
+ ctx->dops = &pidfs_dentry_operations;
+ fc->s_fs_info = (void *)&pidfs_stashed_ops;
+ return 0;
+}
+
+static struct file_system_type pidfs_type = {
+ .name = "pidfs",
+ .init_fs_context = pidfs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
+{
+
+ struct file *pidfd_file;
+ struct path path;
+ int ret;
+
+ ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ pidfd_file = dentry_open(&path, flags, current_cred());
+ path_put(&path);
+ return pidfd_file;
+}
+
+void __init pidfs_init(void)
+{
+ pidfs_mnt = kern_mount(&pidfs_type);
+ if (IS_ERR(pidfs_mnt))
+ panic("Failed to mount pidfs pseudo filesystem");
+}
diff --git a/fs/pipe.c b/fs/pipe.c
index f1adbfe743d4..50c8a8596b52 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -76,18 +76,20 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
* -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
*/
-static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
+#define cmp_int(l, r) ((l > r) - (l < r))
+
+#ifdef CONFIG_PROVE_LOCKING
+static int pipe_lock_cmp_fn(const struct lockdep_map *a,
+ const struct lockdep_map *b)
{
- if (pipe->files)
- mutex_lock_nested(&pipe->mutex, subclass);
+ return cmp_int((unsigned long) a, (unsigned long) b);
}
+#endif
void pipe_lock(struct pipe_inode_info *pipe)
{
- /*
- * pipe_lock() nests non-pipe inode locks (for writing to a file)
- */
- pipe_lock_nested(pipe, I_MUTEX_PARENT);
+ if (pipe->files)
+ mutex_lock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_lock);
@@ -98,28 +100,16 @@ void pipe_unlock(struct pipe_inode_info *pipe)
}
EXPORT_SYMBOL(pipe_unlock);
-static inline void __pipe_lock(struct pipe_inode_info *pipe)
-{
- mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
-}
-
-static inline void __pipe_unlock(struct pipe_inode_info *pipe)
-{
- mutex_unlock(&pipe->mutex);
-}
-
void pipe_double_lock(struct pipe_inode_info *pipe1,
struct pipe_inode_info *pipe2)
{
BUG_ON(pipe1 == pipe2);
- if (pipe1 < pipe2) {
- pipe_lock_nested(pipe1, I_MUTEX_PARENT);
- pipe_lock_nested(pipe2, I_MUTEX_CHILD);
- } else {
- pipe_lock_nested(pipe2, I_MUTEX_PARENT);
- pipe_lock_nested(pipe1, I_MUTEX_CHILD);
- }
+ if (pipe1 > pipe2)
+ swap(pipe1, pipe2);
+
+ pipe_lock(pipe1);
+ pipe_lock(pipe2);
}
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
@@ -271,7 +261,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
return 0;
ret = 0;
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
/*
* We only wake up writers if the pipe was full when we started
@@ -368,7 +358,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
ret = -EAGAIN;
break;
}
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
/*
* We only get here if we didn't actually read anything.
@@ -400,13 +390,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
return -ERESTARTSYS;
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
wake_next_reader = true;
}
if (pipe_empty(pipe->head, pipe->tail))
wake_next_reader = false;
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
if (was_full)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
@@ -462,7 +452,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
if (unlikely(total_len == 0))
return 0;
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
@@ -582,19 +572,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
* after waiting we need to re-check whether the pipe
* become empty while we dropped the lock.
*/
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
if (was_empty)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
was_empty = pipe_empty(pipe->head, pipe->tail);
wake_next_writer = true;
}
out:
if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
wake_next_writer = false;
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
/*
* If we do do a wakeup event, we do a 'sync' wakeup, because we
@@ -629,7 +619,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
switch (cmd) {
case FIONREAD:
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
count = 0;
head = pipe->head;
tail = pipe->tail;
@@ -639,16 +629,16 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
count += pipe->bufs[tail & mask].len;
tail++;
}
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
return put_user(count, (int __user *)arg);
#ifdef CONFIG_WATCH_QUEUE
case IOC_WATCH_QUEUE_SET_SIZE: {
int ret;
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
ret = watch_queue_set_size(pipe, arg);
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
return ret;
}
@@ -734,7 +724,7 @@ pipe_release(struct inode *inode, struct file *file)
{
struct pipe_inode_info *pipe = file->private_data;
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
if (file->f_mode & FMODE_READ)
pipe->readers--;
if (file->f_mode & FMODE_WRITE)
@@ -747,7 +737,7 @@ pipe_release(struct inode *inode, struct file *file)
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
put_pipe_info(inode, pipe);
return 0;
@@ -759,7 +749,7 @@ pipe_fasync(int fd, struct file *filp, int on)
struct pipe_inode_info *pipe = filp->private_data;
int retval = 0;
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
if (filp->f_mode & FMODE_READ)
retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
@@ -768,7 +758,7 @@ pipe_fasync(int fd, struct file *filp, int on)
/* this can happen only if on == T */
fasync_helper(-1, filp, 0, &pipe->fasync_readers);
}
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
return retval;
}
@@ -834,6 +824,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
pipe->nr_accounted = pipe_bufs;
pipe->user = user;
mutex_init(&pipe->mutex);
+ lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL);
return pipe;
}
@@ -1144,7 +1135,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
filp->private_data = pipe;
/* OK, we have a pipe and it's pinned down */
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
/* We can only do regular read/write on fifos */
stream_open(inode, filp);
@@ -1214,7 +1205,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
}
/* Ok! */
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
return 0;
err_rd:
@@ -1230,7 +1221,7 @@ err_wr:
goto err;
err:
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
put_pipe_info(inode, pipe);
return ret;
@@ -1411,7 +1402,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
if (!pipe)
return -EBADF;
- __pipe_lock(pipe);
+ mutex_lock(&pipe->mutex);
switch (cmd) {
case F_SETPIPE_SZ:
@@ -1425,7 +1416,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
break;
}
- __pipe_unlock(pipe);
+ mutex_unlock(&pipe->mutex);
return ret;
}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index e1af20893ebe..3f87297dbfdb 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -26,7 +26,6 @@
#include <linux/mnt_idmapping.h>
#include <linux/iversion.h>
#include <linux/security.h>
-#include <linux/evm.h>
#include <linux/fsnotify.h>
#include <linux/filelock.h>
@@ -786,12 +785,12 @@ struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns,
return ERR_PTR(count);
if (count == 0)
return NULL;
-
+
acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl)
return ERR_PTR(-ENOMEM);
acl_e = acl->a_entries;
-
+
for (end = entry + count; entry != end; acl_e++, entry++) {
acl_e->e_tag = le16_to_cpu(entry->e_tag);
acl_e->e_perm = le16_to_cpu(entry->e_perm);
@@ -1137,7 +1136,7 @@ retry_deleg:
error = -EIO;
if (!error) {
fsnotify_xattr(dentry);
- evm_inode_post_set_acl(dentry, acl_name, kacl);
+ security_inode_post_set_acl(dentry, acl_name, kacl);
}
out_inode_unlock:
@@ -1245,7 +1244,7 @@ retry_deleg:
error = -EIO;
if (!error) {
fsnotify_xattr(dentry);
- evm_inode_post_remove_acl(idmap, dentry, acl_name);
+ security_inode_post_remove_acl(idmap, dentry, acl_name);
}
out_inode_unlock:
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 32b1116ae137..d80a1431ef7b 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -32,7 +32,7 @@ config PROC_FS
config PROC_KCORE
bool "/proc/kcore support" if !ARM
depends on PROC_FS && MMU
- select CRASH_CORE
+ select VMCORE_INFO
help
Provides a virtual ELF core file of the live kernel. This can
be read with gdb and other ELF tools. No modifications can be
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index bd08616ed8ba..7b4db9c56e6a 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -5,7 +5,7 @@
obj-y += proc.o
-CFLAGS_task_mmu.o += $(call cc-option,-Wno-override-init,)
+CFLAGS_task_mmu.o += -Wno-override-init
proc-y := nommu.o task_nommu.o
proc-$(CONFIG_MMU) := task_mmu.o
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 902b326e1e56..87dcaae32ff8 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -62,12 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
break;
dst += ret;
}
- if (ret >= 0 && boot_command_line[0]) {
- ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n",
- boot_command_line);
- if (ret > 0)
- dst += ret;
- }
+ }
+ if (cmdline_has_extra_options() && ret >= 0 && boot_command_line[0]) {
+ ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n",
+ boot_command_line);
+ if (ret > 0)
+ dst += ret;
}
out:
kfree(key);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 05350f3c2812..dcd513dccf55 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -92,7 +92,7 @@ void __init proc_init_kmemcache(void)
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
sizeof(struct proc_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+ SLAB_ACCOUNT|
SLAB_PANIC),
init_once);
pde_opener_cache =
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6422e569b080..8e08a9a1b7ed 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -10,7 +10,7 @@
* Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
*/
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/kcore.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3f78ebbb795f..23fbab954c20 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1352,8 +1352,7 @@ static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}
-static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
- struct pagemapread *pm)
+static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm)
{
pm->buffer[pm->pos++] = *pme;
if (pm->pos >= pm->len)
@@ -1380,7 +1379,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
hole_end = end;
for (; addr < hole_end; addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
goto out;
}
@@ -1392,7 +1391,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
if (vma->vm_flags & VM_SOFTDIRTY)
pme = make_pme(0, PM_SOFT_DIRTY);
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
goto out;
}
@@ -1519,7 +1518,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
for (; addr != end; addr += PAGE_SIZE) {
pagemap_entry_t pme = make_pme(frame, flags);
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
break;
if (pm->show_pfn) {
@@ -1547,7 +1546,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
pagemap_entry_t pme;
pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
break;
}
@@ -1597,7 +1596,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
for (; addr != end; addr += PAGE_SIZE) {
pagemap_entry_t pme = make_pme(frame, flags);
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
return err;
if (pm->show_pfn && (flags & PM_PRESENT))
@@ -1807,7 +1806,7 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
if (p->masks_of_interest & PAGE_IS_FILE) {
swp = pte_to_swp_entry(pte);
if (is_pfn_swap_entry(swp) &&
- !PageAnon(pfn_swap_entry_to_page(swp)))
+ !folio_test_anon(pfn_swap_entry_folio(swp)))
categories |= PAGE_IS_FILE;
}
if (pte_swp_soft_dirty(pte))
@@ -1873,7 +1872,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
if (p->masks_of_interest & PAGE_IS_FILE) {
swp = pmd_to_swp_entry(pmd);
if (is_pfn_swap_entry(swp) &&
- !PageAnon(pfn_swap_entry_to_page(swp)))
+ !folio_test_anon(pfn_swap_entry_folio(swp)))
categories |= PAGE_IS_FILE;
}
}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index d0d9bfdad30c..56815799ce79 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -307,7 +307,6 @@ int pstore_put_backend_records(struct pstore_info *psi)
{
struct pstore_private *pos, *tmp;
struct dentry *root;
- int rc = 0;
root = psinfo_lock_root();
if (!root)
@@ -317,11 +316,8 @@ int pstore_put_backend_records(struct pstore_info *psi)
list_for_each_entry_safe(pos, tmp, &records_list, list) {
if (pos->record->psi == psi) {
list_del_init(&pos->list);
- rc = simple_unlink(d_inode(root), pos->dentry);
- if (WARN_ON(rc))
- break;
- d_drop(pos->dentry);
- dput(pos->dentry);
+ d_invalidate(pos->dentry);
+ simple_unlink(d_inode(root), pos->dentry);
pos->dentry = NULL;
}
}
@@ -329,7 +325,7 @@ int pstore_put_backend_records(struct pstore_info *psi)
inode_unlock(d_inode(root));
- return rc;
+ return 0;
}
/*
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 88b34fdbf759..b1a455f42e93 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -893,6 +893,7 @@ static const struct of_device_id dt_match[] = {
{ .compatible = "ramoops" },
{}
};
+MODULE_DEVICE_TABLE(of, dt_match);
static struct platform_driver ramoops_driver = {
.probe = ramoops_probe,
diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
index 2770746bb7aa..694db616663f 100644
--- a/fs/pstore/zone.c
+++ b/fs/pstore/zone.c
@@ -973,6 +973,8 @@ static ssize_t psz_kmsg_read(struct pstore_zone *zone,
char *buf = kasprintf(GFP_KERNEL, "%s: Total %d times\n",
kmsg_dump_reason_str(record->reason),
record->count);
+ if (!buf)
+ return -ENOMEM;
hlen = strlen(buf);
record->buf = krealloc(buf, hlen + size, GFP_KERNEL);
if (!record->buf) {
@@ -1215,7 +1217,6 @@ static struct pstore_zone **psz_init_zones(enum pstore_type_id type,
pr_err("allocate for zones %s failed\n", name);
return ERR_PTR(-ENOMEM);
}
- memset(zones, 0, c * sizeof(*zones));
for (i = 0; i < c; i++) {
zone = psz_init_zone(type, off, record_size);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 6eb9bb369b57..d79841e94428 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -21,6 +21,7 @@
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
+#include <linux/fs_context.h>
#include "qnx4.h"
#define QNX4_VERSION 4
@@ -30,28 +31,33 @@ static const struct super_operations qnx4_sops;
static struct inode *qnx4_alloc_inode(struct super_block *sb);
static void qnx4_free_inode(struct inode *inode);
-static int qnx4_remount(struct super_block *sb, int *flags, char *data);
static int qnx4_statfs(struct dentry *, struct kstatfs *);
+static int qnx4_get_tree(struct fs_context *fc);
static const struct super_operations qnx4_sops =
{
.alloc_inode = qnx4_alloc_inode,
.free_inode = qnx4_free_inode,
.statfs = qnx4_statfs,
- .remount_fs = qnx4_remount,
};
-static int qnx4_remount(struct super_block *sb, int *flags, char *data)
+static int qnx4_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
struct qnx4_sb_info *qs;
sync_filesystem(sb);
qs = qnx4_sb(sb);
qs->Version = QNX4_VERSION;
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
return 0;
}
+static const struct fs_context_operations qnx4_context_opts = {
+ .get_tree = qnx4_get_tree,
+ .reconfigure = qnx4_reconfigure,
+};
+
static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_head *bh, int create )
{
unsigned long phys;
@@ -183,12 +189,13 @@ static const char *qnx4_checkroot(struct super_block *sb,
return "bitmap file not found.";
}
-static int qnx4_fill_super(struct super_block *s, void *data, int silent)
+static int qnx4_fill_super(struct super_block *s, struct fs_context *fc)
{
struct buffer_head *bh;
struct inode *root;
const char *errmsg;
struct qnx4_sb_info *qs;
+ int silent = fc->sb_flags & SB_SILENT;
qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL);
if (!qs)
@@ -216,7 +223,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data);
brelse(bh);
if (errmsg != NULL) {
- if (!silent)
+ if (!silent)
printk(KERN_ERR "qnx4: %s\n", errmsg);
return -EINVAL;
}
@@ -235,6 +242,18 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
return 0;
}
+static int qnx4_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, qnx4_fill_super);
+}
+
+static int qnx4_init_fs_context(struct fs_context *fc)
+{
+ fc->ops = &qnx4_context_opts;
+
+ return 0;
+}
+
static void qnx4_kill_sb(struct super_block *sb)
{
struct qnx4_sb_info *qs = qnx4_sb(sb);
@@ -359,7 +378,7 @@ static int init_inodecache(void)
qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache",
sizeof(struct qnx4_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
init_once);
if (qnx4_inode_cachep == NULL)
return -ENOMEM;
@@ -376,18 +395,12 @@ static void destroy_inodecache(void)
kmem_cache_destroy(qnx4_inode_cachep);
}
-static struct dentry *qnx4_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
-}
-
static struct file_system_type qnx4_fs_type = {
- .owner = THIS_MODULE,
- .name = "qnx4",
- .mount = qnx4_mount,
- .kill_sb = qnx4_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "qnx4",
+ .kill_sb = qnx4_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = qnx4_init_fs_context,
};
MODULE_ALIAS_FS("qnx4");
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index a286c545717f..405913f4faff 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -615,7 +615,7 @@ static int init_inodecache(void)
qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
sizeof(struct qnx6_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
init_once);
if (!qnx6_inode_cachep)
return -ENOMEM;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 1f0c754416b6..dacbee455c03 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -399,15 +399,17 @@ int dquot_mark_dquot_dirty(struct dquot *dquot)
EXPORT_SYMBOL(dquot_mark_dquot_dirty);
/* Dirtify all the dquots - this can block when journalling */
-static inline int mark_all_dquot_dirty(struct dquot * const *dquot)
+static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots)
{
int ret, err, cnt;
+ struct dquot *dquot;
ret = err = 0;
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (dquot[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (dquot)
/* Even in case of error we have to continue */
- ret = mark_dquot_dirty(dquot[cnt]);
+ ret = mark_dquot_dirty(dquot);
if (!err)
err = ret;
}
@@ -875,10 +877,7 @@ void dqput(struct dquot *dquot)
}
/* Need to release dquot? */
-#ifdef CONFIG_QUOTA_DEBUG
- /* sanity check */
- BUG_ON(!list_empty(&dquot->dq_free));
-#endif
+ WARN_ON_ONCE(!list_empty(&dquot->dq_free));
put_releasing_dquots(dquot);
atomic_dec(&dquot->dq_count);
spin_unlock(&dq_list_lock);
@@ -987,9 +986,8 @@ we_slept:
* smp_mb__before_atomic() in dquot_acquire().
*/
smp_rmb();
-#ifdef CONFIG_QUOTA_DEBUG
- BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */
-#endif
+ /* Has somebody invalidated entry under us? */
+ WARN_ON_ONCE(hlist_unhashed(&dquot->dq_hash));
out:
if (empty)
do_destroy_dquot(empty);
@@ -998,14 +996,14 @@ out:
}
EXPORT_SYMBOL(dqget);
-static inline struct dquot **i_dquot(struct inode *inode)
+static inline struct dquot __rcu **i_dquot(struct inode *inode)
{
return inode->i_sb->s_op->get_dquots(inode);
}
static int dqinit_needed(struct inode *inode, int type)
{
- struct dquot * const *dquots;
+ struct dquot __rcu * const *dquots;
int cnt;
if (IS_NOQUOTA(inode))
@@ -1095,14 +1093,16 @@ static void remove_dquot_ref(struct super_block *sb, int type)
*/
spin_lock(&dq_data_lock);
if (!IS_NOQUOTA(inode)) {
- struct dquot **dquots = i_dquot(inode);
- struct dquot *dquot = dquots[type];
+ struct dquot __rcu **dquots = i_dquot(inode);
+ struct dquot *dquot = srcu_dereference_check(
+ dquots[type], &dquot_srcu,
+ lockdep_is_held(&dq_data_lock));
#ifdef CONFIG_QUOTA_DEBUG
if (unlikely(inode_get_rsv_space(inode) > 0))
reserved = 1;
#endif
- dquots[type] = NULL;
+ rcu_assign_pointer(dquots[type], NULL);
if (dquot)
dqput(dquot);
}
@@ -1455,7 +1455,8 @@ static int inode_quota_active(const struct inode *inode)
static int __dquot_initialize(struct inode *inode, int type)
{
int cnt, init_needed = 0;
- struct dquot **dquots, *got[MAXQUOTAS] = {};
+ struct dquot __rcu **dquots;
+ struct dquot *got[MAXQUOTAS] = {};
struct super_block *sb = inode->i_sb;
qsize_t rsv;
int ret = 0;
@@ -1530,7 +1531,7 @@ static int __dquot_initialize(struct inode *inode, int type)
if (!got[cnt])
continue;
if (!dquots[cnt]) {
- dquots[cnt] = got[cnt];
+ rcu_assign_pointer(dquots[cnt], got[cnt]);
got[cnt] = NULL;
/*
* Make quota reservation system happy if someone
@@ -1538,12 +1539,16 @@ static int __dquot_initialize(struct inode *inode, int type)
*/
rsv = inode_get_rsv_space(inode);
if (unlikely(rsv)) {
+ struct dquot *dquot = srcu_dereference_check(
+ dquots[cnt], &dquot_srcu,
+ lockdep_is_held(&dq_data_lock));
+
spin_lock(&inode->i_lock);
/* Get reservation again under proper lock */
rsv = __inode_get_rsv_space(inode);
- spin_lock(&dquots[cnt]->dq_dqb_lock);
- dquots[cnt]->dq_dqb.dqb_rsvspace += rsv;
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ spin_lock(&dquot->dq_dqb_lock);
+ dquot->dq_dqb.dqb_rsvspace += rsv;
+ spin_unlock(&dquot->dq_dqb_lock);
spin_unlock(&inode->i_lock);
}
}
@@ -1565,7 +1570,7 @@ EXPORT_SYMBOL(dquot_initialize);
bool dquot_initialize_needed(struct inode *inode)
{
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
int i;
if (!inode_quota_active(inode))
@@ -1590,13 +1595,14 @@ EXPORT_SYMBOL(dquot_initialize_needed);
static void __dquot_drop(struct inode *inode)
{
int cnt;
- struct dquot **dquots = i_dquot(inode);
+ struct dquot __rcu **dquots = i_dquot(inode);
struct dquot *put[MAXQUOTAS];
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- put[cnt] = dquots[cnt];
- dquots[cnt] = NULL;
+ put[cnt] = srcu_dereference_check(dquots[cnt], &dquot_srcu,
+ lockdep_is_held(&dq_data_lock));
+ rcu_assign_pointer(dquots[cnt], NULL);
}
spin_unlock(&dq_data_lock);
dqput_all(put);
@@ -1604,7 +1610,7 @@ static void __dquot_drop(struct inode *inode)
void dquot_drop(struct inode *inode)
{
- struct dquot * const *dquots;
+ struct dquot __rcu * const *dquots;
int cnt;
if (IS_NOQUOTA(inode))
@@ -1677,7 +1683,8 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
int cnt, ret = 0, index;
struct dquot_warn warn[MAXQUOTAS];
int reserve = flags & DQUOT_SPACE_RESERVE;
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
+ struct dquot *dquot;
if (!inode_quota_active(inode)) {
if (reserve) {
@@ -1697,27 +1704,26 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
index = srcu_read_lock(&dquot_srcu);
spin_lock(&inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
if (reserve) {
- ret = dquot_add_space(dquots[cnt], 0, number, flags,
- &warn[cnt]);
+ ret = dquot_add_space(dquot, 0, number, flags, &warn[cnt]);
} else {
- ret = dquot_add_space(dquots[cnt], number, 0, flags,
- &warn[cnt]);
+ ret = dquot_add_space(dquot, number, 0, flags, &warn[cnt]);
}
if (ret) {
/* Back out changes we already did */
for (cnt--; cnt >= 0; cnt--) {
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
- spin_lock(&dquots[cnt]->dq_dqb_lock);
+ spin_lock(&dquot->dq_dqb_lock);
if (reserve)
- dquot_free_reserved_space(dquots[cnt],
- number);
+ dquot_free_reserved_space(dquot, number);
else
- dquot_decr_space(dquots[cnt], number);
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ dquot_decr_space(dquot, number);
+ spin_unlock(&dquot->dq_dqb_lock);
}
spin_unlock(&inode->i_lock);
goto out_flush_warn;
@@ -1747,7 +1753,8 @@ int dquot_alloc_inode(struct inode *inode)
{
int cnt, ret = 0, index;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot * const *dquots;
+ struct dquot __rcu * const *dquots;
+ struct dquot *dquot;
if (!inode_quota_active(inode))
return 0;
@@ -1758,17 +1765,19 @@ int dquot_alloc_inode(struct inode *inode)
index = srcu_read_lock(&dquot_srcu);
spin_lock(&inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
- ret = dquot_add_inodes(dquots[cnt], 1, &warn[cnt]);
+ ret = dquot_add_inodes(dquot, 1, &warn[cnt]);
if (ret) {
for (cnt--; cnt >= 0; cnt--) {
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
/* Back out changes we already did */
- spin_lock(&dquots[cnt]->dq_dqb_lock);
- dquot_decr_inodes(dquots[cnt], 1);
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ spin_lock(&dquot->dq_dqb_lock);
+ dquot_decr_inodes(dquot, 1);
+ spin_unlock(&dquot->dq_dqb_lock);
}
goto warn_put_all;
}
@@ -1789,7 +1798,8 @@ EXPORT_SYMBOL(dquot_alloc_inode);
*/
void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
+ struct dquot *dquot;
int cnt, index;
if (!inode_quota_active(inode)) {
@@ -1805,9 +1815,8 @@ void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
spin_lock(&inode->i_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (dquots[cnt]) {
- struct dquot *dquot = dquots[cnt];
-
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (dquot) {
spin_lock(&dquot->dq_dqb_lock);
if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number))
number = dquot->dq_dqb.dqb_rsvspace;
@@ -1831,7 +1840,8 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
*/
void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
{
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
+ struct dquot *dquot;
int cnt, index;
if (!inode_quota_active(inode)) {
@@ -1847,9 +1857,8 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
spin_lock(&inode->i_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (dquots[cnt]) {
- struct dquot *dquot = dquots[cnt];
-
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (dquot) {
spin_lock(&dquot->dq_dqb_lock);
if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number))
number = dquot->dq_dqb.dqb_curspace;
@@ -1875,7 +1884,8 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
{
unsigned int cnt;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
+ struct dquot *dquot;
int reserve = flags & DQUOT_SPACE_RESERVE, index;
if (!inode_quota_active(inode)) {
@@ -1896,17 +1906,18 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
int wtype;
warn[cnt].w_type = QUOTA_NL_NOWARN;
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
- spin_lock(&dquots[cnt]->dq_dqb_lock);
- wtype = info_bdq_free(dquots[cnt], number);
+ spin_lock(&dquot->dq_dqb_lock);
+ wtype = info_bdq_free(dquot, number);
if (wtype != QUOTA_NL_NOWARN)
- prepare_warning(&warn[cnt], dquots[cnt], wtype);
+ prepare_warning(&warn[cnt], dquot, wtype);
if (reserve)
- dquot_free_reserved_space(dquots[cnt], number);
+ dquot_free_reserved_space(dquot, number);
else
- dquot_decr_space(dquots[cnt], number);
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ dquot_decr_space(dquot, number);
+ spin_unlock(&dquot->dq_dqb_lock);
}
if (reserve)
*inode_reserved_space(inode) -= number;
@@ -1930,7 +1941,8 @@ void dquot_free_inode(struct inode *inode)
{
unsigned int cnt;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot * const *dquots;
+ struct dquot __rcu * const *dquots;
+ struct dquot *dquot;
int index;
if (!inode_quota_active(inode))
@@ -1941,16 +1953,16 @@ void dquot_free_inode(struct inode *inode)
spin_lock(&inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
int wtype;
-
warn[cnt].w_type = QUOTA_NL_NOWARN;
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
- spin_lock(&dquots[cnt]->dq_dqb_lock);
- wtype = info_idq_free(dquots[cnt], 1);
+ spin_lock(&dquot->dq_dqb_lock);
+ wtype = info_idq_free(dquot, 1);
if (wtype != QUOTA_NL_NOWARN)
- prepare_warning(&warn[cnt], dquots[cnt], wtype);
- dquot_decr_inodes(dquots[cnt], 1);
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ prepare_warning(&warn[cnt], dquot, wtype);
+ dquot_decr_inodes(dquot, 1);
+ spin_unlock(&dquot->dq_dqb_lock);
}
spin_unlock(&inode->i_lock);
mark_all_dquot_dirty(dquots);
@@ -1976,8 +1988,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
qsize_t cur_space;
qsize_t rsv_space = 0;
qsize_t inode_usage = 1;
+ struct dquot __rcu **dquots;
struct dquot *transfer_from[MAXQUOTAS] = {};
- int cnt, ret = 0;
+ int cnt, index, ret = 0;
char is_valid[MAXQUOTAS] = {};
struct dquot_warn warn_to[MAXQUOTAS];
struct dquot_warn warn_from_inodes[MAXQUOTAS];
@@ -2008,6 +2021,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
}
cur_space = __inode_get_bytes(inode);
rsv_space = __inode_get_rsv_space(inode);
+ dquots = i_dquot(inode);
/*
* Build the transfer_from list, check limits, and update usage in
* the target structures.
@@ -2022,7 +2036,8 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
if (!sb_has_quota_active(inode->i_sb, cnt))
continue;
is_valid[cnt] = 1;
- transfer_from[cnt] = i_dquot(inode)[cnt];
+ transfer_from[cnt] = srcu_dereference_check(dquots[cnt],
+ &dquot_srcu, lockdep_is_held(&dq_data_lock));
ret = dquot_add_inodes(transfer_to[cnt], inode_usage,
&warn_to[cnt]);
if (ret)
@@ -2061,13 +2076,21 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
rsv_space);
spin_unlock(&transfer_from[cnt]->dq_dqb_lock);
}
- i_dquot(inode)[cnt] = transfer_to[cnt];
+ rcu_assign_pointer(dquots[cnt], transfer_to[cnt]);
}
spin_unlock(&inode->i_lock);
spin_unlock(&dq_data_lock);
- mark_all_dquot_dirty(transfer_from);
- mark_all_dquot_dirty(transfer_to);
+ /*
+ * These arrays are local and we hold dquot references so we don't need
+ * the srcu protection but still take dquot_srcu to avoid warning in
+ * mark_all_dquot_dirty().
+ */
+ index = srcu_read_lock(&dquot_srcu);
+ mark_all_dquot_dirty((struct dquot __rcu **)transfer_from);
+ mark_all_dquot_dirty((struct dquot __rcu **)transfer_to);
+ srcu_read_unlock(&dquot_srcu, index);
+
flush_warnings(warn_to);
flush_warnings(warn_from_inodes);
flush_warnings(warn_from_space);
@@ -2388,7 +2411,8 @@ int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
lockdep_assert_held_write(&sb->s_umount);
/* Just unsuspend quotas? */
- BUG_ON(flags & DQUOT_SUSPENDED);
+ if (WARN_ON_ONCE(flags & DQUOT_SUSPENDED))
+ return -EINVAL;
if (!fmt)
return -ESRCH;
@@ -2984,7 +3008,7 @@ static int __init dquot_init(void)
dquot_cachep = kmem_cache_create("dquot",
sizeof(struct dquot), sizeof(unsigned long) * 4,
(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_PANIC),
+ SLAB_PANIC),
NULL);
order = 0;
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 0f1493e0f6d0..afceef3ddfaa 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -21,6 +21,12 @@ MODULE_AUTHOR("Jan Kara");
MODULE_DESCRIPTION("Quota trie support");
MODULE_LICENSE("GPL");
+/*
+ * Maximum quota tree depth we support. Only to limit recursion when working
+ * with the tree.
+ */
+#define MAX_QTREE_DEPTH 6
+
#define __QUOTA_QT_PARANOIA
static int __get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
@@ -108,7 +114,7 @@ static int check_dquot_block_header(struct qtree_mem_dqinfo *info,
/* Remove empty block from list and return it */
static int get_free_dqblk(struct qtree_mem_dqinfo *info)
{
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
int ret, blk;
@@ -160,7 +166,7 @@ static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk)
static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
uint blk)
{
- char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
uint nextblk = le32_to_cpu(dh->dqdh_next_free);
uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
@@ -207,7 +213,7 @@ out_buf:
static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
uint blk)
{
- char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
int err;
@@ -255,7 +261,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
{
uint blk, i;
struct qt_disk_dqdbheader *dh;
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
char *ddquot;
*err = 0;
@@ -327,27 +333,36 @@ out_buf:
/* Insert reference to structure into the trie */
static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
- uint *treeblk, int depth)
+ uint *blks, int depth)
{
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
int ret = 0, newson = 0, newact = 0;
__le32 *ref;
uint newblk;
+ int i;
if (!buf)
return -ENOMEM;
- if (!*treeblk) {
+ if (!blks[depth]) {
ret = get_free_dqblk(info);
if (ret < 0)
goto out_buf;
- *treeblk = ret;
+ for (i = 0; i < depth; i++)
+ if (ret == blks[i]) {
+ quota_error(dquot->dq_sb,
+ "Free block already used in tree: block %u",
+ ret);
+ ret = -EIO;
+ goto out_buf;
+ }
+ blks[depth] = ret;
memset(buf, 0, info->dqi_usable_bs);
newact = 1;
} else {
- ret = read_blk(info, *treeblk, buf);
+ ret = read_blk(info, blks[depth], buf);
if (ret < 0) {
quota_error(dquot->dq_sb, "Can't read tree quota "
- "block %u", *treeblk);
+ "block %u", blks[depth]);
goto out_buf;
}
}
@@ -357,8 +372,20 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
info->dqi_blocks - 1);
if (ret)
goto out_buf;
- if (!newblk)
+ if (!newblk) {
newson = 1;
+ } else {
+ for (i = 0; i <= depth; i++)
+ if (newblk == blks[i]) {
+ quota_error(dquot->dq_sb,
+ "Cycle in quota tree detected: block %u index %u",
+ blks[depth],
+ get_index(info, dquot->dq_id, depth));
+ ret = -EIO;
+ goto out_buf;
+ }
+ }
+ blks[depth + 1] = newblk;
if (depth == info->dqi_qtree_depth - 1) {
#ifdef __QUOTA_QT_PARANOIA
if (newblk) {
@@ -370,16 +397,16 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
goto out_buf;
}
#endif
- newblk = find_free_dqentry(info, dquot, &ret);
+ blks[depth + 1] = find_free_dqentry(info, dquot, &ret);
} else {
- ret = do_insert_tree(info, dquot, &newblk, depth+1);
+ ret = do_insert_tree(info, dquot, blks, depth + 1);
}
if (newson && ret >= 0) {
ref[get_index(info, dquot->dq_id, depth)] =
- cpu_to_le32(newblk);
- ret = write_blk(info, *treeblk, buf);
+ cpu_to_le32(blks[depth + 1]);
+ ret = write_blk(info, blks[depth], buf);
} else if (newact && ret < 0) {
- put_free_dqblk(info, buf, *treeblk);
+ put_free_dqblk(info, buf, blks[depth]);
}
out_buf:
kfree(buf);
@@ -390,7 +417,7 @@ out_buf:
static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
struct dquot *dquot)
{
- int tmp = QT_TREEOFF;
+ uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF };
#ifdef __QUOTA_QT_PARANOIA
if (info->dqi_blocks <= QT_TREEOFF) {
@@ -398,7 +425,11 @@ static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
return -EIO;
}
#endif
- return do_insert_tree(info, dquot, &tmp, 0);
+ if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) {
+ quota_error(dquot->dq_sb, "Quota tree depth too big!");
+ return -EIO;
+ }
+ return do_insert_tree(info, dquot, blks, 0);
}
/*
@@ -410,7 +441,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
int type = dquot->dq_id.type;
struct super_block *sb = dquot->dq_sb;
ssize_t ret;
- char *ddquot = kmalloc(info->dqi_entry_size, GFP_NOFS);
+ char *ddquot = kmalloc(info->dqi_entry_size, GFP_KERNEL);
if (!ddquot)
return -ENOMEM;
@@ -449,7 +480,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
uint blk)
{
struct qt_disk_dqdbheader *dh;
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
int ret = 0;
if (!buf)
@@ -511,19 +542,20 @@ out_buf:
/* Remove reference to dquot from tree */
static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
- uint *blk, int depth)
+ uint *blks, int depth)
{
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
int ret = 0;
uint newblk;
__le32 *ref = (__le32 *)buf;
+ int i;
if (!buf)
return -ENOMEM;
- ret = read_blk(info, *blk, buf);
+ ret = read_blk(info, blks[depth], buf);
if (ret < 0) {
quota_error(dquot->dq_sb, "Can't read quota data block %u",
- *blk);
+ blks[depth]);
goto out_buf;
}
newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -532,29 +564,38 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
if (ret)
goto out_buf;
+ for (i = 0; i <= depth; i++)
+ if (newblk == blks[i]) {
+ quota_error(dquot->dq_sb,
+ "Cycle in quota tree detected: block %u index %u",
+ blks[depth],
+ get_index(info, dquot->dq_id, depth));
+ ret = -EIO;
+ goto out_buf;
+ }
if (depth == info->dqi_qtree_depth - 1) {
ret = free_dqentry(info, dquot, newblk);
- newblk = 0;
+ blks[depth + 1] = 0;
} else {
- ret = remove_tree(info, dquot, &newblk, depth+1);
+ blks[depth + 1] = newblk;
+ ret = remove_tree(info, dquot, blks, depth + 1);
}
- if (ret >= 0 && !newblk) {
- int i;
+ if (ret >= 0 && !blks[depth + 1]) {
ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
/* Block got empty? */
for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++)
;
/* Don't put the root block into the free block list */
if (i == (info->dqi_usable_bs >> 2)
- && *blk != QT_TREEOFF) {
- put_free_dqblk(info, buf, *blk);
- *blk = 0;
+ && blks[depth] != QT_TREEOFF) {
+ put_free_dqblk(info, buf, blks[depth]);
+ blks[depth] = 0;
} else {
- ret = write_blk(info, *blk, buf);
+ ret = write_blk(info, blks[depth], buf);
if (ret < 0)
quota_error(dquot->dq_sb,
"Can't write quota tree block %u",
- *blk);
+ blks[depth]);
}
}
out_buf:
@@ -565,11 +606,15 @@ out_buf:
/* Delete dquot from tree */
int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
{
- uint tmp = QT_TREEOFF;
+ uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF };
if (!dquot->dq_off) /* Even not allocated? */
return 0;
- return remove_tree(info, dquot, &tmp, 0);
+ if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) {
+ quota_error(dquot->dq_sb, "Quota tree depth too big!");
+ return -EIO;
+ }
+ return remove_tree(info, dquot, blks, 0);
}
EXPORT_SYMBOL(qtree_delete_dquot);
@@ -577,7 +622,7 @@ EXPORT_SYMBOL(qtree_delete_dquot);
static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
struct dquot *dquot, uint blk)
{
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
loff_t ret = 0;
int i;
char *ddquot;
@@ -613,18 +658,20 @@ out_buf:
/* Find entry for given id in the tree */
static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
- struct dquot *dquot, uint blk, int depth)
+ struct dquot *dquot, uint *blks, int depth)
{
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
loff_t ret = 0;
__le32 *ref = (__le32 *)buf;
+ uint blk;
+ int i;
if (!buf)
return -ENOMEM;
- ret = read_blk(info, blk, buf);
+ ret = read_blk(info, blks[depth], buf);
if (ret < 0) {
quota_error(dquot->dq_sb, "Can't read quota tree block %u",
- blk);
+ blks[depth]);
goto out_buf;
}
ret = 0;
@@ -636,8 +683,19 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
if (ret)
goto out_buf;
+ /* Check for cycles in the tree */
+ for (i = 0; i <= depth; i++)
+ if (blk == blks[i]) {
+ quota_error(dquot->dq_sb,
+ "Cycle in quota tree detected: block %u index %u",
+ blks[depth],
+ get_index(info, dquot->dq_id, depth));
+ ret = -EIO;
+ goto out_buf;
+ }
+ blks[depth + 1] = blk;
if (depth < info->dqi_qtree_depth - 1)
- ret = find_tree_dqentry(info, dquot, blk, depth+1);
+ ret = find_tree_dqentry(info, dquot, blks, depth + 1);
else
ret = find_block_dqentry(info, dquot, blk);
out_buf:
@@ -649,7 +707,13 @@ out_buf:
static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
struct dquot *dquot)
{
- return find_tree_dqentry(info, dquot, QT_TREEOFF, 0);
+ uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF };
+
+ if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) {
+ quota_error(dquot->dq_sb, "Quota tree depth too big!");
+ return -EIO;
+ }
+ return find_tree_dqentry(info, dquot, blks, 0);
}
int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
@@ -684,7 +748,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
}
dquot->dq_off = offset;
}
- ddquot = kmalloc(info->dqi_entry_size, GFP_NOFS);
+ ddquot = kmalloc(info->dqi_entry_size, GFP_KERNEL);
if (!ddquot)
return -ENOMEM;
ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size,
@@ -728,7 +792,7 @@ EXPORT_SYMBOL(qtree_release_dquot);
static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id,
unsigned int blk, int depth)
{
- char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
__le32 *ref = (__le32 *)buf;
ssize_t ret;
unsigned int epb = info->dqi_usable_bs >> 2;
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index a0db3f195e95..3f3e8acc05db 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -160,9 +160,11 @@ static int v1_read_file_info(struct super_block *sb, int type)
{
struct quota_info *dqopt = sb_dqopt(sb);
struct v1_disk_dqblk dqblk;
+ unsigned int memalloc;
int ret;
down_read(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
sizeof(struct v1_disk_dqblk), v1_dqoff(0));
if (ret != sizeof(struct v1_disk_dqblk)) {
@@ -179,6 +181,7 @@ static int v1_read_file_info(struct super_block *sb, int type)
dqopt->info[type].dqi_bgrace =
dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME;
out:
+ memalloc_nofs_restore(memalloc);
up_read(&dqopt->dqio_sem);
return ret;
}
@@ -187,9 +190,11 @@ static int v1_write_file_info(struct super_block *sb, int type)
{
struct quota_info *dqopt = sb_dqopt(sb);
struct v1_disk_dqblk dqblk;
+ unsigned int memalloc;
int ret;
down_write(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
sizeof(struct v1_disk_dqblk), v1_dqoff(0));
if (ret != sizeof(struct v1_disk_dqblk)) {
@@ -209,6 +214,7 @@ static int v1_write_file_info(struct super_block *sb, int type)
else if (ret >= 0)
ret = -EIO;
out:
+ memalloc_nofs_restore(memalloc);
up_write(&dqopt->dqio_sem);
return ret;
}
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index ae99e7b88205..c48c233f3bef 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -96,9 +96,11 @@ static int v2_read_file_info(struct super_block *sb, int type)
struct qtree_mem_dqinfo *qinfo;
ssize_t size;
unsigned int version;
+ unsigned int memalloc;
int ret;
down_read(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
ret = v2_read_header(sb, type, &dqhead);
if (ret < 0)
goto out;
@@ -119,7 +121,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
ret = -EIO;
goto out;
}
- info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
+ info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_KERNEL);
if (!info->dqi_priv) {
ret = -ENOMEM;
goto out;
@@ -166,14 +168,17 @@ static int v2_read_file_info(struct super_block *sb, int type)
i_size_read(sb_dqopt(sb)->files[type]));
goto out_free;
}
- if (qinfo->dqi_free_blk >= qinfo->dqi_blocks) {
- quota_error(sb, "Free block number too big (%u >= %u).",
- qinfo->dqi_free_blk, qinfo->dqi_blocks);
+ if (qinfo->dqi_free_blk && (qinfo->dqi_free_blk <= QT_TREEOFF ||
+ qinfo->dqi_free_blk >= qinfo->dqi_blocks)) {
+ quota_error(sb, "Free block number %u out of range (%u, %u).",
+ qinfo->dqi_free_blk, QT_TREEOFF, qinfo->dqi_blocks);
goto out_free;
}
- if (qinfo->dqi_free_entry >= qinfo->dqi_blocks) {
- quota_error(sb, "Block with free entry too big (%u >= %u).",
- qinfo->dqi_free_entry, qinfo->dqi_blocks);
+ if (qinfo->dqi_free_entry && (qinfo->dqi_free_entry <= QT_TREEOFF ||
+ qinfo->dqi_free_entry >= qinfo->dqi_blocks)) {
+ quota_error(sb, "Block with free entry %u out of range (%u, %u).",
+ qinfo->dqi_free_entry, QT_TREEOFF,
+ qinfo->dqi_blocks);
goto out_free;
}
ret = 0;
@@ -183,6 +188,7 @@ out_free:
info->dqi_priv = NULL;
}
out:
+ memalloc_nofs_restore(memalloc);
up_read(&dqopt->dqio_sem);
return ret;
}
@@ -195,8 +201,10 @@ static int v2_write_file_info(struct super_block *sb, int type)
struct mem_dqinfo *info = &dqopt->info[type];
struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
ssize_t size;
+ unsigned int memalloc;
down_write(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
spin_lock(&dq_data_lock);
info->dqi_flags &= ~DQF_INFO_DIRTY;
dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
@@ -209,6 +217,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
+ memalloc_nofs_restore(memalloc);
up_write(&dqopt->dqio_sem);
if (size != sizeof(struct v2_disk_dqinfo)) {
quota_error(sb, "Can't write info structure");
@@ -328,11 +337,14 @@ static int v2_read_dquot(struct dquot *dquot)
{
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
int ret;
+ unsigned int memalloc;
down_read(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
ret = qtree_read_dquot(
sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv,
dquot);
+ memalloc_nofs_restore(memalloc);
up_read(&dqopt->dqio_sem);
return ret;
}
@@ -342,6 +354,7 @@ static int v2_write_dquot(struct dquot *dquot)
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
int ret;
bool alloc = false;
+ unsigned int memalloc;
/*
* If space for dquot is already allocated, we don't need any
@@ -355,9 +368,11 @@ static int v2_write_dquot(struct dquot *dquot)
} else {
down_read(&dqopt->dqio_sem);
}
+ memalloc = memalloc_nofs_save();
ret = qtree_write_dquot(
sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv,
dquot);
+ memalloc_nofs_restore(memalloc);
if (alloc)
up_write(&dqopt->dqio_sem);
else
@@ -368,10 +383,13 @@ static int v2_write_dquot(struct dquot *dquot)
static int v2_release_dquot(struct dquot *dquot)
{
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+ unsigned int memalloc;
int ret;
down_write(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
ret = qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot);
+ memalloc_nofs_restore(memalloc);
up_write(&dqopt->dqio_sem);
return ret;
@@ -386,10 +404,13 @@ static int v2_free_file_info(struct super_block *sb, int type)
static int v2_get_next_id(struct super_block *sb, struct kqid *qid)
{
struct quota_info *dqopt = sb_dqopt(sb);
+ unsigned int memalloc;
int ret;
down_read(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
ret = qtree_get_next_id(sb_dqinfo(sb, qid->type)->dqi_priv, qid);
+ memalloc_nofs_restore(memalloc);
up_read(&dqopt->dqio_sem);
return ret;
}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 4ac05a9e25bc..8006faaaf0ec 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -102,11 +102,20 @@ ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
int error = -ENOSPC;
if (inode) {
+ error = security_inode_init_security(inode, dir,
+ &dentry->d_name, NULL,
+ NULL);
+ if (error) {
+ iput(inode);
+ goto out;
+ }
+
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
error = 0;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
+out:
return error;
}
@@ -134,6 +143,15 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
if (inode) {
int l = strlen(symname)+1;
+
+ error = security_inode_init_security(inode, dir,
+ &dentry->d_name, NULL,
+ NULL);
+ if (error) {
+ iput(inode);
+ goto out;
+ }
+
error = page_symlink(inode, symname, l);
if (!error) {
d_instantiate(dentry, inode);
@@ -143,6 +161,7 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
} else
iput(inode);
}
+out:
return error;
}
@@ -150,12 +169,23 @@ static int ramfs_tmpfile(struct mnt_idmap *idmap,
struct inode *dir, struct file *file, umode_t mode)
{
struct inode *inode;
+ int error;
inode = ramfs_get_inode(dir->i_sb, dir, mode, 0);
if (!inode)
return -ENOSPC;
+
+ error = security_inode_init_security(inode, dir,
+ &file_dentry(file)->d_name, NULL,
+ NULL);
+ if (error) {
+ iput(inode);
+ goto out;
+ }
+
d_tmpfile(file, inode);
- return finish_open_simple(file, 0);
+out:
+ return finish_open_simple(file, error);
}
static const struct inode_operations ramfs_dir_inode_operations = {
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 171c912af50f..e539ccd39e1e 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2386,7 +2386,7 @@ static int journal_read(struct super_block *sb)
cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
reiserfs_info(sb, "checking transaction log (%pg)\n",
- journal->j_bdev_handle->bdev);
+ file_bdev(journal->j_bdev_file));
start = ktime_get_seconds();
/*
@@ -2447,7 +2447,7 @@ static int journal_read(struct super_block *sb)
* device and journal device to be the same
*/
d_bh =
- reiserfs_breada(journal->j_bdev_handle->bdev, cur_dblock,
+ reiserfs_breada(file_bdev(journal->j_bdev_file), cur_dblock,
sb->s_blocksize,
SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
SB_ONDISK_JOURNAL_SIZE(sb));
@@ -2588,9 +2588,9 @@ static void journal_list_init(struct super_block *sb)
static void release_journal_dev(struct reiserfs_journal *journal)
{
- if (journal->j_bdev_handle) {
- bdev_release(journal->j_bdev_handle);
- journal->j_bdev_handle = NULL;
+ if (journal->j_bdev_file) {
+ bdev_fput(journal->j_bdev_file);
+ journal->j_bdev_file = NULL;
}
}
@@ -2605,7 +2605,7 @@ static int journal_init_dev(struct super_block *super,
result = 0;
- journal->j_bdev_handle = NULL;
+ journal->j_bdev_file = NULL;
jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
@@ -2616,37 +2616,37 @@ static int journal_init_dev(struct super_block *super,
if ((!jdev_name || !jdev_name[0])) {
if (jdev == super->s_dev)
holder = NULL;
- journal->j_bdev_handle = bdev_open_by_dev(jdev, blkdev_mode,
+ journal->j_bdev_file = bdev_file_open_by_dev(jdev, blkdev_mode,
holder, NULL);
- if (IS_ERR(journal->j_bdev_handle)) {
- result = PTR_ERR(journal->j_bdev_handle);
- journal->j_bdev_handle = NULL;
+ if (IS_ERR(journal->j_bdev_file)) {
+ result = PTR_ERR(journal->j_bdev_file);
+ journal->j_bdev_file = NULL;
reiserfs_warning(super, "sh-458",
"cannot init journal device unknown-block(%u,%u): %i",
MAJOR(jdev), MINOR(jdev), result);
return result;
} else if (jdev != super->s_dev)
- set_blocksize(journal->j_bdev_handle->bdev,
+ set_blocksize(file_bdev(journal->j_bdev_file),
super->s_blocksize);
return 0;
}
- journal->j_bdev_handle = bdev_open_by_path(jdev_name, blkdev_mode,
+ journal->j_bdev_file = bdev_file_open_by_path(jdev_name, blkdev_mode,
holder, NULL);
- if (IS_ERR(journal->j_bdev_handle)) {
- result = PTR_ERR(journal->j_bdev_handle);
- journal->j_bdev_handle = NULL;
+ if (IS_ERR(journal->j_bdev_file)) {
+ result = PTR_ERR(journal->j_bdev_file);
+ journal->j_bdev_file = NULL;
reiserfs_warning(super, "sh-457",
"journal_init_dev: Cannot open '%s': %i",
jdev_name, result);
return result;
}
- set_blocksize(journal->j_bdev_handle->bdev, super->s_blocksize);
+ set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize);
reiserfs_info(super,
"journal_init_dev: journal device: %pg\n",
- journal->j_bdev_handle->bdev);
+ file_bdev(journal->j_bdev_file));
return 0;
}
@@ -2804,7 +2804,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
"journal header magic %x (device %pg) does "
"not match to magic found in super block %x",
jh->jh_journal.jp_journal_magic,
- journal->j_bdev_handle->bdev,
+ file_bdev(journal->j_bdev_file),
sb_jp_journal_magic(rs));
brelse(bhjh);
goto free_and_return;
@@ -2828,7 +2828,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
reiserfs_info(sb, "journal params: device %pg, size %u, "
"journal first block %u, max trans len %u, max batch %u, "
"max commit age %u, max trans age %u\n",
- journal->j_bdev_handle->bdev,
+ file_bdev(journal->j_bdev_file),
SB_ONDISK_JOURNAL_SIZE(sb),
SB_ONDISK_JOURNAL_1st_BLOCK(sb),
journal->j_trans_max,
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 83cb9402e0f9..5c68a4a52d78 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -354,7 +354,7 @@ static int show_journal(struct seq_file *m, void *unused)
"prepare: \t%12lu\n"
"prepare_retry: \t%12lu\n",
DJP(jp_journal_1st_block),
- SB_JOURNAL(sb)->j_bdev_handle->bdev,
+ file_bdev(SB_JOURNAL(sb)->j_bdev_file),
DJP(jp_journal_dev),
DJP(jp_journal_size),
DJP(jp_journal_trans_max),
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 725667880e62..f0e1f29f20ee 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -97,7 +97,7 @@ struct reiserfs_inode_info {
struct rw_semaphore i_xattr_sem;
#endif
#ifdef CONFIG_QUOTA
- struct dquot *i_dquot[MAXQUOTAS];
+ struct dquot __rcu *i_dquot[MAXQUOTAS];
#endif
struct inode vfs_inode;
@@ -299,7 +299,7 @@ struct reiserfs_journal {
/* oldest journal block. start here for traverse */
struct reiserfs_journal_cnode *j_first;
- struct bdev_handle *j_bdev_handle;
+ struct file *j_bdev_file;
/* first block on s_dev of reserved area journal */
int j_1st_reserved_block;
@@ -2810,10 +2810,10 @@ struct reiserfs_journal_header {
/* We need these to make journal.c code more readable */
#define journal_find_get_block(s, block) __find_get_block(\
- SB_JOURNAL(s)->j_bdev_handle->bdev, block, s->s_blocksize)
-#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_bdev_handle->bdev,\
+ file_bdev(SB_JOURNAL(s)->j_bdev_file), block, s->s_blocksize)
+#define journal_getblk(s, block) __getblk(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
block, s->s_blocksize)
-#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_bdev_handle->bdev,\
+#define journal_bread(s, block) __bread(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
block, s->s_blocksize)
enum reiserfs_bh_state_bits {
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 67b5510beded..ab76468da02d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -670,7 +670,6 @@ static int __init init_inodecache(void)
sizeof(struct
reiserfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|
SLAB_ACCOUNT),
init_once);
if (reiserfs_inode_cachep == NULL)
@@ -802,7 +801,7 @@ static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t,
loff_t);
-static struct dquot **reiserfs_get_dquots(struct inode *inode)
+static struct dquot __rcu **reiserfs_get_dquots(struct inode *inode)
{
return REISERFS_I(inode)->i_dquot;
}
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 545ad44f96b8..2cbb92462074 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -594,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb)
#ifdef CONFIG_ROMFS_ON_BLOCK
if (sb->s_bdev) {
sync_blockdev(sb->s_bdev);
- bdev_release(sb->s_bdev_handle);
+ bdev_fput(sb->s_bdev_file);
}
#endif
}
@@ -630,8 +630,8 @@ static int __init init_romfs_fs(void)
romfs_inode_cachep =
kmem_cache_create("romfs_i",
sizeof(struct romfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
- SLAB_ACCOUNT, romfs_i_init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
+ romfs_i_init_once);
if (!romfs_inode_cachep) {
pr_err("Failed to initialise inode cache\n");
diff --git a/fs/select.c b/fs/select.c
index 0ee55af1a55c..9515c3fa1a03 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -476,7 +476,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in,
wait->_key |= POLLOUT_SET;
}
-static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
+static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
ktime_t expire, *to = NULL;
struct poll_wqueues table;
@@ -839,7 +839,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
struct poll_list {
struct poll_list *next;
- int len;
+ unsigned int len;
struct pollfd entries[];
};
@@ -975,14 +975,15 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec64 *end_time)
{
struct poll_wqueues table;
- int err = -EFAULT, fdcount, len;
+ int err = -EFAULT, fdcount;
/* Allocate small arguments on the stack to save memory and be
faster - use long to make sure the buffer is aligned properly
on 64 bit archs to avoid unaligned access */
long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
struct poll_list *const head = (struct poll_list *)stack_pps;
struct poll_list *walk = head;
- unsigned long todo = nfds;
+ unsigned int todo = nfds;
+ unsigned int len;
if (nfds > rlimit(RLIMIT_NOFILE))
return -EINVAL;
@@ -998,9 +999,9 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
sizeof(struct pollfd) * walk->len))
goto out_fds;
- todo -= walk->len;
- if (!todo)
+ if (walk->len >= todo)
break;
+ todo -= walk->len;
len = min(todo, POLLFD_PER_PAGE);
walk = walk->next = kmalloc(struct_size(walk, entries, len),
@@ -1020,7 +1021,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
for (walk = head; walk; walk = walk->next) {
struct pollfd *fds = walk->entries;
- int j;
+ unsigned int j;
for (j = walk->len; j; fds++, ufds++, j--)
unsafe_put_user(fds->revents, &ufds->revents, Efault);
diff --git a/fs/smb/client/Makefile b/fs/smb/client/Makefile
index 0b07eb94c93b..e11985f2460b 100644
--- a/fs/smb/client/Makefile
+++ b/fs/smb/client/Makefile
@@ -12,7 +12,7 @@ cifs-y := trace.o cifsfs.o cifs_debug.o connect.o dir.o file.o \
smb2ops.o smb2maperror.o smb2transport.o \
smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \
dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o \
- namespace.o
+ namespace.o reparse.o
$(obj)/asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.h
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index 3de5047a7ff9..0ff2491c311d 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -239,7 +239,8 @@ replay_again:
.tcon = tcon,
.path = path,
.create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE),
- .desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES,
+ .desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES |
+ FILE_READ_EA,
.disposition = FILE_OPEN,
.fid = pfid,
.replay = !!(retries),
@@ -416,6 +417,7 @@ smb2_close_cached_fid(struct kref *ref)
{
struct cached_fid *cfid = container_of(ref, struct cached_fid,
refcount);
+ int rc;
spin_lock(&cfid->cfids->cfid_list_lock);
if (cfid->on_list) {
@@ -429,9 +431,10 @@ smb2_close_cached_fid(struct kref *ref)
cfid->dentry = NULL;
if (cfid->is_open) {
- SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
+ rc = SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
cfid->fid.volatile_fid);
- atomic_dec(&cfid->tcon->num_remote_opens);
+ if (rc) /* should we retry on -EBUSY or -EAGAIN? */
+ cifs_dbg(VFS, "close cached dir rc %d\n", rc);
}
free_cached_dir(cfid);
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index 3e4209f41c18..c71ae5c04306 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -250,6 +250,8 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ if (cifs_ses_exiting(ses))
+ continue;
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
spin_lock(&tcon->open_file_lock);
list_for_each_entry(cfile, &tcon->openFileList, tlist) {
@@ -278,6 +280,24 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
return 0;
}
+static __always_inline const char *compression_alg_str(__le16 alg)
+{
+ switch (alg) {
+ case SMB3_COMPRESS_NONE:
+ return "NONE";
+ case SMB3_COMPRESS_LZNT1:
+ return "LZNT1";
+ case SMB3_COMPRESS_LZ77:
+ return "LZ77";
+ case SMB3_COMPRESS_LZ77_HUFF:
+ return "LZ77-Huffman";
+ case SMB3_COMPRESS_PATTERN:
+ return "Pattern_V1";
+ default:
+ return "invalid";
+ }
+}
+
static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
{
struct mid_q_entry *mid_entry;
@@ -423,12 +443,6 @@ skip_rdma:
server->echo_credits,
server->oplock_credits,
server->dialect);
- if (server->compress_algorithm == SMB3_COMPRESS_LZNT1)
- seq_printf(m, " COMPRESS_LZNT1");
- else if (server->compress_algorithm == SMB3_COMPRESS_LZ77)
- seq_printf(m, " COMPRESS_LZ77");
- else if (server->compress_algorithm == SMB3_COMPRESS_LZ77_HUFF)
- seq_printf(m, " COMPRESS_LZ77_HUFF");
if (server->sign)
seq_printf(m, " signed");
if (server->posix_ext_supported)
@@ -460,6 +474,14 @@ skip_rdma:
server->leaf_fullpath);
}
+ seq_puts(m, "\nCompression: ");
+ if (!server->compression.requested)
+ seq_puts(m, "disabled on mount");
+ else if (server->compression.enabled)
+ seq_printf(m, "enabled (%s)", compression_alg_str(server->compression.alg));
+ else
+ seq_puts(m, "disabled (not supported by this server)");
+
seq_printf(m, "\n\n\tSessions: ");
i = 0;
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
@@ -488,6 +510,8 @@ skip_rdma:
ses->ses_count, ses->serverOS, ses->serverNOS,
ses->capabilities, ses->ses_status);
}
+ if (ses->expired_pwd)
+ seq_puts(m, "password no longer valid ");
spin_unlock(&ses->ses_lock);
seq_printf(m, "\n\tSecurity type: %s ",
@@ -654,6 +678,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
}
#endif /* CONFIG_CIFS_STATS2 */
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ if (cifs_ses_exiting(ses))
+ continue;
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
atomic_set(&tcon->num_smbs_sent, 0);
spin_lock(&tcon->stat_lock);
@@ -733,6 +759,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
}
#endif /* STATS2 */
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ if (cifs_ses_exiting(ses))
+ continue;
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
i++;
seq_printf(m, "\n%d) %s", i, tcon->tree_name);
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 0c269396ae15..d41eedbff674 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -151,15 +151,12 @@ MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be "
"vers=1.0 (CIFS/SMB1) and vers=2.0 are weaker"
" and less secure. Default: n/N/0");
-extern mempool_t *cifs_sm_req_poolp;
-extern mempool_t *cifs_req_poolp;
-extern mempool_t *cifs_mid_poolp;
-
struct workqueue_struct *cifsiod_wq;
struct workqueue_struct *decrypt_wq;
struct workqueue_struct *fileinfo_put_wq;
struct workqueue_struct *cifsoplockd_wq;
struct workqueue_struct *deferredclose_wq;
+struct workqueue_struct *serverclose_wq;
__u32 cifs_lock_secret;
/*
@@ -673,6 +670,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",backupgid=%u",
from_kgid_munged(&init_user_ns,
cifs_sb->ctx->backupgid));
+ seq_show_option(s, "reparse",
+ cifs_reparse_type_str(cifs_sb->ctx->reparse_type));
seq_printf(s, ",rsize=%u", cifs_sb->ctx->rsize);
seq_printf(s, ",wsize=%u", cifs_sb->ctx->wsize);
@@ -1085,7 +1084,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
}
static int
-cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv)
+cifs_setlease(struct file *file, int arg, struct file_lease **lease, void **priv)
{
/*
* Note that this is called by vfs setlease with i_lock held to
@@ -1094,9 +1093,6 @@ cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv)
struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
- if (!(S_ISREG(inode->i_mode)))
- return -EINVAL;
-
/* Check if file is oplocked if this is request for new lease */
if (arg == F_UNLCK ||
((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
@@ -1667,7 +1663,7 @@ cifs_init_inodecache(void)
cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
sizeof(struct cifsInodeInfo),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
cifs_init_once);
if (cifs_inode_cachep == NULL)
return -ENOMEM;
@@ -1893,6 +1889,13 @@ init_cifs(void)
goto out_destroy_cifsoplockd_wq;
}
+ serverclose_wq = alloc_workqueue("serverclose",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ if (!serverclose_wq) {
+ rc = -ENOMEM;
+ goto out_destroy_serverclose_wq;
+ }
+
rc = cifs_init_inodecache();
if (rc)
goto out_destroy_deferredclose_wq;
@@ -1967,6 +1970,8 @@ out_destroy_decrypt_wq:
destroy_workqueue(decrypt_wq);
out_destroy_cifsiod_wq:
destroy_workqueue(cifsiod_wq);
+out_destroy_serverclose_wq:
+ destroy_workqueue(serverclose_wq);
out_clean_proc:
cifs_proc_clean();
return rc;
@@ -1996,6 +2001,7 @@ exit_cifs(void)
destroy_workqueue(cifsoplockd_wq);
destroy_workqueue(decrypt_wq);
destroy_workqueue(fileinfo_put_wq);
+ destroy_workqueue(serverclose_wq);
destroy_workqueue(cifsiod_wq);
cifs_proc_clean();
}
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 685f7d1139c6..ca55d01117c8 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -152,6 +152,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 47
-#define CIFS_VERSION "2.47"
+#define SMB3_PRODUCT_BUILD 48
+#define CIFS_VERSION "2.48"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 53c75cfb33ab..d6669ce4ae87 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -153,6 +153,24 @@ enum securityEnum {
Kerberos, /* Kerberos via SPNEGO */
};
+enum cifs_reparse_type {
+ CIFS_REPARSE_TYPE_NFS,
+ CIFS_REPARSE_TYPE_WSL,
+ CIFS_REPARSE_TYPE_DEFAULT = CIFS_REPARSE_TYPE_NFS,
+};
+
+static inline const char *cifs_reparse_type_str(enum cifs_reparse_type type)
+{
+ switch (type) {
+ case CIFS_REPARSE_TYPE_NFS:
+ return "nfs";
+ case CIFS_REPARSE_TYPE_WSL:
+ return "wsl";
+ default:
+ return "unknown";
+ }
+}
+
struct session_key {
unsigned int len;
char *response;
@@ -208,6 +226,10 @@ struct cifs_open_info_data {
struct reparse_posix_data *posix;
};
} reparse;
+ struct {
+ __u8 eas[SMB2_WSL_MAX_QUERY_EA_RESP_SIZE];
+ unsigned int eas_len;
+ } wsl;
char *symlink_target;
struct cifs_sid posix_owner;
struct cifs_sid posix_group;
@@ -217,19 +239,6 @@ struct cifs_open_info_data {
};
};
-static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data)
-{
- struct smb2_file_all_info *fi = &data->fi;
- u32 attrs = le32_to_cpu(fi->Attributes);
- bool ret;
-
- ret = data->reparse_point || (attrs & ATTR_REPARSE);
- if (ret)
- attrs |= ATTR_REPARSE;
- fi->Attributes = cpu_to_le32(attrs);
- return ret;
-}
-
/*
*****************************************************************
* Except the CIFS PDUs themselves all the
@@ -346,6 +355,9 @@ struct smb_version_operations {
/* informational QFS call */
void (*qfs_tcon)(const unsigned int, struct cifs_tcon *,
struct cifs_sb_info *);
+ /* query for server interfaces */
+ int (*query_server_interfaces)(const unsigned int, struct cifs_tcon *,
+ bool);
/* check if a path is accessible or not */
int (*is_path_accessible)(const unsigned int, struct cifs_tcon *,
struct cifs_sb_info *, const char *);
@@ -371,7 +383,8 @@ struct smb_version_operations {
struct cifs_open_info_data *data);
/* set size by path */
int (*set_path_size)(const unsigned int, struct cifs_tcon *,
- const char *, __u64, struct cifs_sb_info *, bool);
+ const char *, __u64, struct cifs_sb_info *, bool,
+ struct dentry *);
/* set size by file handle */
int (*set_file_size)(const unsigned int, struct cifs_tcon *,
struct cifsFileInfo *, __u64, bool);
@@ -401,7 +414,7 @@ struct smb_version_operations {
struct cifs_sb_info *);
/* unlink file */
int (*unlink)(const unsigned int, struct cifs_tcon *, const char *,
- struct cifs_sb_info *);
+ struct cifs_sb_info *, struct dentry *);
/* open, rename and delete file */
int (*rename_pending_delete)(const char *, struct dentry *,
const unsigned int);
@@ -429,10 +442,10 @@ struct smb_version_operations {
/* set fid protocol-specific info */
void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32);
/* close a file */
- void (*close)(const unsigned int, struct cifs_tcon *,
+ int (*close)(const unsigned int, struct cifs_tcon *,
struct cifs_fid *);
/* close a file, returning file attributes and timestamps */
- void (*close_getattr)(const unsigned int xid, struct cifs_tcon *tcon,
+ int (*close_getattr)(const unsigned int xid, struct cifs_tcon *tcon,
struct cifsFileInfo *pfile_info);
/* send a flush request to the server */
int (*flush)(const unsigned int, struct cifs_tcon *, struct cifs_fid *);
@@ -759,7 +772,11 @@ struct TCP_Server_Info {
unsigned int max_write;
unsigned int min_offload;
unsigned int retrans;
- __le16 compress_algorithm;
+ struct {
+ bool requested; /* "compress" mount option set*/
+ bool enabled; /* actually negotiated with server */
+ __le16 alg; /* preferred alg negotiated with server */
+ } compression;
__u16 signing_algorithm;
__le16 cipher_type;
/* save initital negprot hash */
@@ -1060,12 +1077,14 @@ struct cifs_ses {
and after mount option parsing we fill it */
char *domainName;
char *password;
+ char *password2; /* When key rotation used, new password may be set before it expires */
char workstation_name[CIFS_MAX_WORKSTATION_LEN];
struct session_key auth_key;
struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
enum securityEnum sectype; /* what security flavor was specified? */
bool sign; /* is signing required? */
bool domainAuto:1;
+ bool expired_pwd; /* track if access denied or expired pwd so can know if need to update */
unsigned int flags;
__u16 session_flags;
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
@@ -1263,7 +1282,6 @@ struct cifs_tcon {
struct cached_fids *cfids;
/* BB add field for back pointer to sb struct(s)? */
#ifdef CONFIG_CIFS_DFS_UPCALL
- struct list_head dfs_ses_list;
struct delayed_work dfs_cache_work;
#endif
struct delayed_work query_interfaces; /* query interfaces workqueue job */
@@ -1379,6 +1397,7 @@ struct cifs_open_parms {
umode_t mode;
bool reconnect:1;
bool replay:1; /* indicates that this open is for a replay */
+ struct kvec *ea_cctx;
};
struct cifs_fid {
@@ -1420,6 +1439,8 @@ struct cifsFileInfo {
bool invalidHandle:1; /* file closed via session abend */
bool swapfile:1;
bool oplock_break_cancelled:1;
+ bool status_file_deleted:1; /* file has been deleted */
+ bool offload:1; /* offload final part of _put to a wq */
unsigned int oplock_epoch; /* epoch from the lease break */
__u32 oplock_level; /* oplock/lease level from the lease break */
int count;
@@ -1428,6 +1449,7 @@ struct cifsFileInfo {
struct cifs_search_info srch_inf;
struct work_struct oplock_break; /* work for oplock breaks */
struct work_struct put; /* work for the final part of _put */
+ struct work_struct serverclose; /* work for serverclose */
struct delayed_work deferred;
bool deferred_close_scheduled; /* Flag to indicate close is scheduled */
char *symlink_target;
@@ -1784,7 +1806,6 @@ struct cifs_mount_ctx {
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
- struct list_head dfs_ses_list;
};
static inline void __free_dfs_info_param(struct dfs_info3_param *param)
@@ -2085,8 +2106,11 @@ extern struct workqueue_struct *decrypt_wq;
extern struct workqueue_struct *fileinfo_put_wq;
extern struct workqueue_struct *cifsoplockd_wq;
extern struct workqueue_struct *deferredclose_wq;
+extern struct workqueue_struct *serverclose_wq;
extern __u32 cifs_lock_secret;
+extern mempool_t *cifs_sm_req_poolp;
+extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
/* Operations for different SMB versions */
@@ -2277,6 +2301,17 @@ static inline void cifs_sg_set_buf(struct sg_table *sgtable,
}
}
+#define CIFS_OPARMS(_cifs_sb, _tcon, _path, _da, _cd, _co, _mode) \
+ ((struct cifs_open_parms) { \
+ .tcon = _tcon, \
+ .path = _path, \
+ .desired_access = (_da), \
+ .disposition = (_cd), \
+ .create_options = cifs_create_options(_cifs_sb, (_co)), \
+ .mode = (_mode), \
+ .cifs_sb = _cifs_sb, \
+ })
+
struct smb2_compound_vars {
struct cifs_open_parms oparms;
struct kvec rsp_iov[MAX_COMPOUND];
@@ -2288,6 +2323,17 @@ struct smb2_compound_vars {
struct kvec close_iov;
struct smb2_file_rename_info rename_info;
struct smb2_file_link_info link_info;
+ struct kvec ea_iov;
};
+static inline bool cifs_ses_exiting(struct cifs_ses *ses)
+{
+ bool ret;
+
+ spin_lock(&ses->ses_lock);
+ ret = ses->ses_status == SES_EXITING;
+ spin_unlock(&ses->ses_lock);
+ return ret;
+}
+
#endif /* _CIFS_GLOB_H */
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index a841bf4967fa..8e0a348f1f66 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -144,7 +144,8 @@ extern int cifs_reconnect(struct TCP_Server_Info *server,
extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr);
extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *);
extern bool backup_cred(struct cifs_sb_info *);
-extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
+extern bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 eof,
+ bool from_readdir);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, int);
@@ -201,17 +202,14 @@ extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
struct cifs_sb_info *cifs_sb);
extern void cifs_dir_info_to_fattr(struct cifs_fattr *, FILE_DIRECTORY_INFO *,
struct cifs_sb_info *);
-extern int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
+extern int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
+ bool from_readdir);
extern struct inode *cifs_iget(struct super_block *sb,
struct cifs_fattr *fattr);
int cifs_get_inode_info(struct inode **inode, const char *full_path,
struct cifs_open_info_data *data, struct super_block *sb, int xid,
const struct cifs_fid *fid);
-bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
- struct cifs_fattr *fattr,
- struct cifs_open_info_data *data);
-
extern int smb311_posix_get_inode_info(struct inode **inode,
const char *full_path,
struct cifs_open_info_data *data,
@@ -296,6 +294,10 @@ extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon);
extern void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon,
const char *path);
+
+extern void cifs_mark_open_handles_for_deleted_file(struct inode *inode,
+ const char *path);
+
extern struct TCP_Server_Info *
cifs_get_tcp_session(struct smb3_fs_context *ctx,
struct TCP_Server_Info *primary_server);
@@ -402,7 +404,8 @@ extern int CIFSSMBSetFileDisposition(const unsigned int xid,
__u32 pid_of_opener);
extern int CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
const char *file_name, __u64 size,
- struct cifs_sb_info *cifs_sb, bool set_allocation);
+ struct cifs_sb_info *cifs_sb, bool set_allocation,
+ struct dentry *dentry);
extern int CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
struct cifsFileInfo *cfile, __u64 size,
bool set_allocation);
@@ -438,7 +441,8 @@ extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon,
const struct nls_table *nls_codepage,
int remap_special_chars);
extern int CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon,
- const char *name, struct cifs_sb_info *cifs_sb);
+ const char *name, struct cifs_sb_info *cifs_sb,
+ struct dentry *dentry);
int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
struct dentry *source_dentry,
const char *from_name, const char *to_name,
@@ -721,31 +725,31 @@ struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon);
void cifs_put_tcon_super(struct super_block *sb);
int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry);
-/* Put references of @ses and @ses->dfs_root_ses */
+/* Put references of @ses and its children */
static inline void cifs_put_smb_ses(struct cifs_ses *ses)
{
- struct cifs_ses *rses = ses->dfs_root_ses;
+ struct cifs_ses *next;
- __cifs_put_smb_ses(ses);
- if (rses)
- __cifs_put_smb_ses(rses);
+ do {
+ next = ses->dfs_root_ses;
+ __cifs_put_smb_ses(ses);
+ } while ((ses = next));
}
-/* Get an active reference of @ses and @ses->dfs_root_ses.
+/* Get an active reference of @ses and its children.
*
* NOTE: make sure to call this function when incrementing reference count of
* @ses to ensure that any DFS root session attached to it (@ses->dfs_root_ses)
* will also get its reference count incremented.
*
- * cifs_put_smb_ses() will put both references, so call it when you're done.
+ * cifs_put_smb_ses() will put all references, so call it when you're done.
*/
static inline void cifs_smb_ses_inc_refcount(struct cifs_ses *ses)
{
lockdep_assert_held(&cifs_tcp_ses_lock);
- ses->ses_count++;
- if (ses->dfs_root_ses)
- ses->dfs_root_ses->ses_count++;
+ for (; ses; ses = ses->dfs_root_ses)
+ ses->ses_count++;
}
static inline bool dfs_src_pathname_equal(const char *s1, const char *s2)
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 01e89070df5a..23b5709ddc31 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -738,7 +738,7 @@ PsxDelete:
int
CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
- struct cifs_sb_info *cifs_sb)
+ struct cifs_sb_info *cifs_sb, struct dentry *dentry)
{
DELETE_FILE_REQ *pSMB = NULL;
DELETE_FILE_RSP *pSMBr = NULL;
@@ -2066,20 +2066,20 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
parm_data = (struct cifs_posix_lock *)
((char *)&pSMBr->hdr.Protocol + data_offset);
if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK))
- pLockData->fl_type = F_UNLCK;
+ pLockData->c.flc_type = F_UNLCK;
else {
if (parm_data->lock_type ==
cpu_to_le16(CIFS_RDLCK))
- pLockData->fl_type = F_RDLCK;
+ pLockData->c.flc_type = F_RDLCK;
else if (parm_data->lock_type ==
cpu_to_le16(CIFS_WRLCK))
- pLockData->fl_type = F_WRLCK;
+ pLockData->c.flc_type = F_WRLCK;
pLockData->fl_start = le64_to_cpu(parm_data->start);
pLockData->fl_end = pLockData->fl_start +
(le64_to_cpu(parm_data->length) ?
le64_to_cpu(parm_data->length) - 1 : 0);
- pLockData->fl_pid = -le32_to_cpu(parm_data->pid);
+ pLockData->c.flc_pid = -le32_to_cpu(parm_data->pid);
}
}
@@ -4993,7 +4993,7 @@ QFSPosixRetry:
int
CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
const char *file_name, __u64 size, struct cifs_sb_info *cifs_sb,
- bool set_allocation)
+ bool set_allocation, struct dentry *dentry)
{
struct smb_com_transaction2_spi_req *pSMB = NULL;
struct smb_com_transaction2_spi_rsp *pSMBr = NULL;
@@ -5854,10 +5854,8 @@ SetEARetry:
parm_data->list.EA_flags = 0;
/* we checked above that name len is less than 255 */
parm_data->list.name_len = (__u8)name_len;
- /* EA names are always ASCII */
- if (ea_name)
- strncpy(parm_data->list.name, ea_name, name_len);
- parm_data->list.name[name_len] = '\0';
+ /* EA names are always ASCII and NUL-terminated */
+ strscpy(parm_data->list.name, ea_name ?: "", name_len + 1);
parm_data->list.value_len = cpu_to_le16(ea_value_len);
/* caller ensures that ea_value_len is less than 64K but
we need to ensure that it fits within the smb */
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index ac9595504f4b..4e35970681bf 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -52,9 +52,6 @@
#include "fs_context.h"
#include "cifs_swn.h"
-extern mempool_t *cifs_req_poolp;
-extern bool disable_legacy_dialects;
-
/* FIXME: should these be tunable? */
#define TLINK_ERROR_EXPIRE (1 * HZ)
#define TLINK_IDLE_EXPIRE (600 * HZ)
@@ -123,12 +120,16 @@ static void smb2_query_server_interfaces(struct work_struct *work)
struct cifs_tcon *tcon = container_of(work,
struct cifs_tcon,
query_interfaces.work);
+ struct TCP_Server_Info *server = tcon->ses->server;
/*
* query server network interfaces, in case they change
*/
+ if (!server->ops->query_server_interfaces)
+ return;
+
xid = get_xid();
- rc = SMB3_request_interfaces(xid, tcon, false);
+ rc = server->ops->query_server_interfaces(xid, tcon, false);
free_xid(xid);
if (rc) {
@@ -174,6 +175,8 @@ cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ if (cifs_ses_exiting(ses))
+ continue;
spin_lock(&ses->chan_lock);
for (i = 0; i < ses->chan_count; i++) {
if (!ses->chans[i].server)
@@ -231,7 +234,13 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) {
- /* check if iface is still active */
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_EXITING) {
+ spin_unlock(&ses->ses_lock);
+ continue;
+ }
+ spin_unlock(&ses->ses_lock);
+
spin_lock(&ses->chan_lock);
if (cifs_ses_get_chan_index(ses, server) ==
CIFS_INVAL_CHAN_INDEX) {
@@ -1736,7 +1745,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
tcp_ses->channel_sequence_num = 0; /* only tracked for primary channel */
tcp_ses->reconnect_instance = 1;
tcp_ses->lstrp = jiffies;
- tcp_ses->compress_algorithm = cpu_to_le16(ctx->compression);
+ tcp_ses->compression.requested = ctx->compress;
spin_lock_init(&tcp_ses->req_lock);
spin_lock_init(&tcp_ses->srv_lock);
spin_lock_init(&tcp_ses->mid_lock);
@@ -1859,6 +1868,9 @@ static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx)
ctx->sectype != ses->sectype)
return 0;
+ if (ctx->dfs_root_ses != ses->dfs_root_ses)
+ return 0;
+
/*
* If an existing session is limited to less channels than
* requested, it should not be reused
@@ -1962,31 +1974,6 @@ out:
return rc;
}
-/**
- * cifs_free_ipc - helper to release the session IPC tcon
- * @ses: smb session to unmount the IPC from
- *
- * Needs to be called everytime a session is destroyed.
- *
- * On session close, the IPC is closed and the server must release all tcons of the session.
- * No need to send a tree disconnect here.
- *
- * Besides, it will make the server to not close durable and resilient files on session close, as
- * specified in MS-SMB2 3.3.5.6 Receiving an SMB2 LOGOFF Request.
- */
-static int
-cifs_free_ipc(struct cifs_ses *ses)
-{
- struct cifs_tcon *tcon = ses->tcon_ipc;
-
- if (tcon == NULL)
- return 0;
-
- tconInfoFree(tcon);
- ses->tcon_ipc = NULL;
- return 0;
-}
-
static struct cifs_ses *
cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
{
@@ -2018,48 +2005,52 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
void __cifs_put_smb_ses(struct cifs_ses *ses)
{
struct TCP_Server_Info *server = ses->server;
+ struct cifs_tcon *tcon;
unsigned int xid;
size_t i;
+ bool do_logoff;
int rc;
+ spin_lock(&cifs_tcp_ses_lock);
spin_lock(&ses->ses_lock);
- if (ses->ses_status == SES_EXITING) {
+ cifs_dbg(FYI, "%s: id=0x%llx ses_count=%d ses_status=%u ipc=%s\n",
+ __func__, ses->Suid, ses->ses_count, ses->ses_status,
+ ses->tcon_ipc ? ses->tcon_ipc->tree_name : "none");
+ if (ses->ses_status == SES_EXITING || --ses->ses_count > 0) {
spin_unlock(&ses->ses_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
return;
}
- spin_unlock(&ses->ses_lock);
+ /* ses_count can never go negative */
+ WARN_ON(ses->ses_count < 0);
- cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
- cifs_dbg(FYI,
- "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->tree_name : "NONE");
+ spin_lock(&ses->chan_lock);
+ cifs_chan_clear_need_reconnect(ses, server);
+ spin_unlock(&ses->chan_lock);
- spin_lock(&cifs_tcp_ses_lock);
- if (--ses->ses_count > 0) {
- spin_unlock(&cifs_tcp_ses_lock);
- return;
- }
- spin_lock(&ses->ses_lock);
- if (ses->ses_status == SES_GOOD)
- ses->ses_status = SES_EXITING;
+ do_logoff = ses->ses_status == SES_GOOD && server->ops->logoff;
+ ses->ses_status = SES_EXITING;
+ tcon = ses->tcon_ipc;
+ ses->tcon_ipc = NULL;
spin_unlock(&ses->ses_lock);
spin_unlock(&cifs_tcp_ses_lock);
- /* ses_count can never go negative */
- WARN_ON(ses->ses_count < 0);
-
- spin_lock(&ses->ses_lock);
- if (ses->ses_status == SES_EXITING && server->ops->logoff) {
- spin_unlock(&ses->ses_lock);
- cifs_free_ipc(ses);
+ /*
+ * On session close, the IPC is closed and the server must release all
+ * tcons of the session. No need to send a tree disconnect here.
+ *
+ * Besides, it will make the server to not close durable and resilient
+ * files on session close, as specified in MS-SMB2 3.3.5.6 Receiving an
+ * SMB2 LOGOFF Request.
+ */
+ tconInfoFree(tcon);
+ if (do_logoff) {
xid = get_xid();
rc = server->ops->logoff(xid, ses);
if (rc)
cifs_server_dbg(VFS, "%s: Session Logoff failure rc=%d\n",
__func__, rc);
_free_xid(xid);
- } else {
- spin_unlock(&ses->ses_lock);
- cifs_free_ipc(ses);
}
spin_lock(&cifs_tcp_ses_lock);
@@ -2192,6 +2183,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses)
}
++delim;
+ /* BB consider adding support for password2 (Key Rotation) for multiuser in future */
ctx->password = kstrndup(delim, len, GFP_KERNEL);
if (!ctx->password) {
cifs_dbg(FYI, "Unable to allocate %zd bytes for password\n",
@@ -2215,6 +2207,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses)
kfree(ctx->username);
ctx->username = NULL;
kfree_sensitive(ctx->password);
+ /* no need to free ctx->password2 since not allocated in this path */
ctx->password = NULL;
goto out_key_put;
}
@@ -2326,6 +2319,12 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
if (!ses->password)
goto get_ses_fail;
}
+ /* ctx->password freed at unmount */
+ if (ctx->password2) {
+ ses->password2 = kstrdup(ctx->password2, GFP_KERNEL);
+ if (!ses->password2)
+ goto get_ses_fail;
+ }
if (ctx->domainname) {
ses->domainName = kstrdup(ctx->domainname, GFP_KERNEL);
if (!ses->domainName)
@@ -2372,9 +2371,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
* need to lock before changing something in the session.
*/
spin_lock(&cifs_tcp_ses_lock);
+ if (ctx->dfs_root_ses)
+ cifs_smb_ses_inc_refcount(ctx->dfs_root_ses);
ses->dfs_root_ses = ctx->dfs_root_ses;
- if (ses->dfs_root_ses)
- ses->dfs_root_ses->ses_count++;
list_add(&ses->smb_ses_list, &server->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
@@ -2803,6 +2802,8 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
return 0;
if (old->ctx->closetimeo != new->ctx->closetimeo)
return 0;
+ if (old->ctx->reparse_type != new->ctx->reparse_type)
+ return 0;
return 1;
}
@@ -3323,6 +3324,9 @@ void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx)
cifs_put_smb_ses(mnt_ctx->ses);
else if (mnt_ctx->server)
cifs_put_tcp_session(mnt_ctx->server, 0);
+ mnt_ctx->ses = NULL;
+ mnt_ctx->tcon = NULL;
+ mnt_ctx->server = NULL;
mnt_ctx->cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS;
free_xid(mnt_ctx->xid);
}
@@ -3601,8 +3605,6 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
bool isdfs;
int rc;
- INIT_LIST_HEAD(&mnt_ctx.dfs_ses_list);
-
rc = dfs_mount_share(&mnt_ctx, &isdfs);
if (rc)
goto error;
@@ -3633,7 +3635,6 @@ out:
return rc;
error:
- dfs_put_root_smb_sessions(&mnt_ctx.dfs_ses_list);
cifs_mount_put_conns(&mnt_ctx);
return rc;
}
@@ -3648,6 +3649,18 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
goto error;
rc = cifs_mount_get_tcon(&mnt_ctx);
+ if (!rc) {
+ /*
+ * Prevent superblock from being created with any missing
+ * connections.
+ */
+ if (WARN_ON(!mnt_ctx.server))
+ rc = -EHOSTDOWN;
+ else if (WARN_ON(!mnt_ctx.ses))
+ rc = -EACCES;
+ else if (WARN_ON(!mnt_ctx.tcon))
+ rc = -ENOENT;
+ }
if (rc)
goto error;
@@ -3985,13 +3998,14 @@ cifs_set_vol_auth(struct smb3_fs_context *ctx, struct cifs_ses *ses)
}
static struct cifs_tcon *
-cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
+__cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
{
int rc;
struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
struct cifs_ses *ses;
struct cifs_tcon *tcon = NULL;
struct smb3_fs_context *ctx;
+ char *origin_fullpath = NULL;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (ctx == NULL)
@@ -4015,6 +4029,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
ctx->sign = master_tcon->ses->sign;
ctx->seal = master_tcon->seal;
ctx->witness = master_tcon->use_witness;
+ ctx->dfs_root_ses = master_tcon->ses->dfs_root_ses;
rc = cifs_set_vol_auth(ctx, master_tcon->ses);
if (rc) {
@@ -4034,12 +4049,39 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
goto out;
}
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ spin_lock(&master_tcon->tc_lock);
+ if (master_tcon->origin_fullpath) {
+ spin_unlock(&master_tcon->tc_lock);
+ origin_fullpath = dfs_get_path(cifs_sb, cifs_sb->ctx->source);
+ if (IS_ERR(origin_fullpath)) {
+ tcon = ERR_CAST(origin_fullpath);
+ origin_fullpath = NULL;
+ cifs_put_smb_ses(ses);
+ goto out;
+ }
+ } else {
+ spin_unlock(&master_tcon->tc_lock);
+ }
+#endif
+
tcon = cifs_get_tcon(ses, ctx);
if (IS_ERR(tcon)) {
cifs_put_smb_ses(ses);
goto out;
}
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ if (origin_fullpath) {
+ spin_lock(&tcon->tc_lock);
+ tcon->origin_fullpath = origin_fullpath;
+ spin_unlock(&tcon->tc_lock);
+ origin_fullpath = NULL;
+ queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work,
+ dfs_cache_get_ttl() * HZ);
+ }
+#endif
+
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(ses))
reset_cifs_unix_caps(0, tcon, NULL, ctx);
@@ -4048,11 +4090,23 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
out:
kfree(ctx->username);
kfree_sensitive(ctx->password);
+ kfree(origin_fullpath);
kfree(ctx);
return tcon;
}
+static struct cifs_tcon *
+cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
+{
+ struct cifs_tcon *ret;
+
+ cifs_mount_lock();
+ ret = __cifs_construct_tcon(cifs_sb, fsuid);
+ cifs_mount_unlock();
+ return ret;
+}
+
struct cifs_tcon *
cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
{
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index 449c59830039..3ec965547e3d 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -66,33 +66,20 @@ static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path)
}
/*
- * Track individual DFS referral servers used by new DFS mount.
- *
- * On success, their lifetime will be shared by final tcon (dfs_ses_list).
- * Otherwise, they will be put by dfs_put_root_smb_sessions() in cifs_mount().
+ * Get an active reference of @ses so that next call to cifs_put_tcon() won't
+ * release it as any new DFS referrals must go through its IPC tcon.
*/
-static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx)
+static void add_root_smb_session(struct cifs_mount_ctx *mnt_ctx)
{
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- struct dfs_root_ses *root_ses;
struct cifs_ses *ses = mnt_ctx->ses;
if (ses) {
- root_ses = kmalloc(sizeof(*root_ses), GFP_KERNEL);
- if (!root_ses)
- return -ENOMEM;
-
- INIT_LIST_HEAD(&root_ses->list);
-
spin_lock(&cifs_tcp_ses_lock);
cifs_smb_ses_inc_refcount(ses);
spin_unlock(&cifs_tcp_ses_lock);
- root_ses->ses = ses;
- list_add_tail(&root_ses->list, &mnt_ctx->dfs_ses_list);
}
- /* Select new DFS referral server so that new referrals go through it */
ctx->dfs_root_ses = ses;
- return 0;
}
static inline int parse_dfs_target(struct smb3_fs_context *ctx,
@@ -185,11 +172,8 @@ again:
continue;
}
- if (is_refsrv) {
- rc = add_root_smb_session(mnt_ctx);
- if (rc)
- goto out;
- }
+ if (is_refsrv)
+ add_root_smb_session(mnt_ctx);
rc = ref_walk_advance(rw);
if (!rc) {
@@ -232,6 +216,7 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx)
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
struct cifs_tcon *tcon;
char *origin_fullpath;
+ bool new_tcon = true;
int rc;
origin_fullpath = dfs_get_path(cifs_sb, ctx->source);
@@ -239,6 +224,18 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx)
return PTR_ERR(origin_fullpath);
rc = dfs_referral_walk(mnt_ctx);
+ if (!rc) {
+ /*
+ * Prevent superblock from being created with any missing
+ * connections.
+ */
+ if (WARN_ON(!mnt_ctx->server))
+ rc = -EHOSTDOWN;
+ else if (WARN_ON(!mnt_ctx->ses))
+ rc = -EACCES;
+ else if (WARN_ON(!mnt_ctx->tcon))
+ rc = -ENOENT;
+ }
if (rc)
goto out;
@@ -247,15 +244,14 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx)
if (!tcon->origin_fullpath) {
tcon->origin_fullpath = origin_fullpath;
origin_fullpath = NULL;
+ } else {
+ new_tcon = false;
}
spin_unlock(&tcon->tc_lock);
- if (list_empty(&tcon->dfs_ses_list)) {
- list_replace_init(&mnt_ctx->dfs_ses_list, &tcon->dfs_ses_list);
+ if (new_tcon) {
queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work,
dfs_cache_get_ttl() * HZ);
- } else {
- dfs_put_root_smb_sessions(&mnt_ctx->dfs_ses_list);
}
out:
@@ -298,7 +294,6 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs)
if (rc)
return rc;
- ctx->dfs_root_ses = mnt_ctx->ses;
/*
* If called with 'nodfs' mount option, then skip DFS resolving. Otherwise unconditionally
* try to get an DFS referral (even cached) to determine whether it is an DFS mount.
@@ -324,7 +319,9 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs)
*isdfs = true;
add_root_smb_session(mnt_ctx);
- return __dfs_mount_share(mnt_ctx);
+ rc = __dfs_mount_share(mnt_ctx);
+ dfs_put_root_smb_sessions(mnt_ctx);
+ return rc;
}
/* Update dfs referral path of superblock */
diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h
index 875ab7ae57fc..e5c4dcf83750 100644
--- a/fs/smb/client/dfs.h
+++ b/fs/smb/client/dfs.h
@@ -7,7 +7,9 @@
#define _CIFS_DFS_H
#include "cifsglob.h"
+#include "cifsproto.h"
#include "fs_context.h"
+#include "dfs_cache.h"
#include "cifs_unicode.h"
#include <linux/namei.h>
@@ -114,11 +116,6 @@ static inline void ref_walk_set_tgt_hint(struct dfs_ref_walk *rw)
ref_walk_tit(rw));
}
-struct dfs_root_ses {
- struct list_head list;
- struct cifs_ses *ses;
-};
-
int dfs_parse_target_referral(const char *full_path, const struct dfs_info3_param *ref,
struct smb3_fs_context *ctx);
int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs);
@@ -133,20 +130,32 @@ static inline int dfs_get_referral(struct cifs_mount_ctx *mnt_ctx, const char *p
{
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+ struct cifs_ses *rses = ctx->dfs_root_ses ?: mnt_ctx->ses;
- return dfs_cache_find(mnt_ctx->xid, ctx->dfs_root_ses, cifs_sb->local_nls,
+ return dfs_cache_find(mnt_ctx->xid, rses, cifs_sb->local_nls,
cifs_remap(cifs_sb), path, ref, tl);
}
-static inline void dfs_put_root_smb_sessions(struct list_head *head)
+/*
+ * cifs_get_smb_ses() already guarantees an active reference of
+ * @ses->dfs_root_ses when a new session is created, so we need to put extra
+ * references of all DFS root sessions that were used across the mount process
+ * in dfs_mount_share().
+ */
+static inline void dfs_put_root_smb_sessions(struct cifs_mount_ctx *mnt_ctx)
{
- struct dfs_root_ses *root, *tmp;
+ const struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ struct cifs_ses *ses = ctx->dfs_root_ses;
+ struct cifs_ses *cur;
+
+ if (!ses)
+ return;
- list_for_each_entry_safe(root, tmp, head, list) {
- list_del_init(&root->list);
- cifs_put_smb_ses(root->ses);
- kfree(root);
+ for (cur = ses; cur; cur = cur->dfs_root_ses) {
+ if (cur->dfs_root_ses)
+ cifs_put_smb_ses(cur->dfs_root_ses);
}
+ cifs_put_smb_ses(ses);
}
#endif /* _CIFS_DFS_H */
diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c
index 508d831fabe3..11c8efecf7aa 100644
--- a/fs/smb/client/dfs_cache.c
+++ b/fs/smb/client/dfs_cache.c
@@ -1172,8 +1172,8 @@ static bool is_ses_good(struct cifs_ses *ses)
return ret;
}
-/* Refresh dfs referral of tcon and mark it for reconnect if needed */
-static int __refresh_tcon(const char *path, struct cifs_ses *ses, bool force_refresh)
+/* Refresh dfs referral of @ses and mark it for reconnect if needed */
+static void __refresh_ses_referral(struct cifs_ses *ses, bool force_refresh)
{
struct TCP_Server_Info *server = ses->server;
DFS_CACHE_TGT_LIST(old_tl);
@@ -1181,10 +1181,21 @@ static int __refresh_tcon(const char *path, struct cifs_ses *ses, bool force_ref
bool needs_refresh = false;
struct cache_entry *ce;
unsigned int xid;
+ char *path = NULL;
int rc = 0;
xid = get_xid();
+ mutex_lock(&server->refpath_lock);
+ if (server->leaf_fullpath) {
+ path = kstrdup(server->leaf_fullpath + 1, GFP_ATOMIC);
+ if (!path)
+ rc = -ENOMEM;
+ }
+ mutex_unlock(&server->refpath_lock);
+ if (!path)
+ goto out;
+
down_read(&htable_rw_lock);
ce = lookup_cache_entry(path);
needs_refresh = force_refresh || IS_ERR(ce) || cache_entry_expired(ce);
@@ -1218,19 +1229,17 @@ out:
free_xid(xid);
dfs_cache_free_tgts(&old_tl);
dfs_cache_free_tgts(&new_tl);
- return rc;
+ kfree(path);
}
-static int refresh_tcon(struct cifs_tcon *tcon, bool force_refresh)
+static inline void refresh_ses_referral(struct cifs_ses *ses)
{
- struct TCP_Server_Info *server = tcon->ses->server;
- struct cifs_ses *ses = tcon->ses;
+ __refresh_ses_referral(ses, false);
+}
- mutex_lock(&server->refpath_lock);
- if (server->leaf_fullpath)
- __refresh_tcon(server->leaf_fullpath + 1, ses, force_refresh);
- mutex_unlock(&server->refpath_lock);
- return 0;
+static inline void force_refresh_ses_referral(struct cifs_ses *ses)
+{
+ __refresh_ses_referral(ses, true);
}
/**
@@ -1271,34 +1280,20 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb)
*/
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
- return refresh_tcon(tcon, true);
+ force_refresh_ses_referral(tcon->ses);
+ return 0;
}
/* Refresh all DFS referrals related to DFS tcon */
void dfs_cache_refresh(struct work_struct *work)
{
- struct TCP_Server_Info *server;
- struct dfs_root_ses *rses;
struct cifs_tcon *tcon;
struct cifs_ses *ses;
tcon = container_of(work, struct cifs_tcon, dfs_cache_work.work);
- ses = tcon->ses;
- server = ses->server;
- mutex_lock(&server->refpath_lock);
- if (server->leaf_fullpath)
- __refresh_tcon(server->leaf_fullpath + 1, ses, false);
- mutex_unlock(&server->refpath_lock);
-
- list_for_each_entry(rses, &tcon->dfs_ses_list, list) {
- ses = rses->ses;
- server = ses->server;
- mutex_lock(&server->refpath_lock);
- if (server->leaf_fullpath)
- __refresh_tcon(server->leaf_fullpath + 1, ses, false);
- mutex_unlock(&server->refpath_lock);
- }
+ for (ses = tcon->ses; ses; ses = ses->dfs_root_ses)
+ refresh_ses_referral(ses);
queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work,
atomic_read(&dfs_cache_ttl) * HZ);
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index 89333d9bce36..864b194dbaa0 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -189,6 +189,7 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
int disposition;
struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_open_parms oparms;
+ int rdwr_for_fscache = 0;
*oplock = 0;
if (tcon->ses->server->oplocks)
@@ -200,6 +201,10 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
return PTR_ERR(full_path);
}
+ /* If we're caching, we need to be able to fill in around partial writes. */
+ if (cifs_fscache_enabled(inode) && (oflags & O_ACCMODE) == O_WRONLY)
+ rdwr_for_fscache = 1;
+
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open &&
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
@@ -276,6 +281,8 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
desired_access |= GENERIC_READ; /* is this too little? */
if (OPEN_FMODE(oflags) & FMODE_WRITE)
desired_access |= GENERIC_WRITE;
+ if (rdwr_for_fscache == 1)
+ desired_access |= GENERIC_READ;
disposition = FILE_OVERWRITE_IF;
if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
@@ -304,6 +311,7 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
create_options |= CREATE_OPTION_READONLY;
+retry_open:
oparms = (struct cifs_open_parms) {
.tcon = tcon,
.cifs_sb = cifs_sb,
@@ -317,8 +325,15 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
rc = server->ops->open(xid, &oparms, oplock, buf);
if (rc) {
cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
+ if (rc == -EACCES && rdwr_for_fscache == 1) {
+ desired_access &= ~GENERIC_READ;
+ rdwr_for_fscache = 2;
+ goto retry_open;
+ }
goto out;
}
+ if (rdwr_for_fscache == 2)
+ cifs_invalidate_cache(inode, FSCACHE_INVAL_DIO_WRITE);
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/*
@@ -612,11 +627,18 @@ int cifs_mknod(struct mnt_idmap *idmap, struct inode *inode,
goto mknod_out;
}
+ trace_smb3_mknod_enter(xid, tcon->ses->Suid, tcon->tid, full_path);
+
rc = tcon->ses->server->ops->make_node(xid, inode, direntry, tcon,
full_path, mode,
device_number);
mknod_out:
+ if (rc)
+ trace_smb3_mknod_err(xid, tcon->ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_mknod_done(xid, tcon->ses->Suid, tcon->tid);
+
free_dentry_path(page);
free_xid(xid);
cifs_put_tlink(tlink);
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index f391c9b803d8..9be37d0fe724 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -206,12 +206,12 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
*/
}
-static inline int cifs_convert_flags(unsigned int flags)
+static inline int cifs_convert_flags(unsigned int flags, int rdwr_for_fscache)
{
if ((flags & O_ACCMODE) == O_RDONLY)
return GENERIC_READ;
else if ((flags & O_ACCMODE) == O_WRONLY)
- return GENERIC_WRITE;
+ return rdwr_for_fscache == 1 ? (GENERIC_READ | GENERIC_WRITE) : GENERIC_WRITE;
else if ((flags & O_ACCMODE) == O_RDWR) {
/* GENERIC_ALL is too much permission to request
can cause unnecessary access denied on create */
@@ -329,7 +329,7 @@ int cifs_posix_open(const char *full_path, struct inode **pinode,
}
} else {
cifs_revalidate_mapping(*pinode);
- rc = cifs_fattr_to_inode(*pinode, &fattr);
+ rc = cifs_fattr_to_inode(*pinode, &fattr, false);
}
posix_open_ret:
@@ -348,11 +348,16 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_
int create_options = CREATE_NOT_DIR;
struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_open_parms oparms;
+ int rdwr_for_fscache = 0;
if (!server->ops->open)
return -ENOSYS;
- desired_access = cifs_convert_flags(f_flags);
+ /* If we're caching, we need to be able to fill in around partial writes. */
+ if (cifs_fscache_enabled(inode) && (f_flags & O_ACCMODE) == O_WRONLY)
+ rdwr_for_fscache = 1;
+
+ desired_access = cifs_convert_flags(f_flags, rdwr_for_fscache);
/*********************************************************************
* open flag mapping table:
@@ -389,6 +394,7 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_
if (f_flags & O_DIRECT)
create_options |= CREATE_NO_BUFFER;
+retry_open:
oparms = (struct cifs_open_parms) {
.tcon = tcon,
.cifs_sb = cifs_sb,
@@ -400,8 +406,16 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_
};
rc = server->ops->open(xid, &oparms, oplock, buf);
- if (rc)
+ if (rc) {
+ if (rc == -EACCES && rdwr_for_fscache == 1) {
+ desired_access = cifs_convert_flags(f_flags, 0);
+ rdwr_for_fscache = 2;
+ goto retry_open;
+ }
return rc;
+ }
+ if (rdwr_for_fscache == 2)
+ cifs_invalidate_cache(inode, FSCACHE_INVAL_DIO_WRITE);
/* TODO: Add support for calling posix query info but with passing in fid */
if (tcon->unix_ext)
@@ -445,6 +459,7 @@ cifs_down_write(struct rw_semaphore *sem)
}
static void cifsFileInfo_put_work(struct work_struct *work);
+void serverclose_work(struct work_struct *work);
struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
struct tcon_link *tlink, __u32 oplock,
@@ -491,6 +506,7 @@ struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
cfile->tlink = cifs_get_tlink(tlink);
INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
INIT_WORK(&cfile->put, cifsFileInfo_put_work);
+ INIT_WORK(&cfile->serverclose, serverclose_work);
INIT_DELAYED_WORK(&cfile->deferred, smb2_deferred_work_close);
mutex_init(&cfile->fh_mutex);
spin_lock_init(&cfile->file_info_lock);
@@ -582,6 +598,40 @@ static void cifsFileInfo_put_work(struct work_struct *work)
cifsFileInfo_put_final(cifs_file);
}
+void serverclose_work(struct work_struct *work)
+{
+ struct cifsFileInfo *cifs_file = container_of(work,
+ struct cifsFileInfo, serverclose);
+
+ struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
+
+ struct TCP_Server_Info *server = tcon->ses->server;
+ int rc = 0;
+ int retries = 0;
+ int MAX_RETRIES = 4;
+
+ do {
+ if (server->ops->close_getattr)
+ rc = server->ops->close_getattr(0, tcon, cifs_file);
+ else if (server->ops->close)
+ rc = server->ops->close(0, tcon, &cifs_file->fid);
+
+ if (rc == -EBUSY || rc == -EAGAIN) {
+ retries++;
+ msleep(250);
+ }
+ } while ((rc == -EBUSY || rc == -EAGAIN) && (retries < MAX_RETRIES)
+ );
+
+ if (retries == MAX_RETRIES)
+ pr_warn("Serverclose failed %d times, giving up\n", MAX_RETRIES);
+
+ if (cifs_file->offload)
+ queue_work(fileinfo_put_wq, &cifs_file->put);
+ else
+ cifsFileInfo_put_final(cifs_file);
+}
+
/**
* cifsFileInfo_put - release a reference of file priv data
*
@@ -622,10 +672,13 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file,
struct cifs_fid fid = {};
struct cifs_pending_open open;
bool oplock_break_cancelled;
+ bool serverclose_offloaded = false;
spin_lock(&tcon->open_file_lock);
spin_lock(&cifsi->open_file_lock);
spin_lock(&cifs_file->file_info_lock);
+
+ cifs_file->offload = offload;
if (--cifs_file->count > 0) {
spin_unlock(&cifs_file->file_info_lock);
spin_unlock(&cifsi->open_file_lock);
@@ -667,13 +720,20 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file,
if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
struct TCP_Server_Info *server = tcon->ses->server;
unsigned int xid;
+ int rc = 0;
xid = get_xid();
if (server->ops->close_getattr)
- server->ops->close_getattr(xid, tcon, cifs_file);
+ rc = server->ops->close_getattr(xid, tcon, cifs_file);
else if (server->ops->close)
- server->ops->close(xid, tcon, &cifs_file->fid);
+ rc = server->ops->close(xid, tcon, &cifs_file->fid);
_free_xid(xid);
+
+ if (rc == -EBUSY || rc == -EAGAIN) {
+ // Server close failed, hence offloading it as an async op
+ queue_work(serverclose_wq, &cifs_file->serverclose);
+ serverclose_offloaded = true;
+ }
}
if (oplock_break_cancelled)
@@ -681,10 +741,15 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file,
cifs_del_pending_open(&open);
- if (offload)
- queue_work(fileinfo_put_wq, &cifs_file->put);
- else
- cifsFileInfo_put_final(cifs_file);
+ // if serverclose has been offloaded to wq (on failure), it will
+ // handle offloading put as well. If serverclose not offloaded,
+ // we need to handle offloading put here.
+ if (!serverclose_offloaded) {
+ if (offload)
+ queue_work(fileinfo_put_wq, &cifs_file->put);
+ else
+ cifsFileInfo_put_final(cifs_file);
+ }
}
int cifs_open(struct inode *inode, struct file *file)
@@ -834,11 +899,11 @@ int cifs_open(struct inode *inode, struct file *file)
use_cache:
fscache_use_cookie(cifs_inode_cookie(file_inode(file)),
file->f_mode & FMODE_WRITE);
- if (file->f_flags & O_DIRECT &&
- (!((file->f_flags & O_ACCMODE) != O_RDONLY) ||
- file->f_flags & O_APPEND))
- cifs_invalidate_cache(file_inode(file),
- FSCACHE_INVAL_DIO_WRITE);
+ if (!(file->f_flags & O_DIRECT))
+ goto out;
+ if ((file->f_flags & (O_ACCMODE | O_APPEND)) == O_RDONLY)
+ goto out;
+ cifs_invalidate_cache(file_inode(file), FSCACHE_INVAL_DIO_WRITE);
out:
free_dentry_path(page);
@@ -903,6 +968,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
int disposition = FILE_OPEN;
int create_options = CREATE_NOT_DIR;
struct cifs_open_parms oparms;
+ int rdwr_for_fscache = 0;
xid = get_xid();
mutex_lock(&cfile->fh_mutex);
@@ -966,7 +1032,11 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
- desired_access = cifs_convert_flags(cfile->f_flags);
+ /* If we're caching, we need to be able to fill in around partial writes. */
+ if (cifs_fscache_enabled(inode) && (cfile->f_flags & O_ACCMODE) == O_WRONLY)
+ rdwr_for_fscache = 1;
+
+ desired_access = cifs_convert_flags(cfile->f_flags, rdwr_for_fscache);
/* O_SYNC also has bit for O_DSYNC so following check picks up either */
if (cfile->f_flags & O_SYNC)
@@ -978,6 +1048,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
if (server->ops->get_lease_key)
server->ops->get_lease_key(inode, &cfile->fid);
+retry_open:
oparms = (struct cifs_open_parms) {
.tcon = tcon,
.cifs_sb = cifs_sb,
@@ -1003,6 +1074,11 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
/* indicate that we need to relock the file */
oparms.reconnect = true;
}
+ if (rc == -EACCES && rdwr_for_fscache == 1) {
+ desired_access = cifs_convert_flags(cfile->f_flags, 0);
+ rdwr_for_fscache = 2;
+ goto retry_open;
+ }
if (rc) {
mutex_unlock(&cfile->fh_mutex);
@@ -1011,6 +1087,9 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
goto reopen_error_exit;
}
+ if (rdwr_for_fscache == 2)
+ cifs_invalidate_cache(inode, FSCACHE_INVAL_DIO_WRITE);
+
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
reopen_success:
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
@@ -1072,6 +1151,19 @@ void smb2_deferred_work_close(struct work_struct *work)
_cifsFileInfo_put(cfile, true, false);
}
+static bool
+smb2_can_defer_close(struct inode *inode, struct cifs_deferred_close *dclose)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+
+ return (cifs_sb->ctx->closetimeo && cinode->lease_granted && dclose &&
+ (cinode->oplock == CIFS_CACHE_RHW_FLG ||
+ cinode->oplock == CIFS_CACHE_RH_FLG) &&
+ !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags));
+
+}
+
int cifs_close(struct inode *inode, struct file *file)
{
struct cifsFileInfo *cfile;
@@ -1085,10 +1177,8 @@ int cifs_close(struct inode *inode, struct file *file)
cfile = file->private_data;
file->private_data = NULL;
dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL);
- if ((cifs_sb->ctx->closetimeo && cinode->oplock == CIFS_CACHE_RHW_FLG)
- && cinode->lease_granted &&
- !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) &&
- dclose) {
+ if ((cfile->status_file_deleted == false) &&
+ (smb2_can_defer_close(inode, dclose))) {
if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
inode_set_mtime_to_ts(inode,
inode_set_ctime_current(inode));
@@ -1315,20 +1405,20 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
down_read(&cinode->lock_sem);
exist = cifs_find_lock_conflict(cfile, offset, length, type,
- flock->fl_flags, &conf_lock,
+ flock->c.flc_flags, &conf_lock,
CIFS_LOCK_OP);
if (exist) {
flock->fl_start = conf_lock->offset;
flock->fl_end = conf_lock->offset + conf_lock->length - 1;
- flock->fl_pid = conf_lock->pid;
+ flock->c.flc_pid = conf_lock->pid;
if (conf_lock->type & server->vals->shared_lock_type)
- flock->fl_type = F_RDLCK;
+ flock->c.flc_type = F_RDLCK;
else
- flock->fl_type = F_WRLCK;
+ flock->c.flc_type = F_WRLCK;
} else if (!cinode->can_cache_brlcks)
rc = 1;
else
- flock->fl_type = F_UNLCK;
+ flock->c.flc_type = F_UNLCK;
up_read(&cinode->lock_sem);
return rc;
@@ -1404,16 +1494,16 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
{
int rc = 0;
struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
- unsigned char saved_type = flock->fl_type;
+ unsigned char saved_type = flock->c.flc_type;
- if ((flock->fl_flags & FL_POSIX) == 0)
+ if ((flock->c.flc_flags & FL_POSIX) == 0)
return 1;
down_read(&cinode->lock_sem);
posix_test_lock(file, flock);
- if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) {
- flock->fl_type = saved_type;
+ if (lock_is_unlock(flock) && !cinode->can_cache_brlcks) {
+ flock->c.flc_type = saved_type;
rc = 1;
}
@@ -1434,7 +1524,7 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock)
struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
int rc = FILE_LOCK_DEFERRED + 1;
- if ((flock->fl_flags & FL_POSIX) == 0)
+ if ((flock->c.flc_flags & FL_POSIX) == 0)
return rc;
cifs_down_write(&cinode->lock_sem);
@@ -1584,7 +1674,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
el = locks_to_send.next;
spin_lock(&flctx->flc_lock);
- list_for_each_entry(flock, &flctx->flc_posix, fl_list) {
+ for_each_file_lock(flock, &flctx->flc_posix) {
+ unsigned char ftype = flock->c.flc_type;
+
if (el == &locks_to_send) {
/*
* The list ended. We don't have enough allocated
@@ -1594,12 +1686,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
break;
}
length = cifs_flock_len(flock);
- if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
+ if (ftype == F_RDLCK || ftype == F_SHLCK)
type = CIFS_RDLCK;
else
type = CIFS_WRLCK;
lck = list_entry(el, struct lock_to_push, llist);
- lck->pid = hash_lockowner(flock->fl_owner);
+ lck->pid = hash_lockowner(flock->c.flc_owner);
lck->netfid = cfile->fid.netfid;
lck->length = length;
lck->type = type;
@@ -1666,42 +1758,43 @@ static void
cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock,
bool *wait_flag, struct TCP_Server_Info *server)
{
- if (flock->fl_flags & FL_POSIX)
+ if (flock->c.flc_flags & FL_POSIX)
cifs_dbg(FYI, "Posix\n");
- if (flock->fl_flags & FL_FLOCK)
+ if (flock->c.flc_flags & FL_FLOCK)
cifs_dbg(FYI, "Flock\n");
- if (flock->fl_flags & FL_SLEEP) {
+ if (flock->c.flc_flags & FL_SLEEP) {
cifs_dbg(FYI, "Blocking lock\n");
*wait_flag = true;
}
- if (flock->fl_flags & FL_ACCESS)
+ if (flock->c.flc_flags & FL_ACCESS)
cifs_dbg(FYI, "Process suspended by mandatory locking - not implemented yet\n");
- if (flock->fl_flags & FL_LEASE)
+ if (flock->c.flc_flags & FL_LEASE)
cifs_dbg(FYI, "Lease on file - not implemented yet\n");
- if (flock->fl_flags &
+ if (flock->c.flc_flags &
(~(FL_POSIX | FL_FLOCK | FL_SLEEP |
FL_ACCESS | FL_LEASE | FL_CLOSE | FL_OFDLCK)))
- cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags);
+ cifs_dbg(FYI, "Unknown lock flags 0x%x\n",
+ flock->c.flc_flags);
*type = server->vals->large_lock_type;
- if (flock->fl_type == F_WRLCK) {
+ if (lock_is_write(flock)) {
cifs_dbg(FYI, "F_WRLCK\n");
*type |= server->vals->exclusive_lock_type;
*lock = 1;
- } else if (flock->fl_type == F_UNLCK) {
+ } else if (lock_is_unlock(flock)) {
cifs_dbg(FYI, "F_UNLCK\n");
*type |= server->vals->unlock_lock_type;
*unlock = 1;
/* Check if unlock includes more than one lock range */
- } else if (flock->fl_type == F_RDLCK) {
+ } else if (lock_is_read(flock)) {
cifs_dbg(FYI, "F_RDLCK\n");
*type |= server->vals->shared_lock_type;
*lock = 1;
- } else if (flock->fl_type == F_EXLCK) {
+ } else if (flock->c.flc_type == F_EXLCK) {
cifs_dbg(FYI, "F_EXLCK\n");
*type |= server->vals->exclusive_lock_type;
*lock = 1;
- } else if (flock->fl_type == F_SHLCK) {
+ } else if (flock->c.flc_type == F_SHLCK) {
cifs_dbg(FYI, "F_SHLCK\n");
*type |= server->vals->shared_lock_type;
*lock = 1;
@@ -1733,7 +1826,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
else
posix_lock_type = CIFS_WRLCK;
rc = CIFSSMBPosixLock(xid, tcon, netfid,
- hash_lockowner(flock->fl_owner),
+ hash_lockowner(flock->c.flc_owner),
flock->fl_start, length, flock,
posix_lock_type, wait_flag);
return rc;
@@ -1750,7 +1843,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
if (rc == 0) {
rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
type, 0, 1, false);
- flock->fl_type = F_UNLCK;
+ flock->c.flc_type = F_UNLCK;
if (rc != 0)
cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n",
rc);
@@ -1758,7 +1851,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
}
if (type & server->vals->shared_lock_type) {
- flock->fl_type = F_WRLCK;
+ flock->c.flc_type = F_WRLCK;
return 0;
}
@@ -1770,12 +1863,12 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
if (rc == 0) {
rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
type | server->vals->shared_lock_type, 0, 1, false);
- flock->fl_type = F_RDLCK;
+ flock->c.flc_type = F_RDLCK;
if (rc != 0)
cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n",
rc);
} else
- flock->fl_type = F_WRLCK;
+ flock->c.flc_type = F_WRLCK;
return 0;
}
@@ -1943,7 +2036,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
posix_lock_type = CIFS_UNLCK;
rc = CIFSSMBPosixLock(xid, tcon, cfile->fid.netfid,
- hash_lockowner(flock->fl_owner),
+ hash_lockowner(flock->c.flc_owner),
flock->fl_start, length,
NULL, posix_lock_type, wait_flag);
goto out;
@@ -1953,7 +2046,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
struct cifsLockInfo *lock;
lock = cifs_lock_init(flock->fl_start, length, type,
- flock->fl_flags);
+ flock->c.flc_flags);
if (!lock)
return -ENOMEM;
@@ -1992,7 +2085,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
rc = server->ops->mand_unlock_range(cfile, flock, xid);
out:
- if ((flock->fl_flags & FL_POSIX) || (flock->fl_flags & FL_FLOCK)) {
+ if ((flock->c.flc_flags & FL_POSIX) || (flock->c.flc_flags & FL_FLOCK)) {
/*
* If this is a request to remove all locks because we
* are closing the file, it doesn't matter if the
@@ -2001,7 +2094,7 @@ out:
*/
if (rc) {
cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc);
- if (!(flock->fl_flags & FL_CLOSE))
+ if (!(flock->c.flc_flags & FL_CLOSE))
return rc;
}
rc = locks_lock_file_wait(file, flock);
@@ -2022,7 +2115,7 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl)
xid = get_xid();
- if (!(fl->fl_flags & FL_FLOCK)) {
+ if (!(fl->c.flc_flags & FL_FLOCK)) {
rc = -ENOLCK;
free_xid(xid);
return rc;
@@ -2073,7 +2166,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
xid = get_xid();
cifs_dbg(FYI, "%s: %pD2 cmd=0x%x type=0x%x flags=0x%x r=%lld:%lld\n", __func__, file, cmd,
- flock->fl_flags, flock->fl_type, (long long)flock->fl_start,
+ flock->c.flc_flags, flock->c.flc_type,
+ (long long)flock->fl_start,
(long long)flock->fl_end);
cfile = (struct cifsFileInfo *)file->private_data;
@@ -2624,20 +2718,20 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
* dirty pages if possible, but don't sleep while doing so.
*/
static void cifs_extend_writeback(struct address_space *mapping,
+ struct xa_state *xas,
long *_count,
loff_t start,
int max_pages,
- size_t max_len,
- unsigned int *_len)
+ loff_t max_len,
+ size_t *_len)
{
struct folio_batch batch;
struct folio *folio;
- unsigned int psize, nr_pages;
- size_t len = *_len;
- pgoff_t index = (start + len) / PAGE_SIZE;
+ unsigned int nr_pages;
+ pgoff_t index = (start + *_len) / PAGE_SIZE;
+ size_t len;
bool stop = true;
unsigned int i;
- XA_STATE(xas, &mapping->i_pages, index);
folio_batch_init(&batch);
@@ -2648,54 +2742,64 @@ static void cifs_extend_writeback(struct address_space *mapping,
*/
rcu_read_lock();
- xas_for_each(&xas, folio, ULONG_MAX) {
+ xas_for_each(xas, folio, ULONG_MAX) {
stop = true;
- if (xas_retry(&xas, folio))
+ if (xas_retry(xas, folio))
continue;
if (xa_is_value(folio))
break;
- if (folio->index != index)
+ if (folio->index != index) {
+ xas_reset(xas);
break;
+ }
+
if (!folio_try_get_rcu(folio)) {
- xas_reset(&xas);
+ xas_reset(xas);
continue;
}
nr_pages = folio_nr_pages(folio);
- if (nr_pages > max_pages)
+ if (nr_pages > max_pages) {
+ xas_reset(xas);
break;
+ }
/* Has the page moved or been split? */
- if (unlikely(folio != xas_reload(&xas))) {
+ if (unlikely(folio != xas_reload(xas))) {
folio_put(folio);
+ xas_reset(xas);
break;
}
if (!folio_trylock(folio)) {
folio_put(folio);
+ xas_reset(xas);
break;
}
- if (!folio_test_dirty(folio) || folio_test_writeback(folio)) {
+ if (!folio_test_dirty(folio) ||
+ folio_test_writeback(folio)) {
folio_unlock(folio);
folio_put(folio);
+ xas_reset(xas);
break;
}
max_pages -= nr_pages;
- psize = folio_size(folio);
- len += psize;
+ len = folio_size(folio);
stop = false;
- if (max_pages <= 0 || len >= max_len || *_count <= 0)
- stop = true;
index += nr_pages;
+ *_count -= nr_pages;
+ *_len += len;
+ if (max_pages <= 0 || *_len >= max_len || *_count <= 0)
+ stop = true;
+
if (!folio_batch_add(&batch, folio))
break;
if (stop)
break;
}
- if (!stop)
- xas_pause(&xas);
+ xas_pause(xas);
rcu_read_unlock();
/* Now, if we obtained any pages, we can shift them to being
@@ -2712,16 +2816,12 @@ static void cifs_extend_writeback(struct address_space *mapping,
if (!folio_clear_dirty_for_io(folio))
WARN_ON(1);
folio_start_writeback(folio);
-
- *_count -= folio_nr_pages(folio);
folio_unlock(folio);
}
folio_batch_release(&batch);
cond_resched();
} while (!stop);
-
- *_len = len;
}
/*
@@ -2729,8 +2829,10 @@ static void cifs_extend_writeback(struct address_space *mapping,
*/
static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
struct writeback_control *wbc,
+ struct xa_state *xas,
struct folio *folio,
- loff_t start, loff_t end)
+ unsigned long long start,
+ unsigned long long end)
{
struct inode *inode = mapping->host;
struct TCP_Server_Info *server;
@@ -2739,17 +2841,18 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
struct cifs_credits credits_on_stack;
struct cifs_credits *credits = &credits_on_stack;
struct cifsFileInfo *cfile = NULL;
- unsigned int xid, wsize, len;
- loff_t i_size = i_size_read(inode);
- size_t max_len;
+ unsigned long long i_size = i_size_read(inode), max_len;
+ unsigned int xid, wsize;
+ size_t len = folio_size(folio);
long count = wbc->nr_to_write;
int rc;
/* The folio should be locked, dirty and not undergoing writeback. */
+ if (!folio_clear_dirty_for_io(folio))
+ WARN_ON_ONCE(1);
folio_start_writeback(folio);
count -= folio_nr_pages(folio);
- len = folio_size(folio);
xid = get_xid();
server = cifs_pick_channel(cifs_sb_master_tcon(cifs_sb)->ses);
@@ -2779,9 +2882,10 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
wdata->server = server;
cfile = NULL;
- /* Find all consecutive lockable dirty pages, stopping when we find a
- * page that is not immediately lockable, is not dirty or is missing,
- * or we reach the end of the range.
+ /* Find all consecutive lockable dirty pages that have contiguous
+ * written regions, stopping when we find a page that is not
+ * immediately lockable, is not dirty or is missing, or we reach the
+ * end of the range.
*/
if (start < i_size) {
/* Trim the write to the EOF; the extra data is ignored. Also
@@ -2801,19 +2905,18 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
max_pages -= folio_nr_pages(folio);
if (max_pages > 0)
- cifs_extend_writeback(mapping, &count, start,
+ cifs_extend_writeback(mapping, xas, &count, start,
max_pages, max_len, &len);
}
- len = min_t(loff_t, len, max_len);
}
-
- wdata->bytes = len;
+ len = min_t(unsigned long long, len, i_size - start);
/* We now have a contiguous set of dirty pages, each with writeback
* set; the first page is still locked at this point, but all the rest
* have been unlocked.
*/
folio_unlock(folio);
+ wdata->bytes = len;
if (start < i_size) {
iov_iter_xarray(&wdata->iter, ITER_SOURCE, &mapping->i_pages,
@@ -2864,102 +2967,118 @@ err_xid:
/*
* write a region of pages back to the server
*/
-static int cifs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- loff_t start, loff_t end, loff_t *_next)
+static ssize_t cifs_writepages_begin(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct xa_state *xas,
+ unsigned long long *_start,
+ unsigned long long end)
{
- struct folio_batch fbatch;
+ struct folio *folio;
+ unsigned long long start = *_start;
+ ssize_t ret;
int skips = 0;
- folio_batch_init(&fbatch);
- do {
- int nr;
- pgoff_t index = start / PAGE_SIZE;
+search_again:
+ /* Find the first dirty page. */
+ rcu_read_lock();
- nr = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE,
- PAGECACHE_TAG_DIRTY, &fbatch);
- if (!nr)
+ for (;;) {
+ folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
+ if (xas_retry(xas, folio) || xa_is_value(folio))
+ continue;
+ if (!folio)
break;
- for (int i = 0; i < nr; i++) {
- ssize_t ret;
- struct folio *folio = fbatch.folios[i];
+ if (!folio_try_get_rcu(folio)) {
+ xas_reset(xas);
+ continue;
+ }
-redo_folio:
- start = folio_pos(folio); /* May regress with THPs */
+ if (unlikely(folio != xas_reload(xas))) {
+ folio_put(folio);
+ xas_reset(xas);
+ continue;
+ }
- /* At this point we hold neither the i_pages lock nor the
- * page lock: the page may be truncated or invalidated
- * (changing page->mapping to NULL), or even swizzled
- * back from swapper_space to tmpfs file mapping
- */
- if (wbc->sync_mode != WB_SYNC_NONE) {
- ret = folio_lock_killable(folio);
- if (ret < 0)
- goto write_error;
- } else {
- if (!folio_trylock(folio))
- goto skip_write;
- }
+ xas_pause(xas);
+ break;
+ }
+ rcu_read_unlock();
+ if (!folio)
+ return 0;
- if (folio->mapping != mapping ||
- !folio_test_dirty(folio)) {
- start += folio_size(folio);
- folio_unlock(folio);
- continue;
- }
+ start = folio_pos(folio); /* May regress with THPs */
- if (folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- if (wbc->sync_mode == WB_SYNC_NONE)
- goto skip_write;
+ /* At this point we hold neither the i_pages lock nor the page lock:
+ * the page may be truncated or invalidated (changing page->mapping to
+ * NULL), or even swizzled back from swapper_space to tmpfs file
+ * mapping
+ */
+lock_again:
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ ret = folio_lock_killable(folio);
+ if (ret < 0)
+ return ret;
+ } else {
+ if (!folio_trylock(folio))
+ goto search_again;
+ }
+
+ if (folio->mapping != mapping ||
+ !folio_test_dirty(folio)) {
+ start += folio_size(folio);
+ folio_unlock(folio);
+ goto search_again;
+ }
- folio_wait_writeback(folio);
+ if (folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
+ folio_unlock(folio);
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ folio_wait_writeback(folio);
#ifdef CONFIG_CIFS_FSCACHE
- folio_wait_fscache(folio);
+ folio_wait_fscache(folio);
#endif
- goto redo_folio;
- }
-
- if (!folio_clear_dirty_for_io(folio))
- /* We hold the page lock - it should've been dirty. */
- WARN_ON(1);
-
- ret = cifs_write_back_from_locked_folio(mapping, wbc, folio, start, end);
- if (ret < 0)
- goto write_error;
-
- start += ret;
- continue;
-
-write_error:
- folio_batch_release(&fbatch);
- *_next = start;
- return ret;
+ goto lock_again;
+ }
-skip_write:
- /*
- * Too many skipped writes, or need to reschedule?
- * Treat it as a write error without an error code.
- */
+ start += folio_size(folio);
+ if (wbc->sync_mode == WB_SYNC_NONE) {
if (skips >= 5 || need_resched()) {
ret = 0;
- goto write_error;
+ goto out;
}
-
- /* Otherwise, just skip that folio and go on to the next */
skips++;
- start += folio_size(folio);
- continue;
}
+ goto search_again;
+ }
- folio_batch_release(&fbatch);
- cond_resched();
- } while (wbc->nr_to_write > 0);
+ ret = cifs_write_back_from_locked_folio(mapping, wbc, xas, folio, start, end);
+out:
+ if (ret > 0)
+ *_start = start + ret;
+ return ret;
+}
- *_next = start;
- return 0;
+/*
+ * Write a region of pages back to the server
+ */
+static int cifs_writepages_region(struct address_space *mapping,
+ struct writeback_control *wbc,
+ unsigned long long *_start,
+ unsigned long long end)
+{
+ ssize_t ret;
+
+ XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
+
+ do {
+ ret = cifs_writepages_begin(mapping, wbc, &xas, _start, end);
+ if (ret > 0 && wbc->nr_to_write > 0)
+ cond_resched();
+ } while (ret > 0 && wbc->nr_to_write > 0);
+
+ return ret > 0 ? 0 : ret;
}
/*
@@ -2968,7 +3087,7 @@ skip_write:
static int cifs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- loff_t start, next;
+ loff_t start, end;
int ret;
/* We have to be careful as we can end up racing with setattr()
@@ -2976,28 +3095,34 @@ static int cifs_writepages(struct address_space *mapping,
* to prevent it.
*/
- if (wbc->range_cyclic) {
+ if (wbc->range_cyclic && mapping->writeback_index) {
start = mapping->writeback_index * PAGE_SIZE;
- ret = cifs_writepages_region(mapping, wbc, start, LLONG_MAX, &next);
- if (ret == 0) {
- mapping->writeback_index = next / PAGE_SIZE;
- if (start > 0 && wbc->nr_to_write > 0) {
- ret = cifs_writepages_region(mapping, wbc, 0,
- start, &next);
- if (ret == 0)
- mapping->writeback_index =
- next / PAGE_SIZE;
- }
+ ret = cifs_writepages_region(mapping, wbc, &start, LLONG_MAX);
+ if (ret < 0)
+ goto out;
+
+ if (wbc->nr_to_write <= 0) {
+ mapping->writeback_index = start / PAGE_SIZE;
+ goto out;
}
+
+ start = 0;
+ end = mapping->writeback_index * PAGE_SIZE;
+ mapping->writeback_index = 0;
+ ret = cifs_writepages_region(mapping, wbc, &start, end);
+ if (ret == 0)
+ mapping->writeback_index = start / PAGE_SIZE;
} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- ret = cifs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next);
+ start = 0;
+ ret = cifs_writepages_region(mapping, wbc, &start, LLONG_MAX);
if (wbc->nr_to_write > 0 && ret == 0)
- mapping->writeback_index = next / PAGE_SIZE;
+ mapping->writeback_index = start / PAGE_SIZE;
} else {
- ret = cifs_writepages_region(mapping, wbc,
- wbc->range_start, wbc->range_end, &next);
+ start = wbc->range_start;
+ ret = cifs_writepages_region(mapping, wbc, &start, wbc->range_end);
}
+out:
return ret;
}
@@ -3094,8 +3219,15 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
if (rc > 0) {
spin_lock(&inode->i_lock);
if (pos > inode->i_size) {
+ loff_t additional_blocks = (512 - 1 + copied) >> 9;
+
i_size_write(inode, pos);
- inode->i_blocks = (512 - 1 + pos) >> 9;
+ /*
+ * Estimate new allocation size based on the amount written.
+ * This will be updated from server on close (and on queryinfo)
+ */
+ inode->i_blocks = min_t(blkcnt_t, (512 - 1 + pos) >> 9,
+ inode->i_blocks + additional_blocks);
}
spin_unlock(&inode->i_lock);
}
@@ -4738,12 +4870,14 @@ static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
refreshing the inode only on increases in the file size
but this is tricky to do without racing with writebehind
page caching in the current Linux kernel design */
-bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
+bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file,
+ bool from_readdir)
{
if (!cifsInode)
return true;
- if (is_inode_writable(cifsInode)) {
+ if (is_inode_writable(cifsInode) ||
+ ((cifsInode->oplock & CIFS_CACHE_RW_FLG) != 0 && from_readdir)) {
/* This inode is open for write at least once */
struct cifs_sb_info *cifs_sb;
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index 4b2f5aa2ea0e..6c727d8c31e8 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -37,7 +37,7 @@
#include "rfc1002pdu.h"
#include "fs_context.h"
-static DEFINE_MUTEX(cifs_mount_mutex);
+DEFINE_MUTEX(cifs_mount_mutex);
static const match_table_t cifs_smb_version_tokens = {
{ Smb_1, SMB1_VERSION_STRING },
@@ -162,6 +162,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_string("username", Opt_user),
fsparam_string("pass", Opt_pass),
fsparam_string("password", Opt_pass),
+ fsparam_string("password2", Opt_pass2),
fsparam_string("ip", Opt_ip),
fsparam_string("addr", Opt_ip),
fsparam_string("domain", Opt_domain),
@@ -174,6 +175,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_string("vers", Opt_vers),
fsparam_string("sec", Opt_sec),
fsparam_string("cache", Opt_cache),
+ fsparam_string("reparse", Opt_reparse),
/* Arguments that should be ignored */
fsparam_flag("guest", Opt_ignore),
@@ -296,6 +298,35 @@ cifs_parse_cache_flavor(struct fs_context *fc, char *value, struct smb3_fs_conte
return 0;
}
+static const match_table_t reparse_flavor_tokens = {
+ { Opt_reparse_default, "default" },
+ { Opt_reparse_nfs, "nfs" },
+ { Opt_reparse_wsl, "wsl" },
+ { Opt_reparse_err, NULL },
+};
+
+static int parse_reparse_flavor(struct fs_context *fc, char *value,
+ struct smb3_fs_context *ctx)
+{
+ substring_t args[MAX_OPT_ARGS];
+
+ switch (match_token(value, reparse_flavor_tokens, args)) {
+ case Opt_reparse_default:
+ ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT;
+ break;
+ case Opt_reparse_nfs:
+ ctx->reparse_type = CIFS_REPARSE_TYPE_NFS;
+ break;
+ case Opt_reparse_wsl:
+ ctx->reparse_type = CIFS_REPARSE_TYPE_WSL;
+ break;
+ default:
+ cifs_errorf(fc, "bad reparse= option: %s\n", value);
+ return 1;
+ }
+ return 0;
+}
+
#define DUP_CTX_STR(field) \
do { \
if (ctx->field) { \
@@ -315,6 +346,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
new_ctx->nodename = NULL;
new_ctx->username = NULL;
new_ctx->password = NULL;
+ new_ctx->password2 = NULL;
new_ctx->server_hostname = NULL;
new_ctx->domainname = NULL;
new_ctx->UNC = NULL;
@@ -327,6 +359,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
DUP_CTX_STR(prepath);
DUP_CTX_STR(username);
DUP_CTX_STR(password);
+ DUP_CTX_STR(password2);
DUP_CTX_STR(server_hostname);
DUP_CTX_STR(UNC);
DUP_CTX_STR(source);
@@ -753,9 +786,9 @@ static int smb3_get_tree(struct fs_context *fc)
if (err)
return err;
- mutex_lock(&cifs_mount_mutex);
+ cifs_mount_lock();
ret = smb3_get_tree_common(fc);
- mutex_unlock(&cifs_mount_mutex);
+ cifs_mount_unlock();
return ret;
}
@@ -772,7 +805,7 @@ static void smb3_fs_context_free(struct fs_context *fc)
*/
static int smb3_verify_reconfigure_ctx(struct fs_context *fc,
struct smb3_fs_context *new_ctx,
- struct smb3_fs_context *old_ctx)
+ struct smb3_fs_context *old_ctx, bool need_recon)
{
if (new_ctx->posix_paths != old_ctx->posix_paths) {
cifs_errorf(fc, "can not change posixpaths during remount\n");
@@ -798,8 +831,15 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc,
}
if (new_ctx->password &&
(!old_ctx->password || strcmp(new_ctx->password, old_ctx->password))) {
- cifs_errorf(fc, "can not change password during remount\n");
- return -EINVAL;
+ if (need_recon == false) {
+ cifs_errorf(fc,
+ "can not change password of active session during remount\n");
+ return -EINVAL;
+ } else if (old_ctx->sectype == Kerberos) {
+ cifs_errorf(fc,
+ "can not change password for Kerberos via remount\n");
+ return -EINVAL;
+ }
}
if (new_ctx->domainname &&
(!old_ctx->domainname || strcmp(new_ctx->domainname, old_ctx->domainname))) {
@@ -843,9 +883,14 @@ static int smb3_reconfigure(struct fs_context *fc)
struct smb3_fs_context *ctx = smb3_fc2context(fc);
struct dentry *root = fc->root;
struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
+ struct cifs_ses *ses = cifs_sb_master_tcon(cifs_sb)->ses;
+ bool need_recon = false;
int rc;
- rc = smb3_verify_reconfigure_ctx(fc, ctx, cifs_sb->ctx);
+ if (ses->expired_pwd)
+ need_recon = true;
+
+ rc = smb3_verify_reconfigure_ctx(fc, ctx, cifs_sb->ctx, need_recon);
if (rc)
return rc;
@@ -858,7 +903,14 @@ static int smb3_reconfigure(struct fs_context *fc)
STEAL_STRING(cifs_sb, ctx, UNC);
STEAL_STRING(cifs_sb, ctx, source);
STEAL_STRING(cifs_sb, ctx, username);
- STEAL_STRING_SENSITIVE(cifs_sb, ctx, password);
+ if (need_recon == false)
+ STEAL_STRING_SENSITIVE(cifs_sb, ctx, password);
+ else {
+ kfree_sensitive(ses->password);
+ ses->password = kstrdup(ctx->password, GFP_KERNEL);
+ kfree_sensitive(ses->password2);
+ ses->password2 = kstrdup(ctx->password2, GFP_KERNEL);
+ }
STEAL_STRING(cifs_sb, ctx, domainname);
STEAL_STRING(cifs_sb, ctx, nodename);
STEAL_STRING(cifs_sb, ctx, iocharset);
@@ -916,7 +968,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
switch (opt) {
case Opt_compress:
- ctx->compression = UNKNOWN_TYPE;
+ ctx->compress = true;
cifs_dbg(VFS,
"SMB3 compression support is experimental\n");
break;
@@ -1258,6 +1310,18 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
goto cifs_parse_mount_err;
}
break;
+ case Opt_pass2:
+ kfree_sensitive(ctx->password2);
+ ctx->password2 = NULL;
+ if (strlen(param->string) == 0)
+ break;
+
+ ctx->password2 = kstrdup(param->string, GFP_KERNEL);
+ if (ctx->password2 == NULL) {
+ cifs_errorf(fc, "OOM when copying password2 string\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
case Opt_ip:
if (strlen(param->string) == 0) {
ctx->got_ip = false;
@@ -1549,6 +1613,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_rdma:
ctx->rdma = true;
break;
+ case Opt_reparse:
+ if (parse_reparse_flavor(fc, param->string, ctx))
+ goto cifs_parse_mount_err;
+ break;
}
/* case Opt_ignore: - is ignored as expected ... */
@@ -1557,6 +1625,8 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
cifs_parse_mount_err:
kfree_sensitive(ctx->password);
ctx->password = NULL;
+ kfree_sensitive(ctx->password2);
+ ctx->password2 = NULL;
return -EINVAL;
}
@@ -1635,6 +1705,7 @@ int smb3_init_fs_context(struct fs_context *fc)
ctx->backupgid_specified = false; /* no backup intent for a group */
ctx->retrans = 1;
+ ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT;
/*
* short int override_uid = -1;
@@ -1661,6 +1732,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
ctx->username = NULL;
kfree_sensitive(ctx->password);
ctx->password = NULL;
+ kfree_sensitive(ctx->password2);
+ ctx->password2 = NULL;
kfree(ctx->server_hostname);
ctx->server_hostname = NULL;
kfree(ctx->UNC);
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index 182ce11cbe93..a947bddeba27 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -41,6 +41,13 @@ enum {
Opt_cache_err
};
+enum cifs_reparse_parm {
+ Opt_reparse_default,
+ Opt_reparse_nfs,
+ Opt_reparse_wsl,
+ Opt_reparse_err
+};
+
enum cifs_sec_param {
Opt_sec_krb5,
Opt_sec_krb5i,
@@ -138,6 +145,7 @@ enum cifs_param {
Opt_source,
Opt_user,
Opt_pass,
+ Opt_pass2,
Opt_ip,
Opt_domain,
Opt_srcaddr,
@@ -148,6 +156,7 @@ enum cifs_param {
Opt_vers,
Opt_sec,
Opt_cache,
+ Opt_reparse,
/* Mount options to be ignored */
Opt_ignore,
@@ -169,6 +178,7 @@ struct smb3_fs_context {
char *username;
char *password;
+ char *password2;
char *domainname;
char *source;
char *server_hostname;
@@ -265,12 +275,13 @@ struct smb3_fs_context {
unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */
unsigned int max_channels;
unsigned int max_cached_dirs;
- __u16 compression; /* compression algorithm 0xFFFF default 0=disabled */
+ bool compress; /* enable SMB2 messages (READ/WRITE) de/compression */
bool rootfs:1; /* if it's a SMB root file system */
bool witness:1; /* use witness protocol */
char *leaf_fullpath;
struct cifs_ses *dfs_root_ses;
bool dfs_automount:1; /* set for dfs automount only */
+ enum cifs_reparse_type reparse_type;
};
extern const struct fs_parameter_spec smb3_fs_parameters[];
@@ -295,4 +306,16 @@ extern void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb);
#define MAX_CACHED_FIDS 16
extern char *cifs_sanitize_prepath(char *prepath, gfp_t gfp);
+extern struct mutex cifs_mount_mutex;
+
+static inline void cifs_mount_lock(void)
+{
+ mutex_lock(&cifs_mount_mutex);
+}
+
+static inline void cifs_mount_unlock(void)
+{
+ mutex_unlock(&cifs_mount_mutex);
+}
+
#endif
diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c
index c4a3cb736881..340efce8f052 100644
--- a/fs/smb/client/fscache.c
+++ b/fs/smb/client/fscache.c
@@ -12,6 +12,16 @@
#include "cifs_fs_sb.h"
#include "cifsproto.h"
+/*
+ * Key for fscache inode. [!] Contents must match comparisons in cifs_find_inode().
+ */
+struct cifs_fscache_inode_key {
+
+ __le64 uniqueid; /* server inode number */
+ __le64 createtime; /* creation time on server */
+ u8 type; /* S_IFMT file type */
+} __packed;
+
static void cifs_fscache_fill_volume_coherency(
struct cifs_tcon *tcon,
struct cifs_fscache_volume_coherency_data *cd)
@@ -97,15 +107,19 @@ void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
void cifs_fscache_get_inode_cookie(struct inode *inode)
{
struct cifs_fscache_inode_coherency_data cd;
+ struct cifs_fscache_inode_key key;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ key.uniqueid = cpu_to_le64(cifsi->uniqueid);
+ key.createtime = cpu_to_le64(cifsi->createtime);
+ key.type = (inode->i_mode & S_IFMT) >> 12;
cifs_fscache_fill_coherency(&cifsi->netfs.inode, &cd);
cifsi->netfs.cache =
fscache_acquire_cookie(tcon->fscache, 0,
- &cifsi->uniqueid, sizeof(cifsi->uniqueid),
+ &key, sizeof(key),
&cd, sizeof(cd),
i_size_read(&cifsi->netfs.inode));
if (cifsi->netfs.cache)
diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h
index a3d73720914f..1f2ea9f5cc9a 100644
--- a/fs/smb/client/fscache.h
+++ b/fs/smb/client/fscache.h
@@ -109,6 +109,11 @@ static inline void cifs_readahead_to_fscache(struct inode *inode,
__cifs_readahead_to_fscache(inode, pos, len);
}
+static inline bool cifs_fscache_enabled(struct inode *inode)
+{
+ return fscache_cookie_enabled(cifs_inode_cookie(inode));
+}
+
#else /* CONFIG_CIFS_FSCACHE */
static inline
void cifs_fscache_fill_coherency(struct inode *inode,
@@ -124,6 +129,7 @@ static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
static inline void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) {}
static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { return NULL; }
static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) {}
+static inline bool cifs_fscache_enabled(struct inode *inode) { return false; }
static inline int cifs_fscache_query_occupancy(struct inode *inode,
pgoff_t first, unsigned int nr_pages,
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index d02f8ba29cb5..60afab5c83d4 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -26,6 +26,7 @@
#include "fs_context.h"
#include "cifs_ioctl.h"
#include "cached_dir.h"
+#include "reparse.h"
static void cifs_set_ops(struct inode *inode)
{
@@ -147,7 +148,8 @@ cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
/* populate an inode with info from a cifs_fattr struct */
int
-cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
+cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
+ bool from_readdir)
{
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -199,7 +201,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
* Can't safely change the file size here if the client is writing to
* it due to potential races.
*/
- if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) {
+ if (is_size_safe_to_change(cifs_i, fattr->cf_eof, from_readdir)) {
i_size_write(inode, fattr->cf_eof);
/*
@@ -368,7 +370,7 @@ static int update_inode_info(struct super_block *sb,
CIFS_I(*inode)->time = 0; /* force reval */
return -ESTALE;
}
- return cifs_fattr_to_inode(*inode, fattr);
+ return cifs_fattr_to_inode(*inode, fattr, false);
}
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
@@ -399,11 +401,10 @@ cifs_get_file_info_unix(struct file *filp)
cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
} else if (rc == -EREMOTE) {
cifs_create_junction_fattr(&fattr, inode->i_sb);
- rc = 0;
} else
goto cifs_gfiunix_out;
- rc = cifs_fattr_to_inode(inode, &fattr);
+ rc = cifs_fattr_to_inode(inode, &fattr, false);
cifs_gfiunix_out:
free_xid(xid);
@@ -727,84 +728,6 @@ out_reparse:
fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
}
-static inline dev_t nfs_mkdev(struct reparse_posix_data *buf)
-{
- u64 v = le64_to_cpu(*(__le64 *)buf->DataBuffer);
-
- return MKDEV(v >> 32, v & 0xffffffff);
-}
-
-bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
- struct cifs_fattr *fattr,
- struct cifs_open_info_data *data)
-{
- struct reparse_posix_data *buf = data->reparse.posix;
- u32 tag = data->reparse.tag;
-
- if (tag == IO_REPARSE_TAG_NFS && buf) {
- switch (le64_to_cpu(buf->InodeType)) {
- case NFS_SPECFILE_CHR:
- fattr->cf_mode |= S_IFCHR;
- fattr->cf_dtype = DT_CHR;
- fattr->cf_rdev = nfs_mkdev(buf);
- break;
- case NFS_SPECFILE_BLK:
- fattr->cf_mode |= S_IFBLK;
- fattr->cf_dtype = DT_BLK;
- fattr->cf_rdev = nfs_mkdev(buf);
- break;
- case NFS_SPECFILE_FIFO:
- fattr->cf_mode |= S_IFIFO;
- fattr->cf_dtype = DT_FIFO;
- break;
- case NFS_SPECFILE_SOCK:
- fattr->cf_mode |= S_IFSOCK;
- fattr->cf_dtype = DT_SOCK;
- break;
- case NFS_SPECFILE_LNK:
- fattr->cf_mode |= S_IFLNK;
- fattr->cf_dtype = DT_LNK;
- break;
- default:
- WARN_ON_ONCE(1);
- return false;
- }
- return true;
- }
-
- switch (tag) {
- case IO_REPARSE_TAG_LX_SYMLINK:
- fattr->cf_mode |= S_IFLNK;
- fattr->cf_dtype = DT_LNK;
- break;
- case IO_REPARSE_TAG_LX_FIFO:
- fattr->cf_mode |= S_IFIFO;
- fattr->cf_dtype = DT_FIFO;
- break;
- case IO_REPARSE_TAG_AF_UNIX:
- fattr->cf_mode |= S_IFSOCK;
- fattr->cf_dtype = DT_SOCK;
- break;
- case IO_REPARSE_TAG_LX_CHR:
- fattr->cf_mode |= S_IFCHR;
- fattr->cf_dtype = DT_CHR;
- break;
- case IO_REPARSE_TAG_LX_BLK:
- fattr->cf_mode |= S_IFBLK;
- fattr->cf_dtype = DT_BLK;
- break;
- case 0: /* SMB1 symlink */
- case IO_REPARSE_TAG_SYMLINK:
- case IO_REPARSE_TAG_NFS:
- fattr->cf_mode |= S_IFLNK;
- fattr->cf_dtype = DT_LNK;
- break;
- default:
- return false;
- }
- return true;
-}
-
static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
struct cifs_open_info_data *data,
struct super_block *sb)
@@ -835,6 +758,8 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
fattr->cf_createtime = le64_to_cpu(info->CreationTime);
fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+ fattr->cf_uid = cifs_sb->ctx->linux_uid;
+ fattr->cf_gid = cifs_sb->ctx->linux_gid;
fattr->cf_mode = cifs_sb->ctx->file_mode;
if (cifs_open_data_reparse(data) &&
@@ -877,9 +802,6 @@ out_reparse:
fattr->cf_symlink_target = data->symlink_target;
data->symlink_target = NULL;
}
-
- fattr->cf_uid = cifs_sb->ctx->linux_uid;
- fattr->cf_gid = cifs_sb->ctx->linux_gid;
}
static int
@@ -893,9 +815,14 @@ cifs_get_file_info(struct file *filp)
struct cifsFileInfo *cfile = filp->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
+ struct dentry *dentry = filp->f_path.dentry;
+ void *page = alloc_dentry_path();
+ const unsigned char *path;
- if (!server->ops->query_file_info)
+ if (!server->ops->query_file_info) {
+ free_dentry_path(page);
return -ENOSYS;
+ }
xid = get_xid();
rc = server->ops->query_file_info(xid, tcon, cfile, &data);
@@ -907,11 +834,17 @@ cifs_get_file_info(struct file *filp)
data.symlink = true;
data.reparse.tag = IO_REPARSE_TAG_SYMLINK;
}
+ path = build_path_from_dentry(dentry, page);
+ if (IS_ERR(path)) {
+ rc = PTR_ERR(path);
+ goto cgfi_exit;
+ }
cifs_open_info_to_fattr(&fattr, &data, inode->i_sb);
+ if (fattr.cf_flags & CIFS_FATTR_DELETE_PENDING)
+ cifs_mark_open_handles_for_deleted_file(inode, path);
break;
case -EREMOTE:
cifs_create_junction_fattr(&fattr, inode->i_sb);
- rc = 0;
break;
case -EOPNOTSUPP:
case -EINVAL:
@@ -934,9 +867,10 @@ cifs_get_file_info(struct file *filp)
fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
/* if filetype is different, return error */
- rc = cifs_fattr_to_inode(inode, &fattr);
+ rc = cifs_fattr_to_inode(inode, &fattr, false);
cgfi_exit:
cifs_free_open_info(&data);
+ free_dentry_path(page);
free_xid(xid);
return rc;
}
@@ -1171,6 +1105,9 @@ static int cifs_get_fattr(struct cifs_open_info_data *data,
} else {
cifs_open_info_to_fattr(fattr, data, sb);
}
+ if (!rc && *inode &&
+ (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING))
+ cifs_mark_open_handles_for_deleted_file(*inode, full_path);
break;
case -EREMOTE:
/* DFS link, no metadata available on this server */
@@ -1399,6 +1336,8 @@ int smb311_posix_get_inode_info(struct inode **inode,
goto out;
rc = update_inode_info(sb, &fattr, inode);
+ if (!rc && fattr.cf_flags & CIFS_FATTR_DELETE_PENDING)
+ cifs_mark_open_handles_for_deleted_file(*inode, full_path);
out:
kfree(fattr.cf_symlink_target);
return rc;
@@ -1413,6 +1352,8 @@ cifs_find_inode(struct inode *inode, void *opaque)
{
struct cifs_fattr *fattr = opaque;
+ /* [!] The compared values must be the same in struct cifs_fscache_inode_key. */
+
/* don't match inode with different uniqueid */
if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
return 0;
@@ -1491,7 +1432,7 @@ retry_iget5_locked:
}
/* can't fail - see cifs_find_inode() */
- cifs_fattr_to_inode(inode, fattr);
+ cifs_fattr_to_inode(inode, fattr, false);
if (sb->s_flags & SB_NOATIME)
inode->i_flags |= S_NOATIME | S_NOCMTIME;
if (inode->i_state & I_NEW) {
@@ -1560,6 +1501,9 @@ iget_root:
goto out;
}
+ if (!rc && fattr.cf_flags & CIFS_FATTR_DELETE_PENDING)
+ cifs_mark_open_handles_for_deleted_file(inode, path);
+
if (rc && tcon->pipe) {
cifs_dbg(FYI, "ipc connection - fake read inode\n");
spin_lock(&inode->i_lock);
@@ -1846,20 +1790,24 @@ retry_std_delete:
goto psx_del_no_retry;
}
- rc = server->ops->unlink(xid, tcon, full_path, cifs_sb);
+ rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry);
psx_del_no_retry:
if (!rc) {
- if (inode)
+ if (inode) {
+ cifs_mark_open_handles_for_deleted_file(inode, full_path);
cifs_drop_nlink(inode);
+ }
} else if (rc == -ENOENT) {
d_drop(dentry);
} else if (rc == -EBUSY) {
if (server->ops->rename_pending_delete) {
rc = server->ops->rename_pending_delete(full_path,
dentry, xid);
- if (rc == 0)
+ if (rc == 0) {
+ cifs_mark_open_handles_for_deleted_file(inode, full_path);
cifs_drop_nlink(inode);
+ }
}
} else if ((rc == -EACCES) && (dosattr == 0) && inode) {
attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
@@ -2797,7 +2745,7 @@ void cifs_setsize(struct inode *inode, loff_t offset)
static int
cifs_set_file_size(struct inode *inode, struct iattr *attrs,
- unsigned int xid, const char *full_path)
+ unsigned int xid, const char *full_path, struct dentry *dentry)
{
int rc;
struct cifsFileInfo *open_file;
@@ -2848,7 +2796,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
*/
if (server->ops->set_path_size)
rc = server->ops->set_path_size(xid, tcon, full_path,
- attrs->ia_size, cifs_sb, false);
+ attrs->ia_size, cifs_sb, false, dentry);
else
rc = -ENOSYS;
cifs_dbg(FYI, "SetEOF by path (setattrs) rc = %d\n", rc);
@@ -2938,7 +2886,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
rc = 0;
if (attrs->ia_valid & ATTR_SIZE) {
- rc = cifs_set_file_size(inode, attrs, xid, full_path);
+ rc = cifs_set_file_size(inode, attrs, xid, full_path, direntry);
if (rc != 0)
goto out;
}
@@ -3105,7 +3053,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
}
if (attrs->ia_valid & ATTR_SIZE) {
- rc = cifs_set_file_size(inode, attrs, xid, full_path);
+ rc = cifs_set_file_size(inode, attrs, xid, full_path, direntry);
if (rc != 0)
goto cifs_setattr_exit;
}
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index e2f92c21fff5..855ac5a62edf 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -247,7 +247,9 @@ static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server_it, &cifs_tcp_ses_list, tcp_ses_list) {
list_for_each_entry(ses_it, &server_it->smb_ses_list, smb_ses_list) {
- if (ses_it->Suid == out.session_id) {
+ spin_lock(&ses_it->ses_lock);
+ if (ses_it->ses_status != SES_EXITING &&
+ ses_it->Suid == out.session_id) {
ses = ses_it;
/*
* since we are using the session outside the crit
@@ -255,9 +257,11 @@ static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug
* so increment its refcount
*/
cifs_smb_ses_inc_refcount(ses);
+ spin_unlock(&ses_it->ses_lock);
found = true;
goto search_end;
}
+ spin_unlock(&ses_it->ses_lock);
}
}
search_end:
@@ -345,6 +349,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
xid = get_xid();
cifs_dbg(FYI, "cifs ioctl 0x%x\n", command);
+ if (pSMBFile == NULL)
+ trace_smb3_ioctl(xid, 0, command);
+ else
+ trace_smb3_ioctl(xid, pSMBFile->fid.persistent_fid, command);
+
switch (command) {
case FS_IOC_GETFLAGS:
if (pSMBFile == NULL)
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index 0748d7b757b9..7d15a1969b81 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -27,9 +27,6 @@
#include "fs_context.h"
#include "cached_dir.h"
-extern mempool_t *cifs_sm_req_poolp;
-extern mempool_t *cifs_req_poolp;
-
/* The xid serves as a useful identifier for each incoming vfs request,
in a similar way to the mid which is useful to track each sent smb,
and CurrentXid can also provide a running counter (although it
@@ -101,6 +98,7 @@ sesInfoFree(struct cifs_ses *buf_to_free)
kfree(buf_to_free->serverDomain);
kfree(buf_to_free->serverNOS);
kfree_sensitive(buf_to_free->password);
+ kfree_sensitive(buf_to_free->password2);
kfree(buf_to_free->user_name);
kfree(buf_to_free->domainName);
kfree_sensitive(buf_to_free->auth_key.response);
@@ -141,9 +139,6 @@ tcon_info_alloc(bool dir_leases_enabled)
atomic_set(&ret_buf->num_local_opens, 0);
atomic_set(&ret_buf->num_remote_opens, 0);
ret_buf->stats_from_time = ktime_get_real_seconds();
-#ifdef CONFIG_CIFS_DFS_UPCALL
- INIT_LIST_HEAD(&ret_buf->dfs_ses_list);
-#endif
return ret_buf;
}
@@ -159,9 +154,6 @@ tconInfoFree(struct cifs_tcon *tcon)
atomic_dec(&tconInfoAllocCount);
kfree(tcon->nativeFileSystem);
kfree_sensitive(tcon->password);
-#ifdef CONFIG_CIFS_DFS_UPCALL
- dfs_put_root_smb_sessions(&tcon->dfs_ses_list);
-#endif
kfree(tcon->origin_fullpath);
kfree(tcon);
}
@@ -490,6 +482,8 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ if (cifs_ses_exiting(ses))
+ continue;
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
if (tcon->tid != buf->Tid)
continue;
@@ -853,6 +847,40 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
free_dentry_path(page);
}
+/*
+ * If a dentry has been deleted, all corresponding open handles should know that
+ * so that we do not defer close them.
+ */
+void cifs_mark_open_handles_for_deleted_file(struct inode *inode,
+ const char *path)
+{
+ struct cifsFileInfo *cfile;
+ void *page;
+ const char *full_path;
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+
+ page = alloc_dentry_path();
+ spin_lock(&cinode->open_file_lock);
+
+ /*
+ * note: we need to construct path from dentry and compare only if the
+ * inode has any hardlinks. When number of hardlinks is 1, we can just
+ * mark all open handles since they are going to be from the same file.
+ */
+ if (inode->i_nlink > 1) {
+ list_for_each_entry(cfile, &cinode->openFileList, flist) {
+ full_path = build_path_from_dentry(cfile->dentry, page);
+ if (!IS_ERR(full_path) && strcmp(full_path, path) == 0)
+ cfile->status_file_deleted = true;
+ }
+ } else {
+ list_for_each_entry(cfile, &cinode->openFileList, flist)
+ cfile->status_file_deleted = true;
+ }
+ spin_unlock(&cinode->open_file_lock);
+ free_dentry_path(page);
+}
+
/* parses DFS referral V3 structure
* caller is responsible for freeing target_nodes
* returns:
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index b520eea7bfce..ebe1cb30e18e 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -22,6 +22,7 @@
#include "smb2proto.h"
#include "fs_context.h"
#include "cached_dir.h"
+#include "reparse.h"
/*
* To be safe - for UCS to UTF-8 with strings loaded with the rare long
@@ -56,23 +57,6 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
#endif /* DEBUG2 */
/*
- * Match a reparse point inode if reparse tag and ctime haven't changed.
- *
- * Windows Server updates ctime of reparse points when their data have changed.
- * The server doesn't allow changing reparse tags from existing reparse points,
- * though it's worth checking.
- */
-static inline bool reparse_inode_match(struct inode *inode,
- struct cifs_fattr *fattr)
-{
- struct timespec64 ctime = inode_get_ctime(inode);
-
- return (CIFS_I(inode)->cifsAttrs & ATTR_REPARSE) &&
- CIFS_I(inode)->reparse_tag == fattr->cf_cifstag &&
- timespec64_equal(&ctime, &fattr->cf_ctime);
-}
-
-/*
* Attempt to preload the dcache with the results from the FIND_FIRST/NEXT
*
* Find the dentry that matches "name". If there isn't one, create one. If it's
@@ -141,6 +125,8 @@ retry:
if (likely(reparse_inode_match(inode, fattr))) {
fattr->cf_mode = inode->i_mode;
fattr->cf_rdev = inode->i_rdev;
+ fattr->cf_uid = inode->i_uid;
+ fattr->cf_gid = inode->i_gid;
fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size;
fattr->cf_symlink_target = NULL;
} else {
@@ -148,7 +134,7 @@ retry:
rc = -ESTALE;
}
}
- if (!rc && !cifs_fattr_to_inode(inode, fattr)) {
+ if (!rc && !cifs_fattr_to_inode(inode, fattr, true)) {
dput(dentry);
return;
}
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
new file mode 100644
index 000000000000..a0ffbda90733
--- /dev/null
+++ b/fs/smb/client/reparse.c
@@ -0,0 +1,532 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024 Paulo Alcantara <pc@manguebit.com>
+ */
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include "cifsglob.h"
+#include "smb2proto.h"
+#include "cifsproto.h"
+#include "cifs_unicode.h"
+#include "cifs_debug.h"
+#include "fs_context.h"
+#include "reparse.h"
+
+int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, const char *symname)
+{
+ struct reparse_symlink_data_buffer *buf = NULL;
+ struct cifs_open_info_data data;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct inode *new;
+ struct kvec iov;
+ __le16 *path;
+ char *sym, sep = CIFS_DIR_SEP(cifs_sb);
+ u16 len, plen;
+ int rc = 0;
+
+ sym = kstrdup(symname, GFP_KERNEL);
+ if (!sym)
+ return -ENOMEM;
+
+ data = (struct cifs_open_info_data) {
+ .reparse_point = true,
+ .reparse = { .tag = IO_REPARSE_TAG_SYMLINK, },
+ .symlink_target = sym,
+ };
+
+ convert_delimiter(sym, sep);
+ path = cifs_convert_path_to_utf16(sym, cifs_sb);
+ if (!path) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ plen = 2 * UniStrnlen((wchar_t *)path, PATH_MAX);
+ len = sizeof(*buf) + plen * 2;
+ buf = kzalloc(len, GFP_KERNEL);
+ if (!buf) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK);
+ buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer));
+ buf->SubstituteNameOffset = cpu_to_le16(plen);
+ buf->SubstituteNameLength = cpu_to_le16(plen);
+ memcpy(&buf->PathBuffer[plen], path, plen);
+ buf->PrintNameOffset = 0;
+ buf->PrintNameLength = cpu_to_le16(plen);
+ memcpy(buf->PathBuffer, path, plen);
+ buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0);
+ if (*sym != sep)
+ buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE);
+
+ convert_delimiter(sym, '/');
+ iov.iov_base = buf;
+ iov.iov_len = len;
+ new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+ tcon, full_path, &iov, NULL);
+ if (!IS_ERR(new))
+ d_instantiate(dentry, new);
+ else
+ rc = PTR_ERR(new);
+out:
+ kfree(path);
+ cifs_free_open_info(&data);
+ kfree(buf);
+ return rc;
+}
+
+static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
+ mode_t mode, dev_t dev,
+ struct kvec *iov)
+{
+ u64 type;
+ u16 len, dlen;
+
+ len = sizeof(*buf);
+
+ switch ((type = reparse_mode_nfs_type(mode))) {
+ case NFS_SPECFILE_BLK:
+ case NFS_SPECFILE_CHR:
+ dlen = sizeof(__le64);
+ break;
+ case NFS_SPECFILE_FIFO:
+ case NFS_SPECFILE_SOCK:
+ dlen = 0;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_NFS);
+ buf->Reserved = 0;
+ buf->InodeType = cpu_to_le64(type);
+ buf->ReparseDataLength = cpu_to_le16(len + dlen -
+ sizeof(struct reparse_data_buffer));
+ *(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MAJOR(dev) << 32) |
+ MINOR(dev));
+ iov->iov_base = buf;
+ iov->iov_len = len + dlen;
+ return 0;
+}
+
+static int mknod_nfs(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev)
+{
+ struct cifs_open_info_data data;
+ struct reparse_posix_data *p;
+ struct inode *new;
+ struct kvec iov;
+ __u8 buf[sizeof(*p) + sizeof(__le64)];
+ int rc;
+
+ p = (struct reparse_posix_data *)buf;
+ rc = nfs_set_reparse_buf(p, mode, dev, &iov);
+ if (rc)
+ return rc;
+
+ data = (struct cifs_open_info_data) {
+ .reparse_point = true,
+ .reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, },
+ };
+
+ new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+ tcon, full_path, &iov, NULL);
+ if (!IS_ERR(new))
+ d_instantiate(dentry, new);
+ else
+ rc = PTR_ERR(new);
+ cifs_free_open_info(&data);
+ return rc;
+}
+
+static int wsl_set_reparse_buf(struct reparse_data_buffer *buf,
+ mode_t mode, struct kvec *iov)
+{
+ u32 tag;
+
+ switch ((tag = reparse_mode_wsl_tag(mode))) {
+ case IO_REPARSE_TAG_LX_BLK:
+ case IO_REPARSE_TAG_LX_CHR:
+ case IO_REPARSE_TAG_LX_FIFO:
+ case IO_REPARSE_TAG_AF_UNIX:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ buf->ReparseTag = cpu_to_le32(tag);
+ buf->Reserved = 0;
+ buf->ReparseDataLength = 0;
+ iov->iov_base = buf;
+ iov->iov_len = sizeof(*buf);
+ return 0;
+}
+
+static struct smb2_create_ea_ctx *ea_create_context(u32 dlen, size_t *cc_len)
+{
+ struct smb2_create_ea_ctx *cc;
+
+ *cc_len = round_up(sizeof(*cc) + dlen, 8);
+ cc = kzalloc(*cc_len, GFP_KERNEL);
+ if (!cc)
+ return ERR_PTR(-ENOMEM);
+
+ cc->ctx.NameOffset = cpu_to_le16(offsetof(struct smb2_create_ea_ctx,
+ name));
+ cc->ctx.NameLength = cpu_to_le16(4);
+ memcpy(cc->name, SMB2_CREATE_EA_BUFFER, strlen(SMB2_CREATE_EA_BUFFER));
+ cc->ctx.DataOffset = cpu_to_le16(offsetof(struct smb2_create_ea_ctx, ea));
+ cc->ctx.DataLength = cpu_to_le32(dlen);
+ return cc;
+}
+
+struct wsl_xattr {
+ const char *name;
+ __le64 value;
+ u16 size;
+ u32 next;
+};
+
+static int wsl_set_xattrs(struct inode *inode, umode_t _mode,
+ dev_t _dev, struct kvec *iov)
+{
+ struct smb2_file_full_ea_info *ea;
+ struct smb2_create_ea_ctx *cc;
+ struct smb3_fs_context *ctx = CIFS_SB(inode->i_sb)->ctx;
+ __le64 uid = cpu_to_le64(from_kuid(current_user_ns(), ctx->linux_uid));
+ __le64 gid = cpu_to_le64(from_kgid(current_user_ns(), ctx->linux_gid));
+ __le64 dev = cpu_to_le64(((u64)MINOR(_dev) << 32) | MAJOR(_dev));
+ __le64 mode = cpu_to_le64(_mode);
+ struct wsl_xattr xattrs[] = {
+ { .name = SMB2_WSL_XATTR_UID, .value = uid, .size = SMB2_WSL_XATTR_UID_SIZE, },
+ { .name = SMB2_WSL_XATTR_GID, .value = gid, .size = SMB2_WSL_XATTR_GID_SIZE, },
+ { .name = SMB2_WSL_XATTR_MODE, .value = mode, .size = SMB2_WSL_XATTR_MODE_SIZE, },
+ { .name = SMB2_WSL_XATTR_DEV, .value = dev, .size = SMB2_WSL_XATTR_DEV_SIZE, },
+ };
+ size_t cc_len;
+ u32 dlen = 0, next = 0;
+ int i, num_xattrs;
+ u8 name_size = SMB2_WSL_XATTR_NAME_LEN + 1;
+
+ memset(iov, 0, sizeof(*iov));
+
+ /* Exclude $LXDEV xattr for sockets and fifos */
+ if (S_ISSOCK(_mode) || S_ISFIFO(_mode))
+ num_xattrs = ARRAY_SIZE(xattrs) - 1;
+ else
+ num_xattrs = ARRAY_SIZE(xattrs);
+
+ for (i = 0; i < num_xattrs; i++) {
+ xattrs[i].next = ALIGN(sizeof(*ea) + name_size +
+ xattrs[i].size, 4);
+ dlen += xattrs[i].next;
+ }
+
+ cc = ea_create_context(dlen, &cc_len);
+ if (IS_ERR(cc))
+ return PTR_ERR(cc);
+
+ ea = &cc->ea;
+ for (i = 0; i < num_xattrs; i++) {
+ ea = (void *)((u8 *)ea + next);
+ next = xattrs[i].next;
+ ea->next_entry_offset = cpu_to_le32(next);
+
+ ea->ea_name_length = name_size - 1;
+ ea->ea_value_length = cpu_to_le16(xattrs[i].size);
+ memcpy(ea->ea_data, xattrs[i].name, name_size);
+ memcpy(&ea->ea_data[name_size],
+ &xattrs[i].value, xattrs[i].size);
+ }
+ ea->next_entry_offset = 0;
+
+ iov->iov_base = cc;
+ iov->iov_len = cc_len;
+ return 0;
+}
+
+static int mknod_wsl(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev)
+{
+ struct cifs_open_info_data data;
+ struct reparse_data_buffer buf;
+ struct smb2_create_ea_ctx *cc;
+ struct inode *new;
+ unsigned int len;
+ struct kvec reparse_iov, xattr_iov;
+ int rc;
+
+ rc = wsl_set_reparse_buf(&buf, mode, &reparse_iov);
+ if (rc)
+ return rc;
+
+ rc = wsl_set_xattrs(inode, mode, dev, &xattr_iov);
+ if (rc)
+ return rc;
+
+ data = (struct cifs_open_info_data) {
+ .reparse_point = true,
+ .reparse = { .tag = le32_to_cpu(buf.ReparseTag), .buf = &buf, },
+ };
+
+ cc = xattr_iov.iov_base;
+ len = le32_to_cpu(cc->ctx.DataLength);
+ memcpy(data.wsl.eas, &cc->ea, len);
+ data.wsl.eas_len = len;
+
+ new = smb2_get_reparse_inode(&data, inode->i_sb,
+ xid, tcon, full_path,
+ &reparse_iov, &xattr_iov);
+ if (!IS_ERR(new))
+ d_instantiate(dentry, new);
+ else
+ rc = PTR_ERR(new);
+ cifs_free_open_info(&data);
+ kfree(xattr_iov.iov_base);
+ return rc;
+}
+
+int smb2_mknod_reparse(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev)
+{
+ struct smb3_fs_context *ctx = CIFS_SB(inode->i_sb)->ctx;
+ int rc = -EOPNOTSUPP;
+
+ switch (ctx->reparse_type) {
+ case CIFS_REPARSE_TYPE_NFS:
+ rc = mknod_nfs(xid, inode, dentry, tcon, full_path, mode, dev);
+ break;
+ case CIFS_REPARSE_TYPE_WSL:
+ rc = mknod_wsl(xid, inode, dentry, tcon, full_path, mode, dev);
+ break;
+ }
+ return rc;
+}
+
+/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */
+static int parse_reparse_posix(struct reparse_posix_data *buf,
+ struct cifs_sb_info *cifs_sb,
+ struct cifs_open_info_data *data)
+{
+ unsigned int len;
+ u64 type;
+
+ switch ((type = le64_to_cpu(buf->InodeType))) {
+ case NFS_SPECFILE_LNK:
+ len = le16_to_cpu(buf->ReparseDataLength);
+ data->symlink_target = cifs_strndup_from_utf16(buf->DataBuffer,
+ len, true,
+ cifs_sb->local_nls);
+ if (!data->symlink_target)
+ return -ENOMEM;
+ convert_delimiter(data->symlink_target, '/');
+ cifs_dbg(FYI, "%s: target path: %s\n",
+ __func__, data->symlink_target);
+ break;
+ case NFS_SPECFILE_CHR:
+ case NFS_SPECFILE_BLK:
+ case NFS_SPECFILE_FIFO:
+ case NFS_SPECFILE_SOCK:
+ break;
+ default:
+ cifs_dbg(VFS, "%s: unhandled inode type: 0x%llx\n",
+ __func__, type);
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym,
+ u32 plen, bool unicode,
+ struct cifs_sb_info *cifs_sb,
+ struct cifs_open_info_data *data)
+{
+ unsigned int len;
+ unsigned int offs;
+
+ /* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */
+
+ offs = le16_to_cpu(sym->SubstituteNameOffset);
+ len = le16_to_cpu(sym->SubstituteNameLength);
+ if (offs + 20 > plen || offs + len + 20 > plen) {
+ cifs_dbg(VFS, "srv returned malformed symlink buffer\n");
+ return -EIO;
+ }
+
+ data->symlink_target = cifs_strndup_from_utf16(sym->PathBuffer + offs,
+ len, unicode,
+ cifs_sb->local_nls);
+ if (!data->symlink_target)
+ return -ENOMEM;
+
+ convert_delimiter(data->symlink_target, '/');
+ cifs_dbg(FYI, "%s: target path: %s\n", __func__, data->symlink_target);
+
+ return 0;
+}
+
+int parse_reparse_point(struct reparse_data_buffer *buf,
+ u32 plen, struct cifs_sb_info *cifs_sb,
+ bool unicode, struct cifs_open_info_data *data)
+{
+ data->reparse.buf = buf;
+
+ /* See MS-FSCC 2.1.2 */
+ switch (le32_to_cpu(buf->ReparseTag)) {
+ case IO_REPARSE_TAG_NFS:
+ return parse_reparse_posix((struct reparse_posix_data *)buf,
+ cifs_sb, data);
+ case IO_REPARSE_TAG_SYMLINK:
+ return parse_reparse_symlink(
+ (struct reparse_symlink_data_buffer *)buf,
+ plen, unicode, cifs_sb, data);
+ case IO_REPARSE_TAG_LX_SYMLINK:
+ case IO_REPARSE_TAG_AF_UNIX:
+ case IO_REPARSE_TAG_LX_FIFO:
+ case IO_REPARSE_TAG_LX_CHR:
+ case IO_REPARSE_TAG_LX_BLK:
+ return 0;
+ default:
+ cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n",
+ __func__, le32_to_cpu(buf->ReparseTag));
+ return -EOPNOTSUPP;
+ }
+}
+
+int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
+ struct kvec *rsp_iov,
+ struct cifs_open_info_data *data)
+{
+ struct reparse_data_buffer *buf;
+ struct smb2_ioctl_rsp *io = rsp_iov->iov_base;
+ u32 plen = le32_to_cpu(io->OutputCount);
+
+ buf = (struct reparse_data_buffer *)((u8 *)io +
+ le32_to_cpu(io->OutputOffset));
+ return parse_reparse_point(buf, plen, cifs_sb, true, data);
+}
+
+static void wsl_to_fattr(struct cifs_open_info_data *data,
+ struct cifs_sb_info *cifs_sb,
+ u32 tag, struct cifs_fattr *fattr)
+{
+ struct smb2_file_full_ea_info *ea;
+ u32 next = 0;
+
+ switch (tag) {
+ case IO_REPARSE_TAG_LX_SYMLINK:
+ fattr->cf_mode |= S_IFLNK;
+ break;
+ case IO_REPARSE_TAG_LX_FIFO:
+ fattr->cf_mode |= S_IFIFO;
+ break;
+ case IO_REPARSE_TAG_AF_UNIX:
+ fattr->cf_mode |= S_IFSOCK;
+ break;
+ case IO_REPARSE_TAG_LX_CHR:
+ fattr->cf_mode |= S_IFCHR;
+ break;
+ case IO_REPARSE_TAG_LX_BLK:
+ fattr->cf_mode |= S_IFBLK;
+ break;
+ }
+
+ if (!data->wsl.eas_len)
+ goto out;
+
+ ea = (struct smb2_file_full_ea_info *)data->wsl.eas;
+ do {
+ const char *name;
+ void *v;
+ u8 nlen;
+
+ ea = (void *)((u8 *)ea + next);
+ next = le32_to_cpu(ea->next_entry_offset);
+ if (!le16_to_cpu(ea->ea_value_length))
+ continue;
+
+ name = ea->ea_data;
+ nlen = ea->ea_name_length;
+ v = (void *)((u8 *)ea->ea_data + ea->ea_name_length + 1);
+
+ if (!strncmp(name, SMB2_WSL_XATTR_UID, nlen))
+ fattr->cf_uid = wsl_make_kuid(cifs_sb, v);
+ else if (!strncmp(name, SMB2_WSL_XATTR_GID, nlen))
+ fattr->cf_gid = wsl_make_kgid(cifs_sb, v);
+ else if (!strncmp(name, SMB2_WSL_XATTR_MODE, nlen))
+ fattr->cf_mode = (umode_t)le32_to_cpu(*(__le32 *)v);
+ else if (!strncmp(name, SMB2_WSL_XATTR_DEV, nlen))
+ fattr->cf_rdev = wsl_mkdev(v);
+ } while (next);
+out:
+ fattr->cf_dtype = S_DT(fattr->cf_mode);
+}
+
+bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
+ struct cifs_fattr *fattr,
+ struct cifs_open_info_data *data)
+{
+ struct reparse_posix_data *buf = data->reparse.posix;
+ u32 tag = data->reparse.tag;
+
+ if (tag == IO_REPARSE_TAG_NFS && buf) {
+ switch (le64_to_cpu(buf->InodeType)) {
+ case NFS_SPECFILE_CHR:
+ fattr->cf_mode |= S_IFCHR;
+ fattr->cf_rdev = reparse_nfs_mkdev(buf);
+ break;
+ case NFS_SPECFILE_BLK:
+ fattr->cf_mode |= S_IFBLK;
+ fattr->cf_rdev = reparse_nfs_mkdev(buf);
+ break;
+ case NFS_SPECFILE_FIFO:
+ fattr->cf_mode |= S_IFIFO;
+ break;
+ case NFS_SPECFILE_SOCK:
+ fattr->cf_mode |= S_IFSOCK;
+ break;
+ case NFS_SPECFILE_LNK:
+ fattr->cf_mode |= S_IFLNK;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+ goto out;
+ }
+
+ switch (tag) {
+ case IO_REPARSE_TAG_DFS:
+ case IO_REPARSE_TAG_DFSR:
+ case IO_REPARSE_TAG_MOUNT_POINT:
+ /* See cifs_create_junction_fattr() */
+ fattr->cf_mode = S_IFDIR | 0711;
+ break;
+ case IO_REPARSE_TAG_LX_SYMLINK:
+ case IO_REPARSE_TAG_LX_FIFO:
+ case IO_REPARSE_TAG_AF_UNIX:
+ case IO_REPARSE_TAG_LX_CHR:
+ case IO_REPARSE_TAG_LX_BLK:
+ wsl_to_fattr(data, cifs_sb, tag, fattr);
+ break;
+ case 0: /* SMB1 symlink */
+ case IO_REPARSE_TAG_SYMLINK:
+ case IO_REPARSE_TAG_NFS:
+ fattr->cf_mode |= S_IFLNK;
+ break;
+ default:
+ return false;
+ }
+out:
+ fattr->cf_dtype = S_DT(fattr->cf_mode);
+ return true;
+}
diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h
new file mode 100644
index 000000000000..6b55d1df9e2f
--- /dev/null
+++ b/fs/smb/client/reparse.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024 Paulo Alcantara <pc@manguebit.com>
+ */
+
+#ifndef _CIFS_REPARSE_H
+#define _CIFS_REPARSE_H
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/uidgid.h>
+#include "fs_context.h"
+#include "cifsglob.h"
+
+/*
+ * Build a dev_t from the 64-bit little-endian word at the start of an
+ * NFS-style reparse DataBuffer: the upper 32 bits are handed to MKDEV() as
+ * its first (major) argument and the lower 32 bits as its second (minor).
+ */
+static inline dev_t reparse_nfs_mkdev(struct reparse_posix_data *buf)
+{
+	u64 devnum = le64_to_cpu(*(__le64 *)buf->DataBuffer);
+	u64 hi = devnum >> 32;
+	u64 lo = devnum & 0xffffffff;
+
+	return MKDEV(hi, lo);
+}
+
+/*
+ * Build a dev_t from the 64-bit little-endian word at @ptr. Unlike
+ * reparse_nfs_mkdev(), the lower 32 bits are handed to MKDEV() as its first
+ * (major) argument and the upper 32 bits as its second (minor).
+ */
+static inline dev_t wsl_mkdev(void *ptr)
+{
+	u64 devnum = le64_to_cpu(*(__le64 *)ptr);
+	u64 lo = devnum & 0xffffffff;
+	u64 hi = devnum >> 32;
+
+	return MKDEV(lo, hi);
+}
+
+/*
+ * Turn the 32-bit little-endian uid stored at @ptr into a kuid_t.
+ *
+ * When the mount has CIFS_MOUNT_OVERR_UID set, the uid from the mount
+ * context is returned instead; otherwise the value read from @ptr is mapped
+ * through the current user namespace.
+ */
+static inline kuid_t wsl_make_kuid(struct cifs_sb_info *cifs_sb,
+				   void *ptr)
+{
+	const u32 raw_uid = le32_to_cpu(*(__le32 *)ptr);
+	const bool overridden = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID;
+
+	return overridden ? cifs_sb->ctx->linux_uid
+			  : make_kuid(current_user_ns(), raw_uid);
+}
+
+/*
+ * Turn the 32-bit little-endian gid stored at @ptr into a kgid_t.
+ *
+ * When the mount has CIFS_MOUNT_OVERR_GID set, the gid from the mount
+ * context is returned instead; otherwise the value read from @ptr is mapped
+ * through the current user namespace.
+ */
+static inline kgid_t wsl_make_kgid(struct cifs_sb_info *cifs_sb,
+				   void *ptr)
+{
+	const u32 raw_gid = le32_to_cpu(*(__le32 *)ptr);
+	const bool overridden = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID;
+
+	return overridden ? cifs_sb->ctx->linux_gid
+			  : make_kgid(current_user_ns(), raw_gid);
+}
+
+/*
+ * Map the file type bits of @mode to the matching NFS_SPECFILE_* value.
+ * Only block, character, FIFO and socket types are mapped; any other type
+ * yields 0.
+ */
+static inline u64 reparse_mode_nfs_type(mode_t mode)
+{
+	const mode_t fmt = mode & S_IFMT;
+
+	if (fmt == S_IFBLK)
+		return NFS_SPECFILE_BLK;
+	if (fmt == S_IFCHR)
+		return NFS_SPECFILE_CHR;
+	if (fmt == S_IFIFO)
+		return NFS_SPECFILE_FIFO;
+	if (fmt == S_IFSOCK)
+		return NFS_SPECFILE_SOCK;
+	return 0;
+}
+
+/*
+ * Map the file type bits of @mode to the matching WSL reparse tag.
+ * Only block, character, FIFO and socket types are mapped; any other type
+ * yields 0.
+ */
+static inline u32 reparse_mode_wsl_tag(mode_t mode)
+{
+	const mode_t fmt = mode & S_IFMT;
+
+	if (fmt == S_IFBLK)
+		return IO_REPARSE_TAG_LX_BLK;
+	if (fmt == S_IFCHR)
+		return IO_REPARSE_TAG_LX_CHR;
+	if (fmt == S_IFIFO)
+		return IO_REPARSE_TAG_LX_FIFO;
+	if (fmt == S_IFSOCK)
+		return IO_REPARSE_TAG_AF_UNIX;
+	return 0;
+}
+
+/*
+ * Decide whether @inode still describes the reparse point reported in @fattr.
+ *
+ * The inode matches only if it is flagged as a reparse point and both its
+ * reparse tag and its ctime are unchanged.
+ *
+ * Windows Server updates ctime of reparse points when their data have changed.
+ * The server doesn't allow changing reparse tags from existing reparse points,
+ * though it's worth checking.
+ */
+static inline bool reparse_inode_match(struct inode *inode,
+				       struct cifs_fattr *fattr)
+{
+	struct timespec64 ctime = inode_get_ctime(inode);
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
+
+	if (!(cinode->cifsAttrs & ATTR_REPARSE))
+		return false;
+	if (cinode->reparse_tag != fattr->cf_cifstag)
+		return false;
+	return timespec64_equal(&ctime, &fattr->cf_ctime);
+}
+
+/*
+ * Report whether @data refers to a reparse point, either because
+ * data->reparse_point is set or because ATTR_REPARSE is present in the file
+ * attributes.
+ *
+ * As a side effect the attributes stored in data->fi are rewritten, with
+ * ATTR_REPARSE forced on whenever the result is true.
+ */
+static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data)
+{
+	u32 attrs = le32_to_cpu(data->fi.Attributes);
+	bool is_reparse = data->reparse_point || (attrs & ATTR_REPARSE);
+
+	if (is_reparse)
+		attrs |= ATTR_REPARSE;
+	data->fi.Attributes = cpu_to_le32(attrs);
+	return is_reparse;
+}
+
+bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
+ struct cifs_fattr *fattr,
+ struct cifs_open_info_data *data);
+int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, const char *symname);
+int smb2_mknod_reparse(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev);
+int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, struct kvec *rsp_iov,
+ struct cifs_open_info_data *data);
+
+#endif /* _CIFS_REPARSE_H */
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 8f37373fd333..3216f786908f 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -230,7 +230,7 @@ int cifs_try_adding_channels(struct cifs_ses *ses)
spin_lock(&ses->iface_lock);
if (!ses->iface_count) {
spin_unlock(&ses->iface_lock);
- cifs_dbg(VFS, "server %s does not advertise interfaces\n",
+ cifs_dbg(ONCE, "server %s does not advertise interfaces\n",
ses->server->hostname);
break;
}
@@ -396,7 +396,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
spin_lock(&ses->iface_lock);
if (!ses->iface_count) {
spin_unlock(&ses->iface_lock);
- cifs_dbg(VFS, "server %s does not advertise interfaces\n", ses->server->hostname);
+ cifs_dbg(ONCE, "server %s does not advertise interfaces\n", ses->server->hostname);
return;
}
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index a9eaba8083b0..212ec6f66ec6 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -753,11 +753,11 @@ cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode);
}
-static void
+static int
cifs_close_file(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid *fid)
{
- CIFSSMBClose(xid, tcon, fid->netfid);
+ return CIFSSMBClose(xid, tcon, fid->netfid);
}
static int
diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c
index e0ee96d69d49..c23478ab1cf8 100644
--- a/fs/smb/client/smb2file.c
+++ b/fs/smb/client/smb2file.c
@@ -228,7 +228,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
* flock and OFD lock are associated with an open
* file description, not the process.
*/
- if (!(flock->fl_flags & (FL_FLOCK | FL_OFDLCK)))
+ if (!(flock->c.flc_flags & (FL_FLOCK | FL_OFDLCK)))
continue;
if (cinode->can_cache_brlcks) {
/*
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index a0c156996fc5..2466e6155136 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -36,7 +36,8 @@ enum smb2_compound_ops {
SMB2_OP_RMDIR,
SMB2_OP_POSIX_QUERY_INFO,
SMB2_OP_SET_REPARSE,
- SMB2_OP_GET_REPARSE
+ SMB2_OP_GET_REPARSE,
+ SMB2_OP_QUERY_WSL_EA,
};
/* Used when constructing chained read requests. */
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 05818cd6d932..5c02a12251c8 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -85,6 +85,82 @@ static int parse_posix_sids(struct cifs_open_info_data *data,
return 0;
}
+/*
+ * One entry of the EA name list sent with the WSL xattr query: offset to the
+ * next entry, length of the name, and the name itself with room for one
+ * trailing byte.
+ */
+struct wsl_query_ea {
+	__le32	next;
+	__u8	name_len;
+	__u8	name[SMB2_WSL_XATTR_NAME_LEN + 1];
+} __packed;
+
+/* Entries are laid out back to back, so "next" is simply the entry size. */
+#define NEXT_OFF cpu_to_le32(sizeof(struct wsl_query_ea))
+
+/* The four WSL xattrs requested in a single query; the last entry has next == 0. */
+static const struct wsl_query_ea wsl_query_eas[] = {
+	{
+		.next = NEXT_OFF,
+		.name_len = SMB2_WSL_XATTR_NAME_LEN,
+		.name = SMB2_WSL_XATTR_UID,
+	},
+	{
+		.next = NEXT_OFF,
+		.name_len = SMB2_WSL_XATTR_NAME_LEN,
+		.name = SMB2_WSL_XATTR_GID,
+	},
+	{
+		.next = NEXT_OFF,
+		.name_len = SMB2_WSL_XATTR_NAME_LEN,
+		.name = SMB2_WSL_XATTR_MODE,
+	},
+	{
+		.next = 0,
+		.name_len = SMB2_WSL_XATTR_NAME_LEN,
+		.name = SMB2_WSL_XATTR_DEV,
+	},
+};
+
+/*
+ * Validate the full-EA entries returned for the WSL xattr query before the
+ * caller copies them out of the response.
+ *
+ * The reported output length must be within the WSL min/max response sizes.
+ * Every entry must then:
+ *  - have its fixed header inside @rsp_iov,
+ *  - carry a name of exactly SMB2_WSL_XATTR_NAME_LEN bytes,
+ *  - have its name, the byte following it and its value inside @rsp_iov,
+ *  - pair a known WSL xattr name with a matching value length (4 bytes for
+ *    UID/GID/MODE, 8 bytes for DEV, or 0 for any of the four),
+ *  - chain to the next entry with a 4-byte aligned, non-overflowing offset.
+ *
+ * Returns 0 when every entry is well formed, -EINVAL otherwise.
+ */
+static int check_wsl_eas(struct kvec *rsp_iov)
+{
+	struct smb2_file_full_ea_info *ea;
+	struct smb2_query_info_rsp *rsp = rsp_iov->iov_base;
+	unsigned long addr;
+	u32 outlen, next;
+	u16 vlen;
+	u8 nlen;
+	u8 *end;
+
+	outlen = le32_to_cpu(rsp->OutputBufferLength);
+	if (outlen < SMB2_WSL_MIN_QUERY_EA_RESP_SIZE ||
+	    outlen > SMB2_WSL_MAX_QUERY_EA_RESP_SIZE)
+		return -EINVAL;
+
+	ea = (void *)((u8 *)rsp_iov->iov_base +
+		      le16_to_cpu(rsp->OutputBufferOffset));
+	end = (u8 *)rsp_iov->iov_base + rsp_iov->iov_len;
+	for (;;) {
+		/* The fixed header must be readable before any field is used. */
+		if ((u8 *)ea > end - sizeof(*ea))
+			return -EINVAL;
+
+		nlen = ea->ea_name_length;
+		vlen = le16_to_cpu(ea->ea_value_length);
+		/*
+		 * Name and value live in ea_data, past the fixed header, so
+		 * bound them from there; measuring from the start of the
+		 * entry would let them run past @end by the header size.
+		 */
+		if (nlen != SMB2_WSL_XATTR_NAME_LEN ||
+		    (u8 *)ea->ea_data + nlen + 1 + vlen > end)
+			return -EINVAL;
+
+		switch (vlen) {
+		case 4:
+			if (strncmp(ea->ea_data, SMB2_WSL_XATTR_UID, nlen) &&
+			    strncmp(ea->ea_data, SMB2_WSL_XATTR_GID, nlen) &&
+			    strncmp(ea->ea_data, SMB2_WSL_XATTR_MODE, nlen))
+				return -EINVAL;
+			break;
+		case 8:
+			if (strncmp(ea->ea_data, SMB2_WSL_XATTR_DEV, nlen))
+				return -EINVAL;
+			break;
+		case 0:
+			/* An empty value is accepted for any known name. */
+			if (!strncmp(ea->ea_data, SMB2_WSL_XATTR_UID, nlen) ||
+			    !strncmp(ea->ea_data, SMB2_WSL_XATTR_GID, nlen) ||
+			    !strncmp(ea->ea_data, SMB2_WSL_XATTR_MODE, nlen) ||
+			    !strncmp(ea->ea_data, SMB2_WSL_XATTR_DEV, nlen))
+				break;
+			fallthrough;
+		default:
+			return -EINVAL;
+		}
+
+		next = le32_to_cpu(ea->next_entry_offset);
+		if (!next)
+			break;
+		if (!IS_ALIGNED(next, 4) ||
+		    check_add_overflow((unsigned long)ea, next, &addr))
+			return -EINVAL;
+		/* The new entry is bounds-checked at the top of the loop. */
+		ea = (void *)addr;
+	}
+	return 0;
+}
+
/*
* note: If cfile is passed, the reference to it is dropped here.
* So make sure that you do not reuse cfile after return from this func.
@@ -95,10 +171,9 @@ static int parse_posix_sids(struct cifs_open_info_data *data,
*/
static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
- __u32 desired_access, __u32 create_disposition,
- __u32 create_options, umode_t mode, struct kvec *in_iov,
+ struct cifs_open_parms *oparms, struct kvec *in_iov,
int *cmds, int num_cmds, struct cifsFileInfo *cfile,
- struct kvec *out_iov, int *out_buftype)
+ struct kvec *out_iov, int *out_buftype, struct dentry *dentry)
{
struct reparse_data_buffer *rbuf;
@@ -115,11 +190,12 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
int resp_buftype[MAX_COMPOUND];
struct smb2_query_info_rsp *qi_rsp = NULL;
struct cifs_open_info_data *idata;
+ struct inode *inode = NULL;
int flags = 0;
__u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0};
unsigned int size[2];
void *data[2];
- int len;
+ unsigned int len;
int retries = 0, cur_sleep = 1;
replay_again:
@@ -152,16 +228,28 @@ replay_again:
goto finished;
}
- vars->oparms = (struct cifs_open_parms) {
- .tcon = tcon,
- .path = full_path,
- .desired_access = desired_access,
- .disposition = create_disposition,
- .create_options = cifs_create_options(cifs_sb, create_options),
- .fid = &fid,
- .mode = mode,
- .cifs_sb = cifs_sb,
- };
+ /* if there is an existing lease, reuse it */
+
+ /*
+ * note: files with hardlinks cause unexpected behaviour. As per MS-SMB2,
+ * lease keys are associated with the filepath. We are maintaining lease keys
+ * with the inode on the client. If the file has hardlinks, it is possible
+ * that the lease for a file be reused for an operation on its hardlink or
+ * vice versa.
+ * As a workaround, send request using an existing lease key and if the server
+ * returns STATUS_INVALID_PARAMETER, which maps to EINVAL, send the request
+ * again without the lease.
+ */
+ if (dentry) {
+ inode = d_inode(dentry);
+ if (CIFS_I(inode)->lease_granted && server->ops->get_lease_key) {
+ oplock = SMB2_OPLOCK_LEVEL_LEASE;
+ server->ops->get_lease_key(inode, &fid);
+ }
+ }
+
+ vars->oparms = *oparms;
+ vars->oparms.fid = &fid;
rqst[num_rqst].rq_iov = &vars->open_iov[0];
rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE;
@@ -202,14 +290,13 @@ replay_again:
SMB2_O_INFO_FILE, 0,
sizeof(struct smb2_file_all_info) +
PATH_MAX * 2, 0, NULL);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
- }
}
-
- if (rc)
+ if (!rc && (!cfile || num_rqst > 1)) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ } else if (rc) {
goto finished;
+ }
num_rqst++;
trace_smb3_query_info_compound_enter(xid, ses->Suid,
tcon->tid, full_path);
@@ -239,14 +326,13 @@ replay_again:
sizeof(struct smb311_posix_qinfo *) +
(PATH_MAX * 2) +
(sizeof(struct cifs_sid) * 2), 0, NULL);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
- }
}
-
- if (rc)
+ if (!rc && (!cfile || num_rqst > 1)) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ } else if (rc) {
goto finished;
+ }
num_rqst++;
trace_smb3_posix_query_info_compound_enter(xid, ses->Suid,
tcon->tid, full_path);
@@ -304,13 +390,13 @@ replay_again:
FILE_END_OF_FILE_INFORMATION,
SMB2_O_INFO_FILE, 0,
data, size);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
- }
}
- if (rc)
+ if (!rc && (!cfile || num_rqst > 1)) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ } else if (rc) {
goto finished;
+ }
num_rqst++;
trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path);
break;
@@ -335,14 +421,13 @@ replay_again:
COMPOUND_FID, current->tgid,
FILE_BASIC_INFORMATION,
SMB2_O_INFO_FILE, 0, data, size);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
- }
}
-
- if (rc)
+ if (!rc && (!cfile || num_rqst > 1)) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ } else if (rc) {
goto finished;
+ }
num_rqst++;
trace_smb3_set_info_compound_enter(xid, ses->Suid,
tcon->tid, full_path);
@@ -376,13 +461,13 @@ replay_again:
COMPOUND_FID, COMPOUND_FID,
current->tgid, FILE_RENAME_INFORMATION,
SMB2_O_INFO_FILE, 0, data, size);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
- }
}
- if (rc)
+ if (!rc && (!cfile || num_rqst > 1)) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ } else if (rc) {
goto finished;
+ }
num_rqst++;
trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path);
break;
@@ -417,15 +502,27 @@ replay_again:
rqst[num_rqst].rq_iov = vars->io_iov;
rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov);
- rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
- COMPOUND_FID, COMPOUND_FID,
- FSCTL_SET_REPARSE_POINT,
- in_iov[i].iov_base,
- in_iov[i].iov_len, 0);
- if (rc)
+ if (cfile) {
+ rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ FSCTL_SET_REPARSE_POINT,
+ in_iov[i].iov_base,
+ in_iov[i].iov_len, 0);
+ } else {
+ rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+ COMPOUND_FID, COMPOUND_FID,
+ FSCTL_SET_REPARSE_POINT,
+ in_iov[i].iov_base,
+ in_iov[i].iov_len, 0);
+ }
+ if (!rc && (!cfile || num_rqst > 1)) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ } else if (rc) {
goto finished;
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst++]);
+ }
+ num_rqst++;
trace_smb3_set_reparse_compound_enter(xid, ses->Suid,
tcon->tid, full_path);
break;
@@ -433,17 +530,61 @@ replay_again:
rqst[num_rqst].rq_iov = vars->io_iov;
rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov);
- rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
- COMPOUND_FID, COMPOUND_FID,
- FSCTL_GET_REPARSE_POINT,
- NULL, 0, CIFSMaxBufSize);
- if (rc)
+ if (cfile) {
+ rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ FSCTL_GET_REPARSE_POINT,
+ NULL, 0, CIFSMaxBufSize);
+ } else {
+ rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+ COMPOUND_FID, COMPOUND_FID,
+ FSCTL_GET_REPARSE_POINT,
+ NULL, 0, CIFSMaxBufSize);
+ }
+ if (!rc && (!cfile || num_rqst > 1)) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ } else if (rc) {
goto finished;
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst++]);
+ }
+ num_rqst++;
trace_smb3_get_reparse_compound_enter(xid, ses->Suid,
tcon->tid, full_path);
break;
+ case SMB2_OP_QUERY_WSL_EA:
+ rqst[num_rqst].rq_iov = &vars->ea_iov;
+ rqst[num_rqst].rq_nvec = 1;
+
+ if (cfile) {
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ FILE_FULL_EA_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ SMB2_WSL_MAX_QUERY_EA_RESP_SIZE,
+ sizeof(wsl_query_eas),
+ (void *)wsl_query_eas);
+ } else {
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ FILE_FULL_EA_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ SMB2_WSL_MAX_QUERY_EA_RESP_SIZE,
+ sizeof(wsl_query_eas),
+ (void *)wsl_query_eas);
+ }
+ if (!rc && (!cfile || num_rqst > 1)) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ } else if (rc) {
+ goto finished;
+ }
+ num_rqst++;
+ break;
default:
cifs_dbg(VFS, "Invalid command\n");
rc = -EINVAL;
@@ -551,8 +692,15 @@ finished:
case SMB2_OP_DELETE:
if (rc)
trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc);
- else
+ else {
+ /*
+ * If dentry (hence, inode) is NULL, lease break is going to
+ * take care of degrading leases on handles for deleted files.
+ */
+ if (inode)
+ cifs_mark_open_handles_for_deleted_file(inode, full_path);
trace_smb3_delete_done(xid, ses->Suid, tcon->tid);
+ }
break;
case SMB2_OP_MKDIR:
if (rc)
@@ -626,11 +774,32 @@ finished:
memset(iov, 0, sizeof(*iov));
resp_buftype[i + 1] = CIFS_NO_BUFFER;
} else {
- trace_smb3_set_reparse_compound_err(xid, ses->Suid,
+ trace_smb3_set_reparse_compound_err(xid, ses->Suid,
tcon->tid, rc);
}
SMB2_ioctl_free(&rqst[num_rqst++]);
break;
+ case SMB2_OP_QUERY_WSL_EA:
+ if (!rc) {
+ idata = in_iov[i].iov_base;
+ qi_rsp = rsp_iov[i + 1].iov_base;
+ data[0] = (u8 *)qi_rsp + le16_to_cpu(qi_rsp->OutputBufferOffset);
+ size[0] = le32_to_cpu(qi_rsp->OutputBufferLength);
+ rc = check_wsl_eas(&rsp_iov[i + 1]);
+ if (!rc) {
+ memcpy(idata->wsl.eas, data[0], size[0]);
+ idata->wsl.eas_len = size[0];
+ }
+ }
+ if (!rc) {
+ trace_smb3_query_wsl_ea_compound_done(xid, ses->Suid,
+ tcon->tid);
+ } else {
+ trace_smb3_query_wsl_ea_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ }
+ SMB2_query_info_free(&rqst[num_rqst++]);
+ break;
}
}
SMB2_close_free(&rqst[num_rqst]);
@@ -693,15 +862,16 @@ int smb2_query_path_info(const unsigned int xid,
const char *full_path,
struct cifs_open_info_data *data)
{
+ struct cifs_open_parms oparms;
__u32 create_options = 0;
struct cifsFileInfo *cfile;
struct cached_fid *cfid = NULL;
struct smb2_hdr *hdr;
- struct kvec in_iov[2], out_iov[3] = {};
+ struct kvec in_iov[3], out_iov[3] = {};
int out_buftype[3] = {};
- int cmds[2];
+ int cmds[3];
bool islink;
- int i, num_cmds;
+ int i, num_cmds = 0;
int rc, rc2;
data->adjust_tz = false;
@@ -734,20 +904,22 @@ int smb2_query_path_info(const unsigned int xid,
close_cached_dir(cfid);
return rc;
}
- cmds[0] = SMB2_OP_QUERY_INFO;
+ cmds[num_cmds++] = SMB2_OP_QUERY_INFO;
} else {
- cmds[0] = SMB2_OP_POSIX_QUERY_INFO;
+ cmds[num_cmds++] = SMB2_OP_POSIX_QUERY_INFO;
}
in_iov[0].iov_base = data;
in_iov[0].iov_len = sizeof(*data);
in_iov[1] = in_iov[0];
+ in_iov[2] = in_iov[0];
cifs_get_readable_path(tcon, full_path, &cfile);
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES,
+ FILE_OPEN, create_options, ACL_NO_MODE);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, in_iov,
- cmds, 1, cfile, out_iov, out_buftype);
+ &oparms, in_iov, cmds, num_cmds,
+ cfile, out_iov, out_buftype, NULL);
hdr = out_iov[0].iov_base;
/*
* If first iov is unset, then SMB session was dropped or we've got a
@@ -767,19 +939,22 @@ int smb2_query_path_info(const unsigned int xid,
if (rc || !data->reparse_point)
goto out;
- if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK) {
- /* symlink already parsed in create response */
- num_cmds = 1;
- } else {
- cmds[1] = SMB2_OP_GET_REPARSE;
- num_cmds = 2;
- }
- create_options |= OPEN_REPARSE_POINT;
+ cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA;
+ /*
+ * Skip SMB2_OP_GET_REPARSE if symlink already parsed in create
+ * response.
+ */
+ if (data->reparse.tag != IO_REPARSE_TAG_SYMLINK)
+ cmds[num_cmds++] = SMB2_OP_GET_REPARSE;
+
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
+ FILE_READ_ATTRIBUTES | FILE_READ_EA,
+ FILE_OPEN, create_options |
+ OPEN_REPARSE_POINT, ACL_NO_MODE);
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, in_iov,
- cmds, num_cmds, cfile, NULL, NULL);
+ &oparms, in_iov, cmds, num_cmds,
+ cfile, NULL, NULL, NULL);
break;
case -EREMOTE:
break;
@@ -807,11 +982,14 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
- return smb2_compound_op(xid, tcon, cifs_sb, name,
- FILE_WRITE_ATTRIBUTES, FILE_CREATE,
- CREATE_NOT_FILE, mode,
- NULL, &(int){SMB2_OP_MKDIR}, 1,
- NULL, NULL, NULL);
+ struct cifs_open_parms oparms;
+
+ oparms = CIFS_OPARMS(cifs_sb, tcon, name, FILE_WRITE_ATTRIBUTES,
+ FILE_CREATE, CREATE_NOT_FILE, mode);
+ return smb2_compound_op(xid, tcon, cifs_sb,
+ name, &oparms, NULL,
+ &(int){SMB2_OP_MKDIR}, 1,
+ NULL, NULL, NULL, NULL);
}
void
@@ -819,6 +997,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon,
const unsigned int xid)
{
+ struct cifs_open_parms oparms;
FILE_BASIC_INFO data = {};
struct cifsInodeInfo *cifs_i;
struct cifsFileInfo *cfile;
@@ -832,11 +1011,12 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
data.Attributes = cpu_to_le32(dosattrs);
cifs_get_writable_path(tcon, name, FIND_WR_ANY, &cfile);
+ oparms = CIFS_OPARMS(cifs_sb, tcon, name, FILE_WRITE_ATTRIBUTES,
+ FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE);
tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
- FILE_WRITE_ATTRIBUTES, FILE_CREATE,
- CREATE_NOT_FILE, ACL_NO_MODE, &in_iov,
+ &oparms, &in_iov,
&(int){SMB2_OP_SET_INFO}, 1,
- cfile, NULL, NULL);
+ cfile, NULL, NULL, NULL);
if (tmprc == 0)
cifs_i->cifsAttrs = dosattrs;
}
@@ -845,31 +1025,47 @@ int
smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
+ struct cifs_open_parms oparms;
+
drop_cached_dir_by_name(xid, tcon, name, cifs_sb);
- return smb2_compound_op(xid, tcon, cifs_sb, name,
- DELETE, FILE_OPEN, CREATE_NOT_FILE,
- ACL_NO_MODE, NULL,
+ oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE,
+ FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE);
+ return smb2_compound_op(xid, tcon, cifs_sb,
+ name, &oparms, NULL,
&(int){SMB2_OP_RMDIR}, 1,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
}
int
smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
- struct cifs_sb_info *cifs_sb)
+ struct cifs_sb_info *cifs_sb, struct dentry *dentry)
{
- return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
- CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- ACL_NO_MODE, NULL,
- &(int){SMB2_OP_DELETE}, 1,
- NULL, NULL, NULL);
+ struct cifs_open_parms oparms;
+
+ oparms = CIFS_OPARMS(cifs_sb, tcon, name,
+ DELETE, FILE_OPEN,
+ CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
+ ACL_NO_MODE);
+ int rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
+ NULL, &(int){SMB2_OP_DELETE}, 1,
+ NULL, NULL, NULL, dentry);
+ if (rc == -EINVAL) {
+ cifs_dbg(FYI, "invalid lease key, resending request without lease");
+ rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
+ NULL, &(int){SMB2_OP_DELETE}, 1,
+ NULL, NULL, NULL, NULL);
+ }
+ return rc;
}
static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
const char *from_name, const char *to_name,
struct cifs_sb_info *cifs_sb,
__u32 create_options, __u32 access,
- int command, struct cifsFileInfo *cfile)
+ int command, struct cifsFileInfo *cfile,
+ struct dentry *dentry)
{
+ struct cifs_open_parms oparms;
struct kvec in_iov;
__le16 *smb2_to_name = NULL;
int rc;
@@ -881,9 +1077,11 @@ static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
}
in_iov.iov_base = smb2_to_name;
in_iov.iov_len = 2 * UniStrnlen((wchar_t *)smb2_to_name, PATH_MAX);
- rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
- FILE_OPEN, create_options, ACL_NO_MODE,
- &in_iov, &command, 1, cfile, NULL, NULL);
+ oparms = CIFS_OPARMS(cifs_sb, tcon, from_name, access, FILE_OPEN,
+ create_options, ACL_NO_MODE);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, from_name,
+ &oparms, &in_iov, &command, 1,
+ cfile, NULL, NULL, dentry);
smb2_rename_path:
kfree(smb2_to_name);
return rc;
@@ -901,8 +1099,14 @@ int smb2_rename_path(const unsigned int xid,
drop_cached_dir_by_name(xid, tcon, from_name, cifs_sb);
cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile);
- return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
- co, DELETE, SMB2_OP_RENAME, cfile);
+ int rc = smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
+ co, DELETE, SMB2_OP_RENAME, cfile, source_dentry);
+ if (rc == -EINVAL) {
+ cifs_dbg(FYI, "invalid lease key, resending request without lease");
+ rc = smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
+ co, DELETE, SMB2_OP_RENAME, cfile, NULL);
+ }
+ return rc;
}
int smb2_create_hardlink(const unsigned int xid,
@@ -915,32 +1119,46 @@ int smb2_create_hardlink(const unsigned int xid,
return smb2_set_path_attr(xid, tcon, from_name, to_name,
cifs_sb, co, FILE_READ_ATTRIBUTES,
- SMB2_OP_HARDLINK, NULL);
+ SMB2_OP_HARDLINK, NULL, NULL);
}
int
smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
const char *full_path, __u64 size,
- struct cifs_sb_info *cifs_sb, bool set_alloc)
+ struct cifs_sb_info *cifs_sb, bool set_alloc,
+ struct dentry *dentry)
{
+ struct cifs_open_parms oparms;
struct cifsFileInfo *cfile;
struct kvec in_iov;
__le64 eof = cpu_to_le64(size);
+ int rc;
in_iov.iov_base = &eof;
in_iov.iov_len = sizeof(eof);
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
- return smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_WRITE_DATA, FILE_OPEN,
- 0, ACL_NO_MODE, &in_iov,
- &(int){SMB2_OP_SET_EOF}, 1,
- cfile, NULL, NULL);
+
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_WRITE_DATA,
+ FILE_OPEN, 0, ACL_NO_MODE);
+ rc = smb2_compound_op(xid, tcon, cifs_sb,
+ full_path, &oparms, &in_iov,
+ &(int){SMB2_OP_SET_EOF}, 1,
+ cfile, NULL, NULL, dentry);
+ if (rc == -EINVAL) {
+ cifs_dbg(FYI, "invalid lease key, resending request without lease");
+ rc = smb2_compound_op(xid, tcon, cifs_sb,
+ full_path, &oparms, &in_iov,
+ &(int){SMB2_OP_SET_EOF}, 1,
+ cfile, NULL, NULL, NULL);
+ }
+ return rc;
}
int
smb2_set_file_info(struct inode *inode, const char *full_path,
FILE_BASIC_INFO *buf, const unsigned int xid)
{
+ struct cifs_open_parms oparms;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink;
struct cifs_tcon *tcon;
@@ -959,11 +1177,12 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
tcon = tlink_tcon(tlink);
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_WRITE_ATTRIBUTES, FILE_OPEN,
- 0, ACL_NO_MODE, &in_iov,
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_WRITE_ATTRIBUTES,
+ FILE_OPEN, 0, ACL_NO_MODE);
+ rc = smb2_compound_op(xid, tcon, cifs_sb,
+ full_path, &oparms, &in_iov,
&(int){SMB2_OP_SET_INFO}, 1,
- cfile, NULL, NULL);
+ cfile, NULL, NULL, NULL);
cifs_put_tlink(tlink);
return rc;
}
@@ -973,32 +1192,37 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
const unsigned int xid,
struct cifs_tcon *tcon,
const char *full_path,
- struct kvec *iov)
+ struct kvec *reparse_iov,
+ struct kvec *xattr_iov)
{
+ struct cifs_open_parms oparms;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifsFileInfo *cfile;
struct inode *new = NULL;
struct kvec in_iov[2];
int cmds[2];
- int da, co, cd;
int rc;
- da = SYNCHRONIZE | DELETE |
- FILE_READ_ATTRIBUTES |
- FILE_WRITE_ATTRIBUTES;
- co = CREATE_NOT_DIR | OPEN_REPARSE_POINT;
- cd = FILE_CREATE;
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
+ SYNCHRONIZE | DELETE |
+ FILE_READ_ATTRIBUTES |
+ FILE_WRITE_ATTRIBUTES,
+ FILE_CREATE,
+ CREATE_NOT_DIR | OPEN_REPARSE_POINT,
+ ACL_NO_MODE);
+ if (xattr_iov)
+ oparms.ea_cctx = xattr_iov;
+
cmds[0] = SMB2_OP_SET_REPARSE;
- in_iov[0] = *iov;
+ in_iov[0] = *reparse_iov;
in_iov[1].iov_base = data;
in_iov[1].iov_len = sizeof(*data);
if (tcon->posix_extensions) {
cmds[1] = SMB2_OP_POSIX_QUERY_INFO;
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- da, cd, co, ACL_NO_MODE, in_iov,
- cmds, 2, cfile, NULL, NULL);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms,
+ in_iov, cmds, 2, cfile, NULL, NULL, NULL);
if (!rc) {
rc = smb311_posix_get_inode_info(&new, full_path,
data, sb, xid);
@@ -1006,9 +1230,8 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
} else {
cmds[1] = SMB2_OP_QUERY_INFO;
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- da, cd, co, ACL_NO_MODE, in_iov,
- cmds, 2, cfile, NULL, NULL);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms,
+ in_iov, cmds, 2, cfile, NULL, NULL, NULL);
if (!rc) {
rc = cifs_get_inode_info(&new, full_path,
data, sb, xid, NULL);
@@ -1024,6 +1247,7 @@ int smb2_query_reparse_point(const unsigned int xid,
u32 *tag, struct kvec *rsp,
int *rsp_buftype)
{
+ struct cifs_open_parms oparms;
struct cifs_open_info_data data = {};
struct cifsFileInfo *cfile;
struct kvec in_iov = { .iov_base = &data, .iov_len = sizeof(data), };
@@ -1032,11 +1256,12 @@ int smb2_query_reparse_point(const unsigned int xid,
cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- OPEN_REPARSE_POINT, ACL_NO_MODE, &in_iov,
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES,
+ FILE_OPEN, OPEN_REPARSE_POINT, ACL_NO_MODE);
+ rc = smb2_compound_op(xid, tcon, cifs_sb,
+ full_path, &oparms, &in_iov,
&(int){SMB2_OP_GET_REPARSE}, 1,
- cfile, NULL, NULL);
+ cfile, NULL, NULL, NULL);
if (rc)
goto out;
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index 82b84a4941dd..cc72be5a93a9 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -622,6 +622,8 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ if (cifs_ses_exiting(ses))
+ continue;
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
spin_lock(&tcon->open_file_lock);
cifs_stats_inc(
@@ -697,6 +699,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ if (cifs_ses_exiting(ses))
+ continue;
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
spin_lock(&tcon->open_file_lock);
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 4695433fcf39..78c94d0350fe 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -28,6 +28,7 @@
#include "fscache.h"
#include "fs_context.h"
#include "cached_dir.h"
+#include "reparse.h"
/* Change credits for different ops and return the total number of credits */
static int
@@ -1411,14 +1412,14 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
memcpy(cfile->fid.create_guid, fid->create_guid, 16);
}
-static void
+static int
smb2_close_file(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid *fid)
{
- SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
+ return SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
}
-static void
+static int
smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon,
struct cifsFileInfo *cfile)
{
@@ -1429,7 +1430,7 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon,
rc = __SMB2_close(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, &file_inf);
if (rc)
- return;
+ return rc;
inode = d_inode(cfile->dentry);
@@ -1458,6 +1459,7 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon,
/* End of file and Attributes should not have to be updated on close */
spin_unlock(&inode->i_lock);
+ return rc;
}
static int
@@ -2479,6 +2481,8 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ if (cifs_ses_exiting(ses))
+ continue;
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) {
spin_lock(&tcon->tc_lock);
@@ -2986,109 +2990,6 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
return rc;
}
-/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */
-static int parse_reparse_posix(struct reparse_posix_data *buf,
- struct cifs_sb_info *cifs_sb,
- struct cifs_open_info_data *data)
-{
- unsigned int len;
- u64 type;
-
- switch ((type = le64_to_cpu(buf->InodeType))) {
- case NFS_SPECFILE_LNK:
- len = le16_to_cpu(buf->ReparseDataLength);
- data->symlink_target = cifs_strndup_from_utf16(buf->DataBuffer,
- len, true,
- cifs_sb->local_nls);
- if (!data->symlink_target)
- return -ENOMEM;
- convert_delimiter(data->symlink_target, '/');
- cifs_dbg(FYI, "%s: target path: %s\n",
- __func__, data->symlink_target);
- break;
- case NFS_SPECFILE_CHR:
- case NFS_SPECFILE_BLK:
- case NFS_SPECFILE_FIFO:
- case NFS_SPECFILE_SOCK:
- break;
- default:
- cifs_dbg(VFS, "%s: unhandled inode type: 0x%llx\n",
- __func__, type);
- return -EOPNOTSUPP;
- }
- return 0;
-}
-
-static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym,
- u32 plen, bool unicode,
- struct cifs_sb_info *cifs_sb,
- struct cifs_open_info_data *data)
-{
- unsigned int len;
- unsigned int offs;
-
- /* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */
-
- offs = le16_to_cpu(sym->SubstituteNameOffset);
- len = le16_to_cpu(sym->SubstituteNameLength);
- if (offs + 20 > plen || offs + len + 20 > plen) {
- cifs_dbg(VFS, "srv returned malformed symlink buffer\n");
- return -EIO;
- }
-
- data->symlink_target = cifs_strndup_from_utf16(sym->PathBuffer + offs,
- len, unicode,
- cifs_sb->local_nls);
- if (!data->symlink_target)
- return -ENOMEM;
-
- convert_delimiter(data->symlink_target, '/');
- cifs_dbg(FYI, "%s: target path: %s\n", __func__, data->symlink_target);
-
- return 0;
-}
-
-int parse_reparse_point(struct reparse_data_buffer *buf,
- u32 plen, struct cifs_sb_info *cifs_sb,
- bool unicode, struct cifs_open_info_data *data)
-{
- data->reparse.buf = buf;
-
- /* See MS-FSCC 2.1.2 */
- switch (le32_to_cpu(buf->ReparseTag)) {
- case IO_REPARSE_TAG_NFS:
- return parse_reparse_posix((struct reparse_posix_data *)buf,
- cifs_sb, data);
- case IO_REPARSE_TAG_SYMLINK:
- return parse_reparse_symlink(
- (struct reparse_symlink_data_buffer *)buf,
- plen, unicode, cifs_sb, data);
- case IO_REPARSE_TAG_LX_SYMLINK:
- case IO_REPARSE_TAG_AF_UNIX:
- case IO_REPARSE_TAG_LX_FIFO:
- case IO_REPARSE_TAG_LX_CHR:
- case IO_REPARSE_TAG_LX_BLK:
- return 0;
- default:
- cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n",
- __func__, le32_to_cpu(buf->ReparseTag));
- return -EOPNOTSUPP;
- }
-}
-
-static int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
- struct kvec *rsp_iov,
- struct cifs_open_info_data *data)
-{
- struct reparse_data_buffer *buf;
- struct smb2_ioctl_rsp *io = rsp_iov->iov_base;
- u32 plen = le32_to_cpu(io->OutputCount);
-
- buf = (struct reparse_data_buffer *)((u8 *)io +
- le32_to_cpu(io->OutputOffset));
- return parse_reparse_point(buf, plen, cifs_sb, true, data);
-}
-
static struct cifs_ntsd *
get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb,
const struct cifs_fid *cifsfid, u32 *pacllen, u32 info)
@@ -4015,7 +3916,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
strcat(message, "W");
}
if (!new_oplock)
- strncpy(message, "None", sizeof(message));
+ strscpy(message, "None");
cinode->oplock = new_oplock;
cifs_dbg(FYI, "%s Lease granted on inode %p\n", message,
@@ -5063,214 +4964,84 @@ static int smb2_next_header(struct TCP_Server_Info *server, char *buf,
return 0;
}
-int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
- struct dentry *dentry, struct cifs_tcon *tcon,
- const char *full_path, umode_t mode, dev_t dev)
+static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev)
{
- struct cifs_open_info_data buf = {};
struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_open_parms oparms;
struct cifs_io_parms io_parms = {};
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_fid fid;
unsigned int bytes_written;
- struct win_dev *pdev;
+ struct win_dev pdev = {};
struct kvec iov[2];
__u32 oplock = server->oplocks ? REQ_OPLOCK : 0;
int rc;
- if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode))
+ switch (mode & S_IFMT) {
+ case S_IFCHR:
+ strscpy(pdev.type, "IntxCHR");
+ pdev.major = cpu_to_le64(MAJOR(dev));
+ pdev.minor = cpu_to_le64(MINOR(dev));
+ break;
+ case S_IFBLK:
+ strscpy(pdev.type, "IntxBLK");
+ pdev.major = cpu_to_le64(MAJOR(dev));
+ pdev.minor = cpu_to_le64(MINOR(dev));
+ break;
+ case S_IFIFO:
+ strscpy(pdev.type, "LnxFIFO");
+ break;
+ default:
return -EPERM;
+ }
- oparms = (struct cifs_open_parms) {
- .tcon = tcon,
- .cifs_sb = cifs_sb,
- .desired_access = GENERIC_WRITE,
- .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR |
- CREATE_OPTION_SPECIAL),
- .disposition = FILE_CREATE,
- .path = full_path,
- .fid = &fid,
- };
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, GENERIC_WRITE,
+ FILE_CREATE, CREATE_NOT_DIR |
+ CREATE_OPTION_SPECIAL, ACL_NO_MODE);
+ oparms.fid = &fid;
- rc = server->ops->open(xid, &oparms, &oplock, &buf);
+ rc = server->ops->open(xid, &oparms, &oplock, NULL);
if (rc)
return rc;
- /*
- * BB Do not bother to decode buf since no local inode yet to put
- * timestamps in, but we can reuse it safely.
- */
- pdev = (struct win_dev *)&buf.fi;
io_parms.pid = current->tgid;
io_parms.tcon = tcon;
- io_parms.length = sizeof(*pdev);
- iov[1].iov_base = pdev;
- iov[1].iov_len = sizeof(*pdev);
- if (S_ISCHR(mode)) {
- memcpy(pdev->type, "IntxCHR", 8);
- pdev->major = cpu_to_le64(MAJOR(dev));
- pdev->minor = cpu_to_le64(MINOR(dev));
- } else if (S_ISBLK(mode)) {
- memcpy(pdev->type, "IntxBLK", 8);
- pdev->major = cpu_to_le64(MAJOR(dev));
- pdev->minor = cpu_to_le64(MINOR(dev));
- } else if (S_ISFIFO(mode)) {
- memcpy(pdev->type, "LnxFIFO", 8);
- }
+ io_parms.length = sizeof(pdev);
+ iov[1].iov_base = &pdev;
+ iov[1].iov_len = sizeof(pdev);
rc = server->ops->sync_write(xid, &fid, &io_parms,
&bytes_written, iov, 1);
server->ops->close(xid, tcon, &fid);
- d_drop(dentry);
- /* FIXME: add code here to set EAs */
- cifs_free_open_info(&buf);
return rc;
}
-static inline u64 mode_nfs_type(mode_t mode)
-{
- switch (mode & S_IFMT) {
- case S_IFBLK: return NFS_SPECFILE_BLK;
- case S_IFCHR: return NFS_SPECFILE_CHR;
- case S_IFIFO: return NFS_SPECFILE_FIFO;
- case S_IFSOCK: return NFS_SPECFILE_SOCK;
- }
- return 0;
-}
-
-static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
- mode_t mode, dev_t dev,
- struct kvec *iov)
-{
- u64 type;
- u16 len, dlen;
-
- len = sizeof(*buf);
-
- switch ((type = mode_nfs_type(mode))) {
- case NFS_SPECFILE_BLK:
- case NFS_SPECFILE_CHR:
- dlen = sizeof(__le64);
- break;
- case NFS_SPECFILE_FIFO:
- case NFS_SPECFILE_SOCK:
- dlen = 0;
- break;
- default:
- return -EOPNOTSUPP;
- }
-
- buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_NFS);
- buf->Reserved = 0;
- buf->InodeType = cpu_to_le64(type);
- buf->ReparseDataLength = cpu_to_le16(len + dlen -
- sizeof(struct reparse_data_buffer));
- *(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MAJOR(dev) << 32) |
- MINOR(dev));
- iov->iov_base = buf;
- iov->iov_len = len + dlen;
- return 0;
-}
-
-static int nfs_make_node(unsigned int xid, struct inode *inode,
- struct dentry *dentry, struct cifs_tcon *tcon,
- const char *full_path, umode_t mode, dev_t dev)
+int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev)
{
- struct cifs_open_info_data data;
- struct reparse_posix_data *p;
- struct inode *new;
- struct kvec iov;
- __u8 buf[sizeof(*p) + sizeof(__le64)];
+ struct inode *new = NULL;
int rc;
- p = (struct reparse_posix_data *)buf;
- rc = nfs_set_reparse_buf(p, mode, dev, &iov);
+ rc = __cifs_sfu_make_node(xid, inode, dentry, tcon,
+ full_path, mode, dev);
if (rc)
return rc;
- data = (struct cifs_open_info_data) {
- .reparse_point = true,
- .reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, },
- };
-
- new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
- tcon, full_path, &iov);
- if (!IS_ERR(new))
- d_instantiate(dentry, new);
- else
- rc = PTR_ERR(new);
- cifs_free_open_info(&data);
- return rc;
-}
-
-static int smb2_create_reparse_symlink(const unsigned int xid,
- struct inode *inode,
- struct dentry *dentry,
- struct cifs_tcon *tcon,
- const char *full_path,
- const char *symname)
-{
- struct reparse_symlink_data_buffer *buf = NULL;
- struct cifs_open_info_data data;
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- struct inode *new;
- struct kvec iov;
- __le16 *path;
- char *sym, sep = CIFS_DIR_SEP(cifs_sb);
- u16 len, plen;
- int rc = 0;
-
- sym = kstrdup(symname, GFP_KERNEL);
- if (!sym)
- return -ENOMEM;
-
- data = (struct cifs_open_info_data) {
- .reparse_point = true,
- .reparse = { .tag = IO_REPARSE_TAG_SYMLINK, },
- .symlink_target = sym,
- };
-
- convert_delimiter(sym, sep);
- path = cifs_convert_path_to_utf16(sym, cifs_sb);
- if (!path) {
- rc = -ENOMEM;
- goto out;
- }
-
- plen = 2 * UniStrnlen((wchar_t *)path, PATH_MAX);
- len = sizeof(*buf) + plen * 2;
- buf = kzalloc(len, GFP_KERNEL);
- if (!buf) {
- rc = -ENOMEM;
- goto out;
+ if (tcon->posix_extensions) {
+ rc = smb311_posix_get_inode_info(&new, full_path, NULL,
+ inode->i_sb, xid);
+ } else if (tcon->unix_ext) {
+ rc = cifs_get_inode_info_unix(&new, full_path,
+ inode->i_sb, xid);
+ } else {
+ rc = cifs_get_inode_info(&new, full_path, NULL,
+ inode->i_sb, xid, NULL);
}
-
- buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK);
- buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer));
- buf->SubstituteNameOffset = cpu_to_le16(plen);
- buf->SubstituteNameLength = cpu_to_le16(plen);
- memcpy(&buf->PathBuffer[plen], path, plen);
- buf->PrintNameOffset = 0;
- buf->PrintNameLength = cpu_to_le16(plen);
- memcpy(buf->PathBuffer, path, plen);
- buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0);
- if (*sym != sep)
- buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE);
-
- convert_delimiter(sym, '/');
- iov.iov_base = buf;
- iov.iov_len = len;
- new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
- tcon, full_path, &iov);
- if (!IS_ERR(new))
+ if (!rc)
d_instantiate(dentry, new);
- else
- rc = PTR_ERR(new);
-out:
- kfree(path);
- cifs_free_open_info(&data);
- kfree(buf);
return rc;
}
@@ -5291,8 +5062,8 @@ static int smb2_make_node(unsigned int xid, struct inode *inode,
rc = cifs_sfu_make_node(xid, inode, dentry, tcon,
full_path, mode, dev);
} else {
- rc = nfs_make_node(xid, inode, dentry, tcon,
- full_path, mode, dev);
+ rc = smb2_mknod_reparse(xid, inode, dentry, tcon,
+ full_path, mode, dev);
}
return rc;
}
@@ -5538,6 +5309,7 @@ struct smb_version_operations smb30_operations = {
.tree_connect = SMB2_tcon,
.tree_disconnect = SMB2_tdis,
.qfs_tcon = smb3_qfs_tcon,
+ .query_server_interfaces = SMB3_request_interfaces,
.is_path_accessible = smb2_is_path_accessible,
.can_echo = smb2_can_echo,
.echo = SMB2_echo,
@@ -5653,6 +5425,7 @@ struct smb_version_operations smb311_operations = {
.tree_connect = SMB2_tcon,
.tree_disconnect = SMB2_tdis,
.qfs_tcon = smb3_qfs_tcon,
+ .query_server_interfaces = SMB3_request_interfaces,
.is_path_accessible = smb2_is_path_accessible,
.can_echo = smb2_can_echo,
.echo = SMB2_echo,
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 608ee05491e2..86c647a947cc 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -367,6 +367,17 @@ again:
}
rc = cifs_setup_session(0, ses, server, nls_codepage);
+ if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) {
+ /*
+ * Try the alternate password on the next reconnect (e.g. key
+ * rotation could be enabled on the server) if an alternate
+ * password is available and the current password is expired,
+ * but do not swap on non-password-related errors like host down
+ */
+ if (ses->password2)
+ swap(ses->password2, ses->password);
+ }
+
if ((rc == -EACCES) && !tcon->retry) {
mutex_unlock(&ses->session_mutex);
rc = -EHOSTDOWN;
@@ -409,14 +420,15 @@ skip_sess_setup:
spin_unlock(&ses->ses_lock);
if (!rc &&
- (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
+ (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL) &&
+ server->ops->query_server_interfaces) {
mutex_unlock(&ses->session_mutex);
/*
* query server network interfaces, in case they change
*/
xid = get_xid();
- rc = SMB3_request_interfaces(xid, tcon, false);
+ rc = server->ops->query_server_interfaces(xid, tcon, false);
free_xid(xid);
if (rc == -EOPNOTSUPP && ses->chan_count > 1) {
@@ -731,7 +743,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
pneg_ctxt += sizeof(struct smb2_posix_neg_context);
neg_context_count++;
- if (server->compress_algorithm) {
+ if (server->compression.requested) {
build_compression_ctxt((struct smb2_compression_capabilities_context *)
pneg_ctxt);
ctxt_len = ALIGN(sizeof(struct smb2_compression_capabilities_context), 8);
@@ -779,6 +791,9 @@ static void decode_compress_ctx(struct TCP_Server_Info *server,
struct smb2_compression_capabilities_context *ctxt)
{
unsigned int len = le16_to_cpu(ctxt->DataLength);
+ __le16 alg;
+
+ server->compression.enabled = false;
/*
* Caller checked that DataLength remains within SMB boundary. We still
@@ -789,15 +804,22 @@ static void decode_compress_ctx(struct TCP_Server_Info *server,
pr_warn_once("server sent bad compression cntxt\n");
return;
}
+
if (le16_to_cpu(ctxt->CompressionAlgorithmCount) != 1) {
- pr_warn_once("Invalid SMB3 compress algorithm count\n");
+ pr_warn_once("invalid SMB3 compress algorithm count\n");
return;
}
- if (le16_to_cpu(ctxt->CompressionAlgorithms[0]) > 3) {
- pr_warn_once("unknown compression algorithm\n");
+
+ alg = ctxt->CompressionAlgorithms[0];
+
+ /* 'NONE' (0) compressor type is never negotiated */
+ if (alg == 0 || le16_to_cpu(alg) > 3) {
+ pr_warn_once("invalid compression algorithm '%u'\n", alg);
return;
}
- server->compress_algorithm = ctxt->CompressionAlgorithms[0];
+
+ server->compression.alg = alg;
+ server->compression.enabled = true;
}
static int decode_encrypt_ctx(struct TCP_Server_Info *server,
@@ -1536,6 +1558,11 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data)
&sess_data->buf0_type,
CIFS_LOG_ERROR | CIFS_SESS_OP, &rsp_iov);
cifs_small_buf_release(sess_data->iov[0].iov_base);
+ if (rc == 0)
+ sess_data->ses->expired_pwd = false;
+ else if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED))
+ sess_data->ses->expired_pwd = true;
+
memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec));
return rc;
@@ -2715,6 +2742,17 @@ add_query_id_context(struct kvec *iov, unsigned int *num_iovec)
return 0;
}
+static void add_ea_context(struct cifs_open_parms *oparms,
+ struct kvec *rq_iov, unsigned int *num_iovs)
+{
+ struct kvec *iov = oparms->ea_cctx;
+
+ if (iov && iov->iov_base && iov->iov_len) {
+ rq_iov[(*num_iovs)++] = *iov;
+ memset(iov, 0, sizeof(*iov));
+ }
+}
+
static int
alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
const char *treename, const __le16 *path)
@@ -3081,6 +3119,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
}
add_query_id_context(iov, &n_iov);
+ add_ea_context(oparms, iov, &n_iov);
if (n_iov > 2) {
/*
@@ -3600,9 +3639,9 @@ replay_again:
memcpy(&pbuf->network_open_info,
&rsp->network_open_info,
sizeof(pbuf->network_open_info));
+ atomic_dec(&tcon->num_remote_opens);
}
- atomic_dec(&tcon->num_remote_opens);
close_exit:
SMB2_close_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h
index db08194484e0..c72a3b2886b7 100644
--- a/fs/smb/client/smb2pdu.h
+++ b/fs/smb/client/smb2pdu.h
@@ -117,9 +117,10 @@ struct share_redirect_error_context_rsp {
* [4] : posix context
* [5] : time warp context
* [6] : query id context
- * [7] : compound padding
+ * [7] : create ea context
+ * [8] : compound padding
*/
-#define SMB2_CREATE_IOV_SIZE 8
+#define SMB2_CREATE_IOV_SIZE 9
/*
* Maximum size of a SMB2_CREATE response is 64 (smb2 header) +
@@ -413,4 +414,35 @@ struct smb2_posix_info_parsed {
const u8 *name;
};
+struct smb2_create_ea_ctx {
+ struct create_context ctx;
+ __u8 name[8];
+ struct smb2_file_full_ea_info ea;
+} __packed;
+
+#define SMB2_WSL_XATTR_UID "$LXUID"
+#define SMB2_WSL_XATTR_GID "$LXGID"
+#define SMB2_WSL_XATTR_MODE "$LXMOD"
+#define SMB2_WSL_XATTR_DEV "$LXDEV"
+#define SMB2_WSL_XATTR_NAME_LEN 6
+#define SMB2_WSL_NUM_XATTRS 4
+
+#define SMB2_WSL_XATTR_UID_SIZE 4
+#define SMB2_WSL_XATTR_GID_SIZE 4
+#define SMB2_WSL_XATTR_MODE_SIZE 4
+#define SMB2_WSL_XATTR_DEV_SIZE 8
+
+#define SMB2_WSL_MIN_QUERY_EA_RESP_SIZE \
+ (ALIGN((SMB2_WSL_NUM_XATTRS - 1) * \
+ (SMB2_WSL_XATTR_NAME_LEN + 1 + \
+ sizeof(struct smb2_file_full_ea_info)), 4) + \
+ SMB2_WSL_XATTR_NAME_LEN + 1 + sizeof(struct smb2_file_full_ea_info))
+
+#define SMB2_WSL_MAX_QUERY_EA_RESP_SIZE \
+ (ALIGN(SMB2_WSL_MIN_QUERY_EA_RESP_SIZE + \
+ SMB2_WSL_XATTR_UID_SIZE + \
+ SMB2_WSL_XATTR_GID_SIZE + \
+ SMB2_WSL_XATTR_MODE_SIZE + \
+ SMB2_WSL_XATTR_DEV_SIZE, 4))
+
#endif /* _SMB2PDU_H */
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index b3069911e9dd..732169d8a67a 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -61,7 +61,8 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
const unsigned int xid,
struct cifs_tcon *tcon,
const char *full_path,
- struct kvec *iov);
+ struct kvec *reparse_iov,
+ struct kvec *xattr_iov);
int smb2_query_reparse_point(const unsigned int xid,
struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
@@ -75,7 +76,8 @@ int smb2_query_path_info(const unsigned int xid,
struct cifs_open_info_data *data);
extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
const char *full_path, __u64 size,
- struct cifs_sb_info *cifs_sb, bool set_alloc);
+ struct cifs_sb_info *cifs_sb, bool set_alloc,
+ struct dentry *dentry);
extern int smb2_set_file_info(struct inode *inode, const char *full_path,
FILE_BASIC_INFO *buf, const unsigned int xid);
extern int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
@@ -91,7 +93,8 @@ extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path,
extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
const char *name, struct cifs_sb_info *cifs_sb);
extern int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon,
- const char *name, struct cifs_sb_info *cifs_sb);
+ const char *name, struct cifs_sb_info *cifs_sb,
+ struct dentry *dentry);
int smb2_rename_path(const unsigned int xid,
struct cifs_tcon *tcon,
struct dentry *source_dentry,
@@ -308,5 +311,11 @@ int smb311_posix_query_path_info(const unsigned int xid,
int posix_info_parse(const void *beg, const void *end,
struct smb2_posix_info_parsed *out);
int posix_info_sid_size(const void *beg, const void *end);
+int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, const char *symname);
+int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev);
#endif /* _SMB2PROTO_H */
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 5a3ca62d2f07..1d6e54f7879e 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -659,7 +659,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
}
spin_unlock(&server->srv_lock);
if (!is_binding && !server->session_estab) {
- strncpy(shdr->Signature, "BSRSPYL", 8);
+ strscpy(shdr->Signature, "BSRSPYL");
return 0;
}
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 522fa387fcfd..5e83cb9da902 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -375,6 +375,7 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mknod_enter);
DECLARE_EVENT_CLASS(smb3_inf_compound_done_class,
TP_PROTO(unsigned int xid,
@@ -411,10 +412,11 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done);
-
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mknod_done);
DECLARE_EVENT_CLASS(smb3_inf_compound_err_class,
TP_PROTO(unsigned int xid,
@@ -456,9 +458,11 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mknod_err);
/*
* For logging SMB3 Status code and Command for responses which return errors
@@ -1030,6 +1034,38 @@ DEFINE_EVENT(smb3_ses_class, smb3_##name, \
DEFINE_SMB3_SES_EVENT(ses_not_found);
+DECLARE_EVENT_CLASS(smb3_ioctl_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ unsigned int command),
+ TP_ARGS(xid, fid, command),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(unsigned int, command)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->command = command;
+ ),
+ TP_printk("xid=%u fid=0x%llx ioctl cmd=0x%x",
+ __entry->xid, __entry->fid, __entry->command)
+)
+
+#define DEFINE_SMB3_IOCTL_EVENT(name) \
+DEFINE_EVENT(smb3_ioctl_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ unsigned int command), \
+ TP_ARGS(xid, fid, command))
+
+DEFINE_SMB3_IOCTL_EVENT(ioctl);
+
+
+
+
+
DECLARE_EVENT_CLASS(smb3_credit_class,
TP_PROTO(__u64 currmid,
__u64 conn_id,
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index 57f2343164a3..1b594307c9d5 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -208,38 +208,45 @@ struct smb2_transform_hdr {
__le64 SessionId;
} __packed;
+/*
+ * These are simplified versions from the spec, as we don't need a fully fledged
+ * form of both unchained and chained structs.
+ *
+ * Moreover, even in chained compressed payloads, the initial compression header
+ * has the form of the unchained one -- i.e. it never has the
+ * OriginalPayloadSize field and the ::Offset field always represents an offset
+ * (instead of a length, as it is in the chained header).
+ *
+ * See MS-SMB2 2.2.42 for more details.
+ */
+#define SMB2_COMPRESSION_FLAG_NONE 0x0000
+#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001
-/* See MS-SMB2 2.2.42 */
-struct smb2_compression_transform_hdr_unchained {
- __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
+struct smb2_compression_hdr {
+ __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
__le32 OriginalCompressedSegmentSize;
__le16 CompressionAlgorithm;
__le16 Flags;
- __le16 Length; /* if chained it is length, else offset */
+ __le32 Offset; /* this is the size of the uncompressed SMB2 header below */
+ /* uncompressed SMB2 header (READ or WRITE) goes here */
+ /* compressed data goes here */
} __packed;
-/* See MS-SMB2 2.2.42.1 */
-#define SMB2_COMPRESSION_FLAG_NONE 0x0000
-#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001
-
-struct compression_payload_header {
+/*
+ * ... OTOH, set compression payload header to always have OriginalPayloadSize
+ * as it's easier to pass the struct size minus sizeof(OriginalPayloadSize)
+ * than to juggle around the header/data memory.
+ */
+struct smb2_compression_payload_hdr {
__le16 CompressionAlgorithm;
__le16 Flags;
__le32 Length; /* length of compressed playload including field below if present */
- /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */
+ __le32 OriginalPayloadSize; /* accounted when LZNT1, LZ77, LZ77+Huffman */
} __packed;
-/* See MS-SMB2 2.2.42.2 */
-struct smb2_compression_transform_hdr_chained {
- __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
- __le32 OriginalCompressedSegmentSize;
- /* struct compression_payload_header[] */
-} __packed;
-
-/* See MS-SMB2 2.2.42.2.2 */
-struct compression_pattern_payload_v1 {
- __le16 Pattern;
- __le16 Reserved1;
+struct smb2_compression_pattern_v1 {
+ __u8 Pattern;
+ __u8 Reserved1;
__le16 Reserved2;
__le32 Repetitions;
} __packed;
@@ -273,15 +280,16 @@ struct smb3_blob_data {
#define SE_GROUP_RESOURCE 0x20000000
#define SE_GROUP_LOGON_ID 0xC0000000
-/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */
-
struct sid_array_data {
__le16 SidAttrCount;
/* SidAttrList - array of sid_attr_data structs */
} __packed;
-struct luid_attr_data {
-
+/* struct sid_attr_data is a SidData array in BlobData format, followed by a le32 Attr */
+struct sid_attr_data {
+ __le16 BlobSize;
+ __u8 BlobData[];
+ /* __le32 Attr */
} __packed;
/*
@@ -495,6 +503,7 @@ struct smb2_encryption_neg_context {
#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003)
/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */
#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */
+#define SMB3_COMPRESS_LZ4 cpu_to_le16(0x0005)
/* Compression Flags */
#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000)
diff --git a/fs/smb/common/smbfsctl.h b/fs/smb/common/smbfsctl.h
index edd7fc2a7921..a94d658b88e8 100644
--- a/fs/smb/common/smbfsctl.h
+++ b/fs/smb/common/smbfsctl.h
@@ -158,12 +158,6 @@
#define IO_REPARSE_TAG_LX_CHR 0x80000025
#define IO_REPARSE_TAG_LX_BLK 0x80000026
-#define IO_REPARSE_TAG_LX_SYMLINK_LE cpu_to_le32(0xA000001D)
-#define IO_REPARSE_TAG_AF_UNIX_LE cpu_to_le32(0x80000023)
-#define IO_REPARSE_TAG_LX_FIFO_LE cpu_to_le32(0x80000024)
-#define IO_REPARSE_TAG_LX_CHR_LE cpu_to_le32(0x80000025)
-#define IO_REPARSE_TAG_LX_BLK_LE cpu_to_le32(0x80000026)
-
/* fsctl flags */
/* If Flags is set to this value, the request is an FSCTL not ioctl request */
#define SMB2_0_IOCTL_IS_FSCTL 0x00000001
diff --git a/fs/smb/server/glob.h b/fs/smb/server/glob.h
index 5b8f3e0ebdb3..d528b20b37a8 100644
--- a/fs/smb/server/glob.h
+++ b/fs/smb/server/glob.h
@@ -12,8 +12,6 @@
#include "unicode.h"
#include "vfs_cache.h"
-#define KSMBD_VERSION "3.4.2"
-
extern int ksmbd_debug_types;
#define KSMBD_DEBUG_SMB BIT(0)
diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h
index 0ebf91ffa236..686b321c5a8b 100644
--- a/fs/smb/server/ksmbd_netlink.h
+++ b/fs/smb/server/ksmbd_netlink.h
@@ -75,6 +75,7 @@ struct ksmbd_heartbeat {
#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION BIT(1)
#define KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL BIT(2)
#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF BIT(3)
+#define KSMBD_GLOBAL_FLAG_DURABLE_HANDLE BIT(4)
/*
* IPC request for ksmbd server startup
@@ -166,7 +167,8 @@ struct ksmbd_share_config_response {
__u16 force_uid;
__u16 force_gid;
__s8 share_name[KSMBD_REQ_MAX_SHARE_NAME];
- __u32 reserved[112]; /* Reserved room */
+ __u32 reserved[111]; /* Reserved room */
+ __u32 payload_sz;
__u32 veto_list_sz;
__s8 ____payload[];
};
diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c
index 328a412259dc..a2f0a2edceb8 100644
--- a/fs/smb/server/mgmt/share_config.c
+++ b/fs/smb/server/mgmt/share_config.c
@@ -158,7 +158,12 @@ static struct ksmbd_share_config *share_config_request(struct unicode_map *um,
share->name = kstrdup(name, GFP_KERNEL);
if (!test_share_config_flag(share, KSMBD_SHARE_FLAG_PIPE)) {
- share->path = kstrdup(ksmbd_share_config_path(resp),
+ int path_len = PATH_MAX;
+
+ if (resp->payload_sz)
+ path_len = resp->payload_sz - resp->veto_list_sz;
+
+ share->path = kstrndup(ksmbd_share_config_path(resp), path_len,
GFP_KERNEL);
if (share->path)
share->path_sz = strlen(share->path);
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index 15f68ee05089..aec0a7a12405 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -156,7 +156,7 @@ void ksmbd_session_destroy(struct ksmbd_session *sess)
kfree(sess);
}
-static struct ksmbd_session *__session_lookup(unsigned long long id)
+struct ksmbd_session *__session_lookup(unsigned long long id)
{
struct ksmbd_session *sess;
@@ -305,6 +305,32 @@ struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn,
return sess;
}
+void destroy_previous_session(struct ksmbd_conn *conn,
+ struct ksmbd_user *user, u64 id)
+{
+ struct ksmbd_session *prev_sess;
+ struct ksmbd_user *prev_user;
+
+ down_write(&sessions_table_lock);
+ down_write(&conn->session_lock);
+ prev_sess = __session_lookup(id);
+ if (!prev_sess || prev_sess->state == SMB2_SESSION_EXPIRED)
+ goto out;
+
+ prev_user = prev_sess->user;
+ if (!prev_user ||
+ strcmp(user->name, prev_user->name) ||
+ user->passkey_sz != prev_user->passkey_sz ||
+ memcmp(user->passkey, prev_user->passkey, user->passkey_sz))
+ goto out;
+
+ ksmbd_destroy_file_table(&prev_sess->file_table);
+ prev_sess->state = SMB2_SESSION_EXPIRED;
+out:
+ up_write(&conn->session_lock);
+ up_write(&sessions_table_lock);
+}
+
static bool ksmbd_preauth_session_id_match(struct preauth_session *sess,
unsigned long long id)
{
diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h
index 63cb08fffde8..dc9fded2cd43 100644
--- a/fs/smb/server/mgmt/user_session.h
+++ b/fs/smb/server/mgmt/user_session.h
@@ -88,8 +88,11 @@ struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
int ksmbd_session_register(struct ksmbd_conn *conn,
struct ksmbd_session *sess);
void ksmbd_sessions_deregister(struct ksmbd_conn *conn);
+struct ksmbd_session *__session_lookup(unsigned long long id);
struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn,
unsigned long long id);
+void destroy_previous_session(struct ksmbd_conn *conn,
+ struct ksmbd_user *user, u64 id);
struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn,
u64 sess_id);
struct preauth_session *ksmbd_preauth_session_lookup(struct ksmbd_conn *conn,
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 53dfaac425c6..4978edfb15f9 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -159,7 +159,8 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci)
opinfo = list_first_or_null_rcu(&ci->m_op_list, struct oplock_info,
op_entry);
if (opinfo) {
- if (!atomic_inc_not_zero(&opinfo->refcount))
+ if (opinfo->conn == NULL ||
+ !atomic_inc_not_zero(&opinfo->refcount))
opinfo = NULL;
else {
atomic_inc(&opinfo->conn->r_count);
@@ -527,7 +528,7 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci,
*/
read_lock(&ci->m_lock);
list_for_each_entry(opinfo, &ci->m_op_list, op_entry) {
- if (!opinfo->is_lease)
+ if (!opinfo->is_lease || !opinfo->conn)
continue;
read_unlock(&ci->m_lock);
lease = opinfo->o_lease;
@@ -641,7 +642,7 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
struct smb2_hdr *rsp_hdr;
struct ksmbd_file *fp;
- fp = ksmbd_lookup_durable_fd(br_info->fid);
+ fp = ksmbd_lookup_global_fd(br_info->fid);
if (!fp)
goto out;
@@ -1106,7 +1107,7 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
read_lock(&p_ci->m_lock);
list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) {
- if (!opinfo->is_lease)
+ if (opinfo->conn == NULL || !opinfo->is_lease)
continue;
if (opinfo->o_lease->state != SMB2_OPLOCK_LEVEL_NONE &&
@@ -1142,7 +1143,7 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp)
opinfo = rcu_dereference(fp->f_opinfo);
rcu_read_unlock();
- if (!opinfo->is_lease || opinfo->o_lease->version != 2)
+ if (!opinfo || !opinfo->is_lease || opinfo->o_lease->version != 2)
return;
p_ci = ksmbd_inode_lookup_lock(fp->filp->f_path.dentry->d_parent);
@@ -1151,7 +1152,7 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp)
read_lock(&p_ci->m_lock);
list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) {
- if (!opinfo->is_lease)
+ if (opinfo->conn == NULL || !opinfo->is_lease)
continue;
if (opinfo->o_lease->state != SMB2_OPLOCK_LEVEL_NONE) {
@@ -1361,6 +1362,9 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
rcu_read_lock();
list_for_each_entry_rcu(brk_op, &ci->m_op_list, op_entry) {
+ if (brk_op->conn == NULL)
+ continue;
+
if (!atomic_inc_not_zero(&brk_op->refcount))
continue;
@@ -1496,11 +1500,10 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
/**
* parse_lease_state() - parse lease context containted in file open request
* @open_req: buffer containing smb2 file open(create) request
- * @is_dir: whether leasing file is directory
*
* Return: oplock state, -ENOENT if create lease context not found
*/
-struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir)
+struct lease_ctx_info *parse_lease_state(void *open_req)
{
struct create_context *cc;
struct smb2_create_req *req = (struct smb2_create_req *)open_req;
@@ -1518,12 +1521,7 @@ struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir)
struct create_lease_v2 *lc = (struct create_lease_v2 *)cc;
memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
- if (is_dir) {
- lreq->req_state = lc->lcontext.LeaseState &
- ~SMB2_LEASE_WRITE_CACHING_LE;
- lreq->is_dir = true;
- } else
- lreq->req_state = lc->lcontext.LeaseState;
+ lreq->req_state = lc->lcontext.LeaseState;
lreq->flags = lc->lcontext.LeaseFlags;
lreq->epoch = lc->lcontext.Epoch;
lreq->duration = lc->lcontext.LeaseDuration;
@@ -1646,6 +1644,8 @@ void create_durable_v2_rsp_buf(char *cc, struct ksmbd_file *fp)
buf->Name[3] = 'Q';
buf->Timeout = cpu_to_le32(fp->durable_timeout);
+ if (fp->is_persistent)
+ buf->Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
}
/**
@@ -1813,3 +1813,71 @@ out:
read_unlock(&lease_list_lock);
return ret_op;
}
+
+int smb2_check_durable_oplock(struct ksmbd_conn *conn,
+ struct ksmbd_share_config *share,
+ struct ksmbd_file *fp,
+ struct lease_ctx_info *lctx,
+ char *name)
+{
+ struct oplock_info *opinfo = opinfo_get(fp);
+ int ret = 0;
+
+ if (!opinfo)
+ return 0;
+
+ if (opinfo->is_lease == false) {
+ if (lctx) {
+ pr_err("create context include lease\n");
+ ret = -EBADF;
+ goto out;
+ }
+
+ if (opinfo->level != SMB2_OPLOCK_LEVEL_BATCH) {
+ pr_err("oplock level is not equal to SMB2_OPLOCK_LEVEL_BATCH\n");
+ ret = -EBADF;
+ }
+
+ goto out;
+ }
+
+ if (memcmp(conn->ClientGUID, fp->client_guid,
+ SMB2_CLIENT_GUID_SIZE)) {
+ ksmbd_debug(SMB, "Client guid of fp is not equal to the one of connection\n");
+ ret = -EBADF;
+ goto out;
+ }
+
+ if (!lctx) {
+ ksmbd_debug(SMB, "create context does not include lease\n");
+ ret = -EBADF;
+ goto out;
+ }
+
+ if (memcmp(opinfo->o_lease->lease_key, lctx->lease_key,
+ SMB2_LEASE_KEY_SIZE)) {
+ ksmbd_debug(SMB,
+ "lease key of fp does not match lease key in create context\n");
+ ret = -EBADF;
+ goto out;
+ }
+
+ if (!(opinfo->o_lease->state & SMB2_LEASE_HANDLE_CACHING_LE)) {
+ ksmbd_debug(SMB, "lease state does not contain SMB2_LEASE_HANDLE_CACHING\n");
+ ret = -EBADF;
+ goto out;
+ }
+
+ if (opinfo->o_lease->version != lctx->version) {
+ ksmbd_debug(SMB,
+ "lease version of fp does not match the one in create context\n");
+ ret = -EBADF;
+ goto out;
+ }
+
+ if (!ksmbd_inode_pending_delete(fp))
+ ret = ksmbd_validate_name_reconnect(share, fp, name);
+out:
+ opinfo_put(opinfo);
+ return ret;
+}
diff --git a/fs/smb/server/oplock.h b/fs/smb/server/oplock.h
index 5b93ea9196c0..e9da63f25b20 100644
--- a/fs/smb/server/oplock.h
+++ b/fs/smb/server/oplock.h
@@ -111,7 +111,7 @@ void opinfo_put(struct oplock_info *opinfo);
/* Lease related functions */
void create_lease_buf(u8 *rbuf, struct lease *lease);
-struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir);
+struct lease_ctx_info *parse_lease_state(void *open_req);
__u8 smb2_map_lease_to_oplock(__le32 lease_state);
int lease_read_to_write(struct oplock_info *opinfo);
@@ -130,4 +130,9 @@ void destroy_lease_table(struct ksmbd_conn *conn);
void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
struct lease_ctx_info *lctx);
void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp);
+int smb2_check_durable_oplock(struct ksmbd_conn *conn,
+ struct ksmbd_share_config *share,
+ struct ksmbd_file *fp,
+ struct lease_ctx_info *lctx,
+ char *name);
#endif /* __KSMBD_OPLOCK_H */
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index 3079e607c5fe..c0788188aa82 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -625,7 +625,6 @@ static void __exit ksmbd_server_exit(void)
}
MODULE_AUTHOR("Namjae Jeon <linkinjeon@kernel.org>");
-MODULE_VERSION(KSMBD_VERSION);
MODULE_DESCRIPTION("Linux kernel CIFS/SMB SERVER");
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: ecb");
diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c
index 03dded29a980..727cb49926ee 100644
--- a/fs/smb/server/smb2misc.c
+++ b/fs/smb/server/smb2misc.c
@@ -101,13 +101,17 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
*len = le16_to_cpu(((struct smb2_sess_setup_req *)hdr)->SecurityBufferLength);
break;
case SMB2_TREE_CONNECT:
- *off = le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathOffset);
+ *off = max_t(unsigned short int,
+ le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathOffset),
+ offsetof(struct smb2_tree_connect_req, Buffer));
*len = le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathLength);
break;
case SMB2_CREATE:
{
unsigned short int name_off =
- le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset);
+ max_t(unsigned short int,
+ le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset),
+ offsetof(struct smb2_create_req, Buffer));
unsigned short int name_len =
le16_to_cpu(((struct smb2_create_req *)hdr)->NameLength);
@@ -128,11 +132,15 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
break;
}
case SMB2_QUERY_INFO:
- *off = le16_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferOffset);
+ *off = max_t(unsigned int,
+ le16_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferOffset),
+ offsetof(struct smb2_query_info_req, Buffer));
*len = le32_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferLength);
break;
case SMB2_SET_INFO:
- *off = le16_to_cpu(((struct smb2_set_info_req *)hdr)->BufferOffset);
+ *off = max_t(unsigned int,
+ le16_to_cpu(((struct smb2_set_info_req *)hdr)->BufferOffset),
+ offsetof(struct smb2_set_info_req, Buffer));
*len = le32_to_cpu(((struct smb2_set_info_req *)hdr)->BufferLength);
break;
case SMB2_READ:
@@ -142,7 +150,7 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
case SMB2_WRITE:
if (((struct smb2_write_req *)hdr)->DataOffset ||
((struct smb2_write_req *)hdr)->Length) {
- *off = max_t(unsigned int,
+ *off = max_t(unsigned short int,
le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset),
offsetof(struct smb2_write_req, Buffer));
*len = le32_to_cpu(((struct smb2_write_req *)hdr)->Length);
@@ -153,7 +161,9 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
*len = le16_to_cpu(((struct smb2_write_req *)hdr)->WriteChannelInfoLength);
break;
case SMB2_QUERY_DIRECTORY:
- *off = le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameOffset);
+ *off = max_t(unsigned short int,
+ le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameOffset),
+ offsetof(struct smb2_query_directory_req, Buffer));
*len = le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameLength);
break;
case SMB2_LOCK:
@@ -168,7 +178,9 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
break;
}
case SMB2_IOCTL:
- *off = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset);
+ *off = max_t(unsigned int,
+ le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset),
+ offsetof(struct smb2_ioctl_req, Buffer));
*len = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputCount);
break;
default:
diff --git a/fs/smb/server/smb2ops.c b/fs/smb/server/smb2ops.c
index 27a9dce3e03a..606aa3c5189a 100644
--- a/fs/smb/server/smb2ops.c
+++ b/fs/smb/server/smb2ops.c
@@ -228,6 +228,11 @@ void init_smb3_0_server(struct ksmbd_conn *conn)
conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION ||
+ (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) &&
+ conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION))
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
+
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
}
@@ -256,6 +261,9 @@ void init_smb3_02_server(struct ksmbd_conn *conn)
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
+
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_DURABLE_HANDLE)
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_PERSISTENT_HANDLES;
}
/**
@@ -275,14 +283,12 @@ int init_smb3_11_server(struct ksmbd_conn *conn)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING |
SMB2_GLOBAL_CAP_DIRECTORY_LEASING;
- if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION ||
- (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) &&
- conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION))
- conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
-
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_DURABLE_HANDLE)
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_PERSISTENT_HANDLES;
+
INIT_LIST_HEAD(&conn->preauth_sess_table);
return 0;
}
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 0c97d3c86072..5723bbf372d7 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -607,30 +607,6 @@ int smb2_check_user_session(struct ksmbd_work *work)
return -ENOENT;
}
-static void destroy_previous_session(struct ksmbd_conn *conn,
- struct ksmbd_user *user, u64 id)
-{
- struct ksmbd_session *prev_sess = ksmbd_session_lookup_slowpath(id);
- struct ksmbd_user *prev_user;
- struct channel *chann;
- long index;
-
- if (!prev_sess)
- return;
-
- prev_user = prev_sess->user;
-
- if (!prev_user ||
- strcmp(user->name, prev_user->name) ||
- user->passkey_sz != prev_user->passkey_sz ||
- memcmp(user->passkey, prev_user->passkey, user->passkey_sz))
- return;
-
- prev_sess->state = SMB2_SESSION_EXPIRED;
- xa_for_each(&prev_sess->ksmbd_chann_list, index, chann)
- ksmbd_conn_set_exiting(chann->conn);
-}
-
/**
* smb2_get_name() - get filename string from on the wire smb format
* @src: source buffer
@@ -1951,7 +1927,7 @@ int smb2_tree_connect(struct ksmbd_work *work)
WORK_BUFFERS(work, req, rsp);
- treename = smb_strndup_from_utf16(req->Buffer,
+ treename = smb_strndup_from_utf16((char *)req + le16_to_cpu(req->PathOffset),
le16_to_cpu(req->PathLength), true,
conn->local_nls);
if (IS_ERR(treename)) {
@@ -2642,6 +2618,165 @@ static void ksmbd_acls_fattr(struct smb_fattr *fattr,
}
}
+enum {
+ DURABLE_RECONN_V2 = 1,
+ DURABLE_RECONN,
+ DURABLE_REQ_V2,
+ DURABLE_REQ,
+};
+
+struct durable_info {
+ struct ksmbd_file *fp;
+ unsigned short int type;
+ bool persistent;
+ bool reconnected;
+ unsigned int timeout;
+ char *CreateGuid;
+};
+
+static int parse_durable_handle_context(struct ksmbd_work *work,
+ struct smb2_create_req *req,
+ struct lease_ctx_info *lc,
+ struct durable_info *dh_info)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct create_context *context;
+ int dh_idx, err = 0;
+ u64 persistent_id = 0;
+ int req_op_level;
+ static const char * const durable_arr[] = {"DH2C", "DHnC", "DH2Q", "DHnQ"};
+
+ req_op_level = req->RequestedOplockLevel;
+ for (dh_idx = DURABLE_RECONN_V2; dh_idx <= ARRAY_SIZE(durable_arr);
+ dh_idx++) {
+ context = smb2_find_context_vals(req, durable_arr[dh_idx - 1], 4);
+ if (IS_ERR(context)) {
+ err = PTR_ERR(context);
+ goto out;
+ }
+ if (!context)
+ continue;
+
+ switch (dh_idx) {
+ case DURABLE_RECONN_V2:
+ {
+ struct create_durable_reconn_v2_req *recon_v2;
+
+ if (dh_info->type == DURABLE_RECONN ||
+ dh_info->type == DURABLE_REQ_V2) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ recon_v2 = (struct create_durable_reconn_v2_req *)context;
+ persistent_id = recon_v2->Fid.PersistentFileId;
+ dh_info->fp = ksmbd_lookup_durable_fd(persistent_id);
+ if (!dh_info->fp) {
+ ksmbd_debug(SMB, "Failed to get durable handle state\n");
+ err = -EBADF;
+ goto out;
+ }
+
+ if (memcmp(dh_info->fp->create_guid, recon_v2->CreateGuid,
+ SMB2_CREATE_GUID_SIZE)) {
+ err = -EBADF;
+ ksmbd_put_durable_fd(dh_info->fp);
+ goto out;
+ }
+
+ dh_info->type = dh_idx;
+ dh_info->reconnected = true;
+ ksmbd_debug(SMB,
+ "reconnect v2 Persistent-id from reconnect = %llu\n",
+ persistent_id);
+ break;
+ }
+ case DURABLE_RECONN:
+ {
+ struct create_durable_reconn_req *recon;
+
+ if (dh_info->type == DURABLE_RECONN_V2 ||
+ dh_info->type == DURABLE_REQ_V2) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ recon = (struct create_durable_reconn_req *)context;
+ persistent_id = recon->Data.Fid.PersistentFileId;
+ dh_info->fp = ksmbd_lookup_durable_fd(persistent_id);
+ if (!dh_info->fp) {
+ ksmbd_debug(SMB, "Failed to get durable handle state\n");
+ err = -EBADF;
+ goto out;
+ }
+
+ dh_info->type = dh_idx;
+ dh_info->reconnected = true;
+ ksmbd_debug(SMB, "reconnect Persistent-id from reconnect = %llu\n",
+ persistent_id);
+ break;
+ }
+ case DURABLE_REQ_V2:
+ {
+ struct create_durable_req_v2 *durable_v2_blob;
+
+ if (dh_info->type == DURABLE_RECONN ||
+ dh_info->type == DURABLE_RECONN_V2) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ durable_v2_blob =
+ (struct create_durable_req_v2 *)context;
+ ksmbd_debug(SMB, "Request for durable v2 open\n");
+ dh_info->fp = ksmbd_lookup_fd_cguid(durable_v2_blob->CreateGuid);
+ if (dh_info->fp) {
+ if (!memcmp(conn->ClientGUID, dh_info->fp->client_guid,
+ SMB2_CLIENT_GUID_SIZE)) {
+ if (!(req->hdr.Flags & SMB2_FLAGS_REPLAY_OPERATION)) {
+ err = -ENOEXEC;
+ goto out;
+ }
+
+ dh_info->fp->conn = conn;
+ dh_info->reconnected = true;
+ goto out;
+ }
+ }
+
+ if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) ||
+ req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) {
+ dh_info->CreateGuid =
+ durable_v2_blob->CreateGuid;
+ dh_info->persistent =
+ le32_to_cpu(durable_v2_blob->Flags);
+ dh_info->timeout =
+ le32_to_cpu(durable_v2_blob->Timeout);
+ dh_info->type = dh_idx;
+ }
+ break;
+ }
+ case DURABLE_REQ:
+ if (dh_info->type == DURABLE_RECONN)
+ goto out;
+ if (dh_info->type == DURABLE_RECONN_V2 ||
+ dh_info->type == DURABLE_REQ_V2) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) ||
+ req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) {
+ ksmbd_debug(SMB, "Request for durable open\n");
+ dh_info->type = dh_idx;
+ }
+ }
+ }
+
+out:
+ return err;
+}
+
/**
* smb2_open() - handler for smb file open request
* @work: smb work containing request buffer
@@ -2665,6 +2800,7 @@ int smb2_open(struct ksmbd_work *work)
struct lease_ctx_info *lc = NULL;
struct create_ea_buf_req *ea_buf = NULL;
struct oplock_info *opinfo;
+ struct durable_info dh_info = {0};
__le32 *next_ptr = NULL;
int req_op_level = 0, open_flags = 0, may_flags = 0, file_info = 0;
int rc = 0;
@@ -2704,7 +2840,7 @@ int smb2_open(struct ksmbd_work *work)
goto err_out2;
}
- name = smb2_get_name(req->Buffer,
+ name = smb2_get_name((char *)req + le16_to_cpu(req->NameOffset),
le16_to_cpu(req->NameLength),
work->conn->local_nls);
if (IS_ERR(name)) {
@@ -2745,6 +2881,49 @@ int smb2_open(struct ksmbd_work *work)
}
}
+ req_op_level = req->RequestedOplockLevel;
+
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_DURABLE_HANDLE &&
+ req->CreateContextsOffset) {
+ lc = parse_lease_state(req);
+ rc = parse_durable_handle_context(work, req, lc, &dh_info);
+ if (rc) {
+ ksmbd_debug(SMB, "error parsing durable handle context\n");
+ goto err_out2;
+ }
+
+ if (dh_info.reconnected == true) {
+ rc = smb2_check_durable_oplock(conn, share, dh_info.fp, lc, name);
+ if (rc) {
+ ksmbd_put_durable_fd(dh_info.fp);
+ goto err_out2;
+ }
+
+ rc = ksmbd_reopen_durable_fd(work, dh_info.fp);
+ if (rc) {
+ ksmbd_put_durable_fd(dh_info.fp);
+ goto err_out2;
+ }
+
+ if (ksmbd_override_fsids(work)) {
+ rc = -ENOMEM;
+ ksmbd_put_durable_fd(dh_info.fp);
+ goto err_out2;
+ }
+
+ fp = dh_info.fp;
+ file_info = FILE_OPENED;
+
+ rc = ksmbd_vfs_getattr(&fp->filp->f_path, &stat);
+ if (rc)
+ goto err_out2;
+
+ ksmbd_put_durable_fd(fp);
+ goto reconnected_fp;
+ }
+ } else if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE)
+ lc = parse_lease_state(req);
+
if (le32_to_cpu(req->ImpersonationLevel) > le32_to_cpu(IL_DELEGATE)) {
pr_err("Invalid impersonationlevel : 0x%x\n",
le32_to_cpu(req->ImpersonationLevel));
@@ -3207,10 +3386,6 @@ int smb2_open(struct ksmbd_work *work)
need_truncate = 1;
}
- req_op_level = req->RequestedOplockLevel;
- if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE)
- lc = parse_lease_state(req, S_ISDIR(file_inode(filp)->i_mode));
-
share_ret = ksmbd_smb_check_shared_mode(fp->filp, fp);
if (!test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_OPLOCKS) ||
(req_op_level == SMB2_OPLOCK_LEVEL_LEASE &&
@@ -3221,6 +3396,11 @@ int smb2_open(struct ksmbd_work *work)
}
} else {
if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) {
+ if (S_ISDIR(file_inode(filp)->i_mode)) {
+ lc->req_state &= ~SMB2_LEASE_WRITE_CACHING_LE;
+ lc->is_dir = true;
+ }
+
/*
* Compare parent lease using parent key. If there is no
* a lease that has same parent key, Send lease break
@@ -3317,6 +3497,24 @@ int smb2_open(struct ksmbd_work *work)
memcpy(fp->client_guid, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE);
+ if (dh_info.type == DURABLE_REQ_V2 || dh_info.type == DURABLE_REQ) {
+ if (dh_info.type == DURABLE_REQ_V2 && dh_info.persistent)
+ fp->is_persistent = true;
+ else
+ fp->is_durable = true;
+
+ if (dh_info.type == DURABLE_REQ_V2) {
+ memcpy(fp->create_guid, dh_info.CreateGuid,
+ SMB2_CREATE_GUID_SIZE);
+ if (dh_info.timeout)
+ fp->durable_timeout = min(dh_info.timeout,
+ 300000);
+ else
+ fp->durable_timeout = 60;
+ }
+ }
+
+reconnected_fp:
rsp->StructureSize = cpu_to_le16(89);
rcu_read_lock();
opinfo = rcu_dereference(fp->f_opinfo);
@@ -3403,6 +3601,33 @@ int smb2_open(struct ksmbd_work *work)
next_off = conn->vals->create_disk_id_size;
}
+ if (dh_info.type == DURABLE_REQ || dh_info.type == DURABLE_REQ_V2) {
+ struct create_context *durable_ccontext;
+
+ durable_ccontext = (struct create_context *)(rsp->Buffer +
+ le32_to_cpu(rsp->CreateContextsLength));
+ contxt_cnt++;
+ if (dh_info.type == DURABLE_REQ) {
+ create_durable_rsp_buf(rsp->Buffer +
+ le32_to_cpu(rsp->CreateContextsLength));
+ le32_add_cpu(&rsp->CreateContextsLength,
+ conn->vals->create_durable_size);
+ iov_len += conn->vals->create_durable_size;
+ } else {
+ create_durable_v2_rsp_buf(rsp->Buffer +
+ le32_to_cpu(rsp->CreateContextsLength),
+ fp);
+ le32_add_cpu(&rsp->CreateContextsLength,
+ conn->vals->create_durable_v2_size);
+ iov_len += conn->vals->create_durable_v2_size;
+ }
+
+ if (next_ptr)
+ *next_ptr = cpu_to_le32(next_off);
+ next_ptr = &durable_ccontext->Next;
+ next_off = conn->vals->create_durable_size;
+ }
+
if (posix_ctxt) {
contxt_cnt++;
create_posix_rsp_buf(rsp->Buffer +
@@ -3828,11 +4053,16 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv)
}
ksmbd_kstat.kstat = &kstat;
- if (priv->info_level != FILE_NAMES_INFORMATION)
- ksmbd_vfs_fill_dentry_attrs(priv->work,
- idmap,
- dent,
- &ksmbd_kstat);
+ if (priv->info_level != FILE_NAMES_INFORMATION) {
+ rc = ksmbd_vfs_fill_dentry_attrs(priv->work,
+ idmap,
+ dent,
+ &ksmbd_kstat);
+ if (rc) {
+ dput(dent);
+ continue;
+ }
+ }
rc = smb2_populate_readdir_entry(priv->work->conn,
priv->info_level,
@@ -4075,7 +4305,7 @@ int smb2_query_dir(struct ksmbd_work *work)
}
srch_flag = req->Flags;
- srch_ptr = smb_strndup_from_utf16(req->Buffer,
+ srch_ptr = smb_strndup_from_utf16((char *)req + le16_to_cpu(req->FileNameOffset),
le16_to_cpu(req->FileNameLength), 1,
conn->local_nls);
if (IS_ERR(srch_ptr)) {
@@ -4335,7 +4565,8 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp,
sizeof(struct smb2_ea_info_req))
return -EINVAL;
- ea_req = (struct smb2_ea_info_req *)req->Buffer;
+ ea_req = (struct smb2_ea_info_req *)((char *)req +
+ le16_to_cpu(req->InputBufferOffset));
} else {
/* need to send all EAs, if no specific EA is requested*/
if (le32_to_cpu(req->Flags) & SL_RETURN_SINGLE_ENTRY)
@@ -4480,6 +4711,7 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
struct smb2_file_basic_info *basic_info;
struct kstat stat;
u64 time;
+ int ret;
if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) {
pr_err("no right to read the attributes : 0x%x\n",
@@ -4487,9 +4719,12 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
return -EACCES;
}
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
+
basic_info = (struct smb2_file_basic_info *)rsp->Buffer;
- generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
- file_inode(fp->filp), &stat);
basic_info->CreationTime = cpu_to_le64(fp->create_time);
time = ksmbd_UnixTimeToNT(stat.atime);
basic_info->LastAccessTime = cpu_to_le64(time);
@@ -4504,27 +4739,31 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
return 0;
}
-static void get_file_standard_info(struct smb2_query_info_rsp *rsp,
- struct ksmbd_file *fp, void *rsp_org)
+static int get_file_standard_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
{
struct smb2_file_standard_info *sinfo;
unsigned int delete_pending;
- struct inode *inode;
struct kstat stat;
+ int ret;
- inode = file_inode(fp->filp);
- generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
sinfo = (struct smb2_file_standard_info *)rsp->Buffer;
delete_pending = ksmbd_inode_pending_delete(fp);
- sinfo->AllocationSize = cpu_to_le64(inode->i_blocks << 9);
+ sinfo->AllocationSize = cpu_to_le64(stat.blocks << 9);
sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
sinfo->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending);
sinfo->DeletePending = delete_pending;
sinfo->Directory = S_ISDIR(stat.mode) ? 1 : 0;
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_standard_info));
+
+ return 0;
}
static void get_file_alignment_info(struct smb2_query_info_rsp *rsp,
@@ -4546,11 +4785,11 @@ static int get_file_all_info(struct ksmbd_work *work,
struct ksmbd_conn *conn = work->conn;
struct smb2_file_all_info *file_info;
unsigned int delete_pending;
- struct inode *inode;
struct kstat stat;
int conv_len;
char *filename;
u64 time;
+ int ret;
if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) {
ksmbd_debug(SMB, "no right to read the attributes : 0x%x\n",
@@ -4562,8 +4801,10 @@ static int get_file_all_info(struct ksmbd_work *work,
if (IS_ERR(filename))
return PTR_ERR(filename);
- inode = file_inode(fp->filp);
- generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
ksmbd_debug(SMB, "filename = %s\n", filename);
delete_pending = ksmbd_inode_pending_delete(fp);
@@ -4579,7 +4820,7 @@ static int get_file_all_info(struct ksmbd_work *work,
file_info->Attributes = fp->f_ci->m_fattr;
file_info->Pad1 = 0;
file_info->AllocationSize =
- cpu_to_le64(inode->i_blocks << 9);
+ cpu_to_le64(stat.blocks << 9);
file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
file_info->NumberOfLinks =
cpu_to_le32(get_nlink(&stat) - delete_pending);
@@ -4623,10 +4864,10 @@ static void get_file_alternate_info(struct ksmbd_work *work,
cpu_to_le32(sizeof(struct smb2_file_alt_name_info) + conv_len);
}
-static void get_file_stream_info(struct ksmbd_work *work,
- struct smb2_query_info_rsp *rsp,
- struct ksmbd_file *fp,
- void *rsp_org)
+static int get_file_stream_info(struct ksmbd_work *work,
+ struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp,
+ void *rsp_org)
{
struct ksmbd_conn *conn = work->conn;
struct smb2_file_stream_info *file_info;
@@ -4637,9 +4878,13 @@ static void get_file_stream_info(struct ksmbd_work *work,
int nbytes = 0, streamlen, stream_name_len, next, idx = 0;
int buf_free_len;
struct smb2_query_info_req *req = ksmbd_req_buf_next(work);
+ int ret;
+
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
- generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
- file_inode(fp->filp), &stat);
file_info = (struct smb2_file_stream_info *)rsp->Buffer;
buf_free_len =
@@ -4720,29 +4965,37 @@ out:
kvfree(xattr_list);
rsp->OutputBufferLength = cpu_to_le32(nbytes);
+
+ return 0;
}
-static void get_file_internal_info(struct smb2_query_info_rsp *rsp,
- struct ksmbd_file *fp, void *rsp_org)
+static int get_file_internal_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
{
struct smb2_file_internal_info *file_info;
struct kstat stat;
+ int ret;
+
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
- generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
- file_inode(fp->filp), &stat);
file_info = (struct smb2_file_internal_info *)rsp->Buffer;
file_info->IndexNumber = cpu_to_le64(stat.ino);
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_internal_info));
+
+ return 0;
}
static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
struct ksmbd_file *fp, void *rsp_org)
{
struct smb2_file_ntwrk_info *file_info;
- struct inode *inode;
struct kstat stat;
u64 time;
+ int ret;
if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) {
pr_err("no right to read the attributes : 0x%x\n",
@@ -4750,10 +5003,12 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
return -EACCES;
}
- file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer;
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
- inode = file_inode(fp->filp);
- generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
+ file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer;
file_info->CreationTime = cpu_to_le64(fp->create_time);
time = ksmbd_UnixTimeToNT(stat.atime);
@@ -4763,8 +5018,7 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
time = ksmbd_UnixTimeToNT(stat.ctime);
file_info->ChangeTime = cpu_to_le64(time);
file_info->Attributes = fp->f_ci->m_fattr;
- file_info->AllocationSize =
- cpu_to_le64(inode->i_blocks << 9);
+ file_info->AllocationSize = cpu_to_le64(stat.blocks << 9);
file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
file_info->Reserved = cpu_to_le32(0);
rsp->OutputBufferLength =
@@ -4804,14 +5058,17 @@ static void get_file_mode_info(struct smb2_query_info_rsp *rsp,
cpu_to_le32(sizeof(struct smb2_file_mode_info));
}
-static void get_file_compression_info(struct smb2_query_info_rsp *rsp,
- struct ksmbd_file *fp, void *rsp_org)
+static int get_file_compression_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
{
struct smb2_file_comp_info *file_info;
struct kstat stat;
+ int ret;
- generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
- file_inode(fp->filp), &stat);
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
file_info = (struct smb2_file_comp_info *)rsp->Buffer;
file_info->CompressedFileSize = cpu_to_le64(stat.blocks << 9);
@@ -4823,6 +5080,8 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp,
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_comp_info));
+
+ return 0;
}
static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp,
@@ -4844,7 +5103,7 @@ static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp,
return 0;
}
-static void find_file_posix_info(struct smb2_query_info_rsp *rsp,
+static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
struct ksmbd_file *fp, void *rsp_org)
{
struct smb311_posix_qinfo *file_info;
@@ -4852,24 +5111,31 @@ static void find_file_posix_info(struct smb2_query_info_rsp *rsp,
struct mnt_idmap *idmap = file_mnt_idmap(fp->filp);
vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);
vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
+ struct kstat stat;
u64 time;
int out_buf_len = sizeof(struct smb311_posix_qinfo) + 32;
+ int ret;
+
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret)
+ return ret;
file_info = (struct smb311_posix_qinfo *)rsp->Buffer;
file_info->CreationTime = cpu_to_le64(fp->create_time);
- time = ksmbd_UnixTimeToNT(inode_get_atime(inode));
+ time = ksmbd_UnixTimeToNT(stat.atime);
file_info->LastAccessTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode_get_mtime(inode));
+ time = ksmbd_UnixTimeToNT(stat.mtime);
file_info->LastWriteTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
+ time = ksmbd_UnixTimeToNT(stat.ctime);
file_info->ChangeTime = cpu_to_le64(time);
file_info->DosAttributes = fp->f_ci->m_fattr;
- file_info->Inode = cpu_to_le64(inode->i_ino);
- file_info->EndOfFile = cpu_to_le64(inode->i_size);
- file_info->AllocationSize = cpu_to_le64(inode->i_blocks << 9);
- file_info->HardLinks = cpu_to_le32(inode->i_nlink);
- file_info->Mode = cpu_to_le32(inode->i_mode & 0777);
- file_info->DeviceId = cpu_to_le32(inode->i_rdev);
+ file_info->Inode = cpu_to_le64(stat.ino);
+ file_info->EndOfFile = cpu_to_le64(stat.size);
+ file_info->AllocationSize = cpu_to_le64(stat.blocks << 9);
+ file_info->HardLinks = cpu_to_le32(stat.nlink);
+ file_info->Mode = cpu_to_le32(stat.mode & 0777);
+ file_info->DeviceId = cpu_to_le32(stat.rdev);
/*
* Sids(32) contain two sids(Domain sid(16), UNIX group sid(16)).
@@ -4882,6 +5148,8 @@ static void find_file_posix_info(struct smb2_query_info_rsp *rsp,
SIDUNIX_GROUP, (struct smb_sid *)&file_info->Sids[16]);
rsp->OutputBufferLength = cpu_to_le32(out_buf_len);
+
+ return 0;
}
static int smb2_get_info_file(struct ksmbd_work *work,
@@ -4930,7 +5198,7 @@ static int smb2_get_info_file(struct ksmbd_work *work,
break;
case FILE_STANDARD_INFORMATION:
- get_file_standard_info(rsp, fp, work->response_buf);
+ rc = get_file_standard_info(rsp, fp, work->response_buf);
break;
case FILE_ALIGNMENT_INFORMATION:
@@ -4946,11 +5214,11 @@ static int smb2_get_info_file(struct ksmbd_work *work,
break;
case FILE_STREAM_INFORMATION:
- get_file_stream_info(work, rsp, fp, work->response_buf);
+ rc = get_file_stream_info(work, rsp, fp, work->response_buf);
break;
case FILE_INTERNAL_INFORMATION:
- get_file_internal_info(rsp, fp, work->response_buf);
+ rc = get_file_internal_info(rsp, fp, work->response_buf);
break;
case FILE_NETWORK_OPEN_INFORMATION:
@@ -4974,7 +5242,7 @@ static int smb2_get_info_file(struct ksmbd_work *work,
break;
case FILE_COMPRESSION_INFORMATION:
- get_file_compression_info(rsp, fp, work->response_buf);
+ rc = get_file_compression_info(rsp, fp, work->response_buf);
break;
case FILE_ATTRIBUTE_TAG_INFORMATION:
@@ -4985,7 +5253,7 @@ static int smb2_get_info_file(struct ksmbd_work *work,
pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n");
rc = -EOPNOTSUPP;
} else {
- find_file_posix_info(rsp, fp, work->response_buf);
+ rc = find_file_posix_info(rsp, fp, work->response_buf);
}
break;
default:
@@ -5398,7 +5666,6 @@ int smb2_close(struct ksmbd_work *work)
struct smb2_close_rsp *rsp;
struct ksmbd_conn *conn = work->conn;
struct ksmbd_file *fp;
- struct inode *inode;
u64 time;
int err = 0;
@@ -5453,24 +5720,33 @@ int smb2_close(struct ksmbd_work *work)
rsp->Reserved = 0;
if (req->Flags == SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB) {
+ struct kstat stat;
+ int ret;
+
fp = ksmbd_lookup_fd_fast(work, volatile_id);
if (!fp) {
err = -ENOENT;
goto out;
}
- inode = file_inode(fp->filp);
+ ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (ret) {
+ ksmbd_fd_put(work, fp);
+ goto out;
+ }
+
rsp->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB;
- rsp->AllocationSize = S_ISDIR(inode->i_mode) ? 0 :
- cpu_to_le64(inode->i_blocks << 9);
- rsp->EndOfFile = cpu_to_le64(inode->i_size);
+ rsp->AllocationSize = S_ISDIR(stat.mode) ? 0 :
+ cpu_to_le64(stat.blocks << 9);
+ rsp->EndOfFile = cpu_to_le64(stat.size);
rsp->Attributes = fp->f_ci->m_fattr;
rsp->CreationTime = cpu_to_le64(fp->create_time);
- time = ksmbd_UnixTimeToNT(inode_get_atime(inode));
+ time = ksmbd_UnixTimeToNT(stat.atime);
rsp->LastAccessTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode_get_mtime(inode));
+ time = ksmbd_UnixTimeToNT(stat.mtime);
rsp->LastWriteTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
+ time = ksmbd_UnixTimeToNT(stat.ctime);
rsp->ChangeTime = cpu_to_le64(time);
ksmbd_fd_put(work, fp);
} else {
@@ -5581,8 +5857,9 @@ static int smb2_rename(struct ksmbd_work *work,
if (!file_info->ReplaceIfExists)
flags = RENAME_NOREPLACE;
- smb_break_all_levII_oplock(work, fp, 0);
rc = ksmbd_vfs_rename(work, &fp->filp->f_path, new_name, flags);
+ if (!rc)
+ smb_break_all_levII_oplock(work, fp, 0);
out:
kfree(new_name);
return rc;
@@ -5759,15 +6036,21 @@ static int set_file_allocation_info(struct ksmbd_work *work,
loff_t alloc_blks;
struct inode *inode;
+ struct kstat stat;
int rc;
if (!(fp->daccess & FILE_WRITE_DATA_LE))
return -EACCES;
+ rc = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+ if (rc)
+ return rc;
+
alloc_blks = (le64_to_cpu(file_alloc_info->AllocationSize) + 511) >> 9;
inode = file_inode(fp->filp);
- if (alloc_blks > inode->i_blocks) {
+ if (alloc_blks > stat.blocks) {
smb_break_all_levII_oplock(work, fp, 1);
rc = vfs_fallocate(fp->filp, FALLOC_FL_KEEP_SIZE, 0,
alloc_blks * 512);
@@ -5775,7 +6058,7 @@ static int set_file_allocation_info(struct ksmbd_work *work,
pr_err("vfs_fallocate is failed : %d\n", rc);
return rc;
}
- } else if (alloc_blks < inode->i_blocks) {
+ } else if (alloc_blks < stat.blocks) {
loff_t size;
/*
@@ -5930,6 +6213,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
struct ksmbd_share_config *share)
{
unsigned int buf_len = le32_to_cpu(req->BufferLength);
+ char *buffer = (char *)req + le16_to_cpu(req->BufferOffset);
switch (req->FileInfoClass) {
case FILE_BASIC_INFORMATION:
@@ -5937,7 +6221,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
if (buf_len < sizeof(struct smb2_file_basic_info))
return -EINVAL;
- return set_file_basic_info(fp, (struct smb2_file_basic_info *)req->Buffer, share);
+ return set_file_basic_info(fp, (struct smb2_file_basic_info *)buffer, share);
}
case FILE_ALLOCATION_INFORMATION:
{
@@ -5945,7 +6229,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EINVAL;
return set_file_allocation_info(work, fp,
- (struct smb2_file_alloc_info *)req->Buffer);
+ (struct smb2_file_alloc_info *)buffer);
}
case FILE_END_OF_FILE_INFORMATION:
{
@@ -5953,7 +6237,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EINVAL;
return set_end_of_file_info(work, fp,
- (struct smb2_file_eof_info *)req->Buffer);
+ (struct smb2_file_eof_info *)buffer);
}
case FILE_RENAME_INFORMATION:
{
@@ -5961,7 +6245,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EINVAL;
return set_rename_info(work, fp,
- (struct smb2_file_rename_info *)req->Buffer,
+ (struct smb2_file_rename_info *)buffer,
buf_len);
}
case FILE_LINK_INFORMATION:
@@ -5970,7 +6254,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EINVAL;
return smb2_create_link(work, work->tcon->share_conf,
- (struct smb2_file_link_info *)req->Buffer,
+ (struct smb2_file_link_info *)buffer,
buf_len, fp->filp,
work->conn->local_nls);
}
@@ -5980,7 +6264,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EINVAL;
return set_file_disposition_info(fp,
- (struct smb2_file_disposition_info *)req->Buffer);
+ (struct smb2_file_disposition_info *)buffer);
}
case FILE_FULL_EA_INFORMATION:
{
@@ -5993,7 +6277,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
if (buf_len < sizeof(struct smb2_ea_info))
return -EINVAL;
- return smb2_set_ea((struct smb2_ea_info *)req->Buffer,
+ return smb2_set_ea((struct smb2_ea_info *)buffer,
buf_len, &fp->filp->f_path, true);
}
case FILE_POSITION_INFORMATION:
@@ -6001,14 +6285,14 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
if (buf_len < sizeof(struct smb2_file_pos_info))
return -EINVAL;
- return set_file_position_info(fp, (struct smb2_file_pos_info *)req->Buffer);
+ return set_file_position_info(fp, (struct smb2_file_pos_info *)buffer);
}
case FILE_MODE_INFORMATION:
{
if (buf_len < sizeof(struct smb2_file_mode_info))
return -EINVAL;
- return set_file_mode_info(fp, (struct smb2_file_mode_info *)req->Buffer);
+ return set_file_mode_info(fp, (struct smb2_file_mode_info *)buffer);
}
}
@@ -6089,7 +6373,7 @@ int smb2_set_info(struct ksmbd_work *work)
}
rc = smb2_set_info_sec(fp,
le32_to_cpu(req->AdditionalInformation),
- req->Buffer,
+ (char *)req + le16_to_cpu(req->BufferOffset),
le32_to_cpu(req->BufferLength));
ksmbd_revert_fsids(work);
break;
@@ -6764,10 +7048,10 @@ struct file_lock *smb_flock_init(struct file *f)
locks_init_lock(fl);
- fl->fl_owner = f;
- fl->fl_pid = current->tgid;
- fl->fl_file = f;
- fl->fl_flags = FL_POSIX;
+ fl->c.flc_owner = f;
+ fl->c.flc_pid = current->tgid;
+ fl->c.flc_file = f;
+ fl->c.flc_flags = FL_POSIX;
fl->fl_ops = NULL;
fl->fl_lmops = NULL;
@@ -6784,30 +7068,30 @@ static int smb2_set_flock_flags(struct file_lock *flock, int flags)
case SMB2_LOCKFLAG_SHARED:
ksmbd_debug(SMB, "received shared request\n");
cmd = F_SETLKW;
- flock->fl_type = F_RDLCK;
- flock->fl_flags |= FL_SLEEP;
+ flock->c.flc_type = F_RDLCK;
+ flock->c.flc_flags |= FL_SLEEP;
break;
case SMB2_LOCKFLAG_EXCLUSIVE:
ksmbd_debug(SMB, "received exclusive request\n");
cmd = F_SETLKW;
- flock->fl_type = F_WRLCK;
- flock->fl_flags |= FL_SLEEP;
+ flock->c.flc_type = F_WRLCK;
+ flock->c.flc_flags |= FL_SLEEP;
break;
case SMB2_LOCKFLAG_SHARED | SMB2_LOCKFLAG_FAIL_IMMEDIATELY:
ksmbd_debug(SMB,
"received shared & fail immediately request\n");
cmd = F_SETLK;
- flock->fl_type = F_RDLCK;
+ flock->c.flc_type = F_RDLCK;
break;
case SMB2_LOCKFLAG_EXCLUSIVE | SMB2_LOCKFLAG_FAIL_IMMEDIATELY:
ksmbd_debug(SMB,
"received exclusive & fail immediately request\n");
cmd = F_SETLK;
- flock->fl_type = F_WRLCK;
+ flock->c.flc_type = F_WRLCK;
break;
case SMB2_LOCKFLAG_UNLOCK:
ksmbd_debug(SMB, "received unlock request\n");
- flock->fl_type = F_UNLCK;
+ flock->c.flc_type = F_UNLCK;
cmd = F_SETLK;
break;
}
@@ -6845,13 +7129,13 @@ static void smb2_remove_blocked_lock(void **argv)
struct file_lock *flock = (struct file_lock *)argv[0];
ksmbd_vfs_posix_lock_unblock(flock);
- wake_up(&flock->fl_wait);
+ locks_wake_up(flock);
}
static inline bool lock_defer_pending(struct file_lock *fl)
{
/* check pending lock waiters */
- return waitqueue_active(&fl->fl_wait);
+ return waitqueue_active(&fl->c.flc_wait);
}
/**
@@ -6942,8 +7226,8 @@ int smb2_lock(struct ksmbd_work *work)
list_for_each_entry(cmp_lock, &lock_list, llist) {
if (cmp_lock->fl->fl_start <= flock->fl_start &&
cmp_lock->fl->fl_end >= flock->fl_end) {
- if (cmp_lock->fl->fl_type != F_UNLCK &&
- flock->fl_type != F_UNLCK) {
+ if (cmp_lock->fl->c.flc_type != F_UNLCK &&
+ flock->c.flc_type != F_UNLCK) {
pr_err("conflict two locks in one request\n");
err = -EINVAL;
locks_free_lock(flock);
@@ -6991,12 +7275,12 @@ int smb2_lock(struct ksmbd_work *work)
list_for_each_entry(conn, &conn_list, conns_list) {
spin_lock(&conn->llist_lock);
list_for_each_entry_safe(cmp_lock, tmp2, &conn->lock_list, clist) {
- if (file_inode(cmp_lock->fl->fl_file) !=
- file_inode(smb_lock->fl->fl_file))
+ if (file_inode(cmp_lock->fl->c.flc_file) !=
+ file_inode(smb_lock->fl->c.flc_file))
continue;
- if (smb_lock->fl->fl_type == F_UNLCK) {
- if (cmp_lock->fl->fl_file == smb_lock->fl->fl_file &&
+ if (lock_is_unlock(smb_lock->fl)) {
+ if (cmp_lock->fl->c.flc_file == smb_lock->fl->c.flc_file &&
cmp_lock->start == smb_lock->start &&
cmp_lock->end == smb_lock->end &&
!lock_defer_pending(cmp_lock->fl)) {
@@ -7013,7 +7297,7 @@ int smb2_lock(struct ksmbd_work *work)
continue;
}
- if (cmp_lock->fl->fl_file == smb_lock->fl->fl_file) {
+ if (cmp_lock->fl->c.flc_file == smb_lock->fl->c.flc_file) {
if (smb_lock->flags & SMB2_LOCKFLAG_SHARED)
continue;
} else {
@@ -7055,7 +7339,7 @@ int smb2_lock(struct ksmbd_work *work)
}
up_read(&conn_list_lock);
out_check_cl:
- if (smb_lock->fl->fl_type == F_UNLCK && nolock) {
+ if (lock_is_unlock(smb_lock->fl) && nolock) {
pr_err("Try to unlock nolocked range\n");
rsp->hdr.Status = STATUS_RANGE_NOT_LOCKED;
goto out;
@@ -7179,7 +7463,7 @@ out:
struct file_lock *rlock = NULL;
rlock = smb_flock_init(filp);
- rlock->fl_type = F_UNLCK;
+ rlock->c.flc_type = F_UNLCK;
rlock->fl_start = smb_lock->start;
rlock->fl_end = smb_lock->end;
@@ -7535,7 +7819,7 @@ static int fsctl_pipe_transceive(struct ksmbd_work *work, u64 id,
struct smb2_ioctl_rsp *rsp)
{
struct ksmbd_rpc_command *rpc_resp;
- char *data_buf = (char *)&req->Buffer[0];
+ char *data_buf = (char *)req + le32_to_cpu(req->InputOffset);
int nbytes = 0;
rpc_resp = ksmbd_rpc_ioctl(work->sess, id, data_buf,
@@ -7648,6 +7932,7 @@ int smb2_ioctl(struct ksmbd_work *work)
u64 id = KSMBD_NO_FID;
struct ksmbd_conn *conn = work->conn;
int ret = 0;
+ char *buffer;
if (work->next_smb2_rcv_hdr_off) {
req = ksmbd_req_buf_next(work);
@@ -7670,6 +7955,8 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
+ buffer = (char *)req + le32_to_cpu(req->InputOffset);
+
cnt_code = le32_to_cpu(req->CtlCode);
ret = smb2_calc_max_out_buf_len(work, 48,
le32_to_cpu(req->MaxOutputResponse));
@@ -7727,7 +8014,7 @@ int smb2_ioctl(struct ksmbd_work *work)
}
ret = fsctl_validate_negotiate_info(conn,
- (struct validate_negotiate_info_req *)&req->Buffer[0],
+ (struct validate_negotiate_info_req *)buffer,
(struct validate_negotiate_info_rsp *)&rsp->Buffer[0],
in_buf_len);
if (ret < 0)
@@ -7780,7 +8067,7 @@ int smb2_ioctl(struct ksmbd_work *work)
rsp->VolatileFileId = req->VolatileFileId;
rsp->PersistentFileId = req->PersistentFileId;
fsctl_copychunk(work,
- (struct copychunk_ioctl_req *)&req->Buffer[0],
+ (struct copychunk_ioctl_req *)buffer,
le32_to_cpu(req->CtlCode),
le32_to_cpu(req->InputCount),
req->VolatileFileId,
@@ -7793,8 +8080,7 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
- ret = fsctl_set_sparse(work, id,
- (struct file_sparse *)&req->Buffer[0]);
+ ret = fsctl_set_sparse(work, id, (struct file_sparse *)buffer);
if (ret < 0)
goto out;
break;
@@ -7817,7 +8103,7 @@ int smb2_ioctl(struct ksmbd_work *work)
}
zero_data =
- (struct file_zero_data_information *)&req->Buffer[0];
+ (struct file_zero_data_information *)buffer;
off = le64_to_cpu(zero_data->FileOffset);
bfz = le64_to_cpu(zero_data->BeyondFinalZero);
@@ -7848,7 +8134,7 @@ int smb2_ioctl(struct ksmbd_work *work)
}
ret = fsctl_query_allocated_ranges(work, id,
- (struct file_allocated_range_buffer *)&req->Buffer[0],
+ (struct file_allocated_range_buffer *)buffer,
(struct file_allocated_range_buffer *)&rsp->Buffer[0],
out_buf_len /
sizeof(struct file_allocated_range_buffer), &nbytes);
@@ -7892,7 +8178,7 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
- dup_ext = (struct duplicate_extents_to_file *)&req->Buffer[0];
+ dup_ext = (struct duplicate_extents_to_file *)buffer;
fp_in = ksmbd_lookup_fd_slow(work, dup_ext->VolatileFileHandle,
dup_ext->PersistentFileHandle);
diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h
index d12cfd3b0927..bd1d2a0e9203 100644
--- a/fs/smb/server/smb2pdu.h
+++ b/fs/smb/server/smb2pdu.h
@@ -72,6 +72,18 @@ struct create_durable_req_v2 {
__u8 CreateGuid[16];
} __packed;
+struct create_durable_reconn_req {
+ struct create_context ccontext;
+ __u8 Name[8];
+ union {
+ __u8 Reserved[16];
+ struct {
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ } Fid;
+ } Data;
+} __packed;
+
struct create_durable_reconn_v2_req {
struct create_context ccontext;
__u8 Name[8];
@@ -98,6 +110,9 @@ struct create_durable_rsp {
} Data;
} __packed;
+/* See MS-SMB2 2.2.13.2.11 */
+/* Flags */
+#define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002
struct create_durable_v2_rsp {
struct create_context ccontext;
__u8 Name[8];
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index 7c98bf699772..fcaf373cc008 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -457,10 +457,13 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
}
ksmbd_kstat.kstat = &kstat;
- ksmbd_vfs_fill_dentry_attrs(work,
- idmap,
- dentry,
- &ksmbd_kstat);
+ rc = ksmbd_vfs_fill_dentry_attrs(work,
+ idmap,
+ dentry,
+ &ksmbd_kstat);
+ if (rc)
+ break;
+
rc = fn(conn, info_level, d_info, &ksmbd_kstat);
if (rc)
break;
diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c
index f29bb03f0dc4..8752ac82c557 100644
--- a/fs/smb/server/transport_ipc.c
+++ b/fs/smb/server/transport_ipc.c
@@ -65,6 +65,7 @@ struct ipc_msg_table_entry {
struct hlist_node ipc_table_hlist;
void *response;
+ unsigned int msg_sz;
};
static struct delayed_work ipc_timer_work;
@@ -275,6 +276,7 @@ static int handle_response(int type, void *payload, size_t sz)
}
memcpy(entry->response, payload, sz);
+ entry->msg_sz = sz;
wake_up_interruptible(&entry->wait);
ret = 0;
break;
@@ -453,6 +455,34 @@ out:
return ret;
}
+static int ipc_validate_msg(struct ipc_msg_table_entry *entry)
+{
+ unsigned int msg_sz = entry->msg_sz;
+
+ if (entry->type == KSMBD_EVENT_RPC_REQUEST) {
+ struct ksmbd_rpc_command *resp = entry->response;
+
+ msg_sz = sizeof(struct ksmbd_rpc_command) + resp->payload_sz;
+ } else if (entry->type == KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST) {
+ struct ksmbd_spnego_authen_response *resp = entry->response;
+
+ msg_sz = sizeof(struct ksmbd_spnego_authen_response) +
+ resp->session_key_len + resp->spnego_blob_len;
+ } else if (entry->type == KSMBD_EVENT_SHARE_CONFIG_REQUEST) {
+ struct ksmbd_share_config_response *resp = entry->response;
+
+ if (resp->payload_sz) {
+ if (resp->payload_sz < resp->veto_list_sz)
+ return -EINVAL;
+
+ msg_sz = sizeof(struct ksmbd_share_config_response) +
+ resp->payload_sz;
+ }
+ }
+
+ return entry->msg_sz != msg_sz ? -EINVAL : 0;
+}
+
static void *ipc_msg_send_request(struct ksmbd_ipc_msg *msg, unsigned int handle)
{
struct ipc_msg_table_entry entry;
@@ -477,6 +507,13 @@ static void *ipc_msg_send_request(struct ksmbd_ipc_msg *msg, unsigned int handle
ret = wait_event_interruptible_timeout(entry.wait,
entry.response != NULL,
IPC_WAIT_TIMEOUT);
+ if (entry.response) {
+ ret = ipc_validate_msg(&entry);
+ if (ret) {
+ kvfree(entry.response);
+ entry.response = NULL;
+ }
+ }
out:
down_write(&ipc_msg_table_lock);
hash_del(&entry.ipc_table_hlist);
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index a6961bfe3e13..22f0f3db3ac9 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -337,18 +337,18 @@ static int check_lock_range(struct file *filp, loff_t start, loff_t end,
return 0;
spin_lock(&ctx->flc_lock);
- list_for_each_entry(flock, &ctx->flc_posix, fl_list) {
+ for_each_file_lock(flock, &ctx->flc_posix) {
/* check conflict locks */
if (flock->fl_end >= start && end >= flock->fl_start) {
- if (flock->fl_type == F_RDLCK) {
+ if (lock_is_read(flock)) {
if (type == WRITE) {
pr_err("not allow write by shared lock\n");
error = 1;
goto out;
}
- } else if (flock->fl_type == F_WRLCK) {
+ } else if (lock_is_write(flock)) {
/* check owner in lock */
- if (flock->fl_file != filp) {
+ if (flock->c.flc_file != filp) {
error = 1;
pr_err("not allow rw access by exclusive lock from other opens\n");
goto out;
@@ -1682,11 +1682,19 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work,
struct dentry *dentry,
struct ksmbd_kstat *ksmbd_kstat)
{
+ struct ksmbd_share_config *share_conf = work->tcon->share_conf;
u64 time;
int rc;
+ struct path path = {
+ .mnt = share_conf->vfs_path.mnt,
+ .dentry = dentry,
+ };
- generic_fillattr(idmap, STATX_BASIC_STATS, d_inode(dentry),
- ksmbd_kstat->kstat);
+ rc = vfs_getattr(&path, ksmbd_kstat->kstat,
+ STATX_BASIC_STATS | STATX_BTIME,
+ AT_STATX_SYNC_AS_STAT);
+ if (rc)
+ return rc;
time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime);
ksmbd_kstat->create_time = time;
@@ -1837,13 +1845,13 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
void ksmbd_vfs_posix_lock_wait(struct file_lock *flock)
{
- wait_event(flock->fl_wait, !flock->fl_blocker);
+ wait_event(flock->c.flc_wait, !flock->c.flc_blocker);
}
int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout)
{
- return wait_event_interruptible_timeout(flock->fl_wait,
- !flock->fl_blocker,
+ return wait_event_interruptible_timeout(flock->c.flc_wait,
+ !flock->c.flc_blocker,
timeout);
}
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index 4e82ff627d12..030f70700036 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -305,7 +305,8 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp)
fd_limit_close();
__ksmbd_remove_durable_fd(fp);
- __ksmbd_remove_fd(ft, fp);
+ if (ft)
+ __ksmbd_remove_fd(ft, fp);
close_id_del_oplock(fp);
filp = fp->filp;
@@ -465,11 +466,32 @@ struct ksmbd_file *ksmbd_lookup_fd_slow(struct ksmbd_work *work, u64 id,
return fp;
}
-struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id)
+struct ksmbd_file *ksmbd_lookup_global_fd(unsigned long long id)
{
return __ksmbd_lookup_fd(&global_ft, id);
}
+struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id)
+{
+ struct ksmbd_file *fp;
+
+ fp = __ksmbd_lookup_fd(&global_ft, id);
+ if (fp && fp->conn) {
+ ksmbd_put_durable_fd(fp);
+ fp = NULL;
+ }
+
+ return fp;
+}
+
+void ksmbd_put_durable_fd(struct ksmbd_file *fp)
+{
+ if (!atomic_dec_and_test(&fp->refcount))
+ return;
+
+ __ksmbd_close_fd(NULL, fp);
+}
+
struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid)
{
struct ksmbd_file *fp = NULL;
@@ -639,6 +661,32 @@ __close_file_table_ids(struct ksmbd_file_table *ft,
return num;
}
+static inline bool is_reconnectable(struct ksmbd_file *fp)
+{
+ struct oplock_info *opinfo = opinfo_get(fp);
+ bool reconn = false;
+
+ if (!opinfo)
+ return false;
+
+ if (opinfo->op_state != OPLOCK_STATE_NONE) {
+ opinfo_put(opinfo);
+ return false;
+ }
+
+ if (fp->is_resilient || fp->is_persistent)
+ reconn = true;
+ else if (fp->is_durable && opinfo->is_lease &&
+ opinfo->o_lease->state & SMB2_LEASE_HANDLE_CACHING_LE)
+ reconn = true;
+
+ else if (fp->is_durable && opinfo->level == SMB2_OPLOCK_LEVEL_BATCH)
+ reconn = true;
+
+ opinfo_put(opinfo);
+ return reconn;
+}
+
static bool tree_conn_fd_check(struct ksmbd_tree_connect *tcon,
struct ksmbd_file *fp)
{
@@ -648,7 +696,28 @@ static bool tree_conn_fd_check(struct ksmbd_tree_connect *tcon,
static bool session_fd_check(struct ksmbd_tree_connect *tcon,
struct ksmbd_file *fp)
{
- return false;
+ struct ksmbd_inode *ci;
+ struct oplock_info *op;
+ struct ksmbd_conn *conn;
+
+ if (!is_reconnectable(fp))
+ return false;
+
+ conn = fp->conn;
+ ci = fp->f_ci;
+ write_lock(&ci->m_lock);
+ list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) {
+ if (op->conn != conn)
+ continue;
+ op->conn = NULL;
+ }
+ write_unlock(&ci->m_lock);
+
+ fp->conn = NULL;
+ fp->tcon = NULL;
+ fp->volatile_id = KSMBD_NO_FID;
+
+ return true;
}
void ksmbd_close_tree_conn_fds(struct ksmbd_work *work)
@@ -687,6 +756,68 @@ void ksmbd_free_global_file_table(void)
ksmbd_destroy_file_table(&global_ft);
}
+int ksmbd_validate_name_reconnect(struct ksmbd_share_config *share,
+ struct ksmbd_file *fp, char *name)
+{
+ char *pathname, *ab_pathname;
+ int ret = 0;
+
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!pathname)
+ return -EACCES;
+
+ ab_pathname = d_path(&fp->filp->f_path, pathname, PATH_MAX);
+ if (IS_ERR(ab_pathname)) {
+ kfree(pathname);
+ return -EACCES;
+ }
+
+ if (name && strcmp(&ab_pathname[share->path_sz + 1], name)) {
+ ksmbd_debug(SMB, "invalid name reconnect %s\n", name);
+ ret = -EINVAL;
+ }
+
+ kfree(pathname);
+
+ return ret;
+}
+
+int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp)
+{
+ struct ksmbd_inode *ci;
+ struct oplock_info *op;
+
+ if (!fp->is_durable || fp->conn || fp->tcon) {
+ pr_err("Invalid durable fd [%p:%p]\n", fp->conn, fp->tcon);
+ return -EBADF;
+ }
+
+ if (has_file_id(fp->volatile_id)) {
+ pr_err("Still in use durable fd: %llu\n", fp->volatile_id);
+ return -EBADF;
+ }
+
+ fp->conn = work->conn;
+ fp->tcon = work->tcon;
+
+ ci = fp->f_ci;
+ write_lock(&ci->m_lock);
+ list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) {
+ if (op->conn)
+ continue;
+ op->conn = fp->conn;
+ }
+ write_unlock(&ci->m_lock);
+
+ __open_id(&work->sess->file_table, fp, OPEN_ID_TYPE_VOLATILE_ID);
+ if (!has_file_id(fp->volatile_id)) {
+ fp->conn = NULL;
+ fp->tcon = NULL;
+ return -EBADF;
+ }
+ return 0;
+}
+
int ksmbd_init_file_table(struct ksmbd_file_table *ft)
{
ft->idr = kzalloc(sizeof(struct idr), GFP_KERNEL);
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index a528f0cc775a..ed44fb4e18e7 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -14,6 +14,7 @@
#include <linux/workqueue.h>
#include "vfs.h"
+#include "mgmt/share_config.h"
/* Windows style file permissions for extended response */
#define FILE_GENERIC_ALL 0x1F01FF
@@ -106,6 +107,9 @@ struct ksmbd_file {
int dot_dotdot[2];
unsigned int f_state;
bool reserve_lease_break;
+ bool is_durable;
+ bool is_persistent;
+ bool is_resilient;
};
static inline void set_ctx_actor(struct dir_context *ctx,
@@ -141,7 +145,9 @@ struct ksmbd_file *ksmbd_lookup_fd_slow(struct ksmbd_work *work, u64 id,
void ksmbd_fd_put(struct ksmbd_work *work, struct ksmbd_file *fp);
struct ksmbd_inode *ksmbd_inode_lookup_lock(struct dentry *d);
void ksmbd_inode_put(struct ksmbd_inode *ci);
+struct ksmbd_file *ksmbd_lookup_global_fd(unsigned long long id);
struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id);
+void ksmbd_put_durable_fd(struct ksmbd_file *fp);
struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid);
struct ksmbd_file *ksmbd_lookup_fd_inode(struct dentry *dentry);
unsigned int ksmbd_open_durable_fd(struct ksmbd_file *fp);
@@ -173,6 +179,9 @@ void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp);
void ksmbd_clear_inode_pending_delete(struct ksmbd_file *fp);
void ksmbd_fd_set_delete_on_close(struct ksmbd_file *fp,
int file_info);
+int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp);
+int ksmbd_validate_name_reconnect(struct ksmbd_share_config *share,
+ struct ksmbd_file *fp, char *name);
int ksmbd_init_file_cache(void);
void ksmbd_exit_file_cache(void);
#endif /* __VFS_CACHE_H__ */
diff --git a/fs/super.c b/fs/super.c
index d6efeba0d0ce..69ce6c600968 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1527,16 +1527,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
struct fs_context *fc)
{
blk_mode_t mode = sb_open_mode(sb_flags);
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct block_device *bdev;
- bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
- if (IS_ERR(bdev_handle)) {
+ bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
+ if (IS_ERR(bdev_file)) {
if (fc)
errorf(fc, "%s: Can't open blockdev", fc->source);
- return PTR_ERR(bdev_handle);
+ return PTR_ERR(bdev_file);
}
- bdev = bdev_handle->bdev;
+ bdev = file_bdev(bdev_file);
/*
* This really should be in blkdev_get_by_dev, but right now can't due
@@ -1544,7 +1544,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
* writable from userspace even for a read-only block device.
*/
if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
- bdev_release(bdev_handle);
+ bdev_fput(bdev_file);
return -EACCES;
}
@@ -1555,11 +1555,11 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
if (atomic_read(&bdev->bd_fsfreeze_count) > 0) {
if (fc)
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
- bdev_release(bdev_handle);
+ bdev_fput(bdev_file);
return -EBUSY;
}
spin_lock(&sb_lock);
- sb->s_bdev_handle = bdev_handle;
+ sb->s_bdev_file = bdev_file;
sb->s_bdev = bdev;
sb->s_bdi = bdi_get(bdev->bd_disk->bdi);
if (bdev_stable_writes(bdev))
@@ -1675,7 +1675,7 @@ void kill_block_super(struct super_block *sb)
generic_shutdown_super(sb);
if (bdev) {
sync_blockdev(bdev);
- bdev_release(sb->s_bdev_handle);
+ bdev_fput(sb->s_bdev_file);
}
}
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 138676463336..d22ad67a0f32 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -31,6 +31,17 @@ static void remove_files(struct kernfs_node *parent,
kernfs_remove_by_name(parent, (*bin_attr)->attr.name);
}
+static umode_t __first_visible(const struct attribute_group *grp, struct kobject *kobj)
+{
+ if (grp->attrs && grp->attrs[0] && grp->is_visible)
+ return grp->is_visible(kobj, grp->attrs[0], 0);
+
+ if (grp->bin_attrs && grp->bin_attrs[0] && grp->is_bin_visible)
+ return grp->is_bin_visible(kobj, grp->bin_attrs[0], 0);
+
+ return 0;
+}
+
static int create_files(struct kernfs_node *parent, struct kobject *kobj,
kuid_t uid, kgid_t gid,
const struct attribute_group *grp, int update)
@@ -52,6 +63,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
kernfs_remove_by_name(parent, (*attr)->name);
if (grp->is_visible) {
mode = grp->is_visible(kobj, *attr, i);
+ mode &= ~SYSFS_GROUP_INVISIBLE;
if (!mode)
continue;
}
@@ -81,6 +93,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
(*bin_attr)->attr.name);
if (grp->is_bin_visible) {
mode = grp->is_bin_visible(kobj, *bin_attr, i);
+ mode &= ~SYSFS_GROUP_INVISIBLE;
if (!mode)
continue;
}
@@ -127,16 +140,31 @@ static int internal_create_group(struct kobject *kobj, int update,
kobject_get_ownership(kobj, &uid, &gid);
if (grp->name) {
+ umode_t mode = __first_visible(grp, kobj);
+
+ if (mode & SYSFS_GROUP_INVISIBLE)
+ mode = 0;
+ else
+ mode = S_IRWXU | S_IRUGO | S_IXUGO;
+
if (update) {
kn = kernfs_find_and_get(kobj->sd, grp->name);
if (!kn) {
- pr_warn("Can't update unknown attr grp name: %s/%s\n",
- kobj->name, grp->name);
- return -EINVAL;
+ pr_debug("attr grp %s/%s not created yet\n",
+ kobj->name, grp->name);
+ /* may have been invisible prior to this update */
+ update = 0;
+ } else if (!mode) {
+ sysfs_remove_group(kobj, grp);
+ kernfs_put(kn);
+ return 0;
}
- } else {
- kn = kernfs_create_dir_ns(kobj->sd, grp->name,
- S_IRWXU | S_IRUGO | S_IXUGO,
+ }
+
+ if (!update) {
+ if (!mode)
+ return 0;
+ kn = kernfs_create_dir_ns(kobj->sd, grp->name, mode,
uid, gid, kobj, NULL);
if (IS_ERR(kn)) {
if (PTR_ERR(kn) == -EEXIST)
@@ -279,9 +307,8 @@ void sysfs_remove_group(struct kobject *kobj,
if (grp->name) {
kn = kernfs_find_and_get(parent, grp->name);
if (!kn) {
- WARN(!kn, KERN_WARNING
- "sysfs group '%s' not found for kobject '%s'\n",
- grp->name, kobject_name(kobj));
+ pr_debug("sysfs group '%s' not found for kobject '%s'\n",
+ grp->name, kobject_name(kobj));
return;
}
} else {
@@ -318,13 +345,13 @@ void sysfs_remove_groups(struct kobject *kobj,
EXPORT_SYMBOL_GPL(sysfs_remove_groups);
/**
- * sysfs_merge_group - merge files into a pre-existing attribute group.
+ * sysfs_merge_group - merge files into a pre-existing named attribute group.
* @kobj: The kobject containing the group.
* @grp: The files to create and the attribute group they belong to.
*
- * This function returns an error if the group doesn't exist or any of the
- * files already exist in that group, in which case none of the new files
- * are created.
+ * This function returns an error if the group doesn't exist, the .name field is
+ * NULL or any of the files already exist in that group, in which case none of
+ * the new files are created.
*/
int sysfs_merge_group(struct kobject *kobj,
const struct attribute_group *grp)
@@ -356,7 +383,7 @@ int sysfs_merge_group(struct kobject *kobj,
EXPORT_SYMBOL_GPL(sysfs_merge_group);
/**
- * sysfs_unmerge_group - remove files from a pre-existing attribute group.
+ * sysfs_unmerge_group - remove files from a pre-existing named attribute group.
* @kobj: The kobject containing the group.
* @grp: The files to remove and the attribute group they belong to.
*/
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 5a915b2e68f5..76bc2d5e75a9 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -336,7 +336,7 @@ int __init sysv_init_icache(void)
{
sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
sizeof(struct sysv_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
init_once);
if (!sysv_inode_cachep)
return -ENOMEM;
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 410ab2a44d2f..19bcb51a2203 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -83,9 +83,6 @@ static inline sysv_zone_t *block_end(struct buffer_head *bh)
return (sysv_zone_t*)((char*)bh->b_data + bh->b_size);
}
-/*
- * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock)
- */
static Indirect *get_branch(struct inode *inode,
int depth,
int offsets[],
@@ -105,15 +102,18 @@ static Indirect *get_branch(struct inode *inode,
bh = sb_bread(sb, block);
if (!bh)
goto failure;
+ read_lock(&pointers_lock);
if (!verify_chain(chain, p))
goto changed;
add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets);
+ read_unlock(&pointers_lock);
if (!p->key)
goto no_block;
}
return NULL;
changed:
+ read_unlock(&pointers_lock);
brelse(bh);
*err = -EAGAIN;
goto no_block;
@@ -219,9 +219,7 @@ static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *b
goto out;
reread:
- read_lock(&pointers_lock);
partial = get_branch(inode, depth, offsets, chain, &err);
- read_unlock(&pointers_lock);
/* Simplest case - block found, no allocation needed */
if (!partial) {
@@ -291,9 +289,9 @@ static Indirect *find_shared(struct inode *inode,
*top = 0;
for (k = depth; k > 1 && !offsets[k-1]; k--)
;
+ partial = get_branch(inode, k, offsets, chain, &err);
write_lock(&pointers_lock);
- partial = get_branch(inode, k, offsets, chain, &err);
if (!partial)
partial = chain + k-1;
/*
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 110e8a272189..894c6ca1e500 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -35,6 +35,17 @@ static DEFINE_MUTEX(eventfs_mutex);
/* Choose something "unique" ;-) */
#define EVENTFS_FILE_INODE_INO 0x12c4e37
+struct eventfs_root_inode {
+ struct eventfs_inode ei;
+ struct dentry *events_dir;
+};
+
+static struct eventfs_root_inode *get_root_inode(struct eventfs_inode *ei)
+{
+ WARN_ON_ONCE(!ei->is_events);
+ return container_of(ei, struct eventfs_root_inode, ei);
+}
+
/* Just try to make something consistent and unique */
static int eventfs_dir_ino(struct eventfs_inode *ei)
{
@@ -73,12 +84,18 @@ enum {
static void release_ei(struct kref *ref)
{
struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref);
+ struct eventfs_root_inode *rei;
WARN_ON_ONCE(!ei->is_freed);
kfree(ei->entry_attrs);
kfree_const(ei->name);
- kfree_rcu(ei, rcu);
+ if (ei->is_events) {
+ rei = get_root_inode(ei);
+ kfree_rcu(rei, ei.rcu);
+ } else {
+ kfree_rcu(ei, rcu);
+ }
}
static inline void put_ei(struct eventfs_inode *ei)
@@ -319,6 +336,7 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode,
/**
* lookup_file - look up a file in the tracefs filesystem
+ * @parent_ei: Pointer to the eventfs_inode that represents parent of the file
* @dentry: the dentry to look up
* @mode: the permission that the file should have.
* @attr: saved attributes changed by user
@@ -372,6 +390,7 @@ static struct dentry *lookup_file(struct eventfs_inode *parent_ei,
/**
* lookup_dir_entry - look up a dir in the tracefs filesystem
* @dentry: the directory to look up
+ * @pei: Pointer to the parent eventfs_inode if available
* @ei: the eventfs_inode that represents the directory to create
*
* This function will look up a dentry for a directory represented by
@@ -408,19 +427,43 @@ static struct dentry *lookup_dir_entry(struct dentry *dentry,
return NULL;
}
+static inline struct eventfs_inode *init_ei(struct eventfs_inode *ei, const char *name)
+{
+ ei->name = kstrdup_const(name, GFP_KERNEL);
+ if (!ei->name)
+ return NULL;
+ kref_init(&ei->kref);
+ return ei;
+}
+
static inline struct eventfs_inode *alloc_ei(const char *name)
{
struct eventfs_inode *ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ struct eventfs_inode *result;
if (!ei)
return NULL;
- ei->name = kstrdup_const(name, GFP_KERNEL);
- if (!ei->name) {
+ result = init_ei(ei, name);
+ if (!result)
kfree(ei);
+
+ return result;
+}
+
+static inline struct eventfs_inode *alloc_root_ei(const char *name)
+{
+ struct eventfs_root_inode *rei = kzalloc(sizeof(*rei), GFP_KERNEL);
+ struct eventfs_inode *ei;
+
+ if (!rei)
return NULL;
- }
- kref_init(&ei->kref);
+
+ rei->ei.is_events = 1;
+ ei = init_ei(&rei->ei, name);
+ if (!ei)
+ kfree(rei);
+
return ei;
}
@@ -437,16 +480,20 @@ void eventfs_d_release(struct dentry *dentry)
/**
* lookup_file_dentry - create a dentry for a file of an eventfs_inode
+ * @dentry: The parent dentry under which the new file's dentry will be created
* @ei: the eventfs_inode that the file will be created under
* @idx: the index into the entry_attrs[] of the @ei
- * @parent: The parent dentry of the created file.
- * @name: The name of the file to create
* @mode: The mode of the file.
* @data: The data to use to set the inode of the file with on open()
* @fops: The fops of the file to be created.
*
- * Create a dentry for a file of an eventfs_inode @ei and place it into the
- * address located at @e_dentry.
+ * This function creates a dentry for a file associated with an
+ * eventfs_inode @ei. It uses the entry attributes specified by @idx,
+ * if available. The file will have the specified @mode and its inode will be
+ * set up with @data upon open. The file operations will be set to @fops.
+ *
+ * Return: Returns a pointer to the newly created file's dentry or an error
+ * pointer.
*/
static struct dentry *
lookup_file_dentry(struct dentry *dentry,
@@ -483,7 +530,7 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *result = NULL;
ti = get_tracefs(dir);
- if (!(ti->flags & TRACEFS_EVENT_INODE))
+ if (WARN_ON_ONCE(!(ti->flags & TRACEFS_EVENT_INODE)))
return ERR_PTR(-EIO);
mutex_lock(&eventfs_mutex);
@@ -495,7 +542,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
list_for_each_entry(ei_child, &ei->children, list) {
if (strcmp(ei_child->name, name) != 0)
continue;
- if (ei_child->is_freed)
+ /* A child is freed and removed from the list at the same time */
+ if (WARN_ON_ONCE(ei_child->is_freed))
goto out;
result = lookup_dir_entry(dentry, ei, ei_child);
goto out;
@@ -709,6 +757,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
int size, void *data)
{
struct dentry *dentry = tracefs_start_creating(name, parent);
+ struct eventfs_root_inode *rei;
struct eventfs_inode *ei;
struct tracefs_inode *ti;
struct inode *inode;
@@ -721,7 +770,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
if (IS_ERR(dentry))
return ERR_CAST(dentry);
- ei = alloc_ei(name);
+ ei = alloc_root_ei(name);
if (!ei)
goto fail;
@@ -730,10 +779,11 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
goto fail;
// Note: we have a ref to the dentry from tracefs_start_creating()
- ei->events_dir = dentry;
+ rei = get_root_inode(ei);
+ rei->events_dir = dentry;
+
ei->entries = entries;
ei->nr_entries = size;
- ei->is_events = 1;
ei->data = data;
/* Save the ownership of this directory */
@@ -844,13 +894,15 @@ void eventfs_remove_dir(struct eventfs_inode *ei)
*/
void eventfs_remove_events_dir(struct eventfs_inode *ei)
{
+ struct eventfs_root_inode *rei;
struct dentry *dentry;
- dentry = ei->events_dir;
+ rei = get_root_inode(ei);
+ dentry = rei->events_dir;
if (!dentry)
return;
- ei->events_dir = NULL;
+ rei->events_dir = NULL;
eventfs_remove_dir(ei);
/*
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index d65ffad4c327..5545e6bf7d26 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -731,7 +731,6 @@ static int __init tracefs_init(void)
tracefs_inode_cachep = kmem_cache_create("tracefs_inode_cache",
sizeof(struct tracefs_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|
SLAB_ACCOUNT),
init_once);
if (!tracefs_inode_cachep)
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index beb3dcd0e434..15c26f9aaad4 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -36,7 +36,6 @@ struct eventfs_attr {
* @children: link list into the child eventfs_inode
* @entries: the array of entries representing the files in the directory
* @name: the name of the directory to create
- * @events_dir: the dentry of the events directory
* @entry_attrs: Saved mode and ownership of the @d_children
* @data: The private data to pass to the callbacks
* @attr: Saved mode and ownership of eventfs_inode itself
@@ -54,7 +53,6 @@ struct eventfs_inode {
struct list_head children;
const struct eventfs_entry *entries;
const char *name;
- struct dentry *events_dir;
struct eventfs_attr *entry_attrs;
void *data;
struct eventfs_attr attr;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index d013c5b3f1ed..ac77ac1fd73e 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -1742,17 +1742,22 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size)
err = dbg_walk_index(c, NULL, add_size, &calc);
if (err) {
ubifs_err(c, "error %d while walking the index", err);
- return err;
+ goto out_err;
}
if (calc != idx_size) {
ubifs_err(c, "index size check failed: calculated size is %lld, should be %lld",
calc, idx_size);
dump_stack();
- return -EINVAL;
+ err = -EINVAL;
+ goto out_err;
}
return 0;
+
+out_err:
+ ubifs_destroy_tnc_tree(c);
+ return err;
}
/**
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index e413a9cf8ee3..eac0fef801f1 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -205,7 +205,6 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
err = fscrypt_prepare_lookup(dir, dentry, &nm);
- generic_set_encrypted_ci_d_ops(dentry);
if (err == -ENOENT)
return d_splice_alias(NULL, dentry);
if (err)
@@ -1134,6 +1133,8 @@ out_cancel:
dir_ui->ui_size = dir->i_size;
mutex_unlock(&dir_ui->ui_mutex);
out_inode:
+ /* Free inode->i_link before inode is marked as bad. */
+ fscrypt_free_inode(inode);
make_bad_inode(inode);
iput(inode);
out_fname:
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5029eb3390a5..a1f46919934c 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -96,36 +96,36 @@ dump:
return -EINVAL;
}
-static int do_readpage(struct page *page)
+static int do_readpage(struct folio *folio)
{
void *addr;
int err = 0, i;
unsigned int block, beyond;
- struct ubifs_data_node *dn;
- struct inode *inode = page->mapping->host;
+ struct ubifs_data_node *dn = NULL;
+ struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
loff_t i_size = i_size_read(inode);
dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
- inode->i_ino, page->index, i_size, page->flags);
- ubifs_assert(c, !PageChecked(page));
- ubifs_assert(c, !PagePrivate(page));
+ inode->i_ino, folio->index, i_size, folio->flags);
+ ubifs_assert(c, !folio_test_checked(folio));
+ ubifs_assert(c, !folio->private);
- addr = kmap(page);
+ addr = kmap_local_folio(folio, 0);
- block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+ block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
if (block >= beyond) {
/* Reading beyond inode */
- SetPageChecked(page);
- memset(addr, 0, PAGE_SIZE);
+ folio_set_checked(folio);
+ addr = folio_zero_tail(folio, 0, addr);
goto out;
}
dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS);
if (!dn) {
err = -ENOMEM;
- goto error;
+ goto out;
}
i = 0;
@@ -150,39 +150,35 @@ static int do_readpage(struct page *page)
memset(addr + ilen, 0, dlen - ilen);
}
}
- if (++i >= UBIFS_BLOCKS_PER_PAGE)
+ if (++i >= (UBIFS_BLOCKS_PER_PAGE << folio_order(folio)))
break;
block += 1;
addr += UBIFS_BLOCK_SIZE;
+ if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) {
+ kunmap_local(addr - UBIFS_BLOCK_SIZE);
+ addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE);
+ }
}
+
if (err) {
struct ubifs_info *c = inode->i_sb->s_fs_info;
if (err == -ENOENT) {
/* Not found, so it must be a hole */
- SetPageChecked(page);
+ folio_set_checked(folio);
dbg_gen("hole");
- goto out_free;
+ err = 0;
+ } else {
+ ubifs_err(c, "cannot read page %lu of inode %lu, error %d",
+ folio->index, inode->i_ino, err);
}
- ubifs_err(c, "cannot read page %lu of inode %lu, error %d",
- page->index, inode->i_ino, err);
- goto error;
}
-out_free:
- kfree(dn);
out:
- SetPageUptodate(page);
- ClearPageError(page);
- flush_dcache_page(page);
- kunmap(page);
- return 0;
-
-error:
kfree(dn);
- ClearPageUptodate(page);
- SetPageError(page);
- flush_dcache_page(page);
- kunmap(page);
+ if (!err)
+ folio_mark_uptodate(folio);
+ flush_dcache_folio(folio);
+ kunmap_local(addr);
return err;
}
@@ -222,16 +218,16 @@ static int write_begin_slow(struct address_space *mapping,
pgoff_t index = pos >> PAGE_SHIFT;
struct ubifs_budget_req req = { .new_page = 1 };
int err, appending = !!(pos + len > inode->i_size);
- struct page *page;
+ struct folio *folio;
dbg_gen("ino %lu, pos %llu, len %u, i_size %lld",
inode->i_ino, pos, len, inode->i_size);
/*
- * At the slow path we have to budget before locking the page, because
- * budgeting may force write-back, which would wait on locked pages and
- * deadlock if we had the page locked. At this point we do not know
- * anything about the page, so assume that this is a new page which is
+ * At the slow path we have to budget before locking the folio, because
+ * budgeting may force write-back, which would wait on locked folios and
+ * deadlock if we had the folio locked. At this point we do not know
+ * anything about the folio, so assume that this is a new folio which is
* written to a hole. This corresponds to largest budget. Later the
* budget will be amended if this is not true.
*/
@@ -243,45 +239,43 @@ static int write_begin_slow(struct address_space *mapping,
if (unlikely(err))
return err;
- page = grab_cache_page_write_begin(mapping, index);
- if (unlikely(!page)) {
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio)) {
ubifs_release_budget(c, &req);
- return -ENOMEM;
+ return PTR_ERR(folio);
}
- if (!PageUptodate(page)) {
- if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE)
- SetPageChecked(page);
+ if (!folio_test_uptodate(folio)) {
+ if (pos == folio_pos(folio) && len >= folio_size(folio))
+ folio_set_checked(folio);
else {
- err = do_readpage(page);
+ err = do_readpage(folio);
if (err) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
ubifs_release_budget(c, &req);
return err;
}
}
-
- SetPageUptodate(page);
- ClearPageError(page);
}
- if (PagePrivate(page))
+ if (folio->private)
/*
- * The page is dirty, which means it was budgeted twice:
+ * The folio is dirty, which means it was budgeted twice:
* o first time the budget was allocated by the task which
- * made the page dirty and set the PG_private flag;
+ * made the folio dirty and set the private field;
* o and then we budgeted for it for the second time at the
* very beginning of this function.
*
- * So what we have to do is to release the page budget we
+ * So what we have to do is to release the folio budget we
* allocated.
*/
release_new_page_budget(c);
- else if (!PageChecked(page))
+ else if (!folio_test_checked(folio))
/*
- * We are changing a page which already exists on the media.
- * This means that changing the page does not make the amount
+ * We are changing a folio which already exists on the media.
+ * This means that changing the folio does not make the amount
* of indexing information larger, and this part of the budget
* which we have already acquired may be released.
*/
@@ -304,14 +298,14 @@ static int write_begin_slow(struct address_space *mapping,
ubifs_release_dirty_inode_budget(c, ui);
}
- *pagep = page;
+ *pagep = &folio->page;
return 0;
}
/**
* allocate_budget - allocate budget for 'ubifs_write_begin()'.
* @c: UBIFS file-system description object
- * @page: page to allocate budget for
+ * @folio: folio to allocate budget for
* @ui: UBIFS inode object the page belongs to
* @appending: non-zero if the page is appended
*
@@ -322,15 +316,15 @@ static int write_begin_slow(struct address_space *mapping,
*
* Returns: %0 in case of success and %-ENOSPC in case of failure.
*/
-static int allocate_budget(struct ubifs_info *c, struct page *page,
+static int allocate_budget(struct ubifs_info *c, struct folio *folio,
struct ubifs_inode *ui, int appending)
{
struct ubifs_budget_req req = { .fast = 1 };
- if (PagePrivate(page)) {
+ if (folio->private) {
if (!appending)
/*
- * The page is dirty and we are not appending, which
+ * The folio is dirty and we are not appending, which
* means no budget is needed at all.
*/
return 0;
@@ -354,11 +348,11 @@ static int allocate_budget(struct ubifs_info *c, struct page *page,
*/
req.dirtied_ino = 1;
} else {
- if (PageChecked(page))
+ if (folio_test_checked(folio))
/*
* The page corresponds to a hole and does not
* exist on the media. So changing it makes
- * make the amount of indexing information
+ * the amount of indexing information
* larger, and we have to budget for a new
* page.
*/
@@ -428,7 +422,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index = pos >> PAGE_SHIFT;
int err, appending = !!(pos + len > inode->i_size);
int skipped_read = 0;
- struct page *page;
+ struct folio *folio;
ubifs_assert(c, ubifs_inode(inode)->ui_size == inode->i_size);
ubifs_assert(c, !c->ro_media && !c->ro_mount);
@@ -437,13 +431,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
return -EROFS;
/* Try out the fast-path part first */
- page = grab_cache_page_write_begin(mapping, index);
- if (unlikely(!page))
- return -ENOMEM;
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
/* The page is not loaded from the flash */
- if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) {
+ if (pos == folio_pos(folio) && len >= folio_size(folio)) {
/*
* We change whole page so no need to load it. But we
* do not know whether this page exists on the media or
@@ -453,32 +448,27 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
* media. Thus, we are setting the @PG_checked flag
* here.
*/
- SetPageChecked(page);
+ folio_set_checked(folio);
skipped_read = 1;
} else {
- err = do_readpage(page);
+ err = do_readpage(folio);
if (err) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return err;
}
}
-
- SetPageUptodate(page);
- ClearPageError(page);
}
- err = allocate_budget(c, page, ui, appending);
+ err = allocate_budget(c, folio, ui, appending);
if (unlikely(err)) {
ubifs_assert(c, err == -ENOSPC);
/*
* If we skipped reading the page because we were going to
* write all of it, then it is not up to date.
*/
- if (skipped_read) {
- ClearPageChecked(page);
- ClearPageUptodate(page);
- }
+ if (skipped_read)
+ folio_clear_checked(folio);
/*
* Budgeting failed which means it would have to force
* write-back but didn't, because we set the @fast flag in the
@@ -490,8 +480,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
ubifs_assert(c, mutex_is_locked(&ui->ui_mutex));
mutex_unlock(&ui->ui_mutex);
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return write_begin_slow(mapping, pos, len, pagep);
}
@@ -502,22 +492,21 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
* with @ui->ui_mutex locked if we are appending pages, and unlocked
* otherwise. This is an optimization (slightly hacky though).
*/
- *pagep = page;
+ *pagep = &folio->page;
return 0;
-
}
/**
* cancel_budget - cancel budget.
* @c: UBIFS file-system description object
- * @page: page to cancel budget for
+ * @folio: folio to cancel budget for
* @ui: UBIFS inode object the page belongs to
* @appending: non-zero if the page is appended
*
* This is a helper function for a page write operation. It unlocks the
* @ui->ui_mutex in case of appending.
*/
-static void cancel_budget(struct ubifs_info *c, struct page *page,
+static void cancel_budget(struct ubifs_info *c, struct folio *folio,
struct ubifs_inode *ui, int appending)
{
if (appending) {
@@ -525,8 +514,8 @@ static void cancel_budget(struct ubifs_info *c, struct page *page,
ubifs_release_dirty_inode_budget(c, ui);
mutex_unlock(&ui->ui_mutex);
}
- if (!PagePrivate(page)) {
- if (PageChecked(page))
+ if (!folio->private) {
+ if (folio_test_checked(folio))
release_new_page_budget(c);
else
release_existing_page_budget(c);
@@ -537,6 +526,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = mapping->host;
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -544,44 +534,47 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
int appending = !!(end_pos > inode->i_size);
dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld",
- inode->i_ino, pos, page->index, len, copied, inode->i_size);
+ inode->i_ino, pos, folio->index, len, copied, inode->i_size);
- if (unlikely(copied < len && len == PAGE_SIZE)) {
+ if (unlikely(copied < len && !folio_test_uptodate(folio))) {
/*
- * VFS copied less data to the page that it intended and
+ * VFS copied less data to the folio than it intended and
* declared in its '->write_begin()' call via the @len
- * argument. If the page was not up-to-date, and @len was
- * @PAGE_SIZE, the 'ubifs_write_begin()' function did
+ * argument. If the folio was not up-to-date,
+ * the 'ubifs_write_begin()' function did
* not load it from the media (for optimization reasons). This
- * means that part of the page contains garbage. So read the
- * page now.
+ * means that part of the folio contains garbage. So read the
+ * folio now.
*/
dbg_gen("copied %d instead of %d, read page and repeat",
copied, len);
- cancel_budget(c, page, ui, appending);
- ClearPageChecked(page);
+ cancel_budget(c, folio, ui, appending);
+ folio_clear_checked(folio);
/*
* Return 0 to force VFS to repeat the whole operation, or the
* error code if 'do_readpage()' fails.
*/
- copied = do_readpage(page);
+ copied = do_readpage(folio);
goto out;
}
- if (!PagePrivate(page)) {
- attach_page_private(page, (void *)1);
+ if (len == folio_size(folio))
+ folio_mark_uptodate(folio);
+
+ if (!folio->private) {
+ folio_attach_private(folio, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
- __set_page_dirty_nobuffers(page);
+ filemap_dirty_folio(mapping, folio);
}
if (appending) {
i_size_write(inode, end_pos);
ui->ui_size = end_pos;
/*
- * Note, we do not set @I_DIRTY_PAGES (which means that the
- * inode has dirty pages), this has been done in
- * '__set_page_dirty_nobuffers()'.
+ * We do not set @I_DIRTY_PAGES (which means that
+ * the inode has dirty pages), this was done in
+ * filemap_dirty_folio().
*/
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
ubifs_assert(c, mutex_is_locked(&ui->ui_mutex));
@@ -589,43 +582,43 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
}
out:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
/**
* populate_page - copy data nodes into a page for bulk-read.
* @c: UBIFS file-system description object
- * @page: page
+ * @folio: folio
* @bu: bulk-read information
* @n: next zbranch slot
*
* Returns: %0 on success and a negative error code on failure.
*/
-static int populate_page(struct ubifs_info *c, struct page *page,
+static int populate_page(struct ubifs_info *c, struct folio *folio,
struct bu_info *bu, int *n)
{
int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
loff_t i_size = i_size_read(inode);
unsigned int page_block;
void *addr, *zaddr;
pgoff_t end_index;
dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
- inode->i_ino, page->index, i_size, page->flags);
+ inode->i_ino, folio->index, i_size, folio->flags);
- addr = zaddr = kmap(page);
+ addr = zaddr = kmap_local_folio(folio, 0);
end_index = (i_size - 1) >> PAGE_SHIFT;
- if (!i_size || page->index > end_index) {
+ if (!i_size || folio->index > end_index) {
hole = 1;
- memset(addr, 0, PAGE_SIZE);
+ addr = folio_zero_tail(folio, 0, addr);
goto out_hole;
}
- page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+ page_block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
while (1) {
int err, len, out_len, dlen;
@@ -674,9 +667,13 @@ static int populate_page(struct ubifs_info *c, struct page *page,
break;
addr += UBIFS_BLOCK_SIZE;
page_block += 1;
+ if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) {
+ kunmap_local(addr - UBIFS_BLOCK_SIZE);
+ addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE);
+ }
}
- if (end_index == page->index) {
+ if (end_index == folio->index) {
int len = i_size & (PAGE_SIZE - 1);
if (len && len < read)
@@ -685,22 +682,19 @@ static int populate_page(struct ubifs_info *c, struct page *page,
out_hole:
if (hole) {
- SetPageChecked(page);
+ folio_set_checked(folio);
dbg_gen("hole");
}
- SetPageUptodate(page);
- ClearPageError(page);
- flush_dcache_page(page);
- kunmap(page);
+ folio_mark_uptodate(folio);
+ flush_dcache_folio(folio);
+ kunmap_local(addr);
*n = nn;
return 0;
out_err:
- ClearPageUptodate(page);
- SetPageError(page);
- flush_dcache_page(page);
- kunmap(page);
+ flush_dcache_folio(folio);
+ kunmap_local(addr);
ubifs_err(c, "bad data node (block %u, inode %lu)",
page_block, inode->i_ino);
return -EINVAL;
@@ -710,15 +704,15 @@ out_err:
* ubifs_do_bulk_read - do bulk-read.
* @c: UBIFS file-system description object
* @bu: bulk-read information
- * @page1: first page to read
+ * @folio1: first folio to read
*
* Returns: %1 if the bulk-read is done, otherwise %0 is returned.
*/
static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
- struct page *page1)
+ struct folio *folio1)
{
- pgoff_t offset = page1->index, end_index;
- struct address_space *mapping = page1->mapping;
+ pgoff_t offset = folio1->index, end_index;
+ struct address_space *mapping = folio1->mapping;
struct inode *inode = mapping->host;
struct ubifs_inode *ui = ubifs_inode(inode);
int err, page_idx, page_cnt, ret = 0, n = 0;
@@ -768,11 +762,11 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
goto out_warn;
}
- err = populate_page(c, page1, bu, &n);
+ err = populate_page(c, folio1, bu, &n);
if (err)
goto out_warn;
- unlock_page(page1);
+ folio_unlock(folio1);
ret = 1;
isize = i_size_read(inode);
@@ -782,19 +776,19 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
for (page_idx = 1; page_idx < page_cnt; page_idx++) {
pgoff_t page_offset = offset + page_idx;
- struct page *page;
+ struct folio *folio;
if (page_offset > end_index)
break;
- page = pagecache_get_page(mapping, page_offset,
+ folio = __filemap_get_folio(mapping, page_offset,
FGP_LOCK|FGP_ACCESSED|FGP_CREAT|FGP_NOWAIT,
ra_gfp_mask);
- if (!page)
+ if (IS_ERR(folio))
break;
- if (!PageUptodate(page))
- err = populate_page(c, page, bu, &n);
- unlock_page(page);
- put_page(page);
+ if (!folio_test_uptodate(folio))
+ err = populate_page(c, folio, bu, &n);
+ folio_unlock(folio);
+ folio_put(folio);
if (err)
break;
}
@@ -817,7 +811,7 @@ out_bu_off:
/**
* ubifs_bulk_read - determine whether to bulk-read and, if so, do it.
- * @page: page from which to start bulk-read.
+ * @folio: folio from which to start bulk-read.
*
* Some flash media are capable of reading sequentially at faster rates. UBIFS
* bulk-read facility is designed to take advantage of that, by reading in one
@@ -826,12 +820,12 @@ out_bu_off:
*
* Returns: %1 if a bulk-read is done and %0 otherwise.
*/
-static int ubifs_bulk_read(struct page *page)
+static int ubifs_bulk_read(struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_inode *ui = ubifs_inode(inode);
- pgoff_t index = page->index, last_page_read = ui->last_page_read;
+ pgoff_t index = folio->index, last_page_read = ui->last_page_read;
struct bu_info *bu;
int err = 0, allocated = 0;
@@ -879,8 +873,8 @@ static int ubifs_bulk_read(struct page *page)
bu->buf_len = c->max_bu_buf_len;
data_key_init(c, &bu->key, inode->i_ino,
- page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT);
- err = ubifs_do_bulk_read(c, bu, page);
+ folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT);
+ err = ubifs_do_bulk_read(c, bu, folio);
if (!allocated)
mutex_unlock(&c->bu_mutex);
@@ -894,69 +888,71 @@ out_unlock:
static int ubifs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
-
- if (ubifs_bulk_read(page))
+ if (ubifs_bulk_read(folio))
return 0;
- do_readpage(page);
+ do_readpage(folio);
folio_unlock(folio);
return 0;
}
-static int do_writepage(struct page *page, int len)
+static int do_writepage(struct folio *folio, size_t len)
{
- int err = 0, i, blen;
+ int err = 0, blen;
unsigned int block;
void *addr;
+ size_t offset = 0;
union ubifs_key key;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
#ifdef UBIFS_DEBUG
struct ubifs_inode *ui = ubifs_inode(inode);
spin_lock(&ui->ui_lock);
- ubifs_assert(c, page->index <= ui->synced_i_size >> PAGE_SHIFT);
+ ubifs_assert(c, folio->index <= ui->synced_i_size >> PAGE_SHIFT);
spin_unlock(&ui->ui_lock);
#endif
- /* Update radix tree tags */
- set_page_writeback(page);
+ folio_start_writeback(folio);
- addr = kmap(page);
- block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
- i = 0;
- while (len) {
- blen = min_t(int, len, UBIFS_BLOCK_SIZE);
+ addr = kmap_local_folio(folio, offset);
+ block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+ for (;;) {
+ blen = min_t(size_t, len, UBIFS_BLOCK_SIZE);
data_key_init(c, &key, inode->i_ino, block);
err = ubifs_jnl_write_data(c, inode, &key, addr, blen);
if (err)
break;
- if (++i >= UBIFS_BLOCKS_PER_PAGE)
+ len -= blen;
+ if (!len)
break;
block += 1;
addr += blen;
- len -= blen;
+ if (folio_test_highmem(folio) && !offset_in_page(addr)) {
+ kunmap_local(addr - blen);
+ offset += PAGE_SIZE;
+ addr = kmap_local_folio(folio, offset);
+ }
}
+ kunmap_local(addr);
if (err) {
- SetPageError(page);
- ubifs_err(c, "cannot write page %lu of inode %lu, error %d",
- page->index, inode->i_ino, err);
+ mapping_set_error(folio->mapping, err);
+ ubifs_err(c, "cannot write folio %lu of inode %lu, error %d",
+ folio->index, inode->i_ino, err);
ubifs_ro_mode(c, err);
}
- ubifs_assert(c, PagePrivate(page));
- if (PageChecked(page))
+ ubifs_assert(c, folio->private != NULL);
+ if (folio_test_checked(folio))
release_new_page_budget(c);
else
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- detach_page_private(page);
- ClearPageChecked(page);
+ folio_detach_private(folio);
+ folio_clear_checked(folio);
- kunmap(page);
- unlock_page(page);
- end_page_writeback(page);
+ folio_unlock(folio);
+ folio_end_writeback(folio);
return err;
}
@@ -1006,22 +1002,21 @@ static int do_writepage(struct page *page, int len)
* on the page lock and it would not write the truncated inode node to the
* journal before we have finished.
*/
-static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
+static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc,
+ void *data)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_inode *ui = ubifs_inode(inode);
loff_t i_size = i_size_read(inode), synced_i_size;
- pgoff_t end_index = i_size >> PAGE_SHIFT;
- int err, len = i_size & (PAGE_SIZE - 1);
- void *kaddr;
+ int err, len = folio_size(folio);
dbg_gen("ino %lu, pg %lu, pg flags %#lx",
- inode->i_ino, page->index, page->flags);
- ubifs_assert(c, PagePrivate(page));
+ inode->i_ino, folio->index, folio->flags);
+ ubifs_assert(c, folio->private != NULL);
- /* Is the page fully outside @i_size? (truncate in progress) */
- if (page->index > end_index || (page->index == end_index && !len)) {
+ /* Is the folio fully outside @i_size? (truncate in progress) */
+ if (folio_pos(folio) >= i_size) {
err = 0;
goto out_unlock;
}
@@ -1030,9 +1025,9 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
synced_i_size = ui->synced_i_size;
spin_unlock(&ui->ui_lock);
- /* Is the page fully inside @i_size? */
- if (page->index < end_index) {
- if (page->index >= synced_i_size >> PAGE_SHIFT) {
+ /* Is the folio fully inside i_size? */
+ if (folio_pos(folio) + len <= i_size) {
+ if (folio_pos(folio) >= synced_i_size) {
err = inode->i_sb->s_op->write_inode(inode, NULL);
if (err)
goto out_redirty;
@@ -1045,20 +1040,18 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
* with this.
*/
}
- return do_writepage(page, PAGE_SIZE);
+ return do_writepage(folio, len);
}
/*
- * The page straddles @i_size. It must be zeroed out on each and every
+ * The folio straddles @i_size. It must be zeroed out on each and every
* writepage invocation because it may be mmapped. "A file is mapped
* in multiples of the page size. For a file that is not a multiple of
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- kaddr = kmap_atomic(page);
- memset(kaddr + len, 0, PAGE_SIZE - len);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
+ len = i_size - folio_pos(folio);
+ folio_zero_segment(folio, len, folio_size(folio));
if (i_size > synced_i_size) {
err = inode->i_sb->s_op->write_inode(inode, NULL);
@@ -1066,19 +1059,25 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
goto out_redirty;
}
- return do_writepage(page, len);
+ return do_writepage(folio, len);
out_redirty:
/*
- * redirty_page_for_writepage() won't call ubifs_dirty_inode() because
+ * folio_redirty_for_writepage() won't call ubifs_dirty_inode() because
* it passes I_DIRTY_PAGES flag while calling __mark_inode_dirty(), so
* there is no need to do space budget for dirty inode.
*/
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
+static int ubifs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ return write_cache_pages(mapping, wbc, ubifs_writepage, NULL);
+}
+
/**
* do_attr_changes - change inode attributes.
* @inode: inode to change attributes for
@@ -1155,11 +1154,11 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
if (offset) {
pgoff_t index = new_size >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
- page = find_lock_page(inode->i_mapping, index);
- if (page) {
- if (PageDirty(page)) {
+ folio = filemap_lock_folio(inode->i_mapping, index);
+ if (!IS_ERR(folio)) {
+ if (folio_test_dirty(folio)) {
/*
* 'ubifs_jnl_truncate()' will try to truncate
* the last data node, but it contains
@@ -1168,14 +1167,14 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
* 'ubifs_jnl_truncate()' will see an already
* truncated (and up to date) data node.
*/
- ubifs_assert(c, PagePrivate(page));
+ ubifs_assert(c, folio->private != NULL);
- clear_page_dirty_for_io(page);
+ folio_clear_dirty_for_io(folio);
if (UBIFS_BLOCKS_PER_PAGE_SHIFT)
- offset = new_size &
- (PAGE_SIZE - 1);
- err = do_writepage(page, offset);
- put_page(page);
+ offset = offset_in_folio(folio,
+ new_size);
+ err = do_writepage(folio, offset);
+ folio_put(folio);
if (err)
goto out_budg;
/*
@@ -1188,8 +1187,8 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
* to 'ubifs_jnl_truncate()' to save it from
* having to read it.
*/
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
}
}
@@ -1512,14 +1511,14 @@ static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags)
*/
static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
struct inode *inode = file_inode(vmf->vma->vm_file);
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct timespec64 now = current_time(inode);
struct ubifs_budget_req req = { .new_page = 1 };
int err, update_time;
- dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index,
+ dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, folio->index,
i_size_read(inode));
ubifs_assert(c, !c->ro_media && !c->ro_mount);
@@ -1527,17 +1526,17 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
return VM_FAULT_SIGBUS; /* -EROFS */
/*
- * We have not locked @page so far so we may budget for changing the
- * page. Note, we cannot do this after we locked the page, because
+ * We have not locked @folio so far so we may budget for changing the
+ * folio. Note, we cannot do this after we locked the folio, because
* budgeting may cause write-back which would cause deadlock.
*
- * At the moment we do not know whether the page is dirty or not, so we
- * assume that it is not and budget for a new page. We could look at
+ * At the moment we do not know whether the folio is dirty or not, so we
+ * assume that it is not and budget for a new folio. We could look at
* the @PG_private flag and figure this out, but we may race with write
- * back and the page state may change by the time we lock it, so this
+ * back and the folio state may change by the time we lock it, so this
* would need additional care. We do not bother with this at the
* moment, although it might be good idea to do. Instead, we allocate
- * budget for a new page and amend it later on if the page was in fact
+ * budget for a new folio and amend it later on if the folio was in fact
* dirty.
*
* The budgeting-related logic of this function is similar to what we
@@ -1560,21 +1559,21 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
- lock_page(page);
- if (unlikely(page->mapping != inode->i_mapping ||
- page_offset(page) > i_size_read(inode))) {
- /* Page got truncated out from underneath us */
+ folio_lock(folio);
+ if (unlikely(folio->mapping != inode->i_mapping ||
+ folio_pos(folio) >= i_size_read(inode))) {
+ /* Folio got truncated out from underneath us */
goto sigbus;
}
- if (PagePrivate(page))
+ if (folio->private)
release_new_page_budget(c);
else {
- if (!PageChecked(page))
+ if (!folio_test_checked(folio))
ubifs_convert_page_budget(c);
- attach_page_private(page, (void *)1);
+ folio_attach_private(folio, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
- __set_page_dirty_nobuffers(page);
+ filemap_dirty_folio(folio->mapping, folio);
}
if (update_time) {
@@ -1590,11 +1589,11 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
ubifs_release_dirty_inode_budget(c, ui);
}
- wait_for_stable_page(page);
+ folio_wait_stable(folio);
return VM_FAULT_LOCKED;
sigbus:
- unlock_page(page);
+ folio_unlock(folio);
ubifs_release_budget(c, &req);
return VM_FAULT_SIGBUS;
}
@@ -1648,7 +1647,7 @@ static int ubifs_symlink_getattr(struct mnt_idmap *idmap,
const struct address_space_operations ubifs_file_address_operations = {
.read_folio = ubifs_read_folio,
- .writepage = ubifs_writepage,
+ .writepages = ubifs_writepages,
.write_begin = ubifs_write_begin,
.write_end = ubifs_write_end,
.invalidate_folio = ubifs_invalidate_folio,
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 873e6e1c92b5..6ebf3c04ac5f 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -82,8 +82,9 @@ static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops)
*/
static int scan_for_dirty_cb(struct ubifs_info *c,
const struct ubifs_lprops *lprops, int in_tree,
- struct scan_data *data)
+ void *arg)
{
+ struct scan_data *data = arg;
int ret = LPT_SCAN_CONTINUE;
/* Exclude LEBs that are currently in use */
@@ -166,8 +167,7 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
data.pick_free = pick_free;
data.lnum = -1;
data.exclude_index = exclude_index;
- err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
- (ubifs_lpt_scan_callback)scan_for_dirty_cb,
+ err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_for_dirty_cb,
&data);
if (err)
return ERR_PTR(err);
@@ -349,8 +349,9 @@ out:
*/
static int scan_for_free_cb(struct ubifs_info *c,
const struct ubifs_lprops *lprops, int in_tree,
- struct scan_data *data)
+ void *arg)
{
+ struct scan_data *data = arg;
int ret = LPT_SCAN_CONTINUE;
/* Exclude LEBs that are currently in use */
@@ -446,7 +447,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
data.pick_free = pick_free;
data.lnum = -1;
err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
- (ubifs_lpt_scan_callback)scan_for_free_cb,
+ scan_for_free_cb,
&data);
if (err)
return ERR_PTR(err);
@@ -589,8 +590,9 @@ out:
*/
static int scan_for_idx_cb(struct ubifs_info *c,
const struct ubifs_lprops *lprops, int in_tree,
- struct scan_data *data)
+ void *arg)
{
+ struct scan_data *data = arg;
int ret = LPT_SCAN_CONTINUE;
/* Exclude LEBs that are currently in use */
@@ -625,8 +627,7 @@ static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c)
int err;
data.lnum = -1;
- err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
- (ubifs_lpt_scan_callback)scan_for_idx_cb,
+ err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_for_idx_cb,
&data);
if (err)
return ERR_PTR(err);
@@ -726,11 +727,10 @@ out:
return err;
}
-static int cmp_dirty_idx(const struct ubifs_lprops **a,
- const struct ubifs_lprops **b)
+static int cmp_dirty_idx(const void *a, const void *b)
{
- const struct ubifs_lprops *lpa = *a;
- const struct ubifs_lprops *lpb = *b;
+ const struct ubifs_lprops *lpa = *(const struct ubifs_lprops **)a;
+ const struct ubifs_lprops *lpb = *(const struct ubifs_lprops **)b;
return lpa->dirty + lpa->free - lpb->dirty - lpb->free;
}
@@ -754,7 +754,7 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
sizeof(void *) * c->dirty_idx.cnt);
/* Sort it so that the dirtiest is now at the end */
sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *),
- (int (*)(const void *, const void *))cmp_dirty_idx, NULL);
+ cmp_dirty_idx, NULL);
dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt);
if (c->dirty_idx.cnt)
dbg_find("dirtiest index LEB is %d with dirty %d and free %d",
@@ -782,8 +782,9 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
*/
static int scan_dirty_idx_cb(struct ubifs_info *c,
const struct ubifs_lprops *lprops, int in_tree,
- struct scan_data *data)
+ void *arg)
{
+ struct scan_data *data = arg;
int ret = LPT_SCAN_CONTINUE;
/* Exclude LEBs that are currently in use */
@@ -842,8 +843,7 @@ static int find_dirty_idx_leb(struct ubifs_info *c)
if (c->pnodes_have >= c->pnode_cnt)
/* All pnodes are in memory, so skip scan */
return -ENOSPC;
- err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
- (ubifs_lpt_scan_callback)scan_dirty_idx_cb,
+ err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_dirty_idx_cb,
&data);
if (err)
return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index f0a5538c84b0..74aee92433d7 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -293,6 +293,96 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
}
/**
+ * __queue_and_wait - queue a task and wait until the task is woken up.
+ * @c: UBIFS file-system description object
+ *
+ * This function queues the current task and waits until it is woken up. It
+ * must be called with @c->reserve_space_wq.lock held.
+ */
+static void __queue_and_wait(struct ubifs_info *c)
+{
+ DEFINE_WAIT(wait);
+
+ __add_wait_queue_entry_tail_exclusive(&c->reserve_space_wq, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&c->reserve_space_wq.lock);
+
+ schedule();
+ finish_wait(&c->reserve_space_wq, &wait);
+}
+
+/**
+ * wait_for_reservation - try queuing current task to wait until woken up.
+ * @c: UBIFS file-system description object
+ *
+ * This function queues the current task to wait until woken up, if queuing
+ * has started (@c->need_wait_space is not %0). Returns %true if the current
+ * task was added to the queue, otherwise %false.
+ */
+static bool wait_for_reservation(struct ubifs_info *c)
+{
+ if (likely(atomic_read(&c->need_wait_space) == 0))
+ /* Quick path to check whether queuing is started. */
+ return false;
+
+ spin_lock(&c->reserve_space_wq.lock);
+ if (atomic_read(&c->need_wait_space) == 0) {
+ /* Queuing is not started, don't queue current task. */
+ spin_unlock(&c->reserve_space_wq.lock);
+ return false;
+ }
+
+ __queue_and_wait(c);
+ return true;
+}
+
+/**
+ * wake_up_reservation - wake up first task in queue or stop queuing.
+ * @c: UBIFS file-system description object
+ *
+ * This function wakes up the first task in queue if it exists, or stops
+ * queuing if no tasks in queue.
+ */
+static void wake_up_reservation(struct ubifs_info *c)
+{
+ spin_lock(&c->reserve_space_wq.lock);
+ if (waitqueue_active(&c->reserve_space_wq))
+ wake_up_locked(&c->reserve_space_wq);
+ else
+ /*
+ * Like the check in wait_for_reservation(), this sets
+ * @c->need_wait_space under the wait queue lock, so that it
+ * cannot be reset to 0 after a new task has been queued.
+ */
+ atomic_set(&c->need_wait_space, 0);
+ spin_unlock(&c->reserve_space_wq.lock);
+}
+
+/**
+ * add_or_start_queue - add current task in queue or start queuing.
+ * @c: UBIFS file-system description object
+ *
+ * This function starts queuing if queuing is not started, otherwise adds
+ * current task in queue.
+ */
+static void add_or_start_queue(struct ubifs_info *c)
+{
+ spin_lock(&c->reserve_space_wq.lock);
+ if (atomic_cmpxchg(&c->need_wait_space, 0, 1) == 0) {
+ /* Starts queuing, task can go on directly. */
+ spin_unlock(&c->reserve_space_wq.lock);
+ return;
+ }
+
+ /*
+ * At least two tasks have retried more than 32 times at this
+ * point; the first task has already started queuing, so just
+ * queue the remaining tasks.
+ */
+ __queue_and_wait(c);
+}
+
+/**
* make_reservation - reserve journal space.
* @c: UBIFS file-system description object
* @jhead: journal head
@@ -311,33 +401,27 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
static int make_reservation(struct ubifs_info *c, int jhead, int len)
{
int err, cmt_retries = 0, nospc_retries = 0;
+ bool blocked = wait_for_reservation(c);
again:
down_read(&c->commit_sem);
err = reserve_space(c, jhead, len);
- if (!err)
+ if (!err) {
/* c->commit_sem will get released via finish_reservation(). */
- return 0;
+ goto out_wake_up;
+ }
up_read(&c->commit_sem);
if (err == -ENOSPC) {
/*
* GC could not make any progress. We should try to commit
- * once because it could make some dirty space and GC would
- * make progress, so make the error -EAGAIN so that the below
+ * because it could make some dirty space and GC would make
+ * progress, so make the error -EAGAIN so that the below
* will commit and re-try.
*/
- if (nospc_retries++ < 2) {
- dbg_jnl("no space, retry");
- err = -EAGAIN;
- }
-
- /*
- * This means that the budgeting is incorrect. We always have
- * to be able to write to the media, because all operations are
- * budgeted. Deletions are not budgeted, though, but we reserve
- * an extra LEB for them.
- */
+ nospc_retries++;
+ dbg_jnl("no space, retry");
+ err = -EAGAIN;
}
if (err != -EAGAIN)
@@ -349,15 +433,37 @@ again:
*/
if (cmt_retries > 128) {
/*
- * This should not happen unless the journal size limitations
- * are too tough.
+ * This should not happen unless:
+ * 1. The journal size limitations are too tough.
+ * 2. The budgeting is incorrect. We always have to be able to
+ * write to the media, because all operations are budgeted.
+ * Deletions are not budgeted, though, but we reserve an
+ * extra LEB for them.
*/
- ubifs_err(c, "stuck in space allocation");
+ ubifs_err(c, "stuck in space allocation, nospc_retries %d",
+ nospc_retries);
err = -ENOSPC;
goto out;
- } else if (cmt_retries > 32)
- ubifs_warn(c, "too many space allocation re-tries (%d)",
- cmt_retries);
+ } else if (cmt_retries > 32) {
+ /*
+ * This is very unlikely to happen, unless many tasks are
+ * making reservations concurrently and one task has retried
+ * gc + commit many times while the space made available
+ * during that period was grabbed by other tasks.
+ * If it does happen, start queuing all tasks that want to
+ * make a space reservation, so that only one task makes a
+ * space reservation at any time; it will always succeed
+ * provided that the budgeting is correct.
+ */
+ ubifs_warn(c, "too many space allocation cmt_retries (%d) "
+ "nospc_retries (%d), start queuing tasks",
+ cmt_retries, nospc_retries);
+
+ if (!blocked) {
+ blocked = true;
+ add_or_start_queue(c);
+ }
+ }
dbg_jnl("-EAGAIN, commit and retry (retried %d times)",
cmt_retries);
@@ -365,7 +471,7 @@ again:
err = ubifs_run_commit(c);
if (err)
- return err;
+ goto out_wake_up;
goto again;
out:
@@ -380,6 +486,27 @@ out:
cmt_retries = dbg_check_lprops(c);
up_write(&c->commit_sem);
}
+out_wake_up:
+ if (blocked) {
+ /*
+ * Only tasks that have started queuing or have been queued may
+ * wake up other queued tasks, which ensures that only one task
+ * is woken up to make a space reservation at a time.
+ * For example:
+ * task A task B task C
+ * make_reservation make_reservation
+ * reserve_space // 0
+ * wake_up_reservation
+ * atomic_cmpxchg // 0, start queuing
+ * reserve_space
+ * wait_for_reservation
+ * __queue_and_wait
+ * add_wait_queue
+ * if (blocked) // false
+ * // So that task C won't be waked up to race with task B
+ */
+ wake_up_reservation(c);
+ }
return err;
}
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 6d6cd85c2b4c..a11c3dab7e16 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1014,8 +1014,9 @@ out:
*/
static int scan_check_cb(struct ubifs_info *c,
const struct ubifs_lprops *lp, int in_tree,
- struct ubifs_lp_stats *lst)
+ void *arg)
{
+ struct ubifs_lp_stats *lst = arg;
struct ubifs_scan_leb *sleb;
struct ubifs_scan_node *snod;
int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
@@ -1269,8 +1270,7 @@ int dbg_check_lprops(struct ubifs_info *c)
memset(&lst, 0, sizeof(struct ubifs_lp_stats));
err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
- (ubifs_lpt_scan_callback)scan_check_cb,
- &lst);
+ scan_check_cb, &lst);
if (err && err != -ENOSPC)
goto out;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index c4d079328b92..07351fdce722 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1646,7 +1646,6 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
len -= node_len;
}
- err = 0;
out:
vfree(buf);
return err;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 09e270d6ed02..291583005dd1 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2151,6 +2151,8 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
mutex_init(&c->bu_mutex);
mutex_init(&c->write_reserve_mutex);
init_waitqueue_head(&c->cmt_wq);
+ init_waitqueue_head(&c->reserve_space_wq);
+ atomic_set(&c->need_wait_space, 0);
c->buds = RB_ROOT;
c->old_idx = RB_ROOT;
c->size_tree = RB_ROOT;
@@ -2239,13 +2241,14 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
goto out_umount;
}
+ generic_set_sb_d_ops(sb);
sb->s_root = d_make_root(root);
if (!sb->s_root) {
err = -ENOMEM;
goto out_umount;
}
- import_uuid(&sb->s_uuid, c->uuid);
+ super_set_uuid(sb, c->uuid, sizeof(c->uuid));
mutex_unlock(&c->umount_mutex);
return 0;
@@ -2433,8 +2436,8 @@ static int __init ubifs_init(void)
ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
sizeof(struct ubifs_inode), 0,
- SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT |
- SLAB_ACCOUNT, &inode_slab_ctor);
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
+ &inode_slab_ctor);
if (!ubifs_inode_slab)
return -ENOMEM;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f4728e65d1bd..45cacdcd4746 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -3116,14 +3116,7 @@ static void tnc_destroy_cnext(struct ubifs_info *c)
void ubifs_tnc_close(struct ubifs_info *c)
{
tnc_destroy_cnext(c);
- if (c->zroot.znode) {
- long n, freed;
-
- n = atomic_long_read(&c->clean_zn_cnt);
- freed = ubifs_destroy_tnc_subtree(c, c->zroot.znode);
- ubifs_assert(c, freed == n);
- atomic_long_sub(n, &ubifs_clean_zn_cnt);
- }
+ ubifs_destroy_tnc_tree(c);
kfree(c->gap_lebs);
kfree(c->ilebs);
destroy_old_idx(c);
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index 4d686e34e64d..d3f8a6aa1f49 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -251,6 +251,28 @@ long ubifs_destroy_tnc_subtree(const struct ubifs_info *c,
}
/**
+ * ubifs_destroy_tnc_tree - destroy all znodes connected to the TNC tree.
+ * @c: UBIFS file-system description object
+ *
+ * This function destroys the whole TNC tree and updates clean global znode
+ * count.
+ */
+void ubifs_destroy_tnc_tree(struct ubifs_info *c)
+{
+ long n, freed;
+
+ if (!c->zroot.znode)
+ return;
+
+ n = atomic_long_read(&c->clean_zn_cnt);
+ freed = ubifs_destroy_tnc_subtree(c, c->zroot.znode);
+ ubifs_assert(c, freed == n);
+ atomic_long_sub(n, &ubifs_clean_zn_cnt);
+
+ c->zroot.znode = NULL;
+}
+
+/**
* read_znode - read an indexing node from flash and fill znode.
* @c: UBIFS file-system description object
* @zzbr: the zbranch describing the node to read
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 3916dc4f30ca..1f3ea879d93a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1047,6 +1047,8 @@ struct ubifs_debug_info;
* @bg_bud_bytes: number of bud bytes when background commit is initiated
* @old_buds: buds to be released after commit ends
* @max_bud_cnt: maximum number of buds
+ * @need_wait_space: Non %0 means space reservation tasks need to wait in queue
+ * @reserve_space_wq: wait queue to sleep on if @need_wait_space is not %0
*
* @commit_sem: synchronizes committer with other processes
* @cmt_state: commit state
@@ -1305,6 +1307,8 @@ struct ubifs_info {
long long bg_bud_bytes;
struct list_head old_buds;
int max_bud_cnt;
+ atomic_t need_wait_space;
+ wait_queue_head_t reserve_space_wq;
struct rw_semaphore commit_sem;
int cmt_state;
@@ -1903,6 +1907,7 @@ struct ubifs_znode *ubifs_tnc_postorder_next(const struct ubifs_info *c,
struct ubifs_znode *znode);
long ubifs_destroy_tnc_subtree(const struct ubifs_info *c,
struct ubifs_znode *zr);
+void ubifs_destroy_tnc_tree(struct ubifs_info *c);
struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
struct ubifs_zbranch *zbr,
struct ubifs_znode *parent, int iip);
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index f6533f93851b..f94f45fe2c91 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -67,7 +67,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
pos_valid = true;
}
- fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+ fname = kmalloc(UDF_NAME_LEN, GFP_KERNEL);
if (!fname) {
ret = -ENOMEM;
goto out;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index d8493449d4c5..2f831a3a91af 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -357,7 +357,7 @@ int udf_expand_file_adinicb(struct inode *inode)
return 0;
}
- page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, 0, GFP_KERNEL);
if (!page)
return -ENOMEM;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 1bb6ed948927..1308109fd42d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -59,7 +59,7 @@ static int udf_fiiter_find_entry(struct inode *dir, const struct qstr *child,
child->name[0] == '.' && child->name[1] == '.';
int ret;
- fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+ fname = kmalloc(UDF_NAME_LEN, GFP_KERNEL);
if (!fname)
return -ENOMEM;
@@ -566,7 +566,7 @@ out:
static int udf_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
- struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777);
+ struct inode *inode;
struct pathComponent *pc;
const char *compstart;
struct extent_position epos = {};
@@ -579,17 +579,20 @@ static int udf_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct udf_inode_info *iinfo;
struct super_block *sb = dir->i_sb;
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- iinfo = UDF_I(inode);
- down_write(&iinfo->i_data_sem);
- name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS);
+ name = kmalloc(UDF_NAME_LEN_CS0, GFP_KERNEL);
if (!name) {
err = -ENOMEM;
- goto out_no_entry;
+ goto out;
+ }
+
+ inode = udf_new_inode(dir, S_IFLNK | 0777);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out;
}
+ iinfo = UDF_I(inode);
+ down_write(&iinfo->i_data_sem);
inode->i_data.a_ops = &udf_symlink_aops;
inode->i_op = &udf_symlink_inode_operations;
inode_nohighmem(inode);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 928a04d9d9e0..2217f7ed7a49 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -40,20 +40,20 @@
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/parser.h>
#include <linux/stat.h>
#include <linux/cdrom.h>
#include <linux/nls.h>
#include <linux/vfs.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>
-#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/bitmap.h>
#include <linux/crc-itu-t.h>
#include <linux/log2.h>
#include <asm/byteorder.h>
#include <linux/iversion.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include "udf_sb.h"
#include "udf_i.h"
@@ -91,16 +91,20 @@ enum { UDF_MAX_LINKS = 0xffff };
#define UDF_MAX_FILESIZE (1ULL << 42)
/* These are the "meat" - everything else is stuffing */
-static int udf_fill_super(struct super_block *, void *, int);
+static int udf_fill_super(struct super_block *sb, struct fs_context *fc);
static void udf_put_super(struct super_block *);
static int udf_sync_fs(struct super_block *, int);
-static int udf_remount_fs(struct super_block *, int *, char *);
static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
static void udf_open_lvid(struct super_block *);
static void udf_close_lvid(struct super_block *);
static unsigned int udf_count_free(struct super_block *);
static int udf_statfs(struct dentry *, struct kstatfs *);
static int udf_show_options(struct seq_file *, struct dentry *);
+static int udf_init_fs_context(struct fs_context *fc);
+static int udf_parse_param(struct fs_context *fc, struct fs_parameter *param);
+static int udf_reconfigure(struct fs_context *fc);
+static void udf_free_fc(struct fs_context *fc);
+static const struct fs_parameter_spec udf_param_spec[];
struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb)
{
@@ -119,18 +123,25 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb)
}
/* UDF filesystem type */
-static struct dentry *udf_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int udf_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super);
+ return get_tree_bdev(fc, udf_fill_super);
}
+static const struct fs_context_operations udf_context_ops = {
+ .parse_param = udf_parse_param,
+ .get_tree = udf_get_tree,
+ .reconfigure = udf_reconfigure,
+ .free = udf_free_fc,
+};
+
static struct file_system_type udf_fstype = {
.owner = THIS_MODULE,
.name = "udf",
- .mount = udf_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = udf_init_fs_context,
+ .parameters = udf_param_spec,
};
MODULE_ALIAS_FS("udf");
@@ -177,7 +188,6 @@ static int __init init_inodecache(void)
udf_inode_cachep = kmem_cache_create("udf_inode_cache",
sizeof(struct udf_inode_info),
0, (SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD |
SLAB_ACCOUNT),
init_once);
if (!udf_inode_cachep)
@@ -204,12 +214,10 @@ static const struct super_operations udf_sb_ops = {
.put_super = udf_put_super,
.sync_fs = udf_sync_fs,
.statfs = udf_statfs,
- .remount_fs = udf_remount_fs,
.show_options = udf_show_options,
};
struct udf_options {
- unsigned char novrs;
unsigned int blocksize;
unsigned int session;
unsigned int lastblock;
@@ -223,6 +231,65 @@ struct udf_options {
struct nls_table *nls_map;
};
+/*
+ * UDF has historically preserved prior mount options across
+ * a remount, so copy those here if remounting, otherwise set
+ * initial mount defaults.
+ */
+static void udf_init_options(struct fs_context *fc, struct udf_options *uopt)
+{
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+ struct udf_sb_info *sbi = UDF_SB(sb);
+
+ uopt->flags = sbi->s_flags;
+ uopt->uid = sbi->s_uid;
+ uopt->gid = sbi->s_gid;
+ uopt->umask = sbi->s_umask;
+ uopt->fmode = sbi->s_fmode;
+ uopt->dmode = sbi->s_dmode;
+ uopt->nls_map = NULL;
+ } else {
+ uopt->flags = (1 << UDF_FLAG_USE_AD_IN_ICB) |
+ (1 << UDF_FLAG_STRICT);
+ /*
+ * By default we'll use overflow[ug]id when UDF
+ * inode [ug]id == -1
+ */
+ uopt->uid = make_kuid(current_user_ns(), overflowuid);
+ uopt->gid = make_kgid(current_user_ns(), overflowgid);
+ uopt->umask = 0;
+ uopt->fmode = UDF_INVALID_MODE;
+ uopt->dmode = UDF_INVALID_MODE;
+ uopt->nls_map = NULL;
+ uopt->session = 0xFFFFFFFF;
+ }
+}
+
+static int udf_init_fs_context(struct fs_context *fc)
+{
+ struct udf_options *uopt;
+
+ uopt = kzalloc(sizeof(*uopt), GFP_KERNEL);
+ if (!uopt)
+ return -ENOMEM;
+
+ udf_init_options(fc, uopt);
+
+ fc->fs_private = uopt;
+ fc->ops = &udf_context_ops;
+
+ return 0;
+}
+
+static void udf_free_fc(struct fs_context *fc)
+{
+ struct udf_options *uopt = fc->fs_private;
+
+ unload_nls(uopt->nls_map);
+ kfree(fc->fs_private);
+}
+
static int __init init_udf_fs(void)
{
int err;
@@ -358,7 +425,7 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root)
}
/*
- * udf_parse_options
+ * udf_parse_param
*
* PURPOSE
* Parse mount options.
@@ -401,12 +468,12 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root)
* yield highly unpredictable results.
*
* PRE-CONDITIONS
- * options Pointer to mount options string.
- * uopts Pointer to mount options variable.
+ * fc fs_context with pointer to mount options variable.
+ * param Pointer to fs_parameter being parsed.
*
* POST-CONDITIONS
- * <return> 1 Mount options parsed okay.
- * <return> 0 Error parsing mount options.
+ * <return> 0 Mount options parsed okay.
+ * <return> errno Error parsing mount options.
*
* HISTORY
* July 1, 1997 - Andrew E. Mileski
@@ -418,229 +485,193 @@ enum {
Opt_noadinicb, Opt_adinicb, Opt_shortad, Opt_longad,
Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
- Opt_rootdir, Opt_utf8, Opt_iocharset,
- Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore,
- Opt_fmode, Opt_dmode
-};
-
-static const match_table_t tokens = {
- {Opt_novrs, "novrs"},
- {Opt_nostrict, "nostrict"},
- {Opt_bs, "bs=%u"},
- {Opt_unhide, "unhide"},
- {Opt_undelete, "undelete"},
- {Opt_noadinicb, "noadinicb"},
- {Opt_adinicb, "adinicb"},
- {Opt_shortad, "shortad"},
- {Opt_longad, "longad"},
- {Opt_uforget, "uid=forget"},
- {Opt_uignore, "uid=ignore"},
- {Opt_gforget, "gid=forget"},
- {Opt_gignore, "gid=ignore"},
- {Opt_gid, "gid=%u"},
- {Opt_uid, "uid=%u"},
- {Opt_umask, "umask=%o"},
- {Opt_session, "session=%u"},
- {Opt_lastblock, "lastblock=%u"},
- {Opt_anchor, "anchor=%u"},
- {Opt_volume, "volume=%u"},
- {Opt_partition, "partition=%u"},
- {Opt_fileset, "fileset=%u"},
- {Opt_rootdir, "rootdir=%u"},
- {Opt_utf8, "utf8"},
- {Opt_iocharset, "iocharset=%s"},
- {Opt_fmode, "mode=%o"},
- {Opt_dmode, "dmode=%o"},
- {Opt_err, NULL}
+ Opt_rootdir, Opt_utf8, Opt_iocharset, Opt_err, Opt_fmode, Opt_dmode
};
-static int udf_parse_options(char *options, struct udf_options *uopt,
- bool remount)
+static const struct fs_parameter_spec udf_param_spec[] = {
+ fsparam_flag ("novrs", Opt_novrs),
+ fsparam_flag ("nostrict", Opt_nostrict),
+ fsparam_u32 ("bs", Opt_bs),
+ fsparam_flag ("unhide", Opt_unhide),
+ fsparam_flag ("undelete", Opt_undelete),
+ fsparam_flag_no ("adinicb", Opt_adinicb),
+ fsparam_flag ("shortad", Opt_shortad),
+ fsparam_flag ("longad", Opt_longad),
+ fsparam_string ("gid", Opt_gid),
+ fsparam_string ("uid", Opt_uid),
+ fsparam_u32 ("umask", Opt_umask),
+ fsparam_u32 ("session", Opt_session),
+ fsparam_u32 ("lastblock", Opt_lastblock),
+ fsparam_u32 ("anchor", Opt_anchor),
+ fsparam_u32 ("volume", Opt_volume),
+ fsparam_u32 ("partition", Opt_partition),
+ fsparam_u32 ("fileset", Opt_fileset),
+ fsparam_u32 ("rootdir", Opt_rootdir),
+ fsparam_flag ("utf8", Opt_utf8),
+ fsparam_string ("iocharset", Opt_iocharset),
+ fsparam_u32 ("mode", Opt_fmode),
+ fsparam_u32 ("dmode", Opt_dmode),
+ {}
+ };
+
+static int udf_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- int option;
unsigned int uv;
-
- uopt->novrs = 0;
- uopt->session = 0xFFFFFFFF;
- uopt->lastblock = 0;
- uopt->anchor = 0;
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- unsigned n;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_novrs:
- uopt->novrs = 1;
- break;
- case Opt_bs:
- if (match_int(&args[0], &option))
- return 0;
- n = option;
- if (n != 512 && n != 1024 && n != 2048 && n != 4096)
- return 0;
- uopt->blocksize = n;
- uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
- break;
- case Opt_unhide:
- uopt->flags |= (1 << UDF_FLAG_UNHIDE);
- break;
- case Opt_undelete:
- uopt->flags |= (1 << UDF_FLAG_UNDELETE);
- break;
- case Opt_noadinicb:
+ unsigned int n;
+ struct udf_options *uopt = fc->fs_private;
+ struct fs_parse_result result;
+ int token;
+ bool remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE);
+
+ token = fs_parse(fc, udf_param_spec, param, &result);
+ if (token < 0)
+ return token;
+
+ switch (token) {
+ case Opt_novrs:
+ uopt->flags |= (1 << UDF_FLAG_NOVRS);
+ break;
+ case Opt_bs:
+ n = result.uint_32;
+ if (n != 512 && n != 1024 && n != 2048 && n != 4096)
+ return -EINVAL;
+ uopt->blocksize = n;
+ uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
+ break;
+ case Opt_unhide:
+ uopt->flags |= (1 << UDF_FLAG_UNHIDE);
+ break;
+ case Opt_undelete:
+ uopt->flags |= (1 << UDF_FLAG_UNDELETE);
+ break;
+ case Opt_adinicb:
+ if (result.negated)
uopt->flags &= ~(1 << UDF_FLAG_USE_AD_IN_ICB);
- break;
- case Opt_adinicb:
+ else
uopt->flags |= (1 << UDF_FLAG_USE_AD_IN_ICB);
- break;
- case Opt_shortad:
- uopt->flags |= (1 << UDF_FLAG_USE_SHORT_AD);
- break;
- case Opt_longad:
- uopt->flags &= ~(1 << UDF_FLAG_USE_SHORT_AD);
- break;
- case Opt_gid:
- if (match_uint(args, &uv))
- return 0;
- uopt->gid = make_kgid(current_user_ns(), uv);
- if (!gid_valid(uopt->gid))
- return 0;
+ break;
+ case Opt_shortad:
+ uopt->flags |= (1 << UDF_FLAG_USE_SHORT_AD);
+ break;
+ case Opt_longad:
+ uopt->flags &= ~(1 << UDF_FLAG_USE_SHORT_AD);
+ break;
+ case Opt_gid:
+ if (kstrtoint(param->string, 10, &uv) == 0) {
+ kgid_t gid = make_kgid(current_user_ns(), uv);
+ if (!gid_valid(gid))
+ return -EINVAL;
+ uopt->gid = gid;
uopt->flags |= (1 << UDF_FLAG_GID_SET);
- break;
- case Opt_uid:
- if (match_uint(args, &uv))
- return 0;
- uopt->uid = make_kuid(current_user_ns(), uv);
- if (!uid_valid(uopt->uid))
- return 0;
+ } else if (!strcmp(param->string, "forget")) {
+ uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
+ } else if (!strcmp(param->string, "ignore")) {
+ /* this option is superseded by gid=<number> */
+ ;
+ } else {
+ return -EINVAL;
+ }
+ break;
+ case Opt_uid:
+ if (kstrtoint(param->string, 10, &uv) == 0) {
+ kuid_t uid = make_kuid(current_user_ns(), uv);
+ if (!uid_valid(uid))
+ return -EINVAL;
+ uopt->uid = uid;
uopt->flags |= (1 << UDF_FLAG_UID_SET);
- break;
- case Opt_umask:
- if (match_octal(args, &option))
- return 0;
- uopt->umask = option;
- break;
- case Opt_nostrict:
- uopt->flags &= ~(1 << UDF_FLAG_STRICT);
- break;
- case Opt_session:
- if (match_int(args, &option))
- return 0;
- uopt->session = option;
- if (!remount)
- uopt->flags |= (1 << UDF_FLAG_SESSION_SET);
- break;
- case Opt_lastblock:
- if (match_int(args, &option))
- return 0;
- uopt->lastblock = option;
- if (!remount)
- uopt->flags |= (1 << UDF_FLAG_LASTBLOCK_SET);
- break;
- case Opt_anchor:
- if (match_int(args, &option))
- return 0;
- uopt->anchor = option;
- break;
- case Opt_volume:
- case Opt_partition:
- case Opt_fileset:
- case Opt_rootdir:
- /* Ignored (never implemented properly) */
- break;
- case Opt_utf8:
- if (!remount) {
- unload_nls(uopt->nls_map);
- uopt->nls_map = NULL;
- }
- break;
- case Opt_iocharset:
- if (!remount) {
- unload_nls(uopt->nls_map);
- uopt->nls_map = NULL;
- }
- /* When nls_map is not loaded then UTF-8 is used */
- if (!remount && strcmp(args[0].from, "utf8") != 0) {
- uopt->nls_map = load_nls(args[0].from);
- if (!uopt->nls_map) {
- pr_err("iocharset %s not found\n",
- args[0].from);
- return 0;
- }
- }
- break;
- case Opt_uforget:
+ } else if (!strcmp(param->string, "forget")) {
uopt->flags |= (1 << UDF_FLAG_UID_FORGET);
- break;
- case Opt_uignore:
- case Opt_gignore:
- /* These options are superseeded by uid=<number> */
- break;
- case Opt_gforget:
- uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
- break;
- case Opt_fmode:
- if (match_octal(args, &option))
- return 0;
- uopt->fmode = option & 0777;
- break;
- case Opt_dmode:
- if (match_octal(args, &option))
- return 0;
- uopt->dmode = option & 0777;
- break;
- default:
- pr_err("bad mount option \"%s\" or missing value\n", p);
- return 0;
+ } else if (!strcmp(param->string, "ignore")) {
+ /* this option is superseded by uid=<number> */
+ ;
+ } else {
+ return -EINVAL;
+ }
+ break;
+ case Opt_umask:
+ uopt->umask = result.uint_32;
+ break;
+ case Opt_nostrict:
+ uopt->flags &= ~(1 << UDF_FLAG_STRICT);
+ break;
+ case Opt_session:
+ uopt->session = result.uint_32;
+ if (!remount)
+ uopt->flags |= (1 << UDF_FLAG_SESSION_SET);
+ break;
+ case Opt_lastblock:
+ uopt->lastblock = result.uint_32;
+ if (!remount)
+ uopt->flags |= (1 << UDF_FLAG_LASTBLOCK_SET);
+ break;
+ case Opt_anchor:
+ uopt->anchor = result.uint_32;
+ break;
+ case Opt_volume:
+ case Opt_partition:
+ case Opt_fileset:
+ case Opt_rootdir:
+ /* Ignored (never implemented properly) */
+ break;
+ case Opt_utf8:
+ if (!remount) {
+ unload_nls(uopt->nls_map);
+ uopt->nls_map = NULL;
}
+ break;
+ case Opt_iocharset:
+ if (!remount) {
+ unload_nls(uopt->nls_map);
+ uopt->nls_map = NULL;
+ }
+ /* When nls_map is not loaded then UTF-8 is used */
+ if (!remount && strcmp(param->string, "utf8") != 0) {
+ uopt->nls_map = load_nls(param->string);
+ if (!uopt->nls_map) {
+ errorf(fc, "iocharset %s not found",
+ param->string);
+ return -EINVAL;;
+ }
+ }
+ break;
+ case Opt_fmode:
+ uopt->fmode = result.uint_32 & 0777;
+ break;
+ case Opt_dmode:
+ uopt->dmode = result.uint_32 & 0777;
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
-static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
+static int udf_reconfigure(struct fs_context *fc)
{
- struct udf_options uopt;
+ struct udf_options *uopt = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
struct udf_sb_info *sbi = UDF_SB(sb);
+ int readonly = fc->sb_flags & SB_RDONLY;
int error = 0;
- if (!(*flags & SB_RDONLY) && UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
+ if (!readonly && UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
return -EACCES;
sync_filesystem(sb);
- uopt.flags = sbi->s_flags;
- uopt.uid = sbi->s_uid;
- uopt.gid = sbi->s_gid;
- uopt.umask = sbi->s_umask;
- uopt.fmode = sbi->s_fmode;
- uopt.dmode = sbi->s_dmode;
- uopt.nls_map = NULL;
-
- if (!udf_parse_options(options, &uopt, true))
- return -EINVAL;
-
write_lock(&sbi->s_cred_lock);
- sbi->s_flags = uopt.flags;
- sbi->s_uid = uopt.uid;
- sbi->s_gid = uopt.gid;
- sbi->s_umask = uopt.umask;
- sbi->s_fmode = uopt.fmode;
- sbi->s_dmode = uopt.dmode;
+ sbi->s_flags = uopt->flags;
+ sbi->s_uid = uopt->uid;
+ sbi->s_gid = uopt->gid;
+ sbi->s_umask = uopt->umask;
+ sbi->s_fmode = uopt->fmode;
+ sbi->s_dmode = uopt->dmode;
write_unlock(&sbi->s_cred_lock);
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ if (readonly == sb_rdonly(sb))
goto out_unlock;
- if (*flags & SB_RDONLY)
+ if (readonly)
udf_close_lvid(sb);
else
udf_open_lvid(sb);
@@ -864,7 +895,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
int ret;
struct timestamp *ts;
- outstr = kmalloc(128, GFP_NOFS);
+ outstr = kmalloc(128, GFP_KERNEL);
if (!outstr)
return -ENOMEM;
@@ -1539,6 +1570,20 @@ out_bh:
return ret;
}
+static bool udf_lvid_valid(struct super_block *sb,
+ struct logicalVolIntegrityDesc *lvid)
+{
+ u32 parts, impuselen;
+
+ parts = le32_to_cpu(lvid->numOfPartitions);
+ impuselen = le32_to_cpu(lvid->lengthOfImpUse);
+ if (parts >= sb->s_blocksize || impuselen >= sb->s_blocksize ||
+ sizeof(struct logicalVolIntegrityDesc) + impuselen +
+ 2 * parts * sizeof(u32) > sb->s_blocksize)
+ return false;
+ return true;
+}
+
/*
* Find the prevailing Logical Volume Integrity Descriptor.
*/
@@ -1549,7 +1594,6 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
struct udf_sb_info *sbi = UDF_SB(sb);
struct logicalVolIntegrityDesc *lvid;
int indirections = 0;
- u32 parts, impuselen;
while (++indirections <= UDF_MAX_LVID_NESTING) {
final_bh = NULL;
@@ -1571,32 +1615,27 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
if (!final_bh)
return;
- brelse(sbi->s_lvid_bh);
- sbi->s_lvid_bh = final_bh;
-
lvid = (struct logicalVolIntegrityDesc *)final_bh->b_data;
+ if (udf_lvid_valid(sb, lvid)) {
+ brelse(sbi->s_lvid_bh);
+ sbi->s_lvid_bh = final_bh;
+ } else {
+ udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), "
+ "ignoring.\n",
+ le32_to_cpu(lvid->numOfPartitions),
+ le32_to_cpu(lvid->lengthOfImpUse));
+ }
+
if (lvid->nextIntegrityExt.extLength == 0)
- goto check;
+ return;
loc = leea_to_cpu(lvid->nextIntegrityExt);
}
udf_warn(sb, "Too many LVID indirections (max %u), ignoring.\n",
UDF_MAX_LVID_NESTING);
-out_err:
brelse(sbi->s_lvid_bh);
sbi->s_lvid_bh = NULL;
- return;
-check:
- parts = le32_to_cpu(lvid->numOfPartitions);
- impuselen = le32_to_cpu(lvid->lengthOfImpUse);
- if (parts >= sb->s_blocksize || impuselen >= sb->s_blocksize ||
- sizeof(struct logicalVolIntegrityDesc) + impuselen +
- 2 * parts * sizeof(u32) > sb->s_blocksize) {
- udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), "
- "ignoring.\n", parts, impuselen);
- goto out_err;
- }
}
/*
@@ -1946,7 +1985,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
return -EINVAL;
}
sbi->s_last_block = uopt->lastblock;
- if (!uopt->novrs) {
+ if (!UDF_QUERY_FLAG(sb, UDF_FLAG_NOVRS)) {
/* Check that it is NSR02 compliant */
nsr = udf_check_vsd(sb);
if (!nsr) {
@@ -2084,23 +2123,15 @@ u64 lvid_get_unique_id(struct super_block *sb)
return ret;
}
-static int udf_fill_super(struct super_block *sb, void *options, int silent)
+static int udf_fill_super(struct super_block *sb, struct fs_context *fc)
{
int ret = -EINVAL;
struct inode *inode = NULL;
- struct udf_options uopt;
+ struct udf_options *uopt = fc->fs_private;
struct kernel_lb_addr rootdir, fileset;
struct udf_sb_info *sbi;
bool lvid_open = false;
-
- uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
- /* By default we'll use overflow[ug]id when UDF inode [ug]id == -1 */
- uopt.uid = make_kuid(current_user_ns(), overflowuid);
- uopt.gid = make_kgid(current_user_ns(), overflowgid);
- uopt.umask = 0;
- uopt.fmode = UDF_INVALID_MODE;
- uopt.dmode = UDF_INVALID_MODE;
- uopt.nls_map = NULL;
+ int silent = fc->sb_flags & SB_SILENT;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -2110,25 +2141,23 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
mutex_init(&sbi->s_alloc_mutex);
- if (!udf_parse_options((char *)options, &uopt, false))
- goto parse_options_failure;
-
fileset.logicalBlockNum = 0xFFFFFFFF;
fileset.partitionReferenceNum = 0xFFFF;
- sbi->s_flags = uopt.flags;
- sbi->s_uid = uopt.uid;
- sbi->s_gid = uopt.gid;
- sbi->s_umask = uopt.umask;
- sbi->s_fmode = uopt.fmode;
- sbi->s_dmode = uopt.dmode;
- sbi->s_nls_map = uopt.nls_map;
+ sbi->s_flags = uopt->flags;
+ sbi->s_uid = uopt->uid;
+ sbi->s_gid = uopt->gid;
+ sbi->s_umask = uopt->umask;
+ sbi->s_fmode = uopt->fmode;
+ sbi->s_dmode = uopt->dmode;
+ sbi->s_nls_map = uopt->nls_map;
+ uopt->nls_map = NULL;
rwlock_init(&sbi->s_cred_lock);
- if (uopt.session == 0xFFFFFFFF)
+ if (uopt->session == 0xFFFFFFFF)
sbi->s_session = udf_get_last_session(sb);
else
- sbi->s_session = uopt.session;
+ sbi->s_session = uopt->session;
udf_debug("Multi-session=%d\n", sbi->s_session);
@@ -2139,16 +2168,16 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
sb->s_magic = UDF_SUPER_MAGIC;
sb->s_time_gran = 1000;
- if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
- ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+ if (uopt->flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
+ ret = udf_load_vrs(sb, uopt, silent, &fileset);
} else {
- uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
- while (uopt.blocksize <= 4096) {
- ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+ uopt->blocksize = bdev_logical_block_size(sb->s_bdev);
+ while (uopt->blocksize <= 4096) {
+ ret = udf_load_vrs(sb, uopt, silent, &fileset);
if (ret < 0) {
if (!silent && ret != -EACCES) {
pr_notice("Scanning with blocksize %u failed\n",
- uopt.blocksize);
+ uopt->blocksize);
}
brelse(sbi->s_lvid_bh);
sbi->s_lvid_bh = NULL;
@@ -2161,7 +2190,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
} else
break;
- uopt.blocksize <<= 1;
+ uopt->blocksize <<= 1;
}
}
if (ret < 0) {
@@ -2266,8 +2295,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
error_out:
iput(sbi->s_vat_inode);
-parse_options_failure:
- unload_nls(uopt.nls_map);
+ unload_nls(uopt->nls_map);
if (lvid_open)
udf_close_lvid(sb);
brelse(sbi->s_lvid_bh);
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index f9a60bc1abcf..08ec8756b948 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -23,6 +23,7 @@
#define UDF_FLAG_STRICT 5
#define UDF_FLAG_UNDELETE 6
#define UDF_FLAG_UNHIDE 7
+#define UDF_FLAG_NOVRS 8
#define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */
#define UDF_FLAG_GID_FORGET 12
#define UDF_FLAG_UID_SET 13
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index a480810cd4e3..44666afc6209 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1470,8 +1470,7 @@ static int __init init_inodecache(void)
{
ufs_inode_cachep = kmem_cache_create_usercopy("ufs_inode_cache",
sizeof(struct ufs_inode_info), 0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
- SLAB_ACCOUNT),
+ (SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT),
offsetof(struct ufs_inode_info, i_u1.i_symlink),
sizeof_field(struct ufs_inode_info,
i_u1.i_symlink),
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 959551ff9a95..60dcfafdc11a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -50,45 +50,6 @@ static struct ctl_table vm_userfaultfd_table[] = {
static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
-/*
- * Start with fault_pending_wqh and fault_wqh so they're more likely
- * to be in the same cacheline.
- *
- * Locking order:
- * fd_wqh.lock
- * fault_pending_wqh.lock
- * fault_wqh.lock
- * event_wqh.lock
- *
- * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
- * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
- * also taken in IRQ context.
- */
-struct userfaultfd_ctx {
- /* waitqueue head for the pending (i.e. not read) userfaults */
- wait_queue_head_t fault_pending_wqh;
- /* waitqueue head for the userfaults */
- wait_queue_head_t fault_wqh;
- /* waitqueue head for the pseudo fd to wakeup poll/read */
- wait_queue_head_t fd_wqh;
- /* waitqueue head for events */
- wait_queue_head_t event_wqh;
- /* a refile sequence protected by fault_pending_wqh lock */
- seqcount_spinlock_t refile_seq;
- /* pseudo fd refcounting */
- refcount_t refcount;
- /* userfaultfd syscall flags */
- unsigned int flags;
- /* features requested from the userspace */
- unsigned int features;
- /* released */
- bool released;
- /* memory mappings are changing because of non-cooperative event */
- atomic_t mmap_changing;
- /* mm with one ore more vmas attached to this userfaultfd_ctx */
- struct mm_struct *mm;
-};
-
struct userfaultfd_fork_ctx {
struct userfaultfd_ctx *orig;
struct userfaultfd_ctx *new;
@@ -724,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
ctx->flags = octx->flags;
ctx->features = octx->features;
ctx->released = false;
+ init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = vma->vm_mm;
mmgrab(ctx->mm);
userfaultfd_ctx_get(octx);
+ down_write(&octx->map_changing_lock);
atomic_inc(&octx->mmap_changing);
+ up_write(&octx->map_changing_lock);
fctx->orig = octx;
fctx->new = ctx;
list_add_tail(&fctx->list, fcs);
@@ -776,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx);
+ down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
+ up_write(&ctx->map_changing_lock);
} else {
/* Drop uffd context if remap feature not enabled */
vma_start_write(vma);
@@ -822,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
return true;
userfaultfd_ctx_get(ctx);
+ down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
+ up_write(&ctx->map_changing_lock);
mmap_read_unlock(mm);
msg_init(&ewq.msg);
@@ -864,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
return -ENOMEM;
userfaultfd_ctx_get(ctx);
+ down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
+ up_write(&ctx->map_changing_lock);
unmap_ctx->ctx = ctx;
unmap_ctx->start = start;
unmap_ctx->end = end;
@@ -1748,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len, &ctx->mmap_changing,
- flags);
+ ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1800,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
goto out;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len,
- &ctx->mmap_changing);
+ ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1857,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
return -EINVAL;
if (mmget_not_zero(ctx->mm)) {
- ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
- uffdio_wp.range.len, mode_wp,
- &ctx->mmap_changing);
+ ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
+ uffdio_wp.range.len, mode_wp);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1909,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
- uffdio_continue.range.len,
- &ctx->mmap_changing, flags);
+ ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
+ uffdio_continue.range.len, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1964,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
goto out;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
- uffdio_poison.range.len,
- &ctx->mmap_changing, 0);
+ ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
+ uffdio_poison.range.len, 0);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -2040,16 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
return -EINVAL;
if (mmget_not_zero(mm)) {
- mmap_read_lock(mm);
-
- /* Re-check after taking mmap_lock */
- if (likely(!atomic_read(&ctx->mmap_changing)))
- ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
- uffdio_move.len, uffdio_move.mode);
- else
- ret = -EINVAL;
-
- mmap_read_unlock(mm);
+ ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
+ uffdio_move.len, uffdio_move.mode);
mmput(mm);
} else {
return -ESRCH;
@@ -2255,6 +2212,7 @@ static int new_userfaultfd(int flags)
ctx->flags = flags;
ctx->features = 0;
ctx->released = false;
+ init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
/* prevent the mm struct to be freed */
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index 2307f8037efc..118dedef8ebe 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -218,6 +218,7 @@ const struct file_operations vboxsf_reg_fops = {
.release = vboxsf_file_release,
.fsync = noop_fsync,
.splice_read = filemap_splice_read,
+ .setlease = simple_nosetlease,
};
const struct inode_operations vboxsf_reg_iops = {
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index 1fb8f4df60cb..ffb1d565da39 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -151,11 +151,11 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc)
if (!sbi->nls) {
vbg_err("vboxsf: Count not load '%s' nls\n", nls_name);
err = -EINVAL;
- goto fail_free;
+ goto fail_destroy_idr;
}
}
- sbi->bdi_id = ida_simple_get(&vboxsf_bdi_ida, 0, 0, GFP_KERNEL);
+ sbi->bdi_id = ida_alloc(&vboxsf_bdi_ida, GFP_KERNEL);
if (sbi->bdi_id < 0) {
err = sbi->bdi_id;
goto fail_free;
@@ -221,9 +221,10 @@ fail_unmap:
vboxsf_unmap_folder(sbi->root);
fail_free:
if (sbi->bdi_id >= 0)
- ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id);
+ ida_free(&vboxsf_bdi_ida, sbi->bdi_id);
if (sbi->nls)
unload_nls(sbi->nls);
+fail_destroy_idr:
idr_destroy(&sbi->ino_idr);
kfree(sbi);
return err;
@@ -268,7 +269,7 @@ static void vboxsf_put_super(struct super_block *sb)
vboxsf_unmap_folder(sbi->root);
if (sbi->bdi_id >= 0)
- ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id);
+ ida_free(&vboxsf_bdi_ida, sbi->bdi_id);
if (sbi->nls)
unload_nls(sbi->nls);
@@ -339,8 +340,7 @@ static int vboxsf_setup(void)
vboxsf_inode_cachep =
kmem_cache_create("vboxsf_inode_cache",
sizeof(struct vboxsf_inode), 0,
- (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
- SLAB_ACCOUNT),
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
vboxsf_inode_init_once);
if (!vboxsf_inode_cachep) {
err = -ENOMEM;
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index 72ac9320e6a3..9515bbf0b54c 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -440,7 +440,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len,
{
const char *in;
char *out;
- size_t out_len;
size_t out_bound_len;
size_t in_bound_len;
@@ -448,7 +447,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len,
in_bound_len = utf8_len;
out = name;
- out_len = 0;
/* Reserve space for terminating 0 */
out_bound_len = name_bound_len - 1;
@@ -469,7 +467,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len,
out += nb;
out_bound_len -= nb;
- out_len += nb;
}
*out = 0;
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index a6a6b2749241..b3506f56e180 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -69,7 +69,6 @@ struct fsverity_info {
u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE];
const struct inode *inode;
unsigned long *hash_block_verified;
- spinlock_t hash_page_init_lock;
};
#define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index bf7a5f4cccaf..3969d54158d1 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -159,9 +159,9 @@ __bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr_ker
__bpf_kfunc_end_defs();
-BTF_SET8_START(fsverity_set_ids)
+BTF_KFUNCS_START(fsverity_set_ids)
BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS)
-BTF_SET8_END(fsverity_set_ids)
+BTF_KFUNCS_END(fsverity_set_ids)
static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 6c31a871b84b..fdeb95eca3af 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -239,7 +239,6 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
err = -ENOMEM;
goto fail;
}
- spin_lock_init(&vi->hash_page_init_lock);
}
return vi;
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 904ccd7e8e16..4fcad0825a12 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -19,7 +19,6 @@ static struct workqueue_struct *fsverity_read_workqueue;
static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
unsigned long hblock_idx)
{
- bool verified;
unsigned int blocks_per_page;
unsigned int i;
@@ -43,12 +42,20 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
* re-instantiated from the backing storage are re-verified. To do
* this, we use PG_checked again, but now it doesn't really mean
* "checked". Instead, now it just serves as an indicator for whether
- * the hash page is newly instantiated or not.
+ * the hash page is newly instantiated or not. If the page is new, as
+ * indicated by PG_checked=0, we clear the bitmap bits for the page's
+ * blocks since they are untrustworthy, then set PG_checked=1.
+ * Otherwise we return the bitmap bit for the requested block.
*
- * The first thread that sees PG_checked=0 must clear the corresponding
- * bitmap bits, then set PG_checked=1. This requires a spinlock. To
- * avoid having to take this spinlock in the common case of
- * PG_checked=1, we start with an opportunistic lockless read.
+ * Multiple threads may execute this code concurrently on the same page.
+ * This is safe because we use memory barriers to ensure that if a
+ * thread sees PG_checked=1, then it also sees the associated bitmap
+ * clearing to have occurred. Also, all writes and their corresponding
+ * reads are atomic, and all writes are safe to repeat in the event that
+ * multiple threads get into the PG_checked=0 section. (Clearing a
+ * bitmap bit again at worst causes a hash block to be verified
+ * redundantly. That event should be very rare, so it's not worth using
+ * a lock to avoid. Setting PG_checked again has no effect.)
*/
if (PageChecked(hpage)) {
/*
@@ -58,24 +65,17 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
smp_rmb();
return test_bit(hblock_idx, vi->hash_block_verified);
}
- spin_lock(&vi->hash_page_init_lock);
- if (PageChecked(hpage)) {
- verified = test_bit(hblock_idx, vi->hash_block_verified);
- } else {
- blocks_per_page = vi->tree_params.blocks_per_page;
- hblock_idx = round_down(hblock_idx, blocks_per_page);
- for (i = 0; i < blocks_per_page; i++)
- clear_bit(hblock_idx + i, vi->hash_block_verified);
- /*
- * A write memory barrier is needed here to give RELEASE
- * semantics to the below SetPageChecked() operation.
- */
- smp_wmb();
- SetPageChecked(hpage);
- verified = false;
- }
- spin_unlock(&vi->hash_page_init_lock);
- return verified;
+ blocks_per_page = vi->tree_params.blocks_per_page;
+ hblock_idx = round_down(hblock_idx, blocks_per_page);
+ for (i = 0; i < blocks_per_page; i++)
+ clear_bit(hblock_idx + i, vi->hash_block_verified);
+ /*
+ * A write memory barrier is needed here to give RELEASE semantics to
+ * the below SetPageChecked() operation.
+ */
+ smp_wmb();
+ SetPageChecked(hpage);
+ return false;
}
/*
diff --git a/fs/xattr.c b/fs/xattr.c
index 09d927603433..f8b643f91a98 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -16,7 +16,6 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/security.h>
-#include <linux/evm.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fsnotify.h>
@@ -552,11 +551,11 @@ __vfs_removexattr_locked(struct mnt_idmap *idmap,
goto out;
error = __vfs_removexattr(idmap, dentry, name);
+ if (error)
+ return error;
- if (!error) {
- fsnotify_xattr(dentry);
- evm_inode_post_removexattr(dentry, name);
- }
+ fsnotify_xattr(dentry);
+ security_inode_post_removexattr(dentry, name);
out:
return error;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 567fb37274d3..d41edd30388b 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -124,12 +124,24 @@ config XFS_DRAIN_INTENTS
bool
select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
+config XFS_LIVE_HOOKS
+ bool
+ select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
+
+config XFS_MEMORY_BUFS
+ bool
+
+config XFS_BTREE_IN_MEM
+ bool
+
config XFS_ONLINE_SCRUB
bool "XFS online metadata check support"
default n
depends on XFS_FS
depends on TMPFS && SHMEM
+ select XFS_LIVE_HOOKS
select XFS_DRAIN_INTENTS
+ select XFS_MEMORY_BUFS
help
If you say Y here you will be able to check metadata on a
mounted XFS filesystem. This feature is intended to reduce
@@ -164,6 +176,7 @@ config XFS_ONLINE_REPAIR
bool "XFS online metadata repair support"
default n
depends on XFS_FS && XFS_ONLINE_SCRUB
+ select XFS_BTREE_IN_MEM
help
If you say Y here you will be able to repair metadata on a
mounted XFS filesystem. This feature is intended to reduce
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index fbe3cdc79036..76674ad5833e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -92,8 +92,7 @@ xfs-y += xfs_aops.o \
xfs_symlink.o \
xfs_sysfs.o \
xfs_trans.o \
- xfs_xattr.o \
- kmem.o
+ xfs_xattr.o
# low-level transaction/log code
xfs-y += xfs_log.o \
@@ -137,6 +136,9 @@ xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o
endif
xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o
+xfs-$(CONFIG_XFS_LIVE_HOOKS) += xfs_hooks.o
+xfs-$(CONFIG_XFS_MEMORY_BUFS) += xfs_buf_mem.o
+xfs-$(CONFIG_XFS_BTREE_IN_MEM) += libxfs/xfs_btree_mem.o
# online scrub/repair
ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
@@ -159,6 +161,8 @@ xfs-y += $(addprefix scrub/, \
health.o \
ialloc.o \
inode.o \
+ iscan.o \
+ nlinks.o \
parent.o \
readdir.o \
refcount.o \
@@ -179,6 +183,7 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \
dqiterate.o \
quota.o \
+ quotacheck.o \
)
# online repair
@@ -188,12 +193,17 @@ xfs-y += $(addprefix scrub/, \
alloc_repair.o \
bmap_repair.o \
cow_repair.o \
+ fscounters_repair.o \
ialloc_repair.o \
inode_repair.o \
newbt.o \
+ nlinks_repair.o \
+ rcbag_btree.o \
+ rcbag.o \
reap.o \
refcount_repair.o \
repair.o \
+ rmap_repair.o \
)
xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
@@ -202,6 +212,7 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \
quota_repair.o \
+ quotacheck_repair.o \
)
endif
endif
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
deleted file mode 100644
index c557a030acfe..000000000000
--- a/fs/xfs/kmem.c
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#include "xfs.h"
-#include "xfs_message.h"
-#include "xfs_trace.h"
-
-void *
-kmem_alloc(size_t size, xfs_km_flags_t flags)
-{
- int retries = 0;
- gfp_t lflags = kmem_flags_convert(flags);
- void *ptr;
-
- trace_kmem_alloc(size, flags, _RET_IP_);
-
- do {
- ptr = kmalloc(size, lflags);
- if (ptr || (flags & KM_MAYFAIL))
- return ptr;
- if (!(++retries % 100))
- xfs_err(NULL,
- "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
- current->comm, current->pid,
- (unsigned int)size, __func__, lflags);
- memalloc_retry_wait(lflags);
- } while (1);
-}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
deleted file mode 100644
index b987dc2c6851..000000000000
--- a/fs/xfs/kmem.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#ifndef __XFS_SUPPORT_KMEM_H__
-#define __XFS_SUPPORT_KMEM_H__
-
-#include <linux/slab.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-
-/*
- * General memory allocation interfaces
- */
-
-typedef unsigned __bitwise xfs_km_flags_t;
-#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
-#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
-#define KM_ZERO ((__force xfs_km_flags_t)0x0010u)
-#define KM_NOLOCKDEP ((__force xfs_km_flags_t)0x0020u)
-
-/*
- * We use a special process flag to avoid recursive callbacks into
- * the filesystem during transactions. We will also issue our own
- * warnings, so we explicitly skip any generic ones (silly of us).
- */
-static inline gfp_t
-kmem_flags_convert(xfs_km_flags_t flags)
-{
- gfp_t lflags;
-
- BUG_ON(flags & ~(KM_NOFS | KM_MAYFAIL | KM_ZERO | KM_NOLOCKDEP));
-
- lflags = GFP_KERNEL | __GFP_NOWARN;
- if (flags & KM_NOFS)
- lflags &= ~__GFP_FS;
-
- /*
- * Default page/slab allocator behavior is to retry for ever
- * for small allocations. We can override this behavior by using
- * __GFP_RETRY_MAYFAIL which will tell the allocator to retry as long
- * as it is feasible but rather fail than retry forever for all
- * request sizes.
- */
- if (flags & KM_MAYFAIL)
- lflags |= __GFP_RETRY_MAYFAIL;
-
- if (flags & KM_ZERO)
- lflags |= __GFP_ZERO;
-
- if (flags & KM_NOLOCKDEP)
- lflags |= __GFP_NOLOCKDEP;
-
- return lflags;
-}
-
-extern void *kmem_alloc(size_t, xfs_km_flags_t);
-static inline void kmem_free(const void *ptr)
-{
- kvfree(ptr);
-}
-
-
-static inline void *
-kmem_zalloc(size_t size, xfs_km_flags_t flags)
-{
- return kmem_alloc(size, flags | KM_ZERO);
-}
-
-/*
- * Zone interfaces
- */
-static inline struct page *
-kmem_to_page(void *addr)
-{
- if (is_vmalloc_addr(addr))
- return vmalloc_to_page(addr);
- return virt_to_page(addr);
-}
-
-#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 39d9525270b7..dc1873f76bff 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -217,6 +217,7 @@ xfs_initialize_perag_data(
*/
if (fdblocks > sbp->sb_dblocks || ifree > ialloc) {
xfs_alert(mp, "AGF corruption. Please run xfs_repair.");
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
error = -EFSCORRUPTED;
goto out;
}
@@ -241,7 +242,7 @@ __xfs_free_perag(
struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
- kmem_free(pag);
+ kfree(pag);
}
/*
@@ -263,7 +264,7 @@ xfs_free_perag(
xfs_defer_drain_free(&pag->pag_intents_drain);
cancel_delayed_work_sync(&pag->pag_blockgc_work);
- xfs_buf_hash_destroy(pag);
+ xfs_buf_cache_destroy(&pag->pag_bcache);
/* drop the mount's active reference */
xfs_perag_rele(pag);
@@ -351,9 +352,9 @@ xfs_free_unused_perag_range(
spin_unlock(&mp->m_perag_lock);
if (!pag)
break;
- xfs_buf_hash_destroy(pag);
+ xfs_buf_cache_destroy(&pag->pag_bcache);
xfs_defer_drain_free(&pag->pag_intents_drain);
- kmem_free(pag);
+ kfree(pag);
}
}
@@ -381,7 +382,7 @@ xfs_initialize_perag(
continue;
}
- pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+ pag = kzalloc(sizeof(*pag), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!pag) {
error = -ENOMEM;
goto out_unwind_new_pags;
@@ -389,7 +390,7 @@ xfs_initialize_perag(
pag->pag_agno = index;
pag->pag_mount = mp;
- error = radix_tree_preload(GFP_NOFS);
+ error = radix_tree_preload(GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (error)
goto out_free_pag;
@@ -416,9 +417,10 @@ xfs_initialize_perag(
init_waitqueue_head(&pag->pag_active_wq);
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
+ xfs_hooks_init(&pag->pag_rmap_update_hooks);
#endif /* __KERNEL__ */
- error = xfs_buf_hash_init(pag);
+ error = xfs_buf_cache_init(&pag->pag_bcache);
if (error)
goto out_remove_pag;
@@ -453,7 +455,7 @@ out_remove_pag:
radix_tree_delete(&mp->m_perag_tree, index);
spin_unlock(&mp->m_perag_lock);
out_free_pag:
- kmem_free(pag);
+ kfree(pag);
out_unwind_new_pags:
/* unwind any prior newly initialized pags */
xfs_free_unused_perag_range(mp, first_initialised, agcount);
@@ -491,7 +493,7 @@ xfs_btroot_init(
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno);
+ xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 0, id->agno);
}
/* Finish initializing a free space btree. */
@@ -549,7 +551,7 @@ xfs_freesp_init_recs(
}
/*
- * Alloc btree root block init functions
+ * bnobt/cntbt btree root block init functions
*/
static void
xfs_bnoroot_init(
@@ -557,17 +559,7 @@ xfs_bnoroot_init(
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 0, id->agno);
- xfs_freesp_init_recs(mp, bp, id);
-}
-
-static void
-xfs_cntroot_init(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- struct aghdr_init_data *id)
-{
- xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 0, id->agno);
+ xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 0, id->agno);
xfs_freesp_init_recs(mp, bp, id);
}
@@ -583,7 +575,7 @@ xfs_rmaproot_init(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_rmap_rec *rrec;
- xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno);
+ xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 4, id->agno);
/*
* mark the AG header regions as static metadata. The BNO
@@ -678,14 +670,13 @@ xfs_agfblock_init(
agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
agf->agf_seqno = cpu_to_be32(id->agno);
agf->agf_length = cpu_to_be32(id->agsize);
- agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
- agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
- agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
- agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+ agf->agf_bno_root = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ agf->agf_cnt_root = cpu_to_be32(XFS_CNT_BLOCK(mp));
+ agf->agf_bno_level = cpu_to_be32(1);
+ agf->agf_cnt_level = cpu_to_be32(1);
if (xfs_has_rmapbt(mp)) {
- agf->agf_roots[XFS_BTNUM_RMAPi] =
- cpu_to_be32(XFS_RMAP_BLOCK(mp));
- agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+ agf->agf_rmap_root = cpu_to_be32(XFS_RMAP_BLOCK(mp));
+ agf->agf_rmap_level = cpu_to_be32(1);
agf->agf_rmap_blocks = cpu_to_be32(1);
}
@@ -796,7 +787,7 @@ struct xfs_aghdr_grow_data {
size_t numblks;
const struct xfs_buf_ops *ops;
aghdr_init_work_f work;
- xfs_btnum_t type;
+ const struct xfs_btree_ops *bc_ops;
bool need_init;
};
@@ -850,13 +841,15 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_bnobt_buf_ops,
.work = &xfs_bnoroot_init,
+ .bc_ops = &xfs_bnobt_ops,
.need_init = true
},
{ /* CNT root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_cntbt_buf_ops,
- .work = &xfs_cntroot_init,
+ .work = &xfs_bnoroot_init,
+ .bc_ops = &xfs_cntbt_ops,
.need_init = true
},
{ /* INO root block */
@@ -864,7 +857,7 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_inobt_buf_ops,
.work = &xfs_btroot_init,
- .type = XFS_BTNUM_INO,
+ .bc_ops = &xfs_inobt_ops,
.need_init = true
},
{ /* FINO root block */
@@ -872,7 +865,7 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_finobt_buf_ops,
.work = &xfs_btroot_init,
- .type = XFS_BTNUM_FINO,
+ .bc_ops = &xfs_finobt_ops,
.need_init = xfs_has_finobt(mp)
},
{ /* RMAP root block */
@@ -880,6 +873,7 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_rmapbt_buf_ops,
.work = &xfs_rmaproot_init,
+ .bc_ops = &xfs_rmapbt_ops,
.need_init = xfs_has_rmapbt(mp)
},
{ /* REFC root block */
@@ -887,7 +881,7 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_refcountbt_buf_ops,
.work = &xfs_btroot_init,
- .type = XFS_BTNUM_REFC,
+ .bc_ops = &xfs_refcountbt_ops,
.need_init = xfs_has_reflink(mp)
},
{ /* NULL terminating block */
@@ -905,7 +899,7 @@ xfs_ag_init_headers(
id->daddr = dp->daddr;
id->numblks = dp->numblks;
- id->type = dp->type;
+ id->bc_ops = dp->bc_ops;
error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops);
if (error)
break;
@@ -950,8 +944,10 @@ xfs_ag_shrink_space(
agf = agfbp->b_addr;
aglen = be32_to_cpu(agi->agi_length);
/* some extra paranoid checks before we shrink the ag */
- if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length))
+ if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
return -EFSCORRUPTED;
+ }
if (delta >= aglen)
return -EINVAL;
@@ -979,14 +975,23 @@ xfs_ag_shrink_space(
if (error) {
/*
- * if extent allocation fails, need to roll the transaction to
+ * If extent allocation fails, need to roll the transaction to
* ensure that the AGFL fixup has been committed anyway.
+ *
+ * We need to hold the AGF across the roll to ensure nothing can
+ * access the AG for allocation until the shrink is fully
+ * cleaned up. And due to the resetting of the AG block
+ * reservation space needing to lock the AGI, we also have to
+ * hold that so we don't get AGI/AGF lock order inversions in
+ * the error handling path.
*/
xfs_trans_bhold(*tpp, agfbp);
+ xfs_trans_bhold(*tpp, agibp);
err2 = xfs_trans_roll(tpp);
if (err2)
return err2;
xfs_trans_bjoin(*tpp, agfbp);
+ xfs_trans_bjoin(*tpp, agibp);
goto resv_init_out;
}
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 4b343c4fac28..35de09a2516c 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -36,8 +36,9 @@ struct xfs_perag {
atomic_t pag_active_ref; /* active reference count */
wait_queue_head_t pag_active_wq;/* woken active_ref falls to zero */
unsigned long pag_opstate;
- uint8_t pagf_levels[XFS_BTNUM_AGF];
- /* # of levels in bno & cnt btree */
+ uint8_t pagf_bno_level; /* # of levels in bno btree */
+ uint8_t pagf_cnt_level; /* # of levels in cnt btree */
+ uint8_t pagf_rmap_level;/* # of levels in rmap btree */
uint32_t pagf_flcount; /* count of blocks in freelist */
xfs_extlen_t pagf_freeblks; /* total free blocks */
xfs_extlen_t pagf_longest; /* longest free space */
@@ -86,8 +87,10 @@ struct xfs_perag {
* Alternate btree heights so that online repair won't trip the write
* verifiers while rebuilding the AG btrees.
*/
- uint8_t pagf_repair_levels[XFS_BTNUM_AGF];
+ uint8_t pagf_repair_bno_level;
+ uint8_t pagf_repair_cnt_level;
uint8_t pagf_repair_refcount_level;
+ uint8_t pagf_repair_rmap_level;
#endif
spinlock_t pag_state_lock;
@@ -104,9 +107,7 @@ struct xfs_perag {
int pag_ici_reclaimable; /* reclaimable inodes */
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
- /* buffer cache index */
- spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
- struct rhashtable pag_buf_hash;
+ struct xfs_buf_cache pag_bcache;
/* background prealloc block trimming */
struct delayed_work pag_blockgc_work;
@@ -119,6 +120,9 @@ struct xfs_perag {
* inconsistencies.
*/
struct xfs_defer_drain pag_intents_drain;
+
+ /* Hook to feed rmapbt updates to an active online repair. */
+ struct xfs_hooks pag_rmap_update_hooks;
#endif /* __KERNEL__ */
};
@@ -331,7 +335,7 @@ struct aghdr_init_data {
/* per header data */
xfs_daddr_t daddr; /* header location */
size_t numblks; /* size of header */
- xfs_btnum_t type; /* type of btree root block */
+ const struct xfs_btree_ops *bc_ops; /* btree ops */
};
int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 3bd0a33fee0a..9da52e92172a 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -26,6 +26,7 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"
+#include "xfs_health.h"
struct kmem_cache *xfs_extfree_item_cache;
@@ -150,23 +151,38 @@ xfs_alloc_ag_max_usable(
return mp->m_sb.sb_agblocks - blocks;
}
+
+static int
+xfs_alloc_lookup(
+ struct xfs_btree_cur *cur,
+ xfs_lookup_t dir,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ int *stat)
+{
+ int error;
+
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ error = xfs_btree_lookup(cur, dir, stat);
+ if (*stat == 1)
+ cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
+ else
+ cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
+ return error;
+}
+
/*
* Look up the record equal to [bno, len] in the btree given by cur.
*/
-STATIC int /* error */
+static inline int /* error */
xfs_alloc_lookup_eq(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agblock_t bno, /* starting block of extent */
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
- int error;
-
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
- cur->bc_ag.abt.active = (*stat == 1);
- return error;
+ return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat);
}
/*
@@ -180,13 +196,7 @@ xfs_alloc_lookup_ge(
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
- int error;
-
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
- cur->bc_ag.abt.active = (*stat == 1);
- return error;
+ return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat);
}
/*
@@ -200,19 +210,14 @@ xfs_alloc_lookup_le(
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
- int error;
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
- cur->bc_ag.abt.active = (*stat == 1);
- return error;
+ return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat);
}
static inline bool
xfs_alloc_cur_active(
struct xfs_btree_cur *cur)
{
- return cur && cur->bc_ag.abt.active;
+ return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE);
}
/*
@@ -268,12 +273,12 @@ xfs_alloc_complain_bad_rec(
struct xfs_mount *mp = cur->bc_mp;
xfs_warn(mp,
- "%s Freespace BTree record corruption in AG %d detected at %pS!",
- cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
- cur->bc_ag.pag->pag_agno, fa);
+ "%sbt record corruption in AG %d detected at %pS!",
+ cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
xfs_warn(mp,
"start block 0x%x block count 0x%x", irec->ar_startblock,
irec->ar_blockcount);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -497,14 +502,18 @@ xfs_alloc_fixup_trees(
if (XFS_IS_CORRUPT(mp,
i != 1 ||
nfbno1 != fbno ||
- nflen1 != flen))
+ nflen1 != flen)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
#endif
} else {
if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
}
/*
* Look up the record in the by-block tree if necessary.
@@ -516,14 +525,18 @@ xfs_alloc_fixup_trees(
if (XFS_IS_CORRUPT(mp,
i != 1 ||
nfbno1 != fbno ||
- nflen1 != flen))
+ nflen1 != flen)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
#endif
} else {
if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
}
#ifdef DEBUG
@@ -536,8 +549,10 @@ xfs_alloc_fixup_trees(
if (XFS_IS_CORRUPT(mp,
bnoblock->bb_numrecs !=
- cntblock->bb_numrecs))
+ cntblock->bb_numrecs)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
}
#endif
@@ -567,30 +582,40 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_btree_delete(cnt_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
/*
* Add new by-size btree entry(s).
*/
if (nfbno1 != NULLAGBLOCK) {
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 0))
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
}
if (nfbno2 != NULLAGBLOCK) {
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 0))
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
}
/*
* Fix up the by-block btree entry(s).
@@ -601,8 +626,10 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_btree_delete(bno_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
} else {
/*
* Update the by-block entry to start later|be shorter.
@@ -616,12 +643,16 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 0))
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
if ((error = xfs_btree_insert(bno_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
}
return 0;
}
@@ -755,6 +786,8 @@ xfs_alloc_read_agfl(
mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
if (error)
return error;
xfs_buf_set_ref(bp, XFS_AGFL_REF);
@@ -776,6 +809,7 @@ xfs_alloc_update_counters(
if (unlikely(be32_to_cpu(agf->agf_freeblks) >
be32_to_cpu(agf->agf_length))) {
xfs_buf_mark_corrupt(agbp);
+ xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF);
return -EFSCORRUPTED;
}
@@ -828,8 +862,8 @@ xfs_alloc_cur_setup(
* attempt a small allocation.
*/
if (!acur->cnt)
- acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->pag, XFS_BTNUM_CNT);
+ acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag);
error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
if (error)
return error;
@@ -838,11 +872,11 @@ xfs_alloc_cur_setup(
* Allocate the bnobt left and right search cursors.
*/
if (!acur->bnolt)
- acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->pag, XFS_BTNUM_BNO);
+ acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag);
if (!acur->bnogt)
- acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->pag, XFS_BTNUM_BNO);
+ acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag);
return i == 1 ? 0 : -ENOSPC;
}
@@ -884,15 +918,17 @@ xfs_alloc_cur_check(
bool busy;
unsigned busy_gen = 0;
bool deactivate = false;
- bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
+ bool isbnobt = xfs_btree_is_bno(cur->bc_ops);
*new = 0;
error = xfs_alloc_get_rec(cur, &bno, &len, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(args->mp, i != 1))
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
/*
* Check minlen and deactivate a cntbt cursor if out of acceptable size
@@ -958,9 +994,8 @@ xfs_alloc_cur_check(
deactivate = true;
out:
if (deactivate)
- cur->bc_ag.abt.active = false;
- trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
- *new);
+ cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
+ trace_xfs_alloc_cur_check(cur, bno, len, diff, *new);
return 0;
}
@@ -1098,6 +1133,7 @@ xfs_alloc_ag_vextent_small(
if (error)
goto error;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(ccur);
error = -EFSCORRUPTED;
goto error;
}
@@ -1132,6 +1168,7 @@ xfs_alloc_ag_vextent_small(
*fbnop = args->agbno = fbno;
*flenp = args->len = 1;
if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
+ xfs_btree_mark_sick(ccur);
error = -EFSCORRUPTED;
goto error;
}
@@ -1197,8 +1234,8 @@ xfs_alloc_ag_vextent_exact(
/*
* Allocate/initialize a cursor for the by-number freespace btree.
*/
- bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_BNO);
+ bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
/*
* Look up bno and minlen in the btree (minlen is irrelevant, really).
@@ -1218,6 +1255,7 @@ xfs_alloc_ag_vextent_exact(
if (error)
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1257,8 +1295,8 @@ xfs_alloc_ag_vextent_exact(
* We are allocating agbno for args->len
* Allocate/initialize a cursor for the by-size btree.
*/
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
args->len, XFSA_FIXUP_BNO_OK);
@@ -1330,7 +1368,7 @@ xfs_alloc_walk_iter(
if (error)
return error;
if (i == 0)
- cur->bc_ag.abt.active = false;
+ cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
if (count > 0)
count--;
@@ -1444,7 +1482,7 @@ xfs_alloc_ag_vextent_locality(
if (error)
return error;
if (i) {
- acur->cnt->bc_ag.abt.active = true;
+ acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
fbcur = acur->cnt;
fbinc = false;
}
@@ -1497,8 +1535,10 @@ xfs_alloc_ag_vextent_lastblock(
error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(args->mp, i != 1))
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(acur->cnt);
return -EFSCORRUPTED;
+ }
if (*len >= args->minlen)
break;
error = xfs_btree_increment(acur->cnt, 0, &i);
@@ -1670,8 +1710,8 @@ restart:
/*
* Allocate and initialize a cursor for the by-size btree.
*/
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
bno_cur = NULL;
/*
@@ -1710,6 +1750,7 @@ restart:
if (error)
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1756,6 +1797,7 @@ restart:
rlen != 0 &&
(rlen > flen ||
rbno + rlen > fbno + flen))) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1778,6 +1820,7 @@ restart:
&i)))
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1790,6 +1833,7 @@ restart:
rlen != 0 &&
(rlen > flen ||
rbno + rlen > fbno + flen))) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1806,6 +1850,7 @@ restart:
&i)))
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1844,14 +1889,15 @@ restart:
rlen = args->len;
if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
/*
* Allocate and initialize a cursor for the by-block tree.
*/
- bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_BNO);
+ bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
rbno, rlen, XFSA_FIXUP_CNT_OK)))
goto error0;
@@ -1863,6 +1909,7 @@ restart:
if (XFS_IS_CORRUPT(args->mp,
args->agbno + args->len >
be32_to_cpu(agf->agf_length))) {
+ xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1924,7 +1971,7 @@ xfs_free_ag_extent(
/*
* Allocate and initialize a cursor for the by-block btree.
*/
- bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
+ bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
/*
* Look for a neighboring block on the left (lower block numbers)
* that is contiguous with this space.
@@ -1938,6 +1985,7 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1953,6 +2001,7 @@ xfs_free_ag_extent(
* Very bad.
*/
if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1971,6 +2020,7 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1986,6 +2036,7 @@ xfs_free_ag_extent(
* Very bad.
*/
if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1994,7 +2045,7 @@ xfs_free_ag_extent(
/*
* Now allocate and initialize a cursor for the by-size tree.
*/
- cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
/*
* Have both left and right contiguous neighbors.
* Merge all three into a single free block.
@@ -2006,12 +2057,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2021,12 +2074,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2036,6 +2091,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_delete(bno_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2045,6 +2101,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2064,6 +2121,7 @@ xfs_free_ag_extent(
i != 1 ||
xxbno != ltbno ||
xxlen != ltlen)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2088,12 +2146,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2104,6 +2164,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2123,12 +2184,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2151,6 +2214,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_insert(bno_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2163,12 +2227,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_insert(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2267,8 +2333,9 @@ xfs_alloc_min_freelist(
struct xfs_perag *pag)
{
/* AG btrees have at least 1 level. */
- static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
- const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
+ const unsigned int bno_level = pag ? pag->pagf_bno_level : 1;
+ const unsigned int cnt_level = pag ? pag->pagf_cnt_level : 1;
+ const unsigned int rmap_level = pag ? pag->pagf_rmap_level : 1;
unsigned int min_free;
ASSERT(mp->m_alloc_maxlevels > 0);
@@ -2295,16 +2362,12 @@ xfs_alloc_min_freelist(
*/
/* space needed by-bno freespace btree */
- min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
- mp->m_alloc_maxlevels) * 2 - 2;
+ min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
/* space needed by-size freespace btree */
- min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
- mp->m_alloc_maxlevels) * 2 - 2;
+ min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
/* space needed by the reverse mapping (used space) btree */
if (xfs_has_rmapbt(mp))
- min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
- mp->m_rmap_maxlevels) * 2 - 2;
-
+ min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2;
return min_free;
}
@@ -2691,13 +2754,14 @@ xfs_exact_minlen_extent_available(
xfs_extlen_t flen;
int error = 0;
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
- args->pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp,
+ args->pag);
error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
if (error)
goto out;
if (*stat == 0) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto out;
}
@@ -2987,8 +3051,8 @@ xfs_alloc_log_agf(
offsetof(xfs_agf_t, agf_versionnum),
offsetof(xfs_agf_t, agf_seqno),
offsetof(xfs_agf_t, agf_length),
- offsetof(xfs_agf_t, agf_roots[0]),
- offsetof(xfs_agf_t, agf_levels[0]),
+ offsetof(xfs_agf_t, agf_bno_root), /* also cnt/rmap root */
+ offsetof(xfs_agf_t, agf_bno_level), /* also cnt/rmap levels */
offsetof(xfs_agf_t, agf_flfirst),
offsetof(xfs_agf_t, agf_fllast),
offsetof(xfs_agf_t, agf_flcount),
@@ -3167,12 +3231,10 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_freeblks) > agf_length)
return __this_address;
- if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
- mp->m_alloc_maxlevels ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
- mp->m_alloc_maxlevels)
+ if (be32_to_cpu(agf->agf_bno_level) < 1 ||
+ be32_to_cpu(agf->agf_cnt_level) < 1 ||
+ be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels ||
+ be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels)
return __this_address;
if (xfs_has_lazysbcount(mp) &&
@@ -3183,9 +3245,8 @@ xfs_agf_verify(
if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length)
return __this_address;
- if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
- mp->m_rmap_maxlevels)
+ if (be32_to_cpu(agf->agf_rmap_level) < 1 ||
+ be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels)
return __this_address;
}
@@ -3268,6 +3329,8 @@ xfs_read_agf(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
if (error)
return error;
@@ -3309,12 +3372,9 @@ xfs_alloc_read_agf(
pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
pag->pagf_longest = be32_to_cpu(agf->agf_longest);
- pag->pagf_levels[XFS_BTNUM_BNOi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
- pag->pagf_levels[XFS_BTNUM_CNTi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
- pag->pagf_levels[XFS_BTNUM_RMAPi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
+ pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
+ pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
if (xfs_agfl_needs_reset(pag->pag_mount, agf))
set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
@@ -3343,10 +3403,8 @@ xfs_alloc_read_agf(
ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
- ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
- ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
+ ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level));
+ ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level));
}
#endif
if (agfbpp)
@@ -3895,17 +3953,23 @@ __xfs_free_extent(
return -EIO;
error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
- if (error)
+ if (error) {
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
return error;
+ }
+
agf = agbp->b_addr;
if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
error = -EFSCORRUPTED;
goto err_release;
}
/* validate the extent size is legal now we have the agf locked */
if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
error = -EFSCORRUPTED;
goto err_release;
}
@@ -3962,7 +4026,7 @@ xfs_alloc_query_range(
union xfs_btree_irec high_brec = { .a = *high_rec };
struct xfs_alloc_query_range_info query = { .priv = priv, .fn = fn };
- ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ ASSERT(xfs_btree_is_bno(cur->bc_ops));
return xfs_btree_query_range(cur, &low_brec, &high_brec,
xfs_alloc_query_range_helper, &query);
}
@@ -3976,7 +4040,7 @@ xfs_alloc_query_all(
{
struct xfs_alloc_query_range_info query;
- ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ ASSERT(xfs_btree_is_bno(cur->bc_ops));
query.priv = priv;
query.fn = fn;
return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index a7032bf0cd37..6ef5ddd89600 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -16,6 +16,7 @@
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_error.h"
+#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_ag.h"
@@ -23,13 +24,22 @@
static struct kmem_cache *xfs_allocbt_cur_cache;
STATIC struct xfs_btree_cur *
-xfs_allocbt_dup_cursor(
+xfs_bnobt_dup_cursor(
struct xfs_btree_cur *cur)
{
- return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum);
+ return xfs_bnobt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp,
+ cur->bc_ag.pag);
}
+STATIC struct xfs_btree_cur *
+xfs_cntbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_cntbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp,
+ cur->bc_ag.pag);
+}
+
+
STATIC void
xfs_allocbt_set_root(
struct xfs_btree_cur *cur,
@@ -38,13 +48,18 @@ xfs_allocbt_set_root(
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
- int btnum = cur->bc_btnum;
ASSERT(ptr->s != 0);
- agf->agf_roots[btnum] = ptr->s;
- be32_add_cpu(&agf->agf_levels[btnum], inc);
- cur->bc_ag.pag->pagf_levels[btnum] += inc;
+ if (xfs_btree_is_bno(cur->bc_ops)) {
+ agf->agf_bno_root = ptr->s;
+ be32_add_cpu(&agf->agf_bno_level, inc);
+ cur->bc_ag.pag->pagf_bno_level += inc;
+ } else {
+ agf->agf_cnt_root = ptr->s;
+ be32_add_cpu(&agf->agf_cnt_level, inc);
+ cur->bc_ag.pag->pagf_cnt_level += inc;
+ }
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
@@ -116,7 +131,7 @@ xfs_allocbt_update_lastrec(
__be32 len;
int numrecs;
- ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+ ASSERT(!xfs_btree_is_bno(cur->bc_ops));
switch (reason) {
case LASTREC_UPDATE:
@@ -226,7 +241,10 @@ xfs_allocbt_init_ptr_from_cur(
ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
- ptr->s = agf->agf_roots[cur->bc_btnum];
+ if (xfs_btree_is_bno(cur->bc_ops))
+ ptr->s = agf->agf_bno_root;
+ else
+ ptr->s = agf->agf_cnt_root;
}
STATIC int64_t
@@ -299,13 +317,12 @@ xfs_allocbt_verify(
struct xfs_perag *pag = bp->b_pag;
xfs_failaddr_t fa;
unsigned int level;
- xfs_btnum_t btnum = XFS_BTNUM_BNOi;
if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (xfs_has_crc(mp)) {
- fa = xfs_btree_sblock_v5hdr_verify(bp);
+ fa = xfs_btree_agblock_v5hdr_verify(bp);
if (fa)
return fa;
}
@@ -320,26 +337,32 @@ xfs_allocbt_verify(
* against.
*/
level = be16_to_cpu(block->bb_level);
- if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
- btnum = XFS_BTNUM_CNTi;
if (pag && xfs_perag_initialised_agf(pag)) {
- unsigned int maxlevel = pag->pagf_levels[btnum];
+ unsigned int maxlevel, repair_maxlevel = 0;
-#ifdef CONFIG_XFS_ONLINE_REPAIR
/*
* Online repair could be rewriting the free space btrees, so
* we'll validate against the larger of either tree while this
* is going on.
*/
- maxlevel = max_t(unsigned int, maxlevel,
- pag->pagf_repair_levels[btnum]);
+ if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) {
+ maxlevel = pag->pagf_cnt_level;
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ repair_maxlevel = pag->pagf_repair_cnt_level;
+#endif
+ } else {
+ maxlevel = pag->pagf_bno_level;
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ repair_maxlevel = pag->pagf_repair_bno_level;
#endif
- if (level >= maxlevel)
+ }
+
+ if (level >= max(maxlevel, repair_maxlevel))
return __this_address;
} else if (level >= mp->m_alloc_maxlevels)
return __this_address;
- return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
+ return xfs_btree_agblock_verify(bp, mp->m_alloc_mxr[level != 0]);
}
static void
@@ -348,7 +371,7 @@ xfs_allocbt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_sblock_verify_crc(bp))
+ if (!xfs_btree_agblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_allocbt_verify(bp);
@@ -372,7 +395,7 @@ xfs_allocbt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_sblock_calc_crc(bp);
+ xfs_btree_agblock_calc_crc(bp);
}
@@ -454,11 +477,19 @@ xfs_allocbt_keys_contiguous(
be32_to_cpu(key2->alloc.ar_startblock));
}
-static const struct xfs_btree_ops xfs_bnobt_ops = {
+const struct xfs_btree_ops xfs_bnobt_ops = {
+ .name = "bno",
+ .type = XFS_BTREE_TYPE_AG,
+
.rec_len = sizeof(xfs_alloc_rec_t),
.key_len = sizeof(xfs_alloc_key_t),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
- .dup_cursor = xfs_allocbt_dup_cursor,
+ .lru_refs = XFS_ALLOC_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_abtb_2),
+ .sick_mask = XFS_SICK_AG_BNOBT,
+
+ .dup_cursor = xfs_bnobt_dup_cursor,
.set_root = xfs_allocbt_set_root,
.alloc_block = xfs_allocbt_alloc_block,
.free_block = xfs_allocbt_free_block,
@@ -477,11 +508,20 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
.keys_contiguous = xfs_allocbt_keys_contiguous,
};
-static const struct xfs_btree_ops xfs_cntbt_ops = {
+const struct xfs_btree_ops xfs_cntbt_ops = {
+ .name = "cnt",
+ .type = XFS_BTREE_TYPE_AG,
+ .geom_flags = XFS_BTGEO_LASTREC_UPDATE,
+
.rec_len = sizeof(xfs_alloc_rec_t),
.key_len = sizeof(xfs_alloc_key_t),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
+
+ .lru_refs = XFS_ALLOC_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_abtc_2),
+ .sick_mask = XFS_SICK_AG_CNTBT,
- .dup_cursor = xfs_allocbt_dup_cursor,
+ .dup_cursor = xfs_cntbt_dup_cursor,
.set_root = xfs_allocbt_set_root,
.alloc_block = xfs_allocbt_alloc_block,
.free_block = xfs_allocbt_free_block,
@@ -500,76 +540,55 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
.keys_contiguous = NULL, /* not needed right now */
};
-/* Allocate most of a new allocation btree cursor. */
-STATIC struct xfs_btree_cur *
-xfs_allocbt_init_common(
+/*
+ * Allocate a new bnobt cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
+ */
+struct xfs_btree_cur *
+xfs_bnobt_init_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
- struct xfs_perag *pag,
- xfs_btnum_t btnum)
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
- ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
-
- cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels,
- xfs_allocbt_cur_cache);
- cur->bc_ag.abt.active = false;
-
- if (btnum == XFS_BTNUM_CNT) {
- cur->bc_ops = &xfs_cntbt_ops;
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
- cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
- } else {
- cur->bc_ops = &xfs_bnobt_ops;
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
- }
-
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bnobt_ops,
+ mp->m_alloc_maxlevels, xfs_allocbt_cur_cache);
cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_ag.agbp = agbp;
+ if (agbp) {
+ struct xfs_agf *agf = agbp->b_addr;
- if (xfs_has_crc(mp))
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
+ cur->bc_nlevels = be32_to_cpu(agf->agf_bno_level);
+ }
return cur;
}
/*
- * Allocate a new allocation btree cursor.
+ * Allocate a new cntbt cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
*/
-struct xfs_btree_cur * /* new alloc btree cursor */
-xfs_allocbt_init_cursor(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer for agf structure */
- struct xfs_perag *pag,
- xfs_btnum_t btnum) /* btree identifier */
-{
- struct xfs_agf *agf = agbp->b_addr;
- struct xfs_btree_cur *cur;
-
- cur = xfs_allocbt_init_common(mp, tp, pag, btnum);
- if (btnum == XFS_BTNUM_CNT)
- cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
- else
- cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
-
- cur->bc_ag.agbp = agbp;
-
- return cur;
-}
-
-/* Create a free space btree cursor with a fake root for staging. */
struct xfs_btree_cur *
-xfs_allocbt_stage_cursor(
+xfs_cntbt_init_cursor(
struct xfs_mount *mp,
- struct xbtree_afakeroot *afake,
- struct xfs_perag *pag,
- xfs_btnum_t btnum)
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
- cur = xfs_allocbt_init_common(mp, NULL, pag, btnum);
- xfs_btree_stage_afakeroot(cur, afake);
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_cntbt_ops,
+ mp->m_alloc_maxlevels, xfs_allocbt_cur_cache);
+ cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_ag.agbp = agbp;
+ if (agbp) {
+ struct xfs_agf *agf = agbp->b_addr;
+
+ cur->bc_nlevels = be32_to_cpu(agf->agf_cnt_level);
+ }
return cur;
}
@@ -588,16 +607,16 @@ xfs_allocbt_commit_staged_btree(
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
- agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
- agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
- xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
-
- if (cur->bc_btnum == XFS_BTNUM_BNO) {
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops);
+ if (xfs_btree_is_bno(cur->bc_ops)) {
+ agf->agf_bno_root = cpu_to_be32(afake->af_root);
+ agf->agf_bno_level = cpu_to_be32(afake->af_levels);
} else {
- cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE;
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops);
+ agf->agf_cnt_root = cpu_to_be32(afake->af_root);
+ agf->agf_cnt_level = cpu_to_be32(afake->af_levels);
}
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
}
/* Calculate number of records in an alloc btree block. */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 45df893ef6bb..155b47f231ab 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -47,12 +47,12 @@ struct xbtree_afakeroot;
(maxrecs) * sizeof(xfs_alloc_key_t) + \
((index) - 1) * sizeof(xfs_alloc_ptr_t)))
-extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp,
+struct xfs_btree_cur *xfs_bnobt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_perag *pag, xfs_btnum_t btnum);
-struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp,
- struct xbtree_afakeroot *afake, struct xfs_perag *pag,
- xfs_btnum_t btnum);
+ struct xfs_perag *pag);
+struct xfs_btree_cur *xfs_cntbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_perag *pag);
extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index e965a48e7db9..673a4b6d2e8d 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -224,7 +224,7 @@ int
xfs_attr_get_ilocked(
struct xfs_da_args *args)
{
- ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
if (!xfs_inode_hasattr(args->dp))
return -ENOATTR;
@@ -891,7 +891,8 @@ xfs_attr_defer_add(
struct xfs_attr_intent *new;
- new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ new = kmem_cache_zalloc(xfs_attr_intent_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
new->xattri_op_flags = op_flags;
new->xattri_da_args = args;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 6374bf107242..ac904cc1a97b 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -29,6 +29,7 @@
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_errortag.h"
+#include "xfs_health.h"
/*
@@ -879,8 +880,7 @@ xfs_attr_shortform_to_leaf(
trace_xfs_attr_sf_to_leaf(args);
- tmpbuffer = kmem_alloc(size, 0);
- ASSERT(tmpbuffer != NULL);
+ tmpbuffer = kmalloc(size, GFP_KERNEL | __GFP_NOFAIL);
memcpy(tmpbuffer, ifp->if_data, size);
sf = (struct xfs_attr_sf_hdr *)tmpbuffer;
@@ -924,7 +924,7 @@ xfs_attr_shortform_to_leaf(
}
error = 0;
out:
- kmem_free(tmpbuffer);
+ kfree(tmpbuffer);
return error;
}
@@ -1059,7 +1059,7 @@ xfs_attr3_leaf_to_shortform(
trace_xfs_attr_leaf_to_sf(args);
- tmpbuffer = kmem_alloc(args->geo->blksize, 0);
+ tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
if (!tmpbuffer)
return -ENOMEM;
@@ -1125,7 +1125,7 @@ xfs_attr3_leaf_to_shortform(
error = 0;
out:
- kmem_free(tmpbuffer);
+ kfree(tmpbuffer);
return error;
}
@@ -1533,7 +1533,7 @@ xfs_attr3_leaf_compact(
trace_xfs_attr_leaf_compact(args);
- tmpbuffer = kmem_alloc(args->geo->blksize, 0);
+ tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
memset(bp->b_addr, 0, args->geo->blksize);
leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -1571,7 +1571,7 @@ xfs_attr3_leaf_compact(
*/
xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
- kmem_free(tmpbuffer);
+ kfree(tmpbuffer);
}
/*
@@ -2250,7 +2250,8 @@ xfs_attr3_leaf_unbalance(
struct xfs_attr_leafblock *tmp_leaf;
struct xfs_attr3_icleaf_hdr tmphdr;
- tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0);
+ tmp_leaf = kzalloc(state->args->geo->blksize,
+ GFP_KERNEL | __GFP_NOFAIL);
/*
* Copy the header into the temp leaf so that all the stuff
@@ -2290,7 +2291,7 @@ xfs_attr3_leaf_unbalance(
}
memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
savehdr = tmphdr; /* struct copy */
- kmem_free(tmp_leaf);
+ kfree(tmp_leaf);
}
xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
@@ -2343,6 +2344,7 @@ xfs_attr3_leaf_lookup_int(
entries = xfs_attr3_leaf_entryp(leaf);
if (ichdr.count >= args->geo->blksize / 8) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -2362,10 +2364,12 @@ xfs_attr3_leaf_lookup_int(
}
if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d440393b40eb..ff0412828772 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -22,6 +22,7 @@
#include "xfs_attr_remote.h"
#include "xfs_trace.h"
#include "xfs_error.h"
+#include "xfs_health.h"
#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
@@ -276,17 +277,18 @@ xfs_attr3_rmt_hdr_set(
*/
STATIC int
xfs_attr_rmtval_copyout(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- xfs_ino_t ino,
- int *offset,
- int *valuelen,
- uint8_t **dst)
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct xfs_inode *dp,
+ int *offset,
+ int *valuelen,
+ uint8_t **dst)
{
- char *src = bp->b_addr;
- xfs_daddr_t bno = xfs_buf_daddr(bp);
- int len = BBTOB(bp->b_length);
- int blksize = mp->m_attr_geo->blksize;
+ char *src = bp->b_addr;
+ xfs_ino_t ino = dp->i_ino;
+ xfs_daddr_t bno = xfs_buf_daddr(bp);
+ int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
ASSERT(len >= blksize);
@@ -302,6 +304,7 @@ xfs_attr_rmtval_copyout(
xfs_alert(mp,
"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
bno, *offset, byte_cnt, ino);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
@@ -418,10 +421,12 @@ xfs_attr_rmtval_get(
dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt,
0, &bp, &xfs_attr3_rmt_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_dirattr_mark_sick(args->dp, XFS_ATTR_FORK);
if (error)
return error;
- error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+ error = xfs_attr_rmtval_copyout(mp, bp, args->dp,
&offset, &valuelen,
&dst);
xfs_buf_relse(bp);
@@ -545,11 +550,13 @@ xfs_attr_rmtval_stale(
struct xfs_buf *bp;
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) ||
- XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK))
+ XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) {
+ xfs_bmap_mark_sick(ip, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
error = xfs_buf_incore(mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map->br_startblock),
@@ -659,8 +666,10 @@ xfs_attr_rmtval_invalidate(
blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
if (error)
return error;
- if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1))
+ if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) {
+ xfs_bmap_mark_sick(args->dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index f362345467fa..656c95a22f2e 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -36,6 +36,9 @@
#include "xfs_refcount.h"
#include "xfs_icache.h"
#include "xfs_iomap.h"
+#include "xfs_health.h"
+#include "xfs_bmap_item.h"
+#include "xfs_symlink_remote.h"
struct kmem_cache *xfs_bmap_intent_cache;
@@ -225,6 +228,28 @@ xfs_bmap_forkoff_reset(
}
}
+static int
+xfs_bmap_read_buf(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t fsbno, /* file system block number */
+ struct xfs_buf **bpp) /* buffer for fsbno */
+{
+ struct xfs_buf *bp; /* return value */
+ int error;
+
+ if (!xfs_verify_fsbno(mp, fsbno))
+ return -EFSCORRUPTED;
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, fsbno), mp->m_bsize, 0, &bp,
+ &xfs_bmbt_buf_ops);
+ if (!error) {
+ xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
+ *bpp = bp;
+ }
+ return error;
+}
+
#ifdef DEBUG
STATIC struct xfs_buf *
xfs_bmap_get_bp(
@@ -364,9 +389,9 @@ xfs_bmap_check_leaf_extents(
bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
if (!bp) {
bp_release = 1;
- error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
+ error = xfs_bmap_read_buf(mp, NULL, bno, &bp);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(cur);
if (error)
goto error_norelse;
}
@@ -383,6 +408,7 @@ xfs_bmap_check_leaf_extents(
pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
bno = be64_to_cpu(*pp);
if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -450,9 +476,9 @@ xfs_bmap_check_leaf_extents(
bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
if (!bp) {
bp_release = 1;
- error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
+ error = xfs_bmap_read_buf(mp, NULL, bno, &bp);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(cur);
if (error)
goto error_norelse;
}
@@ -562,11 +588,14 @@ xfs_bmap_btree_to_extents(
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
cbno = be64_to_cpu(*pp);
#ifdef DEBUG
- if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1)))
+ if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_verify_fsbno(mp, cbno))) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
#endif
- error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
+ error = xfs_bmap_read_buf(mp, tp, cbno, &cbp);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(cur);
if (error)
return error;
cblock = XFS_BUF_TO_BLOCK(cbp);
@@ -634,14 +663,13 @@ xfs_bmap_extents_to_btree(
* Fill in the root.
*/
block = ifp->if_broot;
- xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
- XFS_BTNUM_BMAP, 1, 1, ip->i_ino,
- XFS_BTREE_LONG_PTRS);
+ xfs_bmbt_init_block(ip, block, NULL, 1, 1);
/*
* Need a cursor. Can't allocate until bb_level is filled in.
*/
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
+ if (wasdel)
+ cur->bc_flags |= XFS_BTREE_BMBT_WASDEL;
/*
* Convert to a btree with two levels, one record in root.
*/
@@ -667,7 +695,7 @@ xfs_bmap_extents_to_btree(
goto out_root_realloc;
}
- cur->bc_ino.allocated++;
+ cur->bc_bmap.allocated++;
ip->i_nblocks++;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
@@ -679,11 +707,8 @@ xfs_bmap_extents_to_btree(
/*
* Fill in the child block.
*/
- abp->b_ops = &xfs_bmbt_buf_ops;
ablock = XFS_BUF_TO_BLOCK(abp);
- xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp),
- XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
- XFS_BTREE_LONG_PTRS);
+ xfs_bmbt_init_block(ip, ablock, abp, 0, 0);
for_each_xfs_iext(ifp, &icur, &rec) {
if (isnullstartblock(rec.br_startblock))
@@ -878,6 +903,7 @@ xfs_bmap_add_attrfork_btree(
goto error0;
/* must be at least one entry */
if (XFS_IS_CORRUPT(mp, stat != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -887,7 +913,7 @@ xfs_bmap_add_attrfork_btree(
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return -ENOSPC;
}
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
}
return 0;
@@ -915,7 +941,7 @@ xfs_bmap_add_attrfork_extents(
error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags,
XFS_DATA_FORK);
if (cur) {
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
return error;
@@ -960,6 +986,7 @@ xfs_bmap_add_attrfork_local(
/* should only be called for types that support local format data */
ASSERT(0);
+ xfs_bmap_mark_sick(ip, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
@@ -1143,6 +1170,7 @@ xfs_iread_bmbt_block(
(unsigned long long)ip->i_ino);
xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block,
sizeof(*block), __this_address);
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -1158,6 +1186,7 @@ xfs_iread_bmbt_block(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iread_extents(2)", frp,
sizeof(*frp), fa);
+ xfs_bmap_mark_sick(ip, whichfork);
return xfs_bmap_complain_bad_rec(ip, whichfork, fa,
&new);
}
@@ -1189,7 +1218,7 @@ xfs_iread_extents(
if (!xfs_need_iread_extents(ifp))
return 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ir.loaded = 0;
xfs_iext_first(ifp, &ir.icur);
@@ -1201,6 +1230,7 @@ xfs_iread_extents(
goto out;
if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) {
+ xfs_bmap_mark_sick(ip, whichfork);
error = -EFSCORRUPTED;
goto out;
}
@@ -1213,6 +1243,8 @@ xfs_iread_extents(
smp_store_release(&ifp->if_needextents, 0);
return 0;
out:
+ if (xfs_metadata_is_sick(error))
+ xfs_bmap_mark_sick(ip, whichfork);
xfs_iext_destroy(ifp);
return error;
}
@@ -1292,6 +1324,7 @@ xfs_bmap_last_before(
break;
default:
ASSERT(0);
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -1388,8 +1421,10 @@ xfs_bmap_last_offset(
if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
return 0;
- if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp)))
+ if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
+ }
error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
if (error || is_empty)
@@ -1429,8 +1464,7 @@ xfs_bmap_add_extent_delay_real(
ASSERT(whichfork != XFS_ATTR_FORK);
ASSERT(!isnullstartblock(new->br_startblock));
- ASSERT(!bma->cur ||
- (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
+ ASSERT(!bma->cur || (bma->cur->bc_flags & XFS_BTREE_BMBT_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -1528,6 +1562,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1535,6 +1570,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1542,6 +1578,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1571,6 +1608,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1604,6 +1642,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1632,6 +1671,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1639,6 +1679,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1673,6 +1714,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1698,6 +1740,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1705,6 +1748,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1721,7 +1765,7 @@ xfs_bmap_add_extent_delay_real(
temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
- (bma->cur ? bma->cur->bc_ino.allocated : 0));
+ (bma->cur ? bma->cur->bc_bmap.allocated : 0));
PREV.br_startoff = new_endoff;
PREV.br_blockcount = temp;
@@ -1749,6 +1793,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1785,6 +1830,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1792,6 +1838,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1808,7 +1855,7 @@ xfs_bmap_add_extent_delay_real(
temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
- (bma->cur ? bma->cur->bc_ino.allocated : 0));
+ (bma->cur ? bma->cur->bc_bmap.allocated : 0));
PREV.br_startblock = nullstartblock(da_new);
PREV.br_blockcount = temp;
@@ -1871,6 +1918,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1878,6 +1926,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1929,8 +1978,8 @@ xfs_bmap_add_extent_delay_real(
xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
if (bma->cur) {
- da_new += bma->cur->bc_ino.allocated;
- bma->cur->bc_ino.allocated = 0;
+ da_new += bma->cur->bc_bmap.allocated;
+ bma->cur->bc_bmap.allocated = 0;
}
/* adjust for changes in reserved delayed indirect blocks */
@@ -2074,30 +2123,35 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2126,18 +2180,21 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2169,18 +2226,21 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2207,6 +2267,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2240,6 +2301,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2277,6 +2339,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2287,6 +2350,7 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_insert(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2317,6 +2381,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2353,6 +2418,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2363,12 +2429,14 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_insert(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2405,6 +2473,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2417,6 +2486,7 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_insert(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2429,6 +2499,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2436,6 +2507,7 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_insert(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2472,7 +2544,7 @@ xfs_bmap_add_extent_unwritten_real(
/* clear out the allocated field, done with it now in any case. */
if (cur) {
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
*curp = cur;
}
@@ -2651,7 +2723,7 @@ xfs_bmap_add_extent_hole_real(
struct xfs_bmbt_irec old;
ASSERT(!isnullstartblock(new->br_startblock));
- ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
+ ASSERT(!cur || !(cur->bc_flags & XFS_BTREE_BMBT_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -2721,6 +2793,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2728,6 +2801,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2735,6 +2809,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2764,6 +2839,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2794,6 +2870,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2820,6 +2897,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2827,6 +2905,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2853,7 +2932,7 @@ xfs_bmap_add_extent_hole_real(
/* clear out the allocated field, done with it now in any case. */
if (cur)
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_bmap_check_leaf_extents(cur, ip, whichfork);
done:
@@ -3898,14 +3977,18 @@ xfs_bmapi_read(
ASSERT(*nmap >= 1);
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE)));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
- if (WARN_ON_ONCE(!ifp))
+ if (WARN_ON_ONCE(!ifp)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
+ }
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT))
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
+ }
if (xfs_is_shutdown(mp))
return -EIO;
@@ -4160,9 +4243,8 @@ xfs_bmapi_allocate(
*/
bma->nallocs++;
- if (bma->cur)
- bma->cur->bc_ino.flags =
- bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
+ if (bma->cur && bma->wasdel)
+ bma->cur->bc_flags |= XFS_BTREE_BMBT_WASDEL;
bma->got.br_startoff = bma->offset;
bma->got.br_startblock = bma->blkno;
@@ -4369,7 +4451,7 @@ xfs_bmapi_write(
ASSERT(tp != NULL);
ASSERT(len > 0);
ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(!(flags & XFS_BMAPI_REMAP));
/* zeroing is for currently only for data extents, not metadata */
@@ -4386,6 +4468,7 @@ xfs_bmapi_write(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -4613,9 +4696,11 @@ xfs_bmapi_convert_delalloc(
error = -ENOSPC;
if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
goto out_finish;
- error = -EFSCORRUPTED;
- if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
+ if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) {
+ xfs_bmap_mark_sick(ip, whichfork);
+ error = -EFSCORRUPTED;
goto out_finish;
+ }
XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
XFS_STATS_INC(mp, xs_xstrat_quick);
@@ -4666,7 +4751,7 @@ xfs_bmapi_remap(
ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(len > 0);
ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
XFS_BMAPI_NORMAP)));
ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) !=
@@ -4674,6 +4759,7 @@ xfs_bmapi_remap(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -4693,10 +4779,8 @@ xfs_bmapi_remap(
ip->i_nblocks += len;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE)
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
- }
got.br_startoff = bno;
got.br_startblock = startblock;
@@ -4831,7 +4915,7 @@ xfs_bmap_del_extent_delay(
XFS_STATS_INC(mp, xs_del_exlist);
- isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+ isrt = xfs_ifork_is_realtime(ip, whichfork);
del_endoff = del->br_startoff + del->br_blockcount;
got_endoff = got->br_startoff + got->br_blockcount;
da_old = startblockval(got->br_startblock);
@@ -5067,7 +5151,7 @@ xfs_bmap_del_extent_real(
return -ENOSPC;
*logflagsp = XFS_ILOG_CORE;
- if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+ if (xfs_ifork_is_realtime(ip, whichfork)) {
if (!(bflags & XFS_BMAPI_REMAP)) {
error = xfs_rtfree_blocks(tp, del->br_startblock,
del->br_blockcount);
@@ -5088,8 +5172,10 @@ xfs_bmap_del_extent_real(
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
}
if (got.br_startoff == del->br_startoff)
@@ -5113,8 +5199,10 @@ xfs_bmap_del_extent_real(
}
if ((error = xfs_btree_delete(cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
break;
case BMAP_LEFT_FILLING:
/*
@@ -5186,8 +5274,10 @@ xfs_bmap_del_extent_real(
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
/*
* Update the btree record back
* to the original value.
@@ -5203,8 +5293,10 @@ xfs_bmap_del_extent_real(
*logflagsp = 0;
return -ENOSPC;
}
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
} else
*logflagsp |= xfs_ilog_fext(whichfork);
@@ -5286,12 +5378,14 @@ __xfs_bunmapi(
whichfork = xfs_bmapi_whichfork(flags);
ASSERT(whichfork != XFS_COW_FORK);
ifp = xfs_ifork_ptr(ip, whichfork);
- if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)))
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
+ }
if (xfs_is_shutdown(mp))
return -EIO;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(len > 0);
ASSERT(nexts >= 0);
@@ -5304,7 +5398,7 @@ __xfs_bunmapi(
return 0;
}
XFS_STATS_INC(mp, xs_blk_unmap);
- isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+ isrt = xfs_ifork_is_realtime(ip, whichfork);
end = start + len;
if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) {
@@ -5317,7 +5411,6 @@ __xfs_bunmapi(
if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
} else
cur = NULL;
@@ -5367,7 +5460,7 @@ __xfs_bunmapi(
if (del.br_startoff + del.br_blockcount > end + 1)
del.br_blockcount = end + 1 - del.br_startoff;
- if (!isrt)
+ if (!isrt || (flags & XFS_BMAPI_REMAP))
goto delete;
mod = xfs_rtb_to_rtxoff(mp,
@@ -5385,7 +5478,7 @@ __xfs_bunmapi(
* This piece is unwritten, or we're not
* using unwritten extents. Skip over it.
*/
- ASSERT(end >= mod);
+ ASSERT((flags & XFS_BMAPI_REMAP) || end >= mod);
end -= mod > del.br_blockcount ?
del.br_blockcount : mod;
if (end < got.br_startoff &&
@@ -5555,7 +5648,7 @@ error0:
xfs_trans_log_inode(tp, ip, logflags);
if (cur) {
if (!error)
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
return error;
@@ -5635,8 +5728,7 @@ xfs_bmse_merge(
blockcount = left->br_blockcount + got->br_blockcount;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
ASSERT(xfs_bmse_can_merge(left, got, shift));
new = *left;
@@ -5657,21 +5749,27 @@ xfs_bmse_merge(
error = xfs_bmbt_lookup_eq(cur, got, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_btree_delete(cur, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
/* lookup and update size of the previous extent */
error = xfs_bmbt_lookup_eq(cur, left, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_bmbt_update(cur, &new);
if (error)
@@ -5719,8 +5817,10 @@ xfs_bmap_shift_update_extent(
error = xfs_bmbt_lookup_eq(cur, &prev, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_bmbt_update(cur, got);
if (error)
@@ -5758,28 +5858,28 @@ xfs_bmap_collapse_extents(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
if (xfs_is_shutdown(mp))
return -EIO;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
- if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE)
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
- }
if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
*done = true;
goto del_cursor;
}
if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+ xfs_bmap_mark_sick(ip, whichfork);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -5837,7 +5937,7 @@ xfs_bmap_can_insert_extents(
int is_empty;
int error = 0;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
if (xfs_is_shutdown(ip->i_mount))
return -EIO;
@@ -5873,22 +5973,21 @@ xfs_bmap_insert_extents(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
if (xfs_is_shutdown(mp))
return -EIO;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
- if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE)
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
- }
if (*next_fsb == NULLFSBLOCK) {
xfs_iext_last(ifp, &icur);
@@ -5904,11 +6003,13 @@ xfs_bmap_insert_extents(
}
}
if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+ xfs_bmap_mark_sick(ip, whichfork);
error = -EFSCORRUPTED;
goto del_cursor;
}
if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) {
+ xfs_bmap_mark_sick(ip, whichfork);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -5976,6 +6077,7 @@ xfs_bmap_split_extent(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -6002,11 +6104,11 @@ xfs_bmap_split_extent(
if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
goto del_cursor;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -6034,6 +6136,7 @@ xfs_bmap_split_extent(
if (error)
goto del_cursor;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -6041,6 +6144,7 @@ xfs_bmap_split_extent(
if (error)
goto del_cursor;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -6060,7 +6164,7 @@ xfs_bmap_split_extent(
del_cursor:
if (cur) {
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
@@ -6069,17 +6173,8 @@ del_cursor:
return error;
}
-/* Deferred mapping is only for real extents in the data fork. */
-static bool
-xfs_bmap_is_update_needed(
- struct xfs_bmbt_irec *bmap)
-{
- return bmap->br_startblock != HOLESTARTBLOCK &&
- bmap->br_startblock != DELAYSTARTBLOCK;
-}
-
/* Record a bmap intent. */
-static int
+static inline void
__xfs_bmap_add(
struct xfs_trans *tp,
enum xfs_bmap_intent_type type,
@@ -6089,25 +6184,19 @@ __xfs_bmap_add(
{
struct xfs_bmap_intent *bi;
- trace_xfs_bmap_defer(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
- type,
- XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
- ip->i_ino, whichfork,
- bmap->br_startoff,
- bmap->br_blockcount,
- bmap->br_state);
+ if ((whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) ||
+ bmap->br_startblock == HOLESTARTBLOCK ||
+ bmap->br_startblock == DELAYSTARTBLOCK)
+ return;
- bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&bi->bi_list);
bi->bi_type = type;
bi->bi_owner = ip;
bi->bi_whichfork = whichfork;
bi->bi_bmap = *bmap;
- xfs_bmap_update_get_group(tp->t_mountp, bi);
- xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type);
- return 0;
+ xfs_bmap_defer_add(tp, bi);
}
/* Map an extent into a file. */
@@ -6115,12 +6204,10 @@ void
xfs_bmap_map_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
+ int whichfork,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_bmap_is_update_needed(PREV))
- return;
-
- __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
+ __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, whichfork, PREV);
}
/* Unmap an extent out of a file. */
@@ -6128,12 +6215,10 @@ void
xfs_bmap_unmap_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
+ int whichfork,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_bmap_is_update_needed(PREV))
- return;
-
- __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
+ __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, whichfork, PREV);
}
/*
@@ -6147,36 +6232,35 @@ xfs_bmap_finish_one(
{
struct xfs_bmbt_irec *bmap = &bi->bi_bmap;
int error = 0;
+ int flags = 0;
- ASSERT(tp->t_highest_agno == NULLAGNUMBER);
+ if (bi->bi_whichfork == XFS_ATTR_FORK)
+ flags |= XFS_BMAPI_ATTRFORK;
- trace_xfs_bmap_deferred(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
- bi->bi_type,
- XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
- bi->bi_owner->i_ino, bi->bi_whichfork,
- bmap->br_startoff, bmap->br_blockcount,
- bmap->br_state);
+ ASSERT(tp->t_highest_agno == NULLAGNUMBER);
- if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK))
- return -EFSCORRUPTED;
+ trace_xfs_bmap_deferred(bi);
- if (XFS_TEST_ERROR(false, tp->t_mountp,
- XFS_ERRTAG_BMAP_FINISH_ONE))
+ if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE))
return -EIO;
switch (bi->bi_type) {
case XFS_BMAP_MAP:
+ if (bi->bi_bmap.br_state == XFS_EXT_UNWRITTEN)
+ flags |= XFS_BMAPI_PREALLOC;
error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff,
- bmap->br_blockcount, bmap->br_startblock, 0);
+ bmap->br_blockcount, bmap->br_startblock,
+ flags);
bmap->br_blockcount = 0;
break;
case XFS_BMAP_UNMAP:
error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff,
- &bmap->br_blockcount, XFS_BMAPI_REMAP, 1);
+ &bmap->br_blockcount, flags | XFS_BMAPI_REMAP,
+ 1);
break;
default:
ASSERT(0);
+ xfs_bmap_mark_sick(bi->bi_owner, bi->bi_whichfork);
error = -EFSCORRUPTED;
}
@@ -6257,7 +6341,7 @@ xfs_bunmapi_range(
xfs_filblks_t unmap_len = endoff - startoff + 1;
int error = 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
while (unmap_len > 0) {
ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
@@ -6274,3 +6358,46 @@ xfs_bunmapi_range(
out:
return error;
}
+
+struct xfs_bmap_query_range {
+ xfs_bmap_query_range_fn fn;
+ void *priv;
+};
+
+/* Decode and validate a bmbt record, then pass it to our callback. */
+STATIC int
+xfs_bmap_query_range_helper(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_bmap_query_range *query = priv;
+ struct xfs_bmbt_irec irec;
+ xfs_failaddr_t fa;
+
+ xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
+ fa = xfs_bmap_validate_extent(cur->bc_ino.ip, cur->bc_ino.whichfork,
+ &irec);
+ if (fa) {
+ xfs_btree_mark_sick(cur);
+ return xfs_bmap_complain_bad_rec(cur->bc_ino.ip,
+ cur->bc_ino.whichfork, fa, &irec);
+ }
+
+ return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all bmaps. */
+int
+xfs_bmap_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_bmap_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_bmap_query_range query = {
+ .priv = priv,
+ .fn = fn,
+ };
+
+ return xfs_btree_query_all(cur, xfs_bmap_query_range_helper, &query);
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index f6b73f1bad5f..f7662595309d 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -232,6 +232,10 @@ enum xfs_bmap_intent_type {
XFS_BMAP_UNMAP,
};
+#define XFS_BMAP_INTENT_STRINGS \
+ { XFS_BMAP_MAP, "map" }, \
+ { XFS_BMAP_UNMAP, "unmap" }
+
struct xfs_bmap_intent {
struct list_head bi_list;
enum xfs_bmap_intent_type bi_type;
@@ -241,14 +245,11 @@ struct xfs_bmap_intent {
struct xfs_bmbt_irec bi_bmap;
};
-void xfs_bmap_update_get_group(struct xfs_mount *mp,
- struct xfs_bmap_intent *bi);
-
int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap);
+ int whichfork, struct xfs_bmbt_irec *imap);
void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap);
+ int whichfork, struct xfs_bmbt_irec *imap);
static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
{
@@ -280,4 +281,12 @@ extern struct kmem_cache *xfs_bmap_intent_cache;
int __init xfs_bmap_intent_init_cache(void);
void xfs_bmap_intent_destroy_cache(void);
+typedef int (*xfs_bmap_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *rec,
+ void *priv);
+
+int xfs_bmap_query_all(struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn,
+ void *priv);
+
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 71f2d50f7823..f5d84dcb58da 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -26,6 +26,22 @@
static struct kmem_cache *xfs_bmbt_cur_cache;
+void
+xfs_bmbt_init_block(
+ struct xfs_inode *ip,
+ struct xfs_btree_block *buf,
+ struct xfs_buf *bp,
+ __u16 level,
+ __u16 numrecs)
+{
+ if (bp)
+ xfs_btree_init_buf(ip->i_mount, bp, &xfs_bmbt_ops, level,
+ numrecs, ip->i_ino);
+ else
+ xfs_btree_init_block(ip->i_mount, buf, &xfs_bmbt_ops, level,
+ numrecs, ip->i_ino);
+}
+
/*
* Convert on-disk form of btree root to in-memory form.
*/
@@ -44,9 +60,7 @@ xfs_bmdr_to_bmbt(
xfs_bmbt_key_t *tkp;
__be64 *tpp;
- xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
- XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
- XFS_BTREE_LONG_PTRS);
+ xfs_bmbt_init_block(ip, rblock, NULL, 0, 0);
rblock->bb_level = dblock->bb_level;
ASSERT(be16_to_cpu(rblock->bb_level) > 0);
rblock->bb_numrecs = dblock->bb_numrecs;
@@ -171,13 +185,8 @@ xfs_bmbt_dup_cursor(
new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
cur->bc_ino.ip, cur->bc_ino.whichfork);
-
- /*
- * Copy the firstblock, dfops, and flags values,
- * since init cursor doesn't get them.
- */
- new->bc_ino.flags = cur->bc_ino.flags;
-
+ new->bc_flags |= (cur->bc_flags &
+ (XFS_BTREE_BMBT_INVALID_OWNER | XFS_BTREE_BMBT_WASDEL));
return new;
}
@@ -189,10 +198,10 @@ xfs_bmbt_update_cursor(
ASSERT((dst->bc_tp->t_highest_agno != NULLAGNUMBER) ||
(dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME));
- dst->bc_ino.allocated += src->bc_ino.allocated;
+ dst->bc_bmap.allocated += src->bc_bmap.allocated;
dst->bc_tp->t_highest_agno = src->bc_tp->t_highest_agno;
- src->bc_ino.allocated = 0;
+ src->bc_bmap.allocated = 0;
}
STATIC int
@@ -211,7 +220,7 @@ xfs_bmbt_alloc_block(
xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino,
cur->bc_ino.whichfork);
args.minlen = args.maxlen = args.prod = 1;
- args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL;
+ args.wasdel = cur->bc_flags & XFS_BTREE_BMBT_WASDEL;
if (!args.wasdel && args.tp->t_blk_res == 0)
return -ENOSPC;
@@ -247,7 +256,7 @@ xfs_bmbt_alloc_block(
}
ASSERT(args.len == 1);
- cur->bc_ino.allocated++;
+ cur->bc_bmap.allocated++;
cur->bc_ino.ip->i_nblocks++;
xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE);
xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip,
@@ -360,14 +369,6 @@ xfs_bmbt_init_rec_from_cur(
xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
}
-STATIC void
-xfs_bmbt_init_ptr_from_cur(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr)
-{
- ptr->l = 0;
-}
-
STATIC int64_t
xfs_bmbt_key_diff(
struct xfs_btree_cur *cur,
@@ -419,7 +420,7 @@ xfs_bmbt_verify(
* XXX: need a better way of verifying the owner here. Right now
* just make sure there has been one set.
*/
- fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
if (fa)
return fa;
}
@@ -435,7 +436,7 @@ xfs_bmbt_verify(
if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
return __this_address;
- return xfs_btree_lblock_verify(bp, mp->m_bmap_dmxr[level != 0]);
+ return xfs_btree_fsblock_verify(bp, mp->m_bmap_dmxr[level != 0]);
}
static void
@@ -444,7 +445,7 @@ xfs_bmbt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_lblock_verify_crc(bp))
+ if (!xfs_btree_fsblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_bmbt_verify(bp);
@@ -468,7 +469,7 @@ xfs_bmbt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_lblock_calc_crc(bp);
+ xfs_btree_fsblock_calc_crc(bp);
}
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
@@ -515,9 +516,16 @@ xfs_bmbt_keys_contiguous(
be64_to_cpu(key2->bmbt.br_startoff));
}
-static const struct xfs_btree_ops xfs_bmbt_ops = {
+const struct xfs_btree_ops xfs_bmbt_ops = {
+ .name = "bmap",
+ .type = XFS_BTREE_TYPE_INODE,
+
.rec_len = sizeof(xfs_bmbt_rec_t),
.key_len = sizeof(xfs_bmbt_key_t),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = XFS_BMAP_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2),
.dup_cursor = xfs_bmbt_dup_cursor,
.update_cursor = xfs_bmbt_update_cursor,
@@ -529,7 +537,6 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.init_key_from_rec = xfs_bmbt_init_key_from_rec,
.init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
- .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
.key_diff = xfs_bmbt_key_diff,
.diff_two_keys = xfs_bmbt_diff_two_keys,
.buf_ops = &xfs_bmbt_buf_ops,
@@ -538,35 +545,10 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.keys_contiguous = xfs_bmbt_keys_contiguous,
};
-static struct xfs_btree_cur *
-xfs_bmbt_init_common(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- struct xfs_inode *ip,
- int whichfork)
-{
- struct xfs_btree_cur *cur;
-
- ASSERT(whichfork != XFS_COW_FORK);
-
- cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP,
- mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache);
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
-
- cur->bc_ops = &xfs_bmbt_ops;
- cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
- if (xfs_has_crc(mp))
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
- cur->bc_ino.ip = ip;
- cur->bc_ino.allocated = 0;
- cur->bc_ino.flags = 0;
-
- return cur;
-}
-
/*
- * Allocate a new bmap btree cursor.
+ * Create a new bmap btree cursor.
+ *
+ * For staging cursors, -1 is passed in whichfork.
*/
struct xfs_btree_cur *
xfs_bmbt_init_cursor(
@@ -575,15 +557,34 @@ xfs_bmbt_init_cursor(
struct xfs_inode *ip,
int whichfork)
{
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur;
+ unsigned int maxlevels;
- cur = xfs_bmbt_init_common(mp, tp, ip, whichfork);
+ ASSERT(whichfork != XFS_COW_FORK);
- cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
+ /*
+ * The data fork always has the larger maxlevels, so use that for staging
+ * cursors.
+ */
+ switch (whichfork) {
+ case XFS_STAGING_FORK:
+ maxlevels = mp->m_bm_maxlevels[XFS_DATA_FORK];
+ break;
+ default:
+ maxlevels = mp->m_bm_maxlevels[whichfork];
+ break;
+ }
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bmbt_ops, maxlevels,
+ xfs_bmbt_cur_cache);
+ cur->bc_ino.ip = ip;
cur->bc_ino.whichfork = whichfork;
+ cur->bc_bmap.allocated = 0;
+ if (whichfork != XFS_STAGING_FORK) {
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+ cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
+ }
return cur;
}
@@ -599,33 +600,6 @@ xfs_bmbt_block_maxrecs(
}
/*
- * Allocate a new bmap btree cursor for reloading an inode block mapping data
- * structure. Note that callers can use the staged cursor to reload extents
- * format inode forks if they rebuild the iext tree and commit the staged
- * cursor immediately.
- */
-struct xfs_btree_cur *
-xfs_bmbt_stage_cursor(
- struct xfs_mount *mp,
- struct xfs_inode *ip,
- struct xbtree_ifakeroot *ifake)
-{
- struct xfs_btree_cur *cur;
- struct xfs_btree_ops *ops;
-
- /* data fork always has larger maxheight */
- cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK);
- cur->bc_nlevels = ifake->if_levels;
- cur->bc_ino.forksize = ifake->if_fork_size;
-
- /* Don't let anyone think we're attached to the real fork yet. */
- cur->bc_ino.whichfork = -1;
- xfs_btree_stage_ifakeroot(cur, ifake, &ops);
- ops->update_cursor = NULL;
- return cur;
-}
-
-/*
* Swap in the new inode fork root. Once we pass this point the newly rebuilt
* mappings are in place and we have to kill off any old btree blocks.
*/
@@ -665,7 +639,7 @@ xfs_bmbt_commit_staged_btree(
break;
}
xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
- xfs_btree_commit_ifakeroot(cur, tp, whichfork, &xfs_bmbt_ops);
+ xfs_btree_commit_ifakeroot(cur, tp, whichfork);
}
/*
@@ -751,7 +725,7 @@ xfs_bmbt_change_owner(
ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
- cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER;
+ cur->bc_flags |= XFS_BTREE_BMBT_INVALID_OWNER;
error = xfs_btree_change_owner(cur, new_owner, buffer_list);
xfs_btree_del_cursor(cur, error);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 151b8491f60e..de1b73f1225c 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -107,8 +107,6 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
-struct xfs_btree_cur *xfs_bmbt_stage_cursor(struct xfs_mount *mp,
- struct xfs_inode *ip, struct xbtree_ifakeroot *ifake);
void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, int whichfork);
@@ -120,4 +118,7 @@ unsigned int xfs_bmbt_maxlevels_ondisk(void);
int __init xfs_bmbt_init_cur_cache(void);
void xfs_bmbt_destroy_cur_cache(void);
+void xfs_bmbt_init_block(struct xfs_inode *ip, struct xfs_btree_block *buf,
+ struct xfs_buf *bp, __u16 level, __u16 numrecs);
+
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index ea8d3659df20..d29547572a68 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -27,28 +27,24 @@
#include "xfs_bmap_btree.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
+#include "xfs_health.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
/*
* Btree magic numbers.
*/
-static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
- { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
- XFS_FIBT_MAGIC, 0 },
- { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
- XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
- XFS_REFC_CRC_MAGIC }
-};
-
uint32_t
xfs_btree_magic(
- int crc,
- xfs_btnum_t btnum)
+ struct xfs_mount *mp,
+ const struct xfs_btree_ops *ops)
{
- uint32_t magic = xfs_magics[crc][btnum];
+ int idx = xfs_has_crc(mp) ? 1 : 0;
+ __be32 magic = ops->buf_ops->magic[idx];
/* Ensure we asked for crc for crc-only magics. */
ASSERT(magic != 0);
- return magic;
+ return be32_to_cpu(magic);
}
/*
@@ -63,10 +59,8 @@ xfs_btree_magic(
* bytes.
*/
static inline xfs_failaddr_t
-xfs_btree_check_lblock_siblings(
+xfs_btree_check_fsblock_siblings(
struct xfs_mount *mp,
- struct xfs_btree_cur *cur,
- int level,
xfs_fsblock_t fsb,
__be64 dsibling)
{
@@ -78,22 +72,33 @@ xfs_btree_check_lblock_siblings(
sibling = be64_to_cpu(dsibling);
if (sibling == fsb)
return __this_address;
- if (level >= 0) {
- if (!xfs_btree_check_lptr(cur, sibling, level + 1))
- return __this_address;
- } else {
- if (!xfs_verify_fsbno(mp, sibling))
- return __this_address;
- }
+ if (!xfs_verify_fsbno(mp, sibling))
+ return __this_address;
+ return NULL;
+}
+static inline xfs_failaddr_t
+xfs_btree_check_memblock_siblings(
+ struct xfs_buftarg *btp,
+ xfbno_t bno,
+ __be64 dsibling)
+{
+ xfbno_t sibling;
+
+ if (dsibling == cpu_to_be64(NULLFSBLOCK))
+ return NULL;
+
+ sibling = be64_to_cpu(dsibling);
+ if (sibling == bno)
+ return __this_address;
+ if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling)))
+ return __this_address;
return NULL;
}
static inline xfs_failaddr_t
-xfs_btree_check_sblock_siblings(
+xfs_btree_check_agblock_siblings(
struct xfs_perag *pag,
- struct xfs_btree_cur *cur,
- int level,
xfs_agblock_t agbno,
__be32 dsibling)
{
@@ -105,34 +110,21 @@ xfs_btree_check_sblock_siblings(
sibling = be32_to_cpu(dsibling);
if (sibling == agbno)
return __this_address;
- if (level >= 0) {
- if (!xfs_btree_check_sptr(cur, sibling, level + 1))
- return __this_address;
- } else {
- if (!xfs_verify_agbno(pag, sibling))
- return __this_address;
- }
+ if (!xfs_verify_agbno(pag, sibling))
+ return __this_address;
return NULL;
}
-/*
- * Check a long btree block header. Return the address of the failing check,
- * or NULL if everything is ok.
- */
-xfs_failaddr_t
-__xfs_btree_check_lblock(
+static xfs_failaddr_t
+__xfs_btree_check_lblock_hdr(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
int level,
struct xfs_buf *bp)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_btnum_t btnum = cur->bc_btnum;
- int crc = xfs_has_crc(mp);
- xfs_failaddr_t fa;
- xfs_fsblock_t fsb = NULLFSBLOCK;
- if (crc) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (block->bb_u.l.bb_blkno !=
@@ -142,7 +134,7 @@ __xfs_btree_check_lblock(
return __this_address;
}
- if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+ if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
return __this_address;
if (be16_to_cpu(block->bb_level) != level)
return __this_address;
@@ -150,44 +142,83 @@ __xfs_btree_check_lblock(
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (bp)
- fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ return NULL;
+}
+
+/*
+ * Check a long btree block header. Return the address of the failing check,
+ * or NULL if everything is ok.
+ */
+static xfs_failaddr_t
+__xfs_btree_check_fsblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_failaddr_t fa;
+ xfs_fsblock_t fsb;
+
+ fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
+ if (fa)
+ return fa;
- fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ /*
+ * For inode-rooted btrees, the root block sits in the inode fork. In
+ * that case bp is NULL, and the block must not have any siblings.
+ */
+ if (!bp) {
+ if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK))
+ return __this_address;
+ if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK))
+ return __this_address;
+ return NULL;
+ }
+
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_fsblock_siblings(mp, fsb,
block->bb_u.l.bb_leftsib);
if (!fa)
- fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ fa = xfs_btree_check_fsblock_siblings(mp, fsb,
block->bb_u.l.bb_rightsib);
return fa;
}
-/* Check a long btree block header. */
-static int
-xfs_btree_check_lblock(
+/*
+ * Check an in-memory btree block header. Return the address of the failing
+ * check, or NULL if everything is ok.
+ */
+static xfs_failaddr_t
+__xfs_btree_check_memblock(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
int level,
struct xfs_buf *bp)
{
- struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target;
xfs_failaddr_t fa;
+ xfbno_t bno;
- fa = __xfs_btree_check_lblock(cur, block, level, bp);
- if (XFS_IS_CORRUPT(mp, fa != NULL) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
- if (bp)
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- return -EFSCORRUPTED;
- }
- return 0;
+ fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
+ if (fa)
+ return fa;
+
+ bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
+ fa = xfs_btree_check_memblock_siblings(btp, bno,
+ block->bb_u.l.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_memblock_siblings(btp, bno,
+ block->bb_u.l.bb_rightsib);
+ return fa;
}
/*
* Check a short btree block header. Return the address of the failing check,
* or NULL if everything is ok.
*/
-xfs_failaddr_t
-__xfs_btree_check_sblock(
+static xfs_failaddr_t
+__xfs_btree_check_agblock(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
int level,
@@ -195,20 +226,17 @@ __xfs_btree_check_sblock(
{
struct xfs_mount *mp = cur->bc_mp;
struct xfs_perag *pag = cur->bc_ag.pag;
- xfs_btnum_t btnum = cur->bc_btnum;
- int crc = xfs_has_crc(mp);
xfs_failaddr_t fa;
- xfs_agblock_t agbno = NULLAGBLOCK;
+ xfs_agblock_t agbno;
- if (crc) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (block->bb_u.s.bb_blkno !=
- cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
return __this_address;
}
- if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+ if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
return __this_address;
if (be16_to_cpu(block->bb_level) != level)
return __this_address;
@@ -216,36 +244,45 @@ __xfs_btree_check_sblock(
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (bp)
- agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
-
- fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_agblock_siblings(pag, agbno,
block->bb_u.s.bb_leftsib);
if (!fa)
- fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ fa = xfs_btree_check_agblock_siblings(pag, agbno,
block->bb_u.s.bb_rightsib);
return fa;
}
-/* Check a short btree block header. */
-STATIC int
-xfs_btree_check_sblock(
+/*
+ * Internal btree block check.
+ *
+ * Return NULL if the block is ok or the address of the failed check otherwise.
+ */
+xfs_failaddr_t
+__xfs_btree_check_block(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
int level,
struct xfs_buf *bp)
{
- struct xfs_mount *mp = cur->bc_mp;
- xfs_failaddr_t fa;
-
- fa = __xfs_btree_check_sblock(cur, block, level, bp);
- if (XFS_IS_CORRUPT(mp, fa != NULL) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
- if (bp)
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- return -EFSCORRUPTED;
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ return __xfs_btree_check_memblock(cur, block, level, bp);
+ case XFS_BTREE_TYPE_AG:
+ return __xfs_btree_check_agblock(cur, block, level, bp);
+ case XFS_BTREE_TYPE_INODE:
+ return __xfs_btree_check_fsblock(cur, block, level, bp);
+ default:
+ ASSERT(0);
+ return __this_address;
}
- return 0;
+}
+
+static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur)
+{
+ if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN)
+ return XFS_ERRTAG_BTREE_CHECK_SBLOCK;
+ return XFS_ERRTAG_BTREE_CHECK_LBLOCK;
}
/*
@@ -258,34 +295,49 @@ xfs_btree_check_block(
int level, /* level of the btree block */
struct xfs_buf *bp) /* buffer containing block, if any */
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- return xfs_btree_check_lblock(cur, block, level, bp);
- else
- return xfs_btree_check_sblock(cur, block, level, bp);
-}
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_failaddr_t fa;
-/* Check that this long pointer is valid and points within the fs. */
-bool
-xfs_btree_check_lptr(
- struct xfs_btree_cur *cur,
- xfs_fsblock_t fsbno,
- int level)
-{
- if (level <= 0)
- return false;
- return xfs_verify_fsbno(cur->bc_mp, fsbno);
+ fa = __xfs_btree_check_block(cur, block, level, bp);
+ if (XFS_IS_CORRUPT(mp, fa != NULL) ||
+ XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
+ if (bp)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
+ }
+ return 0;
}
-/* Check that this short pointer is valid and points within the AG. */
-bool
-xfs_btree_check_sptr(
- struct xfs_btree_cur *cur,
- xfs_agblock_t agbno,
- int level)
+int
+__xfs_btree_check_ptr(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int index,
+ int level)
{
if (level <= 0)
- return false;
- return xfs_verify_agbno(cur->bc_ag.pag, agbno);
+ return -EFSCORRUPTED;
+
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ if (!xfbtree_verify_bno(cur->bc_mem.xfbtree,
+ be64_to_cpu((&ptr->l)[index])))
+ return -EFSCORRUPTED;
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ if (!xfs_verify_fsbno(cur->bc_mp,
+ be64_to_cpu((&ptr->l)[index])))
+ return -EFSCORRUPTED;
+ break;
+ case XFS_BTREE_TYPE_AG:
+ if (!xfs_verify_agbno(cur->bc_ag.pag,
+ be32_to_cpu((&ptr->s)[index])))
+ return -EFSCORRUPTED;
+ break;
+ }
+
+ return 0;
}
/*
@@ -299,26 +351,35 @@ xfs_btree_check_ptr(
int index,
int level)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
- level))
- return 0;
- xfs_err(cur->bc_mp,
-"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
+ int error;
+
+ error = __xfs_btree_check_ptr(cur, ptr, index, level);
+ if (error) {
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ xfs_err(cur->bc_mp,
+"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.",
+ cur->bc_ops->name, cur->bc_flags, level, index,
+ __this_address);
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ xfs_err(cur->bc_mp,
+"Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.",
cur->bc_ino.ip->i_ino,
- cur->bc_ino.whichfork, cur->bc_btnum,
+ cur->bc_ino.whichfork, cur->bc_ops->name,
level, index);
- } else {
- if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
- level))
- return 0;
- xfs_err(cur->bc_mp,
-"AG %u: Corrupt btree %d pointer at level %d index %d.",
- cur->bc_ag.pag->pag_agno, cur->bc_btnum,
+ break;
+ case XFS_BTREE_TYPE_AG:
+ xfs_err(cur->bc_mp,
+"AG %u: Corrupt %sbt pointer at level %d index %d.",
+ cur->bc_ag.pag->pag_agno, cur->bc_ops->name,
level, index);
+ break;
+ }
+ xfs_btree_mark_sick(cur);
}
- return -EFSCORRUPTED;
+ return error;
}
#ifdef DEBUG
@@ -336,7 +397,7 @@ xfs_btree_check_ptr(
* it to disk.
*/
void
-xfs_btree_lblock_calc_crc(
+xfs_btree_fsblock_calc_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
@@ -350,7 +411,7 @@ xfs_btree_lblock_calc_crc(
}
bool
-xfs_btree_lblock_verify_crc(
+xfs_btree_fsblock_verify_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
@@ -374,7 +435,7 @@ xfs_btree_lblock_verify_crc(
* it to disk.
*/
void
-xfs_btree_sblock_calc_crc(
+xfs_btree_agblock_calc_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
@@ -388,7 +449,7 @@ xfs_btree_sblock_calc_crc(
}
bool
-xfs_btree_sblock_verify_crc(
+xfs_btree_agblock_verify_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
@@ -410,6 +471,17 @@ xfs_btree_free_block(
{
int error;
+ trace_xfs_btree_free_block(cur, bp);
+
+ /*
+ * Don't allow block freeing for a staging cursor, because staging
+ * cursors do not support regular btree modifications.
+ */
+ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
error = cur->bc_ops->free_block(cur, bp);
if (!error) {
xfs_trans_binval(cur->bc_tp, bp);
@@ -448,33 +520,70 @@ xfs_btree_del_cursor(
* zero, then we should be shut down or on our way to shutdown due to
* cancelling a dirty transaction on error.
*/
- ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
+ ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 ||
xfs_is_shutdown(cur->bc_mp) || error != 0);
- if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
- kmem_free(cur->bc_ops);
- if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
- xfs_perag_put(cur->bc_ag.pag);
+
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_AG:
+ if (cur->bc_ag.pag)
+ xfs_perag_put(cur->bc_ag.pag);
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ /* nothing to do */
+ break;
+ case XFS_BTREE_TYPE_MEM:
+ if (cur->bc_mem.pag)
+ xfs_perag_put(cur->bc_mem.pag);
+ break;
+ }
+
kmem_cache_free(cur->bc_cache, cur);
}
+/* Return the buffer target for this btree's buffer. */
+static inline struct xfs_buftarg *
+xfs_btree_buftarg(
+ struct xfs_btree_cur *cur)
+{
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
+ return cur->bc_mem.xfbtree->target;
+ return cur->bc_mp->m_ddev_targp;
+}
+
+/* Return the block size (in units of 512b sectors) for this btree. */
+static inline unsigned int
+xfs_btree_bbsize(
+ struct xfs_btree_cur *cur)
+{
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
+ return XFBNO_BBSIZE;
+ return cur->bc_mp->m_bsize;
+}
+
/*
* Duplicate the btree cursor.
* Allocate a new one, copy the record, re-get the buffers.
*/
-int /* error */
+int /* error */
xfs_btree_dup_cursor(
- struct xfs_btree_cur *cur, /* input cursor */
- struct xfs_btree_cur **ncur) /* output cursor */
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur) /* output cursor */
{
- struct xfs_buf *bp; /* btree block's buffer pointer */
- int error; /* error return value */
- int i; /* level number of btree block */
- xfs_mount_t *mp; /* mount structure for filesystem */
- struct xfs_btree_cur *new; /* new cursor value */
- xfs_trans_t *tp; /* transaction pointer, can be NULL */
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_trans *tp = cur->bc_tp;
+ struct xfs_buf *bp;
+ struct xfs_btree_cur *new;
+ int error;
+ int i;
- tp = cur->bc_tp;
- mp = cur->bc_mp;
+ /*
+ * Don't allow staging cursors to be duplicated because they're supposed
+ * to be kept private to a single thread.
+ */
+ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
/*
* Allocate a new cursor like the old one.
@@ -494,10 +603,13 @@ xfs_btree_dup_cursor(
new->bc_levels[i].ra = cur->bc_levels[i].ra;
bp = cur->bc_levels[i].bp;
if (bp) {
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- xfs_buf_daddr(bp), mp->m_bsize,
- 0, &bp,
- cur->bc_ops->buf_ops);
+ error = xfs_trans_read_buf(mp, tp,
+ xfs_btree_buftarg(cur),
+ xfs_buf_daddr(bp),
+ xfs_btree_bbsize(cur), 0, &bp,
+ cur->bc_ops->buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(new);
if (error) {
xfs_btree_del_cursor(new, error);
*ncur = NULL;
@@ -539,7 +651,7 @@ xfs_btree_dup_cursor(
* record, key or pointer (xfs_btree_*_addr). Note that all addressing
* inside the btree block is done using indices starting at one, not zero!
*
- * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
+ * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing
* overlapping intervals. In such a tree, records are still sorted lowest to
* highest and indexed by the smallest key value that refers to the record.
* However, nodes are different: each pointer has two associated keys -- one
@@ -589,26 +701,17 @@ xfs_btree_dup_cursor(
*/
static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
+ if (xfs_has_crc(cur->bc_mp))
return XFS_BTREE_LBLOCK_CRC_LEN;
return XFS_BTREE_LBLOCK_LEN;
}
- if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ if (xfs_has_crc(cur->bc_mp))
return XFS_BTREE_SBLOCK_CRC_LEN;
return XFS_BTREE_SBLOCK_LEN;
}
/*
- * Return size of btree block pointers for this btree instance.
- */
-static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
-{
- return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
- sizeof(__be64) : sizeof(__be32);
-}
-
-/*
* Calculate offset of the n-th record in a btree block.
*/
STATIC size_t
@@ -655,7 +758,7 @@ xfs_btree_ptr_offset(
{
return xfs_btree_block_len(cur) +
cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
- (n - 1) * xfs_btree_ptr_len(cur);
+ (n - 1) * cur->bc_ops->ptr_len;
}
/*
@@ -718,7 +821,7 @@ struct xfs_ifork *
xfs_btree_ifork_ptr(
struct xfs_btree_cur *cur)
{
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
if (cur->bc_flags & XFS_BTREE_STAGING)
return cur->bc_ino.ifake->if_fork;
@@ -750,8 +853,7 @@ xfs_btree_get_block(
int level, /* level in btree */
struct xfs_buf **bpp) /* buffer containing the block */
{
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (level == cur->bc_nlevels - 1)) {
+ if (xfs_btree_at_iroot(cur, level)) {
*bpp = NULL;
return xfs_btree_get_iroot(cur);
}
@@ -856,95 +958,52 @@ xfs_btree_offsets(
}
}
-/*
- * Get a buffer for the block, return it read in.
- * Long-form addressing.
- */
-int
-xfs_btree_read_bufl(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t fsbno, /* file system block number */
- struct xfs_buf **bpp, /* buffer for fsbno */
- int refval, /* ref count value for buffer */
- const struct xfs_buf_ops *ops)
-{
- struct xfs_buf *bp; /* return value */
- xfs_daddr_t d; /* real disk block address */
- int error;
-
- if (!xfs_verify_fsbno(mp, fsbno))
- return -EFSCORRUPTED;
- d = XFS_FSB_TO_DADDR(mp, fsbno);
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
- mp->m_bsize, 0, &bp, ops);
- if (error)
- return error;
- if (bp)
- xfs_buf_set_ref(bp, refval);
- *bpp = bp;
- return 0;
-}
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Long-form addressing.
- */
-/* ARGSUSED */
-void
-xfs_btree_reada_bufl(
- struct xfs_mount *mp, /* file system mount point */
- xfs_fsblock_t fsbno, /* file system block number */
- xfs_extlen_t count, /* count of filesystem blocks */
- const struct xfs_buf_ops *ops)
+STATIC int
+xfs_btree_readahead_fsblock(
+ struct xfs_btree_cur *cur,
+ int lr,
+ struct xfs_btree_block *block)
{
- xfs_daddr_t d;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+ xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ int rval = 0;
- ASSERT(fsbno != NULLFSBLOCK);
- d = XFS_FSB_TO_DADDR(mp, fsbno);
- xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
-}
+ if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
+ xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left),
+ mp->m_bsize, cur->bc_ops->buf_ops);
+ rval++;
+ }
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Short-form addressing.
- */
-/* ARGSUSED */
-void
-xfs_btree_reada_bufs(
- struct xfs_mount *mp, /* file system mount point */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- xfs_extlen_t count, /* count of filesystem blocks */
- const struct xfs_buf_ops *ops)
-{
- xfs_daddr_t d;
+ if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
+ xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right),
+ mp->m_bsize, cur->bc_ops->buf_ops);
+ rval++;
+ }
- ASSERT(agno != NULLAGNUMBER);
- ASSERT(agbno != NULLAGBLOCK);
- d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+ return rval;
}
STATIC int
-xfs_btree_readahead_lblock(
+xfs_btree_readahead_memblock(
struct xfs_btree_cur *cur,
int lr,
struct xfs_btree_block *block)
{
+ struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target;
+ xfbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+ xfbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
int rval = 0;
- xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
- xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
- xfs_btree_reada_bufl(cur->bc_mp, left, 1,
- cur->bc_ops->buf_ops);
+ xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE,
+ cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
- xfs_btree_reada_bufl(cur->bc_mp, right, 1,
- cur->bc_ops->buf_ops);
+ xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE,
+ cur->bc_ops->buf_ops);
rval++;
}
@@ -952,25 +1011,28 @@ xfs_btree_readahead_lblock(
}
STATIC int
-xfs_btree_readahead_sblock(
+xfs_btree_readahead_agblock(
struct xfs_btree_cur *cur,
int lr,
- struct xfs_btree_block *block)
+ struct xfs_btree_block *block)
{
- int rval = 0;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
-
+ int rval = 0;
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- left, 1, cur->bc_ops->buf_ops);
+ xfs_buf_readahead(mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, left),
+ mp->m_bsize, cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- right, 1, cur->bc_ops->buf_ops);
+ xfs_buf_readahead(mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, right),
+ mp->m_bsize, cur->bc_ops->buf_ops);
rval++;
}
@@ -993,8 +1055,7 @@ xfs_btree_readahead(
* No readahead needed if we are at the root level and the
* btree root is stored in the inode.
*/
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (lev == cur->bc_nlevels - 1))
+ if (xfs_btree_at_iroot(cur, lev))
return 0;
if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
@@ -1003,9 +1064,17 @@ xfs_btree_readahead(
cur->bc_levels[lev].ra |= lr;
block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- return xfs_btree_readahead_lblock(cur, lr, block);
- return xfs_btree_readahead_sblock(cur, lr, block);
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_AG:
+ return xfs_btree_readahead_agblock(cur, lr, block);
+ case XFS_BTREE_TYPE_INODE:
+ return xfs_btree_readahead_fsblock(cur, lr, block);
+ case XFS_BTREE_TYPE_MEM:
+ return xfs_btree_readahead_memblock(cur, lr, block);
+ default:
+ ASSERT(0);
+ return 0;
+ }
}
STATIC int
@@ -1014,23 +1083,24 @@ xfs_btree_ptr_to_daddr(
const union xfs_btree_ptr *ptr,
xfs_daddr_t *daddr)
{
- xfs_fsblock_t fsbno;
- xfs_agblock_t agbno;
int error;
error = xfs_btree_check_ptr(cur, ptr, 0, 1);
if (error)
return error;
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- fsbno = be64_to_cpu(ptr->l);
- *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
- } else {
- agbno = be32_to_cpu(ptr->s);
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_AG:
*daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- agbno);
+ be32_to_cpu(ptr->s));
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+ break;
+ case XFS_BTREE_TYPE_MEM:
+ *daddr = xfbno_to_daddr(be64_to_cpu(ptr->l));
+ break;
}
-
return 0;
}
@@ -1050,8 +1120,9 @@ xfs_btree_readahead_ptr(
if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
return;
- xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
- cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+ xfs_buf_readahead(xfs_btree_buftarg(cur), daddr,
+ xfs_btree_bbsize(cur) * count,
+ cur->bc_ops->buf_ops);
}
/*
@@ -1072,7 +1143,7 @@ xfs_btree_setbuf(
cur->bc_levels[lev].ra = 0;
b = XFS_BUF_TO_BLOCK(bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
@@ -1090,7 +1161,7 @@ xfs_btree_ptr_is_null(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
return ptr->l == cpu_to_be64(NULLFSBLOCK);
else
return ptr->s == cpu_to_be32(NULLAGBLOCK);
@@ -1101,12 +1172,23 @@ xfs_btree_set_ptr_null(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
ptr->l = cpu_to_be64(NULLFSBLOCK);
else
ptr->s = cpu_to_be32(NULLAGBLOCK);
}
+static inline bool
+xfs_btree_ptrs_equal(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr1,
+ union xfs_btree_ptr *ptr2)
+{
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
+ return ptr1->l == ptr2->l;
+ return ptr1->s == ptr2->s;
+}
+
/*
* Get/set/init sibling pointers
*/
@@ -1119,7 +1201,7 @@ xfs_btree_get_sibling(
{
ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
if (lr == XFS_BB_RIGHTSIB)
ptr->l = block->bb_u.l.bb_rightsib;
else
@@ -1141,7 +1223,7 @@ xfs_btree_set_sibling(
{
ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
if (lr == XFS_BB_RIGHTSIB)
block->bb_u.l.bb_rightsib = ptr->l;
else
@@ -1154,25 +1236,24 @@ xfs_btree_set_sibling(
}
}
-void
-xfs_btree_init_block_int(
+static void
+__xfs_btree_init_block(
struct xfs_mount *mp,
struct xfs_btree_block *buf,
+ const struct xfs_btree_ops *ops,
xfs_daddr_t blkno,
- xfs_btnum_t btnum,
__u16 level,
__u16 numrecs,
- __u64 owner,
- unsigned int flags)
+ __u64 owner)
{
- int crc = xfs_has_crc(mp);
- __u32 magic = xfs_btree_magic(crc, btnum);
+ bool crc = xfs_has_crc(mp);
+ __u32 magic = xfs_btree_magic(mp, ops);
buf->bb_magic = cpu_to_be32(magic);
buf->bb_level = cpu_to_be16(level);
buf->bb_numrecs = cpu_to_be16(numrecs);
- if (flags & XFS_BTREE_LONG_PTRS) {
+ if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
if (crc) {
@@ -1183,14 +1264,12 @@ xfs_btree_init_block_int(
buf->bb_u.l.bb_lsn = 0;
}
} else {
- /* owner is a 32 bit value on short blocks */
- __u32 __owner = (__u32)owner;
-
buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
if (crc) {
buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
- buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
+ /* owner is a 32 bit value on short blocks */
+ buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner);
uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
buf->bb_u.s.bb_lsn = 0;
}
@@ -1199,15 +1278,46 @@ xfs_btree_init_block_int(
void
xfs_btree_init_block(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- xfs_btnum_t btnum,
- __u16 level,
- __u16 numrecs,
- __u64 owner)
+ struct xfs_mount *mp,
+ struct xfs_btree_block *block,
+ const struct xfs_btree_ops *ops,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner)
+{
+ __xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level,
+ numrecs, owner);
+}
+
+void
+xfs_btree_init_buf(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ const struct xfs_btree_ops *ops,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner)
+{
+ __xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops,
+ xfs_buf_daddr(bp), level, numrecs, owner);
+ bp->b_ops = ops->buf_ops;
+}
+
+static inline __u64
+xfs_btree_owner(
+ struct xfs_btree_cur *cur)
{
- xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp),
- btnum, level, numrecs, owner, 0);
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ return cur->bc_mem.xfbtree->owner;
+ case XFS_BTREE_TYPE_INODE:
+ return cur->bc_ino.ip->i_ino;
+ case XFS_BTREE_TYPE_AG:
+ return cur->bc_ag.pag->pag_agno;
+ default:
+ ASSERT(0);
+ return 0;
+ }
}
void
@@ -1217,22 +1327,8 @@ xfs_btree_init_block_cur(
int level,
int numrecs)
{
- __u64 owner;
-
- /*
- * we can pull the owner from the cursor right now as the different
- * owners align directly with the pointer size of the btree. This may
- * change in future, but is safe for current users of the generic btree
- * code.
- */
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- owner = cur->bc_ino.ip->i_ino;
- else
- owner = cur->bc_ag.pag->pag_agno;
-
- xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp),
- xfs_buf_daddr(bp), cur->bc_btnum, level,
- numrecs, owner, cur->bc_flags);
+ xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs,
+ xfs_btree_owner(cur));
}
/*
@@ -1250,7 +1346,7 @@ xfs_btree_is_lastrec(
if (level > 0)
return 0;
- if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_LASTREC_UPDATE))
return 0;
xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
@@ -1265,41 +1361,27 @@ xfs_btree_buf_to_ptr(
struct xfs_buf *bp,
union xfs_btree_ptr *ptr)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
- xfs_buf_daddr(bp)));
- else {
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_AG:
ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
xfs_buf_daddr(bp)));
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(bp)));
+ break;
+ case XFS_BTREE_TYPE_MEM:
+ ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp)));
+ break;
}
}
-STATIC void
+static inline void
xfs_btree_set_refs(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- switch (cur->bc_btnum) {
- case XFS_BTNUM_BNO:
- case XFS_BTNUM_CNT:
- xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
- break;
- case XFS_BTNUM_INO:
- case XFS_BTNUM_FINO:
- xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
- break;
- case XFS_BTNUM_BMAP:
- xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
- break;
- case XFS_BTNUM_RMAP:
- xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
- break;
- case XFS_BTNUM_REFC:
- xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
- break;
- default:
- ASSERT(0);
- }
+ xfs_buf_set_ref(bp, cur->bc_ops->lru_refs);
}
int
@@ -1309,15 +1391,14 @@ xfs_btree_get_buf_block(
struct xfs_btree_block **block,
struct xfs_buf **bpp)
{
- struct xfs_mount *mp = cur->bc_mp;
- xfs_daddr_t d;
- int error;
+ xfs_daddr_t d;
+ int error;
error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
if (error)
return error;
- error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
- 0, bpp);
+ error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d,
+ xfs_btree_bbsize(cur), 0, bpp);
if (error)
return error;
@@ -1348,9 +1429,11 @@ xfs_btree_read_buf_block(
error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
if (error)
return error;
- error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
- mp->m_bsize, flags, bpp,
- cur->bc_ops->buf_ops);
+ error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d,
+ xfs_btree_bbsize(cur), flags, bpp,
+ cur->bc_ops->buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(cur);
if (error)
return error;
@@ -1398,7 +1481,7 @@ xfs_btree_copy_ptrs(
int numptrs)
{
ASSERT(numptrs >= 0);
- memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
+ memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len);
}
/*
@@ -1454,8 +1537,8 @@ xfs_btree_shift_ptrs(
ASSERT(numptrs >= 0);
ASSERT(dir == 1 || dir == -1);
- dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
- memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
+ dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len);
+ memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len);
}
/*
@@ -1566,7 +1649,7 @@ xfs_btree_log_block(
if (bp) {
int nbits;
- if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ if (xfs_has_crc(cur->bc_mp)) {
/*
* We don't log the CRC when updating a btree
* block but instead recreate it during log
@@ -1581,7 +1664,7 @@ xfs_btree_log_block(
nbits = XFS_BB_NUM_BITS;
}
xfs_btree_offsets(fields,
- (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+ (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ?
loffsets : soffsets,
nbits, &first, &last);
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
@@ -1658,9 +1741,10 @@ xfs_btree_increment(
* confused or have the tree root in an inode.
*/
if (lev == cur->bc_nlevels) {
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
goto out0;
ASSERT(0);
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1751,9 +1835,10 @@ xfs_btree_decrement(
* or the root of the tree is in an inode.
*/
if (lev == cur->bc_nlevels) {
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
goto out0;
ASSERT(0);
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1786,6 +1871,33 @@ error0:
return error;
}
+/*
+ * Check the btree block owner now that we have the context to know who the
+ * real owner is.
+ */
+static inline xfs_failaddr_t
+xfs_btree_check_block_owner(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block)
+{
+ __u64 owner;
+
+ if (!xfs_has_crc(cur->bc_mp) ||
+ (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER))
+ return NULL;
+
+ owner = xfs_btree_owner(cur);
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
+ if (be64_to_cpu(block->bb_u.l.bb_owner) != owner)
+ return __this_address;
+ } else {
+ if (be32_to_cpu(block->bb_u.s.bb_owner) != owner)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
int
xfs_btree_lookup_get_block(
struct xfs_btree_cur *cur, /* btree cursor */
@@ -1798,8 +1910,7 @@ xfs_btree_lookup_get_block(
int error = 0;
/* special case the root block if in an inode */
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (level == cur->bc_nlevels - 1)) {
+ if (xfs_btree_at_iroot(cur, level)) {
*blkp = xfs_btree_get_iroot(cur);
return 0;
}
@@ -1824,11 +1935,7 @@ xfs_btree_lookup_get_block(
return error;
/* Check the inode owner since the verifiers don't. */
- if (xfs_has_crc(cur->bc_mp) &&
- !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
- (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
- be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
- cur->bc_ino.ip->i_ino)
+ if (xfs_btree_check_block_owner(cur, *blkp) != NULL)
goto out_bad;
/* Did we get the level we were looking for? */
@@ -1846,6 +1953,7 @@ out_bad:
*blkp = NULL;
xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(cur->bc_tp, bp);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -1872,6 +1980,27 @@ xfs_lookup_get_search_key(
}
/*
+ * Initialize a pointer to the root block.
+ */
+void
+xfs_btree_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
+ /*
+ * Inode-rooted btrees call xfs_btree_get_iroot to find the root
+ * in xfs_btree_lookup_get_block and don't need a pointer here.
+ */
+ ptr->l = 0;
+ } else if (cur->bc_flags & XFS_BTREE_STAGING) {
+ ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root);
+ } else {
+ cur->bc_ops->init_ptr_from_cur(cur, ptr);
+ }
+}
+
+/*
* Lookup the record. The cursor is made to point to it, based on dir.
* stat is set to 0 if can't find any such record, 1 for success.
*/
@@ -1892,14 +2021,16 @@ xfs_btree_lookup(
XFS_BTREE_STATS_INC(cur, lookup);
/* No such thing as a zero-level tree. */
- if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0))
+ if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
block = NULL;
keyno = 0;
/* initialise start pointer from cursor */
- cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ xfs_btree_init_ptr_from_cur(cur, &ptr);
pp = &ptr;
/*
@@ -1936,6 +2067,7 @@ xfs_btree_lookup(
XFS_ERRLEVEL_LOW,
cur->bc_mp, block,
sizeof(*block));
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -2012,8 +2144,10 @@ xfs_btree_lookup(
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto error0;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
*stat = 1;
return 0;
}
@@ -2040,7 +2174,7 @@ xfs_btree_high_key_from_key(
struct xfs_btree_cur *cur,
union xfs_btree_key *key)
{
- ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+ ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
return (union xfs_btree_key *)((char *)key +
(cur->bc_ops->key_len / 2));
}
@@ -2061,7 +2195,7 @@ xfs_btree_get_leaf_keys(
rec = xfs_btree_rec_addr(cur, 1, block);
cur->bc_ops->init_key_from_rec(key, rec);
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
@@ -2088,7 +2222,7 @@ xfs_btree_get_node_keys(
union xfs_btree_key *high;
int n;
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
memcpy(key, xfs_btree_key_addr(cur, 1, block),
cur->bc_ops->key_len / 2);
@@ -2132,7 +2266,7 @@ xfs_btree_needs_key_update(
struct xfs_btree_cur *cur,
int ptr)
{
- return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
+ return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1;
}
/*
@@ -2156,7 +2290,7 @@ __xfs_btree_updkeys(
struct xfs_buf *bp;
int ptr;
- ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+ ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
/* Exit if there aren't any parent levels to update. */
if (level + 1 >= cur->bc_nlevels)
@@ -2225,7 +2359,7 @@ xfs_btree_update_keys(
ASSERT(level >= 0);
block = xfs_btree_get_block(cur, level, &bp);
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)
return __xfs_btree_updkeys(cur, level, block, bp, false);
/*
@@ -2332,8 +2466,7 @@ xfs_btree_lshift(
int error; /* error return value */
int i;
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- level == cur->bc_nlevels - 1)
+ if (xfs_btree_at_iroot(cur, level))
goto out0;
/* Set up variables for this block as "right". */
@@ -2460,12 +2593,13 @@ xfs_btree_lshift(
* Using a temporary cursor, update the parent key values of the
* block on the left.
*/
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
error = xfs_btree_dup_cursor(cur, &tcur);
if (error)
goto error0;
i = xfs_btree_firstrec(tcur, level);
if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2527,8 +2661,7 @@ xfs_btree_rshift(
int error; /* error return value */
int i; /* loop counter */
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (level == cur->bc_nlevels - 1))
+ if (xfs_btree_at_iroot(cur, level))
goto out0;
/* Set up variables for this block as "left". */
@@ -2636,6 +2769,7 @@ xfs_btree_rshift(
goto error0;
i = xfs_btree_lastrec(tcur, level);
if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2645,7 +2779,7 @@ xfs_btree_rshift(
goto error1;
/* Update the parent high keys of the left block, if needed. */
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
error = xfs_btree_update_keys(cur, level);
if (error)
goto error1;
@@ -2673,6 +2807,32 @@ error1:
return error;
}
+static inline int
+xfs_btree_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *hint_block,
+ union xfs_btree_ptr *new_block,
+ int *stat)
+{
+ int error;
+
+ /*
+ * Don't allow block allocation for a staging cursor, because staging
+ * cursors do not support regular btree modifications.
+ *
+ * Bulk loading uses a separate callback to obtain new blocks from a
+ * preallocated list, which prevents ENOSPC failures during loading.
+ */
+ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat);
+ trace_xfs_btree_alloc_block(cur, new_block, *stat, error);
+ return error;
+}
+
/*
* Split cur/level block in half.
* Return new block number and the key to its first
@@ -2716,7 +2876,7 @@ __xfs_btree_split(
xfs_btree_buf_to_ptr(cur, lbp, &lptr);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
+ error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -2823,7 +2983,7 @@ __xfs_btree_split(
}
/* Update the parent high keys of the left block, if needed. */
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
error = xfs_btree_update_keys(cur, level);
if (error)
goto error0;
@@ -2941,7 +3101,7 @@ xfs_btree_split(
struct xfs_btree_split_args args;
DECLARE_COMPLETION_ONSTACK(done);
- if (cur->bc_btnum != XFS_BTNUM_BMAP ||
+ if (!xfs_btree_is_bmap(cur->bc_ops) ||
cur->bc_tp->t_highest_agno == NULLAGNUMBER)
return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
@@ -2963,7 +3123,6 @@ xfs_btree_split(
#define xfs_btree_split __xfs_btree_split
#endif /* __KERNEL__ */
-
/*
* Copy the old inode root contents into a real block and make the
* broot point to it.
@@ -2988,7 +3147,7 @@ xfs_btree_new_iroot(
XFS_BTREE_STATS_INC(cur, newroot);
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
level = cur->bc_nlevels - 1;
@@ -2996,7 +3155,7 @@ xfs_btree_new_iroot(
pp = xfs_btree_ptr_addr(cur, 1, block);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
+ error = xfs_btree_alloc_block(cur, pp, &nptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -3014,9 +3173,9 @@ xfs_btree_new_iroot(
* In that case have to also ensure the blkno remains correct
*/
memcpy(cblock, block, xfs_btree_block_len(cur));
- if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ if (xfs_has_crc(cur->bc_mp)) {
__be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
cblock->bb_u.l.bb_blkno = bno;
else
cblock->bb_u.s.bb_blkno = bno;
@@ -3069,6 +3228,21 @@ error0:
return error;
}
+static void
+xfs_btree_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ if (cur->bc_flags & XFS_BTREE_STAGING) {
+ /* Update the btree root information for a per-AG fake root. */
+ cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s);
+ cur->bc_ag.afake->af_levels += inc;
+ } else {
+ cur->bc_ops->set_root(cur, ptr, inc);
+ }
+}
+
/*
* Allocate a new root block, fill it in.
*/
@@ -3093,10 +3267,10 @@ xfs_btree_new_root(
XFS_BTREE_STATS_INC(cur, newroot);
/* initialise our start point from the cursor */
- cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+ xfs_btree_init_ptr_from_cur(cur, &rptr);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
+ error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -3109,7 +3283,7 @@ xfs_btree_new_root(
goto error0;
/* Set the root in the holding structure increasing the level by 1. */
- cur->bc_ops->set_root(cur, &lptr, 1);
+ xfs_btree_set_root(cur, &lptr, 1);
/*
* At the previous root level there are now two blocks: the old root,
@@ -3213,8 +3387,7 @@ xfs_btree_make_block_unfull(
{
int error = 0;
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- level == cur->bc_nlevels - 1) {
+ if (xfs_btree_at_iroot(cur, level)) {
struct xfs_inode *ip = cur->bc_ino.ip;
if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
@@ -3299,8 +3472,8 @@ xfs_btree_insrec(
* If we have an external root pointer, and we've made it to the
* root level, allocate a new root block and we're done.
*/
- if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (level >= cur->bc_nlevels)) {
+ if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE &&
+ level >= cur->bc_nlevels) {
error = xfs_btree_new_root(cur, stat);
xfs_btree_set_ptr_null(cur, ptrp);
@@ -3524,6 +3697,7 @@ xfs_btree_insert(
}
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -3537,7 +3711,8 @@ xfs_btree_insert(
if (pcur != cur &&
(ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
/* Save the state from the cursor before we trash it */
- if (cur->bc_ops->update_cursor)
+ if (cur->bc_ops->update_cursor &&
+ !(cur->bc_flags & XFS_BTREE_STAGING))
cur->bc_ops->update_cursor(pcur, cur);
cur->bc_nlevels = pcur->bc_nlevels;
xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
@@ -3586,7 +3761,7 @@ xfs_btree_kill_iroot(
#endif
int i;
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
ASSERT(cur->bc_nlevels > 1);
/*
@@ -3680,7 +3855,7 @@ xfs_btree_kill_root(
* Update the root pointer, decreasing the level by 1 and then
* free the old root.
*/
- cur->bc_ops->set_root(cur, newroot, -1);
+ xfs_btree_set_root(cur, newroot, -1);
error = xfs_btree_free_block(cur, bp);
if (error)
@@ -3822,27 +3997,25 @@ xfs_btree_delrec(
* Try to get rid of the next level down. If we can't then there's
* nothing left to do.
*/
- if (level == cur->bc_nlevels - 1) {
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
- xfs_iroot_realloc(cur->bc_ino.ip, -1,
- cur->bc_ino.whichfork);
+ if (xfs_btree_at_iroot(cur, level)) {
+ xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork);
- error = xfs_btree_kill_iroot(cur);
- if (error)
- goto error0;
+ error = xfs_btree_kill_iroot(cur);
+ if (error)
+ goto error0;
- error = xfs_btree_dec_cursor(cur, level, stat);
- if (error)
- goto error0;
- *stat = 1;
- return 0;
- }
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ *stat = 1;
+ return 0;
+ }
- /*
- * If this is the root level, and there's only one entry left,
- * and it's NOT the leaf level, then we can get rid of this
- * level.
- */
+ /*
+ * If this is the root level, and there's only one entry left, and it's
+ * NOT the leaf level, then we can get rid of this level.
+ */
+ if (level == cur->bc_nlevels - 1) {
if (numrecs == 1 && level > 0) {
union xfs_btree_ptr *pp;
/*
@@ -3891,7 +4064,7 @@ xfs_btree_delrec(
xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
/*
* One child of root, need to get a chance to copy its contents
* into the root and delete it. Can't go up to next level,
@@ -3931,6 +4104,7 @@ xfs_btree_delrec(
*/
i = xfs_btree_lastrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -3939,12 +4113,14 @@ xfs_btree_delrec(
if (error)
goto error0;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
i = xfs_btree_lastrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -3992,6 +4168,7 @@ xfs_btree_delrec(
if (!xfs_btree_ptr_is_null(cur, &lptr)) {
i = xfs_btree_firstrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -4000,6 +4177,7 @@ xfs_btree_delrec(
if (error)
goto error0;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -4017,6 +4195,7 @@ xfs_btree_delrec(
*/
i = xfs_btree_firstrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -4026,6 +4205,7 @@ xfs_btree_delrec(
goto error0;
i = xfs_btree_firstrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -4201,8 +4381,8 @@ xfs_btree_delrec(
* If we joined with the right neighbor and there's a level above
* us, increment the cursor at that level.
*/
- else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
- (level + 1 < cur->bc_nlevels)) {
+ else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE ||
+ level + 1 < cur->bc_nlevels) {
error = xfs_btree_increment(cur, level + 1, &i);
if (error)
goto error0;
@@ -4270,7 +4450,7 @@ xfs_btree_delete(
* If we combined blocks as part of deleting the record, delrec won't
* have updated the parent high keys so we have to do that here.
*/
- if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
+ if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) {
error = xfs_btree_updkeys_force(cur, 0);
if (error)
goto error0;
@@ -4344,7 +4524,7 @@ xfs_btree_visit_block(
{
struct xfs_btree_block *block;
struct xfs_buf *bp;
- union xfs_btree_ptr rptr;
+ union xfs_btree_ptr rptr, bufptr;
int error;
/* do right sibling readahead */
@@ -4367,15 +4547,12 @@ xfs_btree_visit_block(
* return the same block without checking if the right sibling points
* back to us and creates a cyclic reference in the btree.
*/
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
- xfs_buf_daddr(bp)))
- return -EFSCORRUPTED;
- } else {
- if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
- xfs_buf_daddr(bp)))
- return -EFSCORRUPTED;
+ xfs_btree_buf_to_ptr(cur, bp, &bufptr);
+ if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) {
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
}
+
return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
}
@@ -4393,7 +4570,7 @@ xfs_btree_visit_blocks(
struct xfs_btree_block *block = NULL;
int error = 0;
- cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+ xfs_btree_init_ptr_from_cur(cur, &lptr);
/* for each level */
for (level = cur->bc_nlevels - 1; level >= 0; level--) {
@@ -4471,7 +4648,7 @@ xfs_btree_block_change_owner(
/* modify the owner */
block = xfs_btree_get_block(cur, level, &bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
return 0;
block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
@@ -4489,7 +4666,7 @@ xfs_btree_block_change_owner(
* though, so everything is consistent in memory.
*/
if (!bp) {
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
ASSERT(level == cur->bc_nlevels - 1);
return 0;
}
@@ -4523,7 +4700,7 @@ xfs_btree_change_owner(
/* Verify the v5 fields of a long-format btree block. */
xfs_failaddr_t
-xfs_btree_lblock_v5hdr_verify(
+xfs_btree_fsblock_v5hdr_verify(
struct xfs_buf *bp,
uint64_t owner)
{
@@ -4544,7 +4721,7 @@ xfs_btree_lblock_v5hdr_verify(
/* Verify a long-format btree block. */
xfs_failaddr_t
-xfs_btree_lblock_verify(
+xfs_btree_fsblock_verify(
struct xfs_buf *bp,
unsigned int max_recs)
{
@@ -4553,28 +4730,60 @@ xfs_btree_lblock_verify(
xfs_fsblock_t fsb;
xfs_failaddr_t fa;
+ ASSERT(!xfs_buftarg_is_mem(bp->b_target));
+
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
- fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ fa = xfs_btree_check_fsblock_siblings(mp, fsb,
block->bb_u.l.bb_leftsib);
if (!fa)
- fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ fa = xfs_btree_check_fsblock_siblings(mp, fsb,
block->bb_u.l.bb_rightsib);
return fa;
}
+/* Verify an in-memory btree block. */
+xfs_failaddr_t
+xfs_btree_memblock_verify(
+ struct xfs_buf *bp,
+ unsigned int max_recs)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buftarg *btp = bp->b_target;
+ xfs_failaddr_t fa;
+ xfbno_t bno;
+
+ ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+ /* numrecs verification */
+ if (be16_to_cpu(block->bb_numrecs) > max_recs)
+ return __this_address;
+
+ /* sibling pointer verification */
+ bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
+ fa = xfs_btree_check_memblock_siblings(btp, bno,
+ block->bb_u.l.bb_leftsib);
+ if (fa)
+ return fa;
+ fa = xfs_btree_check_memblock_siblings(btp, bno,
+ block->bb_u.l.bb_rightsib);
+ if (fa)
+ return fa;
+
+ return NULL;
+}
/**
- * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
+ * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format
* btree block
*
* @bp: buffer containing the btree block
*/
xfs_failaddr_t
-xfs_btree_sblock_v5hdr_verify(
+xfs_btree_agblock_v5hdr_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_mount;
@@ -4593,13 +4802,13 @@ xfs_btree_sblock_v5hdr_verify(
}
/**
- * xfs_btree_sblock_verify() -- verify a short-format btree block
+ * xfs_btree_agblock_verify() -- verify a short-format btree block
*
* @bp: buffer containing the btree block
* @max_recs: maximum records allowed in this btree node
*/
xfs_failaddr_t
-xfs_btree_sblock_verify(
+xfs_btree_agblock_verify(
struct xfs_buf *bp,
unsigned int max_recs)
{
@@ -4608,16 +4817,18 @@ xfs_btree_sblock_verify(
xfs_agblock_t agbno;
xfs_failaddr_t fa;
+ ASSERT(!xfs_buftarg_is_mem(bp->b_target));
+
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
- fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+ fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
block->bb_u.s.bb_leftsib);
if (!fa)
- fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+ fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
block->bb_u.s.bb_rightsib);
return fa;
}
@@ -4815,7 +5026,7 @@ xfs_btree_overlapped_query_range(
/* Load the root of the btree. */
level = cur->bc_nlevels - 1;
- cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ xfs_btree_init_ptr_from_cur(cur, &ptr);
error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
if (error)
return error;
@@ -4966,7 +5177,7 @@ xfs_btree_query_range(
if (!xfs_btree_keycmp_le(cur, &low_key, &high_key))
return -EINVAL;
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return xfs_btree_simple_query_range(cur, &low_key,
&high_key, fn, priv);
return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
@@ -5020,7 +5231,7 @@ xfs_btree_diff_two_ptrs(
const union xfs_btree_ptr *a,
const union xfs_btree_ptr *b)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
}
@@ -5074,7 +5285,7 @@ xfs_btree_has_records_helper(
key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key,
&rec_key, info->key_mask);
if (key_contig == XBTREE_KEY_OVERLAP &&
- !(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ !(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return -EFSCORRUPTED;
if (key_contig == XBTREE_KEY_GAP)
return -ECANCELED;
@@ -5168,7 +5379,7 @@ xfs_btree_has_more_records(
return true;
/* There are more record blocks. */
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
else
return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
@@ -5233,6 +5444,7 @@ xfs_btree_goto_left_edge(
return error;
if (stat != 0) {
ASSERT(0);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index d906324e25c8..f93374278aa1 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -55,15 +55,8 @@ union xfs_btree_rec {
#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi)
#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi)
-#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi)
-#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
-#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
-#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
-#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
-#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi)
-#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi)
-
-uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
+struct xfs_btree_ops;
+uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops);
/*
* For logging record fields.
@@ -86,9 +79,11 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
* Generic stats interface
*/
#define XFS_BTREE_STATS_INC(cur, stat) \
- XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat)
+ XFS_STATS_INC_OFF((cur)->bc_mp, \
+ (cur)->bc_ops->statoff + __XBTS_ ## stat)
#define XFS_BTREE_STATS_ADD(cur, stat, val) \
- XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
+ XFS_STATS_ADD_OFF((cur)->bc_mp, \
+ (cur)->bc_ops->statoff + __XBTS_ ## stat, val)
enum xbtree_key_contig {
XBTREE_KEY_GAP = 0,
@@ -111,10 +106,37 @@ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y)
return XBTREE_KEY_OVERLAP;
}
+#define XFS_BTREE_LONG_PTR_LEN (sizeof(__be64))
+#define XFS_BTREE_SHORT_PTR_LEN (sizeof(__be32))
+
+enum xfs_btree_type {
+ XFS_BTREE_TYPE_AG,
+ XFS_BTREE_TYPE_INODE,
+ XFS_BTREE_TYPE_MEM,
+};
+
struct xfs_btree_ops {
- /* size of the key and record structures */
- size_t key_len;
- size_t rec_len;
+ const char *name;
+
+ /* Type of btree - AG-rooted or inode-rooted */
+ enum xfs_btree_type type;
+
+ /* XFS_BTGEO_* flags that determine the geometry of the btree */
+ unsigned int geom_flags;
+
+ /* size of the key, pointer, and record structures */
+ size_t key_len;
+ size_t ptr_len;
+ size_t rec_len;
+
+ /* LRU refcount to set on each btree buffer created */
+ unsigned int lru_refs;
+
+ /* offset of btree stats array */
+ unsigned int statoff;
+
+ /* sick mask for health reporting (only for XFS_BTREE_TYPE_AG) */
+ unsigned int sick_mask;
/* cursor operations */
struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
@@ -199,6 +221,10 @@ struct xfs_btree_ops {
const union xfs_btree_key *mask);
};
+/* btree geometry flags */
+#define XFS_BTGEO_LASTREC_UPDATE (1U << 0) /* track last rec externally */
+#define XFS_BTGEO_OVERLAPPING (1U << 1) /* overlapping intervals */
+
/*
* Reasons for the update_lastrec method to be called.
*/
@@ -215,39 +241,6 @@ union xfs_btree_irec {
struct xfs_refcount_irec rc;
};
-/* Per-AG btree information. */
-struct xfs_btree_cur_ag {
- struct xfs_perag *pag;
- union {
- struct xfs_buf *agbp;
- struct xbtree_afakeroot *afake; /* for staging cursor */
- };
- union {
- struct {
- unsigned int nr_ops; /* # record updates */
- unsigned int shape_changes; /* # of extent splits */
- } refc;
- struct {
- bool active; /* allocation cursor state */
- } abt;
- };
-};
-
-/* Btree-in-inode cursor information */
-struct xfs_btree_cur_ino {
- struct xfs_inode *ip;
- struct xbtree_ifakeroot *ifake; /* for staging cursor */
- int allocated;
- short forksize;
- char whichfork;
- char flags;
-/* We are converting a delalloc reservation */
-#define XFS_BTCUR_BMBT_WASDEL (1 << 0)
-
-/* For extent swap, ignore owner check in verifier */
-#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1)
-};
-
struct xfs_btree_level {
/* buffer pointer */
struct xfs_buf *bp;
@@ -272,21 +265,38 @@ struct xfs_btree_cur
const struct xfs_btree_ops *bc_ops;
struct kmem_cache *bc_cache; /* cursor cache */
unsigned int bc_flags; /* btree features - below */
- xfs_btnum_t bc_btnum; /* identifies which btree type */
union xfs_btree_irec bc_rec; /* current insert/search record value */
uint8_t bc_nlevels; /* number of levels in the tree */
uint8_t bc_maxlevels; /* maximum levels for this btree type */
- int bc_statoff; /* offset of btree stats array */
- /*
- * Short btree pointers need an agno to be able to turn the pointers
- * into physical addresses for IO, so the btree cursor switches between
- * bc_ino and bc_ag based on whether XFS_BTREE_LONG_PTRS is set for the
- * cursor.
- */
+ /* per-type information */
union {
- struct xfs_btree_cur_ag bc_ag;
- struct xfs_btree_cur_ino bc_ino;
+ struct {
+ struct xfs_inode *ip;
+ short forksize;
+ char whichfork;
+ struct xbtree_ifakeroot *ifake; /* for staging cursor */
+ } bc_ino;
+ struct {
+ struct xfs_perag *pag;
+ struct xfs_buf *agbp;
+ struct xbtree_afakeroot *afake; /* for staging cursor */
+ } bc_ag;
+ struct {
+ struct xfbtree *xfbtree;
+ struct xfs_perag *pag;
+ } bc_mem;
+ };
+
+ /* per-format private data */
+ union {
+ struct {
+ int allocated;
+ } bc_bmap; /* bmapbt */
+ struct {
+ unsigned int nr_ops; /* # record updates */
+ unsigned int shape_changes; /* # of extent splits */
+ } bc_refc; /* refcountbt */
};
/* Must be at the end of the struct! */
@@ -304,18 +314,22 @@ xfs_btree_cur_sizeof(unsigned int nlevels)
return struct_size_t(struct xfs_btree_cur, bc_levels, nlevels);
}
-/* cursor flags */
-#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
-#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
-#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
-#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
-#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */
+/* cursor state flags */
/*
* The root of this btree is a fakeroot structure so that we can stage a btree
* rebuild without leaving it accessible via primary metadata. The ops struct
* is dynamically allocated and must be freed when the cursor is deleted.
*/
-#define XFS_BTREE_STAGING (1<<5)
+#define XFS_BTREE_STAGING (1U << 0)
+
+/* We are converting a delalloc reservation (only for bmbt btrees) */
+#define XFS_BTREE_BMBT_WASDEL (1U << 1)
+
+/* For extent swap, ignore owner check in verifier (only for bmbt btrees) */
+#define XFS_BTREE_BMBT_INVALID_OWNER (1U << 2)
+
+/* Cursor is active (only for allocbt btrees) */
+#define XFS_BTREE_ALLOCBT_ACTIVE (1U << 3)
#define XFS_BTREE_NOERROR 0
#define XFS_BTREE_ERROR 1
@@ -325,14 +339,10 @@ xfs_btree_cur_sizeof(unsigned int nlevels)
*/
#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr))
-/*
- * Internal long and short btree block checks. They return NULL if the
- * block is ok or the address of the failed check otherwise.
- */
-xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur,
- struct xfs_btree_block *block, int level, struct xfs_buf *bp);
-xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur,
+xfs_failaddr_t __xfs_btree_check_block(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, int level, struct xfs_buf *bp);
+int __xfs_btree_check_ptr(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr, int index, int level);
/*
* Check that block header is ok.
@@ -345,24 +355,6 @@ xfs_btree_check_block(
struct xfs_buf *bp); /* buffer containing block, if any */
/*
- * Check that (long) pointer is ok.
- */
-bool /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lptr(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_fsblock_t fsbno, /* btree block disk address */
- int level); /* btree block level */
-
-/*
- * Check that (short) pointer is ok.
- */
-bool /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sptr(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agblock_t agbno, /* btree block disk address */
- int level); /* btree block level */
-
-/*
* Delete the btree cursor.
*/
void
@@ -392,63 +384,14 @@ xfs_btree_offsets(
int *last); /* output: last byte offset */
/*
- * Get a buffer for the block, return it read in.
- * Long-form addressing.
- */
-int /* error */
-xfs_btree_read_bufl(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t fsbno, /* file system block number */
- struct xfs_buf **bpp, /* buffer for fsbno */
- int refval, /* ref count value for buffer */
- const struct xfs_buf_ops *ops);
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Long-form addressing.
- */
-void /* error */
-xfs_btree_reada_bufl(
- struct xfs_mount *mp, /* file system mount point */
- xfs_fsblock_t fsbno, /* file system block number */
- xfs_extlen_t count, /* count of filesystem blocks */
- const struct xfs_buf_ops *ops);
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Short-form addressing.
- */
-void /* error */
-xfs_btree_reada_bufs(
- struct xfs_mount *mp, /* file system mount point */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- xfs_extlen_t count, /* count of filesystem blocks */
- const struct xfs_buf_ops *ops);
-
-/*
* Initialise a new btree block header
*/
-void
-xfs_btree_init_block(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- xfs_btnum_t btnum,
- __u16 level,
- __u16 numrecs,
- __u64 owner);
-
-void
-xfs_btree_init_block_int(
- struct xfs_mount *mp,
- struct xfs_btree_block *buf,
- xfs_daddr_t blkno,
- xfs_btnum_t btnum,
- __u16 level,
- __u16 numrecs,
- __u64 owner,
- unsigned int flags);
+void xfs_btree_init_buf(struct xfs_mount *mp, struct xfs_buf *bp,
+ const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs,
+ __u64 owner);
+void xfs_btree_init_block(struct xfs_mount *mp,
+ struct xfs_btree_block *buf, const struct xfs_btree_ops *ops,
+ __u16 level, __u16 numrecs, __u64 owner);
/*
* Common btree core entry points.
@@ -467,10 +410,10 @@ int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner,
/*
* btree block CRC helpers
*/
-void xfs_btree_lblock_calc_crc(struct xfs_buf *);
-bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
-void xfs_btree_sblock_calc_crc(struct xfs_buf *);
-bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
+void xfs_btree_fsblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_fsblock_verify_crc(struct xfs_buf *);
+void xfs_btree_agblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_agblock_verify_crc(struct xfs_buf *);
/*
* Internal btree helpers also used by xfs_bmap.c.
@@ -510,12 +453,14 @@ static inline int xfs_btree_get_level(const struct xfs_btree_block *block)
#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b))
#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b))
-xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
-xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp,
+xfs_failaddr_t xfs_btree_agblock_v5hdr_verify(struct xfs_buf *bp);
+xfs_failaddr_t xfs_btree_agblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
-xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp,
+xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify(struct xfs_buf *bp,
uint64_t owner);
-xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
+xfs_failaddr_t xfs_btree_fsblock_verify(struct xfs_buf *bp,
+ unsigned int max_recs);
+xfs_failaddr_t xfs_btree_memblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits,
@@ -690,7 +635,7 @@ xfs_btree_islastblock(
block = xfs_btree_get_block(cur, level, &bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
}
@@ -714,21 +659,28 @@ void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur,
void xfs_btree_copy_keys(struct xfs_btree_cur *cur,
union xfs_btree_key *dst_key,
const union xfs_btree_key *src_key, int numkeys);
+void xfs_btree_init_ptr_from_cur(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
static inline struct xfs_btree_cur *
xfs_btree_alloc_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_btnum_t btnum,
+ const struct xfs_btree_ops *ops,
uint8_t maxlevels,
struct kmem_cache *cache)
{
struct xfs_btree_cur *cur;
- cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL);
+ ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN ||
+ ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN);
+
+ /* BMBT allocations can come through from non-transactional context. */
+ cur = kmem_cache_zalloc(cache,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+ cur->bc_ops = ops;
cur->bc_tp = tp;
cur->bc_mp = mp;
- cur->bc_btnum = btnum;
cur->bc_maxlevels = maxlevels;
cur->bc_cache = cache;
@@ -740,4 +692,14 @@ void xfs_btree_destroy_cur_caches(void);
int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur);
+/* Does this level of the cursor point to the inode root (and not a block)? */
+static inline bool
+xfs_btree_at_iroot(
+ const struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
+ level == cur->bc_nlevels - 1;
+}
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c
new file mode 100644
index 000000000000..036061fe32cc
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_mem.c
@@ -0,0 +1,347 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_btree.h"
+#include "xfs_error.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_ag.h"
+#include "xfs_buf_item.h"
+#include "xfs_trace.h"
+
+/* Set the root of an in-memory btree. */
+void
+xfbtree_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ cur->bc_mem.xfbtree->root = *ptr;
+ cur->bc_mem.xfbtree->nlevels += inc;
+}
+
+/* Initialize a pointer from the in-memory btree header. */
+void
+xfbtree_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ *ptr = cur->bc_mem.xfbtree->root;
+}
+
+/* Duplicate an in-memory btree cursor. */
+struct xfs_btree_cur *
+xfbtree_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_btree_cur *ncur;
+
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ ncur = xfs_btree_alloc_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ops,
+ cur->bc_maxlevels, cur->bc_cache);
+ ncur->bc_flags = cur->bc_flags;
+ ncur->bc_nlevels = cur->bc_nlevels;
+ ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree;
+
+ if (cur->bc_mem.pag)
+ ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag);
+
+ return ncur;
+}
+
+/* Tear down an in-memory btree by draining its buffer cache target. */
+void
+xfbtree_destroy(
+ struct xfbtree *xfbt)
+{
+ xfs_buftarg_drain(xfbt->target);
+}
+
+/* Compute the number of bytes available for records. */
+static inline unsigned int
+xfbtree_rec_bytes(
+ struct xfs_mount *mp,
+ const struct xfs_btree_ops *ops)
+{
+ return XMBUF_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+}
+
+/* Initialize an empty leaf block as the btree root. */
+STATIC int
+xfbtree_init_leaf_block(
+ struct xfs_mount *mp,
+ struct xfbtree *xfbt,
+ const struct xfs_btree_ops *ops)
+{
+ struct xfs_buf *bp;
+ xfbno_t bno = xfbt->highest_bno++;
+ int error;
+
+ error = xfs_buf_get(xfbt->target, xfbno_to_daddr(bno), XFBNO_BBSIZE,
+ &bp);
+ if (error)
+ return error;
+
+ trace_xfbtree_create_root_buf(xfbt, bp);
+
+ bp->b_ops = ops->buf_ops;
+ xfs_btree_init_buf(mp, bp, ops, 0, 0, xfbt->owner);
+ xfs_buf_relse(bp);
+
+ xfbt->root.l = cpu_to_be64(bno);
+ return 0;
+}
+
+/*
+ * Create an in-memory btree root that can be used with the given xmbuf.
+ * Callers must set xfbt->owner (NOTE: the memset() below currently clears it).
+ */
+int
+xfbtree_init(
+ struct xfs_mount *mp,
+ struct xfbtree *xfbt,
+ struct xfs_buftarg *btp,
+ const struct xfs_btree_ops *ops)
+{
+ unsigned int blocklen = xfbtree_rec_bytes(mp, ops);
+ unsigned int keyptr_len;
+ int error;
+
+ /* Requires a long-format CRC-format btree */
+ if (!xfs_has_crc(mp)) {
+ ASSERT(xfs_has_crc(mp));
+ return -EINVAL;
+ }
+ if (ops->ptr_len != XFS_BTREE_LONG_PTR_LEN) {
+ ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN);
+ return -EINVAL;
+ }
+
+ memset(xfbt, 0, sizeof(*xfbt));
+ xfbt->target = btp;
+
+ /* Set up min/maxrecs for this btree. */
+ keyptr_len = ops->key_len + sizeof(__be64);
+ xfbt->maxrecs[0] = blocklen / ops->rec_len;
+ xfbt->maxrecs[1] = blocklen / keyptr_len;
+ xfbt->minrecs[0] = xfbt->maxrecs[0] / 2;
+ xfbt->minrecs[1] = xfbt->maxrecs[1] / 2;
+ xfbt->highest_bno = 0;
+ xfbt->nlevels = 1;
+
+ /* Initialize the empty btree. */
+ error = xfbtree_init_leaf_block(mp, xfbt, ops);
+ if (error)
+ goto err_freesp;
+
+ trace_xfbtree_init(mp, xfbt, ops);
+
+ return 0;
+
+err_freesp:
+ xfs_buftarg_drain(xfbt->target);
+ return error;
+}
+
+/* Allocate a block to our in-memory btree. */
+int
+xfbtree_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ struct xfbtree *xfbt = cur->bc_mem.xfbtree;
+ xfbno_t bno = xfbt->highest_bno++;
+
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ trace_xfbtree_alloc_block(xfbt, cur, bno);
+
+ /* Fail if the block address exceeds the maximum for the buftarg. */
+ if (!xfbtree_verify_bno(xfbt, bno)) {
+ ASSERT(xfbtree_verify_bno(xfbt, bno));
+ *stat = 0;
+ return 0;
+ }
+
+ new->l = cpu_to_be64(bno);
+ *stat = 1;
+ return 0;
+}
+
+/* Free a block from our in-memory btree. */
+int
+xfbtree_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfbtree *xfbt = cur->bc_mem.xfbtree;
+ xfs_daddr_t daddr = xfs_buf_daddr(bp);
+ xfbno_t bno = xfs_daddr_to_xfbno(daddr);
+
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ trace_xfbtree_free_block(xfbt, cur, bno);
+
+ if (bno + 1 == xfbt->highest_bno)
+ xfbt->highest_bno--;
+
+ return 0;
+}
+
+/* Return the minimum number of records for a btree block. */
+int
+xfbtree_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ struct xfbtree *xfbt = cur->bc_mem.xfbtree;
+
+ return xfbt->minrecs[level != 0];
+}
+
+/* Return the maximum number of records for a btree block. */
+int
+xfbtree_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ struct xfbtree *xfbt = cur->bc_mem.xfbtree;
+
+ return xfbt->maxrecs[level != 0];
+}
+
+/* If this log item is a buffer item that came from the xfbtree, return it. */
+static inline struct xfs_buf *
+xfbtree_buf_match(
+ struct xfbtree *xfbt,
+ const struct xfs_log_item *lip)
+{
+ const struct xfs_buf_log_item *bli;
+ struct xfs_buf *bp;
+
+ if (lip->li_type != XFS_LI_BUF)
+ return NULL;
+
+ bli = container_of(lip, struct xfs_buf_log_item, bli_item);
+ bp = bli->bli_buf;
+ if (bp->b_target != xfbt->target)
+ return NULL;
+
+ return bp;
+}
+
+/*
+ * Commit changes to the incore btree immediately by writing all dirty xfbtree
+ * buffers to the backing xfile. This detaches all xfbtree buffers from the
+ * transaction, even on failure. The buffer locks are dropped between the
+ * delwri queue and submit, so the caller must synchronize btree access.
+ *
+ * Normally we'd let the buffers commit with the transaction and get written to
+ * the xfile via the log, but online repair stages ephemeral btrees in memory
+ * and uses the btree_staging functions to write new btrees to disk atomically.
+ * The in-memory btree (and its backing store) are discarded at the end of the
+ * repair phase, which means that xfbtree buffers cannot commit with the rest
+ * of a transaction.
+ *
+ * In other words, online repair only needs the transaction to collect buffer
+ * pointers and to avoid buffer deadlocks, not to guarantee consistency of
+ * updates.
+ */
+int
+xfbtree_trans_commit(
+ struct xfbtree *xfbt,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item *lip, *n;
+ bool tp_dirty = false;
+ int error = 0;
+
+ /*
+ * For each xfbtree buffer attached to the transaction, write the dirty
+ * buffers to the xfile and release them.
+ */
+ list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
+ struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip);
+
+ if (!bp) {
+ if (test_bit(XFS_LI_DIRTY, &lip->li_flags))
+ tp_dirty |= true;
+ continue;
+ }
+
+ trace_xfbtree_trans_commit_buf(xfbt, bp);
+
+ xmbuf_trans_bdetach(tp, bp);
+
+ /*
+ * If the buffer fails verification, note the failure but
+ * continue walking the transaction items so that we remove all
+ * ephemeral btree buffers.
+ */
+ if (!error)
+ error = xmbuf_finalize(bp);
+
+ xfs_buf_relse(bp);
+ }
+
+ /*
+ * Reset the transaction's dirty flag to reflect the dirty state of the
+ * log items that are still attached.
+ */
+ tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) |
+ (tp_dirty ? XFS_TRANS_DIRTY : 0);
+
+ return error;
+}
+
+/*
+ * Cancel changes to the incore btree by detaching all the xfbtree buffers.
+ * Changes are not undone, so callers must not access the btree ever again.
+ */
+void
+xfbtree_trans_cancel(
+ struct xfbtree *xfbt,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item *lip, *n;
+ bool tp_dirty = false;
+
+ list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
+ struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip);
+
+ if (!bp) {
+ if (test_bit(XFS_LI_DIRTY, &lip->li_flags))
+ tp_dirty |= true;
+ continue;
+ }
+
+ trace_xfbtree_trans_cancel_buf(xfbt, bp);
+
+ xmbuf_trans_bdetach(tp, bp);
+ xfs_buf_relse(bp);
+ }
+
+ /*
+ * Reset the transaction's dirty flag to reflect the dirty state of the
+ * log items that are still attached.
+ */
+ tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) |
+ (tp_dirty ? XFS_TRANS_DIRTY : 0);
+}
diff --git a/fs/xfs/libxfs/xfs_btree_mem.h b/fs/xfs/libxfs/xfs_btree_mem.h
new file mode 100644
index 000000000000..1c3825786ec8
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_mem.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_BTREE_MEM_H__
+#define __XFS_BTREE_MEM_H__
+
+typedef uint64_t xfbno_t;
+
+#define XFBNO_BLOCKSIZE (XMBUF_BLOCKSIZE)
+#define XFBNO_BBSHIFT (XMBUF_BLOCKSHIFT - BBSHIFT)
+#define XFBNO_BBSIZE (XFBNO_BLOCKSIZE >> BBSHIFT)
+
+static inline xfs_daddr_t xfbno_to_daddr(xfbno_t blkno)
+{
+ return blkno << XFBNO_BBSHIFT;
+}
+
+static inline xfbno_t xfs_daddr_to_xfbno(xfs_daddr_t daddr)
+{
+ return daddr >> XFBNO_BBSHIFT;
+}
+
+struct xfbtree {
+ /* buffer cache target for this in-memory btree */
+ struct xfs_buftarg *target;
+
+ /* Next block number that will be handed out by the block allocator. */
+ xfbno_t highest_bno;
+
+ /* Owner of this btree. */
+ unsigned long long owner;
+
+ /* Btree header */
+ union xfs_btree_ptr root;
+ unsigned int nlevels;
+
+ /* Minimum and maximum records per block. */
+ unsigned int maxrecs[2];
+ unsigned int minrecs[2];
+};
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+static inline bool xfbtree_verify_bno(struct xfbtree *xfbt, xfbno_t bno)
+{
+ return xmbuf_verify_daddr(xfbt->target, xfbno_to_daddr(bno));
+}
+
+void xfbtree_set_root(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr, int inc);
+void xfbtree_init_ptr_from_cur(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
+struct xfs_btree_cur *xfbtree_dup_cursor(struct xfs_btree_cur *cur);
+
+int xfbtree_get_minrecs(struct xfs_btree_cur *cur, int level);
+int xfbtree_get_maxrecs(struct xfs_btree_cur *cur, int level);
+
+int xfbtree_alloc_block(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start, union xfs_btree_ptr *ptr,
+ int *stat);
+int xfbtree_free_block(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
+/* Zeroes *xfbt and sets xfbt->target from @btp; a preset xfbt->owner is cleared */
+int xfbtree_init(struct xfs_mount *mp, struct xfbtree *xfbt,
+ struct xfs_buftarg *btp, const struct xfs_btree_ops *ops);
+void xfbtree_destroy(struct xfbtree *xfbt);
+
+int xfbtree_trans_commit(struct xfbtree *xfbt, struct xfs_trans *tp);
+void xfbtree_trans_cancel(struct xfbtree *xfbt, struct xfs_trans *tp);
+#else
+# define xfbtree_verify_bno(...) (false)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
+#endif /* __XFS_BTREE_MEM_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
index e276eba87cb1..694929703152 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.c
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -39,63 +39,6 @@
*/
/*
- * Don't allow staging cursors to be duplicated because they're supposed to be
- * kept private to a single thread.
- */
-STATIC struct xfs_btree_cur *
-xfs_btree_fakeroot_dup_cursor(
- struct xfs_btree_cur *cur)
-{
- ASSERT(0);
- return NULL;
-}
-
-/*
- * Don't allow block allocation for a staging cursor, because staging cursors
- * do not support regular btree modifications.
- *
- * Bulk loading uses a separate callback to obtain new blocks from a
- * preallocated list, which prevents ENOSPC failures during loading.
- */
-STATIC int
-xfs_btree_fakeroot_alloc_block(
- struct xfs_btree_cur *cur,
- const union xfs_btree_ptr *start_bno,
- union xfs_btree_ptr *new_bno,
- int *stat)
-{
- ASSERT(0);
- return -EFSCORRUPTED;
-}
-
-/*
- * Don't allow block freeing for a staging cursor, because staging cursors
- * do not support regular btree modifications.
- */
-STATIC int
-xfs_btree_fakeroot_free_block(
- struct xfs_btree_cur *cur,
- struct xfs_buf *bp)
-{
- ASSERT(0);
- return -EFSCORRUPTED;
-}
-
-/* Initialize a pointer to the root block from the fakeroot. */
-STATIC void
-xfs_btree_fakeroot_init_ptr_from_cur(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr)
-{
- struct xbtree_afakeroot *afake;
-
- ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
-
- afake = cur->bc_ag.afake;
- ptr->s = cpu_to_be32(afake->af_root);
-}
-
-/*
* Bulk Loading for AG Btrees
* ==========================
*
@@ -109,47 +52,20 @@ xfs_btree_fakeroot_init_ptr_from_cur(
* cursor into a regular btree cursor.
*/
-/* Update the btree root information for a per-AG fake root. */
-STATIC void
-xfs_btree_afakeroot_set_root(
- struct xfs_btree_cur *cur,
- const union xfs_btree_ptr *ptr,
- int inc)
-{
- struct xbtree_afakeroot *afake = cur->bc_ag.afake;
-
- ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
- afake->af_root = be32_to_cpu(ptr->s);
- afake->af_levels += inc;
-}
-
/*
* Initialize an AG-rooted btree cursor with the given AG btree fake root.
- * The btree cursor's bc_ops will be overridden as needed to make the staging
- * functionality work.
*/
void
xfs_btree_stage_afakeroot(
struct xfs_btree_cur *cur,
struct xbtree_afakeroot *afake)
{
- struct xfs_btree_ops *nops;
-
ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
- ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
+ ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE);
ASSERT(cur->bc_tp == NULL);
- nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
- memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
- nops->alloc_block = xfs_btree_fakeroot_alloc_block;
- nops->free_block = xfs_btree_fakeroot_free_block;
- nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
- nops->set_root = xfs_btree_afakeroot_set_root;
- nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
-
cur->bc_ag.afake = afake;
cur->bc_nlevels = afake->af_levels;
- cur->bc_ops = nops;
cur->bc_flags |= XFS_BTREE_STAGING;
}
@@ -163,17 +79,15 @@ void
xfs_btree_commit_afakeroot(
struct xfs_btree_cur *cur,
struct xfs_trans *tp,
- struct xfs_buf *agbp,
- const struct xfs_btree_ops *ops)
+ struct xfs_buf *agbp)
{
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
ASSERT(cur->bc_tp == NULL);
trace_xfs_btree_commit_afakeroot(cur);
- kmem_free((void *)cur->bc_ops);
+ cur->bc_ag.afake = NULL;
cur->bc_ag.agbp = agbp;
- cur->bc_ops = ops;
cur->bc_flags &= ~XFS_BTREE_STAGING;
cur->bc_tp = tp;
}
@@ -211,29 +125,16 @@ xfs_btree_commit_afakeroot(
void
xfs_btree_stage_ifakeroot(
struct xfs_btree_cur *cur,
- struct xbtree_ifakeroot *ifake,
- struct xfs_btree_ops **new_ops)
+ struct xbtree_ifakeroot *ifake)
{
- struct xfs_btree_ops *nops;
-
ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
ASSERT(cur->bc_tp == NULL);
- nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
- memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
- nops->alloc_block = xfs_btree_fakeroot_alloc_block;
- nops->free_block = xfs_btree_fakeroot_free_block;
- nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
- nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
-
cur->bc_ino.ifake = ifake;
cur->bc_nlevels = ifake->if_levels;
- cur->bc_ops = nops;
+ cur->bc_ino.forksize = ifake->if_fork_size;
cur->bc_flags |= XFS_BTREE_STAGING;
-
- if (new_ops)
- *new_ops = nops;
}
/*
@@ -246,18 +147,15 @@ void
xfs_btree_commit_ifakeroot(
struct xfs_btree_cur *cur,
struct xfs_trans *tp,
- int whichfork,
- const struct xfs_btree_ops *ops)
+ int whichfork)
{
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
ASSERT(cur->bc_tp == NULL);
trace_xfs_btree_commit_ifakeroot(cur);
- kmem_free((void *)cur->bc_ops);
cur->bc_ino.ifake = NULL;
cur->bc_ino.whichfork = whichfork;
- cur->bc_ops = ops;
cur->bc_flags &= ~XFS_BTREE_STAGING;
cur->bc_tp = tp;
}
@@ -397,8 +295,7 @@ xfs_btree_bload_prep_block(
struct xfs_btree_block *new_block;
int ret;
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- level == cur->bc_nlevels - 1) {
+ if (xfs_btree_at_iroot(cur, level)) {
struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
size_t new_size;
@@ -406,14 +303,12 @@ xfs_btree_bload_prep_block(
/* Allocate a new incore btree root block. */
new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
- ifp->if_broot = kmem_zalloc(new_size, 0);
+ ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
ifp->if_broot_bytes = (int)new_size;
/* Initialize it and send it out. */
- xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
- XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
- nr_this_block, cur->bc_ino.ip->i_ino,
- cur->bc_flags);
+ xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops,
+ level, nr_this_block, cur->bc_ino.ip->i_ino);
*bpp = NULL;
*blockp = ifp->if_broot;
@@ -704,7 +599,7 @@ xfs_btree_bload_compute_geometry(
xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
&avg_per_block, &level_blocks, &dontcare64);
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
/*
* If all the items we want to store at this level
* would fit in the inode root block, then we have our
@@ -763,7 +658,7 @@ xfs_btree_bload_compute_geometry(
return -EOVERFLOW;
bbl->btree_height = cur->bc_nlevels;
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
bbl->nr_blocks = nr_blocks - 1;
else
bbl->nr_blocks = nr_blocks;
@@ -890,7 +785,7 @@ xfs_btree_bload(
}
/* Initialize the new root. */
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
cur->bc_ino.ifake->if_blocks = total_blocks - 1;
diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h
index 055ea43b1e18..0c9c2ffb127a 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.h
+++ b/fs/xfs/libxfs/xfs_btree_staging.h
@@ -22,7 +22,7 @@ struct xbtree_afakeroot {
void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
struct xbtree_afakeroot *afake);
void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
- struct xfs_buf *agbp, const struct xfs_btree_ops *ops);
+ struct xfs_buf *agbp);
/* Fake root for an inode-rooted btree. */
struct xbtree_ifakeroot {
@@ -41,10 +41,9 @@ struct xbtree_ifakeroot {
/* Cursor interactions with fake roots for inode-rooted btrees. */
void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur,
- struct xbtree_ifakeroot *ifake,
- struct xfs_btree_ops **new_ops);
+ struct xbtree_ifakeroot *ifake);
void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
- int whichfork, const struct xfs_btree_ops *ops);
+ int whichfork);
/* Bulk loading of staged btrees. */
typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur,
@@ -76,8 +75,7 @@ struct xfs_btree_bload {
/*
* This function should return the size of the in-core btree root
- * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree
- * types.
+ * block. It is only necessary for XFS_BTREE_TYPE_INODE btrees.
*/
xfs_btree_bload_iroot_size_fn iroot_size;
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 5457188bb4de..718d071bb21a 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -23,6 +23,7 @@
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_errortag.h"
+#include "xfs_health.h"
/*
* xfs_da_btree.c
@@ -85,7 +86,8 @@ xfs_da_state_alloc(
{
struct xfs_da_state *state;
- state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL);
+ state = kmem_cache_zalloc(xfs_da_state_cache,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
state->args = args;
state->mp = args->dp->i_mount;
return state;
@@ -352,6 +354,8 @@ const struct xfs_buf_ops xfs_da3_node_buf_ops = {
static int
xfs_da3_node_set_type(
struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ int whichfork,
struct xfs_buf *bp)
{
struct xfs_da_blkinfo *info = bp->b_addr;
@@ -373,6 +377,7 @@ xfs_da3_node_set_type(
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp,
info, sizeof(*info));
xfs_trans_brelse(tp, bp);
+ xfs_dirattr_mark_sick(dp, whichfork);
return -EFSCORRUPTED;
}
}
@@ -391,7 +396,7 @@ xfs_da3_node_read(
&xfs_da3_node_buf_ops);
if (error || !*bpp || !tp)
return error;
- return xfs_da3_node_set_type(tp, *bpp);
+ return xfs_da3_node_set_type(tp, dp, whichfork, *bpp);
}
int
@@ -408,6 +413,8 @@ xfs_da3_node_read_mapped(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, mappedbno,
XFS_FSB_TO_BB(mp, xfs_dabuf_nfsb(mp, whichfork)), 0,
bpp, &xfs_da3_node_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_dirattr_mark_sick(dp, whichfork);
if (error || !*bpp)
return error;
@@ -418,7 +425,7 @@ xfs_da3_node_read_mapped(
if (!tp)
return 0;
- return xfs_da3_node_set_type(tp, *bpp);
+ return xfs_da3_node_set_type(tp, dp, whichfork, *bpp);
}
/*
@@ -631,6 +638,7 @@ xfs_da3_split(
if (node->hdr.info.forw) {
if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) {
xfs_buf_mark_corrupt(oldblk->bp);
+ xfs_da_mark_sick(state->args);
error = -EFSCORRUPTED;
goto out;
}
@@ -644,6 +652,7 @@ xfs_da3_split(
if (node->hdr.info.back) {
if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) {
xfs_buf_mark_corrupt(oldblk->bp);
+ xfs_da_mark_sick(state->args);
error = -EFSCORRUPTED;
goto out;
}
@@ -1635,6 +1644,7 @@ xfs_da3_node_lookup_int(
if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) {
xfs_buf_mark_corrupt(blk->bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -1650,6 +1660,7 @@ xfs_da3_node_lookup_int(
/* Tree taller than we can handle; bail out! */
if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
xfs_buf_mark_corrupt(blk->bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -1658,6 +1669,7 @@ xfs_da3_node_lookup_int(
expected_level = nodehdr.level - 1;
else if (expected_level != nodehdr.level) {
xfs_buf_mark_corrupt(blk->bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
} else
expected_level--;
@@ -1709,12 +1721,16 @@ xfs_da3_node_lookup_int(
}
/* We can't point back to the root. */
- if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk))
+ if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk)) {
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
+ }
}
- if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0))
+ if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0)) {
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
+ }
/*
* A leaf block that ends in the hashval that we are interested in
@@ -1732,6 +1748,7 @@ xfs_da3_node_lookup_int(
args->blkno = blk->blkno;
} else {
ASSERT(0);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
@@ -2182,7 +2199,8 @@ xfs_da_grow_inode_int(
* If we didn't get it and the block might work if fragmented,
* try without the CONTIG flag. Loop until we get it all.
*/
- mapp = kmem_alloc(sizeof(*mapp) * count, 0);
+ mapp = kmalloc(sizeof(*mapp) * count,
+ GFP_KERNEL | __GFP_NOFAIL);
for (b = *bno, mapi = 0; b < *bno + count; ) {
c = (int)(*bno + count - b);
nmap = min(XFS_BMAP_MAX_NMAP, c);
@@ -2219,7 +2237,7 @@ xfs_da_grow_inode_int(
out_free_map:
if (mapp != &map)
- kmem_free(mapp);
+ kfree(mapp);
return error;
}
@@ -2297,8 +2315,10 @@ xfs_da3_swap_lastblock(
error = xfs_bmap_last_before(tp, dp, &lastoff, w);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, lastoff == 0))
+ if (XFS_IS_CORRUPT(mp, lastoff == 0)) {
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
+ }
/*
* Read the last block in the btree space.
*/
@@ -2348,6 +2368,7 @@ xfs_da3_swap_lastblock(
if (XFS_IS_CORRUPT(mp,
be32_to_cpu(sib_info->forw) != last_blkno ||
sib_info->magic != dead_info->magic)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2368,6 +2389,7 @@ xfs_da3_swap_lastblock(
if (XFS_IS_CORRUPT(mp,
be32_to_cpu(sib_info->back) != last_blkno ||
sib_info->magic != dead_info->magic)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2390,6 +2412,7 @@ xfs_da3_swap_lastblock(
xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
if (XFS_IS_CORRUPT(mp,
level >= 0 && level != par_hdr.level + 1)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2401,6 +2424,7 @@ xfs_da3_swap_lastblock(
entno++)
continue;
if (XFS_IS_CORRUPT(mp, entno == par_hdr.count)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2426,6 +2450,7 @@ xfs_da3_swap_lastblock(
xfs_trans_brelse(tp, par_buf);
par_buf = NULL;
if (XFS_IS_CORRUPT(mp, par_blkno == 0)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2435,6 +2460,7 @@ xfs_da3_swap_lastblock(
par_node = par_buf->b_addr;
xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2518,7 +2544,8 @@ xfs_dabuf_map(
int error = 0, nirecs, i;
if (nfsb > 1)
- irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_NOFS);
+ irecs = kzalloc(sizeof(irec) * nfsb,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
nirecs = nfsb;
error = xfs_bmapi_read(dp, bno, nfsb, irecs, &nirecs,
@@ -2531,7 +2558,8 @@ xfs_dabuf_map(
* larger one that needs to be freed by the caller.
*/
if (nirecs > 1) {
- map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS);
+ map = kzalloc(nirecs * sizeof(struct xfs_buf_map),
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
if (!map) {
error = -ENOMEM;
goto out_free_irecs;
@@ -2557,12 +2585,13 @@ xfs_dabuf_map(
*nmaps = nirecs;
out_free_irecs:
if (irecs != &irec)
- kmem_free(irecs);
+ kfree(irecs);
return error;
invalid_mapping:
/* Caller ok with no mapping. */
if (XFS_IS_CORRUPT(mp, !(flags & XFS_DABUF_MAP_HOLE_OK))) {
+ xfs_dirattr_mark_sick(dp, whichfork);
error = -EFSCORRUPTED;
if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
xfs_alert(mp, "%s: bno %u inode %llu",
@@ -2613,7 +2642,7 @@ xfs_da_get_buf(
out_free:
if (mapp != &map)
- kmem_free(mapp);
+ kfree(mapp);
return error;
}
@@ -2644,6 +2673,8 @@ xfs_da_read_buf(
error = xfs_trans_read_buf_map(mp, tp, mp->m_ddev_targp, mapp, nmap, 0,
&bp, ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_dirattr_mark_sick(dp, whichfork);
if (error)
goto out_free;
@@ -2654,7 +2685,7 @@ xfs_da_read_buf(
*bpp = bp;
out_free:
if (mapp != &map)
- kmem_free(mapp);
+ kfree(mapp);
return error;
}
@@ -2685,7 +2716,7 @@ xfs_da_reada_buf(
out_free:
if (mapp != &map)
- kmem_free(mapp);
+ kfree(mapp);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 24f9d1461f9a..060e5c96b70f 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -159,6 +159,17 @@ struct xfs_da3_intnode {
#define XFS_DIR3_FT_MAX 9
+#define XFS_DIR3_FTYPE_STR \
+ { XFS_DIR3_FT_UNKNOWN, "unknown" }, \
+ { XFS_DIR3_FT_REG_FILE, "file" }, \
+ { XFS_DIR3_FT_DIR, "directory" }, \
+ { XFS_DIR3_FT_CHRDEV, "char" }, \
+ { XFS_DIR3_FT_BLKDEV, "block" }, \
+ { XFS_DIR3_FT_FIFO, "fifo" }, \
+ { XFS_DIR3_FT_SOCK, "sock" }, \
+ { XFS_DIR3_FT_SYMLINK, "symlink" }, \
+ { XFS_DIR3_FT_WHT, "whiteout" }
+
/*
* Byte offset in data block and shortform entry.
*/
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 66a17910d021..c13276095cc0 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -819,16 +819,16 @@ xfs_defer_can_append(
/* Create a new pending item at the end of the given deferred ops list. */
static inline struct xfs_defer_pending *
xfs_defer_alloc(
- struct xfs_trans *tp,
+ struct list_head *dfops,
const struct xfs_defer_op_type *ops)
{
struct xfs_defer_pending *dfp;
dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOFAIL);
dfp->dfp_ops = ops;
INIT_LIST_HEAD(&dfp->dfp_work);
- list_add_tail(&dfp->dfp_list, &tp->t_dfops);
+ list_add_tail(&dfp->dfp_list, dfops);
return dfp;
}
@@ -846,7 +846,7 @@ xfs_defer_add(
dfp = xfs_defer_find_last(tp, ops);
if (!dfp || !xfs_defer_can_append(dfp, ops))
- dfp = xfs_defer_alloc(tp, ops);
+ dfp = xfs_defer_alloc(&tp->t_dfops, ops);
xfs_defer_add_item(dfp, li);
trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
@@ -870,7 +870,7 @@ xfs_defer_add_barrier(
if (dfp)
return;
- xfs_defer_alloc(tp, &xfs_barrier_defer_type);
+ xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type);
trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL);
}
@@ -885,14 +885,9 @@ xfs_defer_start_recovery(
struct list_head *r_dfops,
const struct xfs_defer_op_type *ops)
{
- struct xfs_defer_pending *dfp;
+ struct xfs_defer_pending *dfp = xfs_defer_alloc(r_dfops, ops);
- dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
- GFP_NOFS | __GFP_NOFAIL);
- dfp->dfp_ops = ops;
dfp->dfp_intent = lip;
- INIT_LIST_HEAD(&dfp->dfp_work);
- list_add_tail(&dfp->dfp_list, r_dfops);
}
/*
@@ -979,7 +974,7 @@ xfs_defer_ops_capture(
return ERR_PTR(error);
/* Create an object to capture the defer ops. */
- dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
+ dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&dfc->dfc_list);
INIT_LIST_HEAD(&dfc->dfc_dfops);
@@ -1011,7 +1006,7 @@ xfs_defer_ops_capture(
* transaction.
*/
for (i = 0; i < dfc->dfc_held.dr_inos; i++) {
- ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL);
ihold(VFS_I(dfc->dfc_held.dr_ip[i]));
}
@@ -1038,7 +1033,7 @@ xfs_defer_ops_capture_abort(
for (i = 0; i < dfc->dfc_held.dr_inos; i++)
xfs_irele(dfc->dfc_held.dr_ip[i]);
- kmem_free(dfc);
+ kfree(dfc);
}
/*
@@ -1114,7 +1109,7 @@ xfs_defer_ops_continue(
list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
tp->t_flags |= dfc->dfc_tpflags;
- kmem_free(dfc);
+ kfree(dfc);
}
/* Release the resources captured and continued during recovery. */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index a76673281514..4821519efad4 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -18,6 +18,7 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_health.h"
const struct xfs_name xfs_name_dotdot = {
.name = (const unsigned char *)"..",
@@ -25,6 +26,12 @@ const struct xfs_name xfs_name_dotdot = {
.type = XFS_DIR3_FT_DIR,
};
+const struct xfs_name xfs_name_dot = {
+ .name = (const unsigned char *)".",
+ .len = 1,
+ .type = XFS_DIR3_FT_DIR,
+};
+
/*
* Convert inode mode to directory entry filetype
*/
@@ -104,13 +111,13 @@ xfs_da_mount(
ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE);
- mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
- KM_MAYFAIL);
- mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
- KM_MAYFAIL);
+ mp->m_dir_geo = kzalloc(sizeof(struct xfs_da_geometry),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ mp->m_attr_geo = kzalloc(sizeof(struct xfs_da_geometry),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!mp->m_dir_geo || !mp->m_attr_geo) {
- kmem_free(mp->m_dir_geo);
- kmem_free(mp->m_attr_geo);
+ kfree(mp->m_dir_geo);
+ kfree(mp->m_attr_geo);
return -ENOMEM;
}
@@ -178,8 +185,8 @@ void
xfs_da_unmount(
struct xfs_mount *mp)
{
- kmem_free(mp->m_dir_geo);
- kmem_free(mp->m_attr_geo);
+ kfree(mp->m_dir_geo);
+ kfree(mp->m_attr_geo);
}
/*
@@ -236,7 +243,7 @@ xfs_dir_init(
if (error)
return error;
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
if (!args)
return -ENOMEM;
@@ -244,7 +251,7 @@ xfs_dir_init(
args->dp = dp;
args->trans = tp;
error = xfs_dir2_sf_create(args, pdp->i_ino);
- kmem_free(args);
+ kfree(args);
return error;
}
@@ -273,7 +280,7 @@ xfs_dir_createname(
XFS_STATS_INC(dp->i_mount, xs_dir_create);
}
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
if (!args)
return -ENOMEM;
@@ -313,7 +320,7 @@ xfs_dir_createname(
rval = xfs_dir2_node_addname(args);
out_free:
- kmem_free(args);
+ kfree(args);
return rval;
}
@@ -333,7 +340,8 @@ xfs_dir_cilookup_result(
!(args->op_flags & XFS_DA_OP_CILOOKUP))
return -EEXIST;
- args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
+ args->value = kmalloc(len,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL);
if (!args->value)
return -ENOMEM;
@@ -364,15 +372,8 @@ xfs_dir_lookup(
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
- /*
- * We need to use KM_NOFS here so that lockdep will not throw false
- * positive deadlock warnings on a non-transactional lookup path. It is
- * safe to recurse into inode recalim in that case, but lockdep can't
- * easily be taught about it. Hence KM_NOFS avoids having to add more
- * lockdep Doing this avoids having to add a bunch of lockdep class
- * annotations into the reclaim path for the ilock.
- */
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args),
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
args->geo = dp->i_mount->m_dir_geo;
args->name = name->name;
args->namelen = name->len;
@@ -419,7 +420,7 @@ out_check_rval:
}
out_free:
xfs_iunlock(dp, lock_mode);
- kmem_free(args);
+ kfree(args);
return rval;
}
@@ -441,7 +442,7 @@ xfs_dir_removename(
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_remove);
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
if (!args)
return -ENOMEM;
@@ -477,7 +478,7 @@ xfs_dir_removename(
else
rval = xfs_dir2_node_removename(args);
out_free:
- kmem_free(args);
+ kfree(args);
return rval;
}
@@ -502,7 +503,7 @@ xfs_dir_replace(
if (rval)
return rval;
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
if (!args)
return -ENOMEM;
@@ -538,7 +539,7 @@ xfs_dir_replace(
else
rval = xfs_dir2_node_replace(args);
out_free:
- kmem_free(args);
+ kfree(args);
return rval;
}
@@ -626,8 +627,10 @@ xfs_dir2_isblock(
return 0;
*isblock = true;
- if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize))
+ if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) {
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
+ }
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 19af22a16c41..8497d041f316 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -22,6 +22,19 @@ struct xfs_dir3_icfree_hdr;
struct xfs_dir3_icleaf_hdr;
extern const struct xfs_name xfs_name_dotdot;
+extern const struct xfs_name xfs_name_dot;
+
+static inline bool
+xfs_dir2_samename(
+ const struct xfs_name *n1,
+ const struct xfs_name *n2)
+{
+ if (n1 == n2)
+ return true;
+ if (n1->len != n2->len)
+ return false;
+ return !memcmp(n1->name, n2->name, n1->len);
+}
/*
* Convert inode mode to directory entry filetype
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 3c256d4cc40b..a2da007adb46 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -20,6 +20,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_health.h"
/*
* Local function prototypes.
@@ -152,6 +153,7 @@ xfs_dir3_block_read(
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
*bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
return -EFSCORRUPTED;
}
@@ -1108,7 +1110,7 @@ xfs_dir2_sf_to_block(
* Copy the directory into a temporary buffer.
* Then pitch the incore inode data so we can make extents.
*/
- sfp = kmem_alloc(ifp->if_bytes, 0);
+ sfp = kmalloc(ifp->if_bytes, GFP_KERNEL | __GFP_NOFAIL);
memcpy(sfp, oldsfp, ifp->if_bytes);
xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
@@ -1253,7 +1255,7 @@ xfs_dir2_sf_to_block(
sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
}
/* Done with the temporary buffer */
- kmem_free(sfp);
+ kfree(sfp);
/*
* Sort the leaf entries by hash value.
*/
@@ -1268,6 +1270,6 @@ xfs_dir2_sf_to_block(
xfs_dir3_data_check(dp, bp);
return 0;
out_free:
- kmem_free(sfp);
+ kfree(sfp);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index dbcf58979a59..7a6d965bea71 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -18,6 +18,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_health.h"
static xfs_failaddr_t xfs_dir2_data_freefind_verify(
struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
@@ -433,6 +434,7 @@ xfs_dir3_data_read(
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
*bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
return -EFSCORRUPTED;
}
@@ -1198,6 +1200,7 @@ xfs_dir2_data_use_free(
corrupt:
xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount,
hdr, sizeof(*hdr), __FILE__, __LINE__, fa);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index cb9e950a911d..08dda5ce9d91 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -19,6 +19,7 @@
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
+#include "xfs_health.h"
/*
* Local function declarations.
@@ -1393,8 +1394,10 @@ xfs_dir2_leaf_removename(
bestsp = xfs_dir2_leaf_bests_p(ltp);
if (be16_to_cpu(bestsp[db]) != oldbest) {
xfs_buf_mark_corrupt(lbp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
+
/*
* Mark the former data entry unused.
*/
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 7a03aeb9f4c9..be0b8834028c 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -20,6 +20,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_health.h"
/*
* Function declarations.
@@ -231,6 +232,7 @@ __xfs_dir3_free_read(
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
*bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
return -EFSCORRUPTED;
}
@@ -443,6 +445,7 @@ xfs_dir2_leaf_to_node(
if (be32_to_cpu(ltp->bestcount) >
(uint)dp->i_disk_size / args->geo->blksize) {
xfs_buf_mark_corrupt(lbp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -517,6 +520,7 @@ xfs_dir2_leafn_add(
*/
if (index < 0) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -736,6 +740,7 @@ xfs_dir2_leafn_lookup_for_addname(
cpu_to_be16(NULLDATAOFF))) {
if (curfdb != newfdb)
xfs_trans_brelse(tp, curbp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
curfdb = newfdb;
@@ -804,6 +809,7 @@ xfs_dir2_leafn_lookup_for_entry(
xfs_dir3_leaf_check(dp, bp);
if (leafhdr.count <= 0) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -1739,6 +1745,7 @@ xfs_dir2_node_add_datablk(
} else {
xfs_alert(mp, " ... fblk is NULL");
}
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index e1f83fc7b6ad..17a20384c8b7 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -276,7 +276,7 @@ xfs_dir2_block_to_sf(
* format the data into. Once we have formatted the data, we can free
* the block and copy the formatted data into the inode literal area.
*/
- sfp = kmem_alloc(mp->m_sb.sb_inodesize, 0);
+ sfp = kmalloc(mp->m_sb.sb_inodesize, GFP_KERNEL | __GFP_NOFAIL);
memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
/*
@@ -350,7 +350,7 @@ xfs_dir2_block_to_sf(
xfs_dir2_sf_check(args);
out:
xfs_trans_log_inode(args->trans, dp, logflags);
- kmem_free(sfp);
+ kfree(sfp);
return error;
}
@@ -524,7 +524,7 @@ xfs_dir2_sf_addname_hard(
* Copy the old directory to the stack buffer.
*/
old_isize = (int)dp->i_disk_size;
- buf = kmem_alloc(old_isize, 0);
+ buf = kmalloc(old_isize, GFP_KERNEL | __GFP_NOFAIL);
oldsfp = (xfs_dir2_sf_hdr_t *)buf;
memcpy(oldsfp, dp->i_df.if_data, old_isize);
/*
@@ -576,7 +576,7 @@ xfs_dir2_sf_addname_hard(
sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
memcpy(sfep, oldsfep, old_isize - nbytes);
}
- kmem_free(buf);
+ kfree(buf);
dp->i_disk_size = new_isize;
xfs_dir2_sf_check(args);
}
@@ -1151,7 +1151,7 @@ xfs_dir2_sf_toino4(
* Don't want xfs_idata_realloc copying the data here.
*/
oldsize = dp->i_df.if_bytes;
- buf = kmem_alloc(oldsize, 0);
+ buf = kmalloc(oldsize, GFP_KERNEL | __GFP_NOFAIL);
ASSERT(oldsfp->i8count == 1);
memcpy(buf, oldsfp, oldsize);
/*
@@ -1190,7 +1190,7 @@ xfs_dir2_sf_toino4(
/*
* Clean up the inode.
*/
- kmem_free(buf);
+ kfree(buf);
dp->i_disk_size = newsize;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
}
@@ -1223,7 +1223,7 @@ xfs_dir2_sf_toino8(
* Don't want xfs_idata_realloc copying the data here.
*/
oldsize = dp->i_df.if_bytes;
- buf = kmem_alloc(oldsize, 0);
+ buf = kmalloc(oldsize, GFP_KERNEL | __GFP_NOFAIL);
ASSERT(oldsfp->i8count == 0);
memcpy(buf, oldsfp, oldsize);
/*
@@ -1262,7 +1262,7 @@ xfs_dir2_sf_toino8(
/*
* Clean up the inode.
*/
- kmem_free(buf);
+ kfree(buf);
dp->i_disk_size = newsize;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
}
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 382ab1e71c0b..2b2f9050fbfb 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -477,15 +477,9 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
/*
- * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the
- * arrays below.
- */
-#define XFS_BTNUM_AGF ((int)XFS_BTNUM_RMAPi + 1)
-
-/*
- * The second word of agf_levels in the first a.g. overlaps the EFS
- * superblock's magic number. Since the magic numbers valid for EFS
- * are > 64k, our value cannot be confused for an EFS superblock's.
+ * agf_cnt_level in the first AGF overlaps the EFS superblock's magic number.
+ * Since the magic numbers valid for EFS are > 64k, our value cannot be confused
+ * for an EFS superblock.
*/
typedef struct xfs_agf {
@@ -499,8 +493,13 @@ typedef struct xfs_agf {
/*
* Freespace and rmap information
*/
- __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
- __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
+ __be32 agf_bno_root; /* bnobt root block */
+ __be32 agf_cnt_root; /* cntbt root block */
+ __be32 agf_rmap_root; /* rmapbt root block */
+
+ __be32 agf_bno_level; /* bnobt btree levels */
+ __be32 agf_cnt_level; /* cntbt btree levels */
+ __be32 agf_rmap_level; /* rmapbt btree levels */
__be32 agf_flfirst; /* first freelist block's index */
__be32 agf_fllast; /* last freelist block's index */
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 6360073865db..ca1b17d01437 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -195,6 +195,8 @@ struct xfs_fsop_geom {
#define XFS_FSOP_GEOM_SICK_PQUOTA (1 << 3) /* project quota */
#define XFS_FSOP_GEOM_SICK_RT_BITMAP (1 << 4) /* realtime bitmap */
#define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */
+#define XFS_FSOP_GEOM_SICK_QUOTACHECK (1 << 6) /* quota counts */
+#define XFS_FSOP_GEOM_SICK_NLINKS (1 << 7) /* inode link counts */
/* Output for XFS_FS_COUNTS */
typedef struct xfs_fsop_counts {
@@ -292,6 +294,7 @@ struct xfs_ag_geometry {
#define XFS_AG_GEOM_SICK_FINOBT (1 << 7) /* free inode index */
#define XFS_AG_GEOM_SICK_RMAPBT (1 << 8) /* reverse mappings */
#define XFS_AG_GEOM_SICK_REFCNTBT (1 << 9) /* reference counts */
+#define XFS_AG_GEOM_SICK_INODES (1 << 10) /* bad inodes were seen */
/*
* Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
@@ -709,9 +712,12 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */
#define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */
#define XFS_SCRUB_TYPE_FSCOUNTERS 24 /* fs summary counters */
+#define XFS_SCRUB_TYPE_QUOTACHECK 25 /* quota counters */
+#define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */
+#define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */
/* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR 25
+#define XFS_SCRUB_TYPE_NR 28
/* i: Repair this metadata. */
#define XFS_SCRUB_IFLAG_REPAIR (1u << 0)
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 6296993ff8f3..3c64b5f9bd68 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -26,21 +26,40 @@
* and the "sick" field tells us if that piece was found to need repairs.
* Therefore we can conclude that for a given sick flag value:
*
- * - checked && sick => metadata needs repair
- * - checked && !sick => metadata is ok
- * - !checked => has not been examined since mount
+ * - checked && sick => metadata needs repair
+ * - checked && !sick => metadata is ok
+ * - !checked && sick => errors have been observed during normal operation,
+ * but the metadata has not been checked thoroughly
+ * - !checked && !sick => has not been examined since mount
+ *
+ * Evidence of health problems can be sorted into three basic categories:
+ *
+ * a) Primary evidence, which signals that something is defective within the
+ * general grouping of metadata.
+ *
+ * b) Secondary evidence, which consists of side effects of a primary
+ * problem that are not themselves problems. These can be forgotten when
+ * the primary health problems are addressed.
+ *
+ * c) Indirect evidence, which points to something being wrong in another
+ * group, but we had to release resources and this is all that's left of
+ * that state.
*/
struct xfs_mount;
struct xfs_perag;
struct xfs_inode;
struct xfs_fsop_geom;
+struct xfs_btree_cur;
+struct xfs_da_args;
/* Observable health issues for metadata spanning the entire filesystem. */
#define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */
#define XFS_SICK_FS_UQUOTA (1 << 1) /* user quota */
#define XFS_SICK_FS_GQUOTA (1 << 2) /* group quota */
#define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */
+#define XFS_SICK_FS_QUOTACHECK (1 << 4) /* quota counts */
+#define XFS_SICK_FS_NLINKS (1 << 5) /* inode link counts */
/* Observable health issues for realtime volume metadata. */
#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */
@@ -57,6 +76,7 @@ struct xfs_fsop_geom;
#define XFS_SICK_AG_FINOBT (1 << 7) /* free inode index */
#define XFS_SICK_AG_RMAPBT (1 << 8) /* reverse mappings */
#define XFS_SICK_AG_REFCNTBT (1 << 9) /* reference counts */
+#define XFS_SICK_AG_INODES (1 << 10) /* inactivated bad inodes */
/* Observable health issues for inode metadata. */
#define XFS_SICK_INO_CORE (1 << 0) /* inode core */
@@ -73,11 +93,16 @@ struct xfs_fsop_geom;
#define XFS_SICK_INO_DIR_ZAPPED (1 << 10) /* directory erased */
#define XFS_SICK_INO_SYMLINK_ZAPPED (1 << 11) /* symlink erased */
+/* Don't propagate sick status to ag health summary during inactivation */
+#define XFS_SICK_INO_FORGET (1 << 12)
+
/* Primary evidence of health problems in a given group. */
#define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \
XFS_SICK_FS_UQUOTA | \
XFS_SICK_FS_GQUOTA | \
- XFS_SICK_FS_PQUOTA)
+ XFS_SICK_FS_PQUOTA | \
+ XFS_SICK_FS_QUOTACHECK | \
+ XFS_SICK_FS_NLINKS)
#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \
XFS_SICK_RT_SUMMARY)
@@ -107,29 +132,86 @@ struct xfs_fsop_geom;
XFS_SICK_INO_DIR_ZAPPED | \
XFS_SICK_INO_SYMLINK_ZAPPED)
-/* These functions must be provided by the xfs implementation. */
+/* Secondary state related to (but not primary evidence of) health problems. */
+#define XFS_SICK_FS_SECONDARY (0)
+#define XFS_SICK_RT_SECONDARY (0)
+#define XFS_SICK_AG_SECONDARY (0)
+#define XFS_SICK_INO_SECONDARY (XFS_SICK_INO_FORGET)
+
+/* Evidence of health problems elsewhere. */
+#define XFS_SICK_FS_INDIRECT (0)
+#define XFS_SICK_RT_INDIRECT (0)
+#define XFS_SICK_AG_INDIRECT (XFS_SICK_AG_INODES)
+#define XFS_SICK_INO_INDIRECT (0)
+
+/* All health masks. */
+#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \
+ XFS_SICK_FS_SECONDARY | \
+ XFS_SICK_FS_INDIRECT)
+
+#define XFS_SICK_RT_ALL (XFS_SICK_RT_PRIMARY | \
+ XFS_SICK_RT_SECONDARY | \
+ XFS_SICK_RT_INDIRECT)
+
+#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \
+ XFS_SICK_AG_SECONDARY | \
+ XFS_SICK_AG_INDIRECT)
+
+#define XFS_SICK_INO_ALL (XFS_SICK_INO_PRIMARY | \
+ XFS_SICK_INO_SECONDARY | \
+ XFS_SICK_INO_INDIRECT | \
+ XFS_SICK_INO_ZAPPED)
+
+/*
+ * These functions must be provided by the xfs implementation. Function
+ * behavior with respect to the first argument should be as follows:
+ *
+ * xfs_*_mark_sick: Set the sick flags and do not set checked flags.
+ * Runtime code should call this upon encountering
+ * a corruption.
+ *
+ * xfs_*_mark_corrupt: Set the sick and checked flags simultaneously.
+ * Fsck tools should call this when corruption is
+ * found.
+ *
+ * xfs_*_mark_healthy: Clear the sick flags and set the checked flags.
+ * Fsck tools should call this after correcting errors.
+ *
+ * xfs_*_measure_sickness: Return the sick and check status in the provided
+ * out parameters.
+ */
void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_mark_corrupt(struct xfs_mount *mp, unsigned int mask);
void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask);
void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
unsigned int *checked);
void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_mark_corrupt(struct xfs_mount *mp, unsigned int mask);
void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask);
void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
unsigned int *checked);
+void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno,
+ unsigned int mask);
void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_mark_corrupt(struct xfs_perag *pag, unsigned int mask);
void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask);
void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick,
unsigned int *checked);
void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_mark_corrupt(struct xfs_inode *ip, unsigned int mask);
void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask);
void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick,
unsigned int *checked);
void xfs_health_unmount(struct xfs_mount *mp);
+void xfs_bmap_mark_sick(struct xfs_inode *ip, int whichfork);
+void xfs_btree_mark_sick(struct xfs_btree_cur *cur);
+void xfs_dirattr_mark_sick(struct xfs_inode *ip, int whichfork);
+void xfs_da_mark_sick(struct xfs_da_args *args);
/* Now some helpers. */
@@ -197,4 +279,7 @@ void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo);
void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs);
+#define xfs_metadata_is_sick(error) \
+ (unlikely((error) == -EFSCORRUPTED || (error) == -EFSBADCRC))
+
#endif /* __XFS_HEALTH_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 2361a22035b0..e5ac3e5430c4 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -27,6 +27,7 @@
#include "xfs_log.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
/*
* Lookup a record by ino in the btree given by cur.
@@ -140,13 +141,13 @@ xfs_inobt_complain_bad_rec(
struct xfs_mount *mp = cur->bc_mp;
xfs_warn(mp,
- "%s Inode BTree record corruption in AG %d detected at %pS!",
- cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free",
- cur->bc_ag.pag->pag_agno, fa);
+ "%sbt record corruption in AG %d detected at %pS!",
+ cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
xfs_warn(mp,
"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
irec->ir_startino, irec->ir_count, irec->ir_freecount,
irec->ir_free, irec->ir_holemask);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -205,14 +206,17 @@ xfs_inobt_insert(
struct xfs_buf *agbp,
xfs_agino_t newino,
xfs_agino_t newlen,
- xfs_btnum_t btnum)
+ bool is_finobt)
{
struct xfs_btree_cur *cur;
xfs_agino_t thisino;
int i;
int error;
- cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum);
+ if (is_finobt)
+ cur = xfs_finobt_init_cursor(pag, tp, agbp);
+ else
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
for (thisino = newino;
thisino < newino + newlen;
@@ -528,16 +532,14 @@ __xfs_inobt_rec_merge(
}
/*
- * Insert a new sparse inode chunk into the associated inode btree. The inode
- * record for the sparse chunk is pre-aligned to a startino that should match
- * any pre-existing sparse inode record in the tree. This allows sparse chunks
- * to fill over time.
+ * Insert a new sparse inode chunk into the associated inode allocation btree.
+ * The inode record for the sparse chunk is pre-aligned to a startino that
+ * should match any pre-existing sparse inode record in the tree. This allows
+ * sparse chunks to fill over time.
*
- * This function supports two modes of handling preexisting records depending on
- * the merge flag. If merge is true, the provided record is merged with the
+ * If no preexisting record exists, the provided record is inserted.
+ * If there is a preexisting record, the provided record is merged with the
* existing record and updated in place. The merged record is returned in nrec.
- * If merge is false, an existing record is replaced with the provided record.
- * If no preexisting record exists, the provided record is always inserted.
*
* It is considered corruption if a merge is requested and not possible. Given
* the sparse inode alignment constraints, this should never happen.
@@ -547,9 +549,7 @@ xfs_inobt_insert_sprec(
struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agbp,
- int btnum,
- struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
- bool merge) /* merge or replace */
+ struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. */
{
struct xfs_mount *mp = pag->pag_mount;
struct xfs_btree_cur *cur;
@@ -557,7 +557,7 @@ xfs_inobt_insert_sprec(
int i;
struct xfs_inobt_rec_incore rec;
- cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
/* the new record is pre-aligned so we know where to look */
error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
@@ -571,6 +571,7 @@ xfs_inobt_insert_sprec(
if (error)
goto error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error;
}
@@ -579,45 +580,45 @@ xfs_inobt_insert_sprec(
}
/*
- * A record exists at this startino. Merge or replace the record
- * depending on what we've been asked to do.
+ * A record exists at this startino. Merge the records.
*/
- if (merge) {
- error = xfs_inobt_get_rec(cur, &rec, &i);
- if (error)
- goto error;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto error;
- }
- if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
- error = -EFSCORRUPTED;
- goto error;
- }
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
+ xfs_btree_mark_sick(cur);
+ error = -EFSCORRUPTED;
+ goto error;
+ }
- /*
- * This should never fail. If we have coexisting records that
- * cannot merge, something is seriously wrong.
- */
- if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
- error = -EFSCORRUPTED;
- goto error;
- }
+ /*
+ * This should never fail. If we have coexisting records that
+ * cannot merge, something is seriously wrong.
+ */
+ if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
+ xfs_btree_mark_sick(cur);
+ error = -EFSCORRUPTED;
+ goto error;
+ }
- trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
- rec.ir_holemask, nrec->ir_startino,
- nrec->ir_holemask);
+ trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
+ rec.ir_holemask, nrec->ir_startino,
+ nrec->ir_holemask);
- /* merge to nrec to output the updated record */
- __xfs_inobt_rec_merge(nrec, &rec);
+ /* merge to nrec to output the updated record */
+ __xfs_inobt_rec_merge(nrec, &rec);
- trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
- nrec->ir_holemask);
+ trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
+ nrec->ir_holemask);
- error = xfs_inobt_rec_check_count(mp, nrec);
- if (error)
- goto error;
- }
+ error = xfs_inobt_rec_check_count(mp, nrec);
+ if (error)
+ goto error;
error = xfs_inobt_update(cur, nrec);
if (error)
@@ -632,6 +633,59 @@ error:
}
/*
+ * Insert a new sparse inode chunk into the free inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * The new record is always inserted, overwriting a pre-existing record if
+ * there is one.
+ */
+STATIC int
+xfs_finobt_insert_sprec(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_inobt_rec_incore *nrec) /* in/out: new rec. */
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_btree_cur *cur;
+ int error;
+ int i;
+
+ cur = xfs_finobt_init_cursor(pag, tp, agbp);
+
+ /* the new record is pre-aligned so we know where to look */
+ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ /* if nothing there, insert a new record and return */
+ if (i == 0) {
+ error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+ nrec->ir_count, nrec->ir_freecount,
+ nrec->ir_free, &i);
+ if (error)
+ goto error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ } else {
+ error = xfs_inobt_update(cur, nrec);
+ if (error)
+ goto error;
+ }
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+
+/*
* Allocate new inodes in the allocation group specified by agbp. Returns 0 if
* inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
* the caller knows it can try another AG, a hard -ENOSPC when over the maximum
@@ -857,8 +911,7 @@ sparse_alloc:
* if necessary. If a merge does occur, rec is updated to the
* merged record.
*/
- error = xfs_inobt_insert_sprec(pag, tp, agbp,
- XFS_BTNUM_INO, &rec, true);
+ error = xfs_inobt_insert_sprec(pag, tp, agbp, &rec);
if (error == -EFSCORRUPTED) {
xfs_alert(args.mp,
"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
@@ -882,21 +935,19 @@ sparse_alloc:
* existing record with this one.
*/
if (xfs_has_finobt(args.mp)) {
- error = xfs_inobt_insert_sprec(pag, tp, agbp,
- XFS_BTNUM_FINO, &rec, false);
+ error = xfs_finobt_insert_sprec(pag, tp, agbp, &rec);
if (error)
return error;
}
} else {
/* full chunk - insert new records to both btrees */
- error = xfs_inobt_insert(pag, tp, agbp, newino, newlen,
- XFS_BTNUM_INO);
+ error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, false);
if (error)
return error;
if (xfs_has_finobt(args.mp)) {
error = xfs_inobt_insert(pag, tp, agbp, newino,
- newlen, XFS_BTNUM_FINO);
+ newlen, true);
if (error)
return error;
}
@@ -949,8 +1000,10 @@ xfs_ialloc_next_rec(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
}
return 0;
@@ -974,8 +1027,10 @@ xfs_ialloc_get_rec(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
}
return 0;
@@ -1030,7 +1085,7 @@ xfs_dialloc_ag_inobt(
ASSERT(pag->pagi_freecount > 0);
restart_pagno:
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
* This must work because we've just allocated some.
@@ -1053,6 +1108,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1061,6 +1117,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, j != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1219,6 +1276,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1228,6 +1286,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1237,6 +1296,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1297,8 +1357,10 @@ xfs_dialloc_ag_finobt_near(
error = xfs_inobt_get_rec(lcur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(lcur);
return -EFSCORRUPTED;
+ }
/*
* See if we've landed in the parent inode record. The finobt
@@ -1322,12 +1384,14 @@ xfs_dialloc_ag_finobt_near(
if (error)
goto error_rcur;
if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) {
+ xfs_btree_mark_sick(lcur);
error = -EFSCORRUPTED;
goto error_rcur;
}
}
if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) {
+ xfs_btree_mark_sick(lcur);
error = -EFSCORRUPTED;
goto error_rcur;
}
@@ -1383,8 +1447,10 @@ xfs_dialloc_ag_finobt_newino(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
return 0;
}
}
@@ -1395,14 +1461,18 @@ xfs_dialloc_ag_finobt_newino(
error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
return 0;
}
@@ -1424,14 +1494,18 @@ xfs_dialloc_ag_update_inobt(
error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
@@ -1440,8 +1514,10 @@ xfs_dialloc_ag_update_inobt(
if (XFS_IS_CORRUPT(cur->bc_mp,
rec.ir_free != frec->ir_free ||
- rec.ir_freecount != frec->ir_freecount))
+ rec.ir_freecount != frec->ir_freecount)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
return xfs_inobt_update(cur, &rec);
}
@@ -1483,7 +1559,7 @@ xfs_dialloc_ag(
if (!pagino)
pagino = be32_to_cpu(agi->agi_newino);
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO);
+ cur = xfs_finobt_init_cursor(pag, tp, agbp);
error = xfs_check_agi_freecount(cur);
if (error)
@@ -1526,7 +1602,7 @@ xfs_dialloc_ag(
* the original freecount. If all is well, make the equivalent update to
* the inobt using the finobt record and offset information.
*/
- icur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+ icur = xfs_inobt_init_cursor(pag, tp, agbp);
error = xfs_check_agi_freecount(icur);
if (error)
@@ -1943,7 +2019,7 @@ xfs_difree_inobt(
/*
* Initialize the cursor.
*/
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
error = xfs_check_agi_freecount(cur);
if (error)
@@ -1958,6 +2034,7 @@ xfs_difree_inobt(
goto error0;
}
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1968,6 +2045,7 @@ xfs_difree_inobt(
goto error0;
}
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2068,7 +2146,7 @@ xfs_difree_finobt(
int error;
int i;
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO);
+ cur = xfs_finobt_init_cursor(pag, tp, agbp);
error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
if (error)
@@ -2080,6 +2158,7 @@ xfs_difree_finobt(
* something is out of sync.
*/
if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error;
}
@@ -2106,6 +2185,7 @@ xfs_difree_finobt(
if (error)
goto error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error;
}
@@ -2116,6 +2196,7 @@ xfs_difree_finobt(
if (XFS_IS_CORRUPT(mp,
rec.ir_free != ibtrec->ir_free ||
rec.ir_freecount != ibtrec->ir_freecount)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error;
}
@@ -2265,7 +2346,7 @@ xfs_imap_lookup(
* we have a record, we need to ensure it contains the inode number
* we are looking up.
*/
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
if (!error) {
if (i)
@@ -2604,6 +2685,8 @@ xfs_read_agi(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
if (error)
return error;
if (tp)
@@ -2765,7 +2848,7 @@ xfs_ialloc_count_inodes(
struct xfs_ialloc_count_inodes ci = {0};
int error;
- ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
+ ASSERT(xfs_btree_is_ino(cur->bc_ops));
error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
if (error)
return error;
@@ -2982,7 +3065,7 @@ xfs_ialloc_check_shrink(
if (!xfs_has_sparseinodes(pag->pag_mount))
return 0;
- cur = xfs_inobt_init_cursor(pag, tp, agibp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agibp);
/* Look up the inobt record that would correspond to the new EOFS. */
agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length);
@@ -2995,6 +3078,7 @@ xfs_ialloc_check_shrink(
goto out;
if (!has) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT);
error = -EFSCORRUPTED;
goto out;
}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 42a5e1f227a0..cc661fca6ff5 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -17,6 +17,7 @@
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
+#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_rmap.h"
@@ -37,7 +38,15 @@ xfs_inobt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_btnum);
+ cur->bc_ag.agbp);
+}
+
+STATIC struct xfs_btree_cur *
+xfs_finobt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_finobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
+ cur->bc_ag.agbp);
}
STATIC void
@@ -81,9 +90,9 @@ xfs_inobt_mod_blockcount(
if (!xfs_has_inobtcounts(cur->bc_mp))
return;
- if (cur->bc_btnum == XFS_BTNUM_FINO)
+ if (xfs_btree_is_fino(cur->bc_ops))
be32_add_cpu(&agi->agi_fblocks, howmuch);
- else if (cur->bc_btnum == XFS_BTNUM_INO)
+ else
be32_add_cpu(&agi->agi_iblocks, howmuch);
xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS);
}
@@ -300,7 +309,7 @@ xfs_inobt_verify(
* xfs_perag_initialised_agi(pag)) if we ever do.
*/
if (xfs_has_crc(mp)) {
- fa = xfs_btree_sblock_v5hdr_verify(bp);
+ fa = xfs_btree_agblock_v5hdr_verify(bp);
if (fa)
return fa;
}
@@ -310,7 +319,7 @@ xfs_inobt_verify(
if (level >= M_IGEO(mp)->inobt_maxlevels)
return __this_address;
- return xfs_btree_sblock_verify(bp,
+ return xfs_btree_agblock_verify(bp,
M_IGEO(mp)->inobt_mxr[level != 0]);
}
@@ -320,7 +329,7 @@ xfs_inobt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_sblock_verify_crc(bp))
+ if (!xfs_btree_agblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_inobt_verify(bp);
@@ -344,7 +353,7 @@ xfs_inobt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_sblock_calc_crc(bp);
+ xfs_btree_agblock_calc_crc(bp);
}
@@ -398,9 +407,17 @@ xfs_inobt_keys_contiguous(
be32_to_cpu(key2->inobt.ir_startino));
}
-static const struct xfs_btree_ops xfs_inobt_ops = {
+const struct xfs_btree_ops xfs_inobt_ops = {
+ .name = "ino",
+ .type = XFS_BTREE_TYPE_AG,
+
.rec_len = sizeof(xfs_inobt_rec_t),
.key_len = sizeof(xfs_inobt_key_t),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
+
+ .lru_refs = XFS_INO_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_ibt_2),
+ .sick_mask = XFS_SICK_AG_INOBT,
.dup_cursor = xfs_inobt_dup_cursor,
.set_root = xfs_inobt_set_root,
@@ -420,11 +437,19 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
.keys_contiguous = xfs_inobt_keys_contiguous,
};
-static const struct xfs_btree_ops xfs_finobt_ops = {
+const struct xfs_btree_ops xfs_finobt_ops = {
+ .name = "fino",
+ .type = XFS_BTREE_TYPE_AG,
+
.rec_len = sizeof(xfs_inobt_rec_t),
.key_len = sizeof(xfs_inobt_key_t),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
- .dup_cursor = xfs_inobt_dup_cursor,
+ .lru_refs = XFS_INO_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_fibt_2),
+ .sick_mask = XFS_SICK_AG_FINOBT,
+
+ .dup_cursor = xfs_finobt_dup_cursor,
.set_root = xfs_finobt_set_root,
.alloc_block = xfs_finobt_alloc_block,
.free_block = xfs_finobt_free_block,
@@ -443,65 +468,54 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
};
/*
- * Initialize a new inode btree cursor.
+ * Create an inode btree cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
*/
-static struct xfs_btree_cur *
-xfs_inobt_init_common(
+struct xfs_btree_cur *
+xfs_inobt_init_cursor(
struct xfs_perag *pag,
- struct xfs_trans *tp, /* transaction pointer */
- xfs_btnum_t btnum) /* ialloc or free ino btree */
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
{
struct xfs_mount *mp = pag->pag_mount;
struct xfs_btree_cur *cur;
- cur = xfs_btree_alloc_cursor(mp, tp, btnum,
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_inobt_ops,
M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
- if (btnum == XFS_BTNUM_INO) {
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
- cur->bc_ops = &xfs_inobt_ops;
- } else {
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2);
- cur->bc_ops = &xfs_finobt_ops;
- }
-
- if (xfs_has_crc(mp))
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_ag.agbp = agbp;
+ if (agbp) {
+ struct xfs_agi *agi = agbp->b_addr;
+
+ cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+ }
return cur;
}
-/* Create an inode btree cursor. */
+/*
+ * Create a free inode btree cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
+ */
struct xfs_btree_cur *
-xfs_inobt_init_cursor(
+xfs_finobt_init_cursor(
struct xfs_perag *pag,
struct xfs_trans *tp,
- struct xfs_buf *agbp,
- xfs_btnum_t btnum)
+ struct xfs_buf *agbp)
{
+ struct xfs_mount *mp = pag->pag_mount;
struct xfs_btree_cur *cur;
- struct xfs_agi *agi = agbp->b_addr;
- cur = xfs_inobt_init_common(pag, tp, btnum);
- if (btnum == XFS_BTNUM_INO)
- cur->bc_nlevels = be32_to_cpu(agi->agi_level);
- else
- cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_finobt_ops,
+ M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
+ cur->bc_ag.pag = xfs_perag_hold(pag);
cur->bc_ag.agbp = agbp;
- return cur;
-}
+ if (agbp) {
+ struct xfs_agi *agi = agbp->b_addr;
-/* Create an inode btree cursor with a fake root for staging. */
-struct xfs_btree_cur *
-xfs_inobt_stage_cursor(
- struct xfs_perag *pag,
- struct xbtree_afakeroot *afake,
- xfs_btnum_t btnum)
-{
- struct xfs_btree_cur *cur;
-
- cur = xfs_inobt_init_common(pag, NULL, btnum);
- xfs_btree_stage_afakeroot(cur, afake);
+ cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ }
return cur;
}
@@ -521,7 +535,7 @@ xfs_inobt_commit_staged_btree(
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
- if (cur->bc_btnum == XFS_BTNUM_INO) {
+ if (xfs_btree_is_ino(cur->bc_ops)) {
fields = XFS_AGI_ROOT | XFS_AGI_LEVEL;
agi->agi_root = cpu_to_be32(afake->af_root);
agi->agi_level = cpu_to_be32(afake->af_levels);
@@ -530,7 +544,7 @@ xfs_inobt_commit_staged_btree(
fields |= XFS_AGI_IBLOCKS;
}
xfs_ialloc_log_agi(tp, agbp, fields);
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops);
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
} else {
fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL;
agi->agi_free_root = cpu_to_be32(afake->af_root);
@@ -540,7 +554,7 @@ xfs_inobt_commit_staged_btree(
fields |= XFS_AGI_IBLOCKS;
}
xfs_ialloc_log_agi(tp, agbp, fields);
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops);
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
}
}
@@ -721,45 +735,21 @@ xfs_inobt_max_size(
XFS_INODES_PER_CHUNK);
}
-/* Read AGI and create inobt cursor. */
-int
-xfs_inobt_cur(
- struct xfs_perag *pag,
- struct xfs_trans *tp,
- xfs_btnum_t which,
- struct xfs_btree_cur **curpp,
- struct xfs_buf **agi_bpp)
-{
- struct xfs_btree_cur *cur;
- int error;
-
- ASSERT(*agi_bpp == NULL);
- ASSERT(*curpp == NULL);
-
- error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
- if (error)
- return error;
-
- cur = xfs_inobt_init_cursor(pag, tp, *agi_bpp, which);
- *curpp = cur;
- return 0;
-}
-
static int
-xfs_inobt_count_blocks(
+xfs_finobt_count_blocks(
struct xfs_perag *pag,
struct xfs_trans *tp,
- xfs_btnum_t btnum,
xfs_extlen_t *tree_blocks)
{
struct xfs_buf *agbp = NULL;
- struct xfs_btree_cur *cur = NULL;
+ struct xfs_btree_cur *cur;
int error;
- error = xfs_inobt_cur(pag, tp, btnum, &cur, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
if (error)
return error;
+ cur = xfs_finobt_init_cursor(pag, tp, agbp);
error = xfs_btree_count_blocks(cur, tree_blocks);
xfs_btree_del_cursor(cur, error);
xfs_trans_brelse(tp, agbp);
@@ -807,8 +797,7 @@ xfs_finobt_calc_reserves(
if (xfs_has_inobtcounts(pag->pag_mount))
error = xfs_finobt_read_blocks(pag, tp, &tree_len);
else
- error = xfs_inobt_count_blocks(pag, tp, XFS_BTNUM_FINO,
- &tree_len);
+ error = xfs_finobt_count_blocks(pag, tp, &tree_len);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 3262c3fe5ebe..6472ec1ecbb4 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -46,10 +46,10 @@ struct xfs_perag;
(maxrecs) * sizeof(xfs_inobt_key_t) + \
((index) - 1) * sizeof(xfs_inobt_ptr_t)))
-extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag,
- struct xfs_trans *tp, struct xfs_buf *agbp, xfs_btnum_t btnum);
-struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag,
- struct xbtree_afakeroot *afake, xfs_btnum_t btnum);
+struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+struct xfs_btree_cur *xfs_finobt_init_cursor(struct xfs_perag *pag,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
/* ir_holemask to inode allocation bitmap conversion */
@@ -66,9 +66,6 @@ int xfs_finobt_calc_reserves(struct xfs_perag *perag, struct xfs_trans *tp,
xfs_extlen_t *ask, xfs_extlen_t *used);
extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
-int xfs_inobt_cur(struct xfs_perag *pag, struct xfs_trans *tp,
- xfs_btnum_t btnum, struct xfs_btree_cur **curpp,
- struct xfs_buf **agi_bpp);
void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index f4e6b200cdf8..8796f2b3e534 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -394,11 +394,18 @@ xfs_iext_leaf_key(
return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK;
}
+static inline void *
+xfs_iext_alloc_node(
+ int size)
+{
+ return kzalloc(size, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+}
+
static void
xfs_iext_grow(
struct xfs_ifork *ifp)
{
- struct xfs_iext_node *node = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ struct xfs_iext_node *node = xfs_iext_alloc_node(NODE_SIZE);
int i;
if (ifp->if_height == 1) {
@@ -454,7 +461,7 @@ xfs_iext_split_node(
int *nr_entries)
{
struct xfs_iext_node *node = *nodep;
- struct xfs_iext_node *new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ struct xfs_iext_node *new = xfs_iext_alloc_node(NODE_SIZE);
const int nr_move = KEYS_PER_NODE / 2;
int nr_keep = nr_move + (KEYS_PER_NODE & 1);
int i = 0;
@@ -542,7 +549,7 @@ xfs_iext_split_leaf(
int *nr_entries)
{
struct xfs_iext_leaf *leaf = cur->leaf;
- struct xfs_iext_leaf *new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ struct xfs_iext_leaf *new = xfs_iext_alloc_node(NODE_SIZE);
const int nr_move = RECS_PER_LEAF / 2;
int nr_keep = nr_move + (RECS_PER_LEAF & 1);
int i;
@@ -583,7 +590,7 @@ xfs_iext_alloc_root(
{
ASSERT(ifp->if_bytes == 0);
- ifp->if_data = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
+ ifp->if_data = xfs_iext_alloc_node(sizeof(struct xfs_iext_rec));
ifp->if_height = 1;
/* now that we have a node step into it */
@@ -603,7 +610,8 @@ xfs_iext_realloc_root(
if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
new_size = NODE_SIZE;
- new = krealloc(ifp->if_data, new_size, GFP_NOFS | __GFP_NOFAIL);
+ new = krealloc(ifp->if_data, new_size,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
ifp->if_data = new;
cur->leaf = new;
@@ -743,7 +751,7 @@ xfs_iext_remove_node(
again:
ASSERT(node->ptrs[pos]);
ASSERT(node->ptrs[pos] == victim);
- kmem_free(victim);
+ kfree(victim);
nr_entries = xfs_iext_node_nr_entries(node, pos) - 1;
offset = node->keys[0];
@@ -789,7 +797,7 @@ again:
ASSERT(node == ifp->if_data);
ifp->if_data = node->ptrs[0];
ifp->if_height--;
- kmem_free(node);
+ kfree(node);
}
}
@@ -863,7 +871,7 @@ xfs_iext_free_last_leaf(
struct xfs_ifork *ifp)
{
ifp->if_height--;
- kmem_free(ifp->if_data);
+ kfree(ifp->if_data);
ifp->if_data = NULL;
}
@@ -1044,7 +1052,7 @@ xfs_iext_destroy_node(
}
}
- kmem_free(node);
+ kfree(node);
}
void
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 137a65bda95d..d0dcce462bf4 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -18,6 +18,7 @@
#include "xfs_trans.h"
#include "xfs_ialloc.h"
#include "xfs_dir2.h"
+#include "xfs_health.h"
#include <linux/iversion.h>
@@ -132,9 +133,14 @@ xfs_imap_to_bp(
struct xfs_imap *imap,
struct xfs_buf **bpp)
{
- return xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
- imap->im_len, XBF_UNMAPPED, bpp,
- &xfs_inode_buf_ops);
+ int error;
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+ imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno),
+ XFS_SICK_AG_INODES);
+ return error;
}
static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts)
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index f4569e18a8d0..7d660a973909 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -25,6 +25,8 @@
#include "xfs_attr_leaf.h"
#include "xfs_types.h"
#include "xfs_errortag.h"
+#include "xfs_health.h"
+#include "xfs_symlink_remote.h"
struct kmem_cache *xfs_ifork_cache;
@@ -50,7 +52,8 @@ xfs_init_local_fork(
mem_size++;
if (size) {
- char *new_data = kmem_alloc(mem_size, KM_NOFS);
+ char *new_data = kmalloc(mem_size,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
memcpy(new_data, data, size);
if (zero_terminate)
@@ -77,7 +80,7 @@ xfs_iformat_local(
/*
* If the size is unreasonable, then something
* is wrong and we just bail out rather than crash in
- * kmem_alloc() or memcpy() below.
+ * kmalloc() or memcpy() below.
*/
if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
xfs_warn(ip->i_mount,
@@ -87,6 +90,7 @@ xfs_iformat_local(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_local", dip, sizeof(*dip),
__this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
@@ -116,7 +120,7 @@ xfs_iformat_extents(
/*
* If the number of extents is unreasonable, then something is wrong and
- * we just bail out rather than crash in kmem_alloc() or memcpy() below.
+ * we just bail out rather than crash in kmalloc() or memcpy() below.
*/
if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).",
@@ -124,6 +128,7 @@ xfs_iformat_extents(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_extents(1)", dip, sizeof(*dip),
__this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
@@ -143,6 +148,7 @@ xfs_iformat_extents(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_extents(2)",
dp, sizeof(*dp), fa);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return xfs_bmap_complain_bad_rec(ip, whichfork,
fa, &new);
}
@@ -201,11 +207,13 @@ xfs_iformat_btree(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_btree", dfp, size,
__this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
ifp->if_broot_bytes = size;
- ifp->if_broot = kmem_alloc(size, KM_NOFS);
+ ifp->if_broot = kmalloc(size,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
ASSERT(ifp->if_broot != NULL);
/*
* Copy and convert from the on-disk structure
@@ -265,12 +273,14 @@ xfs_iformat_data_fork(
default:
xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
dip, sizeof(*dip), __this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
break;
default:
xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
sizeof(*dip), __this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
}
@@ -342,6 +352,7 @@ xfs_iformat_attr_fork(
default:
xfs_inode_verifier_error(ip, error, __func__, dip,
sizeof(*dip), __this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
error = -EFSCORRUPTED;
break;
}
@@ -399,7 +410,8 @@ xfs_iroot_realloc(
*/
if (ifp->if_broot_bytes == 0) {
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
- ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
+ ifp->if_broot = kmalloc(new_size,
+ GFP_KERNEL | __GFP_NOFAIL);
ifp->if_broot_bytes = (int)new_size;
return;
}
@@ -414,7 +426,7 @@ xfs_iroot_realloc(
new_max = cur_max + rec_diff;
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
ifp->if_broot = krealloc(ifp->if_broot, new_size,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOFAIL);
op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -440,7 +452,7 @@ xfs_iroot_realloc(
else
new_size = 0;
if (new_size > 0) {
- new_broot = kmem_alloc(new_size, KM_NOFS);
+ new_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
/*
* First copy over the btree block header.
*/
@@ -470,7 +482,7 @@ xfs_iroot_realloc(
(int)new_size);
memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
}
- kmem_free(ifp->if_broot);
+ kfree(ifp->if_broot);
ifp->if_broot = new_broot;
ifp->if_broot_bytes = (int)new_size;
if (ifp->if_broot)
@@ -488,7 +500,7 @@ xfs_iroot_realloc(
*
* If the amount of space needed has decreased below the size of the
* inline buffer, then switch to using the inline buffer. Otherwise,
- * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * use krealloc() or kmalloc() to adjust the size of the buffer
* to what is needed.
*
* ip -- the inode whose if_data area is changing
@@ -509,7 +521,7 @@ xfs_idata_realloc(
if (byte_diff) {
ifp->if_data = krealloc(ifp->if_data, new_size,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOFAIL);
if (new_size == 0)
ifp->if_data = NULL;
ifp->if_bytes = new_size;
@@ -524,13 +536,13 @@ xfs_idestroy_fork(
struct xfs_ifork *ifp)
{
if (ifp->if_broot != NULL) {
- kmem_free(ifp->if_broot);
+ kfree(ifp->if_broot);
ifp->if_broot = NULL;
}
switch (ifp->if_format) {
case XFS_DINODE_FMT_LOCAL:
- kmem_free(ifp->if_data);
+ kfree(ifp->if_data);
ifp->if_data = NULL;
break;
case XFS_DINODE_FMT_EXTENTS:
@@ -562,7 +574,7 @@ xfs_iextents_copy(
struct xfs_bmbt_irec rec;
int64_t copied = 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
ASSERT(ifp->if_bytes > 0);
for_each_xfs_iext(ifp, &icur, &rec) {
@@ -689,7 +701,7 @@ xfs_ifork_init_cow(
return;
ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS;
}
@@ -802,3 +814,12 @@ xfs_iext_count_upgrade(
return 0;
}
+
+/* Decide if a file mapping is on the realtime device or not. */
+bool
+xfs_ifork_is_realtime(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ return XFS_IS_REALTIME_INODE(ip) && whichfork != XFS_ATTR_FORK;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 96303249d28a..bd53eb951b65 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -260,6 +260,7 @@ int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
int nr_to_add);
int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
uint nr_to_add);
+bool xfs_ifork_is_realtime(struct xfs_inode *ip, int whichfork);
/* returns true if the fork has extents but they are not read in yet. */
static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp)
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 269573c82808..16872972e1e9 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -838,10 +838,12 @@ struct xfs_cud_log_format {
#define XFS_BMAP_EXTENT_ATTR_FORK (1U << 31)
#define XFS_BMAP_EXTENT_UNWRITTEN (1U << 30)
+#define XFS_BMAP_EXTENT_REALTIME (1U << 29)
#define XFS_BMAP_EXTENT_FLAGS (XFS_BMAP_EXTENT_TYPE_MASK | \
XFS_BMAP_EXTENT_ATTR_FORK | \
- XFS_BMAP_EXTENT_UNWRITTEN)
+ XFS_BMAP_EXTENT_UNWRITTEN | \
+ XFS_BMAP_EXTENT_REALTIME)
/*
* This is the structure used to lay out an bui log item in the
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 6709a7f8bad5..511c912d515c 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -23,6 +23,7 @@
#include "xfs_refcount.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
struct kmem_cache *xfs_refcount_intent_cache;
@@ -156,6 +157,7 @@ xfs_refcount_complain_bad_rec(
xfs_warn(mp,
"Start block 0x%x, block count 0x%x, references 0x%x",
irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -238,6 +240,7 @@ xfs_refcount_insert(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -268,12 +271,14 @@ xfs_refcount_delete(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec);
error = xfs_btree_delete(cur, i);
if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -398,6 +403,7 @@ xfs_refcount_split_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -425,6 +431,7 @@ xfs_refcount_split_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -470,6 +477,7 @@ xfs_refcount_merge_center_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -478,6 +486,7 @@ xfs_refcount_merge_center_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -487,6 +496,7 @@ xfs_refcount_merge_center_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -498,6 +508,7 @@ xfs_refcount_merge_center_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -542,6 +553,7 @@ xfs_refcount_merge_left_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -550,6 +562,7 @@ xfs_refcount_merge_left_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -561,6 +574,7 @@ xfs_refcount_merge_left_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -608,6 +622,7 @@ xfs_refcount_merge_right_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -616,6 +631,7 @@ xfs_refcount_merge_right_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -627,6 +643,7 @@ xfs_refcount_merge_right_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -674,6 +691,7 @@ xfs_refcount_find_left_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -693,6 +711,7 @@ xfs_refcount_find_left_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -767,6 +786,7 @@ xfs_refcount_find_right_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -786,6 +806,7 @@ xfs_refcount_find_right_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1056,7 +1077,7 @@ xfs_refcount_still_have_space(
* to handle each of the shape changes to the refcount btree.
*/
overhead = xfs_allocfree_block_count(cur->bc_mp,
- cur->bc_ag.refc.shape_changes);
+ cur->bc_refc.shape_changes);
overhead += cur->bc_mp->m_refc_maxlevels;
overhead *= cur->bc_mp->m_sb.sb_blocksize;
@@ -1064,17 +1085,17 @@ xfs_refcount_still_have_space(
* Only allow 2 refcount extent updates per transaction if the
* refcount continue update "error" has been injected.
*/
- if (cur->bc_ag.refc.nr_ops > 2 &&
+ if (cur->bc_refc.nr_ops > 2 &&
XFS_TEST_ERROR(false, cur->bc_mp,
XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
return false;
- if (cur->bc_ag.refc.nr_ops == 0)
+ if (cur->bc_refc.nr_ops == 0)
return true;
else if (overhead > cur->bc_tp->t_log_res)
return false;
- return cur->bc_tp->t_log_res - overhead >
- cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
+ return cur->bc_tp->t_log_res - overhead >
+ cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
}
/*
@@ -1134,7 +1155,7 @@ xfs_refcount_adjust_extents(
* Either cover the hole (increment) or
* delete the range (decrement).
*/
- cur->bc_ag.refc.nr_ops++;
+ cur->bc_refc.nr_ops++;
if (tmp.rc_refcount) {
error = xfs_refcount_insert(cur, &tmp,
&found_tmp);
@@ -1142,6 +1163,7 @@ xfs_refcount_adjust_extents(
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp,
found_tmp != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1180,6 +1202,7 @@ xfs_refcount_adjust_extents(
*/
if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) ||
XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1193,7 +1216,7 @@ xfs_refcount_adjust_extents(
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, &ext);
- cur->bc_ag.refc.nr_ops++;
+ cur->bc_refc.nr_ops++;
if (ext.rc_refcount > 1) {
error = xfs_refcount_update(cur, &ext);
if (error)
@@ -1203,6 +1226,7 @@ xfs_refcount_adjust_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1281,7 +1305,7 @@ xfs_refcount_adjust(
if (shape_changed)
shape_changes++;
if (shape_changes)
- cur->bc_ag.refc.shape_changes++;
+ cur->bc_refc.shape_changes++;
/* Now that we've taken care of the ends, adjust the middle extents */
error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj);
@@ -1327,8 +1351,10 @@ xfs_refcount_continue_op(
struct xfs_perag *pag = cur->bc_ag.pag;
if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno,
- ri->ri_blockcount)))
+ ri->ri_blockcount))) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
@@ -1374,8 +1400,8 @@ xfs_refcount_finish_one(
*/
rcur = *pcur;
if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
- nr_ops = rcur->bc_ag.refc.nr_ops;
- shape_changes = rcur->bc_ag.refc.shape_changes;
+ nr_ops = rcur->bc_refc.nr_ops;
+ shape_changes = rcur->bc_refc.shape_changes;
xfs_refcount_finish_one_cleanup(tp, rcur, 0);
rcur = NULL;
*pcur = NULL;
@@ -1387,8 +1413,8 @@ xfs_refcount_finish_one(
return error;
rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag);
- rcur->bc_ag.refc.nr_ops = nr_ops;
- rcur->bc_ag.refc.shape_changes = shape_changes;
+ rcur->bc_refc.nr_ops = nr_ops;
+ rcur->bc_refc.shape_changes = shape_changes;
}
*pcur = rcur;
@@ -1449,7 +1475,7 @@ __xfs_refcount_add(
blockcount);
ri = kmem_cache_alloc(xfs_refcount_intent_cache,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_startblock = startblock;
@@ -1535,6 +1561,7 @@ xfs_refcount_find_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1552,6 +1579,7 @@ xfs_refcount_find_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1585,6 +1613,7 @@ xfs_refcount_find_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1682,6 +1711,7 @@ xfs_refcount_adjust_cow_extents(
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec &&
ext.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1697,6 +1727,7 @@ xfs_refcount_adjust_cow_extents(
/* Adding a CoW reservation, there should be nothing here. */
if (XFS_IS_CORRUPT(cur->bc_mp,
agbno + aglen > ext.rc_startblock)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1714,6 +1745,7 @@ xfs_refcount_adjust_cow_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1721,14 +1753,17 @@ xfs_refcount_adjust_cow_extents(
case XFS_REFCOUNT_ADJUST_COW_FREE:
/* Removing a CoW reservation, there should be one extent. */
if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_startblock != agbno)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount != aglen)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_refcount != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1740,6 +1775,7 @@ xfs_refcount_adjust_cow_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1889,8 +1925,10 @@ xfs_refcount_recover_extent(
struct xfs_refcount_recovery *rr;
if (XFS_IS_CORRUPT(cur->bc_mp,
- be32_to_cpu(rec->refc.rc_refcount) != 1))
+ be32_to_cpu(rec->refc.rc_refcount) != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
rr = kmalloc(sizeof(struct xfs_refcount_recovery),
GFP_KERNEL | __GFP_NOFAIL);
@@ -1900,6 +1938,7 @@ xfs_refcount_recover_extent(
if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL ||
XFS_IS_CORRUPT(cur->bc_mp,
rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ xfs_btree_mark_sick(cur);
kfree(rr);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 0d80bd99147c..ca59f6c89f3e 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -16,6 +16,7 @@
#include "xfs_refcount.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
+#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_bit.h"
@@ -77,8 +78,6 @@ xfs_refcountbt_alloc_block(
xfs_refc_block(args.mp)));
if (error)
goto out_error;
- trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- args.agbno, 1);
if (args.fsbno == NULLFSBLOCK) {
*stat = 0;
return 0;
@@ -107,8 +106,6 @@ xfs_refcountbt_free_block(
struct xfs_agf *agf = agbp->b_addr;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
- trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
@@ -220,7 +217,7 @@ xfs_refcountbt_verify(
if (!xfs_has_reflink(mp))
return __this_address;
- fa = xfs_btree_sblock_v5hdr_verify(bp);
+ fa = xfs_btree_agblock_v5hdr_verify(bp);
if (fa)
return fa;
@@ -242,7 +239,7 @@ xfs_refcountbt_verify(
} else if (level >= mp->m_refc_maxlevels)
return __this_address;
- return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]);
+ return xfs_btree_agblock_verify(bp, mp->m_refc_mxr[level != 0]);
}
STATIC void
@@ -251,7 +248,7 @@ xfs_refcountbt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_sblock_verify_crc(bp))
+ if (!xfs_btree_agblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_refcountbt_verify(bp);
@@ -275,7 +272,7 @@ xfs_refcountbt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_sblock_calc_crc(bp);
+ xfs_btree_agblock_calc_crc(bp);
}
@@ -321,9 +318,17 @@ xfs_refcountbt_keys_contiguous(
be32_to_cpu(key2->refc.rc_startblock));
}
-static const struct xfs_btree_ops xfs_refcountbt_ops = {
+const struct xfs_btree_ops xfs_refcountbt_ops = {
+ .name = "refcount",
+ .type = XFS_BTREE_TYPE_AG,
+
.rec_len = sizeof(struct xfs_refcount_rec),
.key_len = sizeof(struct xfs_refcount_key),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
+
+ .lru_refs = XFS_REFC_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2),
+ .sick_mask = XFS_SICK_AG_REFCNTBT,
.dup_cursor = xfs_refcountbt_dup_cursor,
.set_root = xfs_refcountbt_set_root,
@@ -344,59 +349,32 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = {
};
/*
- * Initialize a new refcount btree cursor.
+ * Create a new refcount btree cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
*/
-static struct xfs_btree_cur *
-xfs_refcountbt_init_common(
+struct xfs_btree_cur *
+xfs_refcountbt_init_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
+ struct xfs_buf *agbp,
struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
ASSERT(pag->pag_agno < mp->m_sb.sb_agcount);
- cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC,
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_refcountbt_ops,
mp->m_refc_maxlevels, xfs_refcountbt_cur_cache);
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
-
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
cur->bc_ag.pag = xfs_perag_hold(pag);
- cur->bc_ag.refc.nr_ops = 0;
- cur->bc_ag.refc.shape_changes = 0;
- cur->bc_ops = &xfs_refcountbt_ops;
- return cur;
-}
-
-/* Create a btree cursor. */
-struct xfs_btree_cur *
-xfs_refcountbt_init_cursor(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- struct xfs_buf *agbp,
- struct xfs_perag *pag)
-{
- struct xfs_agf *agf = agbp->b_addr;
- struct xfs_btree_cur *cur;
-
- cur = xfs_refcountbt_init_common(mp, tp, pag);
- cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+ cur->bc_refc.nr_ops = 0;
+ cur->bc_refc.shape_changes = 0;
cur->bc_ag.agbp = agbp;
- return cur;
-}
+ if (agbp) {
+ struct xfs_agf *agf = agbp->b_addr;
-/* Create a btree cursor with a fake root for staging. */
-struct xfs_btree_cur *
-xfs_refcountbt_stage_cursor(
- struct xfs_mount *mp,
- struct xbtree_afakeroot *afake,
- struct xfs_perag *pag)
-{
- struct xfs_btree_cur *cur;
-
- cur = xfs_refcountbt_init_common(mp, NULL, pag);
- xfs_btree_stage_afakeroot(cur, afake);
+ cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+ }
return cur;
}
@@ -421,7 +399,7 @@ xfs_refcountbt_commit_staged_btree(
xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS |
XFS_AGF_REFCOUNT_ROOT |
XFS_AGF_REFCOUNT_LEVEL);
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops);
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
}
/* Calculate number of records in a refcount btree block. */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index d66b37259bed..1e0ab25f6c68 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -48,8 +48,6 @@ struct xbtree_afakeroot;
extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *agbp,
struct xfs_perag *pag);
-struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp,
- struct xbtree_afakeroot *afake, struct xfs_perag *pag);
extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 76bf7f48cb5a..ef16f6f9cef6 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -23,6 +23,7 @@
#include "xfs_error.h"
#include "xfs_inode.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
struct kmem_cache *xfs_rmap_intent_cache;
@@ -56,8 +57,10 @@ xfs_rmap_lookup_le(
error = xfs_rmap_get_rec(cur, irec, &get_stat);
if (error)
return error;
- if (!get_stat)
+ if (!get_stat) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
return 0;
}
@@ -132,6 +135,7 @@ xfs_rmap_insert(
if (error)
goto done;
if (XFS_IS_CORRUPT(rcur->bc_mp, i != 0)) {
+ xfs_btree_mark_sick(rcur);
error = -EFSCORRUPTED;
goto done;
}
@@ -145,6 +149,7 @@ xfs_rmap_insert(
if (error)
goto done;
if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(rcur);
error = -EFSCORRUPTED;
goto done;
}
@@ -174,6 +179,7 @@ xfs_rmap_delete(
if (error)
goto done;
if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(rcur);
error = -EFSCORRUPTED;
goto done;
}
@@ -182,6 +188,7 @@ xfs_rmap_delete(
if (error)
goto done;
if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(rcur);
error = -EFSCORRUPTED;
goto done;
}
@@ -208,10 +215,10 @@ xfs_rmap_btrec_to_irec(
/* Simple checks for rmap records. */
xfs_failaddr_t
xfs_rmap_check_irec(
- struct xfs_btree_cur *cur,
+ struct xfs_perag *pag,
const struct xfs_rmap_irec *irec)
{
- struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_mount *mp = pag->pag_mount;
bool is_inode;
bool is_unwritten;
bool is_bmbt;
@@ -226,8 +233,8 @@ xfs_rmap_check_irec(
return __this_address;
} else {
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbext(cur->bc_ag.pag, irec->rm_startblock,
- irec->rm_blockcount))
+ if (!xfs_verify_agbext(pag, irec->rm_startblock,
+ irec->rm_blockcount))
return __this_address;
}
@@ -262,6 +269,16 @@ xfs_rmap_check_irec(
return NULL;
}
+static inline xfs_failaddr_t
+xfs_rmap_check_btrec(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *irec)
+{
+ if (xfs_btree_is_mem_rmap(cur->bc_ops))
+ return xfs_rmap_check_irec(cur->bc_mem.pag, irec);
+ return xfs_rmap_check_irec(cur->bc_ag.pag, irec);
+}
+
static inline int
xfs_rmap_complain_bad_rec(
struct xfs_btree_cur *cur,
@@ -270,13 +287,18 @@ xfs_rmap_complain_bad_rec(
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_warn(mp,
- "Reverse Mapping BTree record corruption in AG %d detected at %pS!",
- cur->bc_ag.pag->pag_agno, fa);
+ if (xfs_btree_is_mem_rmap(cur->bc_ops))
+ xfs_warn(mp,
+ "In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa);
+ else
+ xfs_warn(mp,
+ "Reverse Mapping BTree record corruption in AG %d detected at %pS!",
+ cur->bc_ag.pag->pag_agno, fa);
xfs_warn(mp,
"Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x",
irec->rm_owner, irec->rm_flags, irec->rm_startblock,
irec->rm_blockcount);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -299,7 +321,7 @@ xfs_rmap_get_rec(
fa = xfs_rmap_btrec_to_irec(rec, irec);
if (!fa)
- fa = xfs_rmap_check_irec(cur, irec);
+ fa = xfs_rmap_check_btrec(cur, irec);
if (fa)
return xfs_rmap_complain_bad_rec(cur, fa, irec);
@@ -512,7 +534,7 @@ xfs_rmap_lookup_le_range(
*/
static int
xfs_rmap_free_check_owner(
- struct xfs_mount *mp,
+ struct xfs_btree_cur *cur,
uint64_t ltoff,
struct xfs_rmap_irec *rec,
xfs_filblks_t len,
@@ -520,6 +542,7 @@ xfs_rmap_free_check_owner(
uint64_t offset,
unsigned int flags)
{
+ struct xfs_mount *mp = cur->bc_mp;
int error = 0;
if (owner == XFS_RMAP_OWN_UNKNOWN)
@@ -529,12 +552,14 @@ xfs_rmap_free_check_owner(
if (XFS_IS_CORRUPT(mp,
(flags & XFS_RMAP_UNWRITTEN) !=
(rec->rm_flags & XFS_RMAP_UNWRITTEN))) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
/* Make sure the owner matches what we expect to find in the tree. */
if (XFS_IS_CORRUPT(mp, owner != rec->rm_owner)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
@@ -546,16 +571,19 @@ xfs_rmap_free_check_owner(
if (flags & XFS_RMAP_BMBT_BLOCK) {
if (XFS_IS_CORRUPT(mp,
!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
} else {
if (XFS_IS_CORRUPT(mp, rec->rm_offset > offset)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
if (XFS_IS_CORRUPT(mp,
offset + len > ltoff + rec->rm_blockcount)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
@@ -618,6 +646,7 @@ xfs_rmap_unmap(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -639,6 +668,7 @@ xfs_rmap_unmap(
if (XFS_IS_CORRUPT(mp,
bno <
ltrec.rm_startblock + ltrec.rm_blockcount)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -665,6 +695,7 @@ xfs_rmap_unmap(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -677,12 +708,13 @@ xfs_rmap_unmap(
ltrec.rm_startblock > bno ||
ltrec.rm_startblock + ltrec.rm_blockcount <
bno + len)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
/* Check owner information. */
- error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, len, owner,
+ error = xfs_rmap_free_check_owner(cur, ltoff, &ltrec, len, owner,
offset, flags);
if (error)
goto out_error;
@@ -697,6 +729,7 @@ xfs_rmap_unmap(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -788,6 +821,86 @@ out_error:
return error;
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of rmapbt live updates. If
+ * the compiler supports jump labels, the static branch will be replaced by a
+ * nop sled when there are no hook users. Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_rmap_hooks_switch);
+
+void
+xfs_rmap_hook_disable(void)
+{
+ xfs_hooks_switch_off(&xfs_rmap_hooks_switch);
+}
+
+void
+xfs_rmap_hook_enable(void)
+{
+ xfs_hooks_switch_on(&xfs_rmap_hooks_switch);
+}
+
+/* Call downstream hooks for a reverse mapping update. */
+static inline void
+xfs_rmap_update_hook(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ enum xfs_rmap_intent_type op,
+ xfs_agblock_t startblock,
+ xfs_extlen_t blockcount,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ if (xfs_hooks_switched_on(&xfs_rmap_hooks_switch)) {
+ struct xfs_rmap_update_params p = {
+ .startblock = startblock,
+ .blockcount = blockcount,
+ .unwritten = unwritten,
+ .oinfo = *oinfo, /* struct copy */
+ };
+
+ if (pag)
+ xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p);
+ }
+}
+
+/* Call the specified function during a reverse mapping update. */
+int
+xfs_rmap_hook_add(
+ struct xfs_perag *pag,
+ struct xfs_rmap_hook *hook)
+{
+ return xfs_hooks_add(&pag->pag_rmap_update_hooks, &hook->rmap_hook);
+}
+
+/* Stop calling the specified function during a reverse mapping update. */
+void
+xfs_rmap_hook_del(
+ struct xfs_perag *pag,
+ struct xfs_rmap_hook *hook)
+{
+ xfs_hooks_del(&pag->pag_rmap_update_hooks, &hook->rmap_hook);
+}
+
+/* Configure rmap update hook functions. */
+void
+xfs_rmap_hook_setup(
+ struct xfs_rmap_hook *hook,
+ notifier_fn_t mod_fn)
+{
+ xfs_hook_setup(&hook->rmap_hook, mod_fn);
+}
+#else
+# define xfs_rmap_update_hook(t, p, o, s, b, u, oi) do { } while (0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
/*
* Remove a reference to an extent in the rmap btree.
*/
@@ -808,7 +921,7 @@ xfs_rmap_free(
return 0;
cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
-
+ xfs_rmap_update_hook(tp, pag, XFS_RMAP_UNMAP, bno, len, false, oinfo);
error = xfs_rmap_unmap(cur, bno, len, false, oinfo);
xfs_btree_del_cursor(cur, error);
@@ -900,6 +1013,7 @@ xfs_rmap_map(
if (XFS_IS_CORRUPT(mp,
have_lt != 0 &&
ltrec.rm_startblock + ltrec.rm_blockcount > bno)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -917,10 +1031,12 @@ xfs_rmap_map(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
if (XFS_IS_CORRUPT(mp, bno + len > gtrec.rm_startblock)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -974,6 +1090,7 @@ xfs_rmap_map(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1021,6 +1138,7 @@ xfs_rmap_map(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1055,6 +1173,7 @@ xfs_rmap_alloc(
return 0;
cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
+ xfs_rmap_update_hook(tp, pag, XFS_RMAP_MAP, bno, len, false, oinfo);
error = xfs_rmap_map(cur, bno, len, false, oinfo);
xfs_btree_del_cursor(cur, error);
@@ -1116,6 +1235,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1153,12 +1273,14 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if (XFS_IS_CORRUPT(mp,
LEFT.rm_startblock + LEFT.rm_blockcount >
bno)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1181,6 +1303,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1193,10 +1316,12 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1227,6 +1352,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1246,6 +1372,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1257,6 +1384,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1264,6 +1392,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1275,6 +1404,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1282,6 +1412,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1305,6 +1436,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1312,6 +1444,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1331,6 +1464,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1342,6 +1476,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1349,6 +1484,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1419,6 +1555,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1461,6 +1598,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1476,6 +1614,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1509,6 +1648,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1522,6 +1662,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1534,6 +1675,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1606,6 +1748,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1634,6 +1777,7 @@ xfs_rmap_convert_shared(
if (XFS_IS_CORRUPT(mp,
LEFT.rm_startblock + LEFT.rm_blockcount >
bno)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1652,10 +1796,12 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1706,6 +1852,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1732,6 +1879,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1758,6 +1906,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1781,6 +1930,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1816,6 +1966,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1861,6 +2012,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1896,6 +2048,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1934,6 +2087,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2023,6 +2177,7 @@ xfs_rmap_unmap_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2033,12 +2188,14 @@ xfs_rmap_unmap_shared(
ltrec.rm_startblock > bno ||
ltrec.rm_startblock + ltrec.rm_blockcount <
bno + len)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
/* Make sure the owner matches what we expect to find in the tree. */
if (XFS_IS_CORRUPT(mp, owner != ltrec.rm_owner)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2047,16 +2204,19 @@ xfs_rmap_unmap_shared(
if (XFS_IS_CORRUPT(mp,
(flags & XFS_RMAP_UNWRITTEN) !=
(ltrec.rm_flags & XFS_RMAP_UNWRITTEN))) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
/* Check the offset. */
if (XFS_IS_CORRUPT(mp, ltrec.rm_offset > offset)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
if (XFS_IS_CORRUPT(mp, offset > ltoff + ltrec.rm_blockcount)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2113,6 +2273,7 @@ xfs_rmap_unmap_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2142,6 +2303,7 @@ xfs_rmap_unmap_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2221,6 +2383,7 @@ xfs_rmap_map_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2273,6 +2436,7 @@ xfs_rmap_map_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2335,15 +2499,12 @@ xfs_rmap_map_raw(
{
struct xfs_owner_info oinfo;
- oinfo.oi_owner = rmap->rm_owner;
- oinfo.oi_offset = rmap->rm_offset;
- oinfo.oi_flags = 0;
- if (rmap->rm_flags & XFS_RMAP_ATTR_FORK)
- oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
- if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK)
- oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+ xfs_owner_info_pack(&oinfo, rmap->rm_owner, rmap->rm_offset,
+ rmap->rm_flags);
- if (rmap->rm_flags || XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+ if ((rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
+ XFS_RMAP_UNWRITTEN)) ||
+ XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
return xfs_rmap_map(cur, rmap->rm_startblock,
rmap->rm_blockcount,
rmap->rm_flags & XFS_RMAP_UNWRITTEN,
@@ -2373,7 +2534,7 @@ xfs_rmap_query_range_helper(
fa = xfs_rmap_btrec_to_irec(rec, &irec);
if (!fa)
- fa = xfs_rmap_check_irec(cur, &irec);
+ fa = xfs_rmap_check_btrec(cur, &irec);
if (fa)
return xfs_rmap_complain_bad_rec(cur, fa, &irec);
@@ -2428,6 +2589,38 @@ xfs_rmap_finish_one_cleanup(
xfs_trans_brelse(tp, agbp);
}
+/* Commit an rmap operation into the ondisk tree. */
+int
+__xfs_rmap_finish_intent(
+ struct xfs_btree_cur *rcur,
+ enum xfs_rmap_intent_type op,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ bool unwritten)
+{
+ switch (op) {
+ case XFS_RMAP_ALLOC:
+ case XFS_RMAP_MAP:
+ return xfs_rmap_map(rcur, bno, len, unwritten, oinfo);
+ case XFS_RMAP_MAP_SHARED:
+ return xfs_rmap_map_shared(rcur, bno, len, unwritten, oinfo);
+ case XFS_RMAP_FREE:
+ case XFS_RMAP_UNMAP:
+ return xfs_rmap_unmap(rcur, bno, len, unwritten, oinfo);
+ case XFS_RMAP_UNMAP_SHARED:
+ return xfs_rmap_unmap_shared(rcur, bno, len, unwritten, oinfo);
+ case XFS_RMAP_CONVERT:
+ return xfs_rmap_convert(rcur, bno, len, !unwritten, oinfo);
+ case XFS_RMAP_CONVERT_SHARED:
+ return xfs_rmap_convert_shared(rcur, bno, len, !unwritten,
+ oinfo);
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+}
+
/*
* Process one of the deferred rmap operations. We pass back the
* btree cursor to maintain our lock on the rmapbt between calls.
@@ -2476,10 +2669,14 @@ xfs_rmap_finish_one(
* allocate blocks.
*/
error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp);
- if (error)
+ if (error) {
+ xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL);
return error;
- if (XFS_IS_CORRUPT(tp->t_mountp, !agbp))
+ }
+ if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
+ xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL);
return -EFSCORRUPTED;
+ }
rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag);
}
@@ -2490,39 +2687,14 @@ xfs_rmap_finish_one(
unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN;
bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock);
- switch (ri->ri_type) {
- case XFS_RMAP_ALLOC:
- case XFS_RMAP_MAP:
- error = xfs_rmap_map(rcur, bno, ri->ri_bmap.br_blockcount,
- unwritten, &oinfo);
- break;
- case XFS_RMAP_MAP_SHARED:
- error = xfs_rmap_map_shared(rcur, bno,
- ri->ri_bmap.br_blockcount, unwritten, &oinfo);
- break;
- case XFS_RMAP_FREE:
- case XFS_RMAP_UNMAP:
- error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount,
- unwritten, &oinfo);
- break;
- case XFS_RMAP_UNMAP_SHARED:
- error = xfs_rmap_unmap_shared(rcur, bno,
- ri->ri_bmap.br_blockcount, unwritten, &oinfo);
- break;
- case XFS_RMAP_CONVERT:
- error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount,
- !unwritten, &oinfo);
- break;
- case XFS_RMAP_CONVERT_SHARED:
- error = xfs_rmap_convert_shared(rcur, bno,
- ri->ri_bmap.br_blockcount, !unwritten, &oinfo);
- break;
- default:
- ASSERT(0);
- error = -EFSCORRUPTED;
- }
+ error = __xfs_rmap_finish_intent(rcur, ri->ri_type, bno,
+ ri->ri_bmap.br_blockcount, &oinfo, unwritten);
+ if (error)
+ return error;
- return error;
+ xfs_rmap_update_hook(tp, ri->ri_pag, ri->ri_type, bno,
+ ri->ri_bmap.br_blockcount, unwritten, &oinfo);
+ return 0;
}
/*
@@ -2559,7 +2731,7 @@ __xfs_rmap_add(
bmap->br_blockcount,
bmap->br_state);
- ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_owner = owner;
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 3c98d9d50afb..9d01fe689497 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -186,6 +186,10 @@ void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
struct xfs_btree_cur *rcur, int error);
int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
struct xfs_btree_cur **pcur);
+int __xfs_rmap_finish_intent(struct xfs_btree_cur *rcur,
+ enum xfs_rmap_intent_type op, xfs_agblock_t bno,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo,
+ bool unwritten);
int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
uint64_t owner, uint64_t offset, unsigned int flags,
@@ -195,7 +199,7 @@ int xfs_rmap_compare(const struct xfs_rmap_irec *a,
union xfs_btree_rec;
xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_rmap_irec *irec);
-xfs_failaddr_t xfs_rmap_check_irec(struct xfs_btree_cur *cur,
+xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag,
const struct xfs_rmap_irec *irec);
int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno,
@@ -235,4 +239,29 @@ extern struct kmem_cache *xfs_rmap_intent_cache;
int __init xfs_rmap_intent_init_cache(void);
void xfs_rmap_intent_destroy_cache(void);
+/*
+ * Parameters for tracking reverse mapping changes. The hook function arg
+ * parameter is enum xfs_rmap_intent_type, and the rest is below.
+ */
+struct xfs_rmap_update_params {
+ xfs_agblock_t startblock;
+ xfs_extlen_t blockcount;
+ struct xfs_owner_info oinfo;
+ bool unwritten;
+};
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+
+struct xfs_rmap_hook {
+ struct xfs_hook rmap_hook;
+};
+
+void xfs_rmap_hook_disable(void);
+void xfs_rmap_hook_enable(void);
+
+int xfs_rmap_hook_add(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
+void xfs_rmap_hook_del(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
+void xfs_rmap_hook_setup(struct xfs_rmap_hook *hook, notifier_fn_t mod_fn);
+#endif
+
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 6c81b20e97d2..9e759efa81cc 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -16,11 +16,14 @@
#include "xfs_btree_staging.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
static struct kmem_cache *xfs_rmapbt_cur_cache;
@@ -65,13 +68,12 @@ xfs_rmapbt_set_root(
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
- int btnum = cur->bc_btnum;
ASSERT(ptr->s != 0);
- agf->agf_roots[btnum] = ptr->s;
- be32_add_cpu(&agf->agf_levels[btnum], inc);
- cur->bc_ag.pag->pagf_levels[btnum] += inc;
+ agf->agf_rmap_root = ptr->s;
+ be32_add_cpu(&agf->agf_rmap_level, inc);
+ cur->bc_ag.pag->pagf_rmap_level += inc;
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
@@ -94,8 +96,6 @@ xfs_rmapbt_alloc_block(
&bno, 1);
if (error)
return error;
-
- trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1);
if (bno == NULLAGBLOCK) {
*stat = 0;
return 0;
@@ -125,8 +125,6 @@ xfs_rmapbt_free_block(
int error;
bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
- trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno,
- bno, 1);
be32_add_cpu(&agf->agf_rmap_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
@@ -226,7 +224,7 @@ xfs_rmapbt_init_ptr_from_cur(
ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
- ptr->s = agf->agf_roots[cur->bc_btnum];
+ ptr->s = agf->agf_rmap_root;
}
/*
@@ -340,18 +338,29 @@ xfs_rmapbt_verify(
if (!xfs_has_rmapbt(mp))
return __this_address;
- fa = xfs_btree_sblock_v5hdr_verify(bp);
+ fa = xfs_btree_agblock_v5hdr_verify(bp);
if (fa)
return fa;
level = be16_to_cpu(block->bb_level);
if (pag && xfs_perag_initialised_agf(pag)) {
- if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
+ unsigned int maxlevel = pag->pagf_rmap_level;
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ /*
+ * Online repair could be rewriting the rmap btree, so we'll
+ * validate against the larger of the two tree heights while
+ * this is going on.
+ */
+ maxlevel = max_t(unsigned int, maxlevel,
+ pag->pagf_repair_rmap_level);
+#endif
+ if (level >= maxlevel)
return __this_address;
} else if (level >= mp->m_rmap_maxlevels)
return __this_address;
- return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
+ return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != 0]);
}
static void
@@ -360,7 +369,7 @@ xfs_rmapbt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_sblock_verify_crc(bp))
+ if (!xfs_btree_agblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_rmapbt_verify(bp);
@@ -384,7 +393,7 @@ xfs_rmapbt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_sblock_calc_crc(bp);
+ xfs_btree_agblock_calc_crc(bp);
}
@@ -476,9 +485,19 @@ xfs_rmapbt_keys_contiguous(
be32_to_cpu(key2->rmap.rm_startblock));
}
-static const struct xfs_btree_ops xfs_rmapbt_ops = {
+const struct xfs_btree_ops xfs_rmapbt_ops = {
+ .name = "rmap",
+ .type = XFS_BTREE_TYPE_AG,
+ .geom_flags = XFS_BTGEO_OVERLAPPING,
+
.rec_len = sizeof(struct xfs_rmap_rec),
+ /* Overlapping btree; 2 keys per pointer. */
.key_len = 2 * sizeof(struct xfs_rmap_key),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
+
+ .lru_refs = XFS_RMAP_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rmap_2),
+ .sick_mask = XFS_SICK_AG_RMAPBT,
.dup_cursor = xfs_rmapbt_dup_cursor,
.set_root = xfs_rmapbt_set_root,
@@ -498,55 +517,176 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = {
.keys_contiguous = xfs_rmapbt_keys_contiguous,
};
-static struct xfs_btree_cur *
-xfs_rmapbt_init_common(
+/*
+ * Create a new reverse mapping btree cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
+ */
+struct xfs_btree_cur *
+xfs_rmapbt_init_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
+ struct xfs_buf *agbp,
struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
- /* Overlapping btree; 2 keys per pointer. */
- cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP,
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops,
mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
- cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
- cur->bc_ops = &xfs_rmapbt_ops;
-
cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_ag.agbp = agbp;
+ if (agbp) {
+ struct xfs_agf *agf = agbp->b_addr;
+
+ cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level);
+ }
return cur;
}
-/* Create a new reverse mapping btree cursor. */
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+static inline unsigned int
+xfs_rmapbt_mem_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen /
+ (2 * sizeof(struct xfs_rmap_key) + sizeof(__be64));
+}
+
+/*
+ * Validate an in-memory rmap btree block. Callers are allowed to generate an
+ * in-memory btree even if the ondisk feature is not enabled.
+ */
+static xfs_failaddr_t
+xfs_rmapbt_mem_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ unsigned int level;
+ unsigned int maxrecs;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+
+ level = be16_to_cpu(block->bb_level);
+ if (level >= xfs_rmapbt_maxlevels_ondisk())
+ return __this_address;
+
+ maxrecs = xfs_rmapbt_mem_block_maxrecs(
+ XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == 0);
+ return xfs_btree_memblock_verify(bp, maxrecs);
+}
+
+static void
+xfs_rmapbt_mem_rw_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa = xfs_rmapbt_mem_verify(bp);
+
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/* skip crc checks on in-memory btrees to save time */
+static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = {
+ .name = "xfs_rmapbt_mem",
+ .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
+ .verify_read = xfs_rmapbt_mem_rw_verify,
+ .verify_write = xfs_rmapbt_mem_rw_verify,
+ .verify_struct = xfs_rmapbt_mem_verify,
+};
+
+const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
+ .name = "mem_rmap",
+ .type = XFS_BTREE_TYPE_MEM,
+ .geom_flags = XFS_BTGEO_OVERLAPPING,
+
+ .rec_len = sizeof(struct xfs_rmap_rec),
+ /* Overlapping btree; 2 keys per pointer. */
+ .key_len = 2 * sizeof(struct xfs_rmap_key),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = XFS_RMAP_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rmap_mem_2),
+
+ .dup_cursor = xfbtree_dup_cursor,
+ .set_root = xfbtree_set_root,
+ .alloc_block = xfbtree_alloc_block,
+ .free_block = xfbtree_free_block,
+ .get_minrecs = xfbtree_get_minrecs,
+ .get_maxrecs = xfbtree_get_maxrecs,
+ .init_key_from_rec = xfs_rmapbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfbtree_init_ptr_from_cur,
+ .key_diff = xfs_rmapbt_key_diff,
+ .buf_ops = &xfs_rmapbt_mem_buf_ops,
+ .diff_two_keys = xfs_rmapbt_diff_two_keys,
+ .keys_inorder = xfs_rmapbt_keys_inorder,
+ .recs_inorder = xfs_rmapbt_recs_inorder,
+ .keys_contiguous = xfs_rmapbt_keys_contiguous,
+};
+
+/* Create a cursor for an in-memory btree. */
struct xfs_btree_cur *
-xfs_rmapbt_init_cursor(
- struct xfs_mount *mp,
+xfs_rmapbt_mem_cursor(
+ struct xfs_perag *pag,
struct xfs_trans *tp,
- struct xfs_buf *agbp,
- struct xfs_perag *pag)
+ struct xfbtree *xfbt)
{
- struct xfs_agf *agf = agbp->b_addr;
struct xfs_btree_cur *cur;
+ struct xfs_mount *mp = pag->pag_mount;
- cur = xfs_rmapbt_init_common(mp, tp, pag);
- cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
- cur->bc_ag.agbp = agbp;
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops,
+ xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache);
+ cur->bc_mem.xfbtree = xfbt;
+ cur->bc_nlevels = xfbt->nlevels;
+
+ cur->bc_mem.pag = xfs_perag_hold(pag);
return cur;
}
-/* Create a new reverse mapping btree cursor with a fake root for staging. */
-struct xfs_btree_cur *
-xfs_rmapbt_stage_cursor(
+/* Create an in-memory rmap btree. */
+int
+xfs_rmapbt_mem_init(
struct xfs_mount *mp,
- struct xbtree_afakeroot *afake,
- struct xfs_perag *pag)
+ struct xfbtree *xfbt,
+ struct xfs_buftarg *btp,
+ xfs_agnumber_t agno)
{
- struct xfs_btree_cur *cur;
+ xfbt->owner = agno;
+ return xfbtree_init(mp, xfbt, btp, &xfs_rmapbt_mem_ops);
+}
- cur = xfs_rmapbt_init_common(mp, NULL, pag);
- xfs_btree_stage_afakeroot(cur, afake);
- return cur;
+/* Compute the max possible height for reverse mapping btrees in memory. */
+static unsigned int
+xfs_rmapbt_mem_maxlevels(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / 2;
+
+ /*
+ * How tall can an in-memory rmap btree become if we filled the entire
+ * AG with rmap records?
+ */
+ return xfs_btree_compute_maxlevels(minrecs,
+ XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec));
}
+#else
+# define xfs_rmapbt_mem_maxlevels() (0)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
/*
* Install a new reverse mapping btree root. Caller is responsible for
@@ -563,12 +703,12 @@ xfs_rmapbt_commit_staged_btree(
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
- agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
- agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+ agf->agf_rmap_root = cpu_to_be32(afake->af_root);
+ agf->agf_rmap_level = cpu_to_be32(afake->af_levels);
agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
XFS_AGF_RMAP_BLOCKS);
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops);
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
}
/* Calculate number of records in a reverse mapping btree block. */
@@ -618,7 +758,8 @@ xfs_rmapbt_maxlevels_ondisk(void)
* like if it consumes almost all the blocks in the AG due to maximal
* sharing factor.
*/
- return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS);
+ return max(xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS),
+ xfs_rmapbt_mem_maxlevels());
}
/* Compute the maximum height of an rmap btree. */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index 3244715dd111..eb90d89e8086 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -10,6 +10,7 @@ struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
struct xbtree_afakeroot;
+struct xfbtree;
/* rmaps only exist on crc enabled filesystems */
#define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN
@@ -44,8 +45,6 @@ struct xbtree_afakeroot;
struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *bp,
struct xfs_perag *pag);
-struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp,
- struct xbtree_afakeroot *afake, struct xfs_perag *pag);
void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
int xfs_rmapbt_maxrecs(int blocklen, int leaf);
@@ -64,4 +63,9 @@ unsigned int xfs_rmapbt_maxlevels_ondisk(void);
int __init xfs_rmapbt_init_cur_cache(void);
void xfs_rmapbt_destroy_cur_cache(void);
+struct xfs_btree_cur *xfs_rmapbt_mem_cursor(struct xfs_perag *pag,
+ struct xfs_trans *tp, struct xfbtree *xfbtree);
+int xfs_rmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
+ struct xfs_buftarg *btp, xfs_agnumber_t agno);
+
#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index e31663cb7b43..f246d6dbf4ec 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -17,6 +17,7 @@
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_rtbitmap.h"
+#include "xfs_health.h"
/*
* Realtime allocator bitmap functions shared with userspace.
@@ -115,13 +116,19 @@ xfs_rtbuf_get(
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map)))
+ if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) {
+ xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
+ XFS_SICK_RT_BITMAP);
return -EFSCORRUPTED;
+ }
ASSERT(map.br_startblock != NULLFSBLOCK);
error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map.br_startblock),
mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
+ XFS_SICK_RT_BITMAP);
if (error)
return error;
@@ -934,7 +941,7 @@ xfs_rtfree_extent(
struct timespec64 atime;
ASSERT(mp->m_rbmip->i_itemp != NULL);
- ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL);
error = xfs_rtcheck_alloc_range(&args, start, len);
if (error)
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 5bb6e2bd6dee..73a4b895de67 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -530,7 +530,8 @@ xfs_validate_sb_common(
}
if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit),
- XFS_FSB_TO_B(mp, sbp->sb_width), 0, false))
+ XFS_FSB_TO_B(mp, sbp->sb_width), 0,
+ xfs_buf_daddr(bp) == XFS_SB_DADDR, false))
return -EFSCORRUPTED;
/*
@@ -1290,6 +1291,8 @@ xfs_sb_read_secondary(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_agno_mark_sick(mp, agno, XFS_SICK_AG_SB);
if (error)
return error;
xfs_buf_set_ref(bp, XFS_SSB_REF);
@@ -1321,8 +1324,10 @@ xfs_sb_get_secondary(
}
/*
- * sunit, swidth, sectorsize(optional with 0) should be all in bytes,
- * so users won't be confused by values in error messages.
+ * sunit, swidth, sectorsize(optional with 0) should be all in bytes, so users
+ * won't be confused by values in error messages. This function returns false
+ * if the stripe geometry is invalid and the caller is unable to repair the
+ * stripe configuration later in the mount process.
*/
bool
xfs_validate_stripe_geometry(
@@ -1330,20 +1335,21 @@ xfs_validate_stripe_geometry(
__s64 sunit,
__s64 swidth,
int sectorsize,
+ bool may_repair,
bool silent)
{
if (swidth > INT_MAX) {
if (!silent)
xfs_notice(mp,
"stripe width (%lld) is too large", swidth);
- return false;
+ goto check_override;
}
if (sunit > swidth) {
if (!silent)
xfs_notice(mp,
"stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth);
- return false;
+ goto check_override;
}
if (sectorsize && (int)sunit % sectorsize) {
@@ -1351,21 +1357,21 @@ xfs_validate_stripe_geometry(
xfs_notice(mp,
"stripe unit (%lld) must be a multiple of the sector size (%d)",
sunit, sectorsize);
- return false;
+ goto check_override;
}
if (sunit && !swidth) {
if (!silent)
xfs_notice(mp,
"invalid stripe unit (%lld) and stripe width of 0", sunit);
- return false;
+ goto check_override;
}
if (!sunit && swidth) {
if (!silent)
xfs_notice(mp,
"invalid stripe width (%lld) and stripe unit of 0", swidth);
- return false;
+ goto check_override;
}
if (sunit && (int)swidth % (int)sunit) {
@@ -1373,9 +1379,27 @@ xfs_validate_stripe_geometry(
xfs_notice(mp,
"stripe width (%lld) must be a multiple of the stripe unit (%lld)",
swidth, sunit);
- return false;
+ goto check_override;
}
return true;
+
+check_override:
+ if (!may_repair)
+ return false;
+ /*
+ * During mount, mp->m_dalign will not be set unless the sunit mount
+ * option was set. If it was set, ignore the bad stripe alignment values
+ * here so that the validation and update step later in the mount
+ * process can try to replace them with the values supplied by the
+ * mount options.
+ */
+ if (!mp->m_dalign)
+ return false;
+ if (!silent)
+ xfs_notice(mp,
+"Will try to correct with specified mount options sunit (%d) and swidth (%d)",
+ BBTOB(mp->m_dalign), BBTOB(mp->m_swidth));
+ return true;
}
/*
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 2e8e8d63d4eb..37b1ed1bc209 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -35,8 +35,9 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp,
struct xfs_trans *tp, xfs_agnumber_t agno,
struct xfs_buf **bpp);
-extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
- __s64 sunit, __s64 swidth, int sectorsize, bool silent);
+bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
+ __s64 sunit, __s64 swidth, int sectorsize, bool may_repair,
+ bool silent);
uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 4220d3584c1b..dfd61fa8332e 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -43,6 +43,60 @@ extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+/* btree ops */
+extern const struct xfs_btree_ops xfs_bnobt_ops;
+extern const struct xfs_btree_ops xfs_cntbt_ops;
+extern const struct xfs_btree_ops xfs_inobt_ops;
+extern const struct xfs_btree_ops xfs_finobt_ops;
+extern const struct xfs_btree_ops xfs_bmbt_ops;
+extern const struct xfs_btree_ops xfs_refcountbt_ops;
+extern const struct xfs_btree_ops xfs_rmapbt_ops;
+extern const struct xfs_btree_ops xfs_rmapbt_mem_ops;
+
+static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_bnobt_ops;
+}
+
+static inline bool xfs_btree_is_cnt(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_cntbt_ops;
+}
+
+static inline bool xfs_btree_is_bmap(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_bmbt_ops;
+}
+
+static inline bool xfs_btree_is_ino(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_inobt_ops;
+}
+
+static inline bool xfs_btree_is_fino(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_finobt_ops;
+}
+
+static inline bool xfs_btree_is_refcount(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_refcountbt_ops;
+}
+
+static inline bool xfs_btree_is_rmap(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_rmapbt_ops;
+}
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+static inline bool xfs_btree_is_mem_rmap(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_rmapbt_mem_ops;
+}
+#else
+# define xfs_btree_is_mem_rmap(...) (false)
+#endif
+
/* log size calculation functions */
int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
int xfs_log_calc_minimum_size(struct xfs_mount *);
@@ -128,19 +182,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
-
-/*
- * Symlink decoding/encoding functions
- */
-int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
-int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
- uint32_t size, struct xfs_buf *bp);
-bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
- uint32_t size, struct xfs_buf *bp);
-void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_inode *ip, struct xfs_ifork *ifp);
-xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size);
-
/* Computed inode geometry for the filesystem. */
struct xfs_ino_geometry {
/* Maximum inode count in this filesystem. */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 160aa20aa441..ffb1317a9212 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -16,7 +16,10 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
-
+#include "xfs_symlink_remote.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_health.h"
/*
* Each contiguous block has a header, so it is not just a simple pathlen
@@ -227,3 +230,153 @@ xfs_symlink_shortform_verify(
return __this_address;
return NULL;
}
+
+/* Read a remote symlink target into the buffer. */
+int
+xfs_symlink_remote_read(
+ struct xfs_inode *ip,
+ char *link)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_buf *bp;
+ xfs_daddr_t d;
+ char *cur_chunk;
+ int pathlen = ip->i_disk_size;
+ int nmaps = XFS_SYMLINK_MAPS;
+ int byte_cnt;
+ int n;
+ int error = 0;
+ int fsblocks = 0;
+ int offset;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
+
+ fsblocks = xfs_symlink_blocks(mp, pathlen);
+ error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
+ if (error)
+ goto out;
+
+ offset = 0;
+ for (n = 0; n < nmaps; n++) {
+ d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+ byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+
+ error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
+ &bp, &xfs_symlink_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+ if (error)
+ return error;
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+ if (pathlen < byte_cnt)
+ byte_cnt = pathlen;
+
+ cur_chunk = bp->b_addr;
+ if (xfs_has_crc(mp)) {
+ if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
+ byte_cnt, bp)) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+ error = -EFSCORRUPTED;
+ xfs_alert(mp,
+"symlink header does not match required off/len/owner (0x%x/0x%x,0x%llx)",
+ offset, byte_cnt, ip->i_ino);
+ xfs_buf_relse(bp);
+ goto out;
+
+ }
+
+ cur_chunk += sizeof(struct xfs_dsymlink_hdr);
+ }
+
+ memcpy(link + offset, cur_chunk, byte_cnt);
+
+ pathlen -= byte_cnt;
+ offset += byte_cnt;
+
+ xfs_buf_relse(bp);
+ }
+ ASSERT(pathlen == 0);
+
+ link[ip->i_disk_size] = '\0';
+ error = 0;
+
+ out:
+ return error;
+}
+
+/* Write the symlink target into the inode. */
+int
+xfs_symlink_write_target(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ const char *target_path,
+ int pathlen,
+ xfs_fsblock_t fs_blocks,
+ uint resblks)
+{
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_mount *mp = tp->t_mountp;
+ const char *cur_chunk;
+ struct xfs_buf *bp;
+ xfs_daddr_t d;
+ int byte_cnt;
+ int nmaps;
+ int offset = 0;
+ int n;
+ int error;
+
+ /*
+ * If the symlink will fit into the inode, write it inline.
+ */
+ if (pathlen <= xfs_inode_data_fork_size(ip)) {
+ xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
+
+ ip->i_disk_size = pathlen;
+ ip->i_df.if_format = XFS_DINODE_FMT_LOCAL;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+ return 0;
+ }
+
+ nmaps = XFS_SYMLINK_MAPS;
+ error = xfs_bmapi_write(tp, ip, 0, fs_blocks, XFS_BMAPI_METADATA,
+ resblks, mval, &nmaps);
+ if (error)
+ return error;
+
+ ip->i_disk_size = pathlen;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ cur_chunk = target_path;
+ offset = 0;
+ for (n = 0; n < nmaps; n++) {
+ char *buf;
+
+ d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+ byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ BTOBB(byte_cnt), 0, &bp);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+ byte_cnt = min(byte_cnt, pathlen);
+
+ buf = bp->b_addr;
+ buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, byte_cnt,
+ bp);
+
+ memcpy(buf, cur_chunk, byte_cnt);
+
+ cur_chunk += byte_cnt;
+ pathlen -= byte_cnt;
+ offset += byte_cnt;
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
+ xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
+ (char *)bp->b_addr);
+ }
+ ASSERT(pathlen == 0);
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.h b/fs/xfs/libxfs/xfs_symlink_remote.h
new file mode 100644
index 000000000000..a63bd38ae4fa
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_symlink_remote.h
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_SYMLINK_REMOTE_H
+#define __XFS_SYMLINK_REMOTE_H
+
+/*
+ * Symlink decoding/encoding functions
+ */
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+ uint32_t size, struct xfs_buf *bp);
+bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
+ uint32_t size, struct xfs_buf *bp);
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_inode *ip, struct xfs_ifork *ifp);
+xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size);
+int xfs_symlink_remote_read(struct xfs_inode *ip, char *link);
+int xfs_symlink_write_target(struct xfs_trans *tp, struct xfs_inode *ip,
+ const char *target_path, int pathlen, xfs_fsblock_t fs_blocks,
+ uint resblks);
+
+#endif /* __XFS_SYMLINK_REMOTE_H */
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 70e97ea6eee7..69fc5b981352 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -31,7 +31,7 @@ xfs_trans_ijoin(
{
struct xfs_inode_log_item *iip;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (ip->i_itemp == NULL)
xfs_inode_item_init(ip, ip->i_mount);
iip = ip->i_itemp;
@@ -60,7 +60,7 @@ xfs_trans_ichgtime(
struct timespec64 tv;
ASSERT(tp);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
tv = current_time(inode);
@@ -90,7 +90,7 @@ xfs_trans_log_inode(
struct inode *inode = VFS_I(ip);
ASSERT(iip);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
tp->t_flags |= XFS_TRANS_DIRTY;
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 62e02d5380ad..76eb9e328835 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -80,11 +80,13 @@ typedef void * xfs_failaddr_t;
/*
* Inode fork identifiers.
*/
-#define XFS_DATA_FORK 0
-#define XFS_ATTR_FORK 1
-#define XFS_COW_FORK 2
+#define XFS_STAGING_FORK (-1) /* fake fork for staging a btree */
+#define XFS_DATA_FORK (0)
+#define XFS_ATTR_FORK (1)
+#define XFS_COW_FORK (2)
#define XFS_WHICHFORK_STRINGS \
+ { XFS_STAGING_FORK, "staging" }, \
{ XFS_DATA_FORK, "data" }, \
{ XFS_ATTR_FORK, "attr" }, \
{ XFS_COW_FORK, "cow" }
@@ -114,24 +116,6 @@ typedef enum {
{ XFS_LOOKUP_LEi, "le" }, \
{ XFS_LOOKUP_GEi, "ge" }
-/*
- * This enum is used in string mapping in xfs_trace.h and scrub/trace.h;
- * please keep the TRACE_DEFINE_ENUMs for it up to date.
- */
-typedef enum {
- XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
- XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
-} xfs_btnum_t;
-
-#define XFS_BTNUM_STRINGS \
- { XFS_BTNUM_BNOi, "bnobt" }, \
- { XFS_BTNUM_CNTi, "cntbt" }, \
- { XFS_BTNUM_RMAPi, "rmapbt" }, \
- { XFS_BTNUM_BMAPi, "bmbt" }, \
- { XFS_BTNUM_INOi, "inobt" }, \
- { XFS_BTNUM_FINOi, "finobt" }, \
- { XFS_BTNUM_REFCi, "refcbt" }
-
struct xfs_name {
const unsigned char *name;
int len;
diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h
deleted file mode 100644
index 79155eec341b..000000000000
--- a/fs/xfs/mrlock.h
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#ifndef __XFS_SUPPORT_MRLOCK_H__
-#define __XFS_SUPPORT_MRLOCK_H__
-
-#include <linux/rwsem.h>
-
-typedef struct {
- struct rw_semaphore mr_lock;
-#if defined(DEBUG) || defined(XFS_WARN)
- int mr_writer;
-#endif
-} mrlock_t;
-
-#if defined(DEBUG) || defined(XFS_WARN)
-#define mrinit(mrp, name) \
- do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
-#else
-#define mrinit(mrp, name) \
- do { init_rwsem(&(mrp)->mr_lock); } while (0)
-#endif
-
-#define mrlock_init(mrp, t,n,s) mrinit(mrp, n)
-#define mrfree(mrp) do { } while (0)
-
-static inline void mraccess_nested(mrlock_t *mrp, int subclass)
-{
- down_read_nested(&mrp->mr_lock, subclass);
-}
-
-static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
-{
- down_write_nested(&mrp->mr_lock, subclass);
-#if defined(DEBUG) || defined(XFS_WARN)
- mrp->mr_writer = 1;
-#endif
-}
-
-static inline int mrtryaccess(mrlock_t *mrp)
-{
- return down_read_trylock(&mrp->mr_lock);
-}
-
-static inline int mrtryupdate(mrlock_t *mrp)
-{
- if (!down_write_trylock(&mrp->mr_lock))
- return 0;
-#if defined(DEBUG) || defined(XFS_WARN)
- mrp->mr_writer = 1;
-#endif
- return 1;
-}
-
-static inline void mrunlock_excl(mrlock_t *mrp)
-{
-#if defined(DEBUG) || defined(XFS_WARN)
- mrp->mr_writer = 0;
-#endif
- up_write(&mrp->mr_lock);
-}
-
-static inline void mrunlock_shared(mrlock_t *mrp)
-{
- up_read(&mrp->mr_lock);
-}
-
-static inline void mrdemote(mrlock_t *mrp)
-{
-#if defined(DEBUG) || defined(XFS_WARN)
- mrp->mr_writer = 0;
-#endif
- downgrade_write(&mrp->mr_lock);
-}
-
-#endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h
index ed08f76ff4f3..e488e1f4f63d 100644
--- a/fs/xfs/scrub/agb_bitmap.h
+++ b/fs/xfs/scrub/agb_bitmap.h
@@ -65,4 +65,9 @@ int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur);
+static inline uint32_t xagb_bitmap_count_set_regions(struct xagb_bitmap *b)
+{
+ return xbitmap32_count_set_regions(&b->agbitmap);
+}
+
#endif /* __XFS_SCRUB_AGB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 6c6e5eba42c8..e954f07679dd 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -556,28 +556,28 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
/* Check the AGF btree roots and levels */
- agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
+ agbno = be32_to_cpu(agf->agf_bno_root);
if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
+ agbno = be32_to_cpu(agf->agf_cnt_root);
if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+ level = be32_to_cpu(agf->agf_bno_level);
if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+ level = be32_to_cpu(agf->agf_cnt_level);
if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
if (xfs_has_rmapbt(mp)) {
- agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
+ agbno = be32_to_cpu(agf->agf_rmap_root);
if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+ level = be32_to_cpu(agf->agf_rmap_level);
if (level <= 0 || level > mp->m_rmap_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 26bd1ff68f1b..427054b65b23 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -174,8 +174,7 @@ xrep_agf_find_btrees(
* We relied on the rmapbt to reconstruct the AGF. If we get a
* different root then something's seriously wrong.
*/
- if (fab[XREP_AGF_RMAPBT].root !=
- be32_to_cpu(old_agf->agf_roots[XFS_BTNUM_RMAPi]))
+ if (fab[XREP_AGF_RMAPBT].root != be32_to_cpu(old_agf->agf_rmap_root))
return -EFSCORRUPTED;
/* We must find the refcountbt root if that feature is enabled. */
@@ -224,20 +223,14 @@ xrep_agf_set_roots(
struct xfs_agf *agf,
struct xrep_find_ag_btree *fab)
{
- agf->agf_roots[XFS_BTNUM_BNOi] =
- cpu_to_be32(fab[XREP_AGF_BNOBT].root);
- agf->agf_levels[XFS_BTNUM_BNOi] =
- cpu_to_be32(fab[XREP_AGF_BNOBT].height);
+ agf->agf_bno_root = cpu_to_be32(fab[XREP_AGF_BNOBT].root);
+ agf->agf_bno_level = cpu_to_be32(fab[XREP_AGF_BNOBT].height);
- agf->agf_roots[XFS_BTNUM_CNTi] =
- cpu_to_be32(fab[XREP_AGF_CNTBT].root);
- agf->agf_levels[XFS_BTNUM_CNTi] =
- cpu_to_be32(fab[XREP_AGF_CNTBT].height);
+ agf->agf_cnt_root = cpu_to_be32(fab[XREP_AGF_CNTBT].root);
+ agf->agf_cnt_level = cpu_to_be32(fab[XREP_AGF_CNTBT].height);
- agf->agf_roots[XFS_BTNUM_RMAPi] =
- cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
- agf->agf_levels[XFS_BTNUM_RMAPi] =
- cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
+ agf->agf_rmap_root = cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
+ agf->agf_rmap_level = cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
if (xfs_has_reflink(sc->mp)) {
agf->agf_refcount_root =
@@ -262,8 +255,7 @@ xrep_agf_calc_from_btrees(
int error;
/* Update the AGF counters from the bnobt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
+ cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
if (error)
goto err;
@@ -276,8 +268,7 @@ xrep_agf_calc_from_btrees(
agf->agf_longest = cpu_to_be32(raa.longest);
/* Update the AGF counters from the cntbt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
@@ -333,12 +324,9 @@ xrep_agf_commit_new(
pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
pag->pagf_longest = be32_to_cpu(agf->agf_longest);
- pag->pagf_levels[XFS_BTNUM_BNOi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
- pag->pagf_levels[XFS_BTNUM_CNTi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
- pag->pagf_levels[XFS_BTNUM_RMAPi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
+ pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
+ pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
@@ -559,16 +547,14 @@ xrep_agfl_collect_blocks(
goto out_bmp;
/* Find all blocks currently being used by the bnobt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
+ cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
xfs_btree_del_cursor(cur, error);
if (error)
goto out_bmp;
/* Find all blocks currently being used by the cntbt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
xfs_btree_del_cursor(cur, error);
if (error)
@@ -908,7 +894,7 @@ xrep_agi_calc_from_btrees(
xfs_agino_t freecount;
int error;
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
error = xfs_ialloc_count_inodes(cur, &count, &freecount);
if (error)
goto err;
@@ -928,8 +914,7 @@ xrep_agi_calc_from_btrees(
if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) {
xfs_agblock_t blocks;
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp,
- XFS_BTNUM_FINO);
+ cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
index 45edda096869..d421b253923e 100644
--- a/fs/xfs/scrub/alloc_repair.c
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -687,8 +687,8 @@ xrep_abt_reset_counters(
* height values before re-initializing the perag info from the updated
* AGF to capture all the new values.
*/
- pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi];
- pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi];
+ pag->pagf_repair_bno_level = pag->pagf_bno_level;
+ pag->pagf_repair_cnt_level = pag->pagf_cnt_level;
/* Reinitialize with the values we just logged. */
return xrep_reinit_pagf(sc);
@@ -735,10 +735,11 @@ xrep_abt_build_new_trees(
ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;
/* Allocate cursors for the staged btrees. */
- bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake,
- pag, XFS_BTNUM_BNO);
- cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake,
- pag, XFS_BTNUM_CNT);
+ bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake);
+
+ cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake);
/* Last chance to abort before we start committing fixes. */
if (xchk_should_terminate(sc, &error))
@@ -765,10 +766,8 @@ xrep_abt_build_new_trees(
* height so that we don't trip the verifiers when writing the new
* btree blocks to disk.
*/
- pag->pagf_repair_levels[XFS_BTNUM_BNOi] =
- ra->new_bnobt.bload.btree_height;
- pag->pagf_repair_levels[XFS_BTNUM_CNTi] =
- ra->new_cntbt.bload.btree_height;
+ pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height;
+ pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height;
/* Load the free space by length tree. */
ra->array_cur = XFARRAY_CURSOR_INIT;
@@ -807,8 +806,8 @@ xrep_abt_build_new_trees(
return xrep_roll_ag_trans(sc);
err_levels:
- pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
- pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
+ pag->pagf_repair_bno_level = 0;
+ pag->pagf_repair_cnt_level = 0;
err_cur:
xfs_btree_del_cursor(cnt_cur, error);
xfs_btree_del_cursor(bno_cur, error);
@@ -838,8 +837,8 @@ xrep_abt_remove_old_trees(
* Now that we've zapped all the old allocbt blocks we can turn off
* the alternate height mechanism.
*/
- pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
- pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
+ pag->pagf_repair_bno_level = 0;
+ pag->pagf_repair_cnt_level = 0;
return 0;
}
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 1449bb5262d9..0cb8d43912a8 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -566,3 +566,17 @@ xbitmap32_test(
*len = bn->bn_start - start;
return false;
}
+
+/* Count the number of set regions in this bitmap. */
+uint32_t
+xbitmap32_count_set_regions(
+ struct xbitmap32 *bitmap)
+{
+ struct xbitmap32_node *bn;
+ uint32_t nr = 0;
+
+ for_each_xbitmap32_extent(bn, bitmap)
+ nr++;
+
+ return nr;
+}
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index 2df8911606d6..710c1ac5e323 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -62,4 +62,6 @@ int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn,
bool xbitmap32_empty(struct xbitmap32 *bitmap);
bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len);
+uint32_t xbitmap32_count_set_regions(struct xbitmap32 *bitmap);
+
#endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index b169cddde6da..24a15bf784f1 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -924,7 +924,7 @@ xchk_bmap(
if (!ifp)
return -ENOENT;
- info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip);
+ info.is_rt = xfs_ifork_is_realtime(ip, whichfork);
info.whichfork = whichfork;
info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip);
info.sc = sc;
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index a4bb89fdd510..1e656fab5e41 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -639,7 +639,13 @@ xrep_bmap_build_new_fork(
rb->new_bmapbt.bload.get_records = xrep_bmap_get_records;
rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block;
rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size;
- bmap_cur = xfs_bmbt_stage_cursor(sc->mp, sc->ip, ifake);
+
+ /*
+ * Allocate a new bmap btree cursor for reloading an inode block mapping
+ * data structure.
+ */
+ bmap_cur = xfs_bmbt_init_cursor(sc->mp, NULL, sc->ip, XFS_STAGING_FORK);
+ xfs_btree_stage_ifakeroot(bmap_cur, ifake);
/*
* Figure out the size and format of the new fork, then fill it with
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 1935b9ce1885..fe678a0438bc 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -47,7 +47,7 @@ __xchk_btree_process_error(
*error = 0;
fallthrough;
default:
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
trace_xchk_ifork_btree_op_error(sc, cur, level,
*error, ret_ip);
else
@@ -91,7 +91,7 @@ __xchk_btree_set_corrupt(
{
sc->sm->sm_flags |= errflag;
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
trace_xchk_ifork_btree_error(sc, cur, level,
ret_ip);
else
@@ -168,7 +168,7 @@ xchk_btree_rec(
if (xfs_btree_keycmp_lt(cur, &key, keyp))
xchk_btree_set_corrupt(bs->sc, cur, 1);
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Is high_key(rec) no larger than the parent high key? */
@@ -215,7 +215,7 @@ xchk_btree_key(
if (xfs_btree_keycmp_lt(cur, key, keyp))
xchk_btree_set_corrupt(bs->sc, cur, level);
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Is this block's high key no larger than the parent high key? */
@@ -236,22 +236,18 @@ xchk_btree_ptr_ok(
int level,
union xfs_btree_ptr *ptr)
{
- bool res;
-
/* A btree rooted in an inode has no block pointer to the root. */
- if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ if (bs->cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
level == bs->cur->bc_nlevels)
return true;
/* Otherwise, check the pointers. */
- if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
- res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
- else
- res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
- if (!res)
+ if (__xfs_btree_check_ptr(bs->cur, ptr, 0, level)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, level);
+ return false;
+ }
- return res;
+ return true;
}
/* Check that a btree block's sibling matches what we expect it. */
@@ -374,18 +370,21 @@ xchk_btree_check_block_owner(
{
xfs_agnumber_t agno;
xfs_agblock_t agbno;
- xfs_btnum_t btnum;
bool init_sa;
int error = 0;
if (!bs->cur)
return 0;
- btnum = bs->cur->bc_btnum;
agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);
- init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
+ /*
+ * If the btree being examined is not itself a per-AG btree, initialize
+ * sc->sa so that we can check for the presence of an ownership record
+ * in the rmap btree for the AG containing the block.
+ */
+ init_sa = bs->cur->bc_ops->type != XFS_BTREE_TYPE_AG;
if (init_sa) {
error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
@@ -399,11 +398,11 @@ xchk_btree_check_block_owner(
* have to nullify it (to shut down further block owner checks) if
* self-xref encounters problems.
*/
- if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
+ if (!bs->sc->sa.bno_cur && xfs_btree_is_bno(bs->cur->bc_ops))
bs->cur = NULL;
xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo);
- if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
+ if (!bs->sc->sa.rmap_cur && xfs_btree_is_rmap(bs->cur->bc_ops))
bs->cur = NULL;
out_free:
@@ -429,7 +428,7 @@ xchk_btree_check_owner(
* up.
*/
if (bp == NULL) {
- if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE)
xchk_btree_set_corrupt(bs->sc, bs->cur, level);
return 0;
}
@@ -442,7 +441,7 @@ xchk_btree_check_owner(
* duplicate cursors. Therefore, save the buffer daddr for
* later scanning.
*/
- if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
+ if (xfs_btree_is_bno(cur->bc_ops) || xfs_btree_is_rmap(cur->bc_ops)) {
struct check_owner *co;
co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
@@ -475,7 +474,7 @@ xchk_btree_check_iroot_minrecs(
* existing filesystems, so instead we disable the check for data fork
* bmap btrees when there's an attr fork.
*/
- if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
+ if (xfs_btree_is_bmap(bs->cur->bc_ops) &&
bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
xfs_inode_has_attr_fork(bs->sc->ip))
return false;
@@ -508,7 +507,7 @@ xchk_btree_check_minrecs(
* child block might be less than the standard minrecs, but that's ok
* provided that there's only one direct child of the root.
*/
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
level == cur->bc_nlevels - 2) {
struct xfs_btree_block *root_block;
struct xfs_buf *root_bp;
@@ -562,7 +561,7 @@ xchk_btree_block_check_keys(
return;
}
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Make sure the high key of this block matches the parent. */
@@ -585,7 +584,6 @@ xchk_btree_get_block(
struct xfs_btree_block **pblock,
struct xfs_buf **pbp)
{
- xfs_failaddr_t failed_at;
int error;
*pblock = NULL;
@@ -597,13 +595,7 @@ xchk_btree_get_block(
return error;
xfs_btree_get_block(bs->cur, level, pbp);
- if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
- failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
- level, *pbp);
- else
- failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
- level, *pbp);
- if (failed_at) {
+ if (__xfs_btree_check_block(bs->cur, *pblock, level, *pbp)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, level);
return 0;
}
@@ -664,7 +656,7 @@ xchk_btree_block_keys(
if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys))
xchk_btree_set_corrupt(bs->sc, cur, 1);
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Get high keys */
@@ -728,7 +720,7 @@ xchk_btree(
* error codes for us.
*/
level = cur->bc_nlevels - 1;
- cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ xfs_btree_init_ptr_from_cur(cur, &ptr);
if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
goto out;
error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 81f2b96bb5a7..47a20cf5205f 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -29,6 +29,8 @@
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -82,6 +84,15 @@ __xchk_process_error(
sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
sc->sm, *error);
break;
+ case -ECANCELED:
+ /*
+ * ECANCELED here means that the caller set one of the scrub
+ * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
+ * quickly. Set error to zero and do not continue.
+ */
+ trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
+ *error = 0;
+ break;
case -EFSBADCRC:
case -EFSCORRUPTED:
/* Note the badness but don't abort. */
@@ -89,8 +100,7 @@ __xchk_process_error(
*error = 0;
fallthrough;
default:
- trace_xchk_op_error(sc, agno, bno, *error,
- ret_ip);
+ trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
break;
}
return false;
@@ -136,6 +146,16 @@ __xchk_fblock_process_error(
/* Used to restart an op with deadlock avoidance. */
trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
break;
+ case -ECANCELED:
+ /*
+ * ECANCELED here means that the caller set one of the scrub
+ * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
+ * quickly. Set error to zero and do not continue.
+ */
+ trace_xchk_file_op_error(sc, whichfork, offset, *error,
+ ret_ip);
+ *error = 0;
+ break;
case -EFSBADCRC:
case -EFSCORRUPTED:
/* Note the badness but don't abort. */
@@ -227,6 +247,19 @@ xchk_block_set_corrupt(
trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}
+#ifdef CONFIG_XFS_QUOTA
+/* Record a corrupt quota counter. */
+void
+xchk_qcheck_set_corrupt(
+ struct xfs_scrub *sc,
+ unsigned int dqtype,
+ xfs_dqid_t id)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
+}
+#endif
+
/* Record a corruption while cross-referencing. */
void
xchk_block_xref_set_corrupt(
@@ -427,7 +460,7 @@ xchk_perag_read_headers(
* Grab the AG headers for the attached perag structure and wait for pending
* intents to drain.
*/
-static int
+int
xchk_perag_drain_and_lock(
struct xfs_scrub *sc)
{
@@ -555,46 +588,50 @@ xchk_ag_btcur_init(
{
struct xfs_mount *mp = sc->mp;
- if (sa->agf_bp &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
+ if (sa->agf_bp) {
/* Set up a bnobt cursor for cross-referencing. */
- sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- sa->pag, XFS_BTNUM_BNO);
- }
+ sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
+ XFS_SCRUB_TYPE_BNOBT);
- if (sa->agf_bp &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) {
/* Set up a cntbt cursor for cross-referencing. */
- sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- sa->pag, XFS_BTNUM_CNT);
- }
-
- /* Set up a inobt cursor for cross-referencing. */
- if (sa->agi_bp &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
- sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
- XFS_BTNUM_INO);
- }
-
- /* Set up a finobt cursor for cross-referencing. */
- if (sa->agi_bp && xfs_has_finobt(mp) &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
- sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
- XFS_BTNUM_FINO);
- }
-
- /* Set up a rmapbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_has_rmapbt(mp) &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
- sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
+ XFS_SCRUB_TYPE_CNTBT);
+
+ /* Set up a rmapbt cursor for cross-referencing. */
+ if (xfs_has_rmapbt(mp)) {
+ sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
+ XFS_SCRUB_TYPE_RMAPBT);
+ }
+
+ /* Set up a refcountbt cursor for cross-referencing. */
+ if (xfs_has_reflink(mp)) {
+ sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
+ XFS_SCRUB_TYPE_REFCNTBT);
+ }
}
- /* Set up a refcountbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_has_reflink(mp) &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
- sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
- sa->agf_bp, sa->pag);
+ if (sa->agi_bp) {
+ /* Set up a inobt cursor for cross-referencing. */
+ sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
+ sa->agi_bp);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
+ XFS_SCRUB_TYPE_INOBT);
+
+ /* Set up a finobt cursor for cross-referencing. */
+ if (xfs_has_finobt(mp)) {
+ sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
+ sa->agi_bp);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
+ XFS_SCRUB_TYPE_FINOBT);
+ }
}
}
@@ -653,6 +690,13 @@ xchk_trans_cancel(
sc->tp = NULL;
}
+int
+xchk_trans_alloc_empty(
+ struct xfs_scrub *sc)
+{
+ return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+}
+
/*
* Grab an empty transaction so that we can re-grab locked buffers if
* one of our btrees turns out to be cyclic.
@@ -672,7 +716,7 @@ xchk_trans_alloc(
return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
resblks, 0, 0, &sc->tp);
- return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+ return xchk_trans_alloc_empty(sc);
}
/* Set us up with a transaction and an empty context. */
@@ -1000,9 +1044,7 @@ xchk_irele(
struct xfs_scrub *sc,
struct xfs_inode *ip)
{
- if (current->journal_info != NULL) {
- ASSERT(current->journal_info == sc->tp);
-
+ if (sc->tp) {
/*
* If we are in a transaction, we /cannot/ drop the inode
* ourselves, because the VFS will trigger writeback, which
@@ -1259,6 +1301,15 @@ xchk_fsgates_enable(
if (scrub_fsgates & XCHK_FSGATES_DRAIN)
xfs_drain_wait_enable();
+ if (scrub_fsgates & XCHK_FSGATES_QUOTA)
+ xfs_dqtrx_hook_enable();
+
+ if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
+ xfs_dir_hook_enable();
+
+ if (scrub_fsgates & XCHK_FSGATES_RMAP)
+ xfs_rmap_hook_enable();
+
sc->flags |= scrub_fsgates;
}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index da09580b454a..89f7bbec887e 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -32,6 +32,7 @@ xchk_should_terminate(
}
int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
+int xchk_trans_alloc_empty(struct xfs_scrub *sc);
void xchk_trans_cancel(struct xfs_scrub *sc);
bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
@@ -54,6 +55,10 @@ void xchk_block_set_corrupt(struct xfs_scrub *sc,
void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino);
void xchk_fblock_set_corrupt(struct xfs_scrub *sc, int whichfork,
xfs_fileoff_t offset);
+#ifdef CONFIG_XFS_QUOTA
+void xchk_qcheck_set_corrupt(struct xfs_scrub *sc, unsigned int dqtype,
+ xfs_dqid_t id);
+#endif
void xchk_block_xref_set_corrupt(struct xfs_scrub *sc,
struct xfs_buf *bp);
@@ -105,6 +110,7 @@ xchk_setup_rtsummary(struct xfs_scrub *sc)
#ifdef CONFIG_XFS_QUOTA
int xchk_ino_dqattach(struct xfs_scrub *sc);
int xchk_setup_quota(struct xfs_scrub *sc);
+int xchk_setup_quotacheck(struct xfs_scrub *sc);
#else
static inline int
xchk_ino_dqattach(struct xfs_scrub *sc)
@@ -116,12 +122,19 @@ xchk_setup_quota(struct xfs_scrub *sc)
{
return -ENOENT;
}
+static inline int
+xchk_setup_quotacheck(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
#endif
int xchk_setup_fscounters(struct xfs_scrub *sc);
+int xchk_setup_nlinks(struct xfs_scrub *sc);
void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa);
int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno,
struct xchk_ag *sa);
+int xchk_perag_drain_and_lock(struct xfs_scrub *sc);
/*
* Grab all AG resources, treating the inability to grab the perag structure as
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
index 1e82c727af8e..4de3f0f40f48 100644
--- a/fs/xfs/scrub/cow_repair.c
+++ b/fs/xfs/scrub/cow_repair.c
@@ -609,6 +609,6 @@ xrep_bmap_cow(
out_bitmap:
xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
xoff_bitmap_destroy(&xc->bad_fileoffs);
- kmem_free(xc);
+ kfree(xc);
return error;
}
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index d86ab51af928..076a310b8eb0 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -93,11 +93,11 @@ xchk_dir_actor(
return -ECANCELED;
}
- if (!strncmp(".", name->name, name->len)) {
+ if (xfs_dir2_samename(name, &xfs_name_dot)) {
/* If this is "." then check that the inum matches the dir. */
if (ino != dp->i_ino)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
- } else if (!strncmp("..", name->name, name->len)) {
+ } else if (xfs_dir2_samename(name, &xfs_name_dotdot)) {
/*
* If this is ".." in the root inode, check that the inum
* matches this dir.
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 5799e9a94f1f..d310737c8823 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -22,6 +22,7 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
+#include "scrub/fscounters.h"
/*
* FS Summary Counters
@@ -48,17 +49,6 @@
* our tolerance for mismatch between expected and actual counter values.
*/
-struct xchk_fscounters {
- struct xfs_scrub *sc;
- uint64_t icount;
- uint64_t ifree;
- uint64_t fdblocks;
- uint64_t frextents;
- unsigned long long icount_min;
- unsigned long long icount_max;
- bool frozen;
-};
-
/*
* Since the expected value computation is lockless but only browses incore
* values, the percpu counters should be fairly close to each other. However,
@@ -235,14 +225,19 @@ xchk_setup_fscounters(
* Pause all writer activity in the filesystem while we're scrubbing to
* reduce the likelihood of background perturbations to the counters
* throwing off our calculations.
+ *
+ * If we're repairing, we need to prevent any other thread from
+ * changing the global fs summary counters while we're repairing them.
+ * This requires the fs to be frozen, which will disable background
+ * reclaim and purge all inactive inodes.
*/
- if (sc->flags & XCHK_TRY_HARDER) {
+ if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) {
error = xchk_fscounters_freeze(sc);
if (error)
return error;
}
- return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+ return xchk_trans_alloc_empty(sc);
}
/*
@@ -254,7 +249,9 @@ xchk_setup_fscounters(
* set the INCOMPLETE flag even when a negative errno is returned. This care
* must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
* ECANCELED) that are absorbed into a scrub state flag update by
- * xchk_*_process_error.
+ * xchk_*_process_error. Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
*/
/* Count free space btree blocks manually for pre-lazysbcount filesystems. */
@@ -482,6 +479,10 @@ xchk_fscount_within_range(
if (curr_value == expected)
return true;
+ /* We require exact matches when repair is running. */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ return false;
+
min_value = min(old_value, curr_value);
max_value = max(old_value, curr_value);
diff --git a/fs/xfs/scrub/fscounters.h b/fs/xfs/scrub/fscounters.h
new file mode 100644
index 000000000000..461a13d25f4b
--- /dev/null
+++ b/fs/xfs/scrub/fscounters.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_FSCOUNTERS_H__
+#define __XFS_SCRUB_FSCOUNTERS_H__
+
+struct xchk_fscounters {
+ struct xfs_scrub *sc;
+ uint64_t icount;
+ uint64_t ifree;
+ uint64_t fdblocks;
+ uint64_t frextents;
+ unsigned long long icount_min;
+ unsigned long long icount_max;
+ bool frozen;
+};
+
+#endif /* __XFS_SCRUB_FSCOUNTERS_H__ */
diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
new file mode 100644
index 000000000000..94cdb852bee4
--- /dev/null
+++ b/fs/xfs/scrub/fscounters_repair.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_health.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/fscounters.h"
+
+/*
+ * FS Summary Counters
+ * ===================
+ *
+ * We correct errors in the filesystem summary counters by setting them to the
+ * values computed during the obligatory scrub phase. However, we must be
+ * careful not to allow any other thread to change the counters while we're
+ * computing and setting new values. To achieve this, we freeze the
+ * filesystem for the whole operation if the REPAIR flag is set. The checking
+ * function is stricter when we've frozen the fs.
+ */
+
+/*
+ * Reset the superblock counters. Caller is responsible for freezing the
+ * filesystem during the calculation and reset phases.
+ */
+int
+xrep_fscounters(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xchk_fscounters *fsc = sc->buf;
+
+ /*
+ * Reinitialize the in-core counters from what we computed. We froze
+ * the filesystem, so there shouldn't be anyone else trying to modify
+ * these counters.
+ */
+ if (!fsc->frozen) {
+ ASSERT(fsc->frozen);
+ return -EFSCORRUPTED;
+ }
+
+ trace_xrep_reset_counters(mp, fsc);
+
+ percpu_counter_set(&mp->m_icount, fsc->icount);
+ percpu_counter_set(&mp->m_ifree, fsc->ifree);
+ percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks);
+ percpu_counter_set(&mp->m_frextents, fsc->frextents);
+ mp->m_sb.sb_frextents = fsc->frextents;
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 531006910ca9..9020a6bef7f1 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -14,6 +14,7 @@
#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/health.h"
+#include "scrub/common.h"
/*
* Scrub and In-Core Filesystem Health Assessments
@@ -105,6 +106,8 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA },
[XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA },
[XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS },
+ [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK },
+ [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS },
};
/* Return the health status mask for this scrub type. */
@@ -148,6 +151,24 @@ xchk_file_looks_zapped(
}
/*
+ * Scrub gave the filesystem a clean bill of health, so clear all the indirect
+ * markers of past problems (at least for the fs and ags) so that we can be
+ * healthy again.
+ */
+STATIC void
+xchk_mark_all_healthy(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT);
+ xfs_rt_mark_healthy(mp, XFS_SICK_RT_INDIRECT);
+ for_each_perag(mp, agno, pag)
+ xfs_ag_mark_healthy(pag, XFS_SICK_AG_INDIRECT);
+}
+
+/*
* Update filesystem health assessments based on what we found and did.
*
* If the scrubber finds errors, we mark sick whatever's mentioned in
@@ -164,6 +185,18 @@ xchk_update_health(
struct xfs_perag *pag;
bool bad;
+ /*
+ * The HEALTHY scrub type is a request from userspace to clear all the
+ * indirect flags after a clean scan of the entire filesystem. As such
+ * there's no sick flag defined for it, so we branch here ahead of the
+ * mask check.
+ */
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_HEALTHY &&
+ !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+ xchk_mark_all_healthy(sc->mp);
+ return;
+ }
+
if (!sc->sick_mask)
return;
@@ -173,7 +206,7 @@ xchk_update_health(
case XHG_AG:
pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
if (bad)
- xfs_ag_mark_sick(pag, sc->sick_mask);
+ xfs_ag_mark_corrupt(pag, sc->sick_mask);
else
xfs_ag_mark_healthy(pag, sc->sick_mask);
xfs_perag_put(pag);
@@ -181,20 +214,30 @@ xchk_update_health(
case XHG_INO:
if (!sc->ip)
return;
- if (bad)
- xfs_inode_mark_sick(sc->ip, sc->sick_mask);
- else
+ if (bad) {
+ unsigned int mask = sc->sick_mask;
+
+ /*
+ * If we're coming in for repairs then we don't want
+ * sickness flags to propagate to the incore health
+ * status if the inode gets inactivated before we can
+ * fix it.
+ */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ mask |= XFS_SICK_INO_FORGET;
+ xfs_inode_mark_corrupt(sc->ip, mask);
+ } else
xfs_inode_mark_healthy(sc->ip, sc->sick_mask);
break;
case XHG_FS:
if (bad)
- xfs_fs_mark_sick(sc->mp, sc->sick_mask);
+ xfs_fs_mark_corrupt(sc->mp, sc->sick_mask);
else
xfs_fs_mark_healthy(sc->mp, sc->sick_mask);
break;
case XHG_RT:
if (bad)
- xfs_rt_mark_sick(sc->mp, sc->sick_mask);
+ xfs_rt_mark_corrupt(sc->mp, sc->sick_mask);
else
xfs_rt_mark_healthy(sc->mp, sc->sick_mask);
break;
@@ -205,13 +248,13 @@ xchk_update_health(
}
/* Is the given per-AG btree healthy enough for scanning? */
-bool
-xchk_ag_btree_healthy_enough(
+void
+xchk_ag_btree_del_cursor_if_sick(
struct xfs_scrub *sc,
- struct xfs_perag *pag,
- xfs_btnum_t btnum)
+ struct xfs_btree_cur **curp,
+ unsigned int sm_type)
{
- unsigned int mask = 0;
+ unsigned int mask = (*curp)->bc_ops->sick_mask;
/*
* We always want the cursor if it's the same type as whatever we're
@@ -220,41 +263,8 @@ xchk_ag_btree_healthy_enough(
* Otherwise, we're only interested in the btree for cross-referencing.
* If we know the btree is bad then don't bother, just set XFAIL.
*/
- switch (btnum) {
- case XFS_BTNUM_BNO:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT)
- return true;
- mask = XFS_SICK_AG_BNOBT;
- break;
- case XFS_BTNUM_CNT:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT)
- return true;
- mask = XFS_SICK_AG_CNTBT;
- break;
- case XFS_BTNUM_INO:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
- return true;
- mask = XFS_SICK_AG_INOBT;
- break;
- case XFS_BTNUM_FINO:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT)
- return true;
- mask = XFS_SICK_AG_FINOBT;
- break;
- case XFS_BTNUM_RMAP:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT)
- return true;
- mask = XFS_SICK_AG_RMAPBT;
- break;
- case XFS_BTNUM_REFC:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT)
- return true;
- mask = XFS_SICK_AG_REFCNTBT;
- break;
- default:
- ASSERT(0);
- return true;
- }
+ if (sc->sm->sm_type == sm_type)
+ return;
/*
* If we just repaired some AG metadata, sc->sick_mask will reflect all
@@ -266,10 +276,42 @@ xchk_ag_btree_healthy_enough(
type_to_health_flag[sc->sm->sm_type].group == XHG_AG)
mask &= ~sc->sick_mask;
- if (xfs_ag_has_sickness(pag, mask)) {
+ if (xfs_ag_has_sickness((*curp)->bc_ag.pag, mask)) {
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
- return false;
+ xfs_btree_del_cursor(*curp, XFS_BTREE_NOERROR);
+ *curp = NULL;
+ }
+}
+
+/*
+ * Quick scan to double-check that there isn't any evidence of lingering
+ * primary health problems. If we're still clear, then the health update will
+ * take care of clearing the indirect evidence.
+ */
+int
+xchk_health_record(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ unsigned int sick;
+ unsigned int checked;
+
+ xfs_fs_measure_sickness(mp, &sick, &checked);
+ if (sick & XFS_SICK_FS_PRIMARY)
+ xchk_set_corrupt(sc);
+
+ xfs_rt_measure_sickness(mp, &sick, &checked);
+ if (sick & XFS_SICK_RT_PRIMARY)
+ xchk_set_corrupt(sc);
+
+ for_each_perag(mp, agno, pag) {
+ xfs_ag_measure_sickness(pag, &sick, &checked);
+ if (sick & XFS_SICK_AG_PRIMARY)
+ xchk_set_corrupt(sc);
}
- return true;
+ return 0;
}
diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h
index a731b2467399..63fc426eb5ae 100644
--- a/fs/xfs/scrub/health.h
+++ b/fs/xfs/scrub/health.h
@@ -8,9 +8,10 @@
unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type);
void xchk_update_health(struct xfs_scrub *sc);
-bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag,
- xfs_btnum_t btnum);
+void xchk_ag_btree_del_cursor_if_sick(struct xfs_scrub *sc,
+ struct xfs_btree_cur **curp, unsigned int sm_type);
void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask);
bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask);
+int xchk_health_record(struct xfs_scrub *sc);
#endif /* __XFS_SCRUB_HEALTH_H__ */
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index a720fc62262a..750d7b0cd25a 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -76,7 +76,7 @@ xchk_inobt_xref_finobt(
int has_record;
int error;
- ASSERT(cur->bc_btnum == XFS_BTNUM_FINO);
+ ASSERT(xfs_btree_is_fino(cur->bc_ops));
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record);
if (error)
@@ -179,7 +179,7 @@ xchk_finobt_xref_inobt(
int has_record;
int error;
- ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
+ ASSERT(xfs_btree_is_ino(cur->bc_ops));
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record);
if (error)
@@ -514,7 +514,7 @@ xchk_iallocbt_rec_alignment(
* Otherwise, we expect that the finobt record is aligned to the
* cluster alignment as told by the superblock.
*/
- if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+ if (xfs_btree_is_fino(bs->cur->bc_ops)) {
unsigned int imask;
imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
@@ -649,8 +649,7 @@ out:
*/
STATIC void
xchk_iallocbt_xref_rmap_btreeblks(
- struct xfs_scrub *sc,
- int which)
+ struct xfs_scrub *sc)
{
xfs_filblks_t blocks;
xfs_extlen_t inobt_blocks = 0;
@@ -688,7 +687,6 @@ xchk_iallocbt_xref_rmap_btreeblks(
STATIC void
xchk_iallocbt_xref_rmap_inodes(
struct xfs_scrub *sc,
- int which,
unsigned long long inodes)
{
xfs_filblks_t blocks;
@@ -719,17 +717,14 @@ xchk_iallocbt(
.next_startino = NULLAGINO,
.next_cluster_ino = NULLAGINO,
};
- xfs_btnum_t which;
int error;
switch (sc->sm->sm_type) {
case XFS_SCRUB_TYPE_INOBT:
cur = sc->sa.ino_cur;
- which = XFS_BTNUM_INO;
break;
case XFS_SCRUB_TYPE_FINOBT:
cur = sc->sa.fino_cur;
- which = XFS_BTNUM_FINO;
break;
default:
ASSERT(0);
@@ -741,7 +736,7 @@ xchk_iallocbt(
if (error)
return error;
- xchk_iallocbt_xref_rmap_btreeblks(sc, which);
+ xchk_iallocbt_xref_rmap_btreeblks(sc);
/*
* If we're scrubbing the inode btree, inode_blocks is the number of
@@ -750,9 +745,8 @@ xchk_iallocbt(
* knows about. We can't do this for the finobt since it only points
* to inode chunks with free inodes.
*/
- if (which == XFS_BTNUM_INO)
- xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes);
-
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
+ xchk_iallocbt_xref_rmap_inodes(sc, iabt.inodes);
return error;
}
diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c
index b3f7182dd2f5..a00ec7ae1792 100644
--- a/fs/xfs/scrub/ialloc_repair.c
+++ b/fs/xfs/scrub/ialloc_repair.c
@@ -369,7 +369,7 @@ xrep_ibt_check_inode_ext(
* On a sparse inode fs, this cluster could be part of a sparse chunk.
* Sparse clusters must be aligned to sparse chunk alignment.
*/
- if (xfs_has_sparseinodes(mp) &&
+ if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align &&
(!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
!IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align)))
return -EFSCORRUPTED;
@@ -663,8 +663,8 @@ xrep_ibt_build_new_trees(
ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
ri->new_inobt.bload.get_records = xrep_ibt_get_records;
- ino_cur = xfs_inobt_stage_cursor(sc->sa.pag, &ri->new_inobt.afake,
- XFS_BTNUM_INO);
+ ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL);
+ xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake);
error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
xfarray_length(ri->inode_records));
if (error)
@@ -684,8 +684,8 @@ xrep_ibt_build_new_trees(
ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
ri->new_finobt.bload.get_records = xrep_fibt_get_records;
- fino_cur = xfs_inobt_stage_cursor(sc->sa.pag,
- &ri->new_finobt.afake, XFS_BTNUM_FINO);
+ fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL);
+ xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake);
error = xfs_btree_bload_compute_geometry(fino_cur,
&ri->new_finobt.bload, ri->finobt_recs);
if (error)
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 0ca62d59f84a..eab380e95ef4 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -37,12 +37,15 @@
#include "xfs_attr_leaf.h"
#include "xfs_log_priv.h"
#include "xfs_health.h"
+#include "xfs_symlink_remote.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/readdir.h"
/*
* Inode Record Repair
@@ -126,6 +129,10 @@ struct xrep_inode {
/* Must we remove all access from this file? */
bool zap_acls;
+
+ /* Inode scanner to see if we can find the ftype from dirents */
+ struct xchk_iscan ftype_iscan;
+ uint8_t alleged_ftype;
};
/*
@@ -227,26 +234,233 @@ xrep_dinode_header(
dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
}
-/* Turn di_mode into /something/ recognizable. */
-STATIC void
+/*
+ * If this directory entry points to the scrub target inode, then the directory
+ * we're scanning is the parent of the scrub target inode.
+ */
+STATIC int
+xrep_dinode_findmode_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xrep_inode *ri = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ if (ino != sc->sm->sm_ino)
+ return 0;
+
+ /* Garbage directory entry names mean this directory is corrupt. */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
+ return -EFSCORRUPTED;
+
+ /* Don't pick up dot or dotdot entries; we only want child dirents. */
+ if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
+ xfs_dir2_samename(name, &xfs_name_dot))
+ return 0;
+
+ /*
+ * Uhoh, more than one parent for this inode and they don't agree on
+ * the file type?
+ */
+ if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN &&
+ ri->alleged_ftype != name->type) {
+ trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type,
+ ri->alleged_ftype);
+ return -EFSCORRUPTED;
+ }
+
+ /* We found a potential parent; remember the ftype. */
+ trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type);
+ ri->alleged_ftype = name->type;
+ return 0;
+}
+
+/*
+ * If this is a directory, walk the dirents looking for any that point to the
+ * scrub target inode.
+ */
+STATIC int
+xrep_dinode_findmode_walk_directory(
+ struct xrep_inode *ri,
+ struct xfs_inode *dp)
+{
+ struct xfs_scrub *sc = ri->sc;
+ unsigned int lock_mode;
+ int error = 0;
+
+ /*
+ * Scan the directory to see if it contains an entry pointing to the
+ * inode that we are repairing.
+ */
+ lock_mode = xfs_ilock_data_map_shared(dp);
+
+ /*
+ * If this directory is known to be sick, we cannot scan it reliably
+ * and must abort.
+ */
+ if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
+ XFS_SICK_INO_BMBTD |
+ XFS_SICK_INO_DIR)) {
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ /*
+ * We cannot complete our parent pointer scan if a directory looks as
+ * though it has been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_unlock;
+ }
+
+ error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri);
+ if (error)
+ goto out_unlock;
+
+out_unlock:
+ xfs_iunlock(dp, lock_mode);
+ return error;
+}
+
+/*
+ * Try to find the mode of the inode being repaired by looking for directories
+ * that point down to this file.
+ */
+STATIC int
+xrep_dinode_find_mode(
+ struct xrep_inode *ri,
+ uint16_t *mode)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_inode *dp;
+ int error;
+
+ /* No ftype means we have no other metadata to consult. */
+ if (!xfs_has_ftype(sc->mp)) {
+ *mode = S_IFREG;
+ return 0;
+ }
+
+ /*
+ * Scan all directories for parents that might point down to this
+ * inode. Skip the inode being repaired during the scan since it
+ * cannot be its own parent. Note that we still hold the AGI locked
+ * so there's a real possibility that _iscan_iter can return EBUSY.
+ */
+ xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan);
+ ri->ftype_iscan.skip_ino = sc->sm->sm_ino;
+ ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN;
+ while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) {
+ if (S_ISDIR(VFS_I(dp)->i_mode))
+ error = xrep_dinode_findmode_walk_directory(ri, dp);
+ xchk_iscan_mark_visited(&ri->ftype_iscan, dp);
+ xchk_irele(sc, dp);
+ if (error < 0)
+ break;
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&ri->ftype_iscan);
+ xchk_iscan_teardown(&ri->ftype_iscan);
+
+ if (error == -EBUSY) {
+ if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) {
+ /*
+ * If we got an EBUSY after finding at least one
+ * dirent, that means the scan found an inode on the
+ * inactivation list and could not open it. Accept the
+ * alleged ftype and install a new mode below.
+ */
+ error = 0;
+ } else if (!(sc->flags & XCHK_TRY_HARDER)) {
+ /*
+ * Otherwise, retry the operation one time to see if
+ * the reason for the delay is an inode from the same
+ * cluster buffer waiting on the inactivation list.
+ */
+ error = -EDEADLOCK;
+ }
+ }
+ if (error)
+ return error;
+
+ /*
+ * Convert the discovered ftype into the file mode. If all else fails,
+ * return S_IFREG.
+ */
+ switch (ri->alleged_ftype) {
+ case XFS_DIR3_FT_DIR:
+ *mode = S_IFDIR;
+ break;
+ case XFS_DIR3_FT_WHT:
+ case XFS_DIR3_FT_CHRDEV:
+ *mode = S_IFCHR;
+ break;
+ case XFS_DIR3_FT_BLKDEV:
+ *mode = S_IFBLK;
+ break;
+ case XFS_DIR3_FT_FIFO:
+ *mode = S_IFIFO;
+ break;
+ case XFS_DIR3_FT_SOCK:
+ *mode = S_IFSOCK;
+ break;
+ case XFS_DIR3_FT_SYMLINK:
+ *mode = S_IFLNK;
+ break;
+ default:
+ *mode = S_IFREG;
+ break;
+ }
+ return 0;
+}
+
+/* Turn di_mode into /something/ recognizable. Returns 0 or a negative errno. */
+STATIC int
xrep_dinode_mode(
struct xrep_inode *ri,
struct xfs_dinode *dip)
{
struct xfs_scrub *sc = ri->sc;
uint16_t mode = be16_to_cpu(dip->di_mode);
+ int error;
trace_xrep_dinode_mode(sc, dip);
if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
- return;
+ return 0;
+
+ /* Try to fix the mode. If we cannot, then leave everything alone. */
+ error = xrep_dinode_find_mode(ri, &mode);
+ switch (error) {
+ case -EINTR:
+ case -EBUSY:
+ case -EDEADLOCK:
+ /* temporary failure or fatal signal */
+ return error;
+ case 0:
+ /* found mode */
+ break;
+ default:
+ /* some other error, assume S_IFREG */
+ mode = S_IFREG;
+ break;
+ }
/* bad mode, so we set it to a file that only root can read */
- mode = S_IFREG;
dip->di_mode = cpu_to_be16(mode);
dip->di_uid = 0;
dip->di_gid = 0;
ri->zap_acls = true;
+ return 0;
}
/* Fix any conflicting flags that the verifiers complain about. */
@@ -1107,12 +1321,15 @@ xrep_dinode_core(
/* Fix everything the verifier will complain about. */
dip = xfs_buf_offset(bp, ri->imap.im_boffset);
xrep_dinode_header(sc, dip);
- xrep_dinode_mode(ri, dip);
+ iget_error = xrep_dinode_mode(ri, dip);
+ if (iget_error)
+ goto write;
xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
xrep_dinode_size(ri, dip);
xrep_dinode_extsize_hints(sc, dip);
xrep_dinode_zap_forks(ri, dip);
+write:
/* Write out the inode. */
trace_xrep_dinode_fixed(sc, dip);
xfs_dinode_calc_crc(sc->mp, dip);
@@ -1128,7 +1345,8 @@ xrep_dinode_core(
* accessing the inode. If iget fails, we still need to commit the
* changes.
*/
- iget_error = xchk_iget(sc, ino, &sc->ip);
+ if (!iget_error)
+ iget_error = xchk_iget(sc, ino, &sc->ip);
if (!iget_error)
xchk_ilock(sc, XFS_IOLOCK_EXCL);
@@ -1496,6 +1714,13 @@ xrep_inode(
ASSERT(ri != NULL);
error = xrep_dinode_problems(ri);
+ if (error == -EBUSY) {
+ /*
+ * Directory scan to recover inode mode encountered a
+ * busy inode, so we did not continue repairing things.
+ */
+ return 0;
+ }
if (error)
return error;
diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c
new file mode 100644
index 000000000000..ec3478bc505e
--- /dev/null
+++ b/fs/xfs/scrub/iscan.c
@@ -0,0 +1,767 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_ag.h"
+#include "xfs_error.h"
+#include "xfs_bit.h"
+#include "xfs_icache.h"
+#include "scrub/scrub.h"
+#include "scrub/iscan.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * Live File Scan
+ * ==============
+ *
+ * Live file scans walk every inode in a live filesystem. This is more or
+ * less like a regular iwalk, except that when we're advancing the scan cursor,
+ * we must ensure that inodes cannot be added or deleted anywhere between the
+ * old cursor value and the new cursor value. If we're advancing the cursor
+ * by one inode, the caller must hold that inode; if we're finding the next
+ * inode to scan, we must grab the AGI and hold it until we've updated the
+ * scan cursor.
+ *
+ * Callers are expected to use this code to scan all files in the filesystem to
+ * construct a new metadata index of some kind. The scan races against other
+ * live updates, which means there must be a provision to update the new index
+ * when updates are made to inodes that have already been scanned. The iscan
+ * lock can be used in live update hook code to stop the scan and protect this
+ * data structure.
+ *
+ * To keep the new index up to date with other metadata updates being made to
+ * the live filesystem, it is assumed that the caller will add hooks as needed
+ * to be notified when a metadata update occurs. The inode scanner must tell
+ * the hook code when an inode has been visited with xchk_iscan_mark_visited.
+ * Hook functions can use xchk_iscan_want_live_update to decide if the
+ * scanner's observations must be updated.
+ */
+
+/*
+ * If @iscan->skip_ino lies in @pag within [@rec->ir_startino, @lastrecino],
+ * set its bit in the incore copy @rec->ir_free so the scan treats it as free.
+ */
+STATIC void
+xchk_iscan_mask_skipino(
+ struct xchk_iscan *iscan,
+ struct xfs_perag *pag,
+ struct xfs_inobt_rec_incore *rec,
+ xfs_agino_t lastrecino)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agnumber_t skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino);
+ xfs_agino_t skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino);
+
+ if (pag->pag_agno != skip_agno)
+ return;
+ if (skip_agino < rec->ir_startino)
+ return;
+ if (skip_agino > lastrecino)
+ return;
+
+ rec->ir_free |= xfs_inobt_maskn(skip_agino - rec->ir_startino, 1);
+}
+
+/*
+ * Set *cursor to the next allocated inode after whatever it's set to now.
+ * If there are no more inodes in this AG, cursor is set to NULLAGINO.
+ */
+STATIC int
+xchk_iscan_find_next(
+ struct xchk_iscan *iscan,
+ struct xfs_buf *agi_bp,
+ struct xfs_perag *pag,
+ xfs_inofree_t *allocmaskp,
+ xfs_agino_t *cursor,
+ uint8_t *nr_inodesp)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_btree_cur *cur;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = sc->tp;
+ xfs_agnumber_t agno = pag->pag_agno;
+ xfs_agino_t lastino = NULLAGINO;
+ xfs_agino_t first, last;
+ xfs_agino_t agino = *cursor;
+ int has_rec;
+ int error;
+
+ /* If the cursor is beyond the end of this AG, move to the next one. */
+ xfs_agino_range(mp, agno, &first, &last);
+ if (agino > last) {
+ *cursor = NULLAGINO;
+ return 0;
+ }
+
+ /*
+ * Look up the inode chunk for the current cursor position. If there
+ * is no chunk here, we want the next one.
+ */
+ cur = xfs_inobt_init_cursor(pag, tp, agi_bp);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_rec);
+ if (!error && !has_rec)
+ error = xfs_btree_increment(cur, 0, &has_rec);
+ for (; !error; error = xfs_btree_increment(cur, 0, &has_rec)) {
+ xfs_inofree_t allocmask;
+
+ /*
+ * If we've run out of inobt records in this AG, move the
+ * cursor on to the next AG and exit. The caller can try
+ * again with the next AG.
+ */
+ if (!has_rec) {
+ *cursor = NULLAGINO;
+ break;
+ }
+
+ error = xfs_inobt_get_rec(cur, &rec, &has_rec);
+ if (error)
+ break;
+ if (!has_rec) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ /* Make sure that we always move forward. */
+ if (lastino != NULLAGINO &&
+ XFS_IS_CORRUPT(mp, lastino >= rec.ir_startino)) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+ lastino = rec.ir_startino + XFS_INODES_PER_CHUNK - 1;
+
+ /*
+ * If this record only covers inodes that come before the
+ * cursor, advance to the next record.
+ */
+ if (rec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
+ continue;
+
+ if (iscan->skip_ino)
+ xchk_iscan_mask_skipino(iscan, pag, &rec, lastino);
+
+ /*
+ * If the incoming lookup put us in the middle of an inobt
+ * record, mark it and the previous inodes "free" so that the
+ * search for allocated inodes will start at the cursor.
+ * We don't care about ir_freecount here.
+ */
+ if (agino >= rec.ir_startino)
+ rec.ir_free |= xfs_inobt_maskn(0,
+ agino + 1 - rec.ir_startino);
+
+ /*
+ * If there are allocated inodes in this chunk, find them
+ * and update the scan cursor.
+ */
+ allocmask = ~rec.ir_free;
+ if (hweight64(allocmask) > 0) {
+ int next = xfs_lowbit64(allocmask);
+
+ ASSERT(next >= 0);
+ *cursor = rec.ir_startino + next;
+ *allocmaskp = allocmask >> next;
+ *nr_inodesp = XFS_INODES_PER_CHUNK - next;
+ break;
+ }
+ }
+
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Advance both the scan and the visited cursors.
+ *
+ * The inumber address space for a given filesystem is sparse, which means that
+ * the scan cursor can jump a long ways in a single iter() call. There are no
+ * inodes in these sparse areas, so we must move the visited cursor forward at
+ * the same time so that the scan user can receive live updates for inodes that
+ * may get created once we release the AGI buffer.
+ */
+static inline void
+xchk_iscan_move_cursor(
+ struct xchk_iscan *iscan,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t cursor, visited;
+
+ BUILD_BUG_ON(XFS_MAXINUMBER == NULLFSINO);
+
+ /*
+ * Special-case ino == 0 here so that we never set visited_ino to
+ * NULLFSINO when wrapping around EOFS, for that will let through all
+ * live updates.
+ */
+ cursor = XFS_AGINO_TO_INO(mp, agno, agino);
+ if (cursor == 0)
+ visited = XFS_MAXINUMBER;
+ else
+ visited = cursor - 1;
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = cursor;
+ iscan->__visited_ino = visited;
+ trace_xchk_iscan_move_cursor(iscan);
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Mark the scan finished by setting both the scan cursor and the visited
+ * cursor to NULLFSINO. Once the visited cursor is NULLFSINO, all live
+ * updates will be applied from now on.
+ */
+static inline void
+xchk_iscan_finish(
+ struct xchk_iscan *iscan)
+{
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = NULLFSINO;
+
+ /* All live updates will be applied from now on */
+ iscan->__visited_ino = NULLFSINO;
+
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Advance ino to the next inode that the inobt thinks is allocated, being
+ * careful to jump to the next AG if we've reached the right end of this AG's
+ * inode btree. Advancing ino effectively means that we've pushed the inode
+ * scan forward, so set the iscan cursor to (ino - 1) so that our live update
+ * predicates will track inode allocations in that part of the inode number
+ * key space once we release the AGI buffer.
+ *
+ * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
+ * -ECANCELED if the live scan aborted, or the usual negative errno.
+ */
+STATIC int
+xchk_iscan_advance(
+ struct xchk_iscan *iscan,
+ struct xfs_perag **pagp,
+ struct xfs_buf **agi_bpp,
+ xfs_inofree_t *allocmaskp,
+ uint8_t *nr_inodesp)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agi_bp;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ xfs_agino_t agino;
+ int ret;
+
+ ASSERT(iscan->cursor_ino >= iscan->__visited_ino);
+
+ do {
+ if (xchk_iscan_aborted(iscan))
+ return -ECANCELED;
+
+ agno = XFS_INO_TO_AGNO(mp, iscan->cursor_ino);
+ pag = xfs_perag_get(mp, agno);
+ if (!pag)
+ return -ECANCELED;
+
+ ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
+ if (ret)
+ goto out_pag;
+
+ agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino);
+ ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp,
+ &agino, nr_inodesp);
+ if (ret)
+ goto out_buf;
+
+ if (agino != NULLAGINO) {
+ /*
+ * Found the next inode in this AG, so return it along
+ * with the AGI buffer and the perag structure to
+ * ensure it cannot go away.
+ */
+ xchk_iscan_move_cursor(iscan, agno, agino);
+ *agi_bpp = agi_bp;
+ *pagp = pag;
+ return 1;
+ }
+
+ /*
+ * Did not find any more inodes in this AG, move on to the next
+ * AG.
+ */
+ agno = (agno + 1) % mp->m_sb.sb_agcount;
+ xchk_iscan_move_cursor(iscan, agno, 0);
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ trace_xchk_iscan_advance_ag(iscan);
+ } while (iscan->cursor_ino != iscan->scan_start_ino);
+
+ xchk_iscan_finish(iscan);
+ return 0;
+
+out_buf:
+ xfs_trans_brelse(sc->tp, agi_bp);
+out_pag:
+ xfs_perag_put(pag);
+ return ret;
+}
+
+/*
+ * Grabbing the inode failed, so we need to back up the scan and ask the caller
+ * to try to _advance the scan again. Returns -EBUSY if we've run out of retry
+ * opportunities, -ECANCELED if the process has a fatal signal pending, or
+ * -EAGAIN if we should try again.
+ */
+STATIC int
+xchk_iscan_iget_retry(
+ struct xchk_iscan *iscan,
+ bool wait)
+{
+ ASSERT(iscan->cursor_ino == iscan->__visited_ino + 1);
+
+ if (!iscan->iget_timeout ||
+ time_is_before_jiffies(iscan->__iget_deadline))
+ return -EBUSY;
+
+ if (wait) {
+ unsigned long relax;
+
+ /*
+ * Sleep for a period of time to let the rest of the system
+ * catch up. If we return early, someone sent a kill signal to
+ * the calling process.
+ */
+ relax = msecs_to_jiffies(iscan->iget_retry_delay);
+ trace_xchk_iscan_iget_retry_wait(iscan);
+
+ if (schedule_timeout_killable(relax) ||
+ xchk_iscan_aborted(iscan))
+ return -ECANCELED;
+ }
+
+ iscan->cursor_ino--;
+ return -EAGAIN;
+}
+
+/*
+ * Grab an inode as part of an inode scan. While scanning this inode, the
+ * caller must ensure that no other threads can modify the inode until a call
+ * to xchk_iscan_mark_visited completes.
+ *
+ * Returns the number of incore inodes grabbed; -EAGAIN if the caller should
+ * call xchk_iscan_advance again; -EBUSY if we couldn't grab an inode;
+ * -ECANCELED if there's a fatal signal pending; or some other negative errno.
+ */
+STATIC int
+xchk_iscan_iget(
+ struct xchk_iscan *iscan,
+ struct xfs_perag *pag,
+ struct xfs_buf *agi_bp,
+ xfs_inofree_t allocmask,
+ uint8_t nr_inodes)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t ino = iscan->cursor_ino;
+ unsigned int idx = 0;
+ unsigned int i;
+ int error;
+
+ ASSERT(iscan->__inodes[0] == NULL);
+
+ /* Fill the first slot in the inode array. */
+ error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+ &iscan->__inodes[idx]);
+
+ trace_xchk_iscan_iget(iscan, error);
+
+ if (error == -ENOENT || error == -EAGAIN) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ /*
+ * It's possible that this inode has lost all of its links but
+ * hasn't yet been inactivated. If we don't have a transaction
+ * or it's not writable, flush the inodegc workers and wait.
+ */
+ xfs_inodegc_flush(mp);
+ return xchk_iscan_iget_retry(iscan, true);
+ }
+
+ if (error == -EINVAL) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ /*
+ * We thought the inode was allocated, but the inode btree
+ * lookup failed, which means that it was freed since the last
+ * time we advanced the cursor. Back up and try again. This
+ * should never happen since we still hold the AGI buffer from the
+ * inobt check, but we need to be careful about infinite loops.
+ */
+ return xchk_iscan_iget_retry(iscan, false);
+ }
+
+ if (error) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ return error;
+ }
+ idx++;
+ ino++;
+ allocmask >>= 1;
+
+ /*
+ * Now that we've filled the first slot in __inodes, try to fill the
+ * rest of the batch with consecutively ordered inodes to reduce the
+ * number of _iter calls. Make a bitmap of unallocated inodes from the
+ * zeroes in the inuse bitmap; these inodes will not be scanned, but
+ * the _want_live_update predicate will pass through all live updates.
+ *
+ * If we can't iget an allocated inode, stop and return what we have.
+ */
+ mutex_lock(&iscan->lock);
+ iscan->__batch_ino = ino - 1;
+ iscan->__skipped_inomask = 0;
+ mutex_unlock(&iscan->lock);
+
+ for (i = 1; i < nr_inodes; i++, ino++, allocmask >>= 1) {
+ if (!(allocmask & 1)) {
+ ASSERT(!(iscan->__skipped_inomask & (1ULL << i)));
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = ino;
+ iscan->__skipped_inomask |= (1ULL << i);
+ mutex_unlock(&iscan->lock);
+ continue;
+ }
+
+ ASSERT(iscan->__inodes[idx] == NULL);
+
+ error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+ &iscan->__inodes[idx]);
+ if (error)
+ break;
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = ino;
+ mutex_unlock(&iscan->lock);
+ idx++;
+ }
+
+ trace_xchk_iscan_iget_batch(sc->mp, iscan, nr_inodes, idx);
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ return idx;
+}
+
+/*
+ * Advance the visit cursor to reflect skipped inodes beyond whatever we
+ * scanned.
+ */
+STATIC void
+xchk_iscan_finish_batch(
+ struct xchk_iscan *iscan)
+{
+ xfs_ino_t highest_skipped;
+
+ mutex_lock(&iscan->lock);
+
+ if (iscan->__batch_ino != NULLFSINO) {
+ highest_skipped = iscan->__batch_ino +
+ xfs_highbit64(iscan->__skipped_inomask);
+ iscan->__visited_ino = max(iscan->__visited_ino,
+ highest_skipped);
+
+ trace_xchk_iscan_skip(iscan);
+ }
+
+ iscan->__batch_ino = NULLFSINO;
+ iscan->__skipped_inomask = 0;
+
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Advance the inode scan cursor to the next allocated inode and return up to
+ * 64 consecutive allocated inodes starting with the cursor position.
+ */
+STATIC int
+xchk_iscan_iter_batch(
+ struct xchk_iscan *iscan)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ int ret;
+
+ xchk_iscan_finish_batch(iscan);
+
+ if (iscan->iget_timeout)
+ iscan->__iget_deadline = jiffies +
+ msecs_to_jiffies(iscan->iget_timeout);
+
+ do {
+ struct xfs_buf *agi_bp = NULL;
+ struct xfs_perag *pag = NULL;
+ xfs_inofree_t allocmask = 0;
+ uint8_t nr_inodes = 0;
+
+ ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask,
+ &nr_inodes);
+ if (ret != 1)
+ return ret;
+
+ if (xchk_iscan_aborted(iscan)) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ ret = -ECANCELED;
+ break;
+ }
+
+ ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask, nr_inodes);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
+/*
+ * Advance the inode scan cursor to the next allocated inode and return the
+ * incore inode structure associated with it.
+ *
+ * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
+ * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be
+ * grabbed, or the usual negative errno.
+ *
+ * If the function returns -EBUSY and the caller can handle skipping an inode,
+ * it may call this function again to continue the scan with the next allocated
+ * inode.
+ */
+int
+xchk_iscan_iter(
+ struct xchk_iscan *iscan,
+ struct xfs_inode **ipp)
+{
+ unsigned int i;
+ int error;
+
+ /* Find a cached inode, or go get another batch. */
+ for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+ if (iscan->__inodes[i])
+ goto foundit;
+ }
+
+ error = xchk_iscan_iter_batch(iscan);
+ if (error <= 0)
+ return error;
+
+ ASSERT(iscan->__inodes[0] != NULL);
+ i = 0;
+
+foundit:
+ /* Give the caller our reference. */
+ *ipp = iscan->__inodes[i];
+ iscan->__inodes[i] = NULL;
+ return 1;
+}
+
+/* Clean up an xchk_iscan_iter call by dropping any inodes that we still hold. */
+void
+xchk_iscan_iter_finish(
+ struct xchk_iscan *iscan)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ unsigned int i;
+
+ for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+ if (iscan->__inodes[i]) {
+ xchk_irele(sc, iscan->__inodes[i]);
+ iscan->__inodes[i] = NULL;
+ }
+ }
+}
+
+/* Mark this inode scan finished and release resources. */
+void
+xchk_iscan_teardown(
+ struct xchk_iscan *iscan)
+{
+ xchk_iscan_iter_finish(iscan);
+ xchk_iscan_finish(iscan);
+ mutex_destroy(&iscan->lock);
+}
+
+/* Pick an AG from which to start a scan. */
+static inline xfs_ino_t
+xchk_iscan_rotor(
+ struct xfs_mount *mp)
+{
+ static atomic_t agi_rotor;
+ unsigned int r = atomic_inc_return(&agi_rotor) - 1;
+
+ /*
+ * Rotoring *backwards* through the AGs, so we add one here before
+ * subtracting from the agcount to arrive at an AG number.
+ */
+ r = (r % mp->m_sb.sb_agcount) + 1;
+
+ return XFS_AGINO_TO_INO(mp, mp->m_sb.sb_agcount - r, 0);
+}
+
+/*
+ * Set ourselves up to start an inode scan. If the @iget_timeout and
+ * @iget_retry_delay parameters are set, the scan will try to iget each inode
+ * for @iget_timeout milliseconds. If an iget call indicates that the inode is
+ * waiting to be inactivated, the CPU will relax for @iget_retry_delay
+ * milliseconds after pushing the inactivation workers.
+ */
+void
+xchk_iscan_start(
+ struct xfs_scrub *sc,
+ unsigned int iget_timeout,
+ unsigned int iget_retry_delay,
+ struct xchk_iscan *iscan)
+{
+ xfs_ino_t start_ino;
+
+ start_ino = xchk_iscan_rotor(sc->mp);
+
+ iscan->__batch_ino = NULLFSINO;
+ iscan->__skipped_inomask = 0;
+
+ iscan->sc = sc;
+ clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+ iscan->iget_timeout = iget_timeout;
+ iscan->iget_retry_delay = iget_retry_delay;
+ iscan->__visited_ino = start_ino;
+ iscan->cursor_ino = start_ino;
+ iscan->scan_start_ino = start_ino;
+ mutex_init(&iscan->lock);
+ memset(iscan->__inodes, 0, sizeof(iscan->__inodes));
+
+ trace_xchk_iscan_start(iscan, start_ino);
+}
+
+/*
+ * Mark this inode as having been visited. Callers must hold a sufficiently
+ * exclusive lock on the inode to prevent concurrent modifications.
+ */
+void
+xchk_iscan_mark_visited(
+ struct xchk_iscan *iscan,
+ struct xfs_inode *ip)
+{
+ mutex_lock(&iscan->lock);
+ iscan->__visited_ino = ip->i_ino;
+ trace_xchk_iscan_visit(iscan);
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Did we skip this inode because it wasn't allocated when we loaded the batch?
+ * If so, it is newly allocated and will not be scanned. All live updates to
+ * this inode must be passed to the caller to maintain scan correctness.
+ */
+static inline bool
+xchk_iscan_skipped(
+ const struct xchk_iscan *iscan,
+ xfs_ino_t ino)
+{
+ if (iscan->__batch_ino == NULLFSINO)
+ return false;
+ if (ino < iscan->__batch_ino)
+ return false;
+ if (ino >= iscan->__batch_ino + XFS_INODES_PER_CHUNK)
+ return false;
+
+ return iscan->__skipped_inomask & (1ULL << (ino - iscan->__batch_ino));
+}
+
+/*
+ * Do we need a live update for this inode? This is true if the scanner thread
+ * has visited this inode and the scan hasn't been aborted due to errors.
+ * Callers must hold a sufficiently exclusive lock on the inode to prevent
+ * scanners from reading any inode metadata.
+ */
+bool
+xchk_iscan_want_live_update(
+ struct xchk_iscan *iscan,
+ xfs_ino_t ino)
+{
+ bool ret = false;
+
+ if (xchk_iscan_aborted(iscan))
+ return false;
+
+ mutex_lock(&iscan->lock);
+
+ trace_xchk_iscan_want_live_update(iscan, ino);
+
+ /* Scan is finished, caller should receive all updates. */
+ if (iscan->__visited_ino == NULLFSINO) {
+ ret = true;
+ goto unlock;
+ }
+
+ /*
+ * No inodes have been visited yet, so the visited cursor points at the
+ * start of the scan range. The caller should not receive any updates.
+ */
+ if (iscan->scan_start_ino == iscan->__visited_ino) {
+ ret = false;
+ goto unlock;
+ }
+
+ /*
+ * This inode was not allocated at the time of the iscan batch.
+ * The caller should receive all updates.
+ */
+ if (xchk_iscan_skipped(iscan, ino)) {
+ ret = true;
+ goto unlock;
+ }
+
+ /*
+ * The visited cursor hasn't yet wrapped around the end of the FS. If
+ * @ino is inside the starred range, the caller should receive updates:
+ *
+ * 0 ------------ S ************ V ------------ EOFS
+ */
+ if (iscan->scan_start_ino <= iscan->__visited_ino) {
+ if (ino >= iscan->scan_start_ino &&
+ ino <= iscan->__visited_ino)
+ ret = true;
+
+ goto unlock;
+ }
+
+ /*
+ * The visited cursor wrapped around the end of the FS. If @ino is
+ * inside the starred range, the caller should receive updates:
+ *
+ * 0 ************ V ------------ S ************ EOFS
+ */
+ if (ino >= iscan->scan_start_ino || ino <= iscan->__visited_ino)
+ ret = true;
+
+unlock:
+ mutex_unlock(&iscan->lock);
+ return ret;
+}
diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h
new file mode 100644
index 000000000000..71f657552dfa
--- /dev/null
+++ b/fs/xfs/scrub/iscan.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_ISCAN_H__
+#define __XFS_SCRUB_ISCAN_H__
+
+struct xchk_iscan {
+ struct xfs_scrub *sc; /* NOTE(review): presumably the scrub context given to xchk_iscan_start(); confirm in iscan.c */
+
+ /*
+ * Lock to protect the scan cursor.  xchk_iscan_want_live_update() holds
+ * it while comparing an inode number against scan_start_ino and
+ * __visited_ino.
+ */
+ struct mutex lock;
+
+ /*
+ * This is the first inode in the inumber address space that we
+ * examined. When the scan wraps around back to here, the scan is
+ * finished.
+ */
+ xfs_ino_t scan_start_ino;
+
+ /* This is the inode that will be examined next. */
+ xfs_ino_t cursor_ino;
+
+ /* If nonzero and non-NULL, skip this inode when scanning. */
+ xfs_ino_t skip_ino;
+
+ /*
+ * This is the last inode that we've successfully scanned, either
+ * because the caller scanned it, or we moved the cursor past an empty
+ * part of the inode address space. Scan callers should only use the
+ * xchk_iscan_mark_visited function to modify this.
+ *
+ * The live update filter treats NULLFSINO here as "scan finished" and
+ * a value equal to scan_start_ino as "nothing visited yet".
+ */
+ xfs_ino_t __visited_ino;
+
+ /*
+ * Operational state of the livescan.  This is a bitmap indexed by the
+ * XCHK_ISCAN_OPSTATE_* bit numbers and is accessed with test_bit() and
+ * set_bit().
+ */
+ unsigned long __opstate;
+
+ /* Give up on iterating @cursor_ino if we can't iget it by this time. */
+ unsigned long __iget_deadline;
+
+ /* Amount of time (in ms) that we will try to iget an inode. */
+ unsigned int iget_timeout;
+
+ /* Wait this many ms to retry an iget. */
+ unsigned int iget_retry_delay;
+
+ /*
+ * The scan grabs batches of inodes and stashes them here before
+ * handing them out with _iter. Unallocated inodes are set in the
+ * mask so that all updates to that inode are selected for live
+ * update propagation.
+ *
+ * Bit (ino - __batch_ino) of __skipped_inomask is the one tested for
+ * inode number @ino.
+ */
+ xfs_ino_t __batch_ino;
+ xfs_inofree_t __skipped_inomask;
+ struct xfs_inode *__inodes[XFS_INODES_PER_CHUNK];
+};
+
+/* Set if the scan has been aborted due to some event in the fs. */
+#define XCHK_ISCAN_OPSTATE_ABORTED (1)
+
+/* Report whether the ABORTED bit is set in this scan's operational state. */
+static inline bool
+xchk_iscan_aborted(const struct xchk_iscan *iscan)
+{
+	const unsigned long	*opstate = &iscan->__opstate;
+
+	return test_bit(XCHK_ISCAN_OPSTATE_ABORTED, opstate);
+}
+
+/* Set the ABORTED bit in this scan's operational state. */
+static inline void
+xchk_iscan_abort(struct xchk_iscan *iscan)
+{
+	unsigned long		*opstate = &iscan->__opstate;
+
+	set_bit(XCHK_ISCAN_OPSTATE_ABORTED, opstate);
+}
+
+void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout,
+ unsigned int iget_retry_delay, struct xchk_iscan *iscan);
+void xchk_iscan_teardown(struct xchk_iscan *iscan);
+
+int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp);
+void xchk_iscan_iter_finish(struct xchk_iscan *iscan);
+
+void xchk_iscan_mark_visited(struct xchk_iscan *iscan, struct xfs_inode *ip);
+bool xchk_iscan_want_live_update(struct xchk_iscan *iscan, xfs_ino_t ino);
+
+#endif /* __XFS_SCRUB_ISCAN_H__ */
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index bb6d980b4fcd..4a0271123d94 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -239,7 +239,11 @@ xrep_newbt_alloc_ag_blocks(
xrep_newbt_validate_ag_alloc_hint(xnr);
- error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint);
+ if (xnr->alloc_vextent)
+ error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
+ else
+ error = xfs_alloc_vextent_near_bno(&args,
+ xnr->alloc_hint);
if (error)
return error;
if (args.fsbno == NULLFSBLOCK)
@@ -309,7 +313,11 @@ xrep_newbt_alloc_file_blocks(
xrep_newbt_validate_file_alloc_hint(xnr);
- error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint);
+ if (xnr->alloc_vextent)
+ error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
+ else
+ error = xfs_alloc_vextent_start_ag(&args,
+ xnr->alloc_hint);
if (error)
return error;
if (args.fsbno == NULLFSBLOCK)
@@ -535,7 +543,7 @@ xrep_newbt_claim_block(
trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
xnr->oinfo.oi_owner);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
agbno));
else
diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h
index 89f8e3970b1f..3d804d31af24 100644
--- a/fs/xfs/scrub/newbt.h
+++ b/fs/xfs/scrub/newbt.h
@@ -6,6 +6,8 @@
#ifndef __XFS_SCRUB_NEWBT_H__
#define __XFS_SCRUB_NEWBT_H__
+struct xfs_alloc_arg;
+
struct xrep_newbt_resv {
/* Link to list of extents that we've reserved. */
struct list_head list;
@@ -28,6 +30,11 @@ struct xrep_newbt_resv {
struct xrep_newbt {
struct xfs_scrub *sc;
+ /* Custom allocation function, or NULL for xfs_alloc_vextent */
+ int (*alloc_vextent)(struct xfs_scrub *sc,
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t alloc_hint);
+
/* List of extents that we've reserved. */
struct list_head resv_list;
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
new file mode 100644
index 000000000000..8a7d9557897c
--- /dev/null
+++ b/fs/xfs/scrub/nlinks.c
@@ -0,0 +1,930 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/nlinks.h"
+#include "scrub/trace.h"
+#include "scrub/readdir.h"
+
+/*
+ * Live Inode Link Count Checking
+ * ==============================
+ *
+ * Inode link counts are "summary" metadata, in the sense that they are
+ * computed as the number of directory entries referencing each file on the
+ * filesystem. Therefore, we compute the correct link counts by creating a
+ * shadow link count structure and walking every inode.
+ */
+
+/*
+ * Set us up to scrub inode link counts: enable the DIRENTS fsgate, attach a
+ * zeroed link count control structure to sc->buf, and finish with
+ * xchk_setup_fs().  Returns -ENOMEM if the control structure cannot be
+ * allocated, otherwise whatever xchk_setup_fs() returns.
+ */
+int
+xchk_setup_nlinks(
+	struct xfs_scrub	*sc)
+{
+	struct xchk_nlink_ctrs	*ctrs;
+
+	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+	ctrs = kzalloc(sizeof(*ctrs), XCHK_GFP_FLAGS);
+	sc->buf = ctrs;
+	if (ctrs == NULL)
+		return -ENOMEM;
+
+	return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 1: Collecting file link counts. For each file, we create a shadow link
+ * counting structure, then walk the entire directory tree, incrementing parent
+ * and child link counts for each directory entry seen.
+ *
+ * To avoid false corruption reports in part 2, any failure in this part must
+ * set the INCOMPLETE flag even when a negative errno is returned. This care
+ * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
+ * ECANCELED) that are absorbed into a scrub state flag update by
+ * xchk_*_process_error. Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
+ *
+ * Because we are scanning a live filesystem, it's possible that another thread
+ * will try to update the link counts for an inode that we've already scanned.
+ * This will cause our counts to be incorrect. Therefore, we hook all
+ * directory entry updates because that is when link count updates occur. By
+ * shadowing transaction updates in this manner, live nlink check can ensure by
+ * locking the inode and the shadow structure that its own copies are not out
+ * of date. Because the hook code runs in a different process context from the
+ * scrub code and the scrub state flags are not accessed atomically, failures
+ * in the hook code must abort the iscan and the scrubber must notice the
+ * aborted scan and set the incomplete flag.
+ *
+ * Note that we use jump labels and srcu notifier hooks to minimize the
+ * overhead when live nlinks is /not/ running. Locking order for nlink
+ * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock.
+ */
+
+/*
+ * Add @delta to the nlink counter at @nlinkp, saturating at U32_MAX.  The
+ * sum is computed in 64 bits so that anything that does not fit in 32 bits
+ * is clamped.  XFS_MAXLINK < U32_MAX, so the checking code still produces
+ * the correct results even if we lose some precision here.
+ */
+static inline void
+careful_add(
+	xfs_nlink_t		*nlinkp,
+	int			delta)
+{
+	uint64_t		sum;
+
+	BUILD_BUG_ON(XFS_MAXLINK > U32_MAX);
+
+	sum = (uint64_t)(*nlinkp) + delta;
+	if (sum > U32_MAX)
+		sum = U32_MAX;
+	*nlinkp = sum;
+}
+
+/*
+ * Apply the given deltas to the incore link count record of @ino and mark
+ * the record as written.  Caller must hold the nlinks lock.  This is a no-op
+ * if there is no shadow array.  Returns 0 or a negative errno; -EFBIG from
+ * the store is reported as -ECANCELED.
+ */
+STATIC int
+xchk_nlinks_update_incore(
+	struct xchk_nlink_ctrs	*xnc,
+	xfs_ino_t		ino,
+	int			parents_delta,
+	int			backrefs_delta,
+	int			children_delta)
+{
+	struct xchk_nlink	rec;
+	int			ret;
+
+	if (xnc->nlinks == NULL)
+		return 0;
+
+	ret = xfarray_load_sparse(xnc->nlinks, ino, &rec);
+	if (ret != 0)
+		return ret;
+
+	trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &rec, parents_delta,
+			backrefs_delta, children_delta);
+
+	careful_add(&rec.parents, parents_delta);
+	careful_add(&rec.backrefs, backrefs_delta);
+	careful_add(&rec.children, children_delta);
+	rec.flags |= XCHK_NLINK_WRITTEN;
+
+	ret = xfarray_store(xnc->nlinks, ino, &rec);
+
+	/*
+	 * EFBIG means we tried to store data at too high a byte offset in the
+	 * sparse array.  IOWs, we cannot complete the check and must notify
+	 * userspace that the check was incomplete.
+	 */
+	return ret == -EFBIG ? -ECANCELED : ret;
+}
+
+/*
+ * Directory update notifier: mirror a link count change made by the regular
+ * filesystem into our shadow link count structure.  Any failure to update
+ * the shadow data aborts the collection iscan.  Always returns NOTIFY_DONE.
+ */
+STATIC int
+xchk_nlinks_live_update(
+	struct notifier_block		*nb,
+	unsigned long			action,
+	void				*data)
+{
+	struct xfs_dir_update_params	*p = data;
+	struct xchk_nlink_ctrs		*xnc;
+	int				error;
+
+	xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb);
+
+	trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino,
+			p->delta, p->name->name, p->name->len);
+
+	/*
+	 * If we've already scanned @dp, update the number of parents that link
+	 * to @ip.  If @ip is a subdirectory, update the number of child links
+	 * going out of @dp.
+	 */
+	if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) {
+		mutex_lock(&xnc->lock);
+		error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta,
+				0, 0);
+		if (error == 0 && S_ISDIR(VFS_IC(p->ip)->i_mode))
+			error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
+					0, p->delta);
+		mutex_unlock(&xnc->lock);
+		if (error != 0) {
+			xchk_iscan_abort(&xnc->collect_iscan);
+			return NOTIFY_DONE;
+		}
+	}
+
+	/*
+	 * If @ip is a subdirectory and we've already scanned it, update the
+	 * number of backrefs pointing to @dp.  Otherwise we're done.
+	 */
+	if (!S_ISDIR(VFS_IC(p->ip)->i_mode))
+		return NOTIFY_DONE;
+	if (!xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino))
+		return NOTIFY_DONE;
+
+	mutex_lock(&xnc->lock);
+	error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0, p->delta, 0);
+	mutex_unlock(&xnc->lock);
+	if (error != 0)
+		xchk_iscan_abort(&xnc->collect_iscan);
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * Bump the observed link count for the inode referenced by this entry.
+ *
+ * This is handed to xchk_dir_walk() as the per-entry callback, with @priv
+ * pointing at the struct xchk_nlink_ctrs.  Implausible entries (bad name, a
+ * '.' that points elsewhere, an invalid inode number) and failed shadow
+ * updates abort the collection iscan.  Every failure path, including an
+ * already-aborted scan, marks the scrub incomplete.  Returns 0 on success or
+ * a negative errno.
+ */
+STATIC int
+xchk_nlinks_collect_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xchk_nlink_ctrs *xnc = priv;
+ bool dot = false, dotdot = false;
+ int error;
+
+ /* Does this name make sense? */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) {
+ error = -ECANCELED;
+ goto out_abort;
+ }
+
+ if (name->len == 1 && name->name[0] == '.')
+ dot = true;
+ else if (name->len == 2 && name->name[0] == '.' &&
+ name->name[1] == '.')
+ dotdot = true;
+
+ /* Don't accept a '.' entry that points somewhere else. */
+ if (dot && ino != dp->i_ino) {
+ error = -ECANCELED;
+ goto out_abort;
+ }
+
+ /* Don't accept an invalid inode number. */
+ if (!xfs_verify_dir_ino(sc->mp, ino)) {
+ error = -ECANCELED;
+ goto out_abort;
+ }
+
+ /*
+ * Update the shadow link counts if we haven't already failed.  The
+ * scan is already aborted in that case, so only the incomplete flag
+ * needs to be set.
+ */
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ error = -ECANCELED;
+ goto out_incomplete;
+ }
+
+ trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name);
+
+ mutex_lock(&xnc->lock);
+
+ /*
+ * If this is a dotdot entry, it is a back link from dp to ino. How
+ * we handle this depends on whether or not dp is the root directory.
+ *
+ * The root directory is its own parent, so we pretend the dotdot entry
+ * establishes the "parent" of the root directory. Increment the
+ * number of parents of the root directory.
+ *
+ * Otherwise, increment the number of backrefs pointing back to ino.
+ */
+ if (dotdot) {
+ if (dp == sc->mp->m_rootip)
+ error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+ else
+ error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0);
+ if (error)
+ goto out_unlock;
+ }
+
+ /*
+ * If this dirent is a forward link from dp to ino, increment the
+ * number of parents linking into ino.
+ */
+ if (!dot && !dotdot) {
+ error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+ if (error)
+ goto out_unlock;
+ }
+
+ /*
+ * If this dirent is a forward link to a subdirectory, increment the
+ * number of child links of dp.
+ */
+ if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) {
+ error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1);
+ if (error)
+ goto out_unlock;
+ }
+
+ mutex_unlock(&xnc->lock);
+ return 0;
+
+out_unlock: /* the error labels fall through: unlock, abort, mark incomplete */
+ mutex_unlock(&xnc->lock);
+out_abort:
+ xchk_iscan_abort(&xnc->collect_iscan);
+out_incomplete:
+ xchk_set_incomplete(sc);
+ return error;
+}
+
+/*
+ * Walk a directory to bump the observed link counts of the children.
+ *
+ * The directory is held with IOLOCK_SHARED plus the shared data map ILOCK
+ * mode for the whole walk.  A directory is only marked visited in the
+ * collection iscan after a successful walk.  Unlinked directories are
+ * skipped, and -ECANCELED from the walk is turned into 0; neither case marks
+ * the directory visited.  Any other failure marks the scrub incomplete and
+ * aborts the iscan.
+ */
+STATIC int
+xchk_nlinks_collect_dir(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode *dp)
+{
+ struct xfs_scrub *sc = xnc->sc;
+ unsigned int lock_mode;
+ int error = 0;
+
+ /* Prevent anyone from changing this directory while we walk it. */
+ xfs_ilock(dp, XFS_IOLOCK_SHARED);
+ lock_mode = xfs_ilock_data_map_shared(dp);
+
+ /*
+ * The dotdot entry of an unlinked directory still points to the last
+ * parent, but the parent no longer links to this directory. Skip the
+ * directory to avoid overcounting.
+ */
+ if (VFS_I(dp)->i_nlink == 0)
+ goto out_unlock;
+
+ /*
+ * We cannot count file links if the directory looks as though it has
+ * been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_abort;
+ }
+
+ error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc);
+ if (error == -ECANCELED) {
+ error = 0; /* the dirent callback sets the incomplete flag on its own -ECANCELED paths */
+ goto out_unlock;
+ }
+ if (error)
+ goto out_abort;
+
+ xchk_iscan_mark_visited(&xnc->collect_iscan, dp);
+ goto out_unlock;
+
+out_abort:
+ xchk_set_incomplete(sc);
+ xchk_iscan_abort(&xnc->collect_iscan);
+out_unlock:
+ xfs_iunlock(dp, lock_mode);
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ return error;
+}
+
+/*
+ * Count one parent link for @ino if it passes xfs_verify_ino(); inode
+ * numbers that fail verification are ignored.  Returns 0 or a negative
+ * errno from the incore update.
+ */
+static inline int
+xchk_nlinks_collect_metafile(
+	struct xchk_nlink_ctrs	*xnc,
+	xfs_ino_t		ino)
+{
+	struct xfs_mount	*mp = xnc->sc->mp;
+
+	if (xfs_verify_ino(mp, ino)) {
+		trace_xchk_nlinks_collect_metafile(mp, ino);
+		return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+	}
+
+	return 0;
+}
+
+/*
+ * Bump the link counts of the metadata files rooted in the superblock: the
+ * realtime bitmap and summary inodes and the three quota inodes.  If the
+ * collection scan is already aborted, mark the scrub incomplete and return
+ * -ECANCELED.  If any update fails, abort the scan, mark the scrub
+ * incomplete, and return the error.
+ */
+STATIC int
+xchk_nlinks_collect_metafiles(
+	struct xchk_nlink_ctrs	*xnc)
+{
+	struct xfs_mount	*mp = xnc->sc->mp;
+	int			error;
+
+	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+		xchk_set_incomplete(xnc->sc);
+		return -ECANCELED;
+	}
+
+	/* Stop at the first failure; later files are not attempted. */
+	mutex_lock(&xnc->lock);
+	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino);
+	if (!error)
+		error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino);
+	if (!error)
+		error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino);
+	if (!error)
+		error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino);
+	if (!error)
+		error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino);
+	mutex_unlock(&xnc->lock);
+
+	if (error) {
+		xchk_iscan_abort(&xnc->collect_iscan);
+		xchk_set_incomplete(xnc->sc);
+	}
+	return error;
+}
+
+/*
+ * Non-directories contribute no directory entries, so all we do is advance
+ * the collection scan cursor past @ip while holding its IOLOCK shared.
+ * Always returns 0.
+ */
+static inline int
+xchk_nlinks_collect_file(
+	struct xchk_nlink_ctrs	*xnc,
+	struct xfs_inode	*ip)
+{
+	const unsigned int	lock_flags = XFS_IOLOCK_SHARED;
+
+	xfs_ilock(ip, lock_flags);
+	xchk_iscan_mark_visited(&xnc->collect_iscan, ip);
+	xfs_iunlock(ip, lock_flags);
+
+	return 0;
+}
+
+/*
+ * Walk all directories and count inode links.  Any failure marks the scrub
+ * incomplete; -EBUSY is reported to the caller as -ECANCELED.  On success
+ * the empty transaction used for the scan is replaced via xchk_setup_fs().
+ */
+STATIC int
+xchk_nlinks_collect(
+	struct xchk_nlink_ctrs	*xnc)
+{
+	struct xfs_scrub	*sc = xnc->sc;
+	struct xfs_inode	*ip;
+	int			error;
+
+	/* Count the rt and quota files that are rooted in the superblock. */
+	error = xchk_nlinks_collect_metafiles(xnc);
+	if (error != 0)
+		return error;
+
+	/*
+	 * Set up for a potentially lengthy filesystem scan by reducing our
+	 * transaction resource usage for the duration.  Specifically:
+	 *
+	 * Cancel the transaction to release the log grant space while we scan
+	 * the filesystem.
+	 *
+	 * Create a new empty transaction to eliminate the possibility of the
+	 * inode scan deadlocking on cyclical metadata.
+	 *
+	 * We pass the empty transaction to the file scanning function to avoid
+	 * repeatedly cycling empty transactions.  This can be done even though
+	 * we take the IOLOCK to quiesce the file because empty transactions
+	 * do not take sb_internal.
+	 */
+	xchk_trans_cancel(sc);
+	error = xchk_trans_alloc_empty(sc);
+	if (error != 0)
+		return error;
+
+	for (;;) {
+		/* The iterator returns 1 for each inode that it hands out. */
+		error = xchk_iscan_iter(&xnc->collect_iscan, &ip);
+		if (error != 1)
+			break;
+
+		if (S_ISDIR(VFS_I(ip)->i_mode))
+			error = xchk_nlinks_collect_dir(xnc, ip);
+		else
+			error = xchk_nlinks_collect_file(xnc, ip);
+		xchk_irele(sc, ip);
+		if (error != 0)
+			break;
+
+		if (xchk_should_terminate(sc, &error))
+			break;
+	}
+	xchk_iscan_iter_finish(&xnc->collect_iscan);
+
+	if (error != 0) {
+		xchk_set_incomplete(sc);
+
+		/*
+		 * If we couldn't grab an inode that was busy with a state
+		 * change, change the error code so that we exit to userspace
+		 * as quickly as possible.
+		 */
+		return error == -EBUSY ? -ECANCELED : error;
+	}
+
+	/*
+	 * Switch out for a real transaction in preparation for building a new
+	 * tree.
+	 */
+	xchk_trans_cancel(sc);
+	return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 2: Comparing file link counters. Walk each inode and compare the link
+ * counts against our shadow information; and then walk each shadow link count
+ * structure (that wasn't covered in the first part), comparing it against the
+ * file.
+ */
+
+/*
+ * Read the observed link count of @ino for comparison with the actual inode,
+ * and flag the stored record as having been compared.  Only the counters
+ * are copied to @obs; its flags are zeroed.  Returns 0 or a negative errno.
+ */
+STATIC int
+xchk_nlinks_comparison_read(
+	struct xchk_nlink_ctrs	*xnc,
+	xfs_ino_t		ino,
+	struct xchk_nlink	*obs)
+{
+	struct xchk_nlink	rec;
+	int			ret;
+
+	ret = xfarray_load_sparse(xnc->nlinks, ino, &rec);
+	if (ret != 0)
+		return ret;
+
+	rec.flags |= XCHK_NLINK_COMPARE_SCANNED;
+	rec.flags |= XCHK_NLINK_WRITTEN;
+
+	ret = xfarray_store(xnc->nlinks, ino, &rec);
+	switch (ret) {
+	case 0:
+		break;
+	case -EFBIG:
+		/*
+		 * EFBIG means we tried to store data at too high a byte offset
+		 * in the sparse array.  IOWs, we cannot complete the check and
+		 * must notify userspace that the check was incomplete.  This
+		 * shouldn't really happen outside of the collection phase.
+		 */
+		xchk_set_incomplete(xnc->sc);
+		return -ECANCELED;
+	default:
+		return ret;
+	}
+
+	/* Copy the counters, but do not expose the internal state. */
+	obs->flags = 0;
+	obs->parents = rec.parents;
+	obs->backrefs = rec.backrefs;
+	obs->children = rec.children;
+	return 0;
+}
+
+/*
+ * Check our link count against an inode.
+ *
+ * Takes the inode's ILOCK shared and then the nlinks lock for the duration
+ * of the comparison.  Returns -ECANCELED (with the scrub marked incomplete)
+ * if the collection scan was aborted, a negative errno if the observation
+ * cannot be read, -ECANCELED if the scrub's CORRUPT flag is set by the time
+ * the checks are done, and 0 otherwise.
+ */
+STATIC int
+xchk_nlinks_compare_inode(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode *ip)
+{
+ struct xchk_nlink obs;
+ struct xfs_scrub *sc = xnc->sc;
+ uint64_t total_links;
+ unsigned int actual_nlink;
+ int error;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ mutex_lock(&xnc->lock);
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ xchk_set_incomplete(xnc->sc);
+ error = -ECANCELED;
+ goto out_scanlock;
+ }
+
+ error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs);
+ if (error)
+ goto out_scanlock;
+
+ /*
+ * If we don't have ftype to get an accurate count of the subdirectory
+ * entries in this directory, take advantage of the fact that on a
+ * consistent ftype=0 filesystem, the number of subdirectory
+ * backreferences (dotdot entries) pointing towards this directory
+ * should be equal to the number of subdirectory entries in the
+ * directory.
+ */
+ if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode))
+ obs.children = obs.backrefs;
+
+ total_links = xchk_nlink_total(ip, &obs);
+ actual_nlink = VFS_I(ip)->i_nlink;
+
+ trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs);
+
+ /*
+ * If we found so many parents that we'd overflow i_nlink, we must flag
+ * this as a corruption. The VFS won't let users increase the link
+ * count, but it will let them decrease it.
+ */
+ if (total_links > XFS_MAXLINK) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+
+ /* Link counts should match. */
+ if (total_links != actual_nlink) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+
+ if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) {
+ /*
+ * The collection phase ignores directories with zero link
+ * count, so we ignore them here too.
+ *
+ * The number of subdirectory backreferences (dotdot entries)
+ * pointing towards this directory should be equal to the
+ * number of subdirectory entries in the directory.
+ *
+ * A mismatch is reported through the xref variant of the
+ * corruption helper and does not cut the remaining checks
+ * short.
+ */
+ if (obs.children != obs.backrefs)
+ xchk_ino_xref_set_corrupt(sc, ip->i_ino);
+ } else {
+ /*
+ * Non-directories and unlinked directories should not have
+ * back references.
+ */
+ if (obs.backrefs != 0) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+
+ /*
+ * Non-directories and unlinked directories should not have
+ * children.
+ */
+ if (obs.children != 0) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+ }
+
+ if (ip == sc->mp->m_rootip) {
+ /*
+ * For the root of a directory tree, both the '.' and '..'
+ * entries should point to the root directory. The dotdot
+ * entry is counted as a parent of the root /and/ a backref of
+ * the root directory.
+ */
+ if (obs.parents != 1) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+ } else if (actual_nlink > 0) {
+ /*
+ * Linked files that are not the root directory should have at
+ * least one parent.
+ */
+ if (obs.parents == 0) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+ }
+
+out_corrupt: /* also reached by falling through when every check passed */
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ error = -ECANCELED;
+out_scanlock:
+ mutex_unlock(&xnc->lock);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
+}
+
+/*
+ * Check our link count against an inode that wasn't checked previously. This
+ * is intended to catch directories with dangling links, though we could be
+ * racing with inode allocation in other threads.
+ *
+ * If the inode can be grabbed after all, it is checked with
+ * xchk_nlinks_compare_inode().  Otherwise the observed link total for @ino
+ * must be zero; a nonzero total flags the inode number as corrupt and
+ * returns -ECANCELED.  An aborted collection scan or a missing AGI buffer
+ * marks the scrub incomplete and returns -ECANCELED.
+ */
+STATIC int
+xchk_nlinks_compare_inum(
+ struct xchk_nlink_ctrs *xnc,
+ xfs_ino_t ino)
+{
+ struct xchk_nlink obs;
+ struct xfs_mount *mp = xnc->sc->mp;
+ struct xfs_trans *tp = xnc->sc->tp;
+ struct xfs_buf *agi_bp;
+ struct xfs_inode *ip;
+ int error;
+
+ /*
+ * The first iget failed, so try again with the variant that returns
+ * either an incore inode or the AGI buffer. If the function returns
+ * EINVAL/ENOENT, it should have passed us the AGI buffer so that we
+ * can guarantee that the inode won't be allocated while we check for
+ * a zero link count in the observed link count data.
+ */
+ error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip);
+ if (!error) {
+ /* Actually got an inode, so use the inode compare. */
+ error = xchk_nlinks_compare_inode(xnc, ip);
+ xchk_irele(xnc->sc, ip);
+ return error; /* NOTE(review): assumes xchk_iget_agi() hands back no AGI buffer on success; confirm */
+ }
+ if (error == -ENOENT || error == -EINVAL) {
+ /* No inode was found. Check for zero link count below. */
+ error = 0;
+ }
+ if (error)
+ goto out_agi;
+
+ /* Ensure that we have protected against inode allocation/freeing. */
+ if (agi_bp == NULL) {
+ ASSERT(agi_bp != NULL);
+ xchk_set_incomplete(xnc->sc);
+ return -ECANCELED;
+ }
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ xchk_set_incomplete(xnc->sc);
+ error = -ECANCELED;
+ goto out_agi;
+ }
+
+ mutex_lock(&xnc->lock);
+ error = xchk_nlinks_comparison_read(xnc, ino, &obs);
+ if (error)
+ goto out_scanlock;
+
+ trace_xchk_nlinks_check_zero(mp, ino, &obs);
+
+ /*
+ * If we can't grab the inode, the link count had better be zero. We
+ * still hold the AGI to prevent inode allocation/freeing.
+ */
+ if (xchk_nlink_total(NULL, &obs) != 0) {
+ xchk_ino_set_corrupt(xnc->sc, ino);
+ error = -ECANCELED;
+ }
+
+out_scanlock:
+ mutex_unlock(&xnc->lock);
+out_agi:
+ if (agi_bp)
+ xfs_trans_brelse(tp, agi_bp);
+ return error;
+}
+
+/*
+ * Fetch the next inode for the comparison scan, retrying for as long as the
+ * iterator reports -EBUSY.  We move on from inodes that can't be grabbed
+ * because unchecked nlink records are revisited in the second part of the
+ * comparison.  Returns whatever non-EBUSY value the iterator produced.
+ */
+static int
+xchk_nlinks_compare_iter(
+	struct xchk_nlink_ctrs	*xnc,
+	struct xfs_inode	**ipp)
+{
+	int			error = -EBUSY;
+
+	while (error == -EBUSY)
+		error = xchk_iscan_iter(&xnc->compare_iscan, ipp);
+
+	return error;
+}
+
+/*
+ * Compare the link counts we observed against the live information.
+ *
+ * This runs in two passes.  The first walks inodes with the compare iscan
+ * and checks each one.  The second iterates the shadow array and checks, by
+ * inode number, every record that the first pass did not flag as compared.
+ * Either pass is skipped (returning 0) once the scrub's CORRUPT flag is set.
+ * The nlinks lock is held while iterating the array but dropped around each
+ * per-inode check, so the early returns in the second loop happen with the
+ * lock released.
+ */
+STATIC int
+xchk_nlinks_compare(
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xchk_nlink nl;
+ struct xfs_scrub *sc = xnc->sc;
+ struct xfs_inode *ip;
+ xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /*
+ * Create a new empty transaction so that we can advance the iscan
+ * cursor without deadlocking if the inobt has a cycle and push on the
+ * inactivation workqueue.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ /*
+ * Use the inobt to walk all allocated inodes to compare the link
+ * counts. Inodes skipped by _compare_iter will be tried again in the
+ * next phase of the scan.
+ */
+ xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan);
+ while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) {
+ error = xchk_nlinks_compare_inode(xnc, ip);
+ xchk_iscan_mark_visited(&xnc->compare_iscan, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&xnc->compare_iscan);
+ xchk_iscan_teardown(&xnc->compare_iscan);
+ if (error)
+ return error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /*
+ * Walk all the non-null nlink observations that weren't checked in the
+ * previous step.
+ */
+ mutex_lock(&xnc->lock);
+ while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) {
+ xfs_ino_t ino = cur - 1; /* NOTE(review): presumably the cursor already points past the record just returned; confirm against xfarray_iter() */
+
+ if (nl.flags & XCHK_NLINK_COMPARE_SCANNED)
+ continue;
+
+ mutex_unlock(&xnc->lock);
+
+ error = xchk_nlinks_compare_inum(xnc, ino);
+ if (error)
+ return error;
+
+ if (xchk_should_terminate(xnc->sc, &error))
+ return error;
+
+ mutex_lock(&xnc->lock);
+ }
+ mutex_unlock(&xnc->lock);
+
+ return error;
+}
+
+/*
+ * Tear down everything associated with a nlinks check.
+ *
+ * @priv is the struct xchk_nlink_ctrs.  This is installed as sc->buf_cleanup
+ * by a successful xchk_nlinks_setup_scan() and is also called directly from
+ * that function's error path.  The collection scan is aborted first, then
+ * the directory hook is removed, and only then is the shadow array
+ * destroyed.
+ */
+static void
+xchk_nlinks_teardown_scan(
+ void *priv)
+{
+ struct xchk_nlink_ctrs *xnc = priv;
+
+ /* Discourage any hook functions that might be running. */
+ xchk_iscan_abort(&xnc->collect_iscan);
+
+ xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook);
+
+ xfarray_destroy(xnc->nlinks);
+ xnc->nlinks = NULL; /* xchk_nlinks_update_incore() treats a NULL array as "nothing to update" */
+
+ xchk_iscan_teardown(&xnc->collect_iscan);
+ mutex_destroy(&xnc->lock);
+ xnc->sc = NULL; /* xchk_nlinks_setup_scan() asserts that this is NULL on entry */
+}
+
+/*
+ * Scan all inodes in the entire filesystem to generate link count data. If
+ * the scan is successful, the counts will be left alive for a repair. If any
+ * error occurs, we'll tear everything down.
+ *
+ * This function itself only prepares the scan: it initializes the lock and
+ * the collection iscan, creates the shadow link count array, and registers
+ * the directory update hook.  On success, teardown is deferred by installing
+ * xchk_nlinks_teardown_scan() as sc->buf_cleanup.
+ */
+STATIC int
+xchk_nlinks_setup_scan(
+ struct xfs_scrub *sc,
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ unsigned long long max_inos;
+ xfs_agnumber_t last_agno = mp->m_sb.sb_agcount - 1;
+ xfs_agino_t first_agino, last_agino;
+ int error;
+
+ ASSERT(xnc->sc == NULL);
+ xnc->sc = sc;
+
+ mutex_init(&xnc->lock);
+
+ /* Retry iget every tenth of a second for up to 30 seconds. */
+ xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan);
+
+ /*
+ * Set up enough space to store an nlink record for the highest
+ * possible inode number in this system.  The record count is capped
+ * at XFS_MAXINUMBER + 1.
+ */
+ xfs_agino_range(mp, last_agno, &first_agino, &last_agino);
+ max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1;
+ descr = xchk_xfile_descr(sc, "file link counts");
+ error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos),
+ sizeof(struct xchk_nlink), &xnc->nlinks);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+
+ /*
+ * Hook into the directory entry code so that we can capture updates to
+ * file link counts. The hook only triggers for inodes that were
+ * already scanned, and the scanner thread takes each inode's ILOCK,
+ * which means that any in-progress inode updates will finish before we
+ * can scan the inode.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+ xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update);
+ error = xfs_dir_hook_add(mp, &xnc->dhook);
+ if (error)
+ goto out_teardown;
+
+ /* Use deferred cleanup to pass the inode link count data to repair. */
+ sc->buf_cleanup = xchk_nlinks_teardown_scan;
+ return 0;
+
+out_teardown:
+ xchk_nlinks_teardown_scan(xnc);
+ return error;
+}
+
+/*
+ * Scrub the link count of all inodes on the filesystem.  The control
+ * structure in sc->buf is set up for a live scan, link counts are collected
+ * from every inode, and then the observations are compared with the inodes.
+ * The comparison is skipped if the collected data are incomplete.
+ */
+int
+xchk_nlinks(
+	struct xfs_scrub	*sc)
+{
+	struct xchk_nlink_ctrs	*xnc = sc->buf;
+	bool			keep_going;
+	int			error;
+
+	/* Set ourselves up to check link counts on the live filesystem. */
+	error = xchk_nlinks_setup_scan(sc, xnc);
+	if (error != 0)
+		return error;
+
+	/* Walk all inodes, picking up link count information. */
+	error = xchk_nlinks_collect(xnc);
+	keep_going = xchk_xref_process_error(sc, 0, 0, &error);
+	if (!keep_going)
+		return error;
+
+	/* Fail fast if we're not playing with a full dataset. */
+	if (xchk_iscan_aborted(&xnc->collect_iscan))
+		xchk_set_incomplete(sc);
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+		return 0;
+
+	/* Compare link counts. */
+	error = xchk_nlinks_compare(xnc);
+	keep_going = xchk_xref_process_error(sc, 0, 0, &error);
+	if (!keep_going)
+		return error;
+
+	/* Check one last time for an incomplete dataset. */
+	if (xchk_iscan_aborted(&xnc->collect_iscan))
+		xchk_set_incomplete(sc);
+
+	return 0;
+}
diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h
new file mode 100644
index 000000000000..a950f3daf204
--- /dev/null
+++ b/fs/xfs/scrub/nlinks.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_NLINKS_H__
+#define __XFS_SCRUB_NLINKS_H__
+
+/*
+ * Live link count control structure.
+ *
+ * One of these is allocated (zeroed) by xchk_setup_nlinks() and attached to
+ * the scrub context's buf pointer.
+ */
+struct xchk_nlink_ctrs {
+ struct xfs_scrub *sc; /* set by xchk_nlinks_setup_scan(), reset to NULL by teardown */
+
+ /*
+ * Shadow link count data and its mutex.  The array holds struct
+ * xchk_nlink records and is indexed by inode number.
+ */
+ struct xfarray *nlinks;
+ struct mutex lock;
+
+ /*
+ * The collection step uses a separate iscan context from the compare
+ * step because the collection iscan coordinates live updates to the
+ * observation data while this scanner is running. The compare iscan
+ * is secondary and can be reinitialized as needed.
+ */
+ struct xchk_iscan collect_iscan;
+ struct xchk_iscan compare_iscan;
+
+ /*
+ * Hook into directory updates so that we can receive live updates
+ * from other writer threads.
+ */
+ struct xfs_dir_hook dhook;
+};
+
+/*
+ * In-core link counts for a given inode in the filesystem.
+ *
+ * For a root directory that holds one file (f1) and one subdirectory (d1),
+ * the directory entries and the field to which they are accounted are as
+ * follows:
+ *
+ * Root directory:
+ *
+ * . points to self (root.child)
+ * .. points to self (root.parent)
+ * f1 points to a child file (f1.parent)
+ * d1 points to a child dir (d1.parent, root.child)
+ *
+ * Subdirectory d1:
+ *
+ * . points to self (d1.child)
+ * .. points to root dir (root.backref)
+ * f2 points to child file (f2.parent)
+ * f3 points to root.f1 (f1.parent)
+ *
+ * root.nlink == 3 (root.dot, root.dotdot, root.d1)
+ * d1.nlink == 2 (root.d1, d1.dot)
+ * f1.nlink == 2 (root.f1, d1.f3)
+ * f2.nlink == 1 (d1.f2)
+ *
+ * NOTE(review): the '.' lines above describe the dot entry as if it were
+ * stored in the children counter.  The collection code in nlinks.c does not
+ * bump any counter for '.' entries; xchk_nlink_total() adds one link for the
+ * dot entry of a linked directory instead.  Confirm which description is
+ * intended.
+ */
+struct xchk_nlink {
+ /* Count of forward links from parent directories to this file. */
+ xfs_nlink_t parents;
+
+ /*
+ * Count of back links to this parent directory from child
+ * subdirectories.
+ */
+ xfs_nlink_t backrefs;
+
+ /*
+ * Count of forward links from this directory to all child files and
+ * the number of dot entries. Should be zero for non-directories.
+ *
+ * NOTE(review): the collection code in nlinks.c only bumps this for
+ * entries that point to subdirectories, and never for '.'; confirm
+ * the wording above.
+ */
+ xfs_nlink_t children;
+
+ /* Record state flags (the XCHK_NLINK_* and XREP_NLINK_* bits below). */
+ unsigned int flags;
+};
+
+/*
+ * This incore link count has been written at least once. We never want to
+ * store an xchk_nlink that looks uninitialized.
+ */
+#define XCHK_NLINK_WRITTEN (1U << 0)
+
+/* Already checked this link count record. */
+#define XCHK_NLINK_COMPARE_SCANNED (1U << 1)
+
+/* Already made a repair with this link count record. */
+#define XREP_NLINK_DIRTY (1U << 2)
+
+/*
+ * Compute the total link count implied by the observations in @live, using
+ * a 64-bit sum so that overflow of the 32-bit counters can be detected.
+ * @ip may be NULL; if it is a directory with a nonzero link count, one more
+ * link is added for its dot entry.
+ */
+static inline uint64_t
+xchk_nlink_total(struct xfs_inode *ip, const struct xchk_nlink *live)
+{
+	uint64_t		total = (uint64_t)live->parents + live->children;
+
+	/* Add one link count for the dot entry of any linked directory. */
+	if (ip != NULL && S_ISDIR(VFS_I(ip)->i_mode) &&
+	    VFS_I(ip)->i_nlink != 0)
+		total++;
+
+	return total;
+}
+
+#endif /* __XFS_SCRUB_NLINKS_H__ */
diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c
new file mode 100644
index 000000000000..b87618322f55
--- /dev/null
+++ b/fs/xfs/scrub/nlinks_repair.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_sb.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/nlinks.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Inode Link Count Repair
+ * ============================
+ *
+ * Use the live inode link count information that we collected to replace the
+ * nlink values of the incore inodes. A scrub->repair cycle should have left
+ * the live data and hooks active, so this is safe so long as we make sure the
+ * inode is locked.
+ */
+
+/*
+ * Correct the link count of the inode attached to the scrub context (sc->ip)
+ * using the counts observed by the live scan.  Because we have to grab locks
+ * and resources in a certain order, it's possible that this will be a no-op.
+ *
+ * Returns 0 if the link count was fixed or deliberately left alone,
+ * -ECANCELED if the collection scan was aborted, or another negative errno.
+ * Every inode lock taken here is dropped again before returning.
+ */
+STATIC int
+xrep_nlinks_repair_inode(
+	struct xchk_nlink_ctrs	*xnc)
+{
+	struct xchk_nlink	obs;
+	struct xfs_scrub	*sc = xnc->sc;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_inode	*ip = sc->ip;
+	uint64_t		total_links;
+	uint64_t		actual_nlink;
+	bool			dirty = false;
+	int			error;
+
+	xchk_ilock(sc, XFS_IOLOCK_EXCL);
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &sc->tp);
+	if (error) {
+		/*
+		 * Only the IOLOCK has been taken at this point.  Drop it
+		 * before bailing out; the caller releases the inode as soon
+		 * as we return, so nothing else would unlock it for us.
+		 */
+		xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+		return error;
+	}
+
+	xchk_ilock(sc, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(sc->tp, ip, 0);
+
+	mutex_lock(&xnc->lock);
+
+	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+		error = -ECANCELED;
+		goto out_scanlock;
+	}
+
+	error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs);
+	if (error)
+		goto out_scanlock;
+
+	/*
+	 * We're done accessing the shared scan data, so we can drop the lock.
+	 * We still hold @ip's ILOCK, so its link count cannot change.
+	 */
+	mutex_unlock(&xnc->lock);
+
+	total_links = xchk_nlink_total(ip, &obs);
+	actual_nlink = VFS_I(ip)->i_nlink;
+
+	/*
+	 * Non-directories cannot have directories pointing up to them.
+	 *
+	 * We previously set error to zero, but set it again because one static
+	 * checker author fears that programmers will fail to maintain this
+	 * invariant and built their tool to flag this as a security risk.  A
+	 * different tool author made their bot complain about the redundant
+	 * store.  This is a never-ending and stupid battle; both tools missed
+	 * *actual bugs* elsewhere; and I no longer care.
+	 */
+	if (!S_ISDIR(VFS_I(ip)->i_mode) && obs.children != 0) {
+		trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+		error = 0;
+		goto out_trans;
+	}
+
+	/*
+	 * We did not find any links to this inode.  If the inode agrees, we
+	 * have nothing further to do.  If not, the inode has a nonzero link
+	 * count and we don't have anywhere to graft the child onto.  Dropping
+	 * a live inode's link count to zero can cause unexpected shutdowns in
+	 * inactivation, so leave it alone.
+	 */
+	if (total_links == 0) {
+		if (actual_nlink != 0)
+			trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+		goto out_trans;
+	}
+
+	/* Commit the new link count if it changed. */
+	if (total_links != actual_nlink) {
+		/* A count too large for the inode cannot be stored. */
+		if (total_links > XFS_MAXLINK) {
+			trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+			goto out_trans;
+		}
+
+		trace_xrep_nlinks_update_inode(mp, ip, &obs);
+
+		set_nlink(VFS_I(ip), total_links);
+		dirty = true;
+	}
+
+	/* Nothing changed, so there is nothing to log or commit. */
+	if (!dirty) {
+		error = 0;
+		goto out_trans;
+	}
+
+	xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE);
+
+	error = xrep_trans_commit(sc);
+	xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	return error;
+
+out_scanlock:
+	mutex_unlock(&xnc->lock);
+out_trans:
+	xchk_trans_cancel(sc);
+	xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	return error;
+}
+
+/*
+ * Fetch the next inode to repair from the comparison scan.  An inode that
+ * cannot be grabbed right now (-EBUSY) is simply passed over, because the
+ * scan as a whole is still making forward progress.  Any other result of
+ * xchk_iscan_iter() is handed back to the caller unchanged.
+ */
+static int
+xrep_nlinks_iter(
+	struct xchk_nlink_ctrs	*xnc,
+	struct xfs_inode	**ipp)
+{
+	int			ret;
+
+	for (;;) {
+		ret = xchk_iscan_iter(&xnc->compare_iscan, ipp);
+		if (ret != -EBUSY)
+			return ret;
+	}
+}
+
+/*
+ * Walk every inode that the comparison scan hands us and write the observed
+ * link count back to it.  Returns -EOPNOTSUPP on filesystems without ftype,
+ * otherwise the final status of the walk.
+ */
+int
+xrep_nlinks(
+	struct xfs_scrub	*sc)
+{
+	struct xchk_nlink_ctrs	*xnc = sc->buf;
+	int			error;
+
+	/*
+	 * We need ftype for an accurate count of the number of child
+	 * subdirectory links.  Child subdirectories with a back link (dotdot
+	 * entry) but no forward link are unfixable, so we cannot repair the
+	 * link count of the parent directory based on the back link count
+	 * alone.  Filesystems without ftype support are rare (old V4) so we
+	 * just skip out here.
+	 */
+	if (!xfs_has_ftype(sc->mp))
+		return -EOPNOTSUPP;
+
+	/*
+	 * Use the inobt to walk all allocated inodes to compare and fix the
+	 * link counts.  Retry iget every tenth of a second for up to 30
+	 * seconds -- even if repair misses a few inodes, we still try to fix
+	 * as many of them as we can.
+	 */
+	xchk_iscan_start(sc, 30000, 100, &xnc->compare_iscan);
+	ASSERT(sc->ip == NULL);
+
+	for (;;) {
+		error = xrep_nlinks_iter(xnc, &sc->ip);
+		if (error != 1)
+			break;
+
+		/*
+		 * Cancel the scrub transaction so that the repair function
+		 * can allocate a transaction with the correct reservation.
+		 */
+		xchk_trans_cancel(sc);
+
+		error = xrep_nlinks_repair_inode(xnc);
+
+		/* Whatever happened, we are finished with this inode. */
+		xchk_iscan_mark_visited(&xnc->compare_iscan, sc->ip);
+		xchk_irele(sc, sc->ip);
+		sc->ip = NULL;
+
+		if (error)
+			break;
+		if (xchk_should_terminate(sc, &error))
+			break;
+
+		/*
+		 * Create a new empty transaction so that we can advance the
+		 * iscan cursor without deadlocking if the inobt has a cycle.
+		 * We can only push the inactivation workqueues with an empty
+		 * transaction.
+		 */
+		error = xchk_trans_alloc_empty(sc);
+		if (error)
+			break;
+	}
+
+	xchk_iscan_iter_finish(&xnc->compare_iscan);
+	xchk_iscan_teardown(&xnc->compare_iscan);
+
+	return error;
+}
diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c
new file mode 100644
index 000000000000..c77eb2de8df7
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/quota.h"
+#include "scrub/quotacheck.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Quotacheck
+ * ===============
+ *
+ * Quota counters are "summary" metadata, in the sense that they are computed
+ * as the summation of the block usage counts for every file on the filesystem.
+ * Therefore, we compute the correct icount, bcount, and rtbcount values by
+ * creating a shadow quota counter structure and walking every inode.
+ */
+
+/*
+ * Track the quota deltas for a dquot in a transaction. A slot whose q_type
+ * is zero is unused; xqcheck_get_dqtrx() hands such a slot out to the first
+ * caller that needs one.
+ */
+struct xqcheck_dqtrx {
+ /* Quota type and id of the dquot that this slot shadows. */
+ xfs_dqtype_t q_type;
+ xfs_dqid_t q_id;
+
+ /* Accumulated XFS_TRANS_DQ_ICOUNT deltas. */
+ int64_t icount_delta;
+
+ /*
+ * Accumulated XFS_TRANS_DQ_BCOUNT and XFS_TRANS_DQ_DELBCOUNT deltas.
+ * The apply hook adds the two together when committing.
+ */
+ int64_t bcount_delta;
+ int64_t delbcnt_delta;
+
+ /*
+ * Accumulated XFS_TRANS_DQ_RTBCOUNT and XFS_TRANS_DQ_DELRTBCOUNT
+ * deltas. The apply hook adds the two together when committing.
+ */
+ int64_t rtbcount_delta;
+ int64_t delrtb_delta;
+};
+
+/* Number of shadow dqtrx slots kept for each tracked transaction. */
+#define XQCHECK_MAX_NR_DQTRXS (XFS_QM_TRANS_DQTYPES * XFS_QM_TRANS_MAXDQS)
+
+/*
+ * Track the quota deltas for all dquots attached to a transaction if the
+ * quota deltas are being applied to an inode that we already scanned.
+ */
+struct xqcheck_dqacct {
+ /* Linkage into xqcheck.shadow_dquot_acct. */
+ struct rhash_head hash;
+ /* Hash table key identifying the transaction being shadowed. */
+ uintptr_t tx_id;
+ /* Per-dquot delta slots; see xqcheck_get_dqtrx(). */
+ struct xqcheck_dqtrx dqtrx[XQCHECK_MAX_NR_DQTRXS];
+ /*
+ * Number of slots in use. Bumped when the mod hook claims a slot and
+ * dropped by the apply hook, which frees this structure at zero.
+ */
+ unsigned int refcount;
+};
+
+/*
+ * Release one shadow dquot accounting structure.  The signature matches the
+ * callback expected by rhashtable_free_and_destroy(); @arg is not used.
+ */
+static void
+xqcheck_dqacct_free(
+	void			*ptr,
+	void			*arg)
+{
+	kfree(ptr);
+}
+
+/*
+ * Prepare a scrub context for checking the quota counters: enable the quota
+ * hooks and allocate the zeroed control structure in sc->buf.  Returns
+ * -ENOENT when quotas are not on and -ENOMEM if the allocation fails.
+ */
+int
+xchk_setup_quotacheck(
+	struct xfs_scrub	*sc)
+{
+	struct xqcheck		*xqc;
+
+	if (!XFS_IS_QUOTA_ON(sc->mp))
+		return -ENOENT;
+
+	xchk_fsgates_enable(sc, XCHK_FSGATES_QUOTA);
+
+	xqc = kzalloc(sizeof(*xqc), XCHK_GFP_FLAGS);
+	sc->buf = xqc;
+	if (xqc == NULL)
+		return -ENOMEM;
+
+	return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 1: Collecting dquot resource usage counts. For each xfs_dquot attached
+ * to each inode, we create a shadow dquot, and compute the inode count and add
+ * the data/rt block usage from what we see.
+ *
+ * To avoid false corruption reports in part 2, any failure in this part must
+ * set the INCOMPLETE flag even when a negative errno is returned. This care
+ * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
+ * ECANCELED) that are absorbed into a scrub state flag update by
+ * xchk_*_process_error. Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
+ *
+ * Because we are scanning a live filesystem, it's possible that another thread
+ * will try to update the quota counters for an inode that we've already
+ * scanned. This will cause our counts to be incorrect. Therefore, we hook
+ * the live transaction code in two places: (1) when the callers update the
+ * per-transaction dqtrx structure to log quota counter updates; and (2) when
+ * transaction commit actually logs those updates to the incore dquot. By
+ * shadowing transaction updates in this manner, live quotacheck can ensure
+ * by locking the dquot and the shadow structure that its own copies are not
+ * out of date. Because the hook code runs in a different process context from
+ * the scrub code and the scrub state flags are not accessed atomically,
+ * failures in the hook code must abort the iscan and the scrubber must notice
+ * the aborted scan and set the incomplete flag.
+ *
+ * Note that we use srcu notifier hooks to minimize the overhead when live
+ * quotacheck is /not/ running.
+ */
+
+/*
+ * Add a set of deltas to the shadow counter record for dquot @id in @counts
+ * and mark the record as written.  Returns 0 or a negative errno; a record
+ * that does not fit in the sparse array is reported as -ECANCELED.
+ */
+static int
+xqcheck_update_incore_counts(
+	struct xqcheck		*xqc,
+	struct xfarray		*counts,
+	xfs_dqid_t		id,
+	int64_t			inodes,
+	int64_t			nblks,
+	int64_t			rtblks)
+{
+	struct xqcheck_dquot	shadow;
+	int			ret;
+
+	ret = xfarray_load_sparse(counts, id, &shadow);
+	if (ret)
+		return ret;
+
+	shadow.icount += inodes;
+	shadow.bcount += nblks;
+	shadow.rtbcount += rtblks;
+	shadow.flags |= XQCHECK_DQUOT_WRITTEN;
+
+	ret = xfarray_store(counts, id, &shadow);
+
+	/*
+	 * EFBIG means we tried to store data at too high a byte offset in the
+	 * sparse array.  IOWs, we cannot complete the check and must notify
+	 * userspace that the check was incomplete.
+	 */
+	return ret == -EFBIG ? -ECANCELED : ret;
+}
+
+/*
+ * rhashtable comparison callback: returns 0 when @obj is the shadow dquot
+ * accounting structure for the transaction id in the lookup key, 1 if not.
+ */
+static int
+xqcheck_dqacct_obj_cmpfn(
+	struct rhashtable_compare_arg	*arg,
+	const void			*obj)
+{
+	const struct xqcheck_dqacct	*candidate = obj;
+	const uintptr_t			*wanted = arg->key;
+
+	return candidate->tx_id != *wanted;
+}
+
+/*
+ * Hash table parameters for xqcheck.shadow_dquot_acct: objects are
+ * struct xqcheck_dqacct, keyed by the tx_id field and linked through the
+ * hash field, with xqcheck_dqacct_obj_cmpfn deciding key equality.
+ */
+static const struct rhashtable_params xqcheck_dqacct_hash_params = {
+ .min_size = 32,
+ .key_len = sizeof(uintptr_t),
+ .key_offset = offsetof(struct xqcheck_dqacct, tx_id),
+ .head_offset = offsetof(struct xqcheck_dqacct, hash),
+ .automatic_shrinking = true,
+ .obj_cmpfn = xqcheck_dqacct_obj_cmpfn,
+};
+
+/*
+ * Look up the shadow dqtrx slot for the given dquot.  Slots are scanned in
+ * order and the first one that is either unused (q_type of zero) or already
+ * assigned to this dquot is returned; the caller must check q_type to tell
+ * the two apart.  Returns NULL if every slot belongs to some other dquot.
+ */
+STATIC struct xqcheck_dqtrx *
+xqcheck_get_dqtrx(
+	struct xqcheck_dqacct	*dqa,
+	xfs_dqtype_t		q_type,
+	xfs_dqid_t		q_id)
+{
+	struct xqcheck_dqtrx	*slot = dqa->dqtrx;
+	struct xqcheck_dqtrx	*end = slot + XQCHECK_MAX_NR_DQTRXS;
+
+	for (; slot < end; slot++) {
+		if (slot->q_type == 0)
+			return slot;
+		if (slot->q_type == q_type && slot->q_id == q_id)
+			return slot;
+	}
+
+	return NULL;
+}
+
+/*
+ * Create and fill out a quota delta tracking structure to shadow the updates
+ * going on in the regular quota code.
+ *
+ * This is the notifier callback for per-transaction dqtrx modifications.
+ * @action is the XFS_TRANS_DQ_* field being changed and @data points to a
+ * struct xfs_mod_ino_dqtrx_params describing the change.  The function always
+ * returns NOTIFY_DONE; if the update cannot be recorded, the inode scan is
+ * aborted instead so that the scrubber can notice the incomplete data.
+ */
+static int
+xqcheck_mod_live_ino_dqtrx(
+	struct notifier_block		*nb,
+	unsigned long			action,
+	void				*data)
+{
+	struct xfs_mod_ino_dqtrx_params *p = data;
+	struct xqcheck			*xqc;
+	struct xqcheck_dqacct		*dqa;
+	struct xqcheck_dqtrx		*dqtrx;
+	int				error;
+
+	xqc = container_of(nb, struct xqcheck, qhook.mod_hook.nb);
+
+	/* Skip quota reservation fields. */
+	switch (action) {
+	case XFS_TRANS_DQ_BCOUNT:
+	case XFS_TRANS_DQ_DELBCOUNT:
+	case XFS_TRANS_DQ_ICOUNT:
+	case XFS_TRANS_DQ_RTBCOUNT:
+	case XFS_TRANS_DQ_DELRTBCOUNT:
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	/* Ignore dqtrx updates for quota types we don't care about. */
+	switch (p->q_type) {
+	case XFS_DQTYPE_USER:
+		if (!xqc->ucounts)
+			return NOTIFY_DONE;
+		break;
+	case XFS_DQTYPE_GROUP:
+		if (!xqc->gcounts)
+			return NOTIFY_DONE;
+		break;
+	case XFS_DQTYPE_PROJ:
+		if (!xqc->pcounts)
+			return NOTIFY_DONE;
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	/* Skip inodes that haven't been scanned yet. */
+	if (!xchk_iscan_want_live_update(&xqc->iscan, p->ino))
+		return NOTIFY_DONE;
+
+	/* Make a shadow quota accounting tracker for this transaction. */
+	mutex_lock(&xqc->lock);
+	dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id,
+			xqcheck_dqacct_hash_params);
+	if (!dqa) {
+		dqa = kzalloc(sizeof(struct xqcheck_dqacct), XCHK_GFP_FLAGS);
+		if (!dqa)
+			goto out_abort;
+
+		dqa->tx_id = p->tx_id;
+		error = rhashtable_insert_fast(&xqc->shadow_dquot_acct,
+				&dqa->hash, xqcheck_dqacct_hash_params);
+		if (error) {
+			/*
+			 * The tracker never made it into the hash table, so
+			 * teardown cannot find it.  Free it here or it leaks.
+			 */
+			kfree(dqa);
+			goto out_abort;
+		}
+	}
+
+	/* Find the shadow dqtrx (or an empty slot) here. */
+	dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id);
+	if (!dqtrx)
+		goto out_abort;
+	if (dqtrx->q_type == 0) {
+		/* Claim the empty slot; the apply hook drops this reference. */
+		dqtrx->q_type = p->q_type;
+		dqtrx->q_id = p->q_id;
+		dqa->refcount++;
+	}
+
+	/* Update counter */
+	switch (action) {
+	case XFS_TRANS_DQ_BCOUNT:
+		dqtrx->bcount_delta += p->delta;
+		break;
+	case XFS_TRANS_DQ_DELBCOUNT:
+		dqtrx->delbcnt_delta += p->delta;
+		break;
+	case XFS_TRANS_DQ_ICOUNT:
+		dqtrx->icount_delta += p->delta;
+		break;
+	case XFS_TRANS_DQ_RTBCOUNT:
+		dqtrx->rtbcount_delta += p->delta;
+		break;
+	case XFS_TRANS_DQ_DELRTBCOUNT:
+		dqtrx->delrtb_delta += p->delta;
+		break;
+	}
+
+	mutex_unlock(&xqc->lock);
+	return NOTIFY_DONE;
+
+out_abort:
+	xchk_iscan_abort(&xqc->iscan);
+	mutex_unlock(&xqc->lock);
+	return NOTIFY_DONE;
+}
+
+/*
+ * Apply the transaction quota deltas to our shadow quota accounting info when
+ * the regular quota code is doing the same.
+ *
+ * Notifier callback; @data points to a struct xfs_apply_dqtrx_params. The
+ * recorded deltas are folded into the shadow counters only when @action is
+ * XFS_APPLY_DQTRX_COMMIT; for any other action they are dropped. Always
+ * returns NOTIFY_DONE; failures abort the inode scan instead.
+ */
+static int
+xqcheck_apply_live_dqtrx(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_apply_dqtrx_params *p = data;
+ struct xqcheck *xqc;
+ struct xqcheck_dqacct *dqa;
+ struct xqcheck_dqtrx *dqtrx;
+ struct xfarray *counts;
+ int error;
+
+ xqc = container_of(nb, struct xqcheck, qhook.apply_hook.nb);
+
+ /* Map the dquot type to an incore counter object. */
+ switch (p->q_type) {
+ case XFS_DQTYPE_USER:
+ counts = xqc->ucounts;
+ break;
+ case XFS_DQTYPE_GROUP:
+ counts = xqc->gcounts;
+ break;
+ case XFS_DQTYPE_PROJ:
+ counts = xqc->pcounts;
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ /* Nothing to do if the scan is dead or this type is not tracked. */
+ if (xchk_iscan_aborted(&xqc->iscan) || counts == NULL)
+ return NOTIFY_DONE;
+
+ /*
+ * Find the shadow dqtrx for this transaction and dquot, if any deltas
+ * need to be applied here. If not, we're finished early.
+ */
+ mutex_lock(&xqc->lock);
+ dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id,
+ xqcheck_dqacct_hash_params);
+ if (!dqa)
+ goto out_unlock;
+ dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id);
+ /* An unused slot means nothing was recorded for this dquot. */
+ if (!dqtrx || dqtrx->q_type == 0)
+ goto out_unlock;
+
+ /* Update our shadow dquot if we're committing. */
+ if (action == XFS_APPLY_DQTRX_COMMIT) {
+ error = xqcheck_update_incore_counts(xqc, counts, p->q_id,
+ dqtrx->icount_delta,
+ dqtrx->bcount_delta + dqtrx->delbcnt_delta,
+ dqtrx->rtbcount_delta + dqtrx->delrtb_delta);
+ if (error)
+ goto out_abort;
+ }
+
+ /* Free the shadow accounting structure if that was the last user. */
+ dqa->refcount--;
+ if (dqa->refcount == 0) {
+ error = rhashtable_remove_fast(&xqc->shadow_dquot_acct,
+ &dqa->hash, xqcheck_dqacct_hash_params);
+ if (error)
+ goto out_abort;
+ xqcheck_dqacct_free(dqa, NULL);
+ }
+
+ mutex_unlock(&xqc->lock);
+ return NOTIFY_DONE;
+
+ /*
+ * On abort the tracker is left in the hash table; xqcheck_teardown_scan
+ * frees whatever remains there.
+ */
+out_abort:
+ xchk_iscan_abort(&xqc->iscan);
+out_unlock:
+ mutex_unlock(&xqc->lock);
+ return NOTIFY_DONE;
+}
+
+/*
+ * Record this inode's quota usage in our shadow quota counter data.
+ *
+ * Adds one inode plus the inode's data and realtime block counts to the
+ * shadow record of every quota type being checked, then marks the inode as
+ * visited by the scan. Returns 0 or a negative errno; on failure the scrub is
+ * flagged incomplete. All inode locks taken here are dropped before
+ * returning.
+ */
+STATIC int
+xqcheck_collect_inode(
+ struct xqcheck *xqc,
+ struct xfs_inode *ip)
+{
+ struct xfs_trans *tp = xqc->sc->tp;
+ xfs_filblks_t nblks, rtblks;
+ uint ilock_flags = 0;
+ xfs_dqid_t id;
+ bool isreg = S_ISREG(VFS_I(ip)->i_mode);
+ int error = 0;
+
+ if (xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) {
+ /*
+ * Quota files are never counted towards quota, so we do not
+ * need to take the lock.
+ */
+ xchk_iscan_mark_visited(&xqc->iscan, ip);
+ return 0;
+ }
+
+ /* Figure out the data / rt device block counts. */
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ if (isreg)
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ /*
+ * Read in the data fork for rt files so that _count_blocks
+ * can count the number of blocks allocated from the rt volume.
+ * Inodes do not track that separately.
+ */
+ ilock_flags = xfs_ilock_data_map_shared(ip);
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_abort;
+ } else {
+ ilock_flags = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ }
+ xfs_inode_count_blocks(tp, ip, &nblks, &rtblks);
+
+ /* A hook may have aborted the scan; our data would be incomplete. */
+ if (xchk_iscan_aborted(&xqc->iscan)) {
+ error = -ECANCELED;
+ goto out_incomplete;
+ }
+
+ /* Update the shadow dquot counters. */
+ mutex_lock(&xqc->lock);
+ if (xqc->ucounts) {
+ id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_USER);
+ error = xqcheck_update_incore_counts(xqc, xqc->ucounts, id, 1,
+ nblks, rtblks);
+ if (error)
+ goto out_mutex;
+ }
+
+ if (xqc->gcounts) {
+ id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_GROUP);
+ error = xqcheck_update_incore_counts(xqc, xqc->gcounts, id, 1,
+ nblks, rtblks);
+ if (error)
+ goto out_mutex;
+ }
+
+ if (xqc->pcounts) {
+ id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_PROJ);
+ error = xqcheck_update_incore_counts(xqc, xqc->pcounts, id, 1,
+ nblks, rtblks);
+ if (error)
+ goto out_mutex;
+ }
+ mutex_unlock(&xqc->lock);
+
+ /* Success: jump past the failure-only steps of the unwind ladder. */
+ xchk_iscan_mark_visited(&xqc->iscan, ip);
+ goto out_ilock;
+
+ /*
+ * Failure unwinding: each label falls through to the ones below it, so
+ * every failure also flags the scrub incomplete and drops the locks.
+ */
+out_mutex:
+ mutex_unlock(&xqc->lock);
+out_abort:
+ xchk_iscan_abort(&xqc->iscan);
+out_incomplete:
+ xchk_set_incomplete(xqc->sc);
+out_ilock:
+ xfs_iunlock(ip, ilock_flags);
+ if (isreg)
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return error;
+}
+
+/*
+ * Run the inode scan and feed every inode it returns to the quota usage
+ * collector.  On success the scrub context is left with a freshly set up
+ * transaction; on failure the scrub is flagged incomplete and a negative
+ * errno is returned.
+ */
+STATIC int
+xqcheck_collect_counts(
+	struct xqcheck		*xqc)
+{
+	struct xfs_scrub	*sc = xqc->sc;
+	struct xfs_inode	*ip;
+	int			error;
+
+	/*
+	 * Set up for a potentially lengthy filesystem scan by reducing our
+	 * transaction resource usage for the duration.  Specifically:
+	 *
+	 * Cancel the transaction to release the log grant space while we scan
+	 * the filesystem.
+	 *
+	 * Create a new empty transaction to eliminate the possibility of the
+	 * inode scan deadlocking on cyclical metadata.
+	 *
+	 * We pass the empty transaction to the file scanning function to avoid
+	 * repeatedly cycling empty transactions.  This can be done without
+	 * risk of deadlock between sb_internal and the IOLOCK (we take the
+	 * IOLOCK to quiesce the file before scanning) because empty
+	 * transactions do not take sb_internal.
+	 */
+	xchk_trans_cancel(sc);
+	error = xchk_trans_alloc_empty(sc);
+	if (error)
+		return error;
+
+	for (;;) {
+		error = xchk_iscan_iter(&xqc->iscan, &ip);
+		if (error != 1)
+			break;
+
+		error = xqcheck_collect_inode(xqc, ip);
+		xchk_irele(sc, ip);
+		if (error)
+			break;
+
+		if (xchk_should_terminate(sc, &error))
+			break;
+	}
+	xchk_iscan_iter_finish(&xqc->iscan);
+
+	if (!error) {
+		/*
+		 * Switch out for a real transaction in preparation for
+		 * building a new tree.
+		 */
+		xchk_trans_cancel(sc);
+		return xchk_setup_fs(sc);
+	}
+
+	xchk_set_incomplete(sc);
+
+	/*
+	 * If we couldn't grab an inode that was busy with a state change,
+	 * change the error code so that we exit to userspace as quickly as
+	 * possible.
+	 */
+	return error == -EBUSY ? -ECANCELED : error;
+}
+
+/*
+ * Part 2: Comparing dquot resource counters. Walk each xfs_dquot, comparing
+ * the resource usage counters against our shadow dquots; and then walk each
+ * shadow dquot (that wasn't covered in the first part), comparing it against
+ * the xfs_dquot.
+ */
+
+/*
+ * Compare one live dquot against the shadow counters gathered for it and
+ * mark the shadow record as compared.  The caller must hold the dquot lock.
+ *
+ * Returns 0 if the walk may continue, -ECANCELED if the scan was aborted, the
+ * record could not be stored, or corruption has been flagged, or another
+ * negative errno from the shadow array.
+ */
+STATIC int
+xqcheck_compare_dquot(
+	struct xqcheck		*xqc,
+	xfs_dqtype_t		dqtype,
+	struct xfs_dquot	*dq)
+{
+	struct xqcheck_dquot	shadow;
+	struct xfs_scrub	*sc = xqc->sc;
+	struct xfarray		*counts = xqcheck_counters_for(xqc, dqtype);
+	int			error;
+
+	if (xchk_iscan_aborted(&xqc->iscan)) {
+		xchk_set_incomplete(sc);
+		return -ECANCELED;
+	}
+
+	mutex_lock(&xqc->lock);
+	error = xfarray_load_sparse(counts, dq->q_id, &shadow);
+	if (!error) {
+		if (shadow.icount != dq->q_ino.count)
+			xchk_qcheck_set_corrupt(sc, dqtype, dq->q_id);
+
+		if (shadow.bcount != dq->q_blk.count)
+			xchk_qcheck_set_corrupt(sc, dqtype, dq->q_id);
+
+		if (shadow.rtbcount != dq->q_rtb.count)
+			xchk_qcheck_set_corrupt(sc, dqtype, dq->q_id);
+
+		shadow.flags |= XQCHECK_DQUOT_COMPARE_SCANNED |
+				XQCHECK_DQUOT_WRITTEN;
+		error = xfarray_store(counts, dq->q_id, &shadow);
+		if (error == -EFBIG) {
+			/*
+			 * EFBIG means we tried to store data at too high a
+			 * byte offset in the sparse array.  IOWs, we cannot
+			 * complete the check and must notify userspace that
+			 * the check was incomplete.  This should never happen
+			 * outside of the collection phase.
+			 */
+			xchk_set_incomplete(sc);
+			error = -ECANCELED;
+		}
+	}
+	mutex_unlock(&xqc->lock);
+
+	if (error)
+		return error;
+
+	/* Once corruption has been flagged there is no need to keep going. */
+	return (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) ? -ECANCELED : 0;
+}
+
+/*
+ * Visit every shadow dquot record that the dquot walk did not already
+ * compare, and check that a matching incore dquot exists and that its counts
+ * agree with ours.  A shadow record with no dquot behind it is flagged as
+ * corruption and ends the walk.
+ */
+STATIC int
+xqcheck_walk_observations(
+	struct xqcheck		*xqc,
+	xfs_dqtype_t		dqtype)
+{
+	struct xqcheck_dquot	shadow;
+	struct xfs_dquot	*dq;
+	struct xfarray		*counts = xqcheck_counters_for(xqc, dqtype);
+	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
+	xfs_dqid_t		id;
+	int			error;
+
+	mutex_lock(&xqc->lock);
+	for (;;) {
+		error = xfarray_iter(counts, &cur, &shadow);
+		if (error != 1)
+			break;
+
+		/* The cursor has already moved past the record it returned. */
+		id = cur - 1;
+
+		if (shadow.flags & XQCHECK_DQUOT_COMPARE_SCANNED)
+			continue;
+
+		/* Drop our lock while we go get the dquot. */
+		mutex_unlock(&xqc->lock);
+
+		error = xfs_qm_dqget(xqc->sc->mp, id, dqtype, false, &dq);
+		if (error == -ENOENT) {
+			xchk_qcheck_set_corrupt(xqc->sc, dqtype, id);
+			return 0;
+		}
+		if (error)
+			return error;
+
+		error = xqcheck_compare_dquot(xqc, dqtype, dq);
+		xfs_qm_dqput(dq);
+		if (error)
+			return error;
+
+		if (xchk_should_terminate(xqc->sc, &error))
+			return error;
+
+		mutex_lock(&xqc->lock);
+	}
+	mutex_unlock(&xqc->lock);
+
+	return error;
+}
+
+/*
+ * Check all the counters of one quota type: first walk the live dquots and
+ * compare each to its shadow record, then walk the shadow records that were
+ * not covered.  Does nothing if corruption has already been flagged.
+ */
+STATIC int
+xqcheck_compare_dqtype(
+	struct xqcheck		*xqc,
+	xfs_dqtype_t		dqtype)
+{
+	struct xchk_dqiter	cursor = { };
+	struct xfs_scrub	*sc = xqc->sc;
+	struct xfs_dquot	*dq;
+	int			error;
+
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return 0;
+
+	/* If the quota CHKD flag is cleared, we need to repair this quota. */
+	if ((sc->mp->m_qflags & xfs_quota_chkd_flag(dqtype)) == 0) {
+		xchk_qcheck_set_corrupt(xqc->sc, dqtype, 0);
+		return 0;
+	}
+
+	/* Compare what we observed against the actual dquots. */
+	xchk_dqiter_init(&cursor, sc, dqtype);
+	for (;;) {
+		error = xchk_dquot_iter(&cursor, &dq);
+		if (error != 1)
+			break;
+
+		error = xqcheck_compare_dquot(xqc, dqtype, dq);
+		xfs_qm_dqput(dq);
+		if (error)
+			return error;
+	}
+	if (error)
+		return error;
+
+	/* Walk all the observed dquots and compare to the incore ones. */
+	return xqcheck_walk_observations(xqc, dqtype);
+}
+
+/*
+ * Tear down everything associated with a quotacheck.
+ *
+ * @priv is the struct xqcheck. This is installed as sc->buf_cleanup by
+ * xqcheck_setup_scan() and is also called directly from that function's
+ * error path, so it must cope with a partially constructed structure.
+ */
+static void
+xqcheck_teardown_scan(
+ void *priv)
+{
+ struct xqcheck *xqc = priv;
+ struct xfs_quotainfo *qi = xqc->sc->mp->m_quotainfo;
+
+ /* Discourage any hook functions that might be running. */
+ xchk_iscan_abort(&xqc->iscan);
+
+ /*
+ * As noted above, the apply hook is responsible for cleaning up the
+ * shadow dquot accounting data when a transaction completes. The mod
+ * hook must be removed before the apply hook so that we don't
+ * mistakenly leave an active shadow account for the mod hook to get
+ * its hands on. No hooks should be running after these functions
+ * return.
+ */
+ xfs_dqtrx_hook_del(qi, &xqc->qhook);
+
+ /*
+ * NOTE(review): key_len appears to serve as the "hash table was
+ * initialized" marker here, since the structure starts out zeroed;
+ * confirm that rhashtable_init() always leaves it nonzero.
+ */
+ if (xqc->shadow_dquot_acct.key_len) {
+ rhashtable_free_and_destroy(&xqc->shadow_dquot_acct,
+ xqcheck_dqacct_free, NULL);
+ xqc->shadow_dquot_acct.key_len = 0;
+ }
+
+ /* Free whichever shadow counter arrays were created. */
+ if (xqc->pcounts) {
+ xfarray_destroy(xqc->pcounts);
+ xqc->pcounts = NULL;
+ }
+
+ if (xqc->gcounts) {
+ xfarray_destroy(xqc->gcounts);
+ xqc->gcounts = NULL;
+ }
+
+ if (xqc->ucounts) {
+ xfarray_destroy(xqc->ucounts);
+ xqc->ucounts = NULL;
+ }
+
+ xchk_iscan_teardown(&xqc->iscan);
+ mutex_destroy(&xqc->lock);
+ /* Clearing sc lets xqcheck_setup_scan()'s ASSERT pass on reuse. */
+ xqc->sc = NULL;
+}
+
+/*
+ * Scan all inodes in the entire filesystem to generate quota counter data.
+ * If the scan is successful, the quota data will be left alive for a repair.
+ * If any error occurs, we'll tear everything down.
+ *
+ * This function itself only builds the scan state: the lock, the inode scan,
+ * one shadow counter array per enabled quota type, the shadow accounting hash
+ * table, and the live update hooks. Returns 0 or a negative errno.
+ */
+STATIC int
+xqcheck_setup_scan(
+ struct xfs_scrub *sc,
+ struct xqcheck *xqc)
+{
+ char *descr;
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+ /* One shadow record slot for every possible dquot id. */
+ unsigned long long max_dquots = XFS_DQ_ID_MAX + 1ULL;
+ int error;
+
+ ASSERT(xqc->sc == NULL);
+ xqc->sc = sc;
+
+ mutex_init(&xqc->lock);
+
+ /* Retry iget every tenth of a second for up to 30 seconds. */
+ xchk_iscan_start(sc, 30000, 100, &xqc->iscan);
+
+ /* Every path below overwrites this before it can be returned. */
+ error = -ENOMEM;
+ if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_USER)) {
+ descr = xchk_xfile_descr(sc, "user dquot records");
+ error = xfarray_create(descr, max_dquots,
+ sizeof(struct xqcheck_dquot), &xqc->ucounts);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+ }
+
+ if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_GROUP)) {
+ descr = xchk_xfile_descr(sc, "group dquot records");
+ error = xfarray_create(descr, max_dquots,
+ sizeof(struct xqcheck_dquot), &xqc->gcounts);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+ }
+
+ if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_PROJ)) {
+ descr = xchk_xfile_descr(sc, "project dquot records");
+ error = xfarray_create(descr, max_dquots,
+ sizeof(struct xqcheck_dquot), &xqc->pcounts);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+ }
+
+ /*
+ * Set up hash table to map transactions to our internal shadow dqtrx
+ * structures.
+ */
+ error = rhashtable_init(&xqc->shadow_dquot_acct,
+ &xqcheck_dqacct_hash_params);
+ if (error)
+ goto out_teardown;
+
+ /*
+ * Hook into the quota code. The hook only triggers for inodes that
+ * were already scanned, and the scanner thread takes each inode's
+ * ILOCK, which means that any in-progress inode updates will finish
+ * before we can scan the inode.
+ *
+ * The apply hook (which removes the shadow dquot accounting struct)
+ * must be installed before the mod hook so that we never fail to catch
+ * the end of a quota update sequence and leave stale shadow data.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_QUOTA);
+ xfs_dqtrx_hook_setup(&xqc->qhook, xqcheck_mod_live_ino_dqtrx,
+ xqcheck_apply_live_dqtrx);
+
+ error = xfs_dqtrx_hook_add(qi, &xqc->qhook);
+ if (error)
+ goto out_teardown;
+
+ /* Use deferred cleanup to pass the quota count data to repair. */
+ sc->buf_cleanup = xqcheck_teardown_scan;
+ return 0;
+
+out_teardown:
+ /* The teardown function copes with whatever has been built so far. */
+ xqcheck_teardown_scan(xqc);
+ return error;
+}
+
+/*
+ * Scrub the quota counters of every enabled quota type: set up the live
+ * scan, collect usage from all inodes, then compare the results with the
+ * dquots.  Returns 0 when the outcome was recorded in the scrub flags, or a
+ * negative errno.
+ */
+int
+xchk_quotacheck(
+	struct xfs_scrub	*sc)
+{
+	static const xfs_dqtype_t dqtypes[] = {
+		XFS_DQTYPE_USER,
+		XFS_DQTYPE_GROUP,
+		XFS_DQTYPE_PROJ,
+	};
+	struct xqcheck		*xqc = sc->buf;
+	unsigned int		i;
+	int			error;
+
+	/* Check quota counters on the live filesystem. */
+	error = xqcheck_setup_scan(sc, xqc);
+	if (error)
+		return error;
+
+	/* Walk all inodes, picking up quota information. */
+	error = xqcheck_collect_counts(xqc);
+	if (!xchk_xref_process_error(sc, 0, 0, &error))
+		return error;
+
+	/* Fail fast if we're not playing with a full dataset. */
+	if (xchk_iscan_aborted(&xqc->iscan))
+		xchk_set_incomplete(sc);
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+		return 0;
+
+	/* Compare quota counters for each type that has shadow data. */
+	for (i = 0; i < ARRAY_SIZE(dqtypes); i++) {
+		if (!xqcheck_counters_for(xqc, dqtypes[i]))
+			continue;
+
+		error = xqcheck_compare_dqtype(xqc, dqtypes[i]);
+		if (!xchk_xref_process_error(sc, 0, 0, &error))
+			return error;
+	}
+
+	/* Check one last time for an incomplete dataset. */
+	if (xchk_iscan_aborted(&xqc->iscan))
+		xchk_set_incomplete(sc);
+
+	return 0;
+}
diff --git a/fs/xfs/scrub/quotacheck.h b/fs/xfs/scrub/quotacheck.h
new file mode 100644
index 000000000000..4ea5f249c978
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_QUOTACHECK_H__
+#define __XFS_SCRUB_QUOTACHECK_H__
+
+/*
+ * Quota counters for live quotacheck. One of these shadow records is kept per
+ * dquot id in the per-type counter arrays of struct xqcheck.
+ */
+struct xqcheck_dquot {
+ /* block usage count */
+ int64_t bcount;
+
+ /* inode usage count */
+ int64_t icount;
+
+ /* realtime block usage count */
+ int64_t rtbcount;
+
+ /* Record state; XQCHECK_DQUOT_* bits */
+ unsigned int flags;
+};
+
+/*
+ * State bits for xqcheck_dquot.flags.
+ *
+ * This incore dquot record has been written at least once. We never want to
+ * store an xqcheck_dquot that looks uninitialized.
+ */
+#define XQCHECK_DQUOT_WRITTEN (1U << 0)
+
+/* Already checked this dquot. */
+#define XQCHECK_DQUOT_COMPARE_SCANNED (1U << 1)
+
+/* Already repaired this dquot. */
+#define XQCHECK_DQUOT_REPAIR_SCANNED (1U << 2)
+
+/* Live quotacheck control structure. */
+struct xqcheck {
+ /* Scrub context that owns this check; NULL when not set up. */
+ struct xfs_scrub *sc;
+
+ /*
+ * Shadow dquot counter data for user, group and project quotas. An
+ * array is only created for a quota type that is enabled, so any of
+ * these may be NULL.
+ */
+ struct xfarray *ucounts;
+ struct xfarray *gcounts;
+ struct xfarray *pcounts;
+
+ /* Lock protecting quotacheck count observations */
+ struct mutex lock;
+
+ /* Inode scan; the live hooks abort it when they fail. */
+ struct xchk_iscan iscan;
+
+ /* Hooks into the quota code. */
+ struct xfs_dqtrx_hook qhook;
+
+ /*
+ * Shadow quota delta tracking structure: a hash table of
+ * struct xqcheck_dqacct keyed by transaction id.
+ */
+ struct rhashtable shadow_dquot_acct;
+};
+
+/*
+ * Map a quota type to its shadow counter array.  The result is NULL if no
+ * array was created for that type.  Any type other than user, group or
+ * project trips an assertion and also yields NULL.
+ */
+static inline struct xfarray *
+xqcheck_counters_for(
+	struct xqcheck		*xqc,
+	xfs_dqtype_t		dqtype)
+{
+	if (dqtype == XFS_DQTYPE_USER)
+		return xqc->ucounts;
+	if (dqtype == XFS_DQTYPE_GROUP)
+		return xqc->gcounts;
+	if (dqtype == XFS_DQTYPE_PROJ)
+		return xqc->pcounts;
+
+	ASSERT(0);
+	return NULL;
+}
+
+#endif /* __XFS_SCRUB_QUOTACHECK_H__ */
diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c
new file mode 100644
index 000000000000..dd8554c755b5
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck_repair.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_sb.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/quota.h"
+#include "scrub/quotacheck.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Quotacheck Repair
+ * ======================
+ *
+ * Use the live quota counter information that we collected to replace the
+ * counter values in the incore dquots. A scrub->repair cycle should have left
+ * the live data and hooks active, so this is safe so long as we make sure the
+ * dquot is locked.
+ */
+
+/* Commit new counters to a dquot. */
+static int
+xqcheck_commit_dquot(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype,
+ struct xfs_dquot *dq)
+{
+ struct xqcheck_dquot xcdq;
+ struct xfarray *counts = xqcheck_counters_for(xqc, dqtype);
+ int64_t delta;
+ bool dirty = false;
+ int error = 0;
+
+ /* Unlock the dquot just long enough to allocate a transaction. */
+ xfs_dqunlock(dq);
+ error = xchk_trans_alloc(xqc->sc, 0);
+ xfs_dqlock(dq);
+ if (error)
+ return error;
+
+ xfs_trans_dqjoin(xqc->sc->tp, dq);
+
+ if (xchk_iscan_aborted(&xqc->iscan)) {
+ error = -ECANCELED;
+ goto out_cancel;
+ }
+
+ mutex_lock(&xqc->lock);
+ error = xfarray_load_sparse(counts, dq->q_id, &xcdq);
+ if (error)
+ goto out_unlock;
+
+ /* Adjust counters as needed. */
+ delta = (int64_t)xcdq.icount - dq->q_ino.count;
+ if (delta) {
+ dq->q_ino.reserved += delta;
+ dq->q_ino.count += delta;
+ dirty = true;
+ }
+
+ delta = (int64_t)xcdq.bcount - dq->q_blk.count;
+ if (delta) {
+ dq->q_blk.reserved += delta;
+ dq->q_blk.count += delta;
+ dirty = true;
+ }
+
+ delta = (int64_t)xcdq.rtbcount - dq->q_rtb.count;
+ if (delta) {
+ dq->q_rtb.reserved += delta;
+ dq->q_rtb.count += delta;
+ dirty = true;
+ }
+
+ xcdq.flags |= (XQCHECK_DQUOT_REPAIR_SCANNED | XQCHECK_DQUOT_WRITTEN);
+ error = xfarray_store(counts, dq->q_id, &xcdq);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the repair
+ * and must cancel the whole operation. This should never
+ * happen, but we need to catch it anyway.
+ */
+ error = -ECANCELED;
+ }
+ mutex_unlock(&xqc->lock);
+ if (error || !dirty)
+ goto out_cancel;
+
+ trace_xrep_quotacheck_dquot(xqc->sc->mp, dq->q_type, dq->q_id);
+
+ /* Commit the dirty dquot to disk. */
+ dq->q_flags |= XFS_DQFLAG_DIRTY;
+ if (dq->q_id)
+ xfs_qm_adjust_dqtimers(dq);
+ xfs_trans_log_dquot(xqc->sc->tp, dq);
+
+ /*
+ * Transaction commit unlocks the dquot, so we must re-lock it so that
+ * the caller can put the reference (which apparently requires a locked
+ * dquot).
+ */
+ error = xrep_trans_commit(xqc->sc);
+ xfs_dqlock(dq);
+ return error;
+
+out_unlock:
+ mutex_unlock(&xqc->lock);
+out_cancel:
+ xchk_trans_cancel(xqc->sc);
+
+ /* Re-lock the dquot so the caller can put the reference. */
+ xfs_dqlock(dq);
+ return error;
+}
+
+/* Commit new quota counters for a particular quota type. */
+STATIC int
+xqcheck_commit_dqtype(
+ struct xqcheck *xqc,
+ unsigned int dqtype)
+{
+ struct xchk_dqiter cursor = { };
+ struct xqcheck_dquot xcdq;
+ struct xfs_scrub *sc = xqc->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfarray *counts = xqcheck_counters_for(xqc, dqtype);
+ struct xfs_dquot *dq;
+ xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
+ int error;
+
+ /*
+ * Update the counters of every dquot that the quota file knows about.
+ */
+ xchk_dqiter_init(&cursor, sc, dqtype);
+ while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ error = xqcheck_commit_dquot(xqc, dqtype, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ break;
+ }
+ if (error)
+ return error;
+
+ /*
+ * Make a second pass to deal with the dquots that we know about but
+ * the quota file previously did not know about.
+ */
+ mutex_lock(&xqc->lock);
+ while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) {
+ xfs_dqid_t id = cur - 1;
+
+ if (xcdq.flags & XQCHECK_DQUOT_REPAIR_SCANNED)
+ continue;
+
+ mutex_unlock(&xqc->lock);
+
+ /*
+ * Grab the dquot, allowing for dquot block allocation in a
+ * separate transaction. We committed the scrub transaction
+ * in a previous step, so we will not be creating nested
+ * transactions here.
+ */
+ error = xfs_qm_dqget(mp, id, dqtype, true, &dq);
+ if (error)
+ return error;
+
+ error = xqcheck_commit_dquot(xqc, dqtype, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ return error;
+
+ mutex_lock(&xqc->lock);
+ }
+ mutex_unlock(&xqc->lock);
+
+ return error;
+}
+
+/* Figure out quota CHKD flags for the running quota types. */
+static inline unsigned int
+xqcheck_chkd_flags(
+ struct xfs_mount *mp)
+{
+ unsigned int ret = 0;
+
+ if (XFS_IS_UQUOTA_ON(mp))
+ ret |= XFS_UQUOTA_CHKD;
+ if (XFS_IS_GQUOTA_ON(mp))
+ ret |= XFS_GQUOTA_CHKD;
+ if (XFS_IS_PQUOTA_ON(mp))
+ ret |= XFS_PQUOTA_CHKD;
+ return ret;
+}
+
+/* Commit the new dquot counters. */
+int
+xrep_quotacheck(
+ struct xfs_scrub *sc)
+{
+ struct xqcheck *xqc = sc->buf;
+ unsigned int qflags = xqcheck_chkd_flags(sc->mp);
+ int error;
+
+ /*
+ * Clear the CHKD flag for the running quota types and commit the scrub
+ * transaction so that we can allocate new quota block mappings if we
+ * have to. If we crash after this point, the sb still has the CHKD
+ * flags cleared, so mount quotacheck will fix all of this up.
+ */
+ xrep_update_qflags(sc, qflags, 0);
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ /* Commit the new counters to the dquots. */
+ if (xqc->ucounts) {
+ error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_USER);
+ if (error)
+ return error;
+ }
+ if (xqc->gcounts) {
+ error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_GROUP);
+ if (error)
+ return error;
+ }
+ if (xqc->pcounts) {
+ error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_PROJ);
+ if (error)
+ return error;
+ }
+
+ /* Set the CHKD flags now that we've fixed quota counts. */
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+
+ xrep_update_qflags(sc, 0, qflags);
+ return xrep_trans_commit(sc);
+}
diff --git a/fs/xfs/scrub/rcbag.c b/fs/xfs/scrub/rcbag.c
new file mode 100644
index 000000000000..e1e52bc20713
--- /dev/null
+++ b/fs/xfs/scrub/rcbag.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_error.h"
+#include "scrub/scrub.h"
+#include "scrub/rcbag_btree.h"
+#include "scrub/rcbag.h"
+#include "scrub/trace.h"
+
+struct rcbag {
+ struct xfs_mount *mp;
+ struct xfbtree xfbtree;
+ uint64_t nr_items;
+};
+
+int
+rcbag_init(
+ struct xfs_mount *mp,
+ struct xfs_buftarg *btp,
+ struct rcbag **bagp)
+{
+ struct rcbag *bag;
+ int error;
+
+ bag = kzalloc(sizeof(struct rcbag), XCHK_GFP_FLAGS);
+ if (!bag)
+ return -ENOMEM;
+
+ bag->nr_items = 0;
+ bag->mp = mp;
+
+ error = rcbagbt_mem_init(mp, &bag->xfbtree, btp);
+ if (error)
+ goto out_bag;
+
+ *bagp = bag;
+ return 0;
+
+out_bag:
+ kfree(bag);
+ return error;
+}
+
+void
+rcbag_free(
+ struct rcbag **bagp)
+{
+ struct rcbag *bag = *bagp;
+
+ xfbtree_destroy(&bag->xfbtree);
+ kfree(bag);
+ *bagp = NULL;
+}
+
+/* Track an rmap in the refcount bag. */
+int
+rcbag_add(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ const struct xfs_rmap_irec *rmap)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ int has;
+ int error;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = rcbagbt_lookup_eq(cur, rmap, &has);
+ if (error)
+ goto out_cur;
+
+ if (has) {
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ bagrec.rbg_refcount++;
+ error = rcbagbt_update(cur, &bagrec);
+ if (error)
+ goto out_cur;
+ } else {
+ bagrec.rbg_startblock = rmap->rm_startblock;
+ bagrec.rbg_blockcount = rmap->rm_blockcount;
+ bagrec.rbg_refcount = 1;
+
+ error = rcbagbt_insert(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+
+ error = xfbtree_trans_commit(&bag->xfbtree, tp);
+ if (error)
+ return error;
+
+ bag->nr_items++;
+ return 0;
+
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ xfbtree_trans_cancel(&bag->xfbtree, tp);
+ return error;
+}
+
+/* Return the number of rmaps currently tracked in the bag. */
+uint64_t
+rcbag_count(
+ const struct rcbag *rcbag)
+{
+ return rcbag->nr_items;
+}
+
+static inline uint32_t rcbag_rec_next_bno(const struct rcbag_rec *r)
+{
+ return r->rbg_startblock + r->rbg_blockcount;
+}
+
+/*
+ * Find the next block where the refcount changes, given the next rmap we
+ * looked at and the ones we're already tracking.
+ */
+int
+rcbag_next_edge(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ const struct xfs_rmap_irec *next_rmap,
+ bool next_valid,
+ uint32_t *next_bnop)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ uint32_t next_bno = NULLAGBLOCK;
+ int has;
+ int error;
+
+ if (next_valid)
+ next_bno = next_rmap->rm_startblock;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = xfs_btree_goto_left_edge(cur);
+ if (error)
+ goto out_cur;
+
+ while (true) {
+ error = xfs_btree_increment(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ next_bno = min(next_bno, rcbag_rec_next_bno(&bagrec));
+ }
+
+ /*
+ * We should have found /something/ because either next_rmap is the next
+ * interesting rmap to look at after emitting this refcount extent, or
+ * there are other rmaps in the bag contributing to the current
+ * sharing count. But if something is seriously wrong, bail out.
+ */
+ if (next_bno == NULLAGBLOCK) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+
+ *next_bnop = next_bno;
+ return 0;
+
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/* Pop all refcount bag records that end at next_bno */
+int
+rcbag_remove_ending_at(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ uint32_t next_bno)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ int has;
+ int error;
+
+ /* go to the right edge of the tree */
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ memset(&cur->bc_rec, 0xFF, sizeof(cur->bc_rec));
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &has);
+ if (error)
+ goto out_cur;
+
+ while (true) {
+ error = xfs_btree_decrement(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ if (rcbag_rec_next_bno(&bagrec) != next_bno)
+ continue;
+
+ error = xfs_btree_delete(cur, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ bag->nr_items -= bagrec.rbg_refcount;
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+ return xfbtree_trans_commit(&bag->xfbtree, tp);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ xfbtree_trans_cancel(&bag->xfbtree, tp);
+ return error;
+}
+
+/* Dump the rcbag. */
+void
+rcbag_dump(
+ struct rcbag *bag,
+ struct xfs_trans *tp)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ unsigned long long nr = 0;
+ int has;
+ int error;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = xfs_btree_goto_left_edge(cur);
+ if (error)
+ goto out_cur;
+
+ while (true) {
+ error = xfs_btree_increment(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ xfs_err(bag->mp, "[%llu]: bno 0x%x fsbcount 0x%x refcount 0x%llx\n",
+ nr++,
+ (unsigned int)bagrec.rbg_startblock,
+ (unsigned int)bagrec.rbg_blockcount,
+ (unsigned long long)bagrec.rbg_refcount);
+ }
+
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+}
diff --git a/fs/xfs/scrub/rcbag.h b/fs/xfs/scrub/rcbag.h
new file mode 100644
index 000000000000..e29ef788ba72
--- /dev/null
+++ b/fs/xfs/scrub/rcbag.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RCBAG_H__
+#define __XFS_SCRUB_RCBAG_H__
+
+struct xfs_mount;
+struct rcbag;
+struct xfs_buftarg;
+
+int rcbag_init(struct xfs_mount *mp, struct xfs_buftarg *btp,
+ struct rcbag **bagp);
+void rcbag_free(struct rcbag **bagp);
+int rcbag_add(struct rcbag *bag, struct xfs_trans *tp,
+ const struct xfs_rmap_irec *rmap);
+uint64_t rcbag_count(const struct rcbag *bag);
+
+int rcbag_next_edge(struct rcbag *bag, struct xfs_trans *tp,
+ const struct xfs_rmap_irec *next_rmap, bool next_valid,
+ uint32_t *next_bnop);
+int rcbag_remove_ending_at(struct rcbag *bag, struct xfs_trans *tp,
+ uint32_t next_bno);
+
+void rcbag_dump(struct rcbag *bag, struct xfs_trans *tp);
+
+#endif /* __XFS_SCRUB_RCBAG_H__ */
diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c
new file mode 100644
index 000000000000..709356dc6256
--- /dev/null
+++ b/fs/xfs/scrub/rcbag_btree.c
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_error.h"
+#include "scrub/rcbag_btree.h"
+#include "scrub/trace.h"
+
+static struct kmem_cache *rcbagbt_cur_cache;
+
+STATIC void
+rcbagbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ struct rcbag_key *bag_key = (struct rcbag_key *)key;
+ const struct rcbag_rec *bag_rec = (const struct rcbag_rec *)rec;
+
+ BUILD_BUG_ON(sizeof(struct rcbag_key) > sizeof(union xfs_btree_key));
+ BUILD_BUG_ON(sizeof(struct rcbag_rec) > sizeof(union xfs_btree_rec));
+
+ bag_key->rbg_startblock = bag_rec->rbg_startblock;
+ bag_key->rbg_blockcount = bag_rec->rbg_blockcount;
+}
+
+STATIC void
+rcbagbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ struct rcbag_rec *bag_rec = (struct rcbag_rec *)rec;
+ struct rcbag_rec *bag_irec = (struct rcbag_rec *)&cur->bc_rec;
+
+ bag_rec->rbg_startblock = bag_irec->rbg_startblock;
+ bag_rec->rbg_blockcount = bag_irec->rbg_blockcount;
+ bag_rec->rbg_refcount = bag_irec->rbg_refcount;
+}
+
+STATIC int64_t
+rcbagbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec;
+ const struct rcbag_key *kp = (const struct rcbag_key *)key;
+
+ if (kp->rbg_startblock > rec->rbg_startblock)
+ return 1;
+ if (kp->rbg_startblock < rec->rbg_startblock)
+ return -1;
+
+ if (kp->rbg_blockcount > rec->rbg_blockcount)
+ return 1;
+ if (kp->rbg_blockcount < rec->rbg_blockcount)
+ return -1;
+
+ return 0;
+}
+
+STATIC int64_t
+rcbagbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2,
+ const union xfs_btree_key *mask)
+{
+ const struct rcbag_key *kp1 = (const struct rcbag_key *)k1;
+ const struct rcbag_key *kp2 = (const struct rcbag_key *)k2;
+
+ ASSERT(mask == NULL);
+
+ if (kp1->rbg_startblock > kp2->rbg_startblock)
+ return 1;
+ if (kp1->rbg_startblock < kp2->rbg_startblock)
+ return -1;
+
+ if (kp1->rbg_blockcount > kp2->rbg_blockcount)
+ return 1;
+ if (kp1->rbg_blockcount < kp2->rbg_blockcount)
+ return -1;
+
+ return 0;
+}
+
+STATIC int
+rcbagbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ const struct rcbag_key *kp1 = (const struct rcbag_key *)k1;
+ const struct rcbag_key *kp2 = (const struct rcbag_key *)k2;
+
+ if (kp1->rbg_startblock > kp2->rbg_startblock)
+ return 0;
+ if (kp1->rbg_startblock < kp2->rbg_startblock)
+ return 1;
+
+ if (kp1->rbg_blockcount > kp2->rbg_blockcount)
+ return 0;
+ if (kp1->rbg_blockcount < kp2->rbg_blockcount)
+ return 1;
+
+ return 0;
+}
+
+STATIC int
+rcbagbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ const struct rcbag_rec *rp1 = (const struct rcbag_rec *)r1;
+ const struct rcbag_rec *rp2 = (const struct rcbag_rec *)r2;
+
+ if (rp1->rbg_startblock > rp2->rbg_startblock)
+ return 0;
+ if (rp1->rbg_startblock < rp2->rbg_startblock)
+ return 1;
+
+ if (rp1->rbg_blockcount > rp2->rbg_blockcount)
+ return 0;
+ if (rp1->rbg_blockcount < rp2->rbg_blockcount)
+ return 1;
+
+ return 0;
+}
+
+static xfs_failaddr_t
+rcbagbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ unsigned int level;
+ unsigned int maxrecs;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+
+ level = be16_to_cpu(block->bb_level);
+ if (level >= rcbagbt_maxlevels_possible())
+ return __this_address;
+
+ maxrecs = rcbagbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0);
+ return xfs_btree_memblock_verify(bp, maxrecs);
+}
+
+static void
+rcbagbt_rw_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa = rcbagbt_verify(bp);
+
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/* skip crc checks on in-memory btrees to save time */
+static const struct xfs_buf_ops rcbagbt_mem_buf_ops = {
+ .name = "rcbagbt_mem",
+ .magic = { 0, cpu_to_be32(RCBAG_MAGIC) },
+ .verify_read = rcbagbt_rw_verify,
+ .verify_write = rcbagbt_rw_verify,
+ .verify_struct = rcbagbt_verify,
+};
+
+static const struct xfs_btree_ops rcbagbt_mem_ops = {
+ .name = "rcbag",
+ .type = XFS_BTREE_TYPE_MEM,
+
+ .rec_len = sizeof(struct rcbag_rec),
+ .key_len = sizeof(struct rcbag_key),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = 1,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rcbag_2),
+
+ .dup_cursor = xfbtree_dup_cursor,
+ .set_root = xfbtree_set_root,
+ .alloc_block = xfbtree_alloc_block,
+ .free_block = xfbtree_free_block,
+ .get_minrecs = xfbtree_get_minrecs,
+ .get_maxrecs = xfbtree_get_maxrecs,
+ .init_key_from_rec = rcbagbt_init_key_from_rec,
+ .init_rec_from_cur = rcbagbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfbtree_init_ptr_from_cur,
+ .key_diff = rcbagbt_key_diff,
+ .buf_ops = &rcbagbt_mem_buf_ops,
+ .diff_two_keys = rcbagbt_diff_two_keys,
+ .keys_inorder = rcbagbt_keys_inorder,
+ .recs_inorder = rcbagbt_recs_inorder,
+};
+
+/* Create a cursor for an in-memory btree. */
+struct xfs_btree_cur *
+rcbagbt_mem_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfbtree *xfbtree)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_btree_alloc_cursor(mp, tp, &rcbagbt_mem_ops,
+ rcbagbt_maxlevels_possible(), rcbagbt_cur_cache);
+
+ cur->bc_mem.xfbtree = xfbtree;
+ cur->bc_nlevels = xfbtree->nlevels;
+ return cur;
+}
+
+/* Create an in-memory refcount bag btree. */
+int
+rcbagbt_mem_init(
+ struct xfs_mount *mp,
+ struct xfbtree *xfbt,
+ struct xfs_buftarg *btp)
+{
+ xfbt->owner = 0;
+ return xfbtree_init(mp, xfbt, btp, &rcbagbt_mem_ops);
+}
+
+/* Calculate number of records in a refcount bag btree block. */
+static inline unsigned int
+rcbagbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct rcbag_rec);
+ return blocklen /
+ (sizeof(struct rcbag_key) + sizeof(rcbag_ptr_t));
+}
+
+/*
+ * Calculate number of records in a refcount bag btree block.
+ */
+unsigned int
+rcbagbt_maxrecs(
+ struct xfs_mount *mp,
+ unsigned int blocklen,
+ bool leaf)
+{
+ blocklen -= RCBAG_BLOCK_LEN;
+ return rcbagbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height for refcount bag btrees. */
+unsigned int
+rcbagbt_maxlevels_possible(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_space_to_height(minrecs, ULLONG_MAX);
+}
+
+/* Calculate the refcount bag btree size for some records. */
+unsigned long long
+rcbagbt_calc_size(
+ unsigned long long nr_records)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_calc_size(minrecs, nr_records);
+}
+
+int __init
+rcbagbt_init_cur_cache(void)
+{
+ rcbagbt_cur_cache = kmem_cache_create("xfs_rcbagbt_cur",
+ xfs_btree_cur_sizeof(rcbagbt_maxlevels_possible()),
+ 0, 0, NULL);
+
+ if (!rcbagbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+rcbagbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(rcbagbt_cur_cache);
+ rcbagbt_cur_cache = NULL;
+}
+
+/* Look up the refcount bag record corresponding to this reverse mapping. */
+int
+rcbagbt_lookup_eq(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rmap,
+ int *success)
+{
+ struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec;
+
+ rec->rbg_startblock = rmap->rm_startblock;
+ rec->rbg_blockcount = rmap->rm_blockcount;
+
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, success);
+}
+
+/* Get the data from the pointed-to record. */
+int
+rcbagbt_get_rec(
+ struct xfs_btree_cur *cur,
+ struct rcbag_rec *rec,
+ int *has)
+{
+ union xfs_btree_rec *btrec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &btrec, has);
+ if (error || !(*has))
+ return error;
+
+ memcpy(rec, btrec, sizeof(struct rcbag_rec));
+ return 0;
+}
+
+/* Update the record referred to by cur to the value given. */
+int
+rcbagbt_update(
+ struct xfs_btree_cur *cur,
+ const struct rcbag_rec *rec)
+{
+ union xfs_btree_rec btrec;
+
+ memcpy(&btrec, rec, sizeof(struct rcbag_rec));
+ return xfs_btree_update(cur, &btrec);
+}
+
+/* Insert the given record into the btree referred to by cur. */
+int
+rcbagbt_insert(
+ struct xfs_btree_cur *cur,
+ const struct rcbag_rec *rec,
+ int *success)
+{
+ struct rcbag_rec *btrec = (struct rcbag_rec *)&cur->bc_rec;
+
+ memcpy(btrec, rec, sizeof(struct rcbag_rec));
+ return xfs_btree_insert(cur, success);
+}
diff --git a/fs/xfs/scrub/rcbag_btree.h b/fs/xfs/scrub/rcbag_btree.h
new file mode 100644
index 000000000000..03cadb032552
--- /dev/null
+++ b/fs/xfs/scrub/rcbag_btree.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RCBAG_BTREE_H__
+#define __XFS_SCRUB_RCBAG_BTREE_H__
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+#define RCBAG_MAGIC 0x74826671 /* 'JRBG' */
+
+struct rcbag_key {
+ uint32_t rbg_startblock;
+ uint32_t rbg_blockcount;
+};
+
+struct rcbag_rec {
+ uint32_t rbg_startblock;
+ uint32_t rbg_blockcount;
+ uint64_t rbg_refcount;
+};
+
+typedef __be64 rcbag_ptr_t;
+
+/* reflinks only exist on crc enabled filesystems */
+#define RCBAG_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define RCBAG_REC_ADDR(block, index) \
+ ((struct rcbag_rec *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ (((index) - 1) * sizeof(struct rcbag_rec))))
+
+#define RCBAG_KEY_ADDR(block, index) \
+ ((struct rcbag_key *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ ((index) - 1) * sizeof(struct rcbag_key)))
+
+#define RCBAG_PTR_ADDR(block, index, maxrecs) \
+ ((rcbag_ptr_t *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ (maxrecs) * sizeof(struct rcbag_key) + \
+ ((index) - 1) * sizeof(rcbag_ptr_t)))
+
+unsigned int rcbagbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+ bool leaf);
+
+unsigned long long rcbagbt_calc_size(unsigned long long nr_records);
+
+unsigned int rcbagbt_maxlevels_possible(void);
+
+int __init rcbagbt_init_cur_cache(void);
+void rcbagbt_destroy_cur_cache(void);
+
+struct xfs_btree_cur *rcbagbt_mem_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfbtree *xfbtree);
+int rcbagbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
+ struct xfs_buftarg *btp);
+
+int rcbagbt_lookup_eq(struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rmap, int *success);
+int rcbagbt_get_rec(struct xfs_btree_cur *cur, struct rcbag_rec *rec, int *has);
+int rcbagbt_update(struct xfs_btree_cur *cur, const struct rcbag_rec *rec);
+int rcbagbt_insert(struct xfs_btree_cur *cur, const struct rcbag_rec *rec,
+ int *success);
+
+#else
+# define rcbagbt_init_cur_cache() 0
+# define rcbagbt_destroy_cur_cache() ((void)0)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
+#endif /* __XFS_SCRUB_RCBAG_BTREE_H__ */
diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c
index 16462332c897..dfdcb96b6c16 100644
--- a/fs/xfs/scrub/readdir.c
+++ b/fs/xfs/scrub/readdir.c
@@ -281,7 +281,7 @@ xchk_dir_walk(
return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
- ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
return xchk_dir_walk_sf(sc, dp, dirent_fn, priv);
@@ -332,7 +332,7 @@ xchk_dir_lookup(
return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
- ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
error = xfs_dir2_sf_lookup(&args);
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index f99eca799809..0252a3b5b65a 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -114,7 +114,7 @@ xreap_put_freelist(
int error;
/* Make sure there's space on the freelist. */
- error = xrep_fix_freelist(sc, true);
+ error = xrep_fix_freelist(sc, 0);
if (error)
return error;
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index bf22f245bbfa..d0c7d4a29c0f 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -7,8 +7,10 @@
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_ag.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
@@ -17,6 +19,7 @@
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
+#include "scrub/repair.h"
/*
* Set us up to scrub reference count btrees.
@@ -27,6 +30,15 @@ xchk_setup_ag_refcountbt(
{
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ if (xchk_could_repair(sc)) {
+ int error;
+
+ error = xrep_setup_ag_refcountbt(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_ag_btree(sc, false);
}
diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c
index f38fccc42a20..a00d7ce7ae5b 100644
--- a/fs/xfs/scrub/refcount_repair.c
+++ b/fs/xfs/scrub/refcount_repair.c
@@ -25,6 +25,7 @@
#include "xfs_refcount_btree.h"
#include "xfs_error.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -37,6 +38,7 @@
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"
+#include "scrub/rcbag.h"
/*
* Rebuilding the Reference Count Btree
@@ -97,12 +99,6 @@
* insert all the records.
*/
-/* The only parts of the rmap that we care about for computing refcounts. */
-struct xrep_refc_rmap {
- xfs_agblock_t startblock;
- xfs_extlen_t blockcount;
-} __packed;
-
struct xrep_refc {
/* refcount extents */
struct xfarray *refcount_records;
@@ -122,6 +118,20 @@ struct xrep_refc {
xfs_extlen_t btblocks;
};
+/* Set us up to repair refcount btrees. */
+int
+xrep_setup_ag_refcountbt(
+ struct xfs_scrub *sc)
+{
+ char *descr;
+ int error;
+
+ descr = xchk_xfile_ag_descr(sc, "rmap record bag");
+ error = xrep_setup_xfbtree(sc, descr);
+ kfree(descr);
+ return error;
+}
+
/* Check for any obvious conflicts with this shared/CoW staging extent. */
STATIC int
xrep_refc_check_ext(
@@ -223,10 +233,9 @@ xrep_refc_rmap_shareable(
STATIC int
xrep_refc_walk_rmaps(
struct xrep_refc *rr,
- struct xrep_refc_rmap *rrm,
+ struct xfs_rmap_irec *rmap,
bool *have_rec)
{
- struct xfs_rmap_irec rmap;
struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur;
struct xfs_mount *mp = cur->bc_mp;
int have_gt;
@@ -250,29 +259,30 @@ xrep_refc_walk_rmaps(
if (!have_gt)
return 0;
- error = xfs_rmap_get_rec(cur, &rmap, &have_gt);
+ error = xfs_rmap_get_rec(cur, rmap, &have_gt);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, !have_gt))
+ if (XFS_IS_CORRUPT(mp, !have_gt)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
- if (rmap.rm_owner == XFS_RMAP_OWN_COW) {
- error = xrep_refc_stash_cow(rr, rmap.rm_startblock,
- rmap.rm_blockcount);
+ if (rmap->rm_owner == XFS_RMAP_OWN_COW) {
+ error = xrep_refc_stash_cow(rr, rmap->rm_startblock,
+ rmap->rm_blockcount);
if (error)
return error;
- } else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) {
+ } else if (rmap->rm_owner == XFS_RMAP_OWN_REFC) {
/* refcountbt block, dump it when we're done. */
- rr->btblocks += rmap.rm_blockcount;
+ rr->btblocks += rmap->rm_blockcount;
error = xagb_bitmap_set(&rr->old_refcountbt_blocks,
- rmap.rm_startblock, rmap.rm_blockcount);
+ rmap->rm_startblock,
+ rmap->rm_blockcount);
if (error)
return error;
}
- } while (!xrep_refc_rmap_shareable(mp, &rmap));
+ } while (!xrep_refc_rmap_shareable(mp, rmap));
- rrm->startblock = rmap.rm_startblock;
- rrm->blockcount = rmap.rm_blockcount;
*have_rec = true;
return 0;
}
@@ -354,45 +364,6 @@ xrep_refc_sort_records(
return error;
}
-#define RRM_NEXT(r) ((r).startblock + (r).blockcount)
-/*
- * Find the next block where the refcount changes, given the next rmap we
- * looked at and the ones we're already tracking.
- */
-static inline int
-xrep_refc_next_edge(
- struct xfarray *rmap_bag,
- struct xrep_refc_rmap *next_rrm,
- bool next_valid,
- xfs_agblock_t *nbnop)
-{
- struct xrep_refc_rmap rrm;
- xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT;
- xfs_agblock_t nbno = NULLAGBLOCK;
- int error;
-
- if (next_valid)
- nbno = next_rrm->startblock;
-
- while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1)
- nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm));
-
- if (error)
- return error;
-
- /*
- * We should have found /something/ because either next_rrm is the next
- * interesting rmap to look at after emitting this refcount extent, or
- * there are other rmaps in rmap_bag contributing to the current
- * sharing count. But if something is seriously wrong, bail out.
- */
- if (nbno == NULLAGBLOCK)
- return -EFSCORRUPTED;
-
- *nbnop = nbno;
- return 0;
-}
-
/*
* Walk forward through the rmap btree to collect all rmaps starting at
* @bno in @rmap_bag. These represent the file(s) that share ownership of
@@ -402,22 +373,21 @@ xrep_refc_next_edge(
static int
xrep_refc_push_rmaps_at(
struct xrep_refc *rr,
- struct xfarray *rmap_bag,
+ struct rcbag *rcstack,
xfs_agblock_t bno,
- struct xrep_refc_rmap *rrm,
- bool *have,
- uint64_t *stack_sz)
+ struct xfs_rmap_irec *rmap,
+ bool *have)
{
struct xfs_scrub *sc = rr->sc;
int have_gt;
int error;
- while (*have && rrm->startblock == bno) {
- error = xfarray_store_anywhere(rmap_bag, rrm);
+ while (*have && rmap->rm_startblock == bno) {
+ error = rcbag_add(rcstack, rr->sc->tp, rmap);
if (error)
return error;
- (*stack_sz)++;
- error = xrep_refc_walk_rmaps(rr, rrm, have);
+
+ error = xrep_refc_walk_rmaps(rr, rmap, have);
if (error)
return error;
}
@@ -425,8 +395,10 @@ xrep_refc_push_rmaps_at(
error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt);
if (error)
return error;
- if (XFS_IS_CORRUPT(sc->mp, !have_gt))
+ if (XFS_IS_CORRUPT(sc->mp, !have_gt)) {
+ xfs_btree_mark_sick(sc->sa.rmap_cur);
return -EFSCORRUPTED;
+ }
return 0;
}
@@ -436,12 +408,9 @@ STATIC int
xrep_refc_find_refcounts(
struct xrep_refc *rr)
{
- struct xrep_refc_rmap rrm;
struct xfs_scrub *sc = rr->sc;
- struct xfarray *rmap_bag;
- char *descr;
- uint64_t old_stack_sz;
- uint64_t stack_sz = 0;
+ struct rcbag *rcstack;
+ uint64_t old_stack_height;
xfs_agblock_t sbno;
xfs_agblock_t cbno;
xfs_agblock_t nbno;
@@ -451,14 +420,11 @@ xrep_refc_find_refcounts(
xrep_ag_btcur_init(sc, &sc->sa);
/*
- * Set up a sparse array to store all the rmap records that we're
- * tracking to generate a reference count record. If this exceeds
+ * Set up a bag to store all the rmap records that we're tracking to
+ * generate a reference count record. If the size of the bag exceeds
* MAXREFCOUNT, we clamp rc_refcount.
*/
- descr = xchk_xfile_ag_descr(sc, "rmap record bag");
- error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap),
- &rmap_bag);
- kfree(descr);
+ error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
if (error)
goto out_cur;
@@ -469,62 +435,54 @@ xrep_refc_find_refcounts(
/* Process reverse mappings into refcount data. */
while (xfs_btree_has_more_records(sc->sa.rmap_cur)) {
+ struct xfs_rmap_irec rmap;
+
/* Push all rmaps with pblk == sbno onto the stack */
- error = xrep_refc_walk_rmaps(rr, &rrm, &have);
+ error = xrep_refc_walk_rmaps(rr, &rmap, &have);
if (error)
goto out_bag;
if (!have)
break;
- sbno = cbno = rrm.startblock;
- error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno,
- &rrm, &have, &stack_sz);
+ sbno = cbno = rmap.rm_startblock;
+ error = xrep_refc_push_rmaps_at(rr, rcstack, sbno, &rmap,
+ &have);
if (error)
goto out_bag;
/* Set nbno to the bno of the next refcount change */
- error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno);
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno);
if (error)
goto out_bag;
ASSERT(nbno > sbno);
- old_stack_sz = stack_sz;
+ old_stack_height = rcbag_count(rcstack);
/* While stack isn't empty... */
- while (stack_sz) {
- xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT;
-
+ while (rcbag_count(rcstack) > 0) {
/* Pop all rmaps that end at nbno */
- while ((error = xfarray_iter(rmap_bag, &array_cur,
- &rrm)) == 1) {
- if (RRM_NEXT(rrm) != nbno)
- continue;
- error = xfarray_unset(rmap_bag, array_cur - 1);
- if (error)
- goto out_bag;
- stack_sz--;
- }
+ error = rcbag_remove_ending_at(rcstack, sc->tp, nbno);
if (error)
goto out_bag;
/* Push array items that start at nbno */
- error = xrep_refc_walk_rmaps(rr, &rrm, &have);
+ error = xrep_refc_walk_rmaps(rr, &rmap, &have);
if (error)
goto out_bag;
if (have) {
- error = xrep_refc_push_rmaps_at(rr, rmap_bag,
- nbno, &rrm, &have, &stack_sz);
+ error = xrep_refc_push_rmaps_at(rr, rcstack,
+ nbno, &rmap, &have);
if (error)
goto out_bag;
}
/* Emit refcount if necessary */
ASSERT(nbno > cbno);
- if (stack_sz != old_stack_sz) {
- if (old_stack_sz > 1) {
+ if (rcbag_count(rcstack) != old_stack_height) {
+ if (old_stack_height > 1) {
error = xrep_refc_stash(rr,
XFS_REFC_DOMAIN_SHARED,
cbno, nbno - cbno,
- old_stack_sz);
+ old_stack_height);
if (error)
goto out_bag;
}
@@ -532,13 +490,13 @@ xrep_refc_find_refcounts(
}
/* Stack empty, go find the next rmap */
- if (stack_sz == 0)
+ if (rcbag_count(rcstack) == 0)
break;
- old_stack_sz = stack_sz;
+ old_stack_height = rcbag_count(rcstack);
sbno = nbno;
/* Set nbno to the bno of the next refcount change */
- error = xrep_refc_next_edge(rmap_bag, &rrm, have,
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have,
&nbno);
if (error)
goto out_bag;
@@ -547,14 +505,13 @@ xrep_refc_find_refcounts(
}
}
- ASSERT(stack_sz == 0);
+ ASSERT(rcbag_count(rcstack) == 0);
out_bag:
- xfarray_destroy(rmap_bag);
+ rcbag_free(&rcstack);
out_cur:
xchk_ag_btcur_free(&sc->sa);
return error;
}
-#undef RRM_NEXT
/* Retrieve refcountbt data for bulk load. */
STATIC int
@@ -653,8 +610,8 @@ xrep_refc_build_new_tree(
rr->new_btree.bload.claim_block = xrep_refc_claim_block;
/* Compute how many blocks we'll need. */
- refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake,
- pag);
+ refc_cur = xfs_refcountbt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(refc_cur, &rr->new_btree.afake);
error = xfs_btree_bload_compute_geometry(refc_cur,
&rr->new_btree.bload,
xfarray_length(rr->refcount_records));
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 745d5b8f405a..f43dce771cdd 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -30,12 +30,15 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_reflink.h"
+#include "xfs_health.h"
+#include "xfs_buf_mem.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/stats.h"
+#include "scrub/xfile.h"
/*
* Attempt to repair some metadata, if the metadata is corrupt and userspace
@@ -400,7 +403,7 @@ xrep_calc_ag_resblks(
int
xrep_fix_freelist(
struct xfs_scrub *sc,
- bool can_shrink)
+ int alloc_flags)
{
struct xfs_alloc_arg args = {0};
@@ -410,8 +413,7 @@ xrep_fix_freelist(
args.alignment = 1;
args.pag = sc->sa.pag;
- return xfs_alloc_fix_freelist(&args,
- can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
+ return xfs_alloc_fix_freelist(&args, alloc_flags);
}
/*
@@ -687,6 +689,44 @@ xrep_find_ag_btree_roots(
}
#ifdef CONFIG_XFS_QUOTA
+/* Update some quota flags in the superblock. */
+void
+xrep_update_qflags(
+ struct xfs_scrub *sc,
+ unsigned int clear_flags,
+ unsigned int set_flags)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+
+ mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
+ if ((mp->m_qflags & clear_flags) == 0 &&
+ (mp->m_qflags & set_flags) == set_flags)
+ goto no_update;
+
+ mp->m_qflags &= ~clear_flags;
+ mp->m_qflags |= set_flags;
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_qflags &= ~clear_flags;
+ mp->m_sb.sb_qflags |= set_flags;
+ spin_unlock(&mp->m_sb_lock);
+
+ /*
+ * Update the quota flags in the ondisk superblock without touching
+ * the summary counters. We have not quiesced inode chunk allocation,
+ * so we cannot coordinate with updates to the icount and ifree percpu
+ * counters.
+ */
+ bp = xfs_trans_getsb(sc->tp);
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
+ xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1);
+
+no_update:
+ mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+}
+
/* Force a quotacheck the next time we mount. */
void
xrep_force_quotacheck(
@@ -699,13 +739,7 @@ xrep_force_quotacheck(
if (!(flag & sc->mp->m_qflags))
return;
- mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
- sc->mp->m_qflags &= ~flag;
- spin_lock(&sc->mp->m_sb_lock);
- sc->mp->m_sb.sb_qflags &= ~flag;
- spin_unlock(&sc->mp->m_sb_lock);
- xfs_log_sb(sc->tp);
- mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+ xrep_update_qflags(sc, flag, 0);
}
/*
@@ -799,20 +833,20 @@ xrep_ag_btcur_init(
/* Set up a bnobt cursor for cross-referencing. */
if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT &&
sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) {
- sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
- sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag);
+ sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag);
}
/* Set up a inobt cursor for cross-referencing. */
if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT &&
sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) {
sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
- sa->agi_bp, XFS_BTNUM_INO);
+ sa->agi_bp);
if (xfs_has_finobt(mp))
- sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag,
- sc->tp, sa->agi_bp, XFS_BTNUM_FINO);
+ sa->fino_cur = xfs_finobt_init_cursor(sc->sa.pag,
+ sc->tp, sa->agi_bp);
}
/* Set up a rmapbt cursor for cross-referencing. */
@@ -1115,3 +1149,55 @@ xrep_metadata_inode_forks(
return 0;
}
+
+/*
+ * Set up an in-memory buffer cache so that we can use the xfbtree. Allocating
+ * a shmem file might take locks, so we cannot be in transaction context. Park
+ * our resources in the scrub context and let the teardown function take care
+ * of them at the right time.
+ */
+int
+xrep_setup_xfbtree(
+ struct xfs_scrub *sc,
+ const char *descr)
+{
+ ASSERT(sc->tp == NULL);
+
+ return xmbuf_alloc(sc->mp, descr, &sc->xmbtp);
+}
+
+/*
+ * Create a dummy transaction for use in a live update hook function. This
+ * function MUST NOT be called from regular repair code because the current
+ * process' transaction is saved via the cookie.
+ */
+int
+xrep_trans_alloc_hook_dummy(
+ struct xfs_mount *mp,
+ void **cookiep,
+ struct xfs_trans **tpp)
+{
+ int error;
+
+ *cookiep = current->journal_info;
+ current->journal_info = NULL;
+
+ error = xfs_trans_alloc_empty(mp, tpp);
+ if (!error)
+ return 0;
+
+ current->journal_info = *cookiep;
+ *cookiep = NULL;
+ return error;
+}
+
+/* Cancel a dummy transaction used by a live update hook function. */
+void
+xrep_trans_cancel_hook_dummy(
+ void **cookiep,
+ struct xfs_trans *tp)
+{
+ xfs_trans_cancel(tp);
+ current->journal_info = *cookiep;
+ *cookiep = NULL;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 17114327e6fa..ce082d941459 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -51,7 +51,7 @@ struct xbitmap;
struct xagb_bitmap;
struct xfsb_bitmap;
-int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
+int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags);
struct xrep_find_ag_btree {
/* in: rmap owner of the btree we're looking for */
@@ -72,6 +72,8 @@ int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp);
#ifdef CONFIG_XFS_QUOTA
+void xrep_update_qflags(struct xfs_scrub *sc, unsigned int clear_flags,
+ unsigned int set_flags);
void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
int xrep_ino_dqattach(struct xfs_scrub *sc);
#else
@@ -79,11 +81,15 @@ int xrep_ino_dqattach(struct xfs_scrub *sc);
# define xrep_ino_dqattach(sc) (0)
#endif /* CONFIG_XFS_QUOTA */
+int xrep_setup_xfbtree(struct xfs_scrub *sc, const char *descr);
+
int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork,
xfs_extnum_t nextents);
int xrep_reset_perag_resv(struct xfs_scrub *sc);
int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten);
int xrep_metadata_inode_forks(struct xfs_scrub *sc);
+int xrep_setup_ag_rmapbt(struct xfs_scrub *sc);
+int xrep_setup_ag_refcountbt(struct xfs_scrub *sc);
/* Repair setup functions */
int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
@@ -109,11 +115,14 @@ int xrep_agfl(struct xfs_scrub *sc);
int xrep_agi(struct xfs_scrub *sc);
int xrep_allocbt(struct xfs_scrub *sc);
int xrep_iallocbt(struct xfs_scrub *sc);
+int xrep_rmapbt(struct xfs_scrub *sc);
int xrep_refcountbt(struct xfs_scrub *sc);
int xrep_inode(struct xfs_scrub *sc);
int xrep_bmap_data(struct xfs_scrub *sc);
int xrep_bmap_attr(struct xfs_scrub *sc);
int xrep_bmap_cow(struct xfs_scrub *sc);
+int xrep_nlinks(struct xfs_scrub *sc);
+int xrep_fscounters(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xrep_rtbitmap(struct xfs_scrub *sc);
@@ -123,13 +132,19 @@ int xrep_rtbitmap(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_QUOTA
int xrep_quota(struct xfs_scrub *sc);
+int xrep_quotacheck(struct xfs_scrub *sc);
#else
# define xrep_quota xrep_notsupported
+# define xrep_quotacheck xrep_notsupported
#endif /* CONFIG_XFS_QUOTA */
int xrep_reinit_pagf(struct xfs_scrub *sc);
int xrep_reinit_pagi(struct xfs_scrub *sc);
+int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep,
+ struct xfs_trans **tpp);
+void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp);
+
#else
#define xrep_ino_dqattach(sc) (0)
@@ -171,6 +186,8 @@ xrep_setup_nothing(
return 0;
}
#define xrep_setup_ag_allocbt xrep_setup_nothing
+#define xrep_setup_ag_rmapbt xrep_setup_nothing
+#define xrep_setup_ag_refcountbt xrep_setup_nothing
#define xrep_setup_inode(sc, imap) ((void)0)
@@ -184,6 +201,7 @@ xrep_setup_nothing(
#define xrep_agi xrep_notsupported
#define xrep_allocbt xrep_notsupported
#define xrep_iallocbt xrep_notsupported
+#define xrep_rmapbt xrep_notsupported
#define xrep_refcountbt xrep_notsupported
#define xrep_inode xrep_notsupported
#define xrep_bmap_data xrep_notsupported
@@ -191,6 +209,9 @@ xrep_setup_nothing(
#define xrep_bmap_cow xrep_notsupported
#define xrep_rtbitmap xrep_notsupported
#define xrep_quota xrep_notsupported
+#define xrep_quotacheck xrep_notsupported
+#define xrep_nlinks xrep_notsupported
+#define xrep_fscounters xrep_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index c99d1714f283..ba5bbc3fb754 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -25,6 +25,7 @@
#include "scrub/btree.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
+#include "scrub/repair.h"
/*
* Set us up to scrub reverse mapping btrees.
@@ -36,6 +37,14 @@ xchk_setup_ag_rmapbt(
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+ if (xchk_could_repair(sc)) {
+ int error;
+
+ error = xrep_setup_ag_rmapbt(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_ag_btree(sc, false);
}
@@ -349,7 +358,7 @@ xchk_rmapbt_rec(
struct xfs_rmap_irec irec;
if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL ||
- xfs_rmap_check_irec(bs->cur, &irec) != NULL) {
+ xfs_rmap_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
@@ -412,8 +421,8 @@ xchk_rmapbt_walk_ag_metadata(
/* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */
cur = sc->sa.bno_cur;
if (!cur)
- cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
+ cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur);
if (cur != sc->sa.bno_cur)
xfs_btree_del_cursor(cur, error);
@@ -422,8 +431,8 @@ xchk_rmapbt_walk_ag_metadata(
cur = sc->sa.cnt_cur;
if (!cur)
- cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur);
if (cur != sc->sa.cnt_cur)
xfs_btree_del_cursor(cur, error);
@@ -447,8 +456,7 @@ xchk_rmapbt_walk_ag_metadata(
/* OWN_INOBT: inobt, finobt */
cur = sc->sa.ino_cur;
if (!cur)
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp,
- XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp);
error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur);
if (cur != sc->sa.ino_cur)
xfs_btree_del_cursor(cur, error);
@@ -458,8 +466,8 @@ xchk_rmapbt_walk_ag_metadata(
if (xfs_has_finobt(sc->mp)) {
cur = sc->sa.fino_cur;
if (!cur)
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
- sc->sa.agi_bp, XFS_BTNUM_FINO);
+ cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp,
+ sc->sa.agi_bp);
error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur);
if (cur != sc->sa.fino_cur)
xfs_btree_del_cursor(cur, error);
diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
new file mode 100644
index 000000000000..e8e07b683eab
--- /dev/null
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -0,0 +1,1697 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Reverse Mapping Btree Repair
+ * ============================
+ *
+ * This is the most involved of all the AG space btree rebuilds. Everywhere
+ * else in XFS we lock inodes and then AG data structures, but generating the
+ * list of rmap records requires that we be able to scan both block mapping
+ * btrees of every inode in the filesystem to see if it owns any extents in
+ * this AG. We can't tolerate any inode updates while we do this, so we
+ * freeze the filesystem to lock everyone else out, and grant ourselves
+ * special privileges to run transactions with regular background reclamation
+ * turned off.
+ *
+ * We also have to be very careful not to allow inode reclaim to start a
+ * transaction because all transactions (other than our own) will block.
+ * Deferred inode inactivation helps us out there.
+ *
+ * I) Reverse mappings for all non-space metadata and file data are collected
+ * according to the following algorithm:
+ *
+ * 1. For each fork of each inode:
+ * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary.
+ * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate
+ * bmaps into rmap records (see 1.4). Set bits in BMBIT for each btree
+ * block.
+ * 1.3. If the incore extent map is loaded but the fork is in btree format,
+ * just visit the bmbt blocks to set the corresponding BMBIT areas.
+ * 1.4. From the incore extent map, accumulate each bmap that falls into our
+ * target AG. Remember, multiple bmap records can map to a single rmap
+ * record, so we cannot simply emit rmap records 1:1.
+ * 1.5. Emit rmap records for each extent in BMBIT and free it.
+ * 2. Create bitmaps INOBIT and ICHUNKBIT.
+ * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT,
+ * and set bits in INOBIT for each btree block. If the inobt has no records
+ * at all, we must be careful to record its root in INOBIT.
+ * 4. For each block in the finobt, set the corresponding INOBIT area.
+ * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them.
+ * 6. Create bitmaps REFCBIT and COWBIT.
+ * 7. For each CoW staging extent in the refcountbt, set the corresponding
+ * areas in COWBIT.
+ * 8. For each block in the refcountbt, set the corresponding REFCBIT area.
+ * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them.
+ * A. Emit rmap for the AG headers.
+ * B. Emit rmap for the log, if there is one.
+ *
+ * II) The rmapbt shape and space metadata rmaps are computed as follows:
+ *
+ * 1. Count the rmaps collected in the previous step. (= NR)
+ * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB)
+ * 3. Reserve RMB blocks through the newbt using the allocator in normap mode.
+ * 4. Create bitmap AGBIT.
+ * 5. For each reservation in the newbt, set the corresponding areas in AGBIT.
+ * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT.
+ * 7. Count the extents in AGBIT. (= AGNR)
+ * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB')
+ * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB',
+ * and clear AGBIT. Go to step 5.
+ * A. Emit rmaps for each extent in AGBIT.
+ *
+ * III) The rmapbt is constructed and set in place as follows:
+ *
+ * 1. Sort the rmap records.
+ * 2. Bulk load the rmaps.
+ *
+ * IV) Reap the old btree blocks.
+ *
+ * 1. Create a bitmap OLDRMBIT.
+ * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT.
+ * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT.
+ * 4. Reap the extents corresponding to the set areas in OLDRMBIT. These are
+ * the parts of the AG that the rmap didn't find during its scan of the
+ * primary metadata and aren't known to be in the free space, which implies
+ * that they were the old rmapbt blocks.
+ * 5. Commit.
+ *
+ * We use the 'xrep_rmap' prefix for all the rmap functions.
+ */
+
+/* Context for collecting rmaps */
+struct xrep_rmap {
+ /* new rmapbt information */
+ struct xrep_newbt new_btree;
+
+ /* lock for the xfbtree and xfile */
+ struct mutex lock;
+
+ /* rmap records generated from primary metadata */
+ struct xfbtree rmap_btree;
+
+ struct xfs_scrub *sc;
+
+ /* in-memory btree cursor for the xfs_btree_bload iteration */
+ struct xfs_btree_cur *mcur;
+
+ /* Hooks into rmap update code. */
+ struct xfs_rmap_hook rhook;
+
+ /* inode scan cursor */
+ struct xchk_iscan iscan;
+
+ /* Number of non-freespace records found. */
+ unsigned long long nr_records;
+
+ /* bnobt/cntbt contribution to btreeblks */
+ xfs_agblock_t freesp_btblocks;
+
+ /* old agf_rmap_blocks counter */
+ unsigned int old_rmapbt_fsbcount;
+};
+
+/* Set us up to repair reverse mapping btrees. */
+int
+xrep_setup_ag_rmapbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_rmap *rr;
+ char *descr;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
+
+ descr = xchk_xfile_ag_descr(sc, "reverse mapping records");
+ error = xrep_setup_xfbtree(sc, descr);
+ kfree(descr);
+ if (error)
+ return error;
+
+ rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS);
+ if (!rr)
+ return -ENOMEM;
+
+ rr->sc = sc;
+ sc->buf = rr;
+ return 0;
+}
+
+/* Make sure there's nothing funny about this mapping. */
+STATIC int
+xrep_rmap_check_mapping(
+ struct xfs_scrub *sc,
+ const struct xfs_rmap_irec *rec)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL)
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
+ rec->rm_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Store a reverse-mapping record. */
+static inline int
+xrep_rmap_stash(
+ struct xrep_rmap *rr,
+ xfs_agblock_t startblock,
+ xfs_extlen_t blockcount,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ struct xfs_rmap_irec rmap = {
+ .rm_startblock = startblock,
+ .rm_blockcount = blockcount,
+ .rm_owner = owner,
+ .rm_offset = offset,
+ .rm_flags = flags,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_btree_cur *mcur;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ if (xchk_iscan_aborted(&rr->iscan))
+ return -EFSCORRUPTED;
+
+ trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap);
+
+ mutex_lock(&rr->lock);
+ mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
+ error = xfs_rmap_map_raw(mcur, &rmap);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_cancel;
+
+ error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
+ if (error)
+ goto out_abort;
+
+ mutex_unlock(&rr->lock);
+ return 0;
+
+out_cancel:
+ xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
+out_abort:
+ xchk_iscan_abort(&rr->iscan);
+ mutex_unlock(&rr->lock);
+ return error;
+}
+
+struct xrep_rmap_stash_run {
+ struct xrep_rmap *rr;
+ uint64_t owner;
+ unsigned int rmap_flags;
+};
+
+static int
+xrep_rmap_stash_run(
+ uint32_t start,
+ uint32_t len,
+ void *priv)
+{
+ struct xrep_rmap_stash_run *rsr = priv;
+ struct xrep_rmap *rr = rsr->rr;
+
+ return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
+}
+
+/*
+ * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure
+ * that the ranges are in units of FS blocks.
+ */
+STATIC int
+xrep_rmap_stash_bitmap(
+ struct xrep_rmap *rr,
+ struct xagb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xrep_rmap_stash_run rsr = {
+ .rr = rr,
+ .owner = oinfo->oi_owner,
+ .rmap_flags = 0,
+ };
+
+ if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ rsr.rmap_flags |= XFS_RMAP_ATTR_FORK;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK;
+
+ return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr);
+}
+
+/* Section (I): Finding all file and bmbt extents. */
+
+/* Context for accumulating rmaps for an inode fork. */
+struct xrep_rmap_ifork {
+ /*
+ * Accumulate rmap data here to turn multiple adjacent bmaps into a
+ * single rmap.
+ */
+ struct xfs_rmap_irec accum;
+
+ /* Bitmap of bmbt blocks in this AG. */
+ struct xagb_bitmap bmbt_blocks;
+
+ struct xrep_rmap *rr;
+
+ /* Which inode fork? */
+ int whichfork;
+};
+
+/* Stash an rmap that we accumulated while walking an inode fork. */
+STATIC int
+xrep_rmap_stash_accumulated(
+ struct xrep_rmap_ifork *rf)
+{
+ if (rf->accum.rm_blockcount == 0)
+ return 0;
+
+ return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock,
+ rf->accum.rm_blockcount, rf->accum.rm_owner,
+ rf->accum.rm_offset, rf->accum.rm_flags);
+}
+
+/* Accumulate a bmbt record. */
+STATIC int
+xrep_rmap_visit_bmbt(
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *rec,
+ void *priv)
+{
+ struct xrep_rmap_ifork *rf = priv;
+ struct xfs_mount *mp = rf->rr->sc->mp;
+ struct xfs_rmap_irec *accum = &rf->accum;
+ xfs_agblock_t agbno;
+ unsigned int rmap_flags = 0;
+ int error;
+
+ if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
+ rf->rr->sc->sa.pag->pag_agno)
+ return 0;
+
+ agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
+ if (rf->whichfork == XFS_ATTR_FORK)
+ rmap_flags |= XFS_RMAP_ATTR_FORK;
+ if (rec->br_state == XFS_EXT_UNWRITTEN)
+ rmap_flags |= XFS_RMAP_UNWRITTEN;
+
+ /* If this bmap is adjacent to the previous one, just add it. */
+ if (accum->rm_blockcount > 0 &&
+ rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
+ agbno == accum->rm_startblock + accum->rm_blockcount &&
+ rmap_flags == accum->rm_flags) {
+ accum->rm_blockcount += rec->br_blockcount;
+ return 0;
+ }
+
+ /* Otherwise stash the old rmap and start accumulating a new one. */
+ error = xrep_rmap_stash_accumulated(rf);
+ if (error)
+ return error;
+
+ accum->rm_startblock = agbno;
+ accum->rm_blockcount = rec->br_blockcount;
+ accum->rm_offset = rec->br_startoff;
+ accum->rm_flags = rmap_flags;
+ return 0;
+}
+
+/* Add a btree block to the bitmap. */
+STATIC int
+xrep_rmap_visit_iroot_btree_block(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *priv)
+{
+ struct xrep_rmap_ifork *rf = priv;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t agbno;
+
+ xfs_btree_get_block(cur, level, &bp);
+ if (!bp)
+ return 0;
+
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+ if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno)
+ return 0;
+
+ agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
+}
+
+/*
+ * Iterate a metadata btree rooted in an inode to collect rmap records for
+ * anything in this fork that matches the AG.
+ */
+STATIC int
+xrep_rmap_scan_iroot_btree(
+ struct xrep_rmap_ifork *rf,
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_owner_info oinfo;
+ struct xrep_rmap *rr = rf->rr;
+ int error;
+
+ xagb_bitmap_init(&rf->bmbt_blocks);
+
+ /* Record all the blocks in the btree itself. */
+ error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
+ XFS_BTREE_VISIT_ALL, rf);
+ if (error)
+ goto out;
+
+ /* Emit rmaps for the btree blocks. */
+ xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
+ error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
+ if (error)
+ goto out;
+
+ /* Stash any remaining accumulated rmaps. */
+ error = xrep_rmap_stash_accumulated(rf);
+out:
+ xagb_bitmap_destroy(&rf->bmbt_blocks);
+ return error;
+}
+
+static inline bool
+is_rt_data_fork(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK;
+}
+
+/*
+ * Iterate the block mapping btree to collect rmap records for anything in this
+ * fork that matches the AG. Sets @mappings_done to true if we've scanned the
+ * block mappings in this fork.
+ */
+STATIC int
+xrep_rmap_scan_bmbt(
+ struct xrep_rmap_ifork *rf,
+ struct xfs_inode *ip,
+ bool *mappings_done)
+{
+ struct xrep_rmap *rr = rf->rr;
+ struct xfs_btree_cur *cur;
+ struct xfs_ifork *ifp;
+ int error;
+
+ *mappings_done = false;
+ ifp = xfs_ifork_ptr(ip, rf->whichfork);
+ cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);
+
+ if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
+ xfs_need_iread_extents(ifp)) {
+ /*
+ * If the incore extent cache isn't loaded, scan the bmbt for
+ * mapping records. This avoids loading the incore extent
+ * tree, which will increase memory pressure at a time when
+ * we're trying to run as quickly as we possibly can. Ignore
+ * realtime extents.
+ */
+ error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
+ if (error)
+ goto out_cur;
+
+ *mappings_done = true;
+ }
+
+ /* Scan for the bmbt blocks, which always live on the data device. */
+ error = xrep_rmap_scan_iroot_btree(rf, cur);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Iterate the in-core extent cache to collect rmap records for anything in
+ * this fork that matches the AG.
+ */
+STATIC int
+xrep_rmap_scan_iext(
+ struct xrep_rmap_ifork *rf,
+ struct xfs_ifork *ifp)
+{
+ struct xfs_bmbt_irec rec;
+ struct xfs_iext_cursor icur;
+ int error;
+
+ for_each_xfs_iext(ifp, &icur, &rec) {
+ if (isnullstartblock(rec.br_startblock))
+ continue;
+ error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
+ if (error)
+ return error;
+ }
+
+ return xrep_rmap_stash_accumulated(rf);
+}
+
+/* Find all the extents from a given AG in an inode fork. */
+STATIC int
+xrep_rmap_scan_ifork(
+ struct xrep_rmap *rr,
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xrep_rmap_ifork rf = {
+ .accum = { .rm_owner = ip->i_ino, },
+ .rr = rr,
+ .whichfork = whichfork,
+ };
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ int error = 0;
+
+ if (!ifp)
+ return 0;
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ bool mappings_done;
+
+ /*
+ * Scan the bmap btree for data device mappings. This includes
+ * the btree blocks themselves, even if this is a realtime
+ * file.
+ */
+ error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
+ if (error || mappings_done)
+ return error;
+ } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
+ return 0;
+ }
+
+ /* Scan incore extent cache if this isn't a realtime file. */
+ if (xfs_ifork_is_realtime(ip, whichfork))
+ return 0;
+
+ return xrep_rmap_scan_iext(&rf, ifp);
+}
+
+/*
+ * Take ILOCK on a file that we want to scan.
+ *
+ * ILOCK_EXCL is chosen when the data fork still needs its extents read in,
+ * or when the file has an attr fork that still needs its extents read in.
+ * In every other case ILOCK_SHARED is taken.  Returns the mode that was
+ * used so that the caller can pass it to xfs_iunlock.
+ */
+static inline unsigned int
+xrep_rmap_scan_ilock(
+	struct xfs_inode	*ip)
+{
+	unsigned int		mode;
+
+	if (xfs_need_iread_extents(&ip->i_df) ||
+	    (xfs_inode_has_attr_fork(ip) &&
+	     xfs_need_iread_extents(&ip->i_af)))
+		mode = XFS_ILOCK_EXCL;
+	else
+		mode = XFS_ILOCK_SHARED;
+
+	xfs_ilock(ip, mode);
+	return mode;
+}
+
+/*
+ * Record reverse mappings for a file.
+ *
+ * Directories are locked with IOLOCK_SHARED before the ILOCK is taken in the
+ * mode chosen by xrep_rmap_scan_ilock. The data fork is scanned and then the
+ * attr fork; the inode is marked visited in the scan only if both succeed.
+ * Every lock taken here is dropped before returning.
+ */
+STATIC int
+xrep_rmap_scan_inode(
+ struct xrep_rmap *rr,
+ struct xfs_inode *ip)
+{
+ unsigned int lock_mode = 0;
+ int error;
+
+ /*
+ * Directory updates (create/link/unlink/rename) drop the directory's
+ * ILOCK before finishing any rmapbt updates associated with directory
+ * shape changes. For this scan to coordinate correctly with the live
+ * update hook, we must take the only lock (i_rwsem) that is held all
+ * the way to dir op completion. This will get fixed by the parent
+ * pointer patchset.
+ */
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ lock_mode = XFS_IOLOCK_SHARED;
+ xfs_ilock(ip, lock_mode);
+ }
+ lock_mode |= xrep_rmap_scan_ilock(ip); /* remember every lock for unlock */
+
+ /* Check the data fork. */
+ error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ /* Check the attr fork. */
+ error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
+ if (error)
+ goto out_unlock;
+
+ /* COW fork extents are "owned" by the refcount btree. */
+
+ xchk_iscan_mark_visited(&rr->iscan, ip);
+out_unlock:
+ xfs_iunlock(ip, lock_mode);
+ return error;
+}
+
+/* Section (I): Find all AG metadata extents except for free space metadata. */
+
+struct xrep_rmap_inodes {
+ struct xrep_rmap *rr; /* repair state that receives the rmaps */
+ struct xagb_bitmap inobt_blocks; /* INOBIT: inode btree blocks */
+ struct xagb_bitmap ichunk_blocks; /* ICHUNKBIT: inode chunk blocks */
+};
+
+/*
+ * Record inode btree rmaps.
+ *
+ * Callback for the inobt record walk. Marks the btree blocks on the cursor
+ * path in inobt_blocks and the blocks backing the inode chunk in
+ * ichunk_blocks. For a sparse chunk, only the regions whose holemask bit is
+ * clear are recorded. Returns -EFSCORRUPTED if the record fails
+ * xfs_inobt_check_irec.
+ */
+STATIC int
+xrep_rmap_walk_inobt(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore irec;
+ struct xrep_rmap_inodes *ri = priv;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_agblock_t agbno;
+ xfs_extlen_t aglen;
+ xfs_agino_t agino;
+ xfs_agino_t iperhole;
+ unsigned int i;
+ int error;
+
+ /* Record the inobt blocks. */
+ error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
+ if (error)
+ return error;
+
+ xfs_inobt_btrec_to_irec(mp, rec, &irec);
+ if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL)
+ return -EFSCORRUPTED;
+
+ agino = irec.ir_startino;
+
+ /* Record a non-sparse inode chunk. */
+ if (!xfs_inobt_issparse(irec.ir_holemask)) {
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ aglen = max_t(xfs_extlen_t, 1,
+ XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);
+
+ return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
+ }
+
+ /*
+ * Iterate each chunk. Each step covers iperhole inodes, which is the
+ * larger of one block's worth of inodes and one holemask bit's worth.
+ */
+ iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
+ XFS_INODES_PER_HOLEMASK_BIT);
+ aglen = iperhole / mp->m_sb.sb_inopblock;
+ for (i = 0, agino = irec.ir_startino;
+ i < XFS_INOBT_HOLEMASK_BITS;
+ i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
+ /* Skip holes. */
+ if (irec.ir_holemask & (1 << i))
+ continue;
+
+ /* Record the inode chunk otherwise. */
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Collect rmaps for the blocks containing inode btrees and the inode chunks.
+ *
+ * The inobt is walked record by record, the finobt (if the filesystem has
+ * one) contributes its btree blocks, and the two resulting bitmaps are
+ * stashed with the INOBT and INODES owner info respectively. Both bitmaps
+ * are destroyed before returning, whether or not an error occurred.
+ */
+STATIC int
+xrep_rmap_find_inode_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xrep_rmap_inodes ri = {
+ .rr = rr,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ xagb_bitmap_init(&ri.inobt_blocks);
+ xagb_bitmap_init(&ri.ichunk_blocks);
+
+ /*
+ * Iterate every record in the inobt so we can capture all the inode
+ * chunks and the blocks in the inobt itself.
+ */
+ error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Note that if there are zero records in the inobt then query_all does
+ * nothing and we have to account the empty inobt root manually.
+ */
+ if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+
+ error = xagb_bitmap_set(&ri.inobt_blocks,
+ be32_to_cpu(agi->agi_root), 1);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Scan the finobt too. */
+ if (xfs_has_finobt(sc->mp)) {
+ error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
+ sc->sa.fino_cur);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Generate rmaps for everything. */
+ error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
+ &XFS_RMAP_OINFO_INOBT);
+ if (error)
+ goto out_bitmap;
+ error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
+ &XFS_RMAP_OINFO_INODES);
+
+out_bitmap:
+ xagb_bitmap_destroy(&ri.inobt_blocks);
+ xagb_bitmap_destroy(&ri.ichunk_blocks);
+ return error;
+}
+
+/*
+ * Record a CoW staging extent in the bitmap passed through @priv.  A record
+ * that fails the refcount domain check, or that is not in the CoW domain,
+ * is reported as -EFSCORRUPTED.
+ */
+STATIC int
+xrep_rmap_walk_cowblocks(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_refcount_irec	*irec,
+	void				*priv)
+{
+	struct xagb_bitmap		*cow_bitmap = priv;
+
+	if (xfs_refcount_check_domain(irec) &&
+	    irec->rc_domain == XFS_REFC_DOMAIN_COW)
+		return xagb_bitmap_set(cow_bitmap, irec->rc_startblock,
+				irec->rc_blockcount);
+
+	return -EFSCORRUPTED;
+}
+
+/*
+ * Collect rmaps for the blocks containing the refcount btree, and all CoW
+ * staging extents.
+ *
+ * Does nothing and returns 0 if the filesystem does not have reflink.
+ * Both local bitmaps are destroyed before returning once they have been
+ * initialized.
+ */
+STATIC int
+xrep_rmap_find_refcount_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xagb_bitmap refcountbt_blocks; /* REFCBIT */
+ struct xagb_bitmap cow_blocks; /* COWBIT */
+ struct xfs_refcount_irec low = {
+ .rc_startblock = 0,
+ .rc_domain = XFS_REFC_DOMAIN_COW,
+ };
+ struct xfs_refcount_irec high = {
+ .rc_startblock = -1U,
+ .rc_domain = XFS_REFC_DOMAIN_COW,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ if (!xfs_has_reflink(sc->mp))
+ return 0;
+
+ xagb_bitmap_init(&refcountbt_blocks);
+ xagb_bitmap_init(&cow_blocks);
+
+ /* refcountbt */
+ error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
+ if (error)
+ goto out_bitmap;
+
+ /* Collect rmaps for CoW staging extents. */
+ error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
+ xrep_rmap_walk_cowblocks, &cow_blocks);
+ if (error)
+ goto out_bitmap;
+
+ /* Generate rmaps for everything. */
+ error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
+ if (error)
+ goto out_bitmap;
+ error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
+ &XFS_RMAP_OINFO_REFC);
+
+out_bitmap:
+ xagb_bitmap_destroy(&cow_blocks);
+ xagb_bitmap_destroy(&refcountbt_blocks);
+ return error;
+}
+
+/*
+ * Generate rmaps for the AG headers (AGI/AGF/AGFL).  A single OWN_FS record
+ * is stashed that runs from the superblock block through the AGFL block,
+ * inclusive.
+ */
+STATIC int
+xrep_rmap_find_agheader_rmaps(
+	struct xrep_rmap	*rr)
+{
+	struct xfs_mount	*mp = rr->sc->mp;
+
+	return xrep_rmap_stash(rr, XFS_SB_BLOCK(mp),
+			XFS_AGFL_BLOCK(mp) - XFS_SB_BLOCK(mp) + 1,
+			XFS_RMAP_OWN_FS, 0, 0);
+}
+
+/*
+ * Generate rmaps for the log, if it's in this AG.  When the AG contains the
+ * log, one OWN_LOG record covering sb_logblocks blocks from sb_logstart is
+ * stashed; otherwise there is nothing to do.
+ */
+STATIC int
+xrep_rmap_find_log_rmaps(
+	struct xrep_rmap	*rr)
+{
+	struct xfs_mount	*mp = rr->sc->mp;
+	struct xfs_perag	*pag = rr->sc->sa.pag;
+
+	if (xfs_ag_contains_log(mp, pag->pag_agno))
+		return xrep_rmap_stash(rr,
+				XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart),
+				mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
+
+	return 0;
+}
+
+/*
+ * Check and count all the records that we gathered.  Each record is passed
+ * to xrep_rmap_check_mapping; nr_records is bumped only for records that
+ * pass, and the first failure is returned to stop the walk.
+ */
+STATIC int
+xrep_rmap_check_record(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_rmap		*rr = priv;
+	int				ret;
+
+	ret = xrep_rmap_check_mapping(rr->sc, rec);
+	if (!ret)
+		rr->nr_records++;
+
+	return ret;
+}
+
+/*
+ * Generate all the reverse-mappings for this AG, a list of the old rmapbt
+ * blocks, and the new btreeblks count. Figure out if we have enough free
+ * space to reconstruct the inode btrees. The caller must clean up the lists
+ * if anything goes wrong. This implements section (I) above.
+ *
+ * The work happens in three phases: collect the per-AG metadata rmaps with
+ * the AG headers locked, scan every inode under an empty transaction, then
+ * relock the AG headers and check and count the stashed records. Returns
+ * -EFSCORRUPTED if the inode scan was aborted in the meantime.
+ */
+STATIC int
+xrep_rmap_find_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xchk_ag *sa = &sc->sa;
+ struct xfs_inode *ip;
+ struct xfs_btree_cur *mcur;
+ int error;
+
+ /* Find all the per-AG metadata. */
+ xrep_ag_btcur_init(sc, &sc->sa);
+
+ error = xrep_rmap_find_inode_rmaps(rr);
+ if (error)
+ goto end_agscan;
+
+ error = xrep_rmap_find_refcount_rmaps(rr);
+ if (error)
+ goto end_agscan;
+
+ error = xrep_rmap_find_agheader_rmaps(rr);
+ if (error)
+ goto end_agscan;
+
+ error = xrep_rmap_find_log_rmaps(rr);
+end_agscan:
+ xchk_ag_btcur_free(&sc->sa);
+ if (error)
+ return error;
+
+ /*
+ * Set up for a potentially lengthy filesystem scan by reducing our
+ * transaction resource usage for the duration. Specifically:
+ *
+ * Unlock the AG header buffers and cancel the transaction to release
+ * the log grant space while we scan the filesystem.
+ *
+ * Create a new empty transaction to eliminate the possibility of the
+ * inode scan deadlocking on cyclical metadata.
+ *
+ * We pass the empty transaction to the file scanning function to avoid
+ * repeatedly cycling empty transactions. This can be done even though
+ * we take the IOLOCK to quiesce the file because empty transactions
+ * do not take sb_internal.
+ */
+ sa->agf_bp = NULL;
+ sa->agi_bp = NULL;
+ xchk_trans_cancel(sc);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ /* Iterate all AGs for inodes rmaps. */
+ while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
+ error = xrep_rmap_scan_inode(rr, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&rr->iscan);
+ if (error)
+ return error;
+
+ /*
+ * Switch out for a real transaction and lock the AG headers in
+ * preparation for building a new tree.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_setup_fs(sc);
+ if (error)
+ return error;
+ error = xchk_perag_drain_and_lock(sc);
+ if (error)
+ return error;
+
+ /*
+ * If a hook failed to update the in-memory btree, we lack the data to
+ * continue the repair.
+ */
+ if (xchk_iscan_aborted(&rr->iscan))
+ return -EFSCORRUPTED;
+
+ /*
+ * Now that we have everything locked again, we need to count the
+ * number of rmap records stashed in the btree. This should reflect
+ * all actively-owned space in the filesystem. At the same time, check
+ * all our records before we start building a new btree, which requires
+ * a bnobt cursor.
+ */
+ mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
+ sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+
+ rr->nr_records = 0;
+ error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);
+
+ xfs_btree_del_cursor(sc->sa.bno_cur, error);
+ sc->sa.bno_cur = NULL;
+ xfs_btree_del_cursor(mcur, error);
+
+ return error;
+}
+
+/* Section (II): Reserving space for new rmapbt and setting free space bitmap */
+
+struct xrep_rmap_agfl {
+ struct xagb_bitmap *bitmap; /* bitmap that collects the AGFL blocks */
+ xfs_agnumber_t agno; /* number of the AG being repaired */
+};
+
+/*
+ * Add an AGFL block to the rmap list.  @priv is the struct xrep_rmap_agfl
+ * whose bitmap receives the single block at @agbno.
+ */
+STATIC int
+xrep_rmap_walk_agfl(
+	struct xfs_mount	*mp,
+	xfs_agblock_t		agbno,
+	void			*priv)
+{
+	struct xrep_rmap_agfl	*walk = priv;
+	struct xagb_bitmap	*agfl_bitmap = walk->bitmap;
+
+	return xagb_bitmap_set(agfl_bitmap, agbno, 1);
+}
+
+/*
+ * Run one round of reserving space for the new rmapbt and recomputing the
+ * number of blocks needed to store the previously observed rmapbt records and
+ * the ones we'll create for the free space metadata. When we don't need more
+ * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
+ * true.
+ *
+ * @blocks_reserved carries the block count reserved by earlier rounds into
+ * this one and is updated here. @freesp_blocks is emptied and rebuilt from
+ * scratch on every call.
+ */
+STATIC int
+xrep_rmap_try_reserve(
+ struct xrep_rmap *rr,
+ struct xfs_btree_cur *rmap_cur,
+ struct xagb_bitmap *freesp_blocks,
+ uint64_t *blocks_reserved,
+ bool *done)
+{
+ struct xrep_rmap_agfl ra = {
+ .bitmap = freesp_blocks,
+ .agno = rr->sc->sa.pag->pag_agno,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ struct xrep_newbt_resv *resv, *n;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_buf *agfl_bp;
+ uint64_t nr_blocks; /* RMB */
+ uint64_t freesp_records;
+ int error;
+
+ /*
+ * We're going to recompute new_btree.bload.nr_blocks at the end of
+ * this function to reflect however many btree blocks we need to store
+ * all the rmap records (including the ones that reflect the changes we
+ * made to support the new rmapbt blocks), so we save the old value
+ * here so we can decide if we've reserved enough blocks.
+ */
+ nr_blocks = rr->new_btree.bload.nr_blocks;
+
+ /*
+ * Make sure we've reserved enough space for the new btree. This can
+ * change the shape of the free space btrees, which can cause secondary
+ * interactions with the rmap records because all three space btrees
+ * have the same rmap owner. We'll account for all that below.
+ */
+ error = xrep_newbt_alloc_blocks(&rr->new_btree,
+ nr_blocks - *blocks_reserved);
+ if (error)
+ return error;
+
+ *blocks_reserved = rr->new_btree.bload.nr_blocks;
+
+ /* Clear everything in the bitmap. */
+ xagb_bitmap_destroy(freesp_blocks);
+
+ /* Set all the bnobt blocks in the bitmap. */
+ sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur);
+ xfs_btree_del_cursor(sc->sa.bno_cur, error);
+ sc->sa.bno_cur = NULL;
+ if (error)
+ return error;
+
+ /* Set all the cntbt blocks in the bitmap. */
+ sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur);
+ xfs_btree_del_cursor(sc->sa.cnt_cur, error);
+ sc->sa.cnt_cur = NULL;
+ if (error)
+ return error;
+
+ /*
+ * Record our new btreeblks value.
+ * NOTE(review): the "- 2" presumably excludes the bnobt and cntbt
+ * root blocks from the count -- confirm against the AGF definition.
+ */
+ rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2;
+
+ /* Set all the new rmapbt blocks in the bitmap. */
+ list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) {
+ error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len);
+ if (error)
+ return error;
+ }
+
+ /* Set all the AGFL blocks in the bitmap. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ return error;
+
+ error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra);
+ if (error)
+ return error;
+
+ /* Count the extents in the bitmap. */
+ freesp_records = xagb_bitmap_count_set_regions(freesp_blocks);
+
+ /* Compute how many blocks we'll need for all the rmaps. */
+ error = xfs_btree_bload_compute_geometry(rmap_cur,
+ &rr->new_btree.bload, rr->nr_records + freesp_records);
+ if (error)
+ return error;
+
+ /* We're done when we don't need more blocks. */
+ *done = nr_blocks >= rr->new_btree.bload.nr_blocks;
+ return 0;
+}
+
+/*
+ * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for
+ * the free space metadata. This implements section (II) above.
+ *
+ * Returns early, before any space has been reserved, if the initial geometry
+ * computation fails or if the scrub should terminate. The local free space
+ * bitmap is destroyed on every path that initialized it.
+ */
+STATIC int
+xrep_rmap_reserve_space(
+ struct xrep_rmap *rr,
+ struct xfs_btree_cur *rmap_cur)
+{
+ struct xagb_bitmap freesp_blocks; /* AGBIT */
+ uint64_t blocks_reserved = 0;
+ bool done = false;
+ int error;
+
+ /* Compute how many blocks we'll need for the rmaps collected so far. */
+ error = xfs_btree_bload_compute_geometry(rmap_cur,
+ &rr->new_btree.bload, rr->nr_records);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ xagb_bitmap_init(&freesp_blocks);
+
+ /*
+ * Iteratively reserve space for the new rmapbt and recompute the
+ * number of blocks needed to store the previously observed rmapbt
+ * records and the ones we'll create for the free space metadata.
+ * Finish when we don't need more blocks.
+ */
+ do {
+ error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks,
+ &blocks_reserved, &done);
+ if (error)
+ goto out_bitmap;
+ } while (!done);
+
+ /* Emit rmaps for everything in the free space bitmap. */
+ xrep_ag_btcur_init(rr->sc, &rr->sc->sa);
+ error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG);
+ xchk_ag_btcur_free(&rr->sc->sa);
+
+out_bitmap:
+ xagb_bitmap_destroy(&freesp_blocks);
+ return error;
+}
+
+/* Section (III): Building the new rmap btree. */
+
+/*
+ * Update the AGF counters.
+ *
+ * Sets and logs agf_btreeblks from the free space btree block count plus the
+ * new rmapbt block count, saves the current incore rmap level as the
+ * alternate repair level, and then reinitializes the perag from the AGF.
+ * Returns the result of xrep_reinit_pagf.
+ */
+STATIC int
+xrep_rmap_reset_counters(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ xfs_agblock_t rmap_btblocks;
+
+ /*
+ * The AGF header contains extra information related to the reverse
+ * mapping btree, so we must update those fields here.
+ */
+ rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
+ agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);
+
+ /*
+ * After we commit the new btree to disk, it is possible that the
+ * process to reap the old btree blocks will race with the AIL trying
+ * to checkpoint the old btree blocks into the filesystem. If the new
+ * tree is shorter than the old one, the rmapbt write verifier will
+ * fail and the AIL will shut down the filesystem.
+ *
+ * To avoid this, save the old incore btree height values as the alt
+ * height values before re-initializing the perag info from the updated
+ * AGF to capture all the new values.
+ */
+ pag->pagf_repair_rmap_level = pag->pagf_rmap_level;
+
+ /* Reinitialize with the values we just logged. */
+ return xrep_reinit_pagf(sc);
+}
+
+/*
+ * Retrieve rmapbt data for bulk load.  Copies @nr_wanted records from the
+ * in-memory btree cursor into @block, starting at slot @idx.  Returns the
+ * number of records loaded, or a negative errno; running out of in-memory
+ * records early is reported as -EFSCORRUPTED.
+ */
+STATIC int
+xrep_rmap_get_records(
+	struct xfs_btree_cur	*cur,
+	unsigned int		idx,
+	struct xfs_btree_block	*block,
+	unsigned int		nr_wanted,
+	void			*priv)
+{
+	struct xrep_rmap	*rr = priv;
+	unsigned int		nr_loaded = 0;
+
+	while (nr_loaded < nr_wanted) {
+		union xfs_btree_rec	*dest;
+		int			found = 0;
+		int			ret;
+
+		/* Step the in-memory cursor to the next record. */
+		ret = xfs_btree_increment(rr->mcur, 0, &found);
+		if (ret)
+			return ret;
+		if (!found)
+			return -EFSCORRUPTED;
+
+		/* Read that record into the staging cursor. */
+		ret = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &found);
+		if (ret)
+			return ret;
+		if (!found)
+			return -EFSCORRUPTED;
+
+		/* Format it into the new btree block. */
+		dest = xfs_btree_rec_addr(cur, idx, block);
+		cur->bc_ops->init_rec_from_cur(cur, dest);
+
+		nr_loaded++;
+		idx++;
+	}
+
+	return nr_loaded;
+}
+
+/*
+ * Feed one of the new btree blocks to the bulk loader by claiming it from
+ * the new btree reservation kept in the struct xrep_rmap passed via @priv.
+ */
+STATIC int
+xrep_rmap_claim_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	void			*priv)
+{
+	struct xrep_rmap	*rr = priv;
+	struct xrep_newbt	*newbt = &rr->new_btree;
+
+	return xrep_newbt_claim_block(cur, newbt, ptr);
+}
+
+/*
+ * Custom allocation function for new rmap btrees.  Fixes the freelist with
+ * rmap updates suppressed and then allocates near @alloc_hint.
+ */
+STATIC int
+xrep_rmap_alloc_vextent(
+	struct xfs_scrub	*sc,
+	struct xfs_alloc_arg	*args,
+	xfs_fsblock_t		alloc_hint)
+{
+	int			ret;
+
+	/*
+	 * The OWN_AG records are computed iteratively /after/ blocks have
+	 * been allocated for the records we already know about, so the
+	 * allocation itself must not generate an rmap update.  Passing the
+	 * NORMAP flag to the freelist fixer likewise keeps it from trying to
+	 * create an rmap for new AGFL blocks.
+	 */
+	ret = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
+	if (ret)
+		return ret;
+
+	/*
+	 * The transaction will be dirty if xrep_fix_freelist moved blocks out
+	 * of the free space btrees, or if it removed blocks from the AGFL and
+	 * queued an EFI to free them.  The second case matters to us.
+	 *
+	 * To free the old btree's blocks later, gaps in the new recordset are
+	 * compared against the block usage of all OWN_AG owners.  That means
+	 * no EFIs for former AGFL blocks may be attached to the repair
+	 * transaction when the new btree is committed.
+	 *
+	 * xrep_newbt_alloc_blocks guarantees this for us by calling
+	 * xrep_defer_finish to commit anything that fix_freelist may have
+	 * added to the transaction.
+	 */
+	return xfs_alloc_vextent_near_bno(args, alloc_hint);
+}
+
+
+/*
+ * Count the records in this btree.  The cursor is moved to the left edge and
+ * then incremented until it runs off the end; @nr receives the number of
+ * successful increments.  Returns 0 or a negative errno.
+ */
+STATIC int
+xrep_rmap_count_records(
+	struct xfs_btree_cur	*cur,
+	unsigned long long	*nr)
+{
+	int			has_rec = 1;
+	int			ret;
+
+	*nr = 0;
+
+	ret = xfs_btree_goto_left_edge(cur);
+	if (ret)
+		return ret;
+
+	for (;;) {
+		ret = xfs_btree_increment(cur, 0, &has_rec);
+		if (ret || !has_rec)
+			break;
+		(*nr)++;
+	}
+
+	return ret;
+}
+/*
+ * Use the collected rmap information to stage a new rmap btree. If this is
+ * successful we'll return with the new btree root information logged to the
+ * repair transaction but not yet committed. This implements section (III)
+ * above.
+ *
+ * On failure before the new root is installed, the cursors are deleted and
+ * the new btree reservation is cancelled. A failure of xrep_newbt_commit is
+ * returned directly without cancelling the reservation.
+ */
+STATIC int
+xrep_rmap_build_new_tree(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_btree_cur *rmap_cur;
+ xfs_fsblock_t fsbno;
+ int error;
+
+ /*
+ * Preserve the old rmapbt block count so that we can adjust the
+ * per-AG rmapbt reservation after we commit the new btree root and
+ * want to dispose of the old btree blocks.
+ */
+ rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks);
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the AG header. The new blocks are accounted to the
+ * rmapbt per-AG reservation, which we will adjust further after
+ * committing the new btree.
+ */
+ fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp));
+ xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
+ fsbno, XFS_AG_RESV_RMAPBT);
+ rr->new_btree.bload.get_records = xrep_rmap_get_records;
+ rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
+ rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
+ rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake);
+
+ /*
+ * Initialize @rr->new_btree, reserve space for the new rmapbt,
+ * and compute OWN_AG rmaps.
+ */
+ error = xrep_rmap_reserve_space(rr, rmap_cur);
+ if (error)
+ goto err_cur;
+
+ /*
+ * Count the rmapbt records again, because the space reservation
+ * for the rmapbt itself probably added more records to the btree.
+ */
+ rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL,
+ &rr->rmap_btree);
+
+ error = xrep_rmap_count_records(rr->mcur, &rr->nr_records);
+ if (error)
+ goto err_mcur;
+
+ /*
+ * Due to btree slack factors, it's possible for a new btree to be one
+ * level taller than the old btree. Update the incore btree height so
+ * that we don't trip the verifiers when writing the new btree blocks
+ * to disk.
+ */
+ pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height;
+
+ /*
+ * Move the cursor to the left edge of the tree so that the first
+ * increment in ->get_records positions us at the first record.
+ */
+ error = xfs_btree_goto_left_edge(rr->mcur);
+ if (error)
+ goto err_level;
+
+ /* Add all observed rmap records. */
+ error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
+ if (error)
+ goto err_level;
+
+ /*
+ * Install the new btree in the AG header. After this point the old
+ * btree is no longer accessible and the new tree is live.
+ */
+ xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(rmap_cur, 0);
+ xfs_btree_del_cursor(rr->mcur, 0);
+ rr->mcur = NULL;
+
+ /*
+ * Now that we've written the new btree to disk, we don't need to keep
+ * updating the in-memory btree. Abort the scan to stop live updates.
+ */
+ xchk_iscan_abort(&rr->iscan);
+
+ /*
+ * The newly committed rmap recordset includes mappings for the blocks
+ * that we reserved to build the new btree. If there is excess space
+ * reservation to be freed, the corresponding rmap records must also be
+ * removed.
+ */
+ rr->new_btree.oinfo = XFS_RMAP_OINFO_AG;
+
+ /* Reset the AGF counters now that we've changed the btree shape. */
+ error = xrep_rmap_reset_counters(rr);
+ if (error)
+ goto err_newbt;
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rr->new_btree);
+ if (error)
+ return error;
+
+ return xrep_roll_ag_trans(sc);
+
+err_level:
+ pag->pagf_repair_rmap_level = 0;
+err_mcur:
+ xfs_btree_del_cursor(rr->mcur, error);
+err_cur:
+ xfs_btree_del_cursor(rmap_cur, error);
+err_newbt:
+ xrep_newbt_cancel(&rr->new_btree);
+ return error;
+}
+
+/* Section (IV): Reaping the old btree. */
+
+struct xrep_rmap_find_gaps {
+ struct xagb_bitmap rmap_gaps; /* blocks not covered by any rmap record */
+ xfs_agblock_t next_agbno; /* first block past the records seen so far */
+};
+
+/*
+ * Subtract each free extent in the bnobt from the rmap gaps.  @priv is the
+ * struct xrep_rmap_find_gaps whose bitmap is being trimmed.
+ */
+STATIC int
+xrep_rmap_find_freesp(
+	struct xfs_btree_cur			*cur,
+	const struct xfs_alloc_rec_incore	*rec,
+	void					*priv)
+{
+	struct xrep_rmap_find_gaps		*gaps = priv;
+	struct xagb_bitmap			*bitmap = &gaps->rmap_gaps;
+
+	return xagb_bitmap_clear(bitmap, rec->ar_startblock,
+			rec->ar_blockcount);
+}
+
+/*
+ * Record the free space we find, as part of cleaning out the btree.  Any
+ * space between the end of the records seen so far and the start of @rec is
+ * added to the gap bitmap, and the end marker then advances past @rec if
+ * @rec extends beyond it.
+ */
+STATIC int
+xrep_rmap_find_gaps(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_rmap_find_gaps	*gaps = priv;
+	xfs_agblock_t			rec_end;
+	int				ret;
+
+	if (gaps->next_agbno < rec->rm_startblock) {
+		ret = xagb_bitmap_set(&gaps->rmap_gaps, gaps->next_agbno,
+				rec->rm_startblock - gaps->next_agbno);
+		if (ret)
+			return ret;
+	}
+
+	rec_end = rec->rm_startblock + rec->rm_blockcount;
+	if (rec_end > gaps->next_agbno)
+		gaps->next_agbno = rec_end;
+
+	return 0;
+}
+
+/*
+ * Reap the old rmapbt blocks. Now that the rmapbt is fully rebuilt, we make
+ * a list of gaps in the rmap records and a list of the extents mentioned in
+ * the bnobt. Any block that's in the new rmapbt gap list but not mentioned
+ * in the bnobt is a block from the old rmapbt and can be removed.
+ *
+ * The gap bitmap is destroyed on every return path. The alternate rmap
+ * height is cleared and XREP_RESET_PERAG_RESV is set only when reaping
+ * succeeds.
+ */
+STATIC int
+xrep_rmap_remove_old_tree(
+ struct xrep_rmap *rr)
+{
+ struct xrep_rmap_find_gaps rfg = {
+ .next_agbno = 0,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_btree_cur *mcur;
+ xfs_agblock_t agend;
+ int error;
+
+ xagb_bitmap_init(&rfg.rmap_gaps);
+
+ /* Compute free space from the new rmapbt. */
+ mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
+
+ error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_bitmap;
+
+ /* Insert a record for space between the last rmap and EOAG. */
+ agend = be32_to_cpu(agf->agf_length);
+ if (rfg.next_agbno < agend) {
+ error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno,
+ agend - rfg.next_agbno);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Compute free space from the existing bnobt. */
+ sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp,
+ &rfg);
+ xfs_btree_del_cursor(sc->sa.bno_cur, error);
+ sc->sa.bno_cur = NULL;
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Free the "free" blocks that the new rmapbt knows about but the bnobt
+ * doesn't--these are the old rmapbt blocks. Credit the old rmapbt
+ * block usage count back to the per-AG rmapbt reservation (and not
+ * fdblocks, since the rmap btree lives in free space) to keep the
+ * reservation and free space accounting correct.
+ */
+ error = xrep_reap_agblocks(sc, &rfg.rmap_gaps,
+ &XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT);
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Now that we've zapped all the old rmapbt blocks we can turn off
+ * the alternate height mechanism and reset the per-AG space
+ * reservation.
+ */
+ pag->pagf_repair_rmap_level = 0;
+ sc->flags |= XREP_RESET_PERAG_RESV;
+out_bitmap:
+ xagb_bitmap_destroy(&rfg.rmap_gaps);
+ return error;
+}
+
+static inline bool
+xrep_rmapbt_want_live_update(
+	struct xchk_iscan		*iscan,
+	const struct xfs_owner_info	*oi)
+{
+	/* Nothing is wanted once the scan has been aborted. */
+	if (xchk_iscan_aborted(iscan))
+		return false;
+
+	/*
+	 * For file owners, ignore updates to files that the scanner hasn't
+	 * visited yet.
+	 */
+	if (!XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
+		return xchk_iscan_want_live_update(iscan, oi->oi_owner);
+
+	/*
+	 * Reverse mappings for all AG metadata other than OWN_AG metadata
+	 * were recorded before the AG header was unlocked for the inode scan.
+	 * IOWs, the in-memory btree already knows about the AG headers, the
+	 * two inode btrees, the CoW staging extents, and the refcount btrees,
+	 * so live updates for those owners must be applied to it.
+	 *
+	 * The free space btrees and the AGFL are not scanned until the AGF
+	 * has been re-locked and we are ready to reserve space for the new
+	 * rmap btree, so live updates for OWN_AG metadata are not wanted.
+	 */
+	return oi->oi_owner != XFS_RMAP_OWN_AG;
+}
+
+/*
+ * Apply a rmapbt update from the regular filesystem into our shadow btree.
+ * We're running from the thread that owns the AGF buffer and is generating
+ * the update, so we must be careful about which parts of the struct xrep_rmap
+ * that we change.
+ *
+ * @nb is the notifier block embedded in the rmap hook of a struct xrep_rmap,
+ * @action is passed through to __xfs_rmap_finish_intent, and @data points to
+ * the struct xfs_rmap_update_params describing the update.
+ *
+ * Always returns NOTIFY_DONE.  If the update cannot be applied to the
+ * in-memory btree, the inode scan is aborted so that the repair can detect
+ * that the shadow btree is missing data.
+ */
+static int
+xrep_rmapbt_live_update(
+	struct notifier_block		*nb,
+	unsigned long			action,
+	void				*data)
+{
+	struct xfs_rmap_update_params	*p = data;
+	struct xrep_rmap		*rr;
+	struct xfs_mount		*mp;
+	struct xfs_btree_cur		*mcur;
+	struct xfs_trans		*tp;
+	void				*txcookie;
+	int				error;
+
+	rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
+	mp = rr->sc->mp;
+
+	if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
+		goto out_unlock;
+
+	trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p);
+
+	/*
+	 * rr->lock has not been taken yet, so a failure here must bail out
+	 * through a path that does not unlock it.
+	 */
+	error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
+	if (error)
+		goto out_abort;
+
+	mutex_lock(&rr->lock);
+	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
+	error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
+			p->blockcount, &p->oinfo, p->unwritten);
+	xfs_btree_del_cursor(mcur, error);
+	if (error)
+		goto out_cancel;
+
+	error = xfbtree_trans_commit(&rr->rmap_btree, tp);
+	if (error)
+		goto out_cancel;
+
+	xrep_trans_cancel_hook_dummy(&txcookie, tp);
+	mutex_unlock(&rr->lock);
+	return NOTIFY_DONE;
+
+out_cancel:
+	/* Undo the in-memory btree changes while still holding rr->lock. */
+	xfbtree_trans_cancel(&rr->rmap_btree, tp);
+	xrep_trans_cancel_hook_dummy(&txcookie, tp);
+	mutex_unlock(&rr->lock);
+out_abort:
+	xchk_iscan_abort(&rr->iscan);
+out_unlock:
+	return NOTIFY_DONE;
+}
+
+/*
+ * Set up the filesystem scan components.
+ *
+ * Initializes the lock, the in-memory rmap btree, the inode scan, and the
+ * live rmap update hook, in that order. On failure, everything that was set
+ * up before the failing step is torn down again and the error is returned.
+ */
+STATIC int
+xrep_rmap_setup_scan(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ mutex_init(&rr->lock);
+
+ /* Set up in-memory rmap btree */
+ error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
+ sc->sa.pag->pag_agno);
+ if (error)
+ goto out_mutex;
+
+ /* Retry iget every tenth of a second for up to 30 seconds. */
+ xchk_iscan_start(sc, 30000, 100, &rr->iscan);
+
+ /*
+ * Hook into live rmap operations so that we can update our in-memory
+ * btree to reflect live changes on the filesystem. Since we drop the
+ * AGF buffer to scan all the inodes, we need this piece to avoid
+ * installing a stale btree.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_RMAP);
+ xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update);
+ error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook);
+ if (error)
+ goto out_iscan;
+ return 0;
+
+out_iscan:
+ xchk_iscan_teardown(&rr->iscan);
+ xfbtree_destroy(&rr->rmap_btree);
+out_mutex:
+ mutex_destroy(&rr->lock);
+ return error;
+}
+
+/*
+ * Tear down scan components.
+ *
+ * Aborts the inode scan, removes the live update hook, and then releases
+ * the scan, the in-memory rmap btree, and the lock.
+ */
+STATIC void
+xrep_rmap_teardown(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+
+ xchk_iscan_abort(&rr->iscan);
+ xfs_rmap_hook_del(sc->sa.pag, &rr->rhook);
+ xchk_iscan_teardown(&rr->iscan);
+ xfbtree_destroy(&rr->rmap_btree);
+ mutex_destroy(&rr->lock);
+}
+
+/*
+ * Repair the rmap btree for some AG.  Sets up the scan, gathers the rmap
+ * records, builds the new btree, and reaps the old one; each step runs only
+ * if the previous one succeeded.  The scan components are torn down on every
+ * path once setup has succeeded.
+ */
+int
+xrep_rmapbt(
+	struct xfs_scrub	*sc)
+{
+	struct xrep_rmap	*rr = sc->buf;
+	int			error;
+
+	error = xrep_rmap_setup_scan(rr);
+	if (error)
+		return error;
+
+	/*
+	 * Collect rmaps for everything in this AG that isn't space metadata.
+	 * These rmaps won't change even as we try to allocate blocks.
+	 */
+	error = xrep_rmap_find_rmaps(rr);
+
+	/* Rebuild the rmap information. */
+	if (!error)
+		error = xrep_rmap_build_new_tree(rr);
+
+	/* Kill the old tree. */
+	if (!error)
+		error = xrep_rmap_remove_old_tree(rr);
+
+	xrep_rmap_teardown(rr);
+	return error;
+}
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index b1ff4f33324a..5055092bd9e8 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -119,7 +119,7 @@ xfsum_load(
xfs_rtsumoff_t sumoff,
union xfs_suminfo_raw *rawinfo)
{
- return xfile_obj_load(sc->xfile, rawinfo,
+ return xfile_load(sc->xfile, rawinfo,
sizeof(union xfs_suminfo_raw),
sumoff << XFS_WORDLOG);
}
@@ -130,7 +130,7 @@ xfsum_store(
xfs_rtsumoff_t sumoff,
const union xfs_suminfo_raw rawinfo)
{
- return xfile_obj_store(sc->xfile, &rawinfo,
+ return xfile_store(sc->xfile, &rawinfo,
sizeof(union xfs_suminfo_raw),
sumoff << XFS_WORDLOG);
}
@@ -142,7 +142,7 @@ xfsum_copyout(
union xfs_suminfo_raw *rawinfo,
unsigned int nr_words)
{
- return xfile_obj_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG,
+ return xfile_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG,
sumoff << XFS_WORDLOG);
}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index caf324c2b991..20fac9723c08 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -15,6 +15,8 @@
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_scrub.h"
+#include "xfs_buf_mem.h"
+#include "xfs_rmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -157,6 +159,15 @@ xchk_fsgates_disable(
if (sc->flags & XCHK_FSGATES_DRAIN)
xfs_drain_wait_disable();
+ if (sc->flags & XCHK_FSGATES_QUOTA)
+ xfs_dqtrx_hook_disable();
+
+ if (sc->flags & XCHK_FSGATES_DIRENTS)
+ xfs_dir_hook_disable();
+
+ if (sc->flags & XCHK_FSGATES_RMAP)
+ xfs_rmap_hook_disable();
+
sc->flags &= ~XCHK_FSGATES_ALL;
}
@@ -184,6 +195,10 @@ xchk_teardown(
sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
mnt_drop_write_file(sc->file);
}
+ if (sc->xmbtp) {
+ xmbuf_free(sc->xmbtp);
+ sc->xmbtp = NULL;
+ }
if (sc->xfile) {
xfile_destroy(sc->xfile);
sc->xfile = NULL;
@@ -267,7 +282,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.setup = xchk_setup_ag_rmapbt,
.scrub = xchk_rmapbt,
.has = xfs_has_rmapbt,
- .repair = xrep_notsupported,
+ .repair = xrep_rmapbt,
},
[XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */
.type = ST_PERAG,
@@ -358,7 +373,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_FS,
.setup = xchk_setup_fscounters,
.scrub = xchk_fscounters,
- .repair = xrep_notsupported,
+ .repair = xrep_fscounters,
+ },
+ [XFS_SCRUB_TYPE_QUOTACHECK] = { /* quota counters */
+ .type = ST_FS,
+ .setup = xchk_setup_quotacheck,
+ .scrub = xchk_quotacheck,
+ .repair = xrep_quotacheck,
+ },
+ [XFS_SCRUB_TYPE_NLINKS] = { /* inode link counts */
+ .type = ST_FS,
+ .setup = xchk_setup_nlinks,
+ .scrub = xchk_nlinks,
+ .repair = xrep_nlinks,
+ },
+ [XFS_SCRUB_TYPE_HEALTHY] = { /* fs healthy; clean all reminders */
+ .type = ST_FS,
+ .setup = xchk_setup_fs,
+ .scrub = xchk_health_record,
+ .repair = xrep_notsupported,
},
};
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 7fc50654c4fe..9ad65b604fe1 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -99,6 +99,9 @@ struct xfs_scrub {
/* xfile used by the scrubbers; freed at teardown. */
struct xfile *xfile;
+ /* buffer target for in-memory btrees; also freed at teardown. */
+ struct xfs_buftarg *xmbtp;
+
/* Lock flags for @ip. */
uint ilock_flags;
@@ -121,6 +124,9 @@ struct xfs_scrub {
#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */
#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */
#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */
+#define XCHK_FSGATES_QUOTA (1U << 4) /* quota live update enabled */
+#define XCHK_FSGATES_DIRENTS (1U << 5) /* directory live update enabled */
+#define XCHK_FSGATES_RMAP (1U << 6) /* rmapbt live update enabled */
#define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */
#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */
@@ -130,7 +136,10 @@ struct xfs_scrub {
* features are gated off via dynamic code patching, which is why the state
* must be enabled during scrub setup and can only be torn down afterwards.
*/
-#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN)
+#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN | \
+ XCHK_FSGATES_QUOTA | \
+ XCHK_FSGATES_DIRENTS | \
+ XCHK_FSGATES_RMAP)
/* Metadata scrubbers */
int xchk_tester(struct xfs_scrub *sc);
@@ -167,14 +176,21 @@ xchk_rtsummary(struct xfs_scrub *sc)
#endif
#ifdef CONFIG_XFS_QUOTA
int xchk_quota(struct xfs_scrub *sc);
+int xchk_quotacheck(struct xfs_scrub *sc);
#else
static inline int
xchk_quota(struct xfs_scrub *sc)
{
return -ENOENT;
}
+static inline int
+xchk_quotacheck(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
#endif
int xchk_fscounters(struct xfs_scrub *sc);
+int xchk_nlinks(struct xfs_scrub *sc);
/* cross-referencing helpers */
void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno,
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index cd91db4a5548..42cafbed94ac 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -77,6 +77,8 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_GQUOTA] = "grpquota",
[XFS_SCRUB_TYPE_PQUOTA] = "prjquota",
[XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters",
+ [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck",
+ [XFS_SCRUB_TYPE_NLINKS] = "nlinks",
};
/* Format the scrub stats into a text buffer, similar to pcp style. */
@@ -329,9 +331,9 @@ xchk_stats_register(
if (!cs->cs_debugfs)
return;
- debugfs_create_file("stats", 0644, cs->cs_debugfs, cs,
+ debugfs_create_file("stats", 0444, cs->cs_debugfs, cs,
&scrub_stats_fops);
- debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs,
+ debugfs_create_file("clear_stats", 0200, cs->cs_debugfs, cs,
&clear_scrub_stats_fops);
}
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index ddff86713df3..d77d8a9598f6 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -13,6 +13,7 @@
#include "xfs_inode.h"
#include "xfs_symlink.h"
#include "xfs_health.h"
+#include "xfs_symlink_remote.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/health.h"
@@ -67,7 +68,7 @@ xchk_symlink(
}
/* Remote symlink; must read the contents. */
- error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf);
+ error = xfs_symlink_remote_read(sc->ip, sc->buf);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
return error;
if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len)
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index d0e24ffaf754..3dd281d6d185 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -16,10 +16,16 @@
#include "xfs_rtbitmap.h"
#include "xfs_quota.h"
#include "xfs_quota_defs.h"
+#include "xfs_da_format.h"
+#include "xfs_dir2.h"
+#include "xfs_rmap.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/quota.h"
+#include "scrub/iscan.h"
+#include "scrub/nlinks.h"
+#include "scrub/fscounters.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
@@ -32,7 +38,7 @@ xchk_btree_cur_fsbno(
xfs_buf_daddr(cur->bc_levels[level].bp));
if (level == cur->bc_nlevels - 1 &&
- (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino);
return NULLFSBLOCK;
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 6bbb4e8639dc..5b294be52c55 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -15,11 +15,17 @@
#include <linux/tracepoint.h>
#include "xfs_bit.h"
+#include "xfs_quota_defs.h"
+struct xfs_scrub;
struct xfile;
struct xfarray;
struct xfarray_sortinfo;
struct xchk_dqiter;
+struct xchk_iscan;
+struct xchk_nlink;
+struct xchk_fscounters;
+struct xfs_rmap_update_params;
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
@@ -27,14 +33,6 @@ struct xchk_dqiter;
* ring buffer. Somehow this was only worth mentioning in the ftrace sample
* code.
*/
-TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
-
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
@@ -63,6 +61,9 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY);
#define XFS_SCRUB_TYPE_STRINGS \
{ XFS_SCRUB_TYPE_PROBE, "probe" }, \
@@ -89,7 +90,10 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \
{ XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \
{ XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \
- { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }
+ { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \
+ { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \
+ { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \
+ { XFS_SCRUB_TYPE_HEALTHY, "healthy" }
#define XFS_SCRUB_FLAG_STRINGS \
{ XFS_SCRUB_IFLAG_REPAIR, "repair" }, \
@@ -107,9 +111,21 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \
{ XCHK_FSGATES_DRAIN, "fsgates_drain" }, \
{ XCHK_NEED_DRAIN, "need_drain" }, \
+ { XCHK_FSGATES_QUOTA, "fsgates_quota" }, \
+ { XCHK_FSGATES_DIRENTS, "fsgates_dirents" }, \
+ { XCHK_FSGATES_RMAP, "fsgates_rmap" }, \
{ XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \
{ XREP_ALREADY_FIXED, "already_fixed" }
+TRACE_DEFINE_ENUM(XFS_RMAP_MAP);
+TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP);
+TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT);
+TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_ALLOC);
+TRACE_DEFINE_ENUM(XFS_RMAP_FREE);
+
DECLARE_EVENT_CLASS(xchk_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
int error),
@@ -395,6 +411,29 @@ DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap);
DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap);
DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore);
DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter);
+
+TRACE_EVENT(xchk_qcheck_error,
+ TP_PROTO(struct xfs_scrub *sc, xfs_dqtype_t dqtype, xfs_dqid_t id,
+ void *ret_ip),
+ TP_ARGS(sc, dqtype, id, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_dqtype_t, dqtype)
+ __field(xfs_dqid_t, id)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->dqtype = dqtype;
+ __entry->id = id;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d dquot type %s id 0x%x ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS),
+ __entry->id,
+ __entry->ret_ip)
+);
#endif /* CONFIG_XFS_QUOTA */
TRACE_EVENT(xchk_incomplete,
@@ -423,7 +462,7 @@ TRACE_EVENT(xchk_btree_op_error,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
@@ -436,7 +475,7 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -444,10 +483,10 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
+ TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -465,7 +504,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__field(xfs_ino_t, ino)
__field(int, whichfork)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(int, ptr)
__field(xfs_agnumber_t, agno)
@@ -479,7 +518,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__entry->ino = sc->ip->i_ino;
__entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->ptr = cur->bc_levels[level].ptr;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
@@ -487,12 +526,12 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -508,7 +547,7 @@ TRACE_EVENT(xchk_btree_error,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
@@ -519,17 +558,17 @@ TRACE_EVENT(xchk_btree_error,
xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
+ TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -546,7 +585,7 @@ TRACE_EVENT(xchk_ifork_btree_error,
__field(xfs_ino_t, ino)
__field(int, whichfork)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
@@ -559,19 +598,19 @@ TRACE_EVENT(xchk_ifork_btree_error,
__entry->ino = sc->ip->i_ino;
__entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -586,7 +625,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
__field(int, level)
@@ -598,17 +637,17 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->ptr = cur->bc_levels[level].ptr;
),
- TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
+ TP_printk("dev %d:%d type %s %sbt agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->agno,
__entry->bno,
__entry->level,
@@ -861,18 +900,11 @@ TRACE_EVENT(xfile_destroy,
__field(loff_t, size)
),
TP_fast_assign(
- struct xfile_stat statbuf;
- int ret;
+ struct inode *inode = file_inode(xf->file);
- ret = xfile_stat(xf, &statbuf);
- if (!ret) {
- __entry->bytes = statbuf.bytes;
- __entry->size = statbuf.size;
- } else {
- __entry->bytes = -1;
- __entry->size = -1;
- }
- __entry->ino = file_inode(xf->file)->i_ino;
+ __entry->ino = inode->i_ino;
+ __entry->bytes = inode->i_blocks << SECTOR_SHIFT;
+ __entry->size = i_size_read(inode);
),
TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx",
__entry->ino,
@@ -891,19 +923,12 @@ DECLARE_EVENT_CLASS(xfile_class,
__field(unsigned long long, bytecount)
),
TP_fast_assign(
- struct xfile_stat statbuf;
- int ret;
+ struct inode *inode = file_inode(xf->file);
- ret = xfile_stat(xf, &statbuf);
- if (!ret) {
- __entry->bytes_used = statbuf.bytes;
- __entry->size = statbuf.size;
- } else {
- __entry->bytes_used = -1;
- __entry->size = -1;
- }
- __entry->ino = file_inode(xf->file)->i_ino;
+ __entry->ino = inode->i_ino;
+ __entry->bytes_used = inode->i_blocks << SECTOR_SHIFT;
__entry->pos = pos;
+ __entry->size = i_size_read(inode);
__entry->bytecount = bytecount;
),
TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx",
@@ -917,11 +942,11 @@ DECLARE_EVENT_CLASS(xfile_class,
DEFINE_EVENT(xfile_class, name, \
TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \
TP_ARGS(xf, pos, bytecount))
-DEFINE_XFILE_EVENT(xfile_pread);
-DEFINE_XFILE_EVENT(xfile_pwrite);
+DEFINE_XFILE_EVENT(xfile_load);
+DEFINE_XFILE_EVENT(xfile_store);
DEFINE_XFILE_EVENT(xfile_seek_data);
-DEFINE_XFILE_EVENT(xfile_get_page);
-DEFINE_XFILE_EVENT(xfile_put_page);
+DEFINE_XFILE_EVENT(xfile_get_folio);
+DEFINE_XFILE_EVENT(xfile_put_folio);
TRACE_EVENT(xfarray_create,
TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity),
@@ -968,7 +993,7 @@ TRACE_EVENT(xfarray_isort,
__entry->hi - __entry->lo)
);
-TRACE_EVENT(xfarray_pagesort,
+TRACE_EVENT(xfarray_foliosort,
TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
TP_ARGS(si, lo, hi),
TP_STRUCT__entry(
@@ -1039,6 +1064,47 @@ TRACE_EVENT(xfarray_sort,
__entry->bytes)
);
+TRACE_EVENT(xfarray_sort_scan,
+ TP_PROTO(struct xfarray_sortinfo *si, unsigned long long idx),
+ TP_ARGS(si, idx),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, nr)
+ __field(size_t, obj_size)
+ __field(unsigned long long, idx)
+ __field(unsigned long long, folio_pos)
+ __field(unsigned long, folio_bytes)
+ __field(unsigned long long, first_idx)
+ __field(unsigned long long, last_idx)
+ ),
+ TP_fast_assign(
+ __entry->nr = si->array->nr;
+ __entry->obj_size = si->array->obj_size;
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->idx = idx;
+ if (si->folio) {
+ __entry->folio_pos = folio_pos(si->folio);
+ __entry->folio_bytes = folio_size(si->folio);
+ __entry->first_idx = si->first_folio_idx;
+ __entry->last_idx = si->last_folio_idx;
+ } else {
+ __entry->folio_pos = 0;
+ __entry->folio_bytes = 0;
+ __entry->first_idx = 0;
+ __entry->last_idx = 0;
+ }
+ ),
+ TP_printk("xfino 0x%lx nr %llu objsz %zu idx %llu folio_pos 0x%llx folio_bytes 0x%lx first_idx %llu last_idx %llu",
+ __entry->ino,
+ __entry->nr,
+ __entry->obj_size,
+ __entry->idx,
+ __entry->folio_pos,
+ __entry->folio_bytes,
+ __entry->first_idx,
+ __entry->last_idx)
+);
+
TRACE_EVENT(xfarray_sort_stats,
TP_PROTO(struct xfarray_sortinfo *si, int error),
TP_ARGS(si, error),
@@ -1119,6 +1185,323 @@ TRACE_EVENT(xchk_rtsum_record_free,
);
#endif /* CONFIG_XFS_RT */
+DECLARE_EVENT_CLASS(xchk_iscan_class,
+ TP_PROTO(struct xchk_iscan *iscan),
+ TP_ARGS(iscan),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited)
+)
+#define DEFINE_ISCAN_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_class, name, \
+ TP_PROTO(struct xchk_iscan *iscan), \
+ TP_ARGS(iscan))
+DEFINE_ISCAN_EVENT(xchk_iscan_move_cursor);
+DEFINE_ISCAN_EVENT(xchk_iscan_visit);
+DEFINE_ISCAN_EVENT(xchk_iscan_skip);
+DEFINE_ISCAN_EVENT(xchk_iscan_advance_ag);
+
+DECLARE_EVENT_CLASS(xchk_iscan_ino_class,
+ TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino),
+ TP_ARGS(iscan, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, startino)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->startino = iscan->scan_start_ino;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->ino = ino;
+ ),
+ TP_printk("dev %d:%d iscan start 0x%llx cursor 0x%llx visited 0x%llx ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->startino,
+ __entry->cursor,
+ __entry->visited,
+ __entry->ino)
+)
+#define DEFINE_ISCAN_INO_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_ino_class, name, \
+ TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), \
+ TP_ARGS(iscan, ino))
+DEFINE_ISCAN_INO_EVENT(xchk_iscan_want_live_update);
+DEFINE_ISCAN_INO_EVENT(xchk_iscan_start);
+
+TRACE_EVENT(xchk_iscan_iget,
+ TP_PROTO(struct xchk_iscan *iscan, int error),
+ TP_ARGS(iscan, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->error = error;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->error)
+);
+
+TRACE_EVENT(xchk_iscan_iget_batch,
+ TP_PROTO(struct xfs_mount *mp, struct xchk_iscan *iscan,
+ unsigned int nr, unsigned int avail),
+ TP_ARGS(mp, iscan, nr, avail),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(unsigned int, nr)
+ __field(unsigned int, avail)
+ __field(unsigned int, unavail)
+ __field(xfs_ino_t, batch_ino)
+ __field(unsigned long long, skipmask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->nr = nr;
+ __entry->avail = avail;
+ __entry->unavail = hweight64(iscan->__skipped_inomask);
+ __entry->batch_ino = iscan->__batch_ino;
+ __entry->skipmask = iscan->__skipped_inomask;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx batchino 0x%llx skipmask 0x%llx nr %u avail %u unavail %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->batch_ino,
+ __entry->skipmask,
+ __entry->nr,
+ __entry->avail,
+ __entry->unavail)
+);
+
+TRACE_EVENT(xchk_iscan_iget_retry_wait,
+ TP_PROTO(struct xchk_iscan *iscan),
+ TP_ARGS(iscan),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(unsigned int, retry_delay)
+ __field(unsigned long, remaining)
+ __field(unsigned int, iget_timeout)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->retry_delay = iscan->iget_retry_delay;
+ __entry->remaining = jiffies_to_msecs(iscan->__iget_deadline - jiffies);
+ __entry->iget_timeout = iscan->iget_timeout;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx remaining %lu timeout %u delay %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->remaining,
+ __entry->iget_timeout,
+ __entry->retry_delay)
+);
+
+TRACE_EVENT(xchk_nlinks_collect_dirent,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp,
+ xfs_ino_t ino, const struct xfs_name *name),
+ TP_ARGS(mp, dp, ino, name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->dir = dp->i_ino;
+ __entry->ino = ino;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir,
+ __entry->ino,
+ __entry->namelen,
+ __get_str(name))
+);
+
+TRACE_EVENT(xchk_nlinks_collect_metafile,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino),
+ TP_ARGS(mp, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino)
+);
+
+TRACE_EVENT(xchk_nlinks_live_update,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_inode *dp,
+ int action, xfs_ino_t ino, int delta,
+ const char *name, unsigned int namelen),
+ TP_ARGS(mp, dp, action, ino, delta, name, namelen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir)
+ __field(int, action)
+ __field(xfs_ino_t, ino)
+ __field(int, delta)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, namelen)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->dir = dp ? dp->i_ino : NULLFSINO;
+ __entry->action = action;
+ __entry->ino = ino;
+ __entry->delta = delta;
+ __entry->namelen = namelen;
+ memcpy(__get_str(name), name, namelen);
+ ),
+ TP_printk("dev %d:%d dir 0x%llx ino 0x%llx nlink_delta %d name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir,
+ __entry->ino,
+ __entry->delta,
+ __entry->namelen,
+ __get_str(name))
+);
+
+TRACE_EVENT(xchk_nlinks_check_zero,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
+ const struct xchk_nlink *live),
+ TP_ARGS(mp, ino, live),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->parents = live->parents;
+ __entry->backrefs = live->backrefs;
+ __entry->children = live->children;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parents,
+ __entry->backrefs,
+ __entry->children)
+);
+
+TRACE_EVENT(xchk_nlinks_update_incore,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
+ const struct xchk_nlink *live, int parents_delta,
+ int backrefs_delta, int children_delta),
+ TP_ARGS(mp, ino, live, parents_delta, backrefs_delta, children_delta),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ __field(int, parents_delta)
+ __field(int, backrefs_delta)
+ __field(int, children_delta)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->parents = live->parents;
+ __entry->backrefs = live->backrefs;
+ __entry->children = live->children;
+ __entry->parents_delta = parents_delta;
+ __entry->backrefs_delta = backrefs_delta;
+ __entry->children_delta = children_delta;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parents %d:%u backrefs %d:%u children %d:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parents_delta,
+ __entry->parents,
+ __entry->backrefs_delta,
+ __entry->backrefs,
+ __entry->children_delta,
+ __entry->children)
+);
+
+DECLARE_EVENT_CLASS(xchk_nlinks_diff_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip,
+ const struct xchk_nlink *live),
+ TP_ARGS(mp, ip, live),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(uint8_t, ftype)
+ __field(xfs_nlink_t, nlink)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->ftype = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
+ __entry->nlink = VFS_I(ip)->i_nlink;
+ __entry->parents = live->parents;
+ __entry->backrefs = live->backrefs;
+ __entry->children = live->children;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx ftype %s nlink %u parents %u backrefs %u children %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+ __entry->nlink,
+ __entry->parents,
+ __entry->backrefs,
+ __entry->children)
+);
+#define DEFINE_SCRUB_NLINKS_DIFF_EVENT(name) \
+DEFINE_EVENT(xchk_nlinks_diff_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, \
+ const struct xchk_nlink *live), \
+ TP_ARGS(mp, ip, live))
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode);
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
@@ -1223,7 +1606,6 @@ DEFINE_EVENT(xrep_rmap_class, name, \
uint64_t owner, uint64_t offset, unsigned int flags), \
TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap);
-DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn);
DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap);
TRACE_EVENT(xrep_abt_found,
@@ -1341,6 +1723,38 @@ TRACE_EVENT(xrep_bmap_found,
__entry->state)
);
+TRACE_EVENT(xrep_rmap_found,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ const struct xfs_rmap_irec *rec),
+ TP_ARGS(mp, agno, rec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = rec->rm_startblock;
+ __entry->len = rec->rm_blockcount;
+ __entry->owner = rec->rm_owner;
+ __entry->offset = rec->rm_offset;
+ __entry->flags = rec->rm_flags;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+
TRACE_EVENT(xrep_findroot_block,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
uint32_t magic, uint16_t level),
@@ -1425,16 +1839,28 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize,
__entry->refcbt_sz)
)
TRACE_EVENT(xrep_reset_counters,
- TP_PROTO(struct xfs_mount *mp),
- TP_ARGS(mp),
+ TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc),
+ TP_ARGS(mp, fsc),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(uint64_t, icount)
+ __field(uint64_t, ifree)
+ __field(uint64_t, fdblocks)
+ __field(uint64_t, frextents)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
+ __entry->icount = fsc->icount;
+ __entry->ifree = fsc->ifree;
+ __entry->fdblocks = fsc->fdblocks;
+ __entry->frextents = fsc->frextents;
),
- TP_printk("dev %d:%d",
- MAJOR(__entry->dev), MINOR(__entry->dev))
+ TP_printk("dev %d:%d icount %llu ifree %llu fdblocks %llu frextents %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->icount,
+ __entry->ifree,
+ __entry->fdblocks,
+ __entry->frextents)
)
DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
@@ -1645,6 +2071,55 @@ TRACE_EVENT(xrep_dinode_count_rmaps,
__entry->attr_extents)
);
+TRACE_EVENT(xrep_dinode_findmode_dirent,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp,
+ unsigned int ftype),
+ TP_ARGS(sc, dp, ftype),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, ftype)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->ftype = ftype;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR))
+);
+
+TRACE_EVENT(xrep_dinode_findmode_dirent_inval,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp,
+ unsigned int ftype, unsigned int found_ftype),
+ TP_ARGS(sc, dp, ftype, found_ftype),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, ftype)
+ __field(unsigned int, found_ftype)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->ftype = ftype;
+ __entry->found_ftype = found_ftype;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s' found_ftype '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+ __print_symbolic(__entry->found_ftype, XFS_DIR3_FTYPE_STR))
+);
+
TRACE_EVENT(xrep_cow_mark_file_range,
TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock,
xfs_fileoff_t startoff, xfs_filblks_t blockcount),
@@ -1756,8 +2231,48 @@ DEFINE_EVENT(xrep_dquot_class, name, \
DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item);
DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot);
DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole);
+DEFINE_XREP_DQUOT_EVENT(xrep_quotacheck_dquot);
#endif /* CONFIG_XFS_QUOTA */
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode);
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode);
+
+TRACE_EVENT(xrep_rmap_live_update,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int op,
+ const struct xfs_rmap_update_params *p),
+ TP_ARGS(mp, agno, op, p),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, op)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->op = op;
+ __entry->agbno = p->startblock;
+ __entry->len = p->blockcount;
+ xfs_owner_info_unpack(&p->oinfo, &__entry->owner,
+ &__entry->offset, &__entry->flags);
+ if (p->unwritten)
+ __entry->flags |= XFS_RMAP_UNWRITTEN;
+ ),
+ TP_printk("dev %d:%d agno 0x%x op %d agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->op,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
index f0f532c10a5a..17c982a4821d 100644
--- a/fs/xfs/scrub/xfarray.c
+++ b/fs/xfs/scrub/xfarray.c
@@ -16,7 +16,7 @@
* Large Arrays of Fixed-Size Records
* ==================================
*
- * This memory array uses an xfile (which itself is a memfd "file") to store
+ * This memory array uses an xfile (which itself is a shmem file) to store
* large numbers of fixed-size records in memory that can be paged out. This
* puts less stress on the memory reclaim algorithms during an online repair
* because we don't have to pin so much memory. However, array access is less
@@ -136,7 +136,7 @@ xfarray_load(
if (idx >= array->nr)
return -ENODATA;
- return xfile_obj_load(array->xfile, ptr, array->obj_size,
+ return xfile_load(array->xfile, ptr, array->obj_size,
xfarray_pos(array, idx));
}
@@ -152,7 +152,7 @@ xfarray_is_unset(
if (array->unset_slots == 0)
return false;
- error = xfile_obj_load(array->xfile, temp, array->obj_size, pos);
+ error = xfile_load(array->xfile, temp, array->obj_size, pos);
if (!error && xfarray_element_is_null(array, temp))
return true;
@@ -184,7 +184,7 @@ xfarray_unset(
return 0;
memset(temp, 0, array->obj_size);
- error = xfile_obj_store(array->xfile, temp, array->obj_size, pos);
+ error = xfile_store(array->xfile, temp, array->obj_size, pos);
if (error)
return error;
@@ -209,7 +209,7 @@ xfarray_store(
ASSERT(!xfarray_element_is_null(array, ptr));
- ret = xfile_obj_store(array->xfile, ptr, array->obj_size,
+ ret = xfile_store(array->xfile, ptr, array->obj_size,
xfarray_pos(array, idx));
if (ret)
return ret;
@@ -245,12 +245,12 @@ xfarray_store_anywhere(
for (pos = 0;
pos < endpos && array->unset_slots > 0;
pos += array->obj_size) {
- error = xfile_obj_load(array->xfile, temp, array->obj_size,
+ error = xfile_load(array->xfile, temp, array->obj_size,
pos);
if (error || !xfarray_element_is_null(array, temp))
continue;
- error = xfile_obj_store(array->xfile, ptr, array->obj_size,
+ error = xfile_store(array->xfile, ptr, array->obj_size,
pos);
if (error)
return error;
@@ -552,7 +552,7 @@ xfarray_isort(
trace_xfarray_isort(si, lo, hi);
xfarray_sort_bump_loads(si);
- error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos);
+ error = xfile_load(si->array->xfile, scratch, len, lo_pos);
if (error)
return error;
@@ -560,88 +560,45 @@ xfarray_isort(
sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
xfarray_sort_bump_stores(si);
- return xfile_obj_store(si->array->xfile, scratch, len, lo_pos);
+ return xfile_store(si->array->xfile, scratch, len, lo_pos);
}
-/* Grab a page for sorting records. */
-static inline int
-xfarray_sort_get_page(
- struct xfarray_sortinfo *si,
- loff_t pos,
- uint64_t len)
-{
- int error;
-
- error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage);
- if (error)
- return error;
-
- /*
- * xfile pages must never be mapped into userspace, so we skip the
- * dcache flush when mapping the page.
- */
- si->page_kaddr = kmap_local_page(si->xfpage.page);
- return 0;
-}
-
-/* Release a page we grabbed for sorting records. */
-static inline int
-xfarray_sort_put_page(
- struct xfarray_sortinfo *si)
-{
- if (!si->page_kaddr)
- return 0;
-
- kunmap_local(si->page_kaddr);
- si->page_kaddr = NULL;
-
- return xfile_put_page(si->array->xfile, &si->xfpage);
-}
-
-/* Decide if these records are eligible for in-page sorting. */
-static inline bool
-xfarray_want_pagesort(
- struct xfarray_sortinfo *si,
- xfarray_idx_t lo,
- xfarray_idx_t hi)
-{
- pgoff_t lo_page;
- pgoff_t hi_page;
- loff_t end_pos;
-
- /* We can only map one page at a time. */
- lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT;
- end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1;
- hi_page = end_pos >> PAGE_SHIFT;
-
- return lo_page == hi_page;
-}
-
-/* Sort a bunch of records that all live in the same memory page. */
+/*
+ * Sort the records from lo to hi (inclusive) if they are all backed by the
+ * same memory folio. Returns 1 if it sorted, 0 if it did not, or a negative
+ * errno.
+ */
STATIC int
-xfarray_pagesort(
+xfarray_foliosort(
struct xfarray_sortinfo *si,
xfarray_idx_t lo,
xfarray_idx_t hi)
{
+ struct folio *folio;
void *startp;
loff_t lo_pos = xfarray_pos(si->array, lo);
- uint64_t len = xfarray_pos(si->array, hi - lo);
- int error = 0;
+ uint64_t len = xfarray_pos(si->array, hi - lo + 1);
- trace_xfarray_pagesort(si, lo, hi);
+ /* No single folio could back this many records. */
+ if (len > XFILE_MAX_FOLIO_SIZE)
+ return 0;
xfarray_sort_bump_loads(si);
- error = xfarray_sort_get_page(si, lo_pos, len);
- if (error)
- return error;
+ folio = xfile_get_folio(si->array->xfile, lo_pos, len, XFILE_ALLOC);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ if (!folio)
+ return 0;
+
+ trace_xfarray_foliosort(si, lo, hi);
xfarray_sort_bump_heapsorts(si);
- startp = si->page_kaddr + offset_in_page(lo_pos);
+ startp = folio_address(folio) + offset_in_folio(folio, lo_pos);
sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
xfarray_sort_bump_stores(si);
- return xfarray_sort_put_page(si);
+ xfile_put_folio(si->array->xfile, folio);
+ return 1;
}
/* Return a pointer to the xfarray pivot record within the sortinfo struct. */
@@ -829,63 +786,78 @@ xfarray_qsort_push(
return 0;
}
+static inline void
+xfarray_sort_scan_done(
+ struct xfarray_sortinfo *si)
+{
+ if (si->folio)
+ xfile_put_folio(si->array->xfile, si->folio);
+ si->folio = NULL;
+}
+
/*
- * Load an element from the array into the first scratchpad and cache the page,
- * if possible.
+ * Cache the folio backing the start of the given array element. If the array
+ * element is contained entirely within the folio, return a pointer to the
+ * element inside the cached folio. Otherwise, load the element into the
+ * scratchpad and return a pointer to the scratchpad.
*/
static inline int
-xfarray_sort_load_cached(
+xfarray_sort_scan(
struct xfarray_sortinfo *si,
xfarray_idx_t idx,
- void *ptr)
+ void **ptrp)
{
loff_t idx_pos = xfarray_pos(si->array, idx);
- pgoff_t startpage;
- pgoff_t endpage;
int error = 0;
- /*
- * If this load would split a page, release the cached page, if any,
- * and perform a traditional read.
- */
- startpage = idx_pos >> PAGE_SHIFT;
- endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT;
- if (startpage != endpage) {
- error = xfarray_sort_put_page(si);
- if (error)
- return error;
+ if (xfarray_sort_terminated(si, &error))
+ return error;
- if (xfarray_sort_terminated(si, &error))
- return error;
+ trace_xfarray_sort_scan(si, idx);
- return xfile_obj_load(si->array->xfile, ptr,
- si->array->obj_size, idx_pos);
- }
+ /* If the cached folio doesn't cover this index, release it. */
+ if (si->folio &&
+ (idx < si->first_folio_idx || idx > si->last_folio_idx))
+ xfarray_sort_scan_done(si);
- /* If the cached page is not the one we want, release it. */
- if (xfile_page_cached(&si->xfpage) &&
- xfile_page_index(&si->xfpage) != startpage) {
- error = xfarray_sort_put_page(si);
- if (error)
- return error;
+ /* Grab the first folio that backs this array element. */
+ if (!si->folio) {
+ loff_t next_pos;
+
+ si->folio = xfile_get_folio(si->array->xfile, idx_pos,
+ si->array->obj_size, XFILE_ALLOC);
+ if (IS_ERR(si->folio))
+ return PTR_ERR(si->folio);
+
+ si->first_folio_idx = xfarray_idx(si->array,
+ folio_pos(si->folio) + si->array->obj_size - 1);
+
+ next_pos = folio_pos(si->folio) + folio_size(si->folio);
+ si->last_folio_idx = xfarray_idx(si->array, next_pos - 1);
+ if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos)
+ si->last_folio_idx--;
+
+ trace_xfarray_sort_scan(si, idx);
}
/*
- * If we don't have a cached page (and we know the load is contained
- * in a single page) then grab it.
+ * If this folio still doesn't cover the desired element, it must cross
+ * a folio boundary. Read into the scratchpad and we're done.
*/
- if (!xfile_page_cached(&si->xfpage)) {
- if (xfarray_sort_terminated(si, &error))
- return error;
+ if (idx < si->first_folio_idx || idx > si->last_folio_idx) {
+ void *temp = xfarray_scratch(si->array);
- error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT,
- PAGE_SIZE);
+ error = xfile_load(si->array->xfile, temp, si->array->obj_size,
+ idx_pos);
if (error)
return error;
+
+ *ptrp = temp;
+ return 0;
}
- memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos),
- si->array->obj_size);
+ /* Otherwise return a pointer to the array element in the folio. */
+ *ptrp = folio_address(si->folio) + offset_in_folio(si->folio, idx_pos);
return 0;
}
@@ -952,6 +924,8 @@ xfarray_sort(
pivot = xfarray_sortinfo_pivot(si);
while (si->stack_depth >= 0) {
+ int ret;
+
lo = si_lo[si->stack_depth];
hi = si_hi[si->stack_depth];
@@ -964,13 +938,13 @@ xfarray_sort(
}
/*
- * If directly mapping the page and sorting can solve our
+ * If directly mapping the folio and sorting can solve our
* problems, we're done.
*/
- if (xfarray_want_pagesort(si, lo, hi)) {
- error = xfarray_pagesort(si, lo, hi);
- if (error)
- goto out_free;
+ ret = xfarray_foliosort(si, lo, hi);
+ if (ret < 0)
+ goto out_free;
+ if (ret == 1) {
si->stack_depth--;
continue;
}
@@ -995,25 +969,24 @@ xfarray_sort(
* than the pivot is on the right side of the range.
*/
while (lo < hi) {
+ void *p;
+
/*
* Decrement hi until it finds an a[hi] less than the
* pivot value.
*/
- error = xfarray_sort_load_cached(si, hi, scratch);
+ error = xfarray_sort_scan(si, hi, &p);
if (error)
goto out_free;
- while (xfarray_sort_cmp(si, scratch, pivot) >= 0 &&
- lo < hi) {
+ while (xfarray_sort_cmp(si, p, pivot) >= 0 && lo < hi) {
hi--;
- error = xfarray_sort_load_cached(si, hi,
- scratch);
+ error = xfarray_sort_scan(si, hi, &p);
if (error)
goto out_free;
}
- error = xfarray_sort_put_page(si);
- if (error)
- goto out_free;
-
+ if (p != scratch)
+ memcpy(scratch, p, si->array->obj_size);
+ xfarray_sort_scan_done(si);
if (xfarray_sort_terminated(si, &error))
goto out_free;
@@ -1028,21 +1001,18 @@ xfarray_sort(
* Increment lo until it finds an a[lo] greater than
* the pivot value.
*/
- error = xfarray_sort_load_cached(si, lo, scratch);
+ error = xfarray_sort_scan(si, lo, &p);
if (error)
goto out_free;
- while (xfarray_sort_cmp(si, scratch, pivot) <= 0 &&
- lo < hi) {
+ while (xfarray_sort_cmp(si, p, pivot) <= 0 && lo < hi) {
lo++;
- error = xfarray_sort_load_cached(si, lo,
- scratch);
+ error = xfarray_sort_scan(si, lo, &p);
if (error)
goto out_free;
}
- error = xfarray_sort_put_page(si);
- if (error)
- goto out_free;
-
+ if (p != scratch)
+ memcpy(scratch, p, si->array->obj_size);
+ xfarray_sort_scan_done(si);
if (xfarray_sort_terminated(si, &error))
goto out_free;
diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h
index 62b9c506fdd1..acb2f94c56c1 100644
--- a/fs/xfs/scrub/xfarray.h
+++ b/fs/xfs/scrub/xfarray.h
@@ -45,6 +45,25 @@ int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr);
int xfarray_store_anywhere(struct xfarray *array, const void *ptr);
bool xfarray_element_is_null(struct xfarray *array, const void *ptr);
+/*
+ * Load an array element, but zero the buffer if there's no data because we
+ * haven't stored to that array element yet.
+ */
+static inline int
+xfarray_load_sparse(
+ struct xfarray *array,
+ uint64_t idx,
+ void *rec)
+{
+ int error = xfarray_load(array, idx, rec);
+
+ if (error == -ENODATA) {
+ memset(rec, 0, array->obj_size);
+ return 0;
+ }
+ return error;
+}
+
/* Append an element to the array. */
static inline int xfarray_append(struct xfarray *array, const void *ptr)
{
@@ -105,9 +124,14 @@ struct xfarray_sortinfo {
/* XFARRAY_SORT_* flags; see below. */
unsigned int flags;
- /* Cache a page here for faster access. */
- struct xfile_page xfpage;
- void *page_kaddr;
+ /* Cache a folio here for faster scanning for pivots */
+ struct folio *folio;
+
+ /* First array index in folio that is completely readable */
+ xfarray_idx_t first_folio_idx;
+
+ /* Last array index in folio that is completely readable */
+ xfarray_idx_t last_folio_idx;
#ifdef DEBUG
/* Performance statistics. */
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index 090c3ead43fd..8cdd863db585 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -34,13 +34,6 @@
* xfiles assume that the caller will handle all required concurrency
* management; standard vfs locks (freezer and inode) are not taken. Reads
* and writes are satisfied directly from the page cache.
- *
- * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
- * of a hole cause a page to be mapped into the file. If you are going to
- * create a sparse xfile, please be careful about reading from uninitialized
- * parts of the file. These pages are !Uptodate and will eventually be
- * reclaimed if not written, but in the short term this boosts memory
- * consumption.
*/
/*
@@ -62,38 +55,27 @@ xfile_create(
{
struct inode *inode;
struct xfile *xf;
- int error = -ENOMEM;
+ int error;
xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
if (!xf)
return -ENOMEM;
- xf->file = shmem_file_setup(description, isize, 0);
- if (!xf->file)
- goto out_xfile;
+ xf->file = shmem_kernel_file_setup(description, isize, VM_NORESERVE);
if (IS_ERR(xf->file)) {
error = PTR_ERR(xf->file);
goto out_xfile;
}
- /*
- * We want a large sparse file that we can pread, pwrite, and seek.
- * xfile users are responsible for keeping the xfile hidden away from
- * all other callers, so we skip timestamp updates and security checks.
- * Make the inode only accessible by root, just in case the xfile ever
- * escapes.
- */
- xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
- FMODE_LSEEK;
- xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
inode = file_inode(xf->file);
- inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
- inode->i_mode &= ~0177;
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
-
lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);
+ /*
+ * We don't want to bother with kmapping data during repair, so don't
+ * allow highmem pages to back this mapping.
+ */
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
+
trace_xfile_create(xf);
*xfilep = xf;
@@ -118,164 +100,128 @@ xfile_destroy(
}
/*
- * Read a memory object directly from the xfile's page cache. Unlike regular
- * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
- * high an offset, instead of truncating the read. Otherwise, we return
- * bytes read or an error code, like regular pread.
+ * Load an object. Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
*/
-ssize_t
-xfile_pread(
+int
+xfile_load(
struct xfile *xf,
void *buf,
size_t count,
loff_t pos)
{
struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- struct page *page = NULL;
- ssize_t read = 0;
unsigned int pflags;
- int error = 0;
if (count > MAX_RW_COUNT)
- return -E2BIG;
+ return -ENOMEM;
if (inode->i_sb->s_maxbytes - pos < count)
- return -EFBIG;
+ return -ENOMEM;
- trace_xfile_pread(xf, pos, count);
+ trace_xfile_load(xf, pos, count);
pflags = memalloc_nofs_save();
while (count > 0) {
- void *p, *kaddr;
+ struct folio *folio;
unsigned int len;
+ unsigned int offset;
- len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
-
- /*
- * In-kernel reads of a shmem file cause it to allocate a page
- * if the mapping shows a hole. Therefore, if we hit ENOMEM
- * we can continue by zeroing the caller's buffer.
- */
- page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
- __GFP_NOWARN);
- if (IS_ERR(page)) {
- error = PTR_ERR(page);
- if (error != -ENOMEM)
- break;
-
- memset(buf, 0, len);
- goto advance;
- }
-
- if (PageUptodate(page)) {
+ if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ SGP_READ) < 0)
+ break;
+ if (!folio) {
/*
- * xfile pages must never be mapped into userspace, so
- * we skip the dcache flush.
+ * No data stored at this offset, just zero the output
+ * buffer until the next page boundary.
*/
- kaddr = kmap_local_page(page);
- p = kaddr + offset_in_page(pos);
- memcpy(buf, p, len);
- kunmap_local(kaddr);
- } else {
+ len = min_t(ssize_t, count,
+ PAGE_SIZE - offset_in_page(pos));
memset(buf, 0, len);
- }
- put_page(page);
+ } else {
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ break;
+ }
+
+ offset = offset_in_folio(folio, pos);
+ len = min_t(ssize_t, count, folio_size(folio) - offset);
+ memcpy(buf, folio_address(folio) + offset, len);
-advance:
+ folio_unlock(folio);
+ folio_put(folio);
+ }
count -= len;
pos += len;
buf += len;
- read += len;
}
memalloc_nofs_restore(pflags);
- if (read > 0)
- return read;
- return error;
+ if (count)
+ return -ENOMEM;
+ return 0;
}
/*
- * Write a memory object directly to the xfile's page cache. Unlike regular
- * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
- * high an offset, instead of truncating the write. Otherwise, we return
- * bytes written or an error code, like regular pwrite.
+ * Store an object. Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
*/
-ssize_t
-xfile_pwrite(
+int
+xfile_store(
struct xfile *xf,
const void *buf,
size_t count,
loff_t pos)
{
struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- const struct address_space_operations *aops = mapping->a_ops;
- struct page *page = NULL;
- ssize_t written = 0;
unsigned int pflags;
- int error = 0;
if (count > MAX_RW_COUNT)
- return -E2BIG;
+ return -ENOMEM;
if (inode->i_sb->s_maxbytes - pos < count)
- return -EFBIG;
+ return -ENOMEM;
- trace_xfile_pwrite(xf, pos, count);
+ trace_xfile_store(xf, pos, count);
+
+ /*
+ * Increase the file size first so that shmem_get_folio(..., SGP_CACHE)
+ * actually allocates a folio instead of erroring out.
+ */
+ if (pos + count > i_size_read(inode))
+ i_size_write(inode, pos + count);
pflags = memalloc_nofs_save();
while (count > 0) {
- void *fsdata = NULL;
- void *p, *kaddr;
+ struct folio *folio;
unsigned int len;
- int ret;
-
- len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
-
- /*
- * We call write_begin directly here to avoid all the freezer
- * protection lock-taking that happens in the normal path.
- * shmem doesn't support fs freeze, but lockdep doesn't know
- * that and will trip over that.
- */
- error = aops->write_begin(NULL, mapping, pos, len, &page,
- &fsdata);
- if (error)
- break;
+ unsigned int offset;
- /*
- * xfile pages must never be mapped into userspace, so we skip
- * the dcache flush. If the page is not uptodate, zero it
- * before writing data.
- */
- kaddr = kmap_local_page(page);
- if (!PageUptodate(page)) {
- memset(kaddr, 0, PAGE_SIZE);
- SetPageUptodate(page);
- }
- p = kaddr + offset_in_page(pos);
- memcpy(p, buf, len);
- kunmap_local(kaddr);
-
- ret = aops->write_end(NULL, mapping, pos, len, len, page,
- fsdata);
- if (ret < 0) {
- error = ret;
+ if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ SGP_CACHE) < 0)
+ break;
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
break;
}
- written += ret;
- if (ret != len)
- break;
+ offset = offset_in_folio(folio, pos);
+ len = min_t(ssize_t, count, folio_size(folio) - offset);
+ memcpy(folio_address(folio) + offset, buf, len);
+
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
- count -= ret;
- pos += ret;
- buf += ret;
+ count -= len;
+ pos += len;
+ buf += len;
}
memalloc_nofs_restore(pflags);
- if (written > 0)
- return written;
- return error;
+ if (count)
+ return -ENOMEM;
+ return 0;
}
/* Find the next written area in the xfile data for a given offset. */
@@ -291,129 +237,76 @@ xfile_seek_data(
return ret;
}
-/* Query stat information for an xfile. */
-int
-xfile_stat(
- struct xfile *xf,
- struct xfile_stat *statbuf)
-{
- struct kstat ks;
- int error;
-
- error = vfs_getattr_nosec(&xf->file->f_path, &ks,
- STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
- if (error)
- return error;
-
- statbuf->size = ks.size;
- statbuf->bytes = ks.blocks << SECTOR_SHIFT;
- return 0;
-}
-
/*
- * Grab the (locked) page for a memory object. The object cannot span a page
- * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we
- * cannot grab the page, or the usual negative errno.
+ * Grab the (locked) folio for a memory object. The object cannot span a folio
+ * boundary. Returns the locked folio if successful, NULL if there was no
+ * folio or it didn't cover the range requested, or an ERR_PTR on failure.
*/
-int
-xfile_get_page(
+struct folio *
+xfile_get_folio(
struct xfile *xf,
loff_t pos,
- unsigned int len,
- struct xfile_page *xfpage)
+ size_t len,
+ unsigned int flags)
{
struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- const struct address_space_operations *aops = mapping->a_ops;
- struct page *page = NULL;
- void *fsdata = NULL;
- loff_t key = round_down(pos, PAGE_SIZE);
+ struct folio *folio = NULL;
unsigned int pflags;
int error;
if (inode->i_sb->s_maxbytes - pos < len)
- return -ENOMEM;
- if (len > PAGE_SIZE - offset_in_page(pos))
- return -ENOTBLK;
-
- trace_xfile_get_page(xf, pos, len);
+ return ERR_PTR(-ENOMEM);
- pflags = memalloc_nofs_save();
+ trace_xfile_get_folio(xf, pos, len);
/*
- * We call write_begin directly here to avoid all the freezer
- * protection lock-taking that happens in the normal path. shmem
- * doesn't support fs freeze, but lockdep doesn't know that and will
- * trip over that.
+ * Increase the file size first so that shmem_get_folio(..., SGP_CACHE)
+ * actually allocates a folio instead of erroring out.
*/
- error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
- &fsdata);
+ if ((flags & XFILE_ALLOC) && pos + len > i_size_read(inode))
+ i_size_write(inode, pos + len);
+
+ pflags = memalloc_nofs_save();
+ error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ (flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
+ memalloc_nofs_restore(pflags);
if (error)
- goto out_pflags;
+ return ERR_PTR(error);
- /* We got the page, so make sure we push out EOF. */
- if (i_size_read(inode) < pos + len)
- i_size_write(inode, pos + len);
+ if (!folio)
+ return NULL;
- /*
- * If the page isn't up to date, fill it with zeroes before we hand it
- * to the caller and make sure the backing store will hold on to them.
- */
- if (!PageUptodate(page)) {
- void *kaddr;
+ if (len > folio_size(folio) - offset_in_folio(folio, pos)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return NULL;
+ }
- kaddr = kmap_local_page(page);
- memset(kaddr, 0, PAGE_SIZE);
- kunmap_local(kaddr);
- SetPageUptodate(page);
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ERR_PTR(-EIO);
}
/*
- * Mark each page dirty so that the contents are written to some
- * backing store when we drop this buffer, and take an extra reference
- * to prevent the xfile page from being swapped or removed from the
- * page cache by reclaim if the caller unlocks the page.
+ * Mark the folio dirty so that it won't be reclaimed once we drop the
+ * (potentially last) reference in xfile_put_folio.
*/
- set_page_dirty(page);
- get_page(page);
-
- xfpage->page = page;
- xfpage->fsdata = fsdata;
- xfpage->pos = key;
-out_pflags:
- memalloc_nofs_restore(pflags);
- return error;
+ if (flags & XFILE_ALLOC)
+ folio_set_dirty(folio);
+ return folio;
}
/*
- * Release the (locked) page for a memory object. Returns 0 or a negative
- * errno.
+ * Release the (locked) folio for a memory object.
*/
-int
-xfile_put_page(
+void
+xfile_put_folio(
struct xfile *xf,
- struct xfile_page *xfpage)
+ struct folio *folio)
{
- struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- const struct address_space_operations *aops = mapping->a_ops;
- unsigned int pflags;
- int ret;
-
- trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);
-
- /* Give back the reference that we took in xfile_get_page. */
- put_page(xfpage->page);
+ trace_xfile_put_folio(xf, folio_pos(folio), folio_size(folio));
- pflags = memalloc_nofs_save();
- ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
- xfpage->page, xfpage->fsdata);
- memalloc_nofs_restore(pflags);
- memset(xfpage, 0, sizeof(struct xfile_page));
-
- if (ret < 0)
- return ret;
- if (ret != PAGE_SIZE)
- return -EIO;
- return 0;
+ folio_unlock(folio);
+ folio_put(folio);
}
diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h
index d56643b0f429..76d78dba7e34 100644
--- a/fs/xfs/scrub/xfile.h
+++ b/fs/xfs/scrub/xfile.h
@@ -6,22 +6,6 @@
#ifndef __XFS_SCRUB_XFILE_H__
#define __XFS_SCRUB_XFILE_H__
-struct xfile_page {
- struct page *page;
- void *fsdata;
- loff_t pos;
-};
-
-static inline bool xfile_page_cached(const struct xfile_page *xfpage)
-{
- return xfpage->page != NULL;
-}
-
-static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage)
-{
- return xfpage->page->index;
-}
-
struct xfile {
struct file *file;
};
@@ -29,49 +13,17 @@ struct xfile {
int xfile_create(const char *description, loff_t isize, struct xfile **xfilep);
void xfile_destroy(struct xfile *xf);
-ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos);
-ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count,
+int xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos);
+int xfile_store(struct xfile *xf, const void *buf, size_t count,
loff_t pos);
-/*
- * Load an object. Since we're treating this file as "memory", any error or
- * short IO is treated as a failure to allocate memory.
- */
-static inline int
-xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos)
-{
- ssize_t ret = xfile_pread(xf, buf, count, pos);
-
- if (ret < 0 || ret != count)
- return -ENOMEM;
- return 0;
-}
-
-/*
- * Store an object. Since we're treating this file as "memory", any error or
- * short IO is treated as a failure to allocate memory.
- */
-static inline int
-xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos)
-{
- ssize_t ret = xfile_pwrite(xf, buf, count, pos);
-
- if (ret < 0 || ret != count)
- return -ENOMEM;
- return 0;
-}
-
loff_t xfile_seek_data(struct xfile *xf, loff_t pos);
-struct xfile_stat {
- loff_t size;
- unsigned long long bytes;
-};
-
-int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf);
+#define XFILE_MAX_FOLIO_SIZE (PAGE_SIZE << MAX_PAGECACHE_ORDER)
-int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len,
- struct xfile_page *xbuf);
-int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf);
+#define XFILE_ALLOC (1 << 0) /* allocate folio if not present */
+struct folio *xfile_get_folio(struct xfile *xf, loff_t offset, size_t len,
+ unsigned int flags);
+void xfile_put_folio(struct xfile *xf, struct folio *folio);
#endif /* __XFS_SCRUB_XFILE_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 6b840301817a..4bf69c9c088e 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -167,7 +167,7 @@ xfs_get_acl(struct inode *inode, int type, bool rcu)
acl = ERR_PTR(error);
}
- kmem_free(args.value);
+ kvfree(args.value);
return acl;
}
@@ -204,7 +204,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
}
error = xfs_attr_change(&args);
- kmem_free(args.value);
+ kvfree(args.value);
/*
* If the attribute didn't exist to start with that's fine.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 813f85156b0c..3f428620ebf2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -112,7 +112,7 @@ xfs_end_ioend(
* longer dirty. If we don't remove delalloc blocks here, they become
* stale and can corrupt free space accounting on unmount.
*/
- error = blk_status_to_errno(ioend->io_bio->bi_status);
+ error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
@@ -179,7 +179,7 @@ STATIC void
xfs_end_bio(
struct bio *bio)
{
- struct iomap_ioend *ioend = bio->bi_private;
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
struct xfs_inode *ip = XFS_I(ioend->io_inode);
unsigned long flags;
@@ -276,7 +276,8 @@ static int
xfs_map_blocks(
struct iomap_writepage_ctx *wpc,
struct inode *inode,
- loff_t offset)
+ loff_t offset,
+ unsigned int len)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -444,7 +445,7 @@ xfs_prepare_ioend(
/* send ioends that might require a transaction to the completion wq */
if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
(ioend->io_flags & IOMAP_F_SHARED))
- ioend->io_bio->bi_end_io = xfs_end_bio;
+ ioend->io_bio.bi_end_io = xfs_end_bio;
return status;
}
@@ -502,13 +503,6 @@ xfs_vm_writepages(
{
struct xfs_writepage_ctx wpc = { };
- /*
- * Writing back data in a transaction context can result in recursive
- * transactions. This is bad, so issue a warning and get out of here.
- */
- if (WARN_ON_ONCE(current->journal_info))
- return 0;
-
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 89c7a9f4f930..24fb12986a56 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -23,6 +23,7 @@
#include "xfs_quota.h"
#include "xfs_dir2.h"
#include "xfs_error.h"
+#include "xfs_health.h"
/*
* Invalidate any incore buffers associated with this remote attribute value
@@ -147,6 +148,7 @@ xfs_attr3_node_inactive(
if (level > XFS_DA_NODE_MAXDEPTH) {
xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
@@ -197,6 +199,7 @@ xfs_attr3_node_inactive(
default:
xfs_buf_mark_corrupt(child_bp);
xfs_trans_brelse(*trans, child_bp);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
error = -EFSCORRUPTED;
break;
}
@@ -286,6 +289,7 @@ xfs_attr3_root_inactive(
error = xfs_attr3_leaf_inactive(trans, dp, bp);
break;
default:
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
error = -EFSCORRUPTED;
xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(*trans, bp);
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 9e02111bd890..9b4c61e1c22e 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -108,7 +108,7 @@ STATIC void
xfs_attri_item_free(
struct xfs_attri_log_item *attrip)
{
- kmem_free(attrip->attri_item.li_lv_shadow);
+ kvfree(attrip->attri_item.li_lv_shadow);
xfs_attri_log_nameval_put(attrip->attri_nameval);
kmem_cache_free(xfs_attri_cache, attrip);
}
@@ -226,7 +226,7 @@ xfs_attri_init(
{
struct xfs_attri_log_item *attrip;
- attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL);
+ attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_KERNEL | __GFP_NOFAIL);
/*
* Grab an extra reference to the name/value buffer for this log item.
@@ -251,7 +251,7 @@ static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip)
STATIC void
xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp)
{
- kmem_free(attrdp->attrd_item.li_lv_shadow);
+ kvfree(attrdp->attrd_item.li_lv_shadow);
kmem_cache_free(xfs_attrd_cache, attrdp);
}
@@ -386,11 +386,16 @@ xfs_attr_free_item(
xfs_da_state_free(attr->xattri_da_state);
xfs_attri_log_nameval_put(attr->xattri_nameval);
if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY)
- kmem_free(attr);
+ kfree(attr);
else
kmem_cache_free(xfs_attr_intent_cache, attr);
}
+static inline struct xfs_attr_intent *attri_entry(const struct list_head *e)
+{
+ return list_entry(e, struct xfs_attr_intent, xattri_list);
+}
+
/* Process an attr. */
STATIC int
xfs_attr_finish_item(
@@ -399,11 +404,10 @@ xfs_attr_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_attr_intent *attr;
+ struct xfs_attr_intent *attr = attri_entry(item);
struct xfs_da_args *args;
int error;
- attr = container_of(item, struct xfs_attr_intent, xattri_list);
args = attr->xattri_da_args;
/* Reset trans after EAGAIN cycle since the transaction is new */
@@ -443,9 +447,8 @@ STATIC void
xfs_attr_cancel_item(
struct list_head *item)
{
- struct xfs_attr_intent *attr;
+ struct xfs_attr_intent *attr = attri_entry(item);
- attr = container_of(item, struct xfs_attr_intent, xattri_list);
xfs_attr_free_item(attr);
}
@@ -512,8 +515,8 @@ xfs_attri_recover_work(
if (error)
return ERR_PTR(error);
- attr = kmem_zalloc(sizeof(struct xfs_attr_intent) +
- sizeof(struct xfs_da_args), KM_NOFS);
+ attr = kzalloc(sizeof(struct xfs_attr_intent) +
+ sizeof(struct xfs_da_args), GFP_KERNEL | __GFP_NOFAIL);
args = (struct xfs_da_args *)(attr + 1);
attr->xattri_da_args = args;
@@ -666,7 +669,7 @@ xfs_attr_create_done(
attrip = ATTRI_ITEM(intent);
- attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
+ attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
&xfs_attrd_item_ops);
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index e368ad671e26..a6819a642cc0 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -22,6 +22,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_dir2.h"
+#include "xfs_health.h"
STATIC int
xfs_attr_shortform_compare(const void *a, const void *b)
@@ -82,8 +83,10 @@ xfs_attr_shortform_list(
for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
if (XFS_IS_CORRUPT(context->dp->i_mount,
!xfs_attr_namecheck(sfe->nameval,
- sfe->namelen)))
+ sfe->namelen))) {
+ xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
context->put_listent(context,
sfe->flags,
sfe->nameval,
@@ -109,7 +112,7 @@ xfs_attr_shortform_list(
* It didn't all fit, so we have to sort everything on hashval.
*/
sbsize = sf->count * sizeof(*sbuf);
- sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
+ sbp = sbuf = kmalloc(sbsize, GFP_KERNEL | __GFP_NOFAIL);
/*
* Scan the attribute list for the rest of the entries, storing
@@ -124,7 +127,8 @@ xfs_attr_shortform_list(
XFS_ERRLEVEL_LOW,
context->dp->i_mount, sfe,
sizeof(*sfe));
- kmem_free(sbuf);
+ kfree(sbuf);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
@@ -175,6 +179,7 @@ xfs_attr_shortform_list(
if (XFS_IS_CORRUPT(context->dp->i_mount,
!xfs_attr_namecheck(sbp->name,
sbp->namelen))) {
+ xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
error = -EFSCORRUPTED;
goto out;
}
@@ -188,7 +193,7 @@ xfs_attr_shortform_list(
cursor->offset++;
}
out:
- kmem_free(sbuf);
+ kfree(sbuf);
return error;
}
@@ -262,8 +267,10 @@ xfs_attr_node_list_lookup(
return 0;
/* We can't point back to the root. */
- if (XFS_IS_CORRUPT(mp, cursor->blkno == 0))
+ if (XFS_IS_CORRUPT(mp, cursor->blkno == 0)) {
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
}
if (expected_level != 0)
@@ -275,6 +282,7 @@ xfs_attr_node_list_lookup(
out_corruptbuf:
xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(tp, bp);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
@@ -304,6 +312,8 @@ xfs_attr_node_list(
if (cursor->blkno > 0) {
error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp,
XFS_ATTR_FORK);
+ if (xfs_metadata_is_sick(error))
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
if ((error != 0) && (error != -EFSCORRUPTED))
return error;
if (bp) {
@@ -464,8 +474,10 @@ xfs_attr3_leaf_list_int(
}
if (XFS_IS_CORRUPT(context->dp->i_mount,
- !xfs_attr_namecheck(name, namelen)))
+ !xfs_attr_namecheck(name, namelen))) {
+ xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
context->put_listent(context, entry->flags,
name, namelen, valuelen);
if (context->seen_enough)
@@ -504,7 +516,7 @@ xfs_attr_list_ilocked(
{
struct xfs_inode *dp = context->dp;
- ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
/*
* Decide on what work routines to call based on the inode size.
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 52fb8a148b7d..d27859a684aa 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -25,6 +25,7 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
#include "xfs_ag.h"
+#include "xfs_trace.h"
struct kmem_cache *xfs_bui_cache;
struct kmem_cache *xfs_bud_cache;
@@ -40,7 +41,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
- kmem_free(buip->bui_item.li_lv_shadow);
+ kvfree(buip->bui_item.li_lv_shadow);
kmem_cache_free(xfs_bui_cache, buip);
}
@@ -201,7 +202,7 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
- kmem_free(budp->bud_item.li_lv_shadow);
+ kvfree(budp->bud_item.li_lv_shadow);
kmem_cache_free(xfs_bud_cache, budp);
}
@@ -221,6 +222,11 @@ static const struct xfs_item_ops xfs_bud_item_ops = {
.iop_intent = xfs_bud_item_intent,
};
+static inline struct xfs_bmap_intent *bi_entry(const struct list_head *e)
+{
+ return list_entry(e, struct xfs_bmap_intent, bi_list);
+}
+
/* Sort bmap intents by inode. */
static int
xfs_bmap_update_diff_items(
@@ -228,37 +234,12 @@ xfs_bmap_update_diff_items(
const struct list_head *a,
const struct list_head *b)
{
- struct xfs_bmap_intent *ba;
- struct xfs_bmap_intent *bb;
+ struct xfs_bmap_intent *ba = bi_entry(a);
+ struct xfs_bmap_intent *bb = bi_entry(b);
- ba = container_of(a, struct xfs_bmap_intent, bi_list);
- bb = container_of(b, struct xfs_bmap_intent, bi_list);
return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
}
-/* Set the map extent flags for this mapping. */
-static void
-xfs_trans_set_bmap_flags(
- struct xfs_map_extent *map,
- enum xfs_bmap_intent_type type,
- int whichfork,
- xfs_exntst_t state)
-{
- map->me_flags = 0;
- switch (type) {
- case XFS_BMAP_MAP:
- case XFS_BMAP_UNMAP:
- map->me_flags = type;
- break;
- default:
- ASSERT(0);
- }
- if (state == XFS_EXT_UNWRITTEN)
- map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
- if (whichfork == XFS_ATTR_FORK)
- map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
-}
-
/* Log bmap updates in the intent item. */
STATIC void
xfs_bmap_update_log_item(
@@ -281,8 +262,21 @@ xfs_bmap_update_log_item(
map->me_startblock = bi->bi_bmap.br_startblock;
map->me_startoff = bi->bi_bmap.br_startoff;
map->me_len = bi->bi_bmap.br_blockcount;
- xfs_trans_set_bmap_flags(map, bi->bi_type, bi->bi_whichfork,
- bi->bi_bmap.br_state);
+
+ switch (bi->bi_type) {
+ case XFS_BMAP_MAP:
+ case XFS_BMAP_UNMAP:
+ map->me_flags = bi->bi_type;
+ break;
+ default:
+ ASSERT(0);
+ }
+ if (bi->bi_bmap.br_state == XFS_EXT_UNWRITTEN)
+ map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
+ if (bi->bi_whichfork == XFS_ATTR_FORK)
+ map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
+ if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+ map->me_flags |= XFS_BMAP_EXTENT_REALTIME;
}
static struct xfs_log_item *
@@ -325,13 +319,16 @@ xfs_bmap_update_create_done(
}
/* Take a passive ref to the AG containing the space we're mapping. */
-void
+static inline void
xfs_bmap_update_get_group(
struct xfs_mount *mp,
struct xfs_bmap_intent *bi)
{
xfs_agnumber_t agno;
+ if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+ return;
+
agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock);
/*
@@ -344,14 +341,40 @@ xfs_bmap_update_get_group(
bi->bi_pag = xfs_perag_intent_get(mp, agno);
}
+/* Add this deferred BUI to the transaction. */
+void
+xfs_bmap_defer_add(
+ struct xfs_trans *tp,
+ struct xfs_bmap_intent *bi)
+{
+ trace_xfs_bmap_defer(bi);
+
+ xfs_bmap_update_get_group(tp->t_mountp, bi);
+ xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type);
+}
+
/* Release a passive AG ref after finishing mapping work. */
static inline void
xfs_bmap_update_put_group(
struct xfs_bmap_intent *bi)
{
+ if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+ return;
+
xfs_perag_intent_put(bi->bi_pag);
}
+/* Cancel a deferred bmap update. */
+STATIC void
+xfs_bmap_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_bmap_intent *bi = bi_entry(item);
+
+ xfs_bmap_update_put_group(bi);
+ kmem_cache_free(xfs_bmap_intent_cache, bi);
+}
+
/* Process a deferred bmap update. */
STATIC int
xfs_bmap_update_finish_item(
@@ -360,19 +383,16 @@ xfs_bmap_update_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_bmap_intent *bi;
+ struct xfs_bmap_intent *bi = bi_entry(item);
int error;
- bi = container_of(item, struct xfs_bmap_intent, bi_list);
-
error = xfs_bmap_finish_one(tp, bi);
if (!error && bi->bi_bmap.br_blockcount > 0) {
ASSERT(bi->bi_type == XFS_BMAP_UNMAP);
return -EAGAIN;
}
- xfs_bmap_update_put_group(bi);
- kmem_cache_free(xfs_bmap_intent_cache, bi);
+ xfs_bmap_update_cancel_item(item);
return error;
}
@@ -384,19 +404,6 @@ xfs_bmap_update_abort_intent(
xfs_bui_release(BUI_ITEM(intent));
}
-/* Cancel a deferred bmap update. */
-STATIC void
-xfs_bmap_update_cancel_item(
- struct list_head *item)
-{
- struct xfs_bmap_intent *bi;
-
- bi = container_of(item, struct xfs_bmap_intent, bi_list);
-
- xfs_bmap_update_put_group(bi);
- kmem_cache_free(xfs_bmap_intent_cache, bi);
-}
-
/* Is this recovered BUI ok? */
static inline bool
xfs_bui_validate(
@@ -428,6 +435,9 @@ xfs_bui_validate(
if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len))
return false;
+ if (map->me_flags & XFS_BMAP_EXTENT_REALTIME)
+ return xfs_verify_rtbext(mp, map->me_startblock, map->me_len);
+
return xfs_verify_fsbext(mp, map->me_startblock, map->me_len);
}
@@ -445,7 +455,8 @@ xfs_bui_recover_work(
if (error)
return ERR_PTR(error);
- bi = kmem_cache_zalloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ bi = kmem_cache_zalloc(xfs_bmap_intent_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
XFS_ATTR_FORK : XFS_DATA_FORK;
bi->bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
@@ -502,6 +513,12 @@ xfs_bmap_recover_work(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ if (!!(map->me_flags & XFS_BMAP_EXTENT_REALTIME) !=
+ xfs_ifork_is_realtime(ip, work->bi_whichfork)) {
+ error = -EFSCORRUPTED;
+ goto err_cancel;
+ }
+
if (work->bi_type == XFS_BMAP_MAP)
iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT;
else
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index 3fafd3881a0b..6fee6a508343 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -68,4 +68,8 @@ struct xfs_bud_log_item {
extern struct kmem_cache *xfs_bui_cache;
extern struct kmem_cache *xfs_bud_cache;
+struct xfs_bmap_intent;
+
+void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
+
#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index c2531c28905c..19e11d1da660 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -66,7 +66,7 @@ xfs_zero_extent(
return blkdev_issue_zeroout(target->bt_bdev,
block << (mp->m_super->s_blocksize_bits - 9),
count_fsb << (mp->m_super->s_blocksize_bits - 9),
- GFP_NOFS, 0);
+ GFP_KERNEL, 0);
}
/*
@@ -508,8 +508,8 @@ xfs_can_free_eofblocks(
* Caller must either hold the exclusive io lock; or be inactivating
* the inode, which guarantees there are no other users of the inode.
*/
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL) ||
- (VFS_I(ip)->i_state & I_FREEING));
+ if (!(VFS_I(ip)->i_state & I_FREEING))
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
/* prealloc/delalloc exists only on regular files */
if (!S_ISREG(VFS_I(ip)->i_mode))
@@ -965,8 +965,7 @@ xfs_collapse_file_space(
xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
bool done = false;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
trace_xfs_collapse_file_space(ip);
@@ -1035,8 +1034,7 @@ xfs_insert_file_space(
xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
bool done = false;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
trace_xfs_insert_file_space(ip);
@@ -1307,16 +1305,16 @@ xfs_swap_extent_rmap(
}
/* Remove the mapping from the donor file. */
- xfs_bmap_unmap_extent(tp, tip, &uirec);
+ xfs_bmap_unmap_extent(tp, tip, XFS_DATA_FORK, &uirec);
/* Remove the mapping from the source file. */
- xfs_bmap_unmap_extent(tp, ip, &irec);
+ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &irec);
/* Map the donor file's blocks into the source file. */
- xfs_bmap_map_extent(tp, ip, &uirec);
+ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &uirec);
/* Map the source file's blocks into the donor file. */
- xfs_bmap_map_extent(tp, tip, &irec);
+ xfs_bmap_map_extent(tp, tip, XFS_DATA_FORK, &irec);
error = xfs_defer_finish(tpp);
tp = *tpp;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8e5bd50d29fe..f0fa02264eda 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -21,6 +21,7 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_ag.h"
+#include "xfs_buf_mem.h"
struct kmem_cache *xfs_buf_cache;
@@ -60,6 +61,11 @@ xfs_buf_submit(
return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
}
+static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
+{
+ return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
+}
+
static inline int
xfs_buf_is_vmapped(
struct xfs_buf *bp)
@@ -189,8 +195,8 @@ xfs_buf_get_maps(
return 0;
}
- bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
- KM_NOFS);
+ bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map),
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
if (!bp->b_maps)
return -ENOMEM;
return 0;
@@ -204,7 +210,7 @@ xfs_buf_free_maps(
struct xfs_buf *bp)
{
if (bp->b_maps != &bp->__b_map) {
- kmem_free(bp->b_maps);
+ kfree(bp->b_maps);
bp->b_maps = NULL;
}
}
@@ -222,7 +228,8 @@ _xfs_buf_alloc(
int i;
*bpp = NULL;
- bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
+ bp = kmem_cache_zalloc(xfs_buf_cache,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
/*
* We don't want certain flags to appear in b_flags unless they are
@@ -289,7 +296,7 @@ xfs_buf_free_pages(
mm_account_reclaimed_pages(bp->b_page_count);
if (bp->b_pages != bp->b_page_array)
- kmem_free(bp->b_pages);
+ kfree(bp->b_pages);
bp->b_pages = NULL;
bp->b_flags &= ~_XBF_PAGES;
}
@@ -312,10 +319,12 @@ xfs_buf_free(
ASSERT(list_empty(&bp->b_lru));
- if (bp->b_flags & _XBF_PAGES)
+ if (xfs_buftarg_is_mem(bp->b_target))
+ xmbuf_unmap_page(bp);
+ else if (bp->b_flags & _XBF_PAGES)
xfs_buf_free_pages(bp);
else if (bp->b_flags & _XBF_KMEM)
- kmem_free(bp->b_addr);
+ kfree(bp->b_addr);
call_rcu(&bp->b_rcu, xfs_buf_free_callback);
}
@@ -325,21 +334,21 @@ xfs_buf_alloc_kmem(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
- xfs_km_flags_t kmflag_mask = KM_NOFS;
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL;
size_t size = BBTOB(bp->b_length);
/* Assure zeroed buffer for non-read cases. */
if (!(flags & XBF_READ))
- kmflag_mask |= KM_ZERO;
+ gfp_mask |= __GFP_ZERO;
- bp->b_addr = kmem_alloc(size, kmflag_mask);
+ bp->b_addr = kmalloc(size, gfp_mask);
if (!bp->b_addr)
return -ENOMEM;
if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
((unsigned long)bp->b_addr & PAGE_MASK)) {
/* b_addr spans two pages - use alloc_page instead */
- kmem_free(bp->b_addr);
+ kfree(bp->b_addr);
bp->b_addr = NULL;
return -ENOMEM;
}
@@ -356,13 +365,11 @@ xfs_buf_alloc_pages(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
- gfp_t gfp_mask = __GFP_NOWARN;
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
long filled = 0;
if (flags & XBF_READ_AHEAD)
gfp_mask |= __GFP_NORETRY;
- else
- gfp_mask |= GFP_NOFS;
/* Make sure that we have a page list */
bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
@@ -429,11 +436,18 @@ _xfs_buf_map_pages(
/*
* vm_map_ram() will allocate auxiliary structures (e.g.
- * pagetables) with GFP_KERNEL, yet we are likely to be under
- * GFP_NOFS context here. Hence we need to tell memory reclaim
- * that we are in such a context via PF_MEMALLOC_NOFS to prevent
- * memory reclaim re-entering the filesystem here and
- * potentially deadlocking.
+ * pagetables) with GFP_KERNEL, yet we are often under a scoped nofs
+ * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
+ * from the same call site that can be run from both above and
+ * below memory reclaim causes lockdep false positives. Hence we
+ * always need to force this allocation to nofs context because
+ * we can't pass __GFP_NOLOCKDEP down to auxiliary structures to
+ * prevent false positive lockdep reports.
+ *
+ * XXX(dgc): I think dquot reclaim is the only place we can get
+ * to this function from memory reclaim context now. If we fix
+ * that like we've fixed inode reclaim to avoid writeback from
+ * reclaim, this nofs wrapping can go away.
*/
nofs_flag = memalloc_nofs_save();
do {
@@ -499,18 +513,18 @@ static const struct rhashtable_params xfs_buf_hash_params = {
};
int
-xfs_buf_hash_init(
- struct xfs_perag *pag)
+xfs_buf_cache_init(
+ struct xfs_buf_cache *bch)
{
- spin_lock_init(&pag->pag_buf_lock);
- return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
+ spin_lock_init(&bch->bc_lock);
+ return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
}
void
-xfs_buf_hash_destroy(
- struct xfs_perag *pag)
+xfs_buf_cache_destroy(
+ struct xfs_buf_cache *bch)
{
- rhashtable_destroy(&pag->pag_buf_hash);
+ rhashtable_destroy(&bch->bc_hash);
}
static int
@@ -573,7 +587,7 @@ xfs_buf_find_lock(
static inline int
xfs_buf_lookup(
- struct xfs_perag *pag,
+ struct xfs_buf_cache *bch,
struct xfs_buf_map *map,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
@@ -582,7 +596,7 @@ xfs_buf_lookup(
int error;
rcu_read_lock();
- bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
+ bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
rcu_read_unlock();
return -ENOENT;
@@ -607,6 +621,7 @@ xfs_buf_lookup(
static int
xfs_buf_find_insert(
struct xfs_buftarg *btp,
+ struct xfs_buf_cache *bch,
struct xfs_perag *pag,
struct xfs_buf_map *cmap,
struct xfs_buf_map *map,
@@ -622,31 +637,33 @@ xfs_buf_find_insert(
if (error)
goto out_drop_pag;
- /*
- * For buffers that fit entirely within a single page, first attempt to
- * allocate the memory from the heap to minimise memory usage. If we
- * can't get heap memory for these small buffers, we fall back to using
- * the page allocator.
- */
- if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
- xfs_buf_alloc_kmem(new_bp, flags) < 0) {
+ if (xfs_buftarg_is_mem(new_bp->b_target)) {
+ error = xmbuf_map_page(new_bp);
+ } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
+ xfs_buf_alloc_kmem(new_bp, flags) < 0) {
+ /*
+ * For buffers that fit entirely within a single page, first
+ * attempt to allocate the memory from the heap to minimise
+ * memory usage. If we can't get heap memory for these small
+ * buffers, we fall back to using the page allocator.
+ */
error = xfs_buf_alloc_pages(new_bp, flags);
- if (error)
- goto out_free_buf;
}
+ if (error)
+ goto out_free_buf;
- spin_lock(&pag->pag_buf_lock);
- bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
+ spin_lock(&bch->bc_lock);
+ bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
&new_bp->b_rhash_head, xfs_buf_hash_params);
if (IS_ERR(bp)) {
error = PTR_ERR(bp);
- spin_unlock(&pag->pag_buf_lock);
+ spin_unlock(&bch->bc_lock);
goto out_free_buf;
}
if (bp) {
/* found an existing buffer */
atomic_inc(&bp->b_hold);
- spin_unlock(&pag->pag_buf_lock);
+ spin_unlock(&bch->bc_lock);
error = xfs_buf_find_lock(bp, flags);
if (error)
xfs_buf_rele(bp);
@@ -657,17 +674,40 @@ xfs_buf_find_insert(
/* The new buffer keeps the perag reference until it is freed. */
new_bp->b_pag = pag;
- spin_unlock(&pag->pag_buf_lock);
+ spin_unlock(&bch->bc_lock);
*bpp = new_bp;
return 0;
out_free_buf:
xfs_buf_free(new_bp);
out_drop_pag:
- xfs_perag_put(pag);
+ if (pag)
+ xfs_perag_put(pag);
return error;
}
+static inline struct xfs_perag *
+xfs_buftarg_get_pag(
+ struct xfs_buftarg *btp,
+ const struct xfs_buf_map *map)
+{
+ struct xfs_mount *mp = btp->bt_mount;
+
+ if (xfs_buftarg_is_mem(btp))
+ return NULL;
+ return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
+}
+
+static inline struct xfs_buf_cache *
+xfs_buftarg_buf_cache(
+ struct xfs_buftarg *btp,
+ struct xfs_perag *pag)
+{
+ if (pag)
+ return &pag->pag_bcache;
+ return btp->bt_cache;
+}
+
/*
* Assembles a buffer covering the specified range. The code is optimised for
* cache hits, as metadata intensive workloads will see 3 orders of magnitude
@@ -681,6 +721,7 @@ xfs_buf_get_map(
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
+ struct xfs_buf_cache *bch;
struct xfs_perag *pag;
struct xfs_buf *bp = NULL;
struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
@@ -696,10 +737,10 @@ xfs_buf_get_map(
if (error)
return error;
- pag = xfs_perag_get(btp->bt_mount,
- xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
+ pag = xfs_buftarg_get_pag(btp, &cmap);
+ bch = xfs_buftarg_buf_cache(btp, pag);
- error = xfs_buf_lookup(pag, &cmap, flags, &bp);
+ error = xfs_buf_lookup(bch, &cmap, flags, &bp);
if (error && error != -ENOENT)
goto out_put_perag;
@@ -711,13 +752,14 @@ xfs_buf_get_map(
goto out_put_perag;
/* xfs_buf_find_insert() consumes the perag reference. */
- error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
+ error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
flags, &bp);
if (error)
return error;
} else {
XFS_STATS_INC(btp->bt_mount, xb_get_locked);
- xfs_perag_put(pag);
+ if (pag)
+ xfs_perag_put(pag);
}
/* We do not hold a perag reference anymore. */
@@ -745,7 +787,8 @@ xfs_buf_get_map(
return 0;
out_put_perag:
- xfs_perag_put(pag);
+ if (pag)
+ xfs_perag_put(pag);
return error;
}
@@ -892,6 +935,13 @@ xfs_buf_readahead_map(
{
struct xfs_buf *bp;
+ /*
+ * Currently we don't have a good means or justification for performing
+ * xmbuf_map_page asynchronously, so we don't do readahead.
+ */
+ if (xfs_buftarg_is_mem(target))
+ return;
+
xfs_buf_read_map(target, map, nmaps,
XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
__this_address);
@@ -957,7 +1007,10 @@ xfs_buf_get_uncached(
if (error)
return error;
- error = xfs_buf_alloc_pages(bp, flags);
+ if (xfs_buftarg_is_mem(bp->b_target))
+ error = xmbuf_map_page(bp);
+ else
+ error = xfs_buf_alloc_pages(bp, flags);
if (error)
goto fail_free_buf;
@@ -990,29 +1043,29 @@ xfs_buf_hold(
atomic_inc(&bp->b_hold);
}
-/*
- * Release a hold on the specified buffer. If the hold count is 1, the buffer is
- * placed on LRU or freed (depending on b_lru_ref).
- */
-void
-xfs_buf_rele(
+static void
+xfs_buf_rele_uncached(
+ struct xfs_buf *bp)
+{
+ ASSERT(list_empty(&bp->b_lru));
+ if (atomic_dec_and_test(&bp->b_hold)) {
+ xfs_buf_ioacct_dec(bp);
+ xfs_buf_free(bp);
+ }
+}
+
+static void
+xfs_buf_rele_cached(
struct xfs_buf *bp)
{
+ struct xfs_buftarg *btp = bp->b_target;
struct xfs_perag *pag = bp->b_pag;
+ struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
bool release;
bool freebuf = false;
trace_xfs_buf_rele(bp, _RET_IP_);
- if (!pag) {
- ASSERT(list_empty(&bp->b_lru));
- if (atomic_dec_and_test(&bp->b_hold)) {
- xfs_buf_ioacct_dec(bp);
- xfs_buf_free(bp);
- }
- return;
- }
-
ASSERT(atomic_read(&bp->b_hold) > 0);
/*
@@ -1026,7 +1079,7 @@ xfs_buf_rele(
* leading to a use-after-free scenario.
*/
spin_lock(&bp->b_lock);
- release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+ release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
if (!release) {
/*
* Drop the in-flight state if the buffer is already on the LRU
@@ -1047,11 +1100,11 @@ xfs_buf_rele(
* buffer for the LRU and clear the (now stale) dispose list
* state flag
*/
- if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) {
+ if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) {
bp->b_state &= ~XFS_BSTATE_DISPOSE;
atomic_inc(&bp->b_hold);
}
- spin_unlock(&pag->pag_buf_lock);
+ spin_unlock(&bch->bc_lock);
} else {
/*
* most of the time buffers will already be removed from the
@@ -1060,16 +1113,17 @@ xfs_buf_rele(
* was on was the disposal list
*/
if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
- list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
} else {
ASSERT(list_empty(&bp->b_lru));
}
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
- xfs_buf_hash_params);
- spin_unlock(&pag->pag_buf_lock);
- xfs_perag_put(pag);
+ rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
+ xfs_buf_hash_params);
+ spin_unlock(&bch->bc_lock);
+ if (pag)
+ xfs_perag_put(pag);
freebuf = true;
}
@@ -1080,6 +1134,19 @@ out_unlock:
xfs_buf_free(bp);
}
+/*
+ * Release a hold on the specified buffer.
+ */
+void
+xfs_buf_rele(
+ struct xfs_buf *bp)
+{
+ trace_xfs_buf_rele(bp, _RET_IP_);
+ if (xfs_buf_is_uncached(bp))
+ xfs_buf_rele_uncached(bp);
+ else
+ xfs_buf_rele_cached(bp);
+}
/*
* Lock a buffer object, if it is not already locked.
@@ -1585,6 +1652,12 @@ _xfs_buf_ioapply(
/* we only use the buffer cache for meta-data */
op |= REQ_META;
+ /* in-memory targets are directly mapped, no IO required. */
+ if (xfs_buftarg_is_mem(bp->b_target)) {
+ xfs_buf_ioend(bp);
+ return;
+ }
+
/*
* Walk all the vectors issuing IO on them. Set up the initial offset
* into the buffer and the desired IO size before we start -
@@ -1940,25 +2013,30 @@ xfs_buftarg_shrink_count(
}
void
-xfs_free_buftarg(
+xfs_destroy_buftarg(
struct xfs_buftarg *btp)
{
shrinker_free(btp->bt_shrinker);
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
+}
+void
+xfs_free_buftarg(
+ struct xfs_buftarg *btp)
+{
+ xfs_destroy_buftarg(btp);
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
/* the main block device is closed by kill_block_super */
if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
- bdev_release(btp->bt_bdev_handle);
-
- kmem_free(btp);
+ bdev_fput(btp->bt_bdev_file);
+ kfree(btp);
}
int
xfs_setsize_buftarg(
- xfs_buftarg_t *btp,
+ struct xfs_buftarg *btp,
unsigned int sectorsize)
{
/* Set up metadata sector size info */
@@ -1972,80 +2050,82 @@ xfs_setsize_buftarg(
return -EINVAL;
}
- /* Set up device logical sector size mask */
- btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
- btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
-
return 0;
}
-/*
- * When allocating the initial buffer target we have not yet
- * read in the superblock, so don't know what sized sectors
- * are being used at this early stage. Play safe.
- */
-STATIC int
-xfs_setsize_buftarg_early(
- xfs_buftarg_t *btp)
+int
+xfs_init_buftarg(
+ struct xfs_buftarg *btp,
+ size_t logical_sectorsize,
+ const char *descr)
{
- return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev));
+ /* Set up device logical sector size mask */
+ btp->bt_logical_sectorsize = logical_sectorsize;
+ btp->bt_logical_sectormask = logical_sectorsize - 1;
+
+ /*
+ * Buffer IO error rate limiting. Limit it to no more than 10 messages
+ * per 30 seconds so as to not spam logs too much on repeated errors.
+ */
+ ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
+ DEFAULT_RATELIMIT_BURST);
+
+ if (list_lru_init(&btp->bt_lru))
+ return -ENOMEM;
+ if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+ goto out_destroy_lru;
+
+ btp->bt_shrinker =
+ shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr);
+ if (!btp->bt_shrinker)
+ goto out_destroy_io_count;
+ btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
+ btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
+ btp->bt_shrinker->private_data = btp;
+ shrinker_register(btp->bt_shrinker);
+ return 0;
+
+out_destroy_io_count:
+ percpu_counter_destroy(&btp->bt_io_count);
+out_destroy_lru:
+ list_lru_destroy(&btp->bt_lru);
+ return -ENOMEM;
}
struct xfs_buftarg *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct bdev_handle *bdev_handle)
+ struct file *bdev_file)
{
- xfs_buftarg_t *btp;
+ struct xfs_buftarg *btp;
const struct dax_holder_operations *ops = NULL;
#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
ops = &xfs_dax_holder_operations;
#endif
- btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
+ btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL);
btp->bt_mount = mp;
- btp->bt_bdev_handle = bdev_handle;
- btp->bt_dev = bdev_handle->bdev->bd_dev;
- btp->bt_bdev = bdev_handle->bdev;
+ btp->bt_bdev_file = bdev_file;
+ btp->bt_bdev = file_bdev(bdev_file);
+ btp->bt_dev = btp->bt_bdev->bd_dev;
btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
/*
- * Buffer IO error rate limiting. Limit it to no more than 10 messages
- * per 30 seconds so as to not spam logs too much on repeated errors.
+ * When allocating the buftargs we have not yet read the super block and
+ * thus don't know the file system sector size yet.
*/
- ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
- DEFAULT_RATELIMIT_BURST);
-
- if (xfs_setsize_buftarg_early(btp))
+ if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
goto error_free;
-
- if (list_lru_init(&btp->bt_lru))
+ if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
+ mp->m_super->s_id))
goto error_free;
- if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
- goto error_lru;
-
- btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s",
- mp->m_super->s_id);
- if (!btp->bt_shrinker)
- goto error_pcpu;
-
- btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
- btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
- btp->bt_shrinker->private_data = btp;
-
- shrinker_register(btp->bt_shrinker);
-
return btp;
-error_pcpu:
- percpu_counter_destroy(&btp->bt_io_count);
-error_lru:
- list_lru_destroy(&btp->bt_lru);
error_free:
- kmem_free(btp);
+ kfree(btp);
return NULL;
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b470de08a46c..b1580644501f 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -83,6 +83,14 @@ typedef unsigned int xfs_buf_flags_t;
#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */
+struct xfs_buf_cache {
+ spinlock_t bc_lock;
+ struct rhashtable bc_hash;
+};
+
+int xfs_buf_cache_init(struct xfs_buf_cache *bch);
+void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
+
/*
* The xfs_buftarg contains 2 notions of "sector size" -
*
@@ -96,11 +104,12 @@ typedef unsigned int xfs_buf_flags_t;
* The latter is derived from the underlying device, and controls direct IO
* alignment constraints.
*/
-typedef struct xfs_buftarg {
+struct xfs_buftarg {
dev_t bt_dev;
- struct bdev_handle *bt_bdev_handle;
+ struct file *bt_bdev_file;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
+ struct file *bt_file;
u64 bt_dax_part_off;
struct xfs_mount *bt_mount;
unsigned int bt_meta_sectorsize;
@@ -114,7 +123,10 @@ typedef struct xfs_buftarg {
struct percpu_counter bt_io_count;
struct ratelimit_state bt_ioerror_rl;
-} xfs_buftarg_t;
+
+ /* built-in cache, if we're not using the perag one */
+ struct xfs_buf_cache bt_cache[];
+};
#define XB_PAGES 2
@@ -366,7 +378,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
* Handling of buftargs.
*/
struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
- struct bdev_handle *bdev_handle);
+ struct file *bdev_file);
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
@@ -379,4 +391,9 @@ int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
+/* for xfs_buf_mem.c only: */
+int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize,
+ const char *descr);
+void xfs_destroy_buftarg(struct xfs_buftarg *btp);
+
#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 023d4e0385dd..43031842341a 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -805,8 +805,8 @@ xfs_buf_item_get_format(
return;
}
- bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
- 0);
+ bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
+ GFP_KERNEL | __GFP_NOFAIL);
}
STATIC void
@@ -814,7 +814,7 @@ xfs_buf_item_free_format(
struct xfs_buf_log_item *bip)
{
if (bip->bli_formats != &bip->__bli_format) {
- kmem_free(bip->bli_formats);
+ kfree(bip->bli_formats);
bip->bli_formats = NULL;
}
}
@@ -1044,7 +1044,7 @@ xfs_buf_item_free(
struct xfs_buf_log_item *bip)
{
xfs_buf_item_free_format(bip);
- kmem_free(bip->bli_item.li_lv_shadow);
+ kvfree(bip->bli_item.li_lv_shadow);
kmem_cache_free(xfs_buf_item_cache, bip);
}
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 43167f543afc..09e893cf563c 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -85,7 +85,7 @@ xlog_add_buffer_cancelled(
return false;
}
- bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
+ bcp = kmalloc(sizeof(struct xfs_buf_cancel), GFP_KERNEL | __GFP_NOFAIL);
bcp->bc_blkno = blkno;
bcp->bc_len = len;
bcp->bc_refcount = 1;
@@ -129,7 +129,7 @@ xlog_put_buffer_cancelled(
if (--bcp->bc_refcount == 0) {
list_del(&bcp->bc_list);
- kmem_free(bcp);
+ kfree(bcp);
}
return true;
}
@@ -1062,10 +1062,10 @@ xlog_free_buf_cancel_table(
&log->l_buf_cancel_table[i],
struct xfs_buf_cancel, bc_list))) {
list_del(&bc->bc_list);
- kmem_free(bc);
+ kfree(bc);
}
}
- kmem_free(log->l_buf_cancel_table);
+ kfree(log->l_buf_cancel_table);
log->l_buf_cancel_table = NULL;
}
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
new file mode 100644
index 000000000000..9bb2d24de709
--- /dev/null
+++ b/fs/xfs/xfs_buf_mem.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_buf.h"
+#include "xfs_buf_mem.h"
+#include "xfs_trace.h"
+#include <linux/shmem_fs.h>
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_error.h"
+
+/*
+ * Buffer Cache for In-Memory Files
+ * ================================
+ *
+ * Online fsck wants to create ephemeral ordered recordsets. The existing
+ * btree infrastructure can do this, but we need the buffer cache to target
+ * memory instead of block devices.
+ *
+ * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
+ * requirements. Therefore, the xmbuf mechanism uses an unlinked shmem file to
+ * store our staging data. This file is not installed in the file descriptor
+ * table so that user programs cannot access the data, which means that the
+ * xmbuf must be freed with xmbuf_destroy.
+ *
+ * xmbufs assume that the caller will handle all required concurrency
+ * management; standard vfs locks (freezer and inode) are not taken. Reads
+ * and writes are satisfied directly from the page cache.
+ *
+ * The only supported block size is PAGE_SIZE, and we cannot use highmem.
+ */
+
+/*
+ * shmem files used to back an in-memory buffer cache must not be exposed to
+ * userspace. Upper layers must coordinate access to the one handle returned
+ * by the constructor, so establish a separate lock class for xmbufs to avoid
+ * confusing lockdep.
+ */
+static struct lock_class_key xmbuf_i_mutex_key;
+
+/*
+ * Allocate a buffer cache target for a memory-backed file and set up the
+ * buffer target.
+ */
+int
+xmbuf_alloc(
+ struct xfs_mount *mp,
+ const char *descr,
+ struct xfs_buftarg **btpp)
+{
+ struct file *file;
+ struct inode *inode;
+ struct xfs_buftarg *btp;
+ int error;
+
+ btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL);
+ if (!btp)
+ return -ENOMEM;
+
+ file = shmem_kernel_file_setup(descr, 0, 0);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto out_free_btp;
+ }
+ inode = file_inode(file);
+
+ /* private file, private locking */
+ lockdep_set_class(&inode->i_rwsem, &xmbuf_i_mutex_key);
+
+ /*
+ * We don't want to bother with kmapping data during repair, so don't
+ * allow highmem pages to back this mapping.
+ */
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
+
+ /* ensure all writes are below EOF to avoid pagecache zeroing */
+ i_size_write(inode, inode->i_sb->s_maxbytes);
+
+ error = xfs_buf_cache_init(btp->bt_cache);
+ if (error)
+ goto out_file;
+
+ /* Initialize buffer target */
+ btp->bt_mount = mp;
+ btp->bt_dev = (dev_t)-1U;
+ btp->bt_bdev = NULL; /* in-memory buftargs have no bdev */
+ btp->bt_file = file;
+ btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE;
+ btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1;
+
+ error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
+ if (error)
+ goto out_bcache;
+
+ trace_xmbuf_create(btp);
+
+ *btpp = btp;
+ return 0;
+
+out_bcache:
+ xfs_buf_cache_destroy(btp->bt_cache);
+out_file:
+ fput(file);
+out_free_btp:
+ kfree(btp);
+ return error;
+}
+
+/* Free a buffer cache target for a memory-backed buffer cache. */
+void
+xmbuf_free(
+ struct xfs_buftarg *btp)
+{
+ ASSERT(xfs_buftarg_is_mem(btp));
+ ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+
+ trace_xmbuf_free(btp);
+
+ xfs_destroy_buftarg(btp);
+ xfs_buf_cache_destroy(btp->bt_cache);
+ fput(btp->bt_file);
+ kfree(btp);
+}
+
+/* Directly map a shmem page into the buffer cache. */
+int
+xmbuf_map_page(
+ struct xfs_buf *bp)
+{
+ struct inode *inode = file_inode(bp->b_target->bt_file);
+ struct folio *folio = NULL;
+ struct page *page;
+ loff_t pos = BBTOB(xfs_buf_daddr(bp));
+ int error;
+
+ ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+ if (bp->b_map_count != 1)
+ return -ENOMEM;
+ if (BBTOB(bp->b_length) != XMBUF_BLOCKSIZE)
+ return -ENOMEM;
+ if (offset_in_page(pos) != 0) {
+ ASSERT(offset_in_page(pos));
+ return -ENOMEM;
+ }
+
+ error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
+ if (error)
+ return error;
+
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return -EIO;
+ }
+
+ page = folio_file_page(folio, pos >> PAGE_SHIFT);
+
+ /*
+ * Mark the page dirty so that it won't be reclaimed once we drop the
+ * (potentially last) reference in xmbuf_unmap_page.
+ */
+ set_page_dirty(page);
+ unlock_page(page);
+
+ bp->b_addr = page_address(page);
+ bp->b_pages = bp->b_page_array;
+ bp->b_pages[0] = page;
+ bp->b_page_count = 1;
+ return 0;
+}
+
+/* Unmap a shmem page that was mapped into the buffer cache. */
+void
+xmbuf_unmap_page(
+ struct xfs_buf *bp)
+{
+ struct page *page = bp->b_pages[0];
+
+ ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+ put_page(page);
+
+ bp->b_addr = NULL;
+ bp->b_pages[0] = NULL;
+ bp->b_pages = NULL;
+ bp->b_page_count = 0;
+}
+
+/* Is this a valid daddr within the buftarg? */
+bool
+xmbuf_verify_daddr(
+ struct xfs_buftarg *btp,
+ xfs_daddr_t daddr)
+{
+ struct inode *inode = file_inode(btp->bt_file);
+
+ ASSERT(xfs_buftarg_is_mem(btp));
+
+ return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT);
+}
+
+/* Discard the page backing this buffer. */
+static void
+xmbuf_stale(
+ struct xfs_buf *bp)
+{
+ struct inode *inode = file_inode(bp->b_target->bt_file);
+ loff_t pos;
+
+ ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+ pos = BBTOB(xfs_buf_daddr(bp));
+ shmem_truncate_range(inode, pos, pos + BBTOB(bp->b_length) - 1);
+}
+
+/*
+ * Finalize a buffer -- discard the backing page if it's stale, or run the
+ * write verifier to detect problems.
+ */
+int
+xmbuf_finalize(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+ int error = 0;
+
+ if (bp->b_flags & XBF_STALE) {
+ xmbuf_stale(bp);
+ return 0;
+ }
+
+ /*
+ * Although this btree is ephemeral, validate the buffer structure so
+ * that we can detect memory corruption errors and software bugs.
+ */
+ fa = bp->b_ops->verify_struct(bp);
+ if (fa) {
+ error = -EFSCORRUPTED;
+ xfs_verifier_error(bp, error, fa);
+ }
+
+ return error;
+}
+
+/*
+ * Detach this xmbuf buffer from the transaction by any means necessary.
+ * All buffers are direct-mapped, so they do not need bwrite.
+ */
+void
+xmbuf_trans_bdetach(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bli = bp->b_log_item;
+
+ ASSERT(bli != NULL);
+
+ bli->bli_flags &= ~(XFS_BLI_DIRTY | XFS_BLI_ORDERED |
+ XFS_BLI_LOGGED | XFS_BLI_STALE);
+ clear_bit(XFS_LI_DIRTY, &bli->bli_item.li_flags);
+
+ while (bp->b_log_item != NULL)
+ xfs_trans_bdetach(tp, bp);
+}
diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h
new file mode 100644
index 000000000000..eed4a7b63232
--- /dev/null
+++ b/fs/xfs/xfs_buf_mem.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_BUF_MEM_H__
+#define __XFS_BUF_MEM_H__
+
+#define XMBUF_BLOCKSIZE (PAGE_SIZE)
+#define XMBUF_BLOCKSHIFT (PAGE_SHIFT)
+
+#ifdef CONFIG_XFS_MEMORY_BUFS
+static inline bool xfs_buftarg_is_mem(const struct xfs_buftarg *btp)
+{
+ return btp->bt_bdev == NULL;
+}
+
+int xmbuf_alloc(struct xfs_mount *mp, const char *descr,
+ struct xfs_buftarg **btpp);
+void xmbuf_free(struct xfs_buftarg *btp);
+
+int xmbuf_map_page(struct xfs_buf *bp);
+void xmbuf_unmap_page(struct xfs_buf *bp);
+bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr);
+void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
+int xmbuf_finalize(struct xfs_buf *bp);
+#else
+# define xfs_buftarg_is_mem(...) (false)
+# define xmbuf_map_page(...) (-ENOMEM)
+# define xmbuf_unmap_page(...) ((void)0)
+# define xmbuf_verify_daddr(...) (false)
+#endif /* CONFIG_XFS_MEMORY_BUFS */
+
+#endif /* __XFS_BUF_MEM_H__ */
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index cc6dc56f455d..cf9296b7e06f 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -118,8 +118,10 @@ xfs_dir2_sf_getdents(
ctx->pos = off & 0x7fffffff;
if (XFS_IS_CORRUPT(dp->i_mount,
!xfs_dir2_namecheck(sfep->name,
- sfep->namelen)))
+ sfep->namelen))) {
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
return -EFSCORRUPTED;
+ }
if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
xfs_dir3_get_dtype(mp, filetype)))
return 0;
@@ -211,6 +213,7 @@ xfs_dir2_block_getdents(
if (XFS_IS_CORRUPT(dp->i_mount,
!xfs_dir2_namecheck(dep->name,
dep->namelen))) {
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_rele;
}
@@ -465,6 +468,7 @@ xfs_dir2_leaf_getdents(
if (XFS_IS_CORRUPT(dp->i_mount,
!xfs_dir2_namecheck(dep->name,
dep->namelen))) {
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
error = -EFSCORRUPTED;
break;
}
@@ -522,7 +526,7 @@ xfs_readdir(
return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
- ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ xfs_assert_ilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);
XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
args.dp = dp;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index d5787991bb5b..268bb734dc0a 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -8,6 +8,7 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
@@ -18,6 +19,7 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
/*
* Notes on an efficient, low latency fstrim algorithm
@@ -79,7 +81,7 @@ xfs_discard_endio_work(
container_of(work, struct xfs_busy_extents, endio_work);
xfs_extent_busy_clear(extents->mount, &extents->extent_list, false);
- kmem_free(extents->owner);
+ kfree(extents->owner);
}
/*
@@ -120,7 +122,7 @@ xfs_discard_extents(
error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
- GFP_NOFS, &bio);
+ GFP_KERNEL, &bio);
if (error && error != -EOPNOTSUPP) {
xfs_info(mp,
"discard failed for extent [0x%llx,%u], error %d",
@@ -155,6 +157,7 @@ xfs_trim_gather_extents(
uint64_t *blocks_trimmed)
{
struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_trans *tp;
struct xfs_btree_cur *cur;
struct xfs_buf *agbp;
int error;
@@ -168,11 +171,15 @@ xfs_trim_gather_extents(
*/
xfs_log_force(mp, XFS_LOG_SYNC);
- error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
+ error = xfs_trans_alloc_empty(mp, &tp);
if (error)
return error;
- cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+ if (error)
+ goto out_trans_cancel;
+
+ cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
/*
* Look up the extent length requested in the AGF and start with it.
@@ -204,6 +211,7 @@ xfs_trim_gather_extents(
if (error)
break;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
break;
}
@@ -279,7 +287,8 @@ next_extent:
xfs_extent_busy_clear(mp, &extents->extent_list, false);
out_del_cursor:
xfs_btree_del_cursor(cur, error);
- xfs_buf_relse(agbp);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
return error;
}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index b4f20d9c8f98..c98cb468c357 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -24,6 +24,7 @@
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_error.h"
+#include "xfs_health.h"
/*
* Lock order:
@@ -44,6 +45,29 @@ static struct kmem_cache *xfs_dquot_cache;
static struct lock_class_key xfs_dquot_group_class;
static struct lock_class_key xfs_dquot_project_class;
+/* Record observations of quota corruption with the health tracking system. */
+static void
+xfs_dquot_mark_sick(
+ struct xfs_dquot *dqp)
+{
+ struct xfs_mount *mp = dqp->q_mount;
+
+ switch (dqp->q_type) {
+ case XFS_DQTYPE_USER:
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_UQUOTA);
+ break;
+ case XFS_DQTYPE_GROUP:
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA);
+ break;
+ case XFS_DQTYPE_PROJ:
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
/*
* This is called to free all the memory associated with a dquot
*/
@@ -53,7 +77,7 @@ xfs_qm_dqdestroy(
{
ASSERT(list_empty(&dqp->q_lru));
- kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
+ kvfree(dqp->q_logitem.qli_item.li_lv_shadow);
mutex_destroy(&dqp->q_qlock);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
@@ -451,6 +475,8 @@ xfs_dquot_disk_read(
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, 0, &bp,
&xfs_dquot_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_dquot_mark_sick(dqp);
if (error) {
ASSERT(bp == NULL);
return error;
@@ -574,6 +600,7 @@ xfs_dquot_from_disk(
"Metadata corruption detected at %pS, quota %u",
__this_address, dqp->q_id);
xfs_alert(bp->b_mount, "Unmount and run xfs_repair");
+ xfs_dquot_mark_sick(dqp);
return -EFSCORRUPTED;
}
@@ -784,6 +811,12 @@ restart:
* caller should throw away the dquot and start over. Otherwise, the dquot
* is returned locked (and held by the cache) as if there had been a cache
* hit.
+ *
+ * The insert needs to be done under memalloc_nofs context because the radix
+ * tree can do memory allocation during insert. The qi->qi_tree_lock is taken in
+ * memory reclaim when freeing unused dquots, so we cannot have the radix tree
+ * node allocation recursing into filesystem reclaim whilst we hold the
+ * qi_tree_lock.
*/
static int
xfs_qm_dqget_cache_insert(
@@ -793,25 +826,27 @@ xfs_qm_dqget_cache_insert(
xfs_dqid_t id,
struct xfs_dquot *dqp)
{
+ unsigned int nofs_flags;
int error;
+ nofs_flags = memalloc_nofs_save();
mutex_lock(&qi->qi_tree_lock);
error = radix_tree_insert(tree, id, dqp);
if (unlikely(error)) {
/* Duplicate found! Caller must try again. */
- mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_dup(dqp);
- return error;
+ goto out_unlock;
}
/* Return a locked dquot to the caller, with a reference taken. */
xfs_dqlock(dqp);
dqp->q_nrefs = 1;
-
qi->qi_dquots++;
- mutex_unlock(&qi->qi_tree_lock);
- return 0;
+out_unlock:
+ mutex_unlock(&qi->qi_tree_lock);
+ memalloc_nofs_restore(nofs_flags);
+ return error;
}
/* Check our input parameters. */
@@ -950,7 +985,7 @@ xfs_qm_dqget_inode(
if (error)
return error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(xfs_inode_dquot(ip, type) == NULL);
id = xfs_qm_id_for_quotatype(ip, type);
@@ -1007,7 +1042,7 @@ restart:
}
dqret:
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
trace_xfs_dqget_miss(dqp);
*O_dqpp = dqp;
return 0;
@@ -1238,6 +1273,8 @@ xfs_qm_dqflush(
&bp, &xfs_dquot_buf_ops);
if (error == -EAGAIN)
goto out_unlock;
+ if (xfs_metadata_is_sick(error))
+ xfs_dquot_mark_sick(dqp);
if (error)
goto out_abort;
@@ -1246,6 +1283,7 @@ xfs_qm_dqflush(
xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
dqp->q_id, fa);
xfs_buf_relse(bp);
+ xfs_dquot_mark_sick(dqp);
error = -EFSCORRUPTED;
goto out_abort;
}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index b2cbbba3e15a..7ad0e92c6b5b 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -240,15 +240,15 @@ xfs_errortag_init(
{
int ret;
- mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
- KM_MAYFAIL);
+ mp->m_errortag = kzalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!mp->m_errortag)
return -ENOMEM;
ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
&mp->m_kobj, "errortag");
if (ret)
- kmem_free(mp->m_errortag);
+ kfree(mp->m_errortag);
return ret;
}
@@ -257,7 +257,7 @@ xfs_errortag_del(
struct xfs_mount *mp)
{
xfs_sysfs_del(&mp->m_errortag_kobj);
- kmem_free(mp->m_errortag);
+ kfree(mp->m_errortag);
}
static bool
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 2ccde32c9a9e..56cfa1498571 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -32,7 +32,8 @@ xfs_extent_busy_insert_list(
struct rb_node **rbp;
struct rb_node *parent = NULL;
- new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
+ new = kzalloc(sizeof(struct xfs_extent_busy),
+ GFP_KERNEL | __GFP_NOFAIL);
new->agno = pag->pag_agno;
new->bno = bno;
new->length = len;
@@ -530,7 +531,7 @@ xfs_extent_busy_clear_one(
}
list_del_init(&busyp->list);
- kmem_free(busyp);
+ kfree(busyp);
}
static void
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 1d1185fca6a5..8c382f092332 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -40,9 +40,9 @@ STATIC void
xfs_efi_item_free(
struct xfs_efi_log_item *efip)
{
- kmem_free(efip->efi_item.li_lv_shadow);
+ kvfree(efip->efi_item.li_lv_shadow);
if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
- kmem_free(efip);
+ kfree(efip);
else
kmem_cache_free(xfs_efi_cache, efip);
}
@@ -229,9 +229,9 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
STATIC void
xfs_efd_item_free(struct xfs_efd_log_item *efdp)
{
- kmem_free(efdp->efd_item.li_lv_shadow);
+ kvfree(efdp->efd_item.li_lv_shadow);
if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
- kmem_free(efdp);
+ kfree(efdp);
else
kmem_cache_free(xfs_efd_cache, efdp);
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e33e5e13b95f..632653e00906 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -879,7 +879,7 @@ xfs_break_dax_layouts(
{
struct page *page;
- ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
+ xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);
page = dax_layout_busy_page(inode->i_mapping);
if (!page)
@@ -900,7 +900,7 @@ xfs_break_layouts(
bool retry;
int error;
- ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
+ xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);
do {
retry = false;
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 2fc98d313708..e3aaa0555597 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -44,7 +44,7 @@ xfs_fstrm_free_func(
atomic_dec(&pag->pagf_fstrms);
xfs_perag_rele(pag);
- kmem_free(item);
+ kfree(item);
}
/*
@@ -313,7 +313,7 @@ xfs_filestream_create_association(
* we return a referenced AG, the allocation can still go ahead just
* fine.
*/
- item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
+ item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!item)
goto out_put_fstrms;
@@ -326,7 +326,7 @@ xfs_filestream_create_association(
out_free_item:
xfs_perag_rele(item->pag);
- kmem_free(item);
+ kfree(item);
out_put_fstrms:
atomic_dec(&args->pag->pagf_fstrms);
return 0;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 5a72217f5feb..de59eec74765 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -763,8 +763,8 @@ xfs_getfsmap_datadev_bnobt_query(
return xfs_getfsmap_datadev_bnobt_helper(*curpp, &key[1], info);
/* Allocate cursor for this AG and query_range it. */
- *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
- info->pag, XFS_BTNUM_BNO);
+ *curpp = xfs_bnobt_init_cursor(tp->t_mountp, tp, info->agf_bp,
+ info->pag);
key->ar_startblock = info->low.rm_startblock;
key[1].ar_startblock = info->high.rm_startblock;
return xfs_alloc_query_range(*curpp, key, &key[1],
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 9a57afee9338..b39f959146bc 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -14,6 +14,10 @@
#include "xfs_trace.h"
#include "xfs_health.h"
#include "xfs_ag.h"
+#include "xfs_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_quota_defs.h"
/*
* Warn about metadata corruption that we detected but haven't fixed, and
@@ -93,11 +97,25 @@ xfs_fs_mark_sick(
struct xfs_mount *mp,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_FS_ALL));
trace_xfs_fs_mark_sick(mp, mask);
spin_lock(&mp->m_sb_lock);
mp->m_fs_sick |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark per-fs metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_fs_mark_corrupt(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_FS_ALL));
+ trace_xfs_fs_mark_corrupt(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_fs_sick |= mask;
mp->m_fs_checked |= mask;
spin_unlock(&mp->m_sb_lock);
}
@@ -108,11 +126,13 @@ xfs_fs_mark_healthy(
struct xfs_mount *mp,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_FS_ALL));
trace_xfs_fs_mark_healthy(mp, mask);
spin_lock(&mp->m_sb_lock);
mp->m_fs_sick &= ~mask;
+ if (!(mp->m_fs_sick & XFS_SICK_FS_PRIMARY))
+ mp->m_fs_sick &= ~XFS_SICK_FS_SECONDARY;
mp->m_fs_checked |= mask;
spin_unlock(&mp->m_sb_lock);
}
@@ -136,11 +156,25 @@ xfs_rt_mark_sick(
struct xfs_mount *mp,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_RT_ALL));
trace_xfs_rt_mark_sick(mp, mask);
spin_lock(&mp->m_sb_lock);
mp->m_rt_sick |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark realtime metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_rt_mark_corrupt(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_RT_ALL));
+ trace_xfs_rt_mark_corrupt(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_rt_sick |= mask;
mp->m_rt_checked |= mask;
spin_unlock(&mp->m_sb_lock);
}
@@ -151,11 +185,13 @@ xfs_rt_mark_healthy(
struct xfs_mount *mp,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_RT_ALL));
trace_xfs_rt_mark_healthy(mp, mask);
spin_lock(&mp->m_sb_lock);
mp->m_rt_sick &= ~mask;
+ if (!(mp->m_rt_sick & XFS_SICK_RT_PRIMARY))
+ mp->m_rt_sick &= ~XFS_SICK_RT_SECONDARY;
mp->m_rt_checked |= mask;
spin_unlock(&mp->m_sb_lock);
}
@@ -173,17 +209,48 @@ xfs_rt_measure_sickness(
spin_unlock(&mp->m_sb_lock);
}
+/* Mark unhealthy per-ag metadata given a raw AG number. */
+void
+xfs_agno_mark_sick(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ unsigned int mask)
+{
+ struct xfs_perag *pag = xfs_perag_get(mp, agno);
+
+ /* per-ag structure not set up yet? */
+ if (!pag)
+ return;
+
+ xfs_ag_mark_sick(pag, mask);
+ xfs_perag_put(pag);
+}
+
/* Mark unhealthy per-ag metadata. */
void
xfs_ag_mark_sick(
struct xfs_perag *pag,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_AG_ALL));
trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask);
spin_lock(&pag->pag_state_lock);
pag->pag_sick |= mask;
+ spin_unlock(&pag->pag_state_lock);
+}
+
+/* Mark per-ag metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_ag_mark_corrupt(
+ struct xfs_perag *pag,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_AG_ALL));
+ trace_xfs_ag_mark_corrupt(pag->pag_mount, pag->pag_agno, mask);
+
+ spin_lock(&pag->pag_state_lock);
+ pag->pag_sick |= mask;
pag->pag_checked |= mask;
spin_unlock(&pag->pag_state_lock);
}
@@ -194,11 +261,13 @@ xfs_ag_mark_healthy(
struct xfs_perag *pag,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_AG_ALL));
trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask);
spin_lock(&pag->pag_state_lock);
pag->pag_sick &= ~mask;
+ if (!(pag->pag_sick & XFS_SICK_AG_PRIMARY))
+ pag->pag_sick &= ~XFS_SICK_AG_SECONDARY;
pag->pag_checked |= mask;
spin_unlock(&pag->pag_state_lock);
}
@@ -222,11 +291,34 @@ xfs_inode_mark_sick(
struct xfs_inode *ip,
unsigned int mask)
{
- ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED)));
+ ASSERT(!(mask & ~XFS_SICK_INO_ALL));
trace_xfs_inode_mark_sick(ip, mask);
spin_lock(&ip->i_flags_lock);
ip->i_sick |= mask;
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Keep this inode around so we don't lose the sickness report. Scrub
+ * grabs inodes with DONTCACHE assuming that most inodes are ok, which
+ * is not the case here.
+ */
+ spin_lock(&VFS_I(ip)->i_lock);
+ VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ spin_unlock(&VFS_I(ip)->i_lock);
+}
+
+/* Mark inode metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_inode_mark_corrupt(
+ struct xfs_inode *ip,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_INO_ALL));
+ trace_xfs_inode_mark_corrupt(ip, mask);
+
+ spin_lock(&ip->i_flags_lock);
+ ip->i_sick |= mask;
ip->i_checked |= mask;
spin_unlock(&ip->i_flags_lock);
@@ -246,11 +338,13 @@ xfs_inode_mark_healthy(
struct xfs_inode *ip,
unsigned int mask)
{
- ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED)));
+ ASSERT(!(mask & ~XFS_SICK_INO_ALL));
trace_xfs_inode_mark_healthy(ip, mask);
spin_lock(&ip->i_flags_lock);
ip->i_sick &= ~mask;
+ if (!(ip->i_sick & XFS_SICK_INO_PRIMARY))
+ ip->i_sick &= ~XFS_SICK_INO_SECONDARY;
ip->i_checked |= mask;
spin_unlock(&ip->i_flags_lock);
}
@@ -280,6 +374,8 @@ static const struct ioctl_sick_map fs_map[] = {
{ XFS_SICK_FS_UQUOTA, XFS_FSOP_GEOM_SICK_UQUOTA },
{ XFS_SICK_FS_GQUOTA, XFS_FSOP_GEOM_SICK_GQUOTA },
{ XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA },
+ { XFS_SICK_FS_QUOTACHECK, XFS_FSOP_GEOM_SICK_QUOTACHECK },
+ { XFS_SICK_FS_NLINKS, XFS_FSOP_GEOM_SICK_NLINKS },
{ 0, 0 },
};
@@ -335,6 +431,7 @@ static const struct ioctl_sick_map ag_map[] = {
{ XFS_SICK_AG_FINOBT, XFS_AG_GEOM_SICK_FINOBT },
{ XFS_SICK_AG_RMAPBT, XFS_AG_GEOM_SICK_RMAPBT },
{ XFS_SICK_AG_REFCNTBT, XFS_AG_GEOM_SICK_REFCNTBT },
+ { XFS_SICK_AG_INODES, XFS_AG_GEOM_SICK_INODES },
{ 0, 0 },
};
@@ -397,3 +494,92 @@ xfs_bulkstat_health(
bs->bs_sick |= m->ioctl_mask;
}
}
+
+/* Mark a block mapping sick. */
+void
+xfs_bmap_mark_sick(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ unsigned int mask;
+
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ mask = XFS_SICK_INO_BMBTD;
+ break;
+ case XFS_ATTR_FORK:
+ mask = XFS_SICK_INO_BMBTA;
+ break;
+ case XFS_COW_FORK:
+ mask = XFS_SICK_INO_BMBTC;
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ xfs_inode_mark_sick(ip, mask);
+}
+
+/* Record observations of btree corruption with the health tracking system. */
+void
+xfs_btree_mark_sick(
+ struct xfs_btree_cur *cur)
+{
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ /* no health state tracking for ephemeral btrees */
+ return;
+ case XFS_BTREE_TYPE_AG:
+ ASSERT(cur->bc_ops->sick_mask);
+ xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask);
+ return;
+ case XFS_BTREE_TYPE_INODE:
+ if (xfs_btree_is_bmap(cur->bc_ops)) {
+ xfs_bmap_mark_sick(cur->bc_ino.ip,
+ cur->bc_ino.whichfork);
+ return;
+ }
+ fallthrough;
+ default:
+ ASSERT(0);
+ return;
+ }
+}
+
+/*
+ * Record observations of dir/attr btree corruption with the health tracking
+ * system.
+ */
+void
+xfs_dirattr_mark_sick(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ unsigned int mask;
+
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ mask = XFS_SICK_INO_DIR;
+ break;
+ case XFS_ATTR_FORK:
+ mask = XFS_SICK_INO_XATTR;
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ xfs_inode_mark_sick(ip, mask);
+}
+
+/*
+ * Record observations of dir/attr btree corruption with the health tracking
+ * system, using the inode and fork carried in the da args.
+ */
+void
+xfs_da_mark_sick(
+ struct xfs_da_args *args)
+{
+ xfs_dirattr_mark_sick(args->dp, args->whichfork);
+}
diff --git a/fs/xfs/xfs_hooks.c b/fs/xfs/xfs_hooks.c
new file mode 100644
index 000000000000..a58d1de2d37d
--- /dev/null
+++ b/fs/xfs/xfs_hooks.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
+#include "xfs_trace.h"
+
+/* Initialize a notifier chain. */
+void
+xfs_hooks_init(
+ struct xfs_hooks *chain)
+{
+ BLOCKING_INIT_NOTIFIER_HEAD(&chain->head);
+}
+
+/* Make it so a function gets called whenever we hit a certain hook point. */
+int
+xfs_hooks_add(
+ struct xfs_hooks *chain,
+ struct xfs_hook *hook)
+{
+ ASSERT(hook->nb.notifier_call != NULL);
+ BUILD_BUG_ON(offsetof(struct xfs_hook, nb) != 0);
+
+ return blocking_notifier_chain_register(&chain->head, &hook->nb);
+}
+
+/* Remove a previously installed hook. */
+void
+xfs_hooks_del(
+ struct xfs_hooks *chain,
+ struct xfs_hook *hook)
+{
+ blocking_notifier_chain_unregister(&chain->head, &hook->nb);
+}
+
+/* Call a hook. Returns the NOTIFY_* value returned by the last hook. */
+int
+xfs_hooks_call(
+ struct xfs_hooks *chain,
+ unsigned long val,
+ void *priv)
+{
+ return blocking_notifier_call_chain(&chain->head, val, priv);
+}
diff --git a/fs/xfs/xfs_hooks.h b/fs/xfs/xfs_hooks.h
new file mode 100644
index 000000000000..60b8a5831536
--- /dev/null
+++ b/fs/xfs/xfs_hooks.h
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef XFS_HOOKS_H_
+#define XFS_HOOKS_H_
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+struct xfs_hooks {
+ struct blocking_notifier_head head;
+};
+
+/*
+ * If jump labels are enabled in Kconfig, the static key uses nop sleds and
+ * code patching to eliminate the overhead of taking the rwsem in
+ * blocking_notifier_call_chain when there are no hooks configured. If not,
+ * the static key per-call overhead is an atomic read. Most arches that can
+ * handle XFS also support jump labels.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+# define DEFINE_STATIC_XFS_HOOK_SWITCH(name) \
+ static DEFINE_STATIC_KEY_FALSE(name)
+# define xfs_hooks_switch_on(name) static_branch_inc(name)
+# define xfs_hooks_switch_off(name) static_branch_dec(name)
+# define xfs_hooks_switched_on(name) static_branch_unlikely(name)
+
+struct xfs_hook {
+ /* This must come at the start of the structure. */
+ struct notifier_block nb;
+};
+
+typedef int (*xfs_hook_fn_t)(struct xfs_hook *hook, unsigned long action,
+ void *data);
+
+void xfs_hooks_init(struct xfs_hooks *chain);
+int xfs_hooks_add(struct xfs_hooks *chain, struct xfs_hook *hook);
+void xfs_hooks_del(struct xfs_hooks *chain, struct xfs_hook *hook);
+int xfs_hooks_call(struct xfs_hooks *chain, unsigned long action,
+ void *priv);
+
+static inline void xfs_hook_setup(struct xfs_hook *hook, notifier_fn_t fn)
+{
+ hook->nb.notifier_call = fn;
+ hook->nb.priority = 0;
+}
+
+#else
+
+struct xfs_hooks { /* empty */ };
+
+# define DEFINE_STATIC_XFS_HOOK_SWITCH(name)
+# define xfs_hooks_switch_on(name) ((void)0)
+# define xfs_hooks_switch_off(name) ((void)0)
+# define xfs_hooks_switched_on(name) (false)
+
+# define xfs_hooks_init(chain) ((void)0)
+# define xfs_hooks_call(chain, val, priv) (NOTIFY_DONE)
+#endif
+
+#endif /* XFS_HOOKS_H_ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index dba514a2c84d..74f1812b03cb 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -24,6 +24,7 @@
#include "xfs_ialloc.h"
#include "xfs_ag.h"
#include "xfs_log_priv.h"
+#include "xfs_health.h"
#include <linux/iversion.h>
@@ -415,6 +416,9 @@ xfs_iget_check_free_state(
xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
ip->i_ino, VFS_I(ip)->i_mode);
+ xfs_agno_mark_sick(ip->i_mount,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_SICK_AG_INOBT);
return -EFSCORRUPTED;
}
@@ -422,6 +426,9 @@ xfs_iget_check_free_state(
xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
ip->i_ino);
+ xfs_agno_mark_sick(ip->i_mount,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_SICK_AG_INOBT);
return -EFSCORRUPTED;
}
return 0;
@@ -640,6 +647,8 @@ xfs_iget_cache_miss(
xfs_buf_offset(bp, ip->i_imap.im_boffset));
if (!error)
xfs_buf_set_ref(bp, XFS_INO_REF);
+ else
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
xfs_trans_brelse(tp, bp);
if (error)
@@ -659,10 +668,9 @@ xfs_iget_cache_miss(
/*
* Preload the radix tree so we can insert safely under the
* write spinlock. Note that we cannot sleep inside the preload
- * region. Since we can be called from transaction context, don't
- * recurse into the file system.
+ * region.
*/
- if (radix_tree_preload(GFP_NOFS)) {
+ if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) {
error = -EAGAIN;
goto out_destroy;
}
@@ -2031,8 +2039,10 @@ xfs_inodegc_want_queue_work(
* - Memory shrinkers queued the inactivation worker and it hasn't finished.
* - The queue depth exceeds the maximum allowable percpu backlog.
*
- * Note: If the current thread is running a transaction, we don't ever want to
- * wait for other transactions because that could introduce a deadlock.
+ * Note: If we are in a NOFS context here (e.g. current thread is running a
+ * transaction) then we don't want to block here as inodegc progress may require
+ * filesystem resources we hold to make progress and that could result in a
+ * deadlock. Hence we skip out of here if we are in a scoped NOFS context.
*/
static inline bool
xfs_inodegc_want_flush_work(
@@ -2040,7 +2050,7 @@ xfs_inodegc_want_flush_work(
unsigned int items,
unsigned int shrinker_hits)
{
- if (current->journal_info)
+ if (current->flags & PF_MEMALLOC_NOFS)
return false;
if (shrinker_hits > 0)
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index b05314d48176..4345db501714 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,7 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
- kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
+ kvfree(ICR_ITEM(lip)->ic_item.li_lv_shadow);
kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1fd94958aa97..d55b42b2480d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -203,9 +203,9 @@ xfs_ilock(
}
if (lock_flags & XFS_ILOCK_EXCL)
- mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+ down_write_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
else if (lock_flags & XFS_ILOCK_SHARED)
- mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+ down_read_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}
/*
@@ -246,10 +246,10 @@ xfs_ilock_nowait(
}
if (lock_flags & XFS_ILOCK_EXCL) {
- if (!mrtryupdate(&ip->i_lock))
+ if (!down_write_trylock(&ip->i_lock))
goto out_undo_mmaplock;
} else if (lock_flags & XFS_ILOCK_SHARED) {
- if (!mrtryaccess(&ip->i_lock))
+ if (!down_read_trylock(&ip->i_lock))
goto out_undo_mmaplock;
}
return 1;
@@ -298,9 +298,9 @@ xfs_iunlock(
up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
if (lock_flags & XFS_ILOCK_EXCL)
- mrunlock_excl(&ip->i_lock);
+ up_write(&ip->i_lock);
else if (lock_flags & XFS_ILOCK_SHARED)
- mrunlock_shared(&ip->i_lock);
+ up_read(&ip->i_lock);
trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}
@@ -319,7 +319,7 @@ xfs_ilock_demote(
~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
if (lock_flags & XFS_ILOCK_EXCL)
- mrdemote(&ip->i_lock);
+ downgrade_write(&ip->i_lock);
if (lock_flags & XFS_MMAPLOCK_EXCL)
downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
if (lock_flags & XFS_IOLOCK_EXCL)
@@ -328,52 +328,30 @@ xfs_ilock_demote(
trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
-#if defined(DEBUG) || defined(XFS_WARN)
-static inline bool
-__xfs_rwsem_islocked(
- struct rw_semaphore *rwsem,
- bool shared)
-{
- if (!debug_locks)
- return rwsem_is_locked(rwsem);
-
- if (!shared)
- return lockdep_is_held_type(rwsem, 0);
-
- /*
- * We are checking that the lock is held at least in shared
- * mode but don't care that it might be held exclusively
- * (i.e. shared | excl). Hence we check if the lock is held
- * in any mode rather than an explicit shared mode.
- */
- return lockdep_is_held_type(rwsem, -1);
-}
-
-bool
-xfs_isilocked(
+void
+xfs_assert_ilocked(
struct xfs_inode *ip,
uint lock_flags)
{
- if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
- if (!(lock_flags & XFS_ILOCK_SHARED))
- return !!ip->i_lock.mr_writer;
- return rwsem_is_locked(&ip->i_lock.mr_lock);
- }
-
- if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
- return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
- (lock_flags & XFS_MMAPLOCK_SHARED));
- }
+ /*
+ * Sometimes we assert the ILOCK is held exclusively, but we're in
+ * a workqueue, so lockdep doesn't know we're the owner.
+ */
+ if (lock_flags & XFS_ILOCK_SHARED)
+ rwsem_assert_held(&ip->i_lock);
+ else if (lock_flags & XFS_ILOCK_EXCL)
+ rwsem_assert_held_write_nolockdep(&ip->i_lock);
- if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
- return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
- (lock_flags & XFS_IOLOCK_SHARED));
- }
+ if (lock_flags & XFS_MMAPLOCK_SHARED)
+ rwsem_assert_held(&VFS_I(ip)->i_mapping->invalidate_lock);
+ else if (lock_flags & XFS_MMAPLOCK_EXCL)
+ rwsem_assert_held_write(&VFS_I(ip)->i_mapping->invalidate_lock);
- ASSERT(0);
- return false;
+ if (lock_flags & XFS_IOLOCK_SHARED)
+ rwsem_assert_held(&VFS_I(ip)->i_rwsem);
+ else if (lock_flags & XFS_IOLOCK_EXCL)
+ rwsem_assert_held_write(&VFS_I(ip)->i_rwsem);
}
-#endif
/*
* xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
@@ -671,7 +649,7 @@ xfs_lookup(
out_free_name:
if (ci_name)
- kmem_free(ci_name->name);
+ kfree(ci_name->name);
out_unlock:
*ipp = NULL;
return error;
@@ -802,6 +780,8 @@ xfs_init_new_inode(
*/
if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
+ xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
+ XFS_SICK_AG_INOBT);
return -EFSCORRUPTED;
}
@@ -947,6 +927,81 @@ xfs_bumplink(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of directory live update hooks.
+ * If the compiler supports jump labels, the static branch will be replaced by
+ * a nop sled when there are no hook users. Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);
+
+void
+xfs_dir_hook_disable(void)
+{
+ xfs_hooks_switch_off(&xfs_dir_hooks_switch);
+}
+
+void
+xfs_dir_hook_enable(void)
+{
+ xfs_hooks_switch_on(&xfs_dir_hooks_switch);
+}
+
+/* Call hooks for a directory update relating to a child dirent update. */
+inline void
+xfs_dir_update_hook(
+ struct xfs_inode *dp,
+ struct xfs_inode *ip,
+ int delta,
+ const struct xfs_name *name)
+{
+ if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
+ struct xfs_dir_update_params p = {
+ .dp = dp,
+ .ip = ip,
+ .delta = delta,
+ .name = name,
+ };
+ struct xfs_mount *mp = ip->i_mount;
+
+ xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
+ }
+}
+
+/* Call the specified function during a directory update. */
+int
+xfs_dir_hook_add(
+ struct xfs_mount *mp,
+ struct xfs_dir_hook *hook)
+{
+ return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+
+/* Stop calling the specified function during a directory update. */
+void
+xfs_dir_hook_del(
+ struct xfs_mount *mp,
+ struct xfs_dir_hook *hook)
+{
+ xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+
+/* Configure directory update hook functions. */
+void
+xfs_dir_hook_setup(
+ struct xfs_dir_hook *hook,
+ notifier_fn_t mod_fn)
+{
+ xfs_hook_setup(&hook->dirent_hook, mod_fn);
+}
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
int
xfs_create(
struct mnt_idmap *idmap,
@@ -1058,6 +1113,12 @@ xfs_create(
}
/*
+ * Create ip with a reference from dp, and add '.' and '..' references
+ * if it's a directory.
+ */
+ xfs_dir_update_hook(dp, ip, 1, name);
+
+ /*
* If this is a synchronous mount, make sure that the
* create transaction goes to disk before returning to
* the user.
@@ -1240,8 +1301,19 @@ xfs_link(
*/
if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
tdp->i_projid != sip->i_projid)) {
- error = -EXDEV;
- goto error_return;
+ /*
+ * Project quota setup skips special files which can
+ * leave inodes in a PROJINHERIT directory without a
+ * project ID set. We need to allow links to be made
+ * to these "project-less" inodes because userspace
+ * expects them to succeed after project ID setup,
+ * but everything else should be rejected.
+ */
+ if (!special_file(VFS_I(sip)->i_mode) ||
+ sip->i_projid != 0) {
+ error = -EXDEV;
+ goto error_return;
+ }
}
if (!resblks) {
@@ -1271,6 +1343,7 @@ xfs_link(
xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
xfs_bumplink(tp, sip);
+ xfs_dir_update_hook(tdp, sip, 1, target_name);
/*
* If this is a synchronous mount, make sure that the
@@ -1342,9 +1415,9 @@ xfs_itruncate_extents_flags(
xfs_fileoff_t first_unmap_block;
int error = 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
- xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
+ if (atomic_read(&VFS_I(ip)->i_count))
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
ASSERT(new_size <= XFS_ISIZE(ip));
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(ip->i_itemp != NULL);
@@ -1596,7 +1669,7 @@ xfs_inactive_ifree(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
error = xfs_ifree(tp, ip);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (error) {
/*
* If we fail to free the inode, shut down. The cancel
@@ -1677,6 +1750,39 @@ xfs_inode_needs_inactive(
}
/*
+ * Save health status somewhere, if we're dumping an inode with uncorrected
+ * errors and online repair isn't running.
+ */
+static inline void
+xfs_inactive_health(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ unsigned int sick;
+ unsigned int checked;
+
+ xfs_inode_measure_sickness(ip, &sick, &checked);
+ if (!sick)
+ return;
+
+ trace_xfs_inode_unfixed_corruption(ip, sick);
+
+ if (sick & XFS_SICK_INO_FORGET)
+ return;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ if (!pag) {
+ /* There had better still be a perag structure! */
+ ASSERT(0);
+ return;
+ }
+
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_INODES);
+ xfs_perag_put(pag);
+}
+
+/*
* xfs_inactive
*
* This is called when the vnode reference count for the vnode
@@ -1704,6 +1810,8 @@ xfs_inactive(
mp = ip->i_mount;
ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
+ xfs_inactive_health(ip);
+
/*
* If this is a read-only mount, don't do this (would generate I/O)
* unless we're in log recovery and cleaning the iunlinked list.
@@ -1910,6 +2018,7 @@ xfs_iunlink_update_bucket(
*/
if (old_value == new_agino) {
xfs_buf_mark_corrupt(agibp);
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
return -EFSCORRUPTED;
}
@@ -1959,11 +2068,14 @@ xfs_iunlink_reload_next(
*/
ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
- if (error)
+ if (error) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
return error;
+ }
/* If this is not an unlinked inode, something is very wrong. */
if (VFS_I(next_ip)->i_nlink != 0) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
error = -EFSCORRUPTED;
goto rele;
}
@@ -2001,6 +2113,7 @@ xfs_iunlink_insert_inode(
if (next_agino == agino ||
!xfs_verify_agino_or_null(pag, next_agino)) {
xfs_buf_mark_corrupt(agibp);
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
return -EFSCORRUPTED;
}
@@ -2088,6 +2201,7 @@ xfs_iunlink_remove_inode(
if (!xfs_verify_agino(pag, head_agino)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
agi, sizeof(*agi));
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
return -EFSCORRUPTED;
}
@@ -2116,8 +2230,10 @@ xfs_iunlink_remove_inode(
struct xfs_inode *prev_ip;
prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
- if (!prev_ip)
+ if (!prev_ip) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
+ }
error = xfs_iunlink_log_inode(tp, prev_ip, pag,
ip->i_next_unlinked);
@@ -2350,7 +2466,7 @@ xfs_ifree(
struct xfs_inode_log_item *iip = ip->i_itemp;
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(VFS_I(ip)->i_nlink == 0);
ASSERT(ip->i_df.if_nextents == 0);
ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
@@ -2378,7 +2494,7 @@ xfs_ifree(
* already been freed by xfs_attr_inactive.
*/
if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- kmem_free(ip->i_df.if_data);
+ kfree(ip->i_df.if_data);
ip->i_df.if_data = NULL;
ip->i_df.if_bytes = 0;
}
@@ -2419,7 +2535,7 @@ static void
xfs_iunpin(
struct xfs_inode *ip)
{
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
@@ -2585,6 +2701,12 @@ xfs_remove(
}
/*
+ * Drop the link from dp to ip, and if ip was a directory, remove the
+ * '.' and '..' references since we freed the directory.
+ */
+ xfs_dir_update_hook(dp, ip, -1, name);
+
+ /*
* If this is a synchronous mount, make sure that the
* remove transaction goes to disk before returning to
* the user.
@@ -2774,6 +2896,20 @@ xfs_cross_rename(
}
xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
+
+ /*
+ * Inform our hook clients that we've finished an exchange operation as
+ * follows: removed the source and target files from their directories;
+ * added the target to the source directory; and added the source to
+ * the target directory. All inodes are locked, so it's ok to model a
+ * rename this way so long as we say we deleted entries before we add
+ * new ones.
+ */
+ xfs_dir_update_hook(dp1, ip1, -1, name1);
+ xfs_dir_update_hook(dp2, ip2, -1, name2);
+ xfs_dir_update_hook(dp1, ip2, 1, name1);
+ xfs_dir_update_hook(dp2, ip1, 1, name2);
+
return xfs_finish_rename(tp);
out_trans_abort:
@@ -3157,6 +3293,21 @@ retry:
if (new_parent)
xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+ /*
+ * Inform our hook clients that we've finished a rename operation as
+ * follows: removed the source and target files from their directories;
+ * that we've added the source to the target directory; and finally
+ * that we've added the whiteout, if there was one. All inodes are
+ * locked, so it's ok to model a rename this way so long as we say we
+ * deleted entries before we add new ones.
+ */
+ if (target_ip)
+ xfs_dir_update_hook(target_dp, target_ip, -1, target_name);
+ xfs_dir_update_hook(src_dp, src_ip, -1, src_name);
+ xfs_dir_update_hook(target_dp, src_ip, 1, target_name);
+ if (wip)
+ xfs_dir_update_hook(src_dp, wip, 1, src_name);
+
error = xfs_finish_rename(tp);
if (wip)
xfs_irele(wip);
@@ -3182,7 +3333,7 @@ xfs_iflush(
struct xfs_mount *mp = ip->i_mount;
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
@@ -3317,6 +3468,8 @@ flush_out:
/* generate the checksum. */
xfs_dinode_calc_crc(mp, dip);
+ if (error)
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return error;
}
@@ -3777,3 +3930,19 @@ xfs_ifork_zapped(
return false;
}
}
+
+/* Compute the number of data and realtime blocks used by a file. */
+void
+xfs_inode_count_blocks(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_filblks_t *dblocks,
+ xfs_filblks_t *rblocks)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+
+ *rblocks = 0;
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_bmap_count_leaves(ifp, rblocks);
+ *dblocks = ip->i_nblocks - *rblocks;
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 97f63bacd4c2..ab46ffb3ac19 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -39,7 +39,7 @@ typedef struct xfs_inode {
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
- mrlock_t i_lock; /* inode lock */
+ struct rw_semaphore i_lock; /* inode lock */
atomic_t i_pincount; /* inode pin count */
struct llist_node i_gclist; /* deferred inactivation list */
@@ -171,6 +171,12 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
return &ip->i_vnode;
}
+/* convert from const xfs inode to const vfs inode */
+static inline const struct inode *VFS_IC(const struct xfs_inode *ip)
+{
+ return &ip->i_vnode;
+}
+
/*
* For regular files we only update the on-disk filesize when actually
* writing data back to disk. Until then only the copy in the VFS inode
@@ -523,7 +529,7 @@ void xfs_ilock(xfs_inode_t *, uint);
int xfs_ilock_nowait(xfs_inode_t *, uint);
void xfs_iunlock(xfs_inode_t *, uint);
void xfs_ilock_demote(xfs_inode_t *, uint);
-bool xfs_isilocked(struct xfs_inode *, uint);
+void xfs_assert_ilocked(struct xfs_inode *, uint);
uint xfs_ilock_data_map_shared(struct xfs_inode *);
uint xfs_ilock_attr_map_shared(struct xfs_inode *);
@@ -623,5 +629,32 @@ int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip)
int xfs_inode_reload_unlinked(struct xfs_inode *ip);
bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork);
+void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_filblks_t *dblocks, xfs_filblks_t *rblocks);
+
+struct xfs_dir_update_params {
+ const struct xfs_inode *dp;
+ const struct xfs_inode *ip;
+ const struct xfs_name *name;
+ int delta;
+};
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip,
+ int delta, const struct xfs_name *name);
+
+struct xfs_dir_hook {
+ struct xfs_hook dirent_hook;
+};
+
+void xfs_dir_hook_disable(void);
+void xfs_dir_hook_enable(void);
+
+int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn);
+#else
+# define xfs_dir_update_hook(dp, ip, delta, name) ((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 0aee97ba0be8..f28d653300d1 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -650,7 +650,7 @@ xfs_inode_item_pin(
{
struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(lip->li_buf);
trace_xfs_inode_pin(ip, _RET_IP_);
@@ -756,7 +756,7 @@ xfs_inode_item_release(
unsigned short lock_flags;
ASSERT(ip->i_itemp != NULL);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
lock_flags = iip->ili_lock_flags;
iip->ili_lock_flags = 0;
@@ -856,7 +856,7 @@ xfs_inode_item_destroy(
ASSERT(iip->ili_item.li_buf == NULL);
ip->i_itemp = NULL;
- kmem_free(iip->ili_item.li_lv_shadow);
+ kvfree(iip->ili_item.li_lv_shadow);
kmem_cache_free(xfs_ili_cache, iip);
}
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 144198a6b270..dbdab4ce7c44 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -291,7 +291,8 @@ xlog_recover_inode_commit_pass2(
if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
in_f = item->ri_buf[0].i_addr;
} else {
- in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
+ in_f = kmalloc(sizeof(struct xfs_inode_log_format),
+ GFP_KERNEL | __GFP_NOFAIL);
need_free = 1;
error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
if (error)
@@ -553,7 +554,7 @@ out_release:
xfs_buf_relse(bp);
error:
if (need_free)
- kmem_free(in_f);
+ kfree(in_f);
return error;
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index f02b6e558af5..d0e2cec6210d 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -435,7 +435,7 @@ xfs_ioc_attr_list(
copy_to_user(ucursor, &context.cursor, sizeof(context.cursor)))
error = -EFAULT;
out_free:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -493,7 +493,7 @@ xfs_attrmulti_attr_get(
error = -EFAULT;
out_kfree:
- kmem_free(args.value);
+ kvfree(args.value);
return error;
}
@@ -1506,7 +1506,7 @@ xfs_ioc_getbmap(
error = 0;
out_free_buf:
- kmem_free(buf);
+ kvfree(buf);
return error;
}
@@ -1636,7 +1636,7 @@ xfs_ioc_getfsmap(
}
out_free:
- kmem_free(recs);
+ kvfree(recs);
return error;
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 18c8f168b153..4087af7f3c9f 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -27,6 +27,7 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
+#include "xfs_health.h"
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -45,6 +46,7 @@ xfs_alert_fsblock_zero(
(unsigned long long)imap->br_startoff,
(unsigned long long)imap->br_blockcount,
imap->br_state);
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
return -EFSCORRUPTED;
}
@@ -99,8 +101,10 @@ xfs_bmbt_to_iomap(
struct xfs_mount *mp = ip->i_mount;
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
- if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
+ if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
return xfs_alert_fsblock_zero(ip, imap);
+ }
if (imap->br_startblock == HOLESTARTBLOCK) {
iomap->addr = IOMAP_NULL_ADDR;
@@ -325,8 +329,10 @@ xfs_iomap_write_direct(
goto out_unlock;
}
- if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
+ if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = xfs_alert_fsblock_zero(ip, imap);
+ }
out_unlock:
*seq = xfs_iomap_inode_sequence(ip, 0);
@@ -639,8 +645,10 @@ xfs_iomap_write_unwritten(
if (error)
return error;
- if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
+ if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
return xfs_alert_fsblock_zero(ip, &imap);
+ }
if ((numblks_fsb = imap.br_blockcount) == 0) {
/*
@@ -986,6 +994,7 @@ xfs_buffered_write_iomap_begin(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_unlock;
}
@@ -1323,7 +1332,7 @@ xfs_seek_iomap_begin(
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
if (data_fsb < cow_fsb + cmap.br_blockcount)
end_fsb = min(end_fsb, data_fsb);
- xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+ xfs_trim_extent(&cmap, offset_fsb, end_fsb - offset_fsb);
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
IOMAP_F_SHARED, seq);
@@ -1348,7 +1357,7 @@ xfs_seek_iomap_begin(
imap.br_state = XFS_EXT_NORM;
done:
seq = xfs_iomap_inode_sequence(ip, 0);
- xfs_trim_extent(&imap, offset_fsb, end_fsb);
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
out_unlock:
xfs_iunlock(ip, lockmode);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a0d77f5f512e..66f8c47642e8 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -346,7 +346,7 @@ xfs_vn_ci_lookup(
dname.name = ci_name.name;
dname.len = ci_name.len;
dentry = d_add_ci(dentry, VFS_I(ip), &dname);
- kmem_free(ci_name.name);
+ kfree(ci_name.name);
return dentry;
}
@@ -796,8 +796,7 @@ xfs_setattr_size(
uint lock_flags = 0;
bool did_zeroing = false;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
ASSERT(S_ISREG(inode->i_mode));
ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
@@ -1285,9 +1284,9 @@ xfs_setup_inode(
*/
lockdep_set_class(&inode->i_rwsem,
&inode->i_sb->s_type->i_mutex_dir_key);
- lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
+ lockdep_set_class(&ip->i_lock, &xfs_dir_ilock_class);
} else {
- lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
+ lockdep_set_class(&ip->i_lock, &xfs_nondir_ilock_class);
}
/*
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 14462614fcc8..95fc31b9f87d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -197,8 +197,8 @@ xfs_bulkstat_one(
ASSERT(breq->icount == 1);
- bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
- KM_MAYFAIL);
+ bc.buf = kzalloc(sizeof(struct xfs_bulkstat),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!bc.buf)
return -ENOMEM;
@@ -214,7 +214,7 @@ xfs_bulkstat_one(
breq->startino, &bc);
xfs_trans_cancel(tp);
out:
- kmem_free(bc.buf);
+ kfree(bc.buf);
/*
* If we reported one inode to userspace then we abort because we hit
@@ -289,8 +289,8 @@ xfs_bulkstat(
if (xfs_bulkstat_already_done(breq->mp, breq->startino))
return 0;
- bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
- KM_MAYFAIL);
+ bc.buf = kzalloc(sizeof(struct xfs_bulkstat),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!bc.buf)
return -ENOMEM;
@@ -309,7 +309,7 @@ xfs_bulkstat(
xfs_bulkstat_iwalk, breq->icount, &bc);
xfs_trans_cancel(tp);
out:
- kmem_free(bc.buf);
+ kfree(bc.buf);
/*
* We found some inodes, so clear the error status and return them.
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index b3275e8d47b6..01b55f03a102 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -22,6 +22,7 @@
#include "xfs_trans.h"
#include "xfs_pwork.h"
#include "xfs_ag.h"
+#include "xfs_bit.h"
/*
* Walking Inodes in the Filesystem
@@ -99,6 +100,7 @@ xfs_iwalk_ichunk_ra(
struct xfs_inobt_rec_incore *irec)
{
struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agnumber_t agno = pag->pag_agno;
xfs_agblock_t agbno;
struct blk_plug plug;
int i; /* inode chunk index */
@@ -111,8 +113,9 @@ xfs_iwalk_ichunk_ra(
imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
if (imask & ~irec->ir_free) {
- xfs_btree_reada_bufs(mp, pag->pag_agno, agbno,
- igeo->blocks_per_cluster,
+ xfs_buf_readahead(mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, agbno),
+ igeo->blocks_per_cluster * mp->m_bsize,
&xfs_inode_buf_ops);
}
agbno += igeo->blocks_per_cluster;
@@ -131,21 +134,11 @@ xfs_iwalk_adjust_start(
struct xfs_inobt_rec_incore *irec) /* btree record */
{
int idx; /* index into inode chunk */
- int i;
idx = agino - irec->ir_startino;
- /*
- * We got a right chunk with some left inodes allocated at it. Grab
- * the chunk record. Mark all the uninteresting inodes free because
- * they're before our start point.
- */
- for (i = 0; i < idx; i++) {
- if (XFS_INOBT_MASK(i) & ~irec->ir_free)
- irec->ir_freecount++;
- }
-
irec->ir_free |= xfs_inobt_maskn(0, idx);
+ irec->ir_freecount = hweight64(irec->ir_free);
}
/* Allocate memory for a walk. */
@@ -160,7 +153,7 @@ xfs_iwalk_alloc(
/* Allocate a prefetch buffer for inobt records. */
size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
- iwag->recs = kmem_alloc(size, KM_MAYFAIL);
+ iwag->recs = kmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (iwag->recs == NULL)
return -ENOMEM;
@@ -172,7 +165,7 @@ STATIC void
xfs_iwalk_free(
struct xfs_iwalk_ag *iwag)
{
- kmem_free(iwag->recs);
+ kfree(iwag->recs);
iwag->recs = NULL;
}
@@ -275,9 +268,10 @@ xfs_iwalk_ag_start(
/* Set up a fresh cursor and empty the inobt cache. */
iwag->nr_recs = 0;
- error = xfs_inobt_cur(pag, tp, XFS_BTNUM_INO, curpp, agi_bpp);
+ error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
if (error)
return error;
+ *curpp = xfs_inobt_init_cursor(pag, tp, *agi_bpp);
/* Starting at the beginning of the AG? That's easy! */
if (agino == 0)
@@ -306,8 +300,10 @@ xfs_iwalk_ag_start(
error = xfs_inobt_get_rec(*curpp, irec, has_more);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, *has_more != 1))
+ if (XFS_IS_CORRUPT(mp, *has_more != 1)) {
+ xfs_btree_mark_sick(*curpp);
return -EFSCORRUPTED;
+ }
iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
@@ -390,11 +386,10 @@ xfs_iwalk_run_callbacks(
}
/* ...and recreate the cursor just past where we left off. */
- error = xfs_inobt_cur(iwag->pag, iwag->tp, XFS_BTNUM_INO, curpp,
- agi_bpp);
+ error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, agi_bpp);
if (error)
return error;
-
+ *curpp = xfs_inobt_init_cursor(iwag->pag, iwag->tp, *agi_bpp);
return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
}
@@ -434,6 +429,7 @@ xfs_iwalk_ag(
rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino);
if (iwag->lastino != NULLFSINO &&
XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
@@ -627,7 +623,7 @@ xfs_iwalk_ag_work(
xfs_iwalk_free(iwag);
out:
xfs_perag_put(iwag->pag);
- kmem_free(iwag);
+ kfree(iwag);
return error;
}
@@ -663,7 +659,8 @@ xfs_iwalk_threaded(
if (xfs_pwork_ctl_want_abort(&pctl))
break;
- iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
+ iwag = kzalloc(sizeof(struct xfs_iwalk_ag),
+ GFP_KERNEL | __GFP_NOFAIL);
iwag->mp = mp;
/*
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index d7873e0360f0..8f07c9f6157f 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -21,15 +21,13 @@ typedef __u32 xfs_nlink_t;
#include "xfs_types.h"
-#include "kmem.h"
-#include "mrlock.h"
-
#include <linux/semaphore.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <linux/crc32c.h>
#include <linux/module.h>
#include <linux/mutex.h>
@@ -51,6 +49,7 @@ typedef __u32 xfs_nlink_t;
#include <linux/notifier.h>
#include <linux/delay.h>
#include <linux/log2.h>
+#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/ctype.h>
@@ -82,6 +81,7 @@ typedef __u32 xfs_nlink_t;
#include "xfs_buf.h"
#include "xfs_message.h"
#include "xfs_drain.h"
+#include "xfs_hooks.h"
#ifdef __BIG_ENDIAN
#define XFS_NATIVE_HOST 1
@@ -269,4 +269,15 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
# define PTR_FMT "%p"
#endif
+/*
+ * Helper for IO routines to grab backing pages from allocated kernel memory.
+ */
+static inline struct page *
+kmem_to_page(void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ return vmalloc_to_page(addr);
+ return virt_to_page(addr);
+}
+
#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a1650fc81382..5004f23d344e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -633,14 +633,14 @@ xlog_state_release_iclog(
*/
int
xfs_log_mount(
- xfs_mount_t *mp,
- xfs_buftarg_t *log_target,
- xfs_daddr_t blk_offset,
- int num_bblks)
+ xfs_mount_t *mp,
+ struct xfs_buftarg *log_target,
+ xfs_daddr_t blk_offset,
+ int num_bblks)
{
- struct xlog *log;
- int error = 0;
- int min_logfsbs;
+ struct xlog *log;
+ int error = 0;
+ int min_logfsbs;
if (!xfs_has_norecovery(mp)) {
xfs_notice(mp, "Mounting V%d Filesystem %pU",
@@ -1528,7 +1528,7 @@ xlog_alloc_log(
int error = -ENOMEM;
uint log2_size = 0;
- log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL);
+ log = kzalloc(sizeof(struct xlog), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!log) {
xfs_warn(mp, "Log allocation failed: No memory!");
goto out;
@@ -1605,7 +1605,8 @@ xlog_alloc_log(
size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
sizeof(struct bio_vec);
- iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL);
+ iclog = kzalloc(sizeof(*iclog) + bvec_size,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!iclog)
goto out_free_iclog;
@@ -1661,13 +1662,13 @@ out_destroy_workqueue:
out_free_iclog:
for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
prev_iclog = iclog->ic_next;
- kmem_free(iclog->ic_data);
- kmem_free(iclog);
+ kvfree(iclog->ic_data);
+ kfree(iclog);
if (prev_iclog == log->l_iclog)
break;
}
out_free_log:
- kmem_free(log);
+ kfree(log);
out:
return ERR_PTR(error);
} /* xlog_alloc_log */
@@ -2118,14 +2119,14 @@ xlog_dealloc_log(
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
next_iclog = iclog->ic_next;
- kmem_free(iclog->ic_data);
- kmem_free(iclog);
+ kvfree(iclog->ic_data);
+ kfree(iclog);
iclog = next_iclog;
}
log->l_mp->m_log = NULL;
destroy_workqueue(log->l_ioend_workqueue);
- kmem_free(log);
+ kfree(log);
}
/*
@@ -3517,7 +3518,8 @@ xlog_ticket_alloc(
struct xlog_ticket *tic;
int unit_res;
- tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL);
+ tic = kmem_cache_zalloc(xfs_log_ticket_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 67a99d94701e..73f5b7f628f4 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -100,7 +100,7 @@ xlog_cil_ctx_alloc(void)
{
struct xfs_cil_ctx *ctx;
- ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents.extent_list);
INIT_LIST_HEAD(&ctx->log_items);
@@ -339,7 +339,7 @@ xlog_cil_alloc_shadow_bufs(
* the buffer, only the log vector header and the iovec
* storage.
*/
- kmem_free(lip->li_lv_shadow);
+ kvfree(lip->li_lv_shadow);
lv = xlog_kvmalloc(buf_size);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
@@ -703,7 +703,7 @@ xlog_cil_free_logvec(
while (!list_empty(lv_chain)) {
lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list);
list_del_init(&lv->lv_list);
- kmem_free(lv);
+ kvfree(lv);
}
}
@@ -753,7 +753,7 @@ xlog_cil_committed(
return;
}
- kmem_free(ctx);
+ kfree(ctx);
}
void
@@ -1116,11 +1116,18 @@ xlog_cil_cleanup_whiteouts(
* same sequence twice. If we get a race between multiple pushes for the same
* sequence they will block on the first one and then abort, hence avoiding
* needless pushes.
+ *
+ * This runs from a workqueue so it does not inherit any specific memory
+ * allocation context. However, we do not want to block on memory reclaim
+ * recursing back into the filesystem because this push may have been triggered
+ * by memory reclaim itself. Hence we really need to run under full GFP_NOFS
+ * constraints here.
*/
static void
xlog_cil_push_work(
struct work_struct *work)
{
+ unsigned int nofs_flags = memalloc_nofs_save();
struct xfs_cil_ctx *ctx =
container_of(work, struct xfs_cil_ctx, push_work);
struct xfs_cil *cil = ctx->cil;
@@ -1334,12 +1341,14 @@ xlog_cil_push_work(
spin_unlock(&log->l_icloglock);
xlog_cil_cleanup_whiteouts(&whiteouts);
xfs_log_ticket_ungrant(log, ticket);
+ memalloc_nofs_restore(nofs_flags);
return;
out_skip:
up_write(&cil->xc_ctx_lock);
xfs_log_ticket_put(new_ctx->ticket);
- kmem_free(new_ctx);
+ kfree(new_ctx);
+ memalloc_nofs_restore(nofs_flags);
return;
out_abort_free_ticket:
@@ -1348,6 +1357,7 @@ out_abort_free_ticket:
if (!ctx->commit_iclog) {
xfs_log_ticket_ungrant(log, ctx->ticket);
xlog_cil_committed(ctx);
+ memalloc_nofs_restore(nofs_flags);
return;
}
spin_lock(&log->l_icloglock);
@@ -1356,6 +1366,7 @@ out_abort_free_ticket:
/* Not safe to reference ctx now! */
spin_unlock(&log->l_icloglock);
xfs_log_ticket_ungrant(log, ticket);
+ memalloc_nofs_restore(nofs_flags);
}
/*
@@ -1533,7 +1544,7 @@ xlog_cil_process_intents(
set_bit(XFS_LI_WHITEOUT, &ilip->li_flags);
trace_xfs_cil_whiteout_mark(ilip);
len += ilip->li_lv->lv_bytes;
- kmem_free(ilip->li_lv);
+ kvfree(ilip->li_lv);
ilip->li_lv = NULL;
xfs_trans_del_item(lip);
@@ -1747,7 +1758,7 @@ xlog_cil_init(
struct xlog_cil_pcp *cilpcp;
int cpu;
- cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
+ cil = kzalloc(sizeof(*cil), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!cil)
return -ENOMEM;
/*
@@ -1786,7 +1797,7 @@ xlog_cil_init(
out_destroy_wq:
destroy_workqueue(cil->xc_push_wq);
out_destroy_cil:
- kmem_free(cil);
+ kfree(cil);
return -ENOMEM;
}
@@ -1799,12 +1810,12 @@ xlog_cil_destroy(
if (cil->xc_ctx) {
if (cil->xc_ctx->ticket)
xfs_log_ticket_put(cil->xc_ctx->ticket);
- kmem_free(cil->xc_ctx);
+ kfree(cil->xc_ctx);
}
ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
free_percpu(cil->xc_pcp);
destroy_workqueue(cil->xc_push_wq);
- kmem_free(cil);
+ kfree(cil);
}
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1251c81e55f9..13f1d2e91540 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -361,7 +361,7 @@ xlog_find_verify_cycle(
*new_blk = -1;
out:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -477,7 +477,7 @@ xlog_find_verify_log_record(
*last_blk = i;
out:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -731,7 +731,7 @@ validate_head:
goto out_free_buffer;
}
- kmem_free(buffer);
+ kvfree(buffer);
if (head_blk == log_bbnum)
*return_head_blk = 0;
else
@@ -745,7 +745,7 @@ validate_head:
return 0;
out_free_buffer:
- kmem_free(buffer);
+ kvfree(buffer);
if (error)
xfs_warn(log->l_mp, "failed to find log head");
return error;
@@ -999,7 +999,7 @@ xlog_verify_tail(
"Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
orig_tail, *tail_blk);
out:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -1046,7 +1046,7 @@ xlog_verify_head(
error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
XLOG_MAX_ICLOGS, tmp_buffer,
&tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
- kmem_free(tmp_buffer);
+ kvfree(tmp_buffer);
if (error < 0)
return error;
@@ -1365,7 +1365,7 @@ xlog_find_tail(
error = xlog_clear_stale_blocks(log, tail_lsn);
done:
- kmem_free(buffer);
+ kvfree(buffer);
if (error)
xfs_warn(log->l_mp, "failed to locate log tail");
@@ -1399,6 +1399,7 @@ xlog_find_zeroed(
xfs_daddr_t new_blk, last_blk, start_blk;
xfs_daddr_t num_scan_bblks;
int error, log_bbnum = log->l_logBBsize;
+ int ret = 1;
*blk_no = 0;
@@ -1413,8 +1414,7 @@ xlog_find_zeroed(
first_cycle = xlog_get_cycle(offset);
if (first_cycle == 0) { /* completely zeroed log */
*blk_no = 0;
- kmem_free(buffer);
- return 1;
+ goto out_free_buffer;
}
/* check partially zeroed log */
@@ -1424,8 +1424,8 @@ xlog_find_zeroed(
last_cycle = xlog_get_cycle(offset);
if (last_cycle != 0) { /* log completely written to */
- kmem_free(buffer);
- return 0;
+ ret = 0;
+ goto out_free_buffer;
}
/* we have a partially zeroed log */
@@ -1471,10 +1471,10 @@ xlog_find_zeroed(
*blk_no = last_blk;
out_free_buffer:
- kmem_free(buffer);
+ kvfree(buffer);
if (error)
return error;
- return 1;
+ return ret;
}
/*
@@ -1583,7 +1583,7 @@ xlog_write_log_records(
}
out_free_buffer:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -2057,7 +2057,8 @@ xlog_recover_add_item(
{
struct xlog_recover_item *item;
- item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
+ item = kzalloc(sizeof(struct xlog_recover_item),
+ GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&item->ri_list);
list_add_tail(&item->ri_list, head);
}
@@ -2160,7 +2161,7 @@ xlog_recover_add_to_trans(
return 0;
}
- ptr = kmem_alloc(len, 0);
+ ptr = xlog_kvmalloc(len);
memcpy(ptr, dp, len);
in_f = (struct xfs_inode_log_format *)ptr;
@@ -2182,14 +2183,13 @@ xlog_recover_add_to_trans(
"bad number of regions (%d) in inode log format",
in_f->ilf_size);
ASSERT(0);
- kmem_free(ptr);
+ kvfree(ptr);
return -EFSCORRUPTED;
}
item->ri_total = in_f->ilf_size;
- item->ri_buf =
- kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
- 0);
+ item->ri_buf = kzalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+ GFP_KERNEL | __GFP_NOFAIL);
}
if (item->ri_total <= item->ri_cnt) {
@@ -2197,7 +2197,7 @@ xlog_recover_add_to_trans(
"log item region count (%d) overflowed size (%d)",
item->ri_cnt, item->ri_total);
ASSERT(0);
- kmem_free(ptr);
+ kvfree(ptr);
return -EFSCORRUPTED;
}
@@ -2227,13 +2227,13 @@ xlog_recover_free_trans(
/* Free the regions in the item. */
list_del(&item->ri_list);
for (i = 0; i < item->ri_cnt; i++)
- kmem_free(item->ri_buf[i].i_addr);
+ kvfree(item->ri_buf[i].i_addr);
/* Free the item itself */
- kmem_free(item->ri_buf);
- kmem_free(item);
+ kfree(item->ri_buf);
+ kfree(item);
}
/* Free the transaction recover structure */
- kmem_free(trans);
+ kfree(trans);
}
/*
@@ -2332,7 +2332,7 @@ xlog_recover_ophdr_to_trans(
* This is a new transaction so allocate a new recovery container to
* hold the recovery ops that will follow.
*/
- trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
+ trans = kzalloc(sizeof(struct xlog_recover), GFP_KERNEL | __GFP_NOFAIL);
trans->r_log_tid = tid;
trans->r_lsn = be64_to_cpu(rhead->h_lsn);
INIT_LIST_HEAD(&trans->r_itemq);
@@ -3024,7 +3024,7 @@ xlog_do_recovery_pass(
hblks = xlog_logrec_hblks(log, rhead);
if (hblks != 1) {
- kmem_free(hbp);
+ kvfree(hbp);
hbp = xlog_alloc_buffer(log, hblks);
}
} else {
@@ -3038,7 +3038,7 @@ xlog_do_recovery_pass(
return -ENOMEM;
dbp = xlog_alloc_buffer(log, BTOBB(h_size));
if (!dbp) {
- kmem_free(hbp);
+ kvfree(hbp);
return -ENOMEM;
}
@@ -3199,16 +3199,33 @@ xlog_do_recovery_pass(
}
bread_err2:
- kmem_free(dbp);
+ kvfree(dbp);
bread_err1:
- kmem_free(hbp);
+ kvfree(hbp);
/*
- * Submit buffers that have been added from the last record processed,
- * regardless of error status.
+ * Submit buffers that have been dirtied by the last record recovered.
*/
- if (!list_empty(&buffer_list))
+ if (!list_empty(&buffer_list)) {
+ if (error) {
+ /*
+ * If there has been an item recovery error then we
+ * cannot allow partial checkpoint writeback to
+ * occur. We might have multiple checkpoints with the
+ * same start LSN in this buffer list, and partial
+ * writeback of a checkpoint in this situation can
+ * prevent future recovery of all the changes in the
+ * checkpoints at this start LSN.
+ *
+ * Note: Shutting down the filesystem will result in the
+ * delwri submission marking all the buffers stale,
+ * completing them and cleaning up _XBF_LOGRECOVERY
+ * state without doing any IO.
+ */
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ }
error2 = xfs_buf_delwri_submit(&buffer_list);
+ }
if (error && first_bad)
*first_bad = rhead_blk;
@@ -3443,12 +3460,19 @@ xlog_recover(
* part of recovery so that the root and real-time bitmap inodes can be read in
* from disk in between the two stages. This is necessary so that we can free
* space in the real-time portion of the file system.
+ *
+ * We run this whole process under GFP_NOFS allocation context. We do a
+ * combination of non-transactional and transactional work, yet we really don't
+ * want to recurse into the filesystem from direct reclaim during any of this
+ * processing. This allows all the recovery code run here not to care about the
+ * memory allocation context it is running in.
*/
int
xlog_recover_finish(
struct xlog *log)
{
- int error;
+ unsigned int nofs_flags = memalloc_nofs_save();
+ int error;
error = xlog_recover_process_intents(log);
if (error) {
@@ -3462,7 +3486,7 @@ xlog_recover_finish(
xlog_recover_cancel_intents(log);
xfs_alert(log->l_mp, "Failed to recover intents");
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
- return error;
+ goto out_error;
}
/*
@@ -3483,7 +3507,7 @@ xlog_recover_finish(
if (error < 0) {
xfs_alert(log->l_mp,
"Failed to clear log incompat features on recovery");
- return error;
+ goto out_error;
}
}
@@ -3508,9 +3532,13 @@ xlog_recover_finish(
* and AIL.
*/
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ error = 0;
+ goto out_error;
}
- return 0;
+out_error:
+ memalloc_nofs_restore(nofs_flags);
+ return error;
}
void
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aabb25dc3efa..df370eb5dc15 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,7 @@ xfs_uuid_table_free(void)
{
if (xfs_uuid_table_size == 0)
return;
- kmem_free(xfs_uuid_table);
+ kfree(xfs_uuid_table);
xfs_uuid_table = NULL;
xfs_uuid_table_size = 0;
}
@@ -62,7 +62,7 @@ xfs_uuid_mount(
int hole, i;
/* Publish UUID in struct super_block */
- uuid_copy(&mp->m_super->s_uuid, uuid);
+ super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid));
if (xfs_has_nouuid(mp))
return 0;
@@ -706,6 +706,8 @@ xfs_mountfs(
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;
+ super_set_sysfs_name_id(mp->m_super);
+
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
NULL, mp->m_super->s_id);
if (error)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 503fe3c7edbf..e880aa48de68 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -94,9 +94,9 @@ typedef struct xfs_mount {
struct xfs_inode *m_rsumip; /* pointer to summary inode */
struct xfs_inode *m_rootip; /* pointer to root directory */
struct xfs_quotainfo *m_quotainfo; /* disk quota information */
- xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
- xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
- xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
+ struct xfs_buftarg *m_ddev_targp; /* data device */
+ struct xfs_buftarg *m_logdev_targp;/* log device */
+ struct xfs_buftarg *m_rtdev_targp; /* rt device */
void __percpu *m_inodegc; /* percpu inodegc structures */
/*
@@ -252,6 +252,9 @@ typedef struct xfs_mount {
/* cpus that have inodes queued for inactivation */
struct cpumask m_inodegc_cpumask;
+
+ /* Hook to feed dirent updates to an active online repair. */
+ struct xfs_hooks m_dir_update_hooks;
} xfs_mount_t;
#define M_IGEO(mp) (&(mp)->m_ino_geo)
@@ -502,9 +505,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
-int xfs_buf_hash_init(struct xfs_perag *pag);
-void xfs_buf_hash_destroy(struct xfs_perag *pag);
-
extern void xfs_uuid_table_free(void);
extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index f85e3b07ab44..7443debaffd6 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -333,13 +333,14 @@ xfs_mru_cache_create(
if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
return -EINVAL;
- if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
+ mru = kzalloc(sizeof(*mru), GFP_KERNEL | __GFP_NOFAIL);
+ if (!mru)
return -ENOMEM;
/* An extra list is needed to avoid reaping up to a grp_time early. */
mru->grp_count = grp_count + 1;
- mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);
-
+ mru->lists = kzalloc(mru->grp_count * sizeof(*mru->lists),
+ GFP_KERNEL | __GFP_NOFAIL);
if (!mru->lists) {
err = -ENOMEM;
goto exit;
@@ -364,9 +365,9 @@ xfs_mru_cache_create(
exit:
if (err && mru && mru->lists)
- kmem_free(mru->lists);
+ kfree(mru->lists);
if (err && mru)
- kmem_free(mru);
+ kfree(mru);
return err;
}
@@ -406,8 +407,8 @@ xfs_mru_cache_destroy(
xfs_mru_cache_flush(mru);
- kmem_free(mru->lists);
- kmem_free(mru);
+ kfree(mru->lists);
+ kfree(mru);
}
/*
@@ -427,7 +428,7 @@ xfs_mru_cache_insert(
if (!mru || !mru->lists)
return -EINVAL;
- if (radix_tree_preload(GFP_NOFS))
+ if (radix_tree_preload(GFP_KERNEL))
return -ENOMEM;
INIT_LIST_HEAD(&elem->list_node);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 67d0a8564ff3..0f4cf4170c35 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -26,6 +26,7 @@
#include "xfs_ag.h"
#include "xfs_ialloc.h"
#include "xfs_log_priv.h"
+#include "xfs_health.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -254,7 +255,7 @@ xfs_qm_dqattach_one(
struct xfs_dquot *dqp;
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
error = 0;
/*
@@ -322,7 +323,7 @@ xfs_qm_dqattach_locked(
if (!xfs_qm_need_dqattach(ip))
return 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER,
@@ -353,7 +354,7 @@ done:
* Don't worry about the dquots that we may have attached before any
* error - they'll get detached later if it has not already been done.
*/
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -628,7 +629,8 @@ xfs_qm_init_quotainfo(
ASSERT(XFS_IS_QUOTA_ON(mp));
- qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0);
+ qinf = mp->m_quotainfo = kzalloc(sizeof(struct xfs_quotainfo),
+ GFP_KERNEL | __GFP_NOFAIL);
error = list_lru_init(&qinf->qi_lru);
if (error)
@@ -642,9 +644,9 @@ xfs_qm_init_quotainfo(
if (error)
goto out_free_lru;
- INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
- INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
- INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
+ INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_KERNEL);
+ INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_KERNEL);
+ INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_KERNEL);
mutex_init(&qinf->qi_tree_lock);
/* mutex used to serialize quotaoffs */
@@ -691,6 +693,9 @@ xfs_qm_init_quotainfo(
shrinker_register(qinf->qi_shrinker);
+ xfs_hooks_init(&qinf->qi_mod_ino_dqtrx_hooks);
+ xfs_hooks_init(&qinf->qi_apply_dqtrx_hooks);
+
return 0;
out_free_inos:
@@ -700,7 +705,7 @@ out_free_inos:
out_free_lru:
list_lru_destroy(&qinf->qi_lru);
out_free_qinf:
- kmem_free(qinf);
+ kfree(qinf);
mp->m_quotainfo = NULL;
return error;
}
@@ -724,7 +729,7 @@ xfs_qm_destroy_quotainfo(
xfs_qm_destroy_quotainos(qi);
mutex_destroy(&qi->qi_tree_lock);
mutex_destroy(&qi->qi_quotaofflock);
- kmem_free(qi);
+ kfree(qi);
mp->m_quotainfo = NULL;
}
@@ -758,14 +763,18 @@ xfs_qm_qino_alloc(
(mp->m_sb.sb_gquotino != NULLFSINO)) {
ino = mp->m_sb.sb_gquotino;
if (XFS_IS_CORRUPT(mp,
- mp->m_sb.sb_pquotino != NULLFSINO))
+ mp->m_sb.sb_pquotino != NULLFSINO)) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA);
return -EFSCORRUPTED;
+ }
} else if ((flags & XFS_QMOPT_GQUOTA) &&
(mp->m_sb.sb_pquotino != NULLFSINO)) {
ino = mp->m_sb.sb_pquotino;
if (XFS_IS_CORRUPT(mp,
- mp->m_sb.sb_gquotino != NULLFSINO))
+ mp->m_sb.sb_gquotino != NULLFSINO)) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA);
return -EFSCORRUPTED;
+ }
}
if (ino != NULLFSINO) {
error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
@@ -996,7 +1005,8 @@ xfs_qm_reset_dqcounts_buf(
if (qip->i_nblocks == 0)
return 0;
- map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0);
+ map = kmalloc(XFS_DQITER_MAP_SIZE * sizeof(*map),
+ GFP_KERNEL | __GFP_NOFAIL);
lblkno = 0;
maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
@@ -1058,7 +1068,7 @@ xfs_qm_reset_dqcounts_buf(
} while (nmaps > 0);
out:
- kmem_free(map);
+ kfree(map);
return error;
}
@@ -1406,8 +1416,12 @@ error_return:
xfs_warn(mp,
"Quotacheck: Failed to reset quota flags.");
}
- } else
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK);
+ } else {
xfs_notice(mp, "Quotacheck: Done.");
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_QUOTACHECK);
+ }
+
return error;
error_purge:
@@ -1809,7 +1823,7 @@ xfs_qm_vop_chown(
XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(XFS_IS_QUOTA_ON(ip->i_mount));
/* old dquot */
@@ -1817,12 +1831,12 @@ xfs_qm_vop_chown(
ASSERT(prevdq);
ASSERT(prevdq != newdq);
- xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_nblocks));
- xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
+ xfs_trans_mod_ino_dquot(tp, ip, prevdq, bfield, -(ip->i_nblocks));
+ xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
/* the sparkling new dquot */
- xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_nblocks);
- xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
+ xfs_trans_mod_ino_dquot(tp, ip, newdq, bfield, ip->i_nblocks);
+ xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_ICOUNT, 1);
/*
* Back when we made quota reservations for the chown, we reserved the
@@ -1897,29 +1911,28 @@ xfs_qm_vop_create_dqattach(
if (!XFS_IS_QUOTA_ON(mp))
return;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (udqp && XFS_IS_UQUOTA_ON(mp)) {
ASSERT(ip->i_udquot == NULL);
ASSERT(i_uid_read(VFS_I(ip)) == udqp->q_id);
ip->i_udquot = xfs_qm_dqhold(udqp);
- xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
ASSERT(ip->i_gdquot == NULL);
ASSERT(i_gid_read(VFS_I(ip)) == gdqp->q_id);
ip->i_gdquot = xfs_qm_dqhold(gdqp);
- xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
ASSERT(ip->i_pdquot == NULL);
ASSERT(ip->i_projid == pdqp->q_id);
ip->i_pdquot = xfs_qm_dqhold(pdqp);
- xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
+
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, 1);
}
/* Decide if this inode's dquot is near an enforcement boundary. */
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index d5c9fc4ba591..f5993012bf98 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -68,6 +68,10 @@ struct xfs_quotainfo {
/* Minimum and maximum quota expiration timestamp values. */
time64_t qi_expiry_min;
time64_t qi_expiry_max;
+
+ /* Hook to feed quota counter updates to an active online repair. */
+ struct xfs_hooks qi_mod_ino_dqtrx_hooks;
+ struct xfs_hooks qi_apply_dqtrx_hooks;
};
static inline struct radix_tree_root *
@@ -104,6 +108,18 @@ xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type)
return NULL;
}
+/*
+ * Parameters for tracking dqtrx changes on behalf of an inode. The hook
+ * function arg parameter is the field being updated.
+ */
+struct xfs_mod_ino_dqtrx_params {
+ uintptr_t tx_id;
+ xfs_ino_t ino;
+ xfs_dqtype_t q_type;
+ xfs_dqid_t q_id;
+ int64_t delta;
+};
+
extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp,
uint field, int64_t delta);
extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index b77673dd0558..271c1021c733 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -9,6 +9,7 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0d56489f3b2..85a4ae1a17f6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -74,6 +74,22 @@ struct xfs_dqtrx {
int64_t qt_icount_delta; /* dquot inode count changes */
};
+enum xfs_apply_dqtrx_type {
+ XFS_APPLY_DQTRX_COMMIT = 0,
+ XFS_APPLY_DQTRX_UNRESERVE,
+};
+
+/*
+ * Parameters for applying dqtrx changes to a dquot. The hook function arg
+ * parameter is enum xfs_apply_dqtrx_type.
+ */
+struct xfs_apply_dqtrx_params {
+ uintptr_t tx_id;
+ xfs_ino_t ino;
+ xfs_dqtype_t q_type;
+ xfs_dqid_t q_id;
+};
+
#ifdef CONFIG_XFS_QUOTA
extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
extern void xfs_trans_free_dqinfo(struct xfs_trans *);
@@ -114,6 +130,30 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
}
bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type);
+
+# ifdef CONFIG_XFS_LIVE_HOOKS
+void xfs_trans_mod_ino_dquot(struct xfs_trans *tp, struct xfs_inode *ip,
+ struct xfs_dquot *dqp, unsigned int field, int64_t delta);
+
+struct xfs_quotainfo;
+
+struct xfs_dqtrx_hook {
+ struct xfs_hook mod_hook;
+ struct xfs_hook apply_hook;
+};
+
+void xfs_dqtrx_hook_disable(void);
+void xfs_dqtrx_hook_enable(void);
+
+int xfs_dqtrx_hook_add(struct xfs_quotainfo *qi, struct xfs_dqtrx_hook *hook);
+void xfs_dqtrx_hook_del(struct xfs_quotainfo *qi, struct xfs_dqtrx_hook *hook);
+void xfs_dqtrx_hook_setup(struct xfs_dqtrx_hook *hook, notifier_fn_t mod_fn,
+ notifier_fn_t apply_fn);
+# else
+# define xfs_trans_mod_ino_dquot(tp, ip, dqp, field, delta) \
+ xfs_trans_mod_dquot((tp), (dqp), (field), (delta))
+# endif /* CONFIG_XFS_LIVE_HOOKS */
+
#else
static inline int
xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
@@ -173,6 +213,12 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
#define xfs_qm_unmount(mp)
#define xfs_qm_unmount_quotas(mp)
#define xfs_inode_near_dquot_enforcement(ip, type) (false)
+
+# ifdef CONFIG_XFS_LIVE_HOOKS
+# define xfs_dqtrx_hook_enable() ((void)0)
+# define xfs_dqtrx_hook_disable() ((void)0)
+# endif /* CONFIG_XFS_LIVE_HOOKS */
+
#endif /* CONFIG_XFS_QUOTA */
static inline int
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 20ad8086da60..14919b33e4fe 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -36,9 +36,9 @@ STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
- kmem_free(cuip->cui_item.li_lv_shadow);
+ kvfree(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
- kmem_free(cuip);
+ kfree(cuip);
else
kmem_cache_free(xfs_cui_cache, cuip);
}
@@ -143,8 +143,8 @@ xfs_cui_init(
ASSERT(nextents > 0);
if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
- cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
- 0);
+ cuip = kzalloc(xfs_cui_log_item_sizeof(nextents),
+ GFP_KERNEL | __GFP_NOFAIL);
else
cuip = kmem_cache_zalloc(xfs_cui_cache,
GFP_KERNEL | __GFP_NOFAIL);
@@ -207,7 +207,7 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
- kmem_free(cudp->cud_item.li_lv_shadow);
+ kvfree(cudp->cud_item.li_lv_shadow);
kmem_cache_free(xfs_cud_cache, cudp);
}
@@ -425,7 +425,7 @@ xfs_cui_recover_work(
struct xfs_refcount_intent *ri;
ri = kmem_cache_alloc(xfs_refcount_intent_cache,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOFAIL);
ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
ri->ri_startblock = pmap->pe_startblock;
ri->ri_blockcount = pmap->pe_len;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index d5ca8bcae65b..7da0e8f961d3 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -29,6 +29,7 @@
#include "xfs_iomap.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
+#include "xfs_health.h"
/*
* Copy on Write of Shared Blocks
@@ -527,7 +528,7 @@ xfs_reflink_allocate_cow(
int error;
bool found;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (!ip->i_cowfp) {
ASSERT(!xfs_is_reflink_inode(ip));
xfs_ifork_init_cow(ip);
@@ -805,7 +806,7 @@ xfs_reflink_end_cow_extent(
* If the extent we're remapping is backed by storage (written
* or not), unmap the extent and drop its refcount.
*/
- xfs_bmap_unmap_extent(tp, ip, &data);
+ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
xfs_refcount_decrease_extent(tp, &data);
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
-data.br_blockcount);
@@ -829,7 +830,7 @@ xfs_reflink_end_cow_extent(
xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
/* Map the new blocks into the data fork. */
- xfs_bmap_map_extent(tp, ip, &del);
+ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del);
/* Charge this new data fork mapping to the on-disk quota. */
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
@@ -1227,8 +1228,10 @@ xfs_reflink_remap_extent(
* extent if they're both holes or both the same physical extent.
*/
if (dmap->br_startblock == smap.br_startblock) {
- if (dmap->br_state != smap.br_state)
+ if (dmap->br_state != smap.br_state) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
+ }
goto out_cancel;
}
@@ -1291,7 +1294,7 @@ xfs_reflink_remap_extent(
* If the extent we're unmapping is backed by storage (written
* or not), unmap the extent and drop its refcount.
*/
- xfs_bmap_unmap_extent(tp, ip, &smap);
+ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap);
xfs_refcount_decrease_extent(tp, &smap);
qdelta -= smap.br_blockcount;
} else if (smap.br_startblock == DELAYSTARTBLOCK) {
@@ -1316,7 +1319,7 @@ xfs_reflink_remap_extent(
*/
if (dmap_written) {
xfs_refcount_increase_extent(tp, dmap);
- xfs_bmap_map_extent(tp, ip, dmap);
+ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap);
qdelta += dmap->br_blockcount;
}
@@ -1391,6 +1394,7 @@ xfs_reflink_remap_blocks(
ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
if (imap.br_startblock == DELAYSTARTBLOCK) {
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ xfs_bmap_mark_sick(src, XFS_DATA_FORK);
error = -EFSCORRUPTED;
break;
}
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 79ad0087aeca..e473124e29cc 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -36,9 +36,9 @@ STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
- kmem_free(ruip->rui_item.li_lv_shadow);
+ kvfree(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
- kmem_free(ruip);
+ kfree(ruip);
else
kmem_cache_free(xfs_rui_cache, ruip);
}
@@ -142,7 +142,8 @@ xfs_rui_init(
ASSERT(nextents > 0);
if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
- ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
+ ruip = kzalloc(xfs_rui_log_item_sizeof(nextents),
+ GFP_KERNEL | __GFP_NOFAIL);
else
ruip = kmem_cache_zalloc(xfs_rui_cache,
GFP_KERNEL | __GFP_NOFAIL);
@@ -205,7 +206,7 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
- kmem_free(rudp->rud_item.li_lv_shadow);
+ kvfree(rudp->rud_item.li_lv_shadow);
kmem_cache_free(xfs_rud_cache, rudp);
}
@@ -454,7 +455,7 @@ xfs_rui_recover_work(
{
struct xfs_rmap_intent *ri;
- ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL);
switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
case XFS_RMAP_EXTENT_MAP:
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 8649d981a097..e66f9bd5de5c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -22,6 +22,8 @@
#include "xfs_sb.h"
#include "xfs_rtbitmap.h"
#include "xfs_quota.h"
+#include "xfs_log_priv.h"
+#include "xfs_health.h"
/*
* Return whether there are any free extents in the size range given
@@ -903,7 +905,7 @@ xfs_growfs_rt(
/*
* Allocate a new (fake) mount/sb.
*/
- nmp = kmem_alloc(sizeof(*nmp), 0);
+ nmp = kmalloc(sizeof(*nmp), GFP_KERNEL | __GFP_NOFAIL);
/*
* Loop over the bitmap blocks.
* We will do everything one bitmap block at a time.
@@ -1050,7 +1052,7 @@ out_free:
/*
* Free the fake mp structure.
*/
- kmem_free(nmp);
+ kfree(nmp);
/*
* If we had to allocate a new rsum_cache, we either need to free the
@@ -1059,10 +1061,10 @@ out_free:
*/
if (rsum_cache != mp->m_rsum_cache) {
if (error) {
- kmem_free(mp->m_rsum_cache);
+ kvfree(mp->m_rsum_cache);
mp->m_rsum_cache = rsum_cache;
} else {
- kmem_free(rsum_cache);
+ kvfree(rsum_cache);
}
}
@@ -1202,6 +1204,8 @@ xfs_rtmount_inodes(
sbp = &mp->m_sb;
error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
+ if (xfs_metadata_is_sick(error))
+ xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP);
if (error)
return error;
ASSERT(mp->m_rbmip != NULL);
@@ -1211,6 +1215,8 @@ xfs_rtmount_inodes(
goto out_rele_bitmap;
error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
+ if (xfs_metadata_is_sick(error))
+ xfs_rt_mark_sick(mp, XFS_SICK_RT_SUMMARY);
if (error)
goto out_rele_bitmap;
ASSERT(mp->m_rsumip != NULL);
@@ -1233,7 +1239,7 @@ void
xfs_rtunmount_inodes(
struct xfs_mount *mp)
{
- kmem_free(mp->m_rsum_cache);
+ kvfree(mp->m_rsum_cache);
if (mp->m_rbmip)
xfs_irele(mp->m_rbmip);
if (mp->m_rsumip)
@@ -1260,7 +1266,7 @@ xfs_rtpick_extent(
uint64_t seq; /* sequence number of file creation */
struct timespec64 ts; /* timespec in inode */
- ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL);
ts = inode_get_atime(VFS_I(mp->m_rbmip));
if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) {
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 90a77cd3ebad..ed97d72caa66 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -50,7 +50,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
{ "ibt2", xfsstats_offset(xs_fibt_2) },
{ "fibt2", xfsstats_offset(xs_rmap_2) },
{ "rmapbt", xfsstats_offset(xs_refcbt_2) },
- { "refcntbt", xfsstats_offset(xs_qm_dqreclaims)},
+ { "refcntbt", xfsstats_offset(xs_rmap_mem_2) },
+ { "rmapbt_mem", xfsstats_offset(xs_rcbag_2) },
+ { "rcbagbt", xfsstats_offset(xs_qm_dqreclaims)},
/* we print both series of quota information together */
{ "qm", xfsstats_offset(xs_xstrat_bytes)},
};
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 43ffba74f045..a61fb56ed2e6 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -125,6 +125,8 @@ struct __xfsstats {
uint32_t xs_fibt_2[__XBTS_MAX];
uint32_t xs_rmap_2[__XBTS_MAX];
uint32_t xs_refcbt_2[__XBTS_MAX];
+ uint32_t xs_rmap_mem_2[__XBTS_MAX];
+ uint32_t xs_rcbag_2[__XBTS_MAX];
uint32_t xs_qm_dqreclaims;
uint32_t xs_qm_dqreclaim_misses;
uint32_t xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 98401de832ee..bce020374c5e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -44,6 +44,7 @@
#include "xfs_dahash_test.h"
#include "xfs_rtbitmap.h"
#include "scrub/stats.h"
+#include "scrub/rcbag_btree.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -361,16 +362,16 @@ STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
const char *name,
- struct bdev_handle **handlep)
+ struct file **bdev_filep)
{
int error = 0;
- *handlep = bdev_open_by_path(name,
+ *bdev_filep = bdev_file_open_by_path(name,
BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
mp->m_super, &fs_holder_ops);
- if (IS_ERR(*handlep)) {
- error = PTR_ERR(*handlep);
- *handlep = NULL;
+ if (IS_ERR(*bdev_filep)) {
+ error = PTR_ERR(*bdev_filep);
+ *bdev_filep = NULL;
xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
}
@@ -435,26 +436,26 @@ xfs_open_devices(
{
struct super_block *sb = mp->m_super;
struct block_device *ddev = sb->s_bdev;
- struct bdev_handle *logdev_handle = NULL, *rtdev_handle = NULL;
+ struct file *logdev_file = NULL, *rtdev_file = NULL;
int error;
/*
* Open real time and log devices - order is important.
*/
if (mp->m_logname) {
- error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle);
+ error = xfs_blkdev_get(mp, mp->m_logname, &logdev_file);
if (error)
return error;
}
if (mp->m_rtname) {
- error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle);
+ error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_file);
if (error)
goto out_close_logdev;
- if (rtdev_handle->bdev == ddev ||
- (logdev_handle &&
- rtdev_handle->bdev == logdev_handle->bdev)) {
+ if (file_bdev(rtdev_file) == ddev ||
+ (logdev_file &&
+ file_bdev(rtdev_file) == file_bdev(logdev_file))) {
xfs_warn(mp,
"Cannot mount filesystem with identical rtdev and ddev/logdev.");
error = -EINVAL;
@@ -466,25 +467,25 @@ xfs_open_devices(
* Setup xfs_mount buffer target pointers
*/
error = -ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
- if (rtdev_handle) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle);
+ if (rtdev_file) {
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
- if (logdev_handle && logdev_handle->bdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle);
+ if (logdev_file && file_bdev(logdev_file) != ddev) {
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
mp->m_logdev_targp = mp->m_ddev_targp;
/* Handle won't be used, drop it */
- if (logdev_handle)
- bdev_release(logdev_handle);
+ if (logdev_file)
+ bdev_fput(logdev_file);
}
return 0;
@@ -495,11 +496,11 @@ xfs_open_devices(
out_free_ddev_targ:
xfs_free_buftarg(mp->m_ddev_targp);
out_close_rtdev:
- if (rtdev_handle)
- bdev_release(rtdev_handle);
+ if (rtdev_file)
+ bdev_fput(rtdev_file);
out_close_logdev:
- if (logdev_handle)
- bdev_release(logdev_handle);
+ if (logdev_file)
+ bdev_fput(logdev_file);
return error;
}
@@ -715,9 +716,7 @@ xfs_fs_inode_init_once(
/* xfs inode */
atomic_set(&ip->i_pincount, 0);
spin_lock_init(&ip->i_flags_lock);
-
- mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
- "xfsino", ip->i_ino);
+ init_rwsem(&ip->i_lock);
}
/*
@@ -760,7 +759,7 @@ xfs_mount_free(
debugfs_remove(mp->m_debugfs);
kfree(mp->m_rtname);
kfree(mp->m_logname);
- kmem_free(mp);
+ kfree(mp);
}
STATIC int
@@ -1986,7 +1985,7 @@ static int xfs_init_fs_context(
{
struct xfs_mount *mp;
- mp = kmem_alloc(sizeof(struct xfs_mount), KM_ZERO);
+ mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL);
if (!mp)
return -ENOMEM;
@@ -2012,6 +2011,8 @@ static int xfs_init_fs_context(
mp->m_logbsize = -1;
mp->m_allocsize_log = 16; /* 64k */
+ xfs_hooks_init(&mp->m_dir_update_hooks);
+
fc->s_fs_info = mp;
fc->ops = &xfs_context_ops;
@@ -2043,8 +2044,7 @@ xfs_init_caches(void)
xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
SLAB_HWCACHE_ALIGN |
- SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT,
NULL);
if (!xfs_buf_cache)
goto out;
@@ -2059,10 +2059,14 @@ xfs_init_caches(void)
if (error)
goto out_destroy_log_ticket_cache;
- error = xfs_defer_init_item_caches();
+ error = rcbagbt_init_cur_cache();
if (error)
goto out_destroy_btree_cur_cache;
+ error = xfs_defer_init_item_caches();
+ if (error)
+ goto out_destroy_rcbagbt_cur_cache;
+
xfs_da_state_cache = kmem_cache_create("xfs_da_state",
sizeof(struct xfs_da_state),
0, 0, NULL);
@@ -2109,14 +2113,14 @@ xfs_init_caches(void)
sizeof(struct xfs_inode), 0,
(SLAB_HWCACHE_ALIGN |
SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
xfs_fs_inode_init_once);
if (!xfs_inode_cache)
goto out_destroy_efi_cache;
xfs_ili_cache = kmem_cache_create("xfs_ili",
sizeof(struct xfs_inode_log_item), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT,
NULL);
if (!xfs_ili_cache)
goto out_destroy_inode_cache;
@@ -2219,6 +2223,8 @@ xfs_init_caches(void)
kmem_cache_destroy(xfs_da_state_cache);
out_destroy_defer_item_cache:
xfs_defer_destroy_item_caches();
+ out_destroy_rcbagbt_cur_cache:
+ rcbagbt_destroy_cur_cache();
out_destroy_btree_cur_cache:
xfs_btree_destroy_cur_caches();
out_destroy_log_ticket_cache:
@@ -2256,6 +2262,7 @@ xfs_destroy_caches(void)
kmem_cache_destroy(xfs_ifork_cache);
kmem_cache_destroy(xfs_da_state_cache);
xfs_defer_destroy_item_caches();
+ rcbagbt_destroy_cur_cache();
xfs_btree_destroy_cur_caches();
kmem_cache_destroy(xfs_log_ticket_cache);
kmem_cache_destroy(xfs_buf_cache);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 92974a4414c8..3e376d24c7c1 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -24,77 +24,7 @@
#include "xfs_ialloc.h"
#include "xfs_error.h"
#include "xfs_health.h"
-
-/* ----- Kernel only functions below ----- */
-int
-xfs_readlink_bmap_ilocked(
- struct xfs_inode *ip,
- char *link)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
- struct xfs_buf *bp;
- xfs_daddr_t d;
- char *cur_chunk;
- int pathlen = ip->i_disk_size;
- int nmaps = XFS_SYMLINK_MAPS;
- int byte_cnt;
- int n;
- int error = 0;
- int fsblocks = 0;
- int offset;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-
- fsblocks = xfs_symlink_blocks(mp, pathlen);
- error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
- if (error)
- goto out;
-
- offset = 0;
- for (n = 0; n < nmaps; n++) {
- d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
- byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
-
- error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
- &bp, &xfs_symlink_buf_ops);
- if (error)
- return error;
- byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
- if (pathlen < byte_cnt)
- byte_cnt = pathlen;
-
- cur_chunk = bp->b_addr;
- if (xfs_has_crc(mp)) {
- if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
- byte_cnt, bp)) {
- error = -EFSCORRUPTED;
- xfs_alert(mp,
-"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
- offset, byte_cnt, ip->i_ino);
- xfs_buf_relse(bp);
- goto out;
-
- }
-
- cur_chunk += sizeof(struct xfs_dsymlink_hdr);
- }
-
- memcpy(link + offset, cur_chunk, byte_cnt);
-
- pathlen -= byte_cnt;
- offset += byte_cnt;
-
- xfs_buf_relse(bp);
- }
- ASSERT(pathlen == 0);
-
- link[ip->i_disk_size] = '\0';
- error = 0;
-
- out:
- return error;
-}
+#include "xfs_symlink_remote.h"
int
xfs_readlink(
@@ -103,7 +33,7 @@ xfs_readlink(
{
struct xfs_mount *mp = ip->i_mount;
xfs_fsize_t pathlen;
- int error = -EFSCORRUPTED;
+ int error;
trace_xfs_readlink(ip);
@@ -116,14 +46,14 @@ xfs_readlink(
pathlen = ip->i_disk_size;
if (!pathlen)
- goto out;
+ goto out_corrupt;
if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) {
xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
__func__, (unsigned long long) ip->i_ino,
(long long) pathlen);
ASSERT(0);
- goto out;
+ goto out_corrupt;
}
if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
@@ -132,17 +62,20 @@ xfs_readlink(
* if if_data is junk.
*/
if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_data))
- goto out;
+ goto out_corrupt;
memcpy(link, ip->i_df.if_data, pathlen + 1);
error = 0;
} else {
- error = xfs_readlink_bmap_ilocked(ip, link);
+ error = xfs_symlink_remote_read(ip, link);
}
- out:
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return error;
+ out_corrupt:
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+ return -EFSCORRUPTED;
}
int
@@ -160,15 +93,7 @@ xfs_symlink(
int error = 0;
int pathlen;
bool unlock_dp_on_error = false;
- xfs_fileoff_t first_fsb;
xfs_filblks_t fs_blocks;
- int nmaps;
- struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
- xfs_daddr_t d;
- const char *cur_chunk;
- int byte_cnt;
- int n;
- struct xfs_buf *bp;
prid_t prid;
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
@@ -256,62 +181,11 @@ xfs_symlink(
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
resblks -= XFS_IALLOC_SPACE_RES(mp);
- /*
- * If the symlink will fit into the inode, write it inline.
- */
- if (pathlen <= xfs_inode_data_fork_size(ip)) {
- xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
-
- ip->i_disk_size = pathlen;
- ip->i_df.if_format = XFS_DINODE_FMT_LOCAL;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
- } else {
- int offset;
-
- first_fsb = 0;
- nmaps = XFS_SYMLINK_MAPS;
-
- error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
- XFS_BMAPI_METADATA, resblks, mval, &nmaps);
- if (error)
- goto out_trans_cancel;
-
- resblks -= fs_blocks;
- ip->i_disk_size = pathlen;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- cur_chunk = target_path;
- offset = 0;
- for (n = 0; n < nmaps; n++) {
- char *buf;
-
- d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
- byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
- error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- BTOBB(byte_cnt), 0, &bp);
- if (error)
- goto out_trans_cancel;
- bp->b_ops = &xfs_symlink_buf_ops;
-
- byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
- byte_cnt = min(byte_cnt, pathlen);
-
- buf = bp->b_addr;
- buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset,
- byte_cnt, bp);
-
- memcpy(buf, cur_chunk, byte_cnt);
-
- cur_chunk += byte_cnt;
- pathlen -= byte_cnt;
- offset += byte_cnt;
-
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
- xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
- (char *)bp->b_addr);
- }
- ASSERT(pathlen == 0);
- }
+ error = xfs_symlink_write_target(tp, ip, target_path, pathlen,
+ fs_blocks, resblks);
+ if (error)
+ goto out_trans_cancel;
+ resblks -= fs_blocks;
i_size_write(VFS_I(ip), ip->i_disk_size);
/*
@@ -322,6 +196,7 @@ xfs_symlink(
goto out_trans_cancel;
xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+ xfs_dir_update_hook(dp, ip, 1, link_name);
/*
* If this is a synchronous mount, make sure that the
@@ -496,6 +371,7 @@ xfs_inactive_symlink(
__func__, (unsigned long long)ip->i_ino, pathlen);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
ASSERT(0);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index d1ca1ce62a93..0d29a50e66fd 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -10,7 +10,6 @@
int xfs_symlink(struct mnt_idmap *idmap, struct xfs_inode *dp,
struct xfs_name *link_name, const char *target_path,
umode_t mode, struct xfs_inode **ipp);
-int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link);
int xfs_readlink(struct xfs_inode *ip, char *link);
int xfs_inactive_symlink(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 17485666b672..d2391eec37fe 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -193,7 +193,6 @@ always_cow_show(
}
XFS_SYSFS_ATTR_RW(always_cow);
-#ifdef DEBUG
/*
* Override how many threads the parallel work queue is allowed to create.
* This has to be a debug-only global (instead of an errortag) because one of
@@ -260,7 +259,6 @@ larp_show(
return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp);
}
XFS_SYSFS_ATTR_RW(larp);
-#endif /* DEBUG */
STATIC ssize_t
bload_leaf_slack_store(
@@ -319,10 +317,8 @@ static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(log_recovery_delay),
ATTR_LIST(mount_delay),
ATTR_LIST(always_cow),
-#ifdef DEBUG
ATTR_LIST(pwork_threads),
ATTR_LIST(larp),
-#endif
ATTR_LIST(bload_leaf_slack),
ATTR_LIST(bload_node_slack),
NULL,
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 8a5dc1538aa8..1a963382e5e9 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -36,6 +36,9 @@
#include "xfs_error.h"
#include <linux/iomap.h>
#include "xfs_iomap.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_bmap.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0984a1c884c7..aea97fc074f8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -79,6 +79,9 @@ union xfs_btree_ptr;
struct xfs_dqtrx;
struct xfs_icwalk;
struct xfs_perag;
+struct xfbtree;
+struct xfs_btree_ops;
+struct xfs_bmap_intent;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@@ -640,6 +643,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur);
DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bdetach);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
@@ -1710,12 +1714,10 @@ DECLARE_EVENT_CLASS(xfs_agf_class,
__entry->agno = be32_to_cpu(agf->agf_seqno),
__entry->flags = flags;
__entry->length = be32_to_cpu(agf->agf_length),
- __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
- __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
- __entry->bno_level =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
- __entry->cnt_level =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
+ __entry->bno_root = be32_to_cpu(agf->agf_bno_root),
+ __entry->cnt_root = be32_to_cpu(agf->agf_cnt_root),
+ __entry->bno_level = be32_to_cpu(agf->agf_bno_level),
+ __entry->cnt_level = be32_to_cpu(agf->agf_cnt_level),
__entry->flfirst = be32_to_cpu(agf->agf_flfirst),
__entry->fllast = be32_to_cpu(agf->agf_fllast),
__entry->flcount = be32_to_cpu(agf->agf_flcount),
@@ -1890,28 +1892,28 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_near_bno);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_finish);
TRACE_EVENT(xfs_alloc_cur_check,
- TP_PROTO(struct xfs_mount *mp, xfs_btnum_t btnum, xfs_agblock_t bno,
+ TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, xfs_extlen_t diff, bool new),
- TP_ARGS(mp, btnum, bno, len, diff, new),
+ TP_ARGS(cur, bno, len, diff, new),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(xfs_agblock_t, bno)
__field(xfs_extlen_t, len)
__field(xfs_extlen_t, diff)
__field(bool, new)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->btnum = btnum;
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __assign_str(name, cur->bc_ops->name);
__entry->bno = bno;
__entry->len = len;
__entry->diff = diff;
__entry->new = new;
),
- TP_printk("dev %d:%d btree %s agbno 0x%x fsbcount 0x%x diff 0x%x new %d",
+ TP_printk("dev %d:%d %sbt agbno 0x%x fsbcount 0x%x diff 0x%x new %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->bno, __entry->len, __entry->diff, __entry->new)
)
@@ -2452,21 +2454,12 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
DEFINE_DISCARD_EVENT(xfs_discard_exclude);
DEFINE_DISCARD_EVENT(xfs_discard_busy);
-/* btree cursor events */
-TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
-
DECLARE_EVENT_CLASS(xfs_btree_cur_class,
TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
TP_ARGS(cur, level, bp),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(int, nlevels)
__field(int, ptr)
@@ -2474,15 +2467,15 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->ptr = cur->bc_levels[level].ptr;
__entry->daddr = bp ? xfs_buf_daddr(bp) : -1;
),
- TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx",
+ TP_printk("dev %d:%d %sbt level %d/%d ptr %d daddr 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->nlevels,
__entry->ptr,
@@ -2496,6 +2489,90 @@ DEFINE_EVENT(xfs_btree_cur_class, name, \
DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys);
DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range);
+TRACE_EVENT(xfs_btree_alloc_block,
+ TP_PROTO(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, int stat,
+ int error),
+ TP_ARGS(cur, ptr, stat, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __string(name, cur->bc_ops->name)
+ __field(int, error)
+ __field(xfs_agblock_t, agbno)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_INODE:
+ __entry->agno = 0;
+ __entry->ino = cur->bc_ino.ip->i_ino;
+ break;
+ case XFS_BTREE_TYPE_AG:
+ __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->ino = 0;
+ break;
+ case XFS_BTREE_TYPE_MEM:
+ __entry->agno = 0;
+ __entry->ino = 0;
+ break;
+ }
+ __assign_str(name, cur->bc_ops->name);
+ __entry->error = error;
+ if (!error && stat) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
+ xfs_fsblock_t fsb = be64_to_cpu(ptr->l);
+
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp,
+ fsb);
+ __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp,
+ fsb);
+ } else {
+ __entry->agbno = be32_to_cpu(ptr->s);
+ }
+ } else {
+ __entry->agbno = NULLAGBLOCK;
+ }
+ ),
+ TP_printk("dev %d:%d %sbt agno 0x%x ino 0x%llx agbno 0x%x error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __get_str(name),
+ __entry->agno,
+ __entry->ino,
+ __entry->agbno,
+ __entry->error)
+);
+
+TRACE_EVENT(xfs_btree_free_block,
+ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_buf *bp),
+ TP_ARGS(cur, bp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __string(name, cur->bc_ops->name)
+ __field(xfs_agblock_t, agbno)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->agno = xfs_daddr_to_agno(cur->bc_mp,
+ xfs_buf_daddr(bp));
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
+ __entry->ino = cur->bc_ino.ip->i_ino;
+ else
+ __entry->ino = 0;
+ __assign_str(name, cur->bc_ops->name);
+ __entry->agbno = xfs_daddr_to_agbno(cur->bc_mp,
+ xfs_buf_daddr(bp));
+ ),
+ TP_printk("dev %d:%d %sbt agno 0x%x ino 0x%llx agbno 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __get_str(name),
+ __entry->agno,
+ __entry->ino,
+ __entry->agbno)
+);
+
/* deferred ops */
struct xfs_defer_pending;
@@ -2579,7 +2656,25 @@ DEFINE_EVENT(xfs_defer_pending_class, name, \
TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), \
TP_ARGS(mp, dfp))
-DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class,
+DEFINE_DEFER_EVENT(xfs_defer_cancel);
+DEFINE_DEFER_EVENT(xfs_defer_trans_roll);
+DEFINE_DEFER_EVENT(xfs_defer_trans_abort);
+DEFINE_DEFER_EVENT(xfs_defer_finish);
+DEFINE_DEFER_EVENT(xfs_defer_finish_done);
+
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
+
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause);
+
+DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
int type, xfs_agblock_t agbno, xfs_extlen_t len),
TP_ARGS(mp, agno, type, agbno, len),
@@ -2604,92 +2699,17 @@ DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class,
__entry->agbno,
__entry->len)
);
-#define DEFINE_PHYS_EXTENT_DEFERRED_EVENT(name) \
-DEFINE_EVENT(xfs_phys_extent_deferred_class, name, \
+#define DEFINE_FREE_EXTENT_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_free_extent_deferred_class, name, \
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
int type, \
xfs_agblock_t bno, \
xfs_extlen_t len), \
TP_ARGS(mp, agno, type, bno, len))
-
-DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- int op,
- xfs_agblock_t agbno,
- xfs_ino_t ino,
- int whichfork,
- xfs_fileoff_t offset,
- xfs_filblks_t len,
- xfs_exntst_t state),
- TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_agnumber_t, agno)
- __field(xfs_ino_t, ino)
- __field(xfs_agblock_t, agbno)
- __field(int, whichfork)
- __field(xfs_fileoff_t, l_loff)
- __field(xfs_filblks_t, l_len)
- __field(xfs_exntst_t, l_state)
- __field(int, op)
- ),
- TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->ino = ino;
- __entry->agbno = agbno;
- __entry->whichfork = whichfork;
- __entry->l_loff = offset;
- __entry->l_len = len;
- __entry->l_state = state;
- __entry->op = op;
- ),
- TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->op,
- __entry->agno,
- __entry->agbno,
- __entry->ino,
- __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
- __entry->l_loff,
- __entry->l_len,
- __entry->l_state)
-);
-#define DEFINE_MAP_EXTENT_DEFERRED_EVENT(name) \
-DEFINE_EVENT(xfs_map_extent_deferred_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- int op, \
- xfs_agblock_t agbno, \
- xfs_ino_t ino, \
- int whichfork, \
- xfs_fileoff_t offset, \
- xfs_filblks_t len, \
- xfs_exntst_t state), \
- TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state))
-
-DEFINE_DEFER_EVENT(xfs_defer_cancel);
-DEFINE_DEFER_EVENT(xfs_defer_trans_roll);
-DEFINE_DEFER_EVENT(xfs_defer_trans_abort);
-DEFINE_DEFER_EVENT(xfs_defer_finish);
-DEFINE_DEFER_EVENT(xfs_defer_finish_done);
-
-DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
-DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
-
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause);
-
-#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred);
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer);
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_defer);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_deferred);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_defer);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_deferred);
DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp,
@@ -2854,12 +2874,63 @@ DEFINE_EVENT(xfs_rmapbt_class, name, \
uint64_t owner, uint64_t offset, unsigned int flags), \
TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
-#define DEFINE_RMAP_DEFERRED_EVENT DEFINE_MAP_EXTENT_DEFERRED_EVENT
+DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int op,
+ xfs_agblock_t agbno,
+ xfs_ino_t ino,
+ int whichfork,
+ xfs_fileoff_t offset,
+ xfs_filblks_t len,
+ xfs_exntst_t state),
+ TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agblock_t, agbno)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, l_loff)
+ __field(xfs_filblks_t, l_len)
+ __field(xfs_exntst_t, l_state)
+ __field(int, op)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->ino = ino;
+ __entry->agbno = agbno;
+ __entry->whichfork = whichfork;
+ __entry->l_loff = offset;
+ __entry->l_len = len;
+ __entry->l_state = state;
+ __entry->op = op;
+ ),
+ TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->op,
+ __entry->agno,
+ __entry->agbno,
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->l_loff,
+ __entry->l_len,
+ __entry->l_state)
+);
+#define DEFINE_RMAP_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_rmap_deferred_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ int op, \
+ xfs_agblock_t agbno, \
+ xfs_ino_t ino, \
+ int whichfork, \
+ xfs_fileoff_t offset, \
+ xfs_filblks_t len, \
+ xfs_exntst_t state), \
+ TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state))
DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer);
DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred);
-DEFINE_BUSY_EVENT(xfs_rmapbt_alloc_block);
-DEFINE_BUSY_EVENT(xfs_rmapbt_free_block);
DEFINE_RMAPBT_EVENT(xfs_rmap_update);
DEFINE_RMAPBT_EVENT(xfs_rmap_insert);
DEFINE_RMAPBT_EVENT(xfs_rmap_delete);
@@ -2876,7 +2947,66 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
/* deferred bmbt updates */
-#define DEFINE_BMAP_DEFERRED_EVENT DEFINE_RMAP_DEFERRED_EVENT
+TRACE_DEFINE_ENUM(XFS_BMAP_MAP);
+TRACE_DEFINE_ENUM(XFS_BMAP_UNMAP);
+
+DECLARE_EVENT_CLASS(xfs_bmap_deferred_class,
+ TP_PROTO(struct xfs_bmap_intent *bi),
+ TP_ARGS(bi),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, opdev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_fsblock_t, rtbno)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, l_loff)
+ __field(xfs_filblks_t, l_len)
+ __field(xfs_exntst_t, l_state)
+ __field(int, op)
+ ),
+ TP_fast_assign(
+ struct xfs_inode *ip = bi->bi_owner;
+
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ if (xfs_ifork_is_realtime(ip, bi->bi_whichfork)) {
+ __entry->agno = 0;
+ __entry->agbno = 0;
+ __entry->rtbno = bi->bi_bmap.br_startblock;
+ __entry->opdev = ip->i_mount->m_rtdev_targp->bt_dev;
+ } else {
+ __entry->agno = XFS_FSB_TO_AGNO(ip->i_mount,
+ bi->bi_bmap.br_startblock);
+ __entry->agbno = XFS_FSB_TO_AGBNO(ip->i_mount,
+ bi->bi_bmap.br_startblock);
+ __entry->rtbno = 0;
+ __entry->opdev = __entry->dev;
+ }
+ __entry->ino = ip->i_ino;
+ __entry->whichfork = bi->bi_whichfork;
+ __entry->l_loff = bi->bi_bmap.br_startoff;
+ __entry->l_len = bi->bi_bmap.br_blockcount;
+ __entry->l_state = bi->bi_bmap.br_state;
+ __entry->op = bi->bi_type;
+ ),
+ TP_printk("dev %d:%d op %s opdev %d:%d ino 0x%llx agno 0x%x agbno 0x%x rtbno 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->op, XFS_BMAP_INTENT_STRINGS),
+ MAJOR(__entry->opdev), MINOR(__entry->opdev),
+ __entry->ino,
+ __entry->agno,
+ __entry->agbno,
+ __entry->rtbno,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->l_loff,
+ __entry->l_len,
+ __entry->l_state)
+);
+#define DEFINE_BMAP_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_bmap_deferred_class, name, \
+ TP_PROTO(struct xfs_bmap_intent *bi), \
+ TP_ARGS(bi))
DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer);
DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred);
@@ -3217,8 +3347,6 @@ DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \
TP_ARGS(mp, agno, i1, i2, i3))
/* refcount btree tracepoints */
-DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block);
-DEFINE_BUSY_EVENT(xfs_refcountbt_free_block);
DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup);
DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get);
DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update);
@@ -3255,7 +3383,39 @@ DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error);
DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared);
DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result);
DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error);
-#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
+
+DECLARE_EVENT_CLASS(xfs_refcount_deferred_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int type, xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, type, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, type)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->type = type;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+);
+#define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_deferred_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ int type, \
+ xfs_agblock_t bno, \
+ xfs_extlen_t len), \
+ TP_ARGS(mp, agno, type, bno, len))
DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer);
DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred);
@@ -3926,9 +4086,11 @@ DEFINE_EVENT(xfs_fs_corrupt_class, name, \
TP_PROTO(struct xfs_mount *mp, unsigned int flags), \
TP_ARGS(mp, flags))
DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_corrupt);
DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption);
DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_corrupt);
DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption);
@@ -3955,6 +4117,7 @@ DEFINE_EVENT(xfs_ag_corrupt_class, name, \
unsigned int flags), \
TP_ARGS(mp, agno, flags))
DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_corrupt);
DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption);
@@ -3980,7 +4143,9 @@ DEFINE_EVENT(xfs_inode_corrupt_class, name, \
TP_PROTO(struct xfs_inode *ip, unsigned int flags), \
TP_ARGS(ip, flags))
DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_corrupt);
DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_unfixed_corruption);
TRACE_EVENT(xfs_iwalk_ag,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
@@ -4040,31 +4205,6 @@ TRACE_EVENT(xfs_pwork_init,
__entry->nr_threads, __entry->pid)
)
-DECLARE_EVENT_CLASS(xfs_kmem_class,
- TP_PROTO(ssize_t size, int flags, unsigned long caller_ip),
- TP_ARGS(size, flags, caller_ip),
- TP_STRUCT__entry(
- __field(ssize_t, size)
- __field(int, flags)
- __field(unsigned long, caller_ip)
- ),
- TP_fast_assign(
- __entry->size = size;
- __entry->flags = flags;
- __entry->caller_ip = caller_ip;
- ),
- TP_printk("size %zd flags 0x%x caller %pS",
- __entry->size,
- __entry->flags,
- (char *)__entry->caller_ip)
-)
-
-#define DEFINE_KMEM_EVENT(name) \
-DEFINE_EVENT(xfs_kmem_class, name, \
- TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
- TP_ARGS(size, flags, caller_ip))
-DEFINE_KMEM_EVENT(kmem_alloc);
-
TRACE_EVENT(xfs_check_new_dalign,
TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
TP_ARGS(mp, new_dalign, calc_rootino),
@@ -4091,7 +4231,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
TP_ARGS(cur),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(unsigned int, levels)
@@ -4099,15 +4239,15 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->agno = cur->bc_ag.pag->pag_agno;
__entry->agbno = cur->bc_ag.afake->af_root;
__entry->levels = cur->bc_ag.afake->af_levels;
__entry->blocks = cur->bc_ag.afake->af_blocks;
),
- TP_printk("dev %d:%d btree %s agno 0x%x levels %u blocks %u root %u",
+ TP_printk("dev %d:%d %sbt agno 0x%x levels %u blocks %u root %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->agno,
__entry->levels,
__entry->blocks,
@@ -4119,7 +4259,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
TP_ARGS(cur),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(xfs_agnumber_t, agno)
__field(xfs_agino_t, agino)
__field(unsigned int, levels)
@@ -4128,7 +4268,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->agno = XFS_INO_TO_AGNO(cur->bc_mp,
cur->bc_ino.ip->i_ino);
__entry->agino = XFS_INO_TO_AGINO(cur->bc_mp,
@@ -4137,9 +4277,9 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
__entry->blocks = cur->bc_ino.ifake->if_blocks;
__entry->whichfork = cur->bc_ino.whichfork;
),
- TP_printk("dev %d:%d btree %s agno 0x%x agino 0x%x whichfork %s levels %u blocks %u",
+ TP_printk("dev %d:%d %sbt agno 0x%x agino 0x%x whichfork %s levels %u blocks %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->agno,
__entry->agino,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
@@ -4156,7 +4296,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
blocks_with_extra),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(unsigned int, level)
__field(unsigned int, nlevels)
__field(uint64_t, nr_this_level)
@@ -4167,7 +4307,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->nr_this_level = nr_this_level;
@@ -4176,9 +4316,9 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
__entry->blocks = blocks;
__entry->blocks_with_extra = blocks_with_extra;
),
- TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu",
+ TP_printk("dev %d:%d %sbt level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->nlevels,
__entry->nr_this_level,
@@ -4195,7 +4335,7 @@ TRACE_EVENT(xfs_btree_bload_block,
TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(unsigned int, level)
__field(unsigned long long, block_idx)
__field(unsigned long long, nr_blocks)
@@ -4205,11 +4345,11 @@ TRACE_EVENT(xfs_btree_bload_block,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->block_idx = block_idx;
__entry->nr_blocks = nr_blocks;
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
xfs_fsblock_t fsb = be64_to_cpu(ptr->l);
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb);
@@ -4220,9 +4360,9 @@ TRACE_EVENT(xfs_btree_bload_block,
}
__entry->nr_records = nr_records;
),
- TP_printk("dev %d:%d btree %s level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u",
+ TP_printk("dev %d:%d %sbt level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->block_idx,
__entry->nr_blocks,
@@ -4472,6 +4612,164 @@ DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
#endif /* CONFIG_XFS_DRAIN_INTENTS */
+#ifdef CONFIG_XFS_MEMORY_BUFS
+TRACE_EVENT(xmbuf_create,
+ TP_PROTO(struct xfs_buftarg *btp),
+ TP_ARGS(btp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, ino)
+ __array(char, pathname, 256)
+ ),
+ TP_fast_assign(
+ char pathname[257];
+ char *path;
+ struct file *file = btp->bt_file;
+
+ __entry->dev = btp->bt_mount->m_super->s_dev;
+ __entry->ino = file_inode(file)->i_ino;
+ memset(pathname, 0, sizeof(pathname));
+ path = file_path(file, pathname, sizeof(pathname) - 1);
+ if (IS_ERR(path))
+ path = "(unknown)";
+ strncpy(__entry->pathname, path, sizeof(__entry->pathname));
+ ),
+ TP_printk("dev %d:%d xmino 0x%lx path '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pathname)
+);
+
+TRACE_EVENT(xmbuf_free,
+ TP_PROTO(struct xfs_buftarg *btp),
+ TP_ARGS(btp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, ino)
+ __field(unsigned long long, bytes)
+ __field(loff_t, size)
+ ),
+ TP_fast_assign(
+ struct file *file = btp->bt_file;
+ struct inode *inode = file_inode(file);
+
+ __entry->dev = btp->bt_mount->m_super->s_dev;
+ __entry->size = i_size_read(inode);
+ __entry->bytes = (inode->i_blocks << SECTOR_SHIFT) + inode->i_bytes;
+ __entry->ino = inode->i_ino;
+ ),
+ TP_printk("dev %d:%d xmino 0x%lx mem_bytes 0x%llx isize 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->bytes,
+ __entry->size)
+);
+#endif /* CONFIG_XFS_MEMORY_BUFS */
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+TRACE_EVENT(xfbtree_init,
+ TP_PROTO(struct xfs_mount *mp, struct xfbtree *xfbt,
+ const struct xfs_btree_ops *ops),
+ TP_ARGS(mp, xfbt, ops),
+ TP_STRUCT__entry(
+ __field(const void *, btree_ops)
+ __field(unsigned long, xfino)
+ __field(unsigned int, leaf_mxr)
+ __field(unsigned int, leaf_mnr)
+ __field(unsigned int, node_mxr)
+ __field(unsigned int, node_mnr)
+ __field(unsigned long long, owner)
+ ),
+ TP_fast_assign(
+ __entry->btree_ops = ops;
+ __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+ __entry->leaf_mxr = xfbt->maxrecs[0];
+ __entry->node_mxr = xfbt->maxrecs[1];
+ __entry->leaf_mnr = xfbt->minrecs[0];
+ __entry->node_mnr = xfbt->minrecs[1];
+ __entry->owner = xfbt->owner;
+ ),
+ TP_printk("xfino 0x%lx btree_ops %pS owner 0x%llx leaf_mxr %u leaf_mnr %u node_mxr %u node_mnr %u",
+ __entry->xfino,
+ __entry->btree_ops,
+ __entry->owner,
+ __entry->leaf_mxr,
+ __entry->leaf_mnr,
+ __entry->node_mxr,
+ __entry->node_mnr)
+);
+
+DECLARE_EVENT_CLASS(xfbtree_buf_class,
+ TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp),
+ TP_ARGS(xfbt, bp),
+ TP_STRUCT__entry(
+ __field(unsigned long, xfino)
+ __field(xfs_daddr_t, bno)
+ __field(int, nblks)
+ __field(int, hold)
+ __field(int, pincount)
+ __field(unsigned int, lockval)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+ __entry->bno = xfs_buf_daddr(bp);
+ __entry->nblks = bp->b_length;
+ __entry->hold = atomic_read(&bp->b_hold);
+ __entry->pincount = atomic_read(&bp->b_pin_count);
+ __entry->lockval = bp->b_sema.count;
+ __entry->flags = bp->b_flags;
+ ),
+ TP_printk("xfino 0x%lx daddr 0x%llx bbcount 0x%x hold %d pincount %d lock %d flags %s",
+ __entry->xfino,
+ (unsigned long long)__entry->bno,
+ __entry->nblks,
+ __entry->hold,
+ __entry->pincount,
+ __entry->lockval,
+ __print_flags(__entry->flags, "|", XFS_BUF_FLAGS))
+)
+
+#define DEFINE_XFBTREE_BUF_EVENT(name) \
+DEFINE_EVENT(xfbtree_buf_class, name, \
+ TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp), \
+ TP_ARGS(xfbt, bp))
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_create_root_buf);
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_commit_buf);
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_cancel_buf);
+
+DECLARE_EVENT_CLASS(xfbtree_freesp_class,
+ TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur,
+ xfs_fileoff_t fileoff),
+ TP_ARGS(xfbt, cur, fileoff),
+ TP_STRUCT__entry(
+ __field(unsigned long, xfino)
+ __string(btname, cur->bc_ops->name)
+ __field(int, nlevels)
+ __field(xfs_fileoff_t, fileoff)
+ ),
+ TP_fast_assign(
+ __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+ __assign_str(btname, cur->bc_ops->name);
+ __entry->nlevels = cur->bc_nlevels;
+ __entry->fileoff = fileoff;
+ ),
+ TP_printk("xfino 0x%lx %sbt nlevels %d fileoff 0x%llx",
+ __entry->xfino,
+ __get_str(btname),
+ __entry->nlevels,
+ (unsigned long long)__entry->fileoff)
+)
+
+#define DEFINE_XFBTREE_FREESP_EVENT(name) \
+DEFINE_EVENT(xfbtree_freesp_class, name, \
+ TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur, \
+ xfs_fileoff_t fileoff), \
+ TP_ARGS(xfbt, cur, fileoff))
+DEFINE_XFBTREE_FREESP_EVENT(xfbtree_alloc_block);
+DEFINE_XFBTREE_FREESP_EVENT(xfbtree_free_block);
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 12d45e93f07d..7350640059cc 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1273,7 +1273,7 @@ xfs_trans_reserve_more_inode(
unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks);
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve(tp, &resv, dblocks, rtx);
if (error)
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 08ce757c7454..1636663707dc 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -215,6 +215,7 @@ struct xfs_buf *xfs_trans_getsb(struct xfs_trans *);
void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
@@ -267,19 +268,14 @@ static inline void
xfs_trans_set_context(
struct xfs_trans *tp)
{
- ASSERT(current->journal_info == NULL);
tp->t_pflags = memalloc_nofs_save();
- current->journal_info = tp;
}
static inline void
xfs_trans_clear_context(
struct xfs_trans *tp)
{
- if (current->journal_info == tp) {
- memalloc_nofs_restore(tp->t_pflags);
- current->journal_info = NULL;
- }
+ memalloc_nofs_restore(tp->t_pflags);
}
static inline void
@@ -287,10 +283,8 @@ xfs_trans_switch_context(
struct xfs_trans *old_tp,
struct xfs_trans *new_tp)
{
- ASSERT(current->journal_info == old_tp);
new_tp->t_pflags = old_tp->t_pflags;
old_tp->t_pflags = 0;
- current->journal_info = new_tp;
}
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1098452e7f95..e4c343096f95 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -901,7 +901,8 @@ xfs_trans_ail_init(
{
struct xfs_ail *ailp;
- ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
+ ailp = kzalloc(sizeof(struct xfs_ail),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!ailp)
return -ENOMEM;
@@ -921,7 +922,7 @@ xfs_trans_ail_init(
return 0;
out_free_ailp:
- kmem_free(ailp);
+ kfree(ailp);
return -ENOMEM;
}
@@ -932,5 +933,5 @@ xfs_trans_ail_destroy(
struct xfs_ail *ailp = mp->m_ail;
kthread_stop(ailp->ail_task);
- kmem_free(ailp);
+ kfree(ailp);
}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 6549e50d852c..e28ab74af4f0 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -393,6 +393,48 @@ xfs_trans_brelse(
}
/*
+ * Forcibly detach a buffer previously joined to the transaction. The caller
+ * will retain its locked reference to the buffer after this function returns.
+ * The buffer must be completely clean and must not be held to the transaction.
+ */
+void
+xfs_trans_bdetach(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ ASSERT(tp != NULL);
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ trace_xfs_trans_bdetach(bip);
+
+ /*
+ * Reset the recursion count, since we're removing this buffer from the
+ * transaction.
+ */
+ bip->bli_recur = 0;
+
+ /*
+ * The buffer must be completely clean. Specifically, it had better
+ * not be dirty, stale, logged, ordered, or held to the transaction.
+ */
+ ASSERT(!test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags));
+ ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY));
+ ASSERT(!(bip->bli_flags & XFS_BLI_HOLD));
+ ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+
+ /* Unlink the log item from the transaction and drop the log item. */
+ xfs_trans_del_item(&bip->bli_item);
+ xfs_buf_item_put(bip);
+ bp->b_transp = NULL;
+}
+
+/*
* Mark the buffer as not needing to be unlocked when the buf item's
* iop_committing() routine is called. The buffer must already be locked
* and associated with the given transaction.
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index aa00cf67ad72..577b535a595c 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -17,6 +17,7 @@
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_error.h"
+#include "xfs_health.h"
STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
@@ -120,6 +121,116 @@ xfs_trans_dup_dqinfo(
}
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of quota live updates. If the
+ * compiler supports jump labels, the static branch will be replaced by a nop
+ * sled when there are no hook users. Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dqtrx_hooks_switch);
+
+void
+xfs_dqtrx_hook_disable(void)
+{
+ xfs_hooks_switch_off(&xfs_dqtrx_hooks_switch);
+}
+
+void
+xfs_dqtrx_hook_enable(void)
+{
+ xfs_hooks_switch_on(&xfs_dqtrx_hooks_switch);
+}
+
+/* Schedule a transactional dquot update on behalf of an inode. */
+void
+xfs_trans_mod_ino_dquot(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_dquot *dqp,
+ unsigned int field,
+ int64_t delta)
+{
+ xfs_trans_mod_dquot(tp, dqp, field, delta);
+
+ if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+ struct xfs_mod_ino_dqtrx_params p = {
+ .tx_id = (uintptr_t)tp,
+ .ino = ip->i_ino,
+ .q_type = xfs_dquot_type(dqp),
+ .q_id = dqp->q_id,
+ .delta = delta
+ };
+ struct xfs_quotainfo *qi = tp->t_mountp->m_quotainfo;
+
+ xfs_hooks_call(&qi->qi_mod_ino_dqtrx_hooks, field, &p);
+ }
+}
+
+/* Call the specified functions during a dquot counter update. */
+int
+xfs_dqtrx_hook_add(
+ struct xfs_quotainfo *qi,
+ struct xfs_dqtrx_hook *hook)
+{
+ int error;
+
+ /*
+ * Transactional dquot updates first call the mod hook when changes
+ * are attached to the transaction and then call the apply hook when
+ * those changes are committed (or canceled).
+ *
+ * The apply hook must be installed before the mod hook so that we
+ * never fail to catch the end of a quota update sequence.
+ */
+ error = xfs_hooks_add(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+ if (error)
+ goto out;
+
+ error = xfs_hooks_add(&qi->qi_mod_ino_dqtrx_hooks, &hook->mod_hook);
+ if (error)
+ goto out_apply;
+
+ return 0;
+
+out_apply:
+ xfs_hooks_del(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+out:
+ return error;
+}
+
+/* Stop calling the specified functions during a dquot counter update. */
+void
+xfs_dqtrx_hook_del(
+ struct xfs_quotainfo *qi,
+ struct xfs_dqtrx_hook *hook)
+{
+ /*
+ * The mod hook must be removed before the apply hook to avoid handing
+ * the hook consumer an incomplete update. No hooks should be running
+ * after these functions return.
+ */
+ xfs_hooks_del(&qi->qi_mod_ino_dqtrx_hooks, &hook->mod_hook);
+ xfs_hooks_del(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+}
+
+/* Configure dquot update hook functions. */
+void
+xfs_dqtrx_hook_setup(
+ struct xfs_dqtrx_hook *hook,
+ notifier_fn_t mod_fn,
+ notifier_fn_t apply_fn)
+{
+ xfs_hook_setup(&hook->mod_hook, mod_fn);
+ xfs_hook_setup(&hook->apply_hook, apply_fn);
+}
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
/*
* Wrap around mod_dquot to account for both user and group quotas.
*/
@@ -137,11 +248,11 @@ xfs_trans_mod_dquot_byino(
return;
if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
- (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
+ xfs_trans_mod_ino_dquot(tp, ip, ip->i_udquot, field, delta);
if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
- (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+ xfs_trans_mod_ino_dquot(tp, ip, ip->i_gdquot, field, delta);
if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot)
- (void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta);
+ xfs_trans_mod_ino_dquot(tp, ip, ip->i_pdquot, field, delta);
}
STATIC struct xfs_dqtrx *
@@ -321,6 +432,29 @@ xfs_apply_quota_reservation_deltas(
}
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/* Call downstream hooks now that it's time to apply dquot deltas. */
+static inline void
+xfs_trans_apply_dquot_deltas_hook(
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
+{
+ if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+ struct xfs_apply_dqtrx_params p = {
+ .tx_id = (uintptr_t)tp,
+ .q_type = xfs_dquot_type(dqp),
+ .q_id = dqp->q_id,
+ };
+ struct xfs_quotainfo *qi = tp->t_mountp->m_quotainfo;
+
+ xfs_hooks_call(&qi->qi_apply_dqtrx_hooks,
+ XFS_APPLY_DQTRX_COMMIT, &p);
+ }
+}
+#else
+# define xfs_trans_apply_dquot_deltas_hook(tp, dqp) ((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
/*
* Called by xfs_trans_commit() and similar in spirit to
* xfs_trans_apply_sb_deltas().
@@ -366,6 +500,8 @@ xfs_trans_apply_dquot_deltas(
ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ xfs_trans_apply_dquot_deltas_hook(tp, dqp);
+
/*
* adjust the actual number of blocks used
*/
@@ -465,6 +601,29 @@ xfs_trans_apply_dquot_deltas(
}
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/* Call downstream hooks now that it's time to cancel dquot deltas. */
+static inline void
+xfs_trans_unreserve_and_mod_dquots_hook(
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
+{
+ if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+ struct xfs_apply_dqtrx_params p = {
+ .tx_id = (uintptr_t)tp,
+ .q_type = xfs_dquot_type(dqp),
+ .q_id = dqp->q_id,
+ };
+ struct xfs_quotainfo *qi = tp->t_mountp->m_quotainfo;
+
+ xfs_hooks_call(&qi->qi_apply_dqtrx_hooks,
+ XFS_APPLY_DQTRX_UNRESERVE, &p);
+ }
+}
+#else
+# define xfs_trans_unreserve_and_mod_dquots_hook(tp, dqp) ((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
/*
* Release the reservations, and adjust the dquots accordingly.
* This is called only when the transaction is being aborted. If by
@@ -495,6 +654,9 @@ xfs_trans_unreserve_and_mod_dquots(
*/
if ((dqp = qtrx->qt_dquot) == NULL)
break;
+
+ xfs_trans_unreserve_and_mod_dquots_hook(tp, dqp);
+
/*
* Unreserve the original reservation. We don't care
* about the number of blocks used field, or deltas.
@@ -706,6 +868,7 @@ error_return:
error_corrupt:
xfs_dqunlock(dqp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK);
return -EFSCORRUPTED;
}
@@ -796,7 +959,7 @@ xfs_trans_reserve_quota_nblks(
return 0;
ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (force)
qflags |= XFS_QMOPT_FORCE_RES;
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index dba5dcb62bef..3b103715acc9 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -125,7 +125,8 @@ static void zonefs_readahead(struct readahead_control *rac)
* which implies that the page range can only be within the fixed inode size.
*/
static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset)
+ struct inode *inode, loff_t offset,
+ unsigned int len)
{
struct zonefs_zone *z = zonefs_inode_zone(inode);
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index b6e8e7c96251..964fa7f24003 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -15,12 +15,13 @@
#include <linux/writeback.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
-#include <linux/parser.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/crc32.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
#include "zonefs.h"
@@ -113,7 +114,7 @@ static int zonefs_zone_mgmt(struct super_block *sb,
trace_zonefs_zone_mgmt(sb, z, op);
ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector,
- z->z_size >> SECTOR_SHIFT, GFP_NOFS);
+ z->z_size >> SECTOR_SHIFT);
if (ret) {
zonefs_err(sb,
"Zone management operation %s at %llu failed %d\n",
@@ -470,58 +471,47 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
}
enum {
- Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair,
- Opt_explicit_open, Opt_err,
+ Opt_errors, Opt_explicit_open,
};
-static const match_table_t tokens = {
- { Opt_errors_ro, "errors=remount-ro"},
- { Opt_errors_zro, "errors=zone-ro"},
- { Opt_errors_zol, "errors=zone-offline"},
- { Opt_errors_repair, "errors=repair"},
- { Opt_explicit_open, "explicit-open" },
- { Opt_err, NULL}
+struct zonefs_context {
+ unsigned long s_mount_opts;
};
-static int zonefs_parse_options(struct super_block *sb, char *options)
-{
- struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
- substring_t args[MAX_OPT_ARGS];
- char *p;
-
- if (!options)
- return 0;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
+static const struct constant_table zonefs_param_errors[] = {
+ {"remount-ro", ZONEFS_MNTOPT_ERRORS_RO},
+ {"zone-ro", ZONEFS_MNTOPT_ERRORS_ZRO},
+ {"zone-offline", ZONEFS_MNTOPT_ERRORS_ZOL},
+ {"repair", ZONEFS_MNTOPT_ERRORS_REPAIR},
+ {}
+};
- if (!*p)
- continue;
+static const struct fs_parameter_spec zonefs_param_spec[] = {
+ fsparam_enum ("errors", Opt_errors, zonefs_param_errors),
+ fsparam_flag ("explicit-open", Opt_explicit_open),
+ {}
+};
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_errors_ro:
- sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
- sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_RO;
- break;
- case Opt_errors_zro:
- sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
- sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZRO;
- break;
- case Opt_errors_zol:
- sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
- sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZOL;
- break;
- case Opt_errors_repair:
- sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
- sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR;
- break;
- case Opt_explicit_open:
- sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
- break;
- default:
- return -EINVAL;
- }
+static int zonefs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct zonefs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, zonefs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_errors:
+ ctx->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
+ ctx->s_mount_opts |= result.uint_32;
+ break;
+ case Opt_explicit_open:
+ ctx->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
+ break;
+ default:
+ return -EINVAL;
}
return 0;
@@ -543,13 +533,6 @@ static int zonefs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
-static int zonefs_remount(struct super_block *sb, int *flags, char *data)
-{
- sync_filesystem(sb);
-
- return zonefs_parse_options(sb, data);
-}
-
static int zonefs_inode_setattr(struct mnt_idmap *idmap,
struct dentry *dentry, struct iattr *iattr)
{
@@ -1065,7 +1048,7 @@ static int zonefs_init_zgroup(struct super_block *sb,
zonefs_info(sb, "Zone group \"%s\" has %u file%s\n",
zonefs_zgroup_name(ztype),
zgroup->g_nr_zones,
- zgroup->g_nr_zones > 1 ? "s" : "");
+ str_plural(zgroup->g_nr_zones));
return 0;
}
@@ -1205,7 +1188,6 @@ static const struct super_operations zonefs_sops = {
.alloc_inode = zonefs_alloc_inode,
.free_inode = zonefs_free_inode,
.statfs = zonefs_statfs,
- .remount_fs = zonefs_remount,
.show_options = zonefs_show_options,
};
@@ -1250,9 +1232,10 @@ static void zonefs_release_zgroup_inodes(struct super_block *sb)
* sub-directories and files according to the device zone configuration and
* format options.
*/
-static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
+static int zonefs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct zonefs_sb_info *sbi;
+ struct zonefs_context *ctx = fc->fs_private;
struct inode *inode;
enum zonefs_ztype ztype;
int ret;
@@ -1289,7 +1272,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_uid = GLOBAL_ROOT_UID;
sbi->s_gid = GLOBAL_ROOT_GID;
sbi->s_perm = 0640;
- sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
+ sbi->s_mount_opts = ctx->s_mount_opts;
atomic_set(&sbi->s_wro_seq_files, 0);
sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev);
@@ -1300,10 +1283,6 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
if (ret)
return ret;
- ret = zonefs_parse_options(sb, data);
- if (ret)
- return ret;
-
zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev));
if (!sbi->s_max_wro_seq_files &&
@@ -1364,12 +1343,6 @@ cleanup:
return ret;
}
-static struct dentry *zonefs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, zonefs_fill_super);
-}
-
static void zonefs_kill_super(struct super_block *sb)
{
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
@@ -1384,22 +1357,72 @@ static void zonefs_kill_super(struct super_block *sb)
kfree(sbi);
}
+static void zonefs_free_fc(struct fs_context *fc)
+{
+ struct zonefs_context *ctx = fc->fs_private;
+
+ kfree(ctx);
+}
+
+static int zonefs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, zonefs_fill_super);
+}
+
+static int zonefs_reconfigure(struct fs_context *fc)
+{
+ struct zonefs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
+ struct zonefs_sb_info *sbi = sb->s_fs_info;
+
+ sync_filesystem(fc->root->d_sb);
+ /* Copy new options from ctx into sbi. */
+ sbi->s_mount_opts = ctx->s_mount_opts;
+
+ return 0;
+}
+
+static const struct fs_context_operations zonefs_context_ops = {
+ .parse_param = zonefs_parse_param,
+ .get_tree = zonefs_get_tree,
+ .reconfigure = zonefs_reconfigure,
+ .free = zonefs_free_fc,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int zonefs_init_fs_context(struct fs_context *fc)
+{
+ struct zonefs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct zonefs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
+ fc->ops = &zonefs_context_ops;
+ fc->fs_private = ctx;
+
+ return 0;
+}
+
/*
* File system definition and registration.
*/
static struct file_system_type zonefs_type = {
- .owner = THIS_MODULE,
- .name = "zonefs",
- .mount = zonefs_mount,
- .kill_sb = zonefs_kill_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "zonefs",
+ .kill_sb = zonefs_kill_super,
+ .fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = zonefs_init_fs_context,
+ .parameters = zonefs_param_spec,
};
static int __init zonefs_init_inodecache(void)
{
zonefs_inode_cachep = kmem_cache_create("zonefs_inode_cache",
sizeof(struct zonefs_inode_info), 0,
- (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
NULL);
if (zonefs_inode_cachep == NULL)
return -ENOMEM;